* [PATCH] KVM: x86/xen: Fix runstate updates to be atomic when preempting vCPU
@ 2021-10-23 19:47 Woodhouse, David
  2021-10-24 11:15 ` David Woodhouse
                   ` (3 more replies)
  0 siblings, 4 replies; 28+ messages in thread
From: Woodhouse, David @ 2021-10-23 19:47 UTC (permalink / raw)
  To: kvm; +Cc: jmattson, pbonzini, wanpengli, seanjc, vkuznets, mtosatti, joro

From: David Woodhouse <dwmw@amazon.co.uk>

There are circumstances when kvm_xen_update_runstate_guest() should not
sleep because it ends up being called from __schedule() when the vCPU
is preempted:

[  222.830825]  kvm_xen_update_runstate_guest+0x24/0x100
[  222.830878]  kvm_arch_vcpu_put+0x14c/0x200
[  222.830920]  kvm_sched_out+0x30/0x40
[  222.830960]  __schedule+0x55c/0x9f0

To handle this, make it use the same trick as __kvm_xen_has_interrupt(),
of using the hva from the gfn_to_hva_cache directly. Then it can use
pagefault_disable() around the accesses and just bail out if the page
is absent (which is unlikely).

I almost switched to using a gfn_to_pfn_cache here and bailing out if
kvm_map_gfn() fails, like kvm_steal_time_set_preempted() does — but on
closer inspection it looks like kvm_map_gfn() will *always* fail in
atomic context for a page in IOMEM, which means it will silently fail
to make the update every single time for such guests, AFAICT. So I
didn't do it that way after all. And will probably fix that one too.

Cc: stable@vger.kernel.org
Fixes: 30b5c851af79 ("KVM: x86/xen: Add support for vCPU runstate information")
Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
---
 arch/x86/kvm/xen.c | 97 +++++++++++++++++++++++++++++++---------------
 1 file changed, 65 insertions(+), 32 deletions(-)

diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index 8f62baebd028..3a7f1b31d77b 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -93,32 +93,57 @@ static void kvm_xen_update_runstate(struct kvm_vcpu *v, int state)
 void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
 {
 	struct kvm_vcpu_xen *vx = &v->arch.xen;
+	struct gfn_to_hva_cache *ghc = &vx->runstate_cache;
+	struct kvm_memslots *slots = kvm_memslots(v->kvm);
+	bool atomic = (state == RUNSTATE_runnable);
 	uint64_t state_entry_time;
-	unsigned int offset;
+	int __user *user_state;
+	uint64_t __user *user_times;
 
 	kvm_xen_update_runstate(v, state);
 
 	if (!vx->runstate_set)
 		return;
 
-	BUILD_BUG_ON(sizeof(struct compat_vcpu_runstate_info) != 0x2c);
+	if (unlikely(slots->generation != ghc->generation || kvm_is_error_hva(ghc->hva)) &&
+	    kvm_gfn_to_hva_cache_init(v->kvm, ghc, ghc->gpa, ghc->len))
+		return;
+
+	/* We made sure it fits in a single page */
+	BUG_ON(!ghc->memslot);
+
+	if (atomic)
+		pagefault_disable();
 
-	offset = offsetof(struct compat_vcpu_runstate_info, state_entry_time);
-#ifdef CONFIG_X86_64
 	/*
-	 * The only difference is alignment of uint64_t in 32-bit.
-	 * So the first field 'state' is accessed directly using
-	 * offsetof() (where its offset happens to be zero), while the
-	 * remaining fields which are all uint64_t, start at 'offset'
-	 * which we tweak here by adding 4.
+	 * The only difference between 32-bit and 64-bit versions of the
+	 * runstate struct is the alignment of uint64_t in 32-bit, which
+	 * means that the 64-bit version has an additional 4 bytes of
+	 * padding after the first field 'state'.
+	 *
+	 * So we use 'int __user *user_state' to point to the state field,
+	 * and 'uint64_t __user *user_times' for runstate_entry_time. So
+	 * the actual array of time[] in each state starts at user_times[1].
 	 */
+	BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state) != 0);
+	BUILD_BUG_ON(offsetof(struct compat_vcpu_runstate_info, state) != 0);
+	user_state = (int __user *)ghc->hva;
+
+	BUILD_BUG_ON(sizeof(struct compat_vcpu_runstate_info) != 0x2c);
+
+	user_times = (uint64_t __user *)(ghc->hva +
+					 offsetof(struct compat_vcpu_runstate_info,
+						  state_entry_time));
+#ifdef CONFIG_X86_64
 	BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state_entry_time) !=
 		     offsetof(struct compat_vcpu_runstate_info, state_entry_time) + 4);
 	BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, time) !=
 		     offsetof(struct compat_vcpu_runstate_info, time) + 4);
 
 	if (v->kvm->arch.xen.long_mode)
-		offset = offsetof(struct vcpu_runstate_info, state_entry_time);
+		user_times = (uint64_t __user *)(ghc->hva +
+						 offsetof(struct vcpu_runstate_info,
+							  state_entry_time));
 #endif
 	/*
 	 * First write the updated state_entry_time at the appropriate
@@ -132,28 +157,21 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
 	BUILD_BUG_ON(sizeof(((struct compat_vcpu_runstate_info *)0)->state_entry_time) !=
 		     sizeof(state_entry_time));
 
-	if (kvm_write_guest_offset_cached(v->kvm, &v->arch.xen.runstate_cache,
-					  &state_entry_time, offset,
-					  sizeof(state_entry_time)))
-		return;
+	if (__put_user(state_entry_time, user_times))
+		goto out;
 	smp_wmb();
 
 	/*
 	 * Next, write the new runstate. This is in the *same* place
 	 * for 32-bit and 64-bit guests, asserted here for paranoia.
 	 */
-	BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state) !=
-		     offsetof(struct compat_vcpu_runstate_info, state));
 	BUILD_BUG_ON(sizeof(((struct vcpu_runstate_info *)0)->state) !=
 		     sizeof(vx->current_runstate));
 	BUILD_BUG_ON(sizeof(((struct compat_vcpu_runstate_info *)0)->state) !=
 		     sizeof(vx->current_runstate));
 
-	if (kvm_write_guest_offset_cached(v->kvm, &v->arch.xen.runstate_cache,
-					  &vx->current_runstate,
-					  offsetof(struct vcpu_runstate_info, state),
-					  sizeof(vx->current_runstate)))
-		return;
+	if (__put_user(vx->current_runstate, user_state))
+		goto out;
 
 	/*
 	 * Write the actual runstate times immediately after the
@@ -168,24 +186,21 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
 	BUILD_BUG_ON(sizeof(((struct vcpu_runstate_info *)0)->time) !=
 		     sizeof(vx->runstate_times));
 
-	if (kvm_write_guest_offset_cached(v->kvm, &v->arch.xen.runstate_cache,
-					  &vx->runstate_times[0],
-					  offset + sizeof(u64),
-					  sizeof(vx->runstate_times)))
-		return;
-
+	if (__copy_to_user(user_times + 1, vx->runstate_times, sizeof(vx->runstate_times)))
+		goto out;
 	smp_wmb();
 
 	/*
 	 * Finally, clear the XEN_RUNSTATE_UPDATE bit in the guest's
 	 * runstate_entry_time field.
 	 */
-
 	state_entry_time &= ~XEN_RUNSTATE_UPDATE;
-	if (kvm_write_guest_offset_cached(v->kvm, &v->arch.xen.runstate_cache,
-					  &state_entry_time, offset,
-					  sizeof(state_entry_time)))
-		return;
+	__put_user(state_entry_time, user_times);
+	smp_wmb();
+
+ out:
+	if (atomic)
+		pagefault_enable();
 }
 
 int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
@@ -337,6 +352,12 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
 			break;
 		}
 
+		/* It must fit within a single page */
+		if ((data->u.gpa & ~PAGE_MASK) + sizeof(struct vcpu_info) > PAGE_SIZE) {
+			r = -EINVAL;
+			break;
+		}
+
 		r = kvm_gfn_to_hva_cache_init(vcpu->kvm,
 					      &vcpu->arch.xen.vcpu_info_cache,
 					      data->u.gpa,
@@ -354,6 +375,12 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
 			break;
 		}
 
+		/* It must fit within a single page */
+		if ((data->u.gpa & ~PAGE_MASK) + sizeof(struct pvclock_vcpu_time_info) > PAGE_SIZE) {
+			r = -EINVAL;
+			break;
+		}
+
 		r = kvm_gfn_to_hva_cache_init(vcpu->kvm,
 					      &vcpu->arch.xen.vcpu_time_info_cache,
 					      data->u.gpa,
@@ -375,6 +402,12 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
 			break;
 		}
 
+		/* It must fit within a single page */
+		if ((data->u.gpa & ~PAGE_MASK) + sizeof(struct vcpu_runstate_info) > PAGE_SIZE) {
+			r = -EINVAL;
+			break;
+		}
+
 		r = kvm_gfn_to_hva_cache_init(vcpu->kvm,
 					      &vcpu->arch.xen.runstate_cache,
 					      data->u.gpa,








^ permalink raw reply related	[flat|nested] 28+ messages in thread

* Re: [PATCH] KVM: x86/xen: Fix runstate updates to be atomic when preempting vCPU
  2021-10-23 19:47 [PATCH] KVM: x86/xen: Fix runstate updates to be atomic when preempting vCPU Woodhouse, David
@ 2021-10-24 11:15 ` David Woodhouse
  2021-10-25 10:23 ` Paolo Bonzini
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 28+ messages in thread
From: David Woodhouse @ 2021-10-24 11:15 UTC (permalink / raw)
  To: kvm; +Cc: jmattson, pbonzini, wanpengli, seanjc, vkuznets, mtosatti, joro

[-- Attachment #1: Type: text/plain, Size: 427 bytes --]

On Sat, 2021-10-23 at 19:47 +0000, Woodhouse, David wrote:
> 
> +       __put_user(state_entry_time, user_times);
> +       smp_wmb();
> +
> + out:

Needs a mark_page_dirty_in_slot() here, which I knew at the time I
*started* typing, but forgot by the time I got to the end. I'll include
that in a v2 but wait for any other feedback before I post it.

> +       if (atomic)
> +               pagefault_enable();
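
Concretely, the end of the function will then look something like:

 out:
	mark_page_dirty_in_slot(v->kvm, ghc->memslot, ghc->gpa >> PAGE_SHIFT);

	if (atomic)
		pagefault_enable();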


[-- Attachment #2: smime.p7s --]
[-- Type: application/pkcs7-signature, Size: 5174 bytes --]

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] KVM: x86/xen: Fix runstate updates to be atomic when preempting vCPU
  2021-10-23 19:47 [PATCH] KVM: x86/xen: Fix runstate updates to be atomic when preempting vCPU Woodhouse, David
  2021-10-24 11:15 ` David Woodhouse
@ 2021-10-25 10:23 ` Paolo Bonzini
  2021-10-25 10:39   ` [EXTERNAL] " David Woodhouse
  2021-10-25 11:43 ` Paolo Bonzini
  2021-10-25 13:29 ` [PATCH v2] " David Woodhouse
  3 siblings, 1 reply; 28+ messages in thread
From: Paolo Bonzini @ 2021-10-25 10:23 UTC (permalink / raw)
  To: Woodhouse, David, kvm
  Cc: jmattson, wanpengli, seanjc, vkuznets, mtosatti, joro

On 23/10/21 21:47, Woodhouse, David wrote:
> I almost switched to using a gfn_to_pfn_cache here and bailing out if
> kvm_map_gfn() fails, like kvm_steal_time_set_preempted() does — but on
> closer inspection it looks like kvm_map_gfn() will*always*  fail in
> atomic context for a page in IOMEM, which means it will silently fail
> to make the update every single time for such guests, AFAICT. So I
> didn't do it that way after all. And will probably fix that one too.

Not every single time, only if the cache is absent, stale or not 
initialized.

In the case of steal time, record_steal_time can update the cache 
because it's never called from atomic context, while 
kvm_steal_time_set_preempted is just advisory and therefore can fail 
just fine.

One possible solution (which I even have unfinished patches for) is to 
put all the gfn_to_pfn_caches on a list, and refresh them when the MMU 
notifier receives an invalidation.
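
Roughly, the idea is something like this (the 'gpc_list' and 'valid' members are
hypothetical here, not what's in those unfinished patches):

	static void gpc_invalidate_range(struct kvm *kvm, unsigned long start,
					 unsigned long end)
	{
		struct gfn_to_pfn_cache *gpc;

		/* Called from the MMU notifier invalidation path */
		list_for_each_entry(gpc, &kvm->gpc_list, list) {
			unsigned long hva = gfn_to_hva(kvm, gpc->gfn);

			if (hva >= start && hva < end)
				gpc->valid = false;	/* refreshed on next use */
		}
	}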

Paolo


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [EXTERNAL] [PATCH] KVM: x86/xen: Fix runstate updates to be atomic when preempting vCPU
  2021-10-25 10:23 ` Paolo Bonzini
@ 2021-10-25 10:39   ` David Woodhouse
  2021-10-25 12:12     ` Paolo Bonzini
  2021-10-25 12:19     ` David Woodhouse
  0 siblings, 2 replies; 28+ messages in thread
From: David Woodhouse @ 2021-10-25 10:39 UTC (permalink / raw)
  To: Paolo Bonzini, kvm, Raslan, KarimAllah
  Cc: jmattson, wanpengli, seanjc, vkuznets, mtosatti, joro

[-- Attachment #1: Type: text/plain, Size: 2014 bytes --]

On Mon, 2021-10-25 at 12:23 +0200, Paolo Bonzini wrote:
> On 23/10/21 21:47, Woodhouse, David wrote:
> > I almost switched to using a gfn_to_pfn_cache here and bailing out if
> > kvm_map_gfn() fails, like kvm_steal_time_set_preempted() does — but on
> > closer inspection it looks like kvm_map_gfn() will*always*  fail in
> > atomic context for a page in IOMEM, which means it will silently fail
> > to make the update every single time for such guests, AFAICT. So I
> > didn't do it that way after all. And will probably fix that one too.
> 
> 
> 
> Not every single time, only if the cache is absent, stale or not
> initialized.

Hm, my reading of it suggests that it will fail even when the cache is
valid, on IOMEM PFNs for which pfn_valid() is not set:

        if (pfn_valid(pfn)) {
                page = pfn_to_page(pfn);
                if (atomic)
                        hva = kmap_atomic(page);
                else
                        hva = kmap(page);
#ifdef CONFIG_HAS_IOMEM
        } else if (!atomic) {
                hva = memremap(pfn_to_hpa(pfn), PAGE_SIZE, MEMREMAP_WB);
        } else {
                return -EINVAL;
#endif
        }

> In the case of steal time, record_steal_time can update the cache
> because it's never called from atomic context, while
> kvm_steal_time_set_preempted is just advisory and therefore can fail
> just fine.

Sure, but it's an 'advisory' thing which is used for optimisations in
the guest like spotting that the holder of a spinlock is preempted, and
yielding to it? We didn't do those optimisations just for fun, did we?

> One possible solution (which I even have unfinished patches for) is to
> put all the gfn_to_pfn_caches on a list, and refresh them when the MMU
> notifier receives an invalidation.

For this use case I'm not even sure why I'd *want* to cache the PFN and
explicitly kmap/memremap it, when surely by *definition* there's a
perfectly serviceable HVA which already points to it? 

[-- Attachment #2: smime.p7s --]
[-- Type: application/pkcs7-signature, Size: 5174 bytes --]

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] KVM: x86/xen: Fix runstate updates to be atomic when preempting vCPU
  2021-10-23 19:47 [PATCH] KVM: x86/xen: Fix runstate updates to be atomic when preempting vCPU Woodhouse, David
  2021-10-24 11:15 ` David Woodhouse
  2021-10-25 10:23 ` Paolo Bonzini
@ 2021-10-25 11:43 ` Paolo Bonzini
  2021-10-25 13:30   ` Woodhouse, David
  2021-10-25 13:29 ` [PATCH v2] " David Woodhouse
  3 siblings, 1 reply; 28+ messages in thread
From: Paolo Bonzini @ 2021-10-25 11:43 UTC (permalink / raw)
  To: Woodhouse, David, kvm
  Cc: jmattson, wanpengli, seanjc, vkuznets, mtosatti, joro

On 23/10/21 21:47, Woodhouse, David wrote:
>   	BUILD_BUG_ON(sizeof(((struct vcpu_runstate_info *)0)->state) !=
>   		     sizeof(vx->current_runstate));
>   	BUILD_BUG_ON(sizeof(((struct compat_vcpu_runstate_info *)0)->state) !=
>   		     sizeof(vx->current_runstate));

We can also use sizeof_field here, while you're at it (separate patch, 
though).

Paolo


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [EXTERNAL] [PATCH] KVM: x86/xen: Fix runstate updates to be atomic when preempting vCPU
  2021-10-25 10:39   ` [EXTERNAL] " David Woodhouse
@ 2021-10-25 12:12     ` Paolo Bonzini
  2021-10-25 12:19     ` David Woodhouse
  1 sibling, 0 replies; 28+ messages in thread
From: Paolo Bonzini @ 2021-10-25 12:12 UTC (permalink / raw)
  To: David Woodhouse, kvm, Raslan, KarimAllah
  Cc: jmattson, wanpengli, seanjc, vkuznets, mtosatti, joro

On 25/10/21 12:39, David Woodhouse wrote:
>> Not every single time, only if the cache is absent, stale or not
>> initialized.
> Hm, my reading of it suggests that it will fail even when the cache is
> valid, on IOMEM PFNs for which pfn_valid() is not set:
> 
>          if (pfn_valid(pfn)) {
>                  page = pfn_to_page(pfn);
>                  if (atomic)
>                          hva = kmap_atomic(page);
>                  else
>                          hva = kmap(page);
> #ifdef CONFIG_HAS_IOMEM
>          } else if (!atomic) {
>                  hva = memremap(pfn_to_hpa(pfn), PAGE_SIZE, MEMREMAP_WB);
>          } else {
>                  return -EINVAL;
> #endif
>          }
> 

Yeah, you're right.  That's the "if" above.

> For this use case I'm not even sure why I'd *want* to cache the PFN and
> explicitly kmap/memremap it, when surely by *definition* there's a
> perfectly serviceable HVA which already points to it? 

The point of the gfn_to_pfn cache would be to know in advance that there 
won't be a page fault in atomic context.  You certainly don't want to 
memremap/memunmap it here, it would be awfully slow, but pulling the 
kmap/memremap to the MMU notifier would make sense.

Paolo


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [EXTERNAL] [PATCH] KVM: x86/xen: Fix runstate updates to be atomic when preempting vCPU
  2021-10-25 10:39   ` [EXTERNAL] " David Woodhouse
  2021-10-25 12:12     ` Paolo Bonzini
@ 2021-10-25 12:19     ` David Woodhouse
  2021-10-25 12:22       ` Paolo Bonzini
  2021-10-28 22:22       ` [EXTERNAL] [PATCH] KVM: x86/xen: Fix runstate updates to be atomic when preempting vCPU David Woodhouse
  1 sibling, 2 replies; 28+ messages in thread
From: David Woodhouse @ 2021-10-25 12:19 UTC (permalink / raw)
  To: Paolo Bonzini, kvm, Raslan, KarimAllah, Joao Martins, Ankur Arora
  Cc: jmattson, wanpengli, seanjc, vkuznets, mtosatti, joro

[-- Attachment #1: Type: text/plain, Size: 3266 bytes --]

On Mon, 2021-10-25 at 11:39 +0100, David Woodhouse wrote:
> > One possible solution (which I even have unfinished patches for) is to
> > put all the gfn_to_pfn_caches on a list, and refresh them when the MMU
> > notifier receives an invalidation.
> 
> For this use case I'm not even sure why I'd *want* to cache the PFN and
> explicitly kmap/memremap it, when surely by *definition* there's a
> perfectly serviceable HVA which already points to it?

That's indeed true for *this* use case but my *next* use case is
actually implementing the event channel delivery.

What we have in-kernel already is everything we absolutely *need* in
order to host Xen guests, but I really do want to fix the fact that
even IPIs and timers are bouncing up through userspace.

Xen 2-level event channel delivery is a series of test-and-set
operations. For delivering a given port#, we:

 • Test-and-set the corresponding port# bit in the shared info page.
   If it was already set, we're done.

 • Test the corresponding 'masked' bit in the shared info page. If it
   was already set, we're done.

 • Test-and-set the bit in the target vcpu_info 'evtchn_pending_sel'
   which corresponds to the *word* in which the port# resides. If it
   was already set, we're done.
 
 • Set the 'evtchn_upcall_pending' bit in the target vcpu_info to
   trigger the vector delivery.

In João and Ankur's original version¹ this was really simple; it looked
like this:

	if (test_and_set_bit(p, (unsigned long *) shared_info->evtchn_pending))
		return 1;

	if (!test_bit(p, (unsigned long *) shared_info->evtchn_mask) &&
	    !test_and_set_bit(p / BITS_PER_EVTCHN_WORD,
			      (unsigned long *) &vcpu_info->evtchn_pending_sel))
		return kvm_xen_evtchn_2l_vcpu_set_pending(vcpu_info);

Yay for permanently pinned pages! :)

So, with a fixed version of kvm_map_gfn() I suppose I could do the
same, but that's *two* maps/unmaps for each interrupt? That's probably
worse than just bouncing out and letting userspace do it!

So even for the event channel delivery use case, if I'm not allowed to
just pin the pages permanently then I stand by the observation that I
*have* a perfectly serviceable HVA for it already.

I can even do the test-and-set in userspace based on the futex
primitives, but the annoying part is that if the page does end up
absent, I need to *store* the pending operation because there will be
times when we're trying to deliver interrupts but *can't* sleep and
wait for the page. So that probably means 512 bytes of evtchn bitmap
*per vcpu* in order to store the event channels which are pending for
each vCPU, and a way to replay them from a context which *can* sleep.
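
As an illustration only (the helper names here are made up), the shape of that
would be roughly:

	/* In the per-vCPU Xen state: 4096 ports / 8 bits per byte = 512 bytes */
	DECLARE_BITMAP(evtchn_pending_shadow, 4096);

	/* Atomic path: just record the port and poke the vCPU */
	static void xen_evtchn_defer(struct kvm_vcpu *vcpu, unsigned int port)
	{
		set_bit(port, vcpu->arch.xen.evtchn_pending_shadow);
		kvm_make_request(KVM_REQ_EVENT, vcpu);
		kvm_vcpu_kick(vcpu);
	}

	/* vCPU context (may sleep): replay anything that got deferred */
	static void xen_evtchn_replay(struct kvm_vcpu *vcpu)
	{
		unsigned int port;

		for_each_set_bit(port, vcpu->arch.xen.evtchn_pending_shadow, 4096) {
			/* hypothetical helper, returns 0 once set in the guest */
			if (!kvm_xen_deliver_port(vcpu, port))
				clear_bit(port, vcpu->arch.xen.evtchn_pending_shadow);
		}
	}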

And if I have *that* then I might as well use it to solve the problem
of the gpa_to_hva_cache being single-threaded, and let a vCPU do its
own writes to its vcpu_info *every* time.

With perhaps a little more thinking about how I use a gpa_to_hva_cache
for the shinfo page (which you removed in commit 319afe68), but perhaps
starting with the observation that it's only not thread-capable when
it's *invalid* and needs to be refreshed...


¹ https://lore.kernel.org/lkml/20190220201609.28290-12-joao.m.martins@oracle.com/


[-- Attachment #2: smime.p7s --]
[-- Type: application/pkcs7-signature, Size: 5174 bytes --]

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [EXTERNAL] [PATCH] KVM: x86/xen: Fix runstate updates to be atomic when preempting vCPU
  2021-10-25 12:19     ` David Woodhouse
@ 2021-10-25 12:22       ` Paolo Bonzini
  2021-10-25 13:13         ` David Woodhouse
  2021-10-30  7:58         ` David Woodhouse
  2021-10-28 22:22       ` [EXTERNAL] [PATCH] KVM: x86/xen: Fix runstate updates to be atomic when preempting vCPU David Woodhouse
  1 sibling, 2 replies; 28+ messages in thread
From: Paolo Bonzini @ 2021-10-25 12:22 UTC (permalink / raw)
  To: David Woodhouse, kvm, Raslan, KarimAllah, Joao Martins, Ankur Arora
  Cc: jmattson, wanpengli, seanjc, vkuznets, mtosatti, joro

On 25/10/21 14:19, David Woodhouse wrote:
> So, with a fixed version of kvm_map_gfn() I suppose I could do the
> same, but that's*two*  maps/unmaps for each interrupt? That's probably
> worse than just bouncing out and letting userspace do it!

Absolutely!  The fixed version of kvm_map_gfn should not do any 
map/unmap, it should do it eagerly on MMU notifier operations.

Paolo


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [EXTERNAL] [PATCH] KVM: x86/xen: Fix runstate updates to be atomic when preempting vCPU
  2021-10-25 12:22       ` Paolo Bonzini
@ 2021-10-25 13:13         ` David Woodhouse
  2021-10-29 10:48           ` David Woodhouse
  2021-10-30  7:58         ` David Woodhouse
  1 sibling, 1 reply; 28+ messages in thread
From: David Woodhouse @ 2021-10-25 13:13 UTC (permalink / raw)
  To: Paolo Bonzini, kvm, Raslan, KarimAllah, Joao Martins, Ankur Arora
  Cc: jmattson, wanpengli, seanjc, vkuznets, mtosatti, joro

[-- Attachment #1: Type: text/plain, Size: 1056 bytes --]

On Mon, 2021-10-25 at 14:22 +0200, Paolo Bonzini wrote:
> On 25/10/21 14:19, David Woodhouse wrote:
> > So, with a fixed version of kvm_map_gfn() I suppose I could do the
> > same, but that's*two*  maps/unmaps for each interrupt? That's probably
> > worse than just bouncing out and letting userspace do it!
> 
> 
> 
> Absolutely!  The fixed version of kvm_map_gfn should not do any
> map/unmap, it should do it eagerly on MMU notifier operations.

When you put it like that, it just seems so stunningly redundant :)

"When we get notified that the guest HVA has been mapped, we create our
own kernel mapping of the same page. When we are notified that the guest
HVA gets unmapped, we tear down our kernel mapping of it."

The really important part of that is the *synchronisation*, using the
notifier to send a request to each vCPU to ensure that they aren't
currently *using* the virtual address in question.

If we can get that part right, then perhaps it shouldn't *matter*
whether the HVA in question is a guest or a kernel one?
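
A minimal sketch of just that part (the request name is invented for
illustration):

	/* Invalidation side: mark the cache stale... */
	gpc->valid = false;
	smp_wmb();

	/*
	 * ...then force every vCPU out of guest mode. Each vCPU refreshes
	 * the cache before it can touch the mapping again.
	 */
	kvm_make_all_cpus_request(kvm, KVM_REQ_GPC_REFRESH);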

[-- Attachment #2: smime.p7s --]
[-- Type: application/pkcs7-signature, Size: 5174 bytes --]

^ permalink raw reply	[flat|nested] 28+ messages in thread

* [PATCH v2] KVM: x86/xen: Fix runstate updates to be atomic when preempting vCPU
  2021-10-23 19:47 [PATCH] KVM: x86/xen: Fix runstate updates to be atomic when preempting vCPU Woodhouse, David
                   ` (2 preceding siblings ...)
  2021-10-25 11:43 ` Paolo Bonzini
@ 2021-10-25 13:29 ` David Woodhouse
  2022-02-09 14:30   ` David Woodhouse
  3 siblings, 1 reply; 28+ messages in thread
From: David Woodhouse @ 2021-10-25 13:29 UTC (permalink / raw)
  To: kvm; +Cc: jmattson, pbonzini, wanpengli, seanjc, vkuznets, mtosatti, joro

[-- Attachment #1: Type: text/plain, Size: 7927 bytes --]

From: David Woodhouse <dwmw@amazon.co.uk>

There are circumstances when kvm_xen_update_runstate_guest() should not
sleep because it ends up being called from __schedule() when the vCPU
is preempted:

[  222.830825]  kvm_xen_update_runstate_guest+0x24/0x100
[  222.830878]  kvm_arch_vcpu_put+0x14c/0x200
[  222.830920]  kvm_sched_out+0x30/0x40
[  222.830960]  __schedule+0x55c/0x9f0

To handle this, make it use the same trick as __kvm_xen_has_interrupt(),
of using the hva from the gfn_to_hva_cache directly. Then it can use
pagefault_disable() around the accesses and just bail out if the page
is absent (which is unlikely).

I almost switched to using a gfn_to_pfn_cache here and bailing out if
kvm_map_gfn() fails, like kvm_steal_time_set_preempted() does — but on
closer inspection it looks like kvm_map_gfn() will *always* fail in
atomic context for a page in IOMEM, which means it will silently fail
to make the update every single time for such guests, AFAICT. So I
didn't do it that way after all. And will probably fix that one too.

Cc: stable@vger.kernel.org
Fixes: 30b5c851af79 ("KVM: x86/xen: Add support for vCPU runstate information")
Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
---
v2: Mark the page dirty after writing to it, add stable tag.

 arch/x86/kvm/xen.c | 99 +++++++++++++++++++++++++++++++---------------
 1 file changed, 67 insertions(+), 32 deletions(-)

diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index 8f62baebd028..a0bd0014ec66 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -93,32 +93,57 @@ static void kvm_xen_update_runstate(struct kvm_vcpu *v, int state)
 void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
 {
 	struct kvm_vcpu_xen *vx = &v->arch.xen;
+	struct gfn_to_hva_cache *ghc = &vx->runstate_cache;
+	struct kvm_memslots *slots = kvm_memslots(v->kvm);
+	bool atomic = (state == RUNSTATE_runnable);
 	uint64_t state_entry_time;
-	unsigned int offset;
+	int __user *user_state;
+	uint64_t __user *user_times;
 
 	kvm_xen_update_runstate(v, state);
 
 	if (!vx->runstate_set)
 		return;
 
-	BUILD_BUG_ON(sizeof(struct compat_vcpu_runstate_info) != 0x2c);
+	if (unlikely(slots->generation != ghc->generation || kvm_is_error_hva(ghc->hva)) &&
+	    kvm_gfn_to_hva_cache_init(v->kvm, ghc, ghc->gpa, ghc->len))
+		return;
+
+	/* We made sure it fits in a single page */
+	BUG_ON(!ghc->memslot);
+
+	if (atomic)
+		pagefault_disable();
 
-	offset = offsetof(struct compat_vcpu_runstate_info, state_entry_time);
-#ifdef CONFIG_X86_64
 	/*
-	 * The only difference is alignment of uint64_t in 32-bit.
-	 * So the first field 'state' is accessed directly using
-	 * offsetof() (where its offset happens to be zero), while the
-	 * remaining fields which are all uint64_t, start at 'offset'
-	 * which we tweak here by adding 4.
+	 * The only difference between 32-bit and 64-bit versions of the
+	 * runstate struct is the alignment of uint64_t in 32-bit, which
+	 * means that the 64-bit version has an additional 4 bytes of
+	 * padding after the first field 'state'.
+	 *
+	 * So we use 'int __user *user_state' to point to the state field,
+	 * and 'uint64_t __user *user_times' for runstate_entry_time. So
+	 * the actual array of time[] in each state starts at user_times[1].
 	 */
+	BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state) != 0);
+	BUILD_BUG_ON(offsetof(struct compat_vcpu_runstate_info, state) != 0);
+	user_state = (int __user *)ghc->hva;
+
+	BUILD_BUG_ON(sizeof(struct compat_vcpu_runstate_info) != 0x2c);
+
+	user_times = (uint64_t __user *)(ghc->hva +
+					 offsetof(struct compat_vcpu_runstate_info,
+						  state_entry_time));
+#ifdef CONFIG_X86_64
 	BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state_entry_time) !=
 		     offsetof(struct compat_vcpu_runstate_info, state_entry_time) + 4);
 	BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, time) !=
 		     offsetof(struct compat_vcpu_runstate_info, time) + 4);
 
 	if (v->kvm->arch.xen.long_mode)
-		offset = offsetof(struct vcpu_runstate_info, state_entry_time);
+		user_times = (uint64_t __user *)(ghc->hva +
+						 offsetof(struct vcpu_runstate_info,
+							  state_entry_time));
 #endif
 	/*
 	 * First write the updated state_entry_time at the appropriate
@@ -132,28 +157,21 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
 	BUILD_BUG_ON(sizeof(((struct compat_vcpu_runstate_info *)0)->state_entry_time) !=
 		     sizeof(state_entry_time));
 
-	if (kvm_write_guest_offset_cached(v->kvm, &v->arch.xen.runstate_cache,
-					  &state_entry_time, offset,
-					  sizeof(state_entry_time)))
-		return;
+	if (__put_user(state_entry_time, user_times))
+		goto out;
 	smp_wmb();
 
 	/*
 	 * Next, write the new runstate. This is in the *same* place
 	 * for 32-bit and 64-bit guests, asserted here for paranoia.
 	 */
-	BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state) !=
-		     offsetof(struct compat_vcpu_runstate_info, state));
 	BUILD_BUG_ON(sizeof(((struct vcpu_runstate_info *)0)->state) !=
 		     sizeof(vx->current_runstate));
 	BUILD_BUG_ON(sizeof(((struct compat_vcpu_runstate_info *)0)->state) !=
 		     sizeof(vx->current_runstate));
 
-	if (kvm_write_guest_offset_cached(v->kvm, &v->arch.xen.runstate_cache,
-					  &vx->current_runstate,
-					  offsetof(struct vcpu_runstate_info, state),
-					  sizeof(vx->current_runstate)))
-		return;
+	if (__put_user(vx->current_runstate, user_state))
+		goto out;
 
 	/*
 	 * Write the actual runstate times immediately after the
@@ -168,24 +186,23 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
 	BUILD_BUG_ON(sizeof(((struct vcpu_runstate_info *)0)->time) !=
 		     sizeof(vx->runstate_times));
 
-	if (kvm_write_guest_offset_cached(v->kvm, &v->arch.xen.runstate_cache,
-					  &vx->runstate_times[0],
-					  offset + sizeof(u64),
-					  sizeof(vx->runstate_times)))
-		return;
-
+	if (__copy_to_user(user_times + 1, vx->runstate_times, sizeof(vx->runstate_times)))
+		goto out;
 	smp_wmb();
 
 	/*
 	 * Finally, clear the XEN_RUNSTATE_UPDATE bit in the guest's
 	 * runstate_entry_time field.
 	 */
-
 	state_entry_time &= ~XEN_RUNSTATE_UPDATE;
-	if (kvm_write_guest_offset_cached(v->kvm, &v->arch.xen.runstate_cache,
-					  &state_entry_time, offset,
-					  sizeof(state_entry_time)))
-		return;
+	__put_user(state_entry_time, user_times);
+	smp_wmb();
+
+ out:
+	mark_page_dirty_in_slot(v->kvm, ghc->memslot, ghc->gpa >> PAGE_SHIFT);
+
+	if (atomic)
+		pagefault_enable();
 }
 
 int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
@@ -337,6 +354,12 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
 			break;
 		}
 
+		/* It must fit within a single page */
+		if ((data->u.gpa & ~PAGE_MASK) + sizeof(struct vcpu_info) > PAGE_SIZE) {
+			r = -EINVAL;
+			break;
+		}
+
 		r = kvm_gfn_to_hva_cache_init(vcpu->kvm,
 					      &vcpu->arch.xen.vcpu_info_cache,
 					      data->u.gpa,
@@ -354,6 +377,12 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
 			break;
 		}
 
+		/* It must fit within a single page */
+		if ((data->u.gpa & ~PAGE_MASK) + sizeof(struct pvclock_vcpu_time_info) > PAGE_SIZE) {
+			r = -EINVAL;
+			break;
+		}
+
 		r = kvm_gfn_to_hva_cache_init(vcpu->kvm,
 					      &vcpu->arch.xen.vcpu_time_info_cache,
 					      data->u.gpa,
@@ -375,6 +404,12 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
 			break;
 		}
 
+		/* It must fit within a single page */
+		if ((data->u.gpa & ~PAGE_MASK) + sizeof(struct vcpu_runstate_info) > PAGE_SIZE) {
+			r = -EINVAL;
+			break;
+		}
+
 		r = kvm_gfn_to_hva_cache_init(vcpu->kvm,
 					      &vcpu->arch.xen.runstate_cache,
 					      data->u.gpa,
-- 
2.25.1


[-- Attachment #2: smime.p7s --]
[-- Type: application/pkcs7-signature, Size: 5174 bytes --]

^ permalink raw reply related	[flat|nested] 28+ messages in thread

* Re: [PATCH] KVM: x86/xen: Fix runstate updates to be atomic when preempting vCPU
  2021-10-25 11:43 ` Paolo Bonzini
@ 2021-10-25 13:30   ` Woodhouse, David
  0 siblings, 0 replies; 28+ messages in thread
From: Woodhouse, David @ 2021-10-25 13:30 UTC (permalink / raw)
  To: kvm, pbonzini; +Cc: jmattson, wanpengli, seanjc, vkuznets, mtosatti, joro

On Mon, 2021-10-25 at 13:43 +0200, Paolo Bonzini wrote:
> On 23/10/21 21:47, Woodhouse, David wrote:
> >   	BUILD_BUG_ON(sizeof(((struct vcpu_runstate_info *)0)->state) !=
> >   		     sizeof(vx->current_runstate));
> >   	BUILD_BUG_ON(sizeof(((struct compat_vcpu_runstate_info *)0)->state) !=
> >   		     sizeof(vx->current_runstate));
> 
> We can also use sizeof_field here, while you're at it (separate patch, 
> though).

Ack. I'm about to work on event channel delivery from within the
kernel, so I'll do so as part of that series.

Thanks.






^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [EXTERNAL] [PATCH] KVM: x86/xen: Fix runstate updates to be atomic when preempting vCPU
  2021-10-25 12:19     ` David Woodhouse
  2021-10-25 12:22       ` Paolo Bonzini
@ 2021-10-28 22:22       ` David Woodhouse
  2021-10-29 11:31         ` Joao Martins
  1 sibling, 1 reply; 28+ messages in thread
From: David Woodhouse @ 2021-10-28 22:22 UTC (permalink / raw)
  To: Paolo Bonzini, kvm, Raslan, KarimAllah, Joao Martins, Ankur Arora
  Cc: jmattson, wanpengli, seanjc, vkuznets, mtosatti, joro

[-- Attachment #1: Type: text/plain, Size: 13745 bytes --]

On Mon, 2021-10-25 at 13:19 +0100, David Woodhouse wrote:
> On Mon, 2021-10-25 at 11:39 +0100, David Woodhouse wrote:
> > > One possible solution (which I even have unfinished patches for) is to
> > > put all the gfn_to_pfn_caches on a list, and refresh them when the MMU
> > > notifier receives an invalidation.
> > 
> > For this use case I'm not even sure why I'd *want* to cache the PFN and
> > explicitly kmap/memremap it, when surely by *definition* there's a
> > perfectly serviceable HVA which already points to it?
> 
> That's indeed true for *this* use case but my *next* use case is
> actually implementing the event channel delivery.
> 
> What we have in-kernel already is everything we absolutely *need* in
> order to host Xen guests, but I really do want to fix the fact that
> even IPIs and timers are bouncing up through userspace.

Here's a completely untested attempt, in which all the complexity is
based around the fact that I can't just pin the pages as João and
Ankur's original did.

It adds a new KVM_IRQ_ROUTING_XEN_EVTCHN with an ABI that allows for us
to add FIFO event channels, but for now only supports 2 level.

In kvm_xen_set_evtchn() I currently use kvm_map_gfn() *without* a cache
at all, but I'll work something out for that. I think I can use a
gfn_to_hva_cache (like the one removed in commit 319afe685) and in the
rare case that it's invalid, I can take kvm->lock to revalidate it.

It sets the bit in the global shared info but doesn't touch the target
vCPU's vcpu_info; instead it sets a bit in an *in-kernel* shadow of the
target's evtchn_pending_sel word, and kicks the vCPU.

That shadow is actually synced to the guest's vcpu_info struct in
kvm_xen_has_interrupt(). There's a little bit of fun asm there to set
the bits in the userspace struct and then clear the same set of bits in
the kernel shadow *if* the first op didn't fault. Or such is the
intent; I didn't hook up a test yet.

As things stand, I should be able to use this for delivery of PIRQs
from my VMM, where things like passed-through PCI MSI gets turned into
Xen event channels. As well as KVM unit tests, of course.

The plan is then to hook up IPIs and timers — again based on the Oracle
code from before, but using eventfds for the actual evtchn delivery. 

From be4b79e54ed07bbd2e4310a6da9e990efa6fbc6e Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw@amazon.co.uk>
Date: Thu, 28 Oct 2021 23:10:31 +0100
Subject: [PATCH] KVM: x86/xen: First attempt at KVM_IRQ_ROUTING_XEN_EVTCHN

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
---
 arch/x86/include/asm/kvm_host.h |   1 +
 arch/x86/kvm/irq_comm.c         |  12 +++
 arch/x86/kvm/xen.c              | 176 +++++++++++++++++++++++++++++++-
 arch/x86/kvm/xen.h              |   6 ++
 include/linux/kvm_host.h        |   7 ++
 include/uapi/linux/kvm.h        |  10 ++
 6 files changed, 207 insertions(+), 5 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 70771376e246..e1a4521ae838 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -606,6 +606,7 @@ struct kvm_vcpu_xen {
 	u64 last_steal;
 	u64 runstate_entry_time;
 	u64 runstate_times[4];
+	unsigned long evtchn_pending_sel;
 };
 
 struct kvm_vcpu_arch {
diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c
index d5b72a08e566..6894f9a369f2 100644
--- a/arch/x86/kvm/irq_comm.c
+++ b/arch/x86/kvm/irq_comm.c
@@ -24,6 +24,7 @@
 
 #include "hyperv.h"
 #include "x86.h"
+#include "xen.h"
 
 static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e,
 			   struct kvm *kvm, int irq_source_id, int level,
@@ -175,6 +176,13 @@ int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e,
 			return r;
 		break;
 
+#ifdef CONFIG_KVM_XEN
+	case KVM_IRQ_ROUTING_XEN_EVTCHN:
+		if (!level)
+			return -1;
+
+		return kvm_xen_set_evtchn(e, kvm, true);
+#endif
 	default:
 		break;
 	}
@@ -310,6 +318,10 @@ int kvm_set_routing_entry(struct kvm *kvm,
 		e->hv_sint.vcpu = ue->u.hv_sint.vcpu;
 		e->hv_sint.sint = ue->u.hv_sint.sint;
 		break;
+#ifdef CONFIG_KVM_XEN
+	case KVM_IRQ_ROUTING_XEN_EVTCHN:
+		return kvm_xen_setup_evtchn(kvm, e, ue);
+#endif
 	default:
 		return -EINVAL;
 	}
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index c4bca001a7c9..bff5c458af96 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -207,6 +207,8 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
 
 int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
 {
+	unsigned long evtchn_pending_sel = READ_ONCE(v->arch.xen.evtchn_pending_sel);
+	bool atomic = in_atomic() || !task_is_running(current);
 	int err;
 	u8 rc = 0;
 
@@ -216,6 +218,9 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
 	 */
 	struct gfn_to_hva_cache *ghc = &v->arch.xen.vcpu_info_cache;
 	struct kvm_memslots *slots = kvm_memslots(v->kvm);
+	bool ghc_valid = slots->generation == ghc->generation &&
+		!kvm_is_error_hva(ghc->hva) && ghc->memslot;
+
 	unsigned int offset = offsetof(struct vcpu_info, evtchn_upcall_pending);
 
 	/* No need for compat handling here */
@@ -231,8 +236,7 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
 	 * cache in kvm_read_guest_offset_cached(), but just uses
 	 * __get_user() instead. And falls back to the slow path.
 	 */
-	if (likely(slots->generation == ghc->generation &&
-		   !kvm_is_error_hva(ghc->hva) && ghc->memslot)) {
+	if (!evtchn_pending_sel && ghc_valid) {
 		/* Fast path */
 		pagefault_disable();
 		err = __get_user(rc, (u8 __user *)ghc->hva + offset);
@@ -251,12 +255,72 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
 	 * and we'll end up getting called again from a context where we *can*
 	 * fault in the page and wait for it.
 	 */
-	if (in_atomic() || !task_is_running(current))
+	if (atomic)
 		return 1;
 
-	kvm_read_guest_offset_cached(v->kvm, ghc, &rc, offset,
-				     sizeof(rc));
+	if (!ghc_valid) {
+		err = kvm_gfn_to_hva_cache_init(v->kvm, ghc, ghc->gpa, ghc->len);
+		if (err && !ghc->memslot) {
+			/*
+			 * If this failed, userspace has screwed up the
+			 * vcpu_info mapping. No interrupts for you.
+			 */
+			return 0;
+		}
+	}
 
+	/*
+	 * Now we have a valid (protected by srcu) userspace HVA in
+	 * ghc->hva which points to the struct vcpu_info. If there
+	 * are any bits in the in-kernel evtchn_pending_sel then
+	 * we need to write those to the guest vcpu_info and set
+	 * its evtchn_upcall_pending flag. If there aren't any bits
+	 * to add, we only want to *check* evtchn_upcall_pending.
+	 */
+	if (evtchn_pending_sel) {
+		if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode) {
+			struct vcpu_info __user *vi = (void *)ghc->hva;
+
+			/* Attempt to set the evtchn_pending_sel bits in the
+			 * guest, and if that succeeds then clear the same
+			 * bits in the in-kernel version. */
+			asm volatile("1:\t" LOCK_PREFIX "orq %1, %0\n"
+				     "\tnotq %0\n"
+				     "\t" LOCK_PREFIX "andq %2, %0\n"
+				     "2:\n"
+				     "\t.section .fixup,\"ax\"\n"
+				     "3:\tjmp\t2b\n"
+				     "\t.previous\n"
+				     _ASM_EXTABLE_UA(1b, 3b)
+				     : "=r" (evtchn_pending_sel)
+				     : "m" (vi->evtchn_pending_sel),
+				       "m" (v->arch.xen.evtchn_pending_sel),
+				       "0" (evtchn_pending_sel));
+		} else {
+			struct compat_vcpu_info __user *vi = (void *)ghc->hva;
+			u32 evtchn_pending_sel32 = evtchn_pending_sel;
+
+			/* Attempt to set the evtchn_pending_sel bits in the
+			 * guest, and if that succeeds then clear the same
+			 * bits in the in-kernel version. */
+			asm volatile("1:\t" LOCK_PREFIX "orl %1, %0\n"
+				     "\tnotl %0\n"
+				     "\t" LOCK_PREFIX "andl %2, %0\n"
+				     "2:\n"
+				     "\t.section .fixup,\"ax\"\n"
+				     "3:\tjmp\t2b\n"
+				     "\t.previous\n"
+				     _ASM_EXTABLE_UA(1b, 3b)
+				     : "=r" (evtchn_pending_sel32)
+				     : "m" (vi->evtchn_pending_sel),
+				       "m" (v->arch.xen.evtchn_pending_sel),
+				       "0" (evtchn_pending_sel32));
+		}
+		rc = 1;
+		__put_user(rc, (u8 __user *)ghc->hva + offset);
+	} else {
+		__get_user(rc, (u8 __user *)ghc->hva + offset);
+	}
 	return rc;
 }
 
@@ -772,3 +836,105 @@ int kvm_xen_hypercall(struct kvm_vcpu *vcpu)
 
 	return 0;
 }
+
+static inline int max_evtchn_port(struct kvm *kvm)
+{
+	if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode)
+		return 4096;
+	else
+		return 1024;
+}
+
+int kvm_xen_set_evtchn(struct kvm_kernel_irq_routing_entry *e,
+		       struct kvm *kvm, bool in_atomic)
+{
+	struct kvm_vcpu *vcpu;
+	struct kvm_host_map map;
+	unsigned long *pending_bits, *mask_bits;
+	int port_word_bit;
+	int rc;
+
+	vcpu = kvm_get_vcpu_by_id(kvm, e->xen_evtchn.vcpu);
+	if (!vcpu)
+		return -EINVAL;
+
+	if (vcpu->arch.xen.vcpu_info_set)
+		return -EINVAL;
+
+	if (e->xen_evtchn.port >= max_evtchn_port(kvm))
+		return -EINVAL;
+
+	/* With no cache this is *always* going to fail in the atomic case for now */
+	rc = kvm_map_gfn(vcpu, kvm->arch.xen.shinfo_gfn, &map, NULL, in_atomic);
+	if (rc < 0)
+		return in_atomic ? -EWOULDBLOCK : rc;
+
+	if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) {
+		struct shared_info *shinfo = map.hva;
+		pending_bits = (unsigned long *)&shinfo->evtchn_pending;
+		mask_bits = (unsigned long *)&shinfo->evtchn_mask;
+		port_word_bit = e->xen_evtchn.port / 64;
+	} else {
+		struct compat_shared_info *shinfo = map.hva;
+		pending_bits = (unsigned long *)&shinfo->evtchn_pending;
+		mask_bits = (unsigned long *)&shinfo->evtchn_mask;
+		port_word_bit = e->xen_evtchn.port / 32;
+	}
+
+	/*
+	 * If this port wasn't already set, and if it isn't masked, then
+	 * we try to set the corresponding bit in the in-kernel shadow of
+	 * evtchn_pending_sel for the target vCPU. And if *that* wasn't
+	 * already set, then we kick the vCPU in question to write to the
+	 * *real* evtchn_pending_sel in its own guest vcpu_info struct.
+	 */
+	if (!test_and_set_bit(e->xen_evtchn.port, pending_bits) &&
+	    !test_bit(e->xen_evtchn.port, mask_bits) &&
+	    !test_and_set_bit(port_word_bit, &vcpu->arch.xen.evtchn_pending_sel)) {
+		kvm_make_request(KVM_REQ_EVENT, vcpu);
+		kvm_vcpu_kick(vcpu);
+	}
+
+	kvm_unmap_gfn(vcpu, &map, NULL, true, in_atomic);
+	return rc;
+}
+
+int kvm_xen_setup_evtchn(struct kvm *kvm,
+			 struct kvm_kernel_irq_routing_entry *e,
+			 const struct kvm_irq_routing_entry *ue)
+
+{
+	struct kvm_vcpu *vcpu;
+
+	if (kvm->arch.xen.shinfo_gfn == GPA_INVALID)
+		return -EINVAL;
+
+	if (e->xen_evtchn.vcpu >= KVM_MAX_VCPUS)
+		return -EINVAL;
+
+	vcpu = kvm_get_vcpu_by_id(kvm, ue->u.xen_evtchn.vcpu);
+	if (!vcpu)
+		return -EINVAL;
+
+	if (vcpu->arch.xen.vcpu_info_set)
+		return -EINVAL;
+
+	if (!kvm->arch.xen.upcall_vector)
+		return -EINVAL;
+
+	/* Once we support the per-vCPU LAPIC based vector we will permit
+	 * that here instead of the per-KVM upcall vector */
+
+	if (e->xen_evtchn.port >= max_evtchn_port(kvm))
+		return -EINVAL;
+
+	/* We only support 2 level event channels for now */
+	if (e->xen_evtchn.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL)
+		return -EINVAL;
+
+	e->xen_evtchn.port = ue->u.xen_evtchn.port;
+	e->xen_evtchn.vcpu = ue->u.xen_evtchn.vcpu;
+	e->xen_evtchn.priority = ue->u.xen_evtchn.priority;
+
+	return 0;
+}
diff --git a/arch/x86/kvm/xen.h b/arch/x86/kvm/xen.h
index cc0cf5f37450..3e717947b928 100644
--- a/arch/x86/kvm/xen.h
+++ b/arch/x86/kvm/xen.h
@@ -24,6 +24,12 @@ int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc);
 void kvm_xen_init_vm(struct kvm *kvm);
 void kvm_xen_destroy_vm(struct kvm *kvm);
 
+int kvm_xen_set_evtchn(struct kvm_kernel_irq_routing_entry *e,
+		       struct kvm *kvm, bool in_atomic);
+int kvm_xen_setup_evtchn(struct kvm *kvm,
+			 struct kvm_kernel_irq_routing_entry *e,
+			 const struct kvm_irq_routing_entry *ue);
+
 static inline bool kvm_xen_msr_enabled(struct kvm *kvm)
 {
 	return static_branch_unlikely(&kvm_xen_enabled.key) &&
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 0f18df7fe874..9003fae1af9d 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -470,6 +470,12 @@ struct kvm_hv_sint {
 	u32 sint;
 };
 
+struct kvm_xen_evtchn {
+	u32 port;
+	u32 vcpu;
+	u32 priority;
+};
+
 struct kvm_kernel_irq_routing_entry {
 	u32 gsi;
 	u32 type;
@@ -490,6 +496,7 @@ struct kvm_kernel_irq_routing_entry {
 		} msi;
 		struct kvm_s390_adapter_int adapter;
 		struct kvm_hv_sint hv_sint;
+		struct kvm_xen_evtchn xen_evtchn;
 	};
 	struct hlist_node link;
 };
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index a067410ebea5..05391c80bb6a 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1143,11 +1143,20 @@ struct kvm_irq_routing_hv_sint {
 	__u32 sint;
 };
 
+struct kvm_irq_routing_xen_evtchn {
+	__u32 port;
+	__u32 vcpu;
+	__u32 priority;
+};
+
+#define KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL ((__u32)(-1))
+
 /* gsi routing entry types */
 #define KVM_IRQ_ROUTING_IRQCHIP 1
 #define KVM_IRQ_ROUTING_MSI 2
 #define KVM_IRQ_ROUTING_S390_ADAPTER 3
 #define KVM_IRQ_ROUTING_HV_SINT 4
+#define KVM_IRQ_ROUTING_XEN_EVTCHN 5
 
 struct kvm_irq_routing_entry {
 	__u32 gsi;
@@ -1159,6 +1168,7 @@ struct kvm_irq_routing_entry {
 		struct kvm_irq_routing_msi msi;
 		struct kvm_irq_routing_s390_adapter adapter;
 		struct kvm_irq_routing_hv_sint hv_sint;
+		struct kvm_irq_routing_xen_evtchn xen_evtchn;
 		__u32 pad[8];
 	} u;
 };
-- 
2.31.1


[-- Attachment #2: smime.p7s --]
[-- Type: application/pkcs7-signature, Size: 5174 bytes --]

^ permalink raw reply related	[flat|nested] 28+ messages in thread

* Re: [EXTERNAL] [PATCH] KVM: x86/xen: Fix runstate updates to be atomic when preempting vCPU
  2021-10-25 13:13         ` David Woodhouse
@ 2021-10-29 10:48           ` David Woodhouse
  0 siblings, 0 replies; 28+ messages in thread
From: David Woodhouse @ 2021-10-29 10:48 UTC (permalink / raw)
  To: Paolo Bonzini, kvm, Raslan, KarimAllah, Joao Martins, Ankur Arora
  Cc: jmattson, wanpengli, seanjc, vkuznets, mtosatti, joro

[-- Attachment #1: Type: text/plain, Size: 646 bytes --]

On Mon, 2021-10-25 at 14:13 +0100, David Woodhouse wrote:
> 
> When you put it like that, it just seems so stunningly redundant :)
> 
> "When we get notified that the guest HVA has been mapped, we create
> our own kernel mapping of the same page. When we are notified that the
> guest HVA gets unmapped, we tear down our kernel mapping of it."

Except, of course, that the kernel mapping can be used from *anywhere*
and not just from a thread belonging to the VM.

Like when irqfd_inject() invokes kvm_set_irq() from a work queue, which
is *obviously* oopsing once I fix up the other minor issues in the
patch I sent out last night.

[-- Attachment #2: smime.p7s --]
[-- Type: application/pkcs7-signature, Size: 5174 bytes --]

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [EXTERNAL] [PATCH] KVM: x86/xen: Fix runstate updates to be atomic when preempting vCPU
  2021-10-28 22:22       ` [EXTERNAL] [PATCH] KVM: x86/xen: Fix runstate updates to be atomic when preempting vCPU David Woodhouse
@ 2021-10-29 11:31         ` Joao Martins
  2021-10-29 11:56           ` David Woodhouse
  0 siblings, 1 reply; 28+ messages in thread
From: Joao Martins @ 2021-10-29 11:31 UTC (permalink / raw)
  To: David Woodhouse
  Cc: jmattson, wanpengli, seanjc, vkuznets, mtosatti, joro,
	Paolo Bonzini, kvm, Raslan, KarimAllah, Ankur Arora

On 10/28/21 23:22, David Woodhouse wrote:
> On Mon, 2021-10-25 at 13:19 +0100, David Woodhouse wrote:
>> On Mon, 2021-10-25 at 11:39 +0100, David Woodhouse wrote:
>>>> One possible solution (which I even have unfinished patches for) is to
>>>> put all the gfn_to_pfn_caches on a list, and refresh them when the MMU
>>>> notifier receives an invalidation.
>>>
>>> For this use case I'm not even sure why I'd *want* to cache the PFN and
>>> explicitly kmap/memremap it, when surely by *definition* there's a
>>> perfectly serviceable HVA which already points to it?
>>
>> That's indeed true for *this* use case but my *next* use case is
>> actually implementing the event channel delivery.
>>
>> What we have in-kernel already is everything we absolutely *need* in
>> order to host Xen guests, but I really do want to fix the fact that
>> even IPIs and timers are bouncing up through userspace.
> 
> Here's a completely untested attempt, in which all the complexity is
> based around the fact that I can't just pin the pages as João and
> Ankur's original did.
> 
> It adds a new KVM_IRQ_ROUTING_XEN_EVTCHN with an ABI that allows for us
> to add FIFO event channels, but for now only supports 2 level.
> 
> In kvm_xen_set_evtchn() I currently use kvm_map_gfn() *without* a cache
> at all, but I'll work something out for that. I think I can use a
> gfn_to_hva_cache (like the one removed in commit 319afe685) and in the
> rare case that it's invalid, I can take kvm->lock to revalidate it.
> 
> It sets the bit in the global shared info but doesn't touch the target
> vCPU's vcpu_info; instead it sets a bit in an *in-kernel* shadow of the
> target's evtchn_pending_sel word, and kicks the vCPU.
> 
> That shadow is actually synced to the guest's vcpu_info struct in
> kvm_xen_has_interrupt(). There's a little bit of fun asm there to set
> the bits in the userspace struct and then clear the same set of bits in
> the kernel shadow *if* the first op didn't fault. Or such is the
> intent; I didn't hook up a test yet.
> 
> As things stand, I should be able to use this for delivery of PIRQs
> from my VMM, where things like passed-through PCI MSI gets turned into
> Xen event channels. As well as KVM unit tests, of course.
> 
Cool stuff!! I remember we only made IPIs and timers work but not PIRQ
event channels.

> The plan is then to hook up IPIs and timers — again based on the Oracle
> code from before, but using eventfds for the actual evtchn delivery. 
> 
I recall the eventfd_signal() was there should the VMM choose to supply an
eventfd. But working without one was mainly for IPI/timers due to
performance reasons (avoiding the call to eventfd_signal()). We saw some
slight overhead there -- but I can't find the data right now :(

> diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
> index c4bca001a7c9..bff5c458af96 100644
> --- a/arch/x86/kvm/xen.c
> +++ b/arch/x86/kvm/xen.c
> @@ -207,6 +207,8 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
>  
>  int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
>  {
> +	unsigned long evtchn_pending_sel = READ_ONCE(v->arch.xen.evtchn_pending_sel);
> +	bool atomic = in_atomic() || !task_is_running(current);
>  	int err;
>  	u8 rc = 0;
>  
> @@ -216,6 +218,9 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
>  	 */
>  	struct gfn_to_hva_cache *ghc = &v->arch.xen.vcpu_info_cache;
>  	struct kvm_memslots *slots = kvm_memslots(v->kvm);
> +	bool ghc_valid = slots->generation == ghc->generation &&
> +		!kvm_is_error_hva(ghc->hva) && ghc->memslot;
> +
>  	unsigned int offset = offsetof(struct vcpu_info, evtchn_upcall_pending);
>  
>  	/* No need for compat handling here */
> @@ -231,8 +236,7 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
>  	 * cache in kvm_read_guest_offset_cached(), but just uses
>  	 * __get_user() instead. And falls back to the slow path.
>  	 */
> -	if (likely(slots->generation == ghc->generation &&
> -		   !kvm_is_error_hva(ghc->hva) && ghc->memslot)) {
> +	if (!evtchn_pending_sel && ghc_valid) {
>  		/* Fast path */
>  		pagefault_disable();
>  		err = __get_user(rc, (u8 __user *)ghc->hva + offset);
> @@ -251,12 +255,72 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
>  	 * and we'll end up getting called again from a context where we *can*
>  	 * fault in the page and wait for it.
>  	 */
> -	if (in_atomic() || !task_is_running(current))
> +	if (atomic)
>  		return 1;
>  
> -	kvm_read_guest_offset_cached(v->kvm, ghc, &rc, offset,
> -				     sizeof(rc));
> +	if (!ghc_valid) {
> +		err = kvm_gfn_to_hva_cache_init(v->kvm, ghc, ghc->gpa, ghc->len);
> +		if (err && !ghc->memslot) {
> +			/*
> +			 * If this failed, userspace has screwed up the
> +			 * vcpu_info mapping. No interrupts for you.
> +			 */
> +			return 0;
> +		}
> +	}
>  
> +	/*
> +	 * Now we have a valid (protected by srcu) userspace HVA in
> +	 * ghc->hva which points to the struct vcpu_info. If there
> +	 * are any bits in the in-kernel evtchn_pending_sel then
> +	 * we need to write those to the guest vcpu_info and set
> +	 * its evtchn_upcall_pending flag. If there aren't any bits
> +	 * to add, we only want to *check* evtchn_upcall_pending.
> +	 */
> +	if (evtchn_pending_sel) {
> +		if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode) {
> +			struct vcpu_info __user *vi = (void *)ghc->hva;
> +
> +			/* Attempt to set the evtchn_pending_sel bits in the
> +			 * guest, and if that succeeds then clear the same
> +			 * bits in the in-kernel version. */
> +			asm volatile("1:\t" LOCK_PREFIX "orq %1, %0\n"
> +				     "\tnotq %0\n"
> +				     "\t" LOCK_PREFIX "andq %2, %0\n"
> +				     "2:\n"
> +				     "\t.section .fixup,\"ax\"\n"
> +				     "3:\tjmp\t2b\n"
> +				     "\t.previous\n"
> +				     _ASM_EXTABLE_UA(1b, 3b)
> +				     : "=r" (evtchn_pending_sel)
> +				     : "m" (vi->evtchn_pending_sel),
> +				       "m" (v->arch.xen.evtchn_pending_sel),
> +				       "0" (evtchn_pending_sel));
> +		} else {
> +			struct compat_vcpu_info __user *vi = (void *)ghc->hva;
> +			u32 evtchn_pending_sel32 = evtchn_pending_sel;
> +
> +			/* Attempt to set the evtchn_pending_sel bits in the
> +			 * guest, and if that succeeds then clear the same
> +			 * bits in the in-kernel version. */
> +			asm volatile("1:\t" LOCK_PREFIX "orl %1, %0\n"
> +				     "\tnotl %0\n"
> +				     "\t" LOCK_PREFIX "andl %2, %0\n"
> +				     "2:\n"
> +				     "\t.section .fixup,\"ax\"\n"
> +				     "3:\tjmp\t2b\n"
> +				     "\t.previous\n"
> +				     _ASM_EXTABLE_UA(1b, 3b)
> +				     : "=r" (evtchn_pending_sel32)
> +				     : "m" (vi->evtchn_pending_sel),
> +				       "m" (v->arch.xen.evtchn_pending_sel),
> +				       "0" (evtchn_pending_sel32));
> +		}

Perhaps I am not reading it right (or I forgot), but don't you need to use the vcpu_info
embedded in the shared_info page when the guest hasn't explicitly registered a *separate* vcpu_info?

Or maybe this relies on the API contract (?) that the VMM needs to register the vcpu_info in
addition to the shared_info regardless of whether the Xen guest explicitly asked for a separate
vcpu_info. If so, might be worth a comment...


> +		rc = 1;
> +		__put_user(rc, (u8 __user *)ghc->hva + offset);
> +	} else {
> +		__get_user(rc, (u8 __user *)ghc->hva + offset);
> +	}
>  	return rc;
>  }
>  
> @@ -772,3 +836,105 @@ int kvm_xen_hypercall(struct kvm_vcpu *vcpu)
>  
>  	return 0;
>  }
> +
> +static inline int max_evtchn_port(struct kvm *kvm)
> +{
> +	if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode)
> +		return 4096;
> +	else
> +		return 1024;
> +}
> +

Maybe worth using the Xen ABI interface macros that help tie this in
to the Xen guest ABI. In particular, the macros in include/xen/interface/event_channel.h

#define EVTCHN_2L_NR_CHANNELS (sizeof(xen_ulong_t) * sizeof(xen_ulong_t) * 64)

Sadly doesn't cover 32-bit case :( given the xen_ulong_t.
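
For illustration (the macro names other than EVTCHN_2L_NR_CHANNELS are made up),
spelling the formula out for both ABIs gives exactly the limits above:

	/* 64-bit: xen_ulong_t is 8 bytes, so 8 * 8 * 64 = 4096 ports */
	#define EVTCHN_2L_NR_CHANNELS_64	(8 * 8 * 64)
	/* 32-bit compat: xen_ulong_t is 4 bytes, so 4 * 4 * 64 = 1024 ports */
	#define COMPAT_EVTCHN_2L_NR_CHANNELS	(4 * 4 * 64)

	static inline int max_evtchn_port(struct kvm *kvm)
	{
		if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode)
			return EVTCHN_2L_NR_CHANNELS_64;
		else
			return COMPAT_EVTCHN_2L_NR_CHANNELS;
	}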

> +int kvm_xen_setup_evtchn(struct kvm *kvm,
> +			 struct kvm_kernel_irq_routing_entry *e,
> +			 const struct kvm_irq_routing_entry *ue)
> +
> +{
> +	struct kvm_vcpu *vcpu;
> +
> +	if (kvm->arch.xen.shinfo_gfn == GPA_INVALID)
> +		return -EINVAL;
> +
> +	if (e->xen_evtchn.vcpu >= KVM_MAX_VCPUS)
> +		return -EINVAL;
> +
> +	vcpu = kvm_get_vcpu_by_id(kvm, ue->u.xen_evtchn.vcpu);
> +	if (!vcpu)
> +		return -EINVAL;
> +
> +	if (vcpu->arch.xen.vcpu_info_set)
> +		return -EINVAL;
> +
> +	if (!kvm->arch.xen.upcall_vector)
> +		return -EINVAL;
> +
> +	/* Once we support the per-vCPU LAPIC based vector we will permit
> +	 * that here instead of the per-KVM upcall vector */
> +
> +	if (e->xen_evtchn.port >= max_evtchn_port(kvm))
> +		return -EINVAL;
> +
> +	/* We only support 2 level event channels for now */
> +	if (e->xen_evtchn.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL)
> +		return -EINVAL;
> +


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [EXTERNAL] [PATCH] KVM: x86/xen: Fix runstate updates to be atomic when preempting vCPU
  2021-10-29 11:31         ` Joao Martins
@ 2021-10-29 11:56           ` David Woodhouse
  2021-10-29 17:10             ` David Woodhouse
  0 siblings, 1 reply; 28+ messages in thread
From: David Woodhouse @ 2021-10-29 11:56 UTC (permalink / raw)
  To: Joao Martins
  Cc: jmattson, wanpengli, seanjc, vkuznets, mtosatti, joro,
	Paolo Bonzini, kvm, Raslan, KarimAllah, Ankur Arora

[-- Attachment #1: Type: text/plain, Size: 6959 bytes --]

On Fri, 2021-10-29 at 12:31 +0100, Joao Martins wrote:
> On 10/28/21 23:22, David Woodhouse wrote:
> > On Mon, 2021-10-25 at 13:19 +0100, David Woodhouse wrote:
> > > On Mon, 2021-10-25 at 11:39 +0100, David Woodhouse wrote:
> > > > > One possible solution (which I even have unfinished patches for) is to
> > > > > put all the gfn_to_pfn_caches on a list, and refresh them when the MMU
> > > > > notifier receives an invalidation.
> > > > 
> > > > For this use case I'm not even sure why I'd *want* to cache the PFN and
> > > > explicitly kmap/memremap it, when surely by *definition* there's a
> > > > perfectly serviceable HVA which already points to it?
> > > 
> > > That's indeed true for *this* use case but my *next* use case is
> > > actually implementing the event channel delivery.
> > > 
> > > What we have in-kernel already is everything we absolutely *need* in
> > > order to host Xen guests, but I really do want to fix the fact that
> > > even IPIs and timers are bouncing up through userspace.
> > 
> > Here's a completely untested attempt, in which all the complexity is
> > based around the fact that I can't just pin the pages as João and
> > Ankur's original did.
> > 
> > It adds a new KVM_IRQ_ROUTING_XEN_EVTCHN with an ABI that allows for us
> > to add FIFO event channels, but for now only supports 2 level.
> > 
> > In kvm_xen_set_evtchn() I currently use kvm_map_gfn() *without* a cache
> > at all, but I'll work something out for that. I think I can use a
> > gfn_to_hva_cache (like the one removed in commit 319afe685) and in the
> > rare case that it's invalid, I can take kvm->lock to revalidate it.
> > 
> > It sets the bit in the global shared info but doesn't touch the target
> > vCPU's vcpu_info; instead it sets a bit in an *in-kernel* shadow of the
> > target's evtchn_pending_sel word, and kicks the vCPU.
> > 
> > That shadow is actually synced to the guest's vcpu_info struct in
> > kvm_xen_has_interrupt(). There's a little bit of fun asm there to set
> > the bits in the userspace struct and then clear the same set of bits in
> > the kernel shadow *if* the first op didn't fault. Or such is the
> > intent; I didn't hook up a test yet.
> > 
> > As things stand, I should be able to use this for delivery of PIRQs
> > from my VMM, where things like passed-through PCI MSI gets turned into
> > Xen event channels. As well as KVM unit tests, of course.
> > 
> Cool stuff!! I remember we only made IPIs and timers work but not PIRQ
> event channels.

PIRQs turn out to be fairly trivial in the end, once you get your head
around the fact that Xen guests don't actually *unmask* MSI-X when
they're routed to PIRQ; they rely on Xen to do that for them!

If you already have a virtual IOMMU doing interrupt remapping, hooking
it up to remap from the magic Xen MSI 'this is really a PIRQ' message
is fairly simple. Right now I fail that translation and inject the
evtchn manually from userspace, but this will allow me to translate
to KVM_IRQ_ROUTING_XEN_EVTCHN and hook the vfio eventfd directly up to
it.
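
Roughly, the userspace end of that last step would be something like the
sketch below (illustrative only, not code from this series): once the
vIOMMU has translated the magic MSI into an event channel port, the VMM
installs a KVM_IRQ_ROUTING_XEN_EVTCHN route and points the device's
irqfd at it. The 'port' and 'xen_vcpu' values are assumed to come from
the VMM's own PIRQ bookkeeping; needs <sys/ioctl.h>, <errno.h>, <stdint.h>
and the patched <linux/kvm.h>.

static int route_pirq_to_evtchn(int vm_fd, int vfio_irqfd, uint32_t gsi,
				uint32_t port, uint32_t xen_vcpu)
{
	struct {
		struct kvm_irq_routing info;
		struct kvm_irq_routing_entry entries[1];
	} r = { .info = { .nr = 1 } };
	struct kvm_irqfd ifd = { .fd = vfio_irqfd, .gsi = gsi };

	r.entries[0].gsi = gsi;
	r.entries[0].type = KVM_IRQ_ROUTING_XEN_EVTCHN;
	r.entries[0].u.xen_evtchn.port = port;
	r.entries[0].u.xen_evtchn.vcpu = xen_vcpu;
	r.entries[0].u.xen_evtchn.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;

	if (ioctl(vm_fd, KVM_SET_GSI_ROUTING, &r) < 0)
		return -errno;

	/* Hook the device's eventfd straight up to the new route */
	if (ioctl(vm_fd, KVM_IRQFD, &ifd) < 0)
		return -errno;

	return 0;
}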

> > The plan is then to hook up IPIs and timers — again based on the Oracle
> > code from before, but using eventfds for the actual evtchn delivery. 
> > 
> I recall the eventfd_signal() was there should the VMM choose to supply an
> eventfd. But working without one was mainly for IPI/timers due to
> performance reasons (avoiding the call to eventfd_signal()). We saw some
> slight overhead there -- but I can't find the data right now :(
> 

Hm, there shouldn't have been *much* additional overhead, since you did
hook up the evtchn delivery from kvm_arch_set_irq_inatomic() so you
weren't incurring the workqueue latency every time. I'll play.

Given that we can't pin pages, I spent a while lying awake at night
pondering *how* we defer the evtchn delivery, if we take a page fault
when we can't just sleep. I was pondering a shadow of the whole
evtchn_pending bitmap, perhaps one per vCPU because the deferred
delivery would need to know *which* vCPU to deliver it to. And more
complexity about whether it was masked or not, too (what if we set the
pending bit but then take a fault when reading whether it was masked?)

Then I remembered that irqfd_wakeup() will *try* the inatomic version
and then fall back to a workqueue, and the whole horridness just went
away :)

Perhaps as an optimisation I can do the same kind of thing — when IPI
or timer wants to raise an evtchn, it can *try* to do so atomically but
fall back to the eventfd if it needs to wait.
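
As a sketch of what I mean (illustrative only, not in the series), using
kvm_xen_set_evtchn() from the evtchn patch in this thread; the function
name and the eventfd parameter here are made up, standing in for whatever
the VMM already has bound for that event channel:

static void kvm_xen_raise_evtchn(struct kvm_kernel_irq_routing_entry *e,
				 struct kvm *kvm,
				 struct eventfd_ctx *slow_path)
{
	/* Fast path: try without sleeping, e.g. from a timer callback */
	if (kvm_xen_set_evtchn(e, kvm, true) != -EWOULDBLOCK)
		return;

	/* Couldn't touch the page without faulting; defer via the eventfd
	 * so the irqfd workqueue delivers it from a sleepable context */
	eventfd_signal(slow_path, 1);
}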

I think I've just conceded that I have to map/unmap the page at a
kernel virtual address from the MMU notifier when it becomes
present/absent, so I might get *some* of the benefits of pinning from
that (at least if I protect it with a spinlock it can't go away
*between* two consecutive accesses).

> Perhaps I am not reading it right (or I forgot), but don't you need to use the vcpu_info
> embedded in the shared_info page when the guest hasn't explicitly registered a *separate* vcpu_info?

We can't, because we don't know the *Xen* CPU numbering scheme. It may
differ from both kvm_vcpu->vcpu_id and kvm_vcpu->idx. I lost a week or
so of my life to that, delivering interrupts to the wrong vCPUs :)

> Or maybe this relies on the API contract (?) that the VMM needs to register the vcpu_info in
> addition to the shared_info regardless of whether the Xen guest explicitly asked for a separate
> vcpu_info. If so, might be worth a comment...

Indeed, I washed my hands of that complexity in the kernel and left it
entirely up to the VMM. From Documentation/virt/kvm/api.rst:

KVM_XEN_ATTR_TYPE_SHARED_INFO
  Sets the guest physical frame number at which the Xen "shared info"
  page resides. Note that although Xen places vcpu_info for the first
  32 vCPUs in the shared_info page, KVM does not automatically do so
  and instead requires that KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO be used
  explicitly even when the vcpu_info for a given vCPU resides at the
  "default" location in the shared_info page. This is because KVM is
  not aware of the Xen CPU id which is used as the index into the
  vcpu_info[] array, so cannot know the correct default location.
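
For illustration (this is not from the series), the VMM side of that
contract is just a sketch like the one below; 'vcpu_fd', 'shinfo_gpa' and
'xen_vcpu_id' are assumed to come from the VMM's own records:

static int set_default_vcpu_info(int vcpu_fd, uint64_t shinfo_gpa,
				 unsigned int xen_vcpu_id)
{
	struct kvm_xen_vcpu_attr vi = {
		.type = KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO,
		/* Each embedded vcpu_info is 64 bytes on x86, and only the
		 * first 32 Xen vCPUs have one in the shared_info page. */
		.u.gpa = shinfo_gpa + (uint64_t)xen_vcpu_id * 64,
	};

	if (xen_vcpu_id >= 32)
		return -EINVAL;

	return ioctl(vcpu_fd, KVM_XEN_VCPU_SET_ATTR, &vi);
}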


> > +static inline int max_evtchn_port(struct kvm *kvm)
> > +{
> > +	if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode)
> > +		return 4096;
> > +	else
> > +		return 1024;
> > +}
> > +
> 
> Maybe worth using the Xen ABI interface macros that help tie this in
> to the Xen guest ABI, in particular the macros in include/xen/interface/event_channel.h
> 
> #define EVTCHN_2L_NR_CHANNELS (sizeof(xen_ulong_t) * sizeof(xen_ulong_t) * 64)
> 
> Sadly doesn't cover 32-bit case :( given the xen_ulong_t.

Yeah, that's a bit of a placeholder and wants cleanup but only after
I've made it *work*. :)

For KVM I have definitions of the compat stuff in arch/x86/kvm/xen.h so
could add it there.
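
Purely as an illustration of what I mean (the compat macro name below is
made up), it could end up looking something like:

/* Xen only defines the 64-bit EVTCHN_2L_NR_CHANNELS via xen_ulong_t */
#define COMPAT_EVTCHN_2L_NR_CHANNELS (sizeof(compat_ulong_t) * \
				      sizeof(compat_ulong_t) * 64)

static inline int max_evtchn_port(struct kvm *kvm)
{
	if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode)
		return EVTCHN_2L_NR_CHANNELS;		/* 8 * 8 * 64 == 4096 */
	else
		return COMPAT_EVTCHN_2L_NR_CHANNELS;	/* 4 * 4 * 64 == 1024 */
}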


[-- Attachment #2: smime.p7s --]
[-- Type: application/pkcs7-signature, Size: 5174 bytes --]

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [EXTERNAL] [PATCH] KVM: x86/xen: Fix runstate updates to be atomic when preempting vCPU
  2021-10-29 11:56           ` David Woodhouse
@ 2021-10-29 17:10             ` David Woodhouse
  0 siblings, 0 replies; 28+ messages in thread
From: David Woodhouse @ 2021-10-29 17:10 UTC (permalink / raw)
  To: Joao Martins
  Cc: jmattson, wanpengli, seanjc, vkuznets, mtosatti, joro,
	Paolo Bonzini, kvm, Raslan, KarimAllah, Ankur Arora

[-- Attachment #1: Type: text/plain, Size: 18569 bytes --]

On Fri, 2021-10-29 at 12:56 +0100, David Woodhouse wrote:
> I think I've just conceded that I have to map/unmap the page at a
> kernel virtual address from the MMU notifier when it becomes
> present/absent, so I might get *some* of the benefits of pinning from
> that (at least if I protect it with a spinlock it can't go away
> *between* two consecutive accesses).

Maybe... in fact from the workqueue I can borrow the mm I want with
kthread_use_mm(kvm->mm) and then kvm_map_gfn() does work, so I've done
that as a hack for now to get the rest of it working and tested.

https://git.infradead.org/users/dwmw2/linux.git/shortlog/refs/heads/xen-evtchn

It's working now taking the workqueue path every time, so next I'll
take a look at fixing the atomic path to do something other than just
return -EWOULDBLOCK every time, by maintaining a kernel mapping of the
shinfo page.

From 82dcaa28d80c9ee4e2a09ac4f263be4ce59a3457 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw@amazon.co.uk>
Date: Thu, 28 Oct 2021 23:10:31 +0100
Subject: [PATCH] KVM: x86/xen: First attempt at KVM_IRQ_ROUTING_XEN_EVTCHN

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
---
 arch/x86/include/asm/kvm_host.h               |   1 +
 arch/x86/kvm/irq_comm.c                       |  12 +
 arch/x86/kvm/xen.c                            | 221 +++++++++++++++++-
 arch/x86/kvm/xen.h                            |   6 +
 include/linux/kvm_host.h                      |   7 +
 include/uapi/linux/kvm.h                      |  10 +
 .../selftests/kvm/x86_64/xen_shinfo_test.c    |  99 +++++++-
 7 files changed, 350 insertions(+), 6 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 70771376e246..e1a4521ae838 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -606,6 +606,7 @@ struct kvm_vcpu_xen {
 	u64 last_steal;
 	u64 runstate_entry_time;
 	u64 runstate_times[4];
+	unsigned long evtchn_pending_sel;
 };
 
 struct kvm_vcpu_arch {
diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c
index d5b72a08e566..6894f9a369f2 100644
--- a/arch/x86/kvm/irq_comm.c
+++ b/arch/x86/kvm/irq_comm.c
@@ -24,6 +24,7 @@
 
 #include "hyperv.h"
 #include "x86.h"
+#include "xen.h"
 
 static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e,
 			   struct kvm *kvm, int irq_source_id, int level,
@@ -175,6 +176,13 @@ int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e,
 			return r;
 		break;
 
+#ifdef CONFIG_KVM_XEN
+	case KVM_IRQ_ROUTING_XEN_EVTCHN:
+		if (!level)
+			return -1;
+
+		return kvm_xen_set_evtchn(e, kvm, true);
+#endif
 	default:
 		break;
 	}
@@ -310,6 +318,10 @@ int kvm_set_routing_entry(struct kvm *kvm,
 		e->hv_sint.vcpu = ue->u.hv_sint.vcpu;
 		e->hv_sint.sint = ue->u.hv_sint.sint;
 		break;
+#ifdef CONFIG_KVM_XEN
+	case KVM_IRQ_ROUTING_XEN_EVTCHN:
+		return kvm_xen_setup_evtchn(kvm, e, ue);
+#endif
 	default:
 		return -EINVAL;
 	}
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index c4bca001a7c9..3ec7c62f99ec 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -13,6 +13,8 @@
 #include <linux/kvm_host.h>
 #include <linux/sched/stat.h>
 
+#include <asm/mmu_context.h>
+
 #include <trace/events/kvm.h>
 #include <xen/interface/xen.h>
 #include <xen/interface/vcpu.h>
@@ -207,6 +209,8 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
 
 int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
 {
+	unsigned long evtchn_pending_sel = READ_ONCE(v->arch.xen.evtchn_pending_sel);
+	bool atomic = in_atomic() || !task_is_running(current);
 	int err;
 	u8 rc = 0;
 
@@ -216,6 +220,9 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
 	 */
 	struct gfn_to_hva_cache *ghc = &v->arch.xen.vcpu_info_cache;
 	struct kvm_memslots *slots = kvm_memslots(v->kvm);
+	bool ghc_valid = slots->generation == ghc->generation &&
+		!kvm_is_error_hva(ghc->hva) && ghc->memslot;
+
 	unsigned int offset = offsetof(struct vcpu_info, evtchn_upcall_pending);
 
 	/* No need for compat handling here */
@@ -231,8 +238,7 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
 	 * cache in kvm_read_guest_offset_cached(), but just uses
 	 * __get_user() instead. And falls back to the slow path.
 	 */
-	if (likely(slots->generation == ghc->generation &&
-		   !kvm_is_error_hva(ghc->hva) && ghc->memslot)) {
+	if (!evtchn_pending_sel && ghc_valid) {
 		/* Fast path */
 		pagefault_disable();
 		err = __get_user(rc, (u8 __user *)ghc->hva + offset);
@@ -251,11 +257,80 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
 	 * and we'll end up getting called again from a context where we *can*
 	 * fault in the page and wait for it.
 	 */
-	if (in_atomic() || !task_is_running(current))
+	if (atomic)
 		return 1;
 
-	kvm_read_guest_offset_cached(v->kvm, ghc, &rc, offset,
-				     sizeof(rc));
+	if (!ghc_valid) {
+		err = kvm_gfn_to_hva_cache_init(v->kvm, ghc, ghc->gpa, ghc->len);
+		if (err && !ghc->memslot) {
+			/*
+			 * If this failed, userspace has screwed up the
+			 * vcpu_info mapping. No interrupts for you.
+			 */
+			return 0;
+		}
+	}
+
+	/*
+	 * Now we have a valid (protected by srcu) userspace HVA in
+	 * ghc->hva which points to the struct vcpu_info. If there
+	 * are any bits in the in-kernel evtchn_pending_sel then
+	 * we need to write those to the guest vcpu_info and set
+	 * its evtchn_upcall_pending flag. If there aren't any bits
+	 * to add, we only want to *check* evtchn_upcall_pending.
+	 */
+	if (evtchn_pending_sel) {
+		if (!user_access_begin((void *)ghc->hva, sizeof(struct vcpu_info)))
+			return 0;
+
+		if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode) {
+			struct vcpu_info __user *vi = (void *)ghc->hva;
+
+			/* Attempt to set the evtchn_pending_sel bits in the
+			 * guest, and if that succeeds then clear the same
+			 * bits in the in-kernel version. */
+			asm volatile("1:\t" LOCK_PREFIX "orq %0, %1\n"
+				     "\tnotq %0\n"
+				     "\t" LOCK_PREFIX "andq %0, %2\n"
+				     "2:\n"
+				     "\t.section .fixup,\"ax\"\n"
+				     "3:\tjmp\t2b\n"
+				     "\t.previous\n"
+				     _ASM_EXTABLE_UA(1b, 3b)
+				     : "=r" (evtchn_pending_sel)
+				     : "m" (vi->evtchn_pending_sel),
+				       "m" (v->arch.xen.evtchn_pending_sel),
+				       "0" (evtchn_pending_sel));
+		} else {
+			struct compat_vcpu_info __user *vi = (void *)ghc->hva;
+			u32 evtchn_pending_sel32 = evtchn_pending_sel;
+
+			/* Attempt to set the evtchn_pending_sel bits in the
+			 * guest, and if that succeeds then clear the same
+			 * bits in the in-kernel version. */
+			asm volatile("1:\t" LOCK_PREFIX "orl %0, %1\n"
+				     "\tnotl %0\n"
+				     "\t" LOCK_PREFIX "andl %0, %2\n"
+				     "2:\n"
+				     "\t.section .fixup,\"ax\"\n"
+				     "3:\tjmp\t2b\n"
+				     "\t.previous\n"
+				     _ASM_EXTABLE_UA(1b, 3b)
+				     : "=r" (evtchn_pending_sel32)
+				     : "m" (vi->evtchn_pending_sel),
+				       "m" (v->arch.xen.evtchn_pending_sel),
+				       "0" (evtchn_pending_sel32));
+		}
+		rc = 1;
+		unsafe_put_user(rc, (u8 __user *)ghc->hva + offset, err);
+
+	err:
+		user_access_end();
+
+		mark_page_dirty_in_slot(v->kvm, ghc->memslot, ghc->gpa >> PAGE_SHIFT);
+	} else {
+		__get_user(rc, (u8 __user *)ghc->hva + offset);
+	}
 
 	return rc;
 }
@@ -772,3 +847,139 @@ int kvm_xen_hypercall(struct kvm_vcpu *vcpu)
 
 	return 0;
 }
+
+static inline int max_evtchn_port(struct kvm *kvm)
+{
+	if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode)
+		return 4096;
+	else
+		return 1024;
+}
+
+int kvm_xen_set_evtchn(struct kvm_kernel_irq_routing_entry *e,
+		       struct kvm *kvm, bool in_atomic)
+{
+	bool mm_borrowed = false;
+	struct kvm_vcpu *vcpu;
+	struct kvm_host_map map;
+	unsigned long *pending_bits, *mask_bits;
+	int port_word_bit;
+	int idx;
+	int rc;
+
+	vcpu = kvm_get_vcpu_by_id(kvm, e->xen_evtchn.vcpu);
+	if (!vcpu)
+		return -EINVAL;
+
+	if (!vcpu->arch.xen.vcpu_info_set)
+		return -EINVAL;
+
+	if (e->xen_evtchn.port >= max_evtchn_port(kvm))
+		return -EINVAL;
+
+	if (current->mm != kvm->mm) {
+		if (in_atomic)
+			return -EWOULDBLOCK;
+
+		/* Else we'd better be in the workqueue */
+		BUG_ON(current->mm);
+		kthread_use_mm(kvm->mm);
+		mm_borrowed = true;
+	}
+
+	idx = srcu_read_lock(&kvm->srcu);
+
+	/* With no cache this is *always* going to fail in the atomic case for now */
+	rc = kvm_map_gfn(vcpu, kvm->arch.xen.shinfo_gfn, &map, NULL, in_atomic);
+	if (rc < 0) {
+		if (in_atomic)
+			rc = -EWOULDBLOCK;
+		goto out_rcu;
+	}
+
+	if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) {
+		struct shared_info *shinfo = map.hva;
+		pending_bits = (unsigned long *)&shinfo->evtchn_pending;
+		mask_bits = (unsigned long *)&shinfo->evtchn_mask;
+		port_word_bit = e->xen_evtchn.port / 64;
+	} else {
+		struct compat_shared_info *shinfo = map.hva;
+		pending_bits = (unsigned long *)&shinfo->evtchn_pending;
+		mask_bits = (unsigned long *)&shinfo->evtchn_mask;
+		port_word_bit = e->xen_evtchn.port / 32;
+	}
+
+	/*
+	 * If this port wasn't already set, and if it isn't masked, then
+	 * we try to set the corresponding bit in the in-kernel shadow of
+	 * evtchn_pending_sel for the target vCPU. And if *that* wasn't
+	 * already set, then we kick the vCPU in question to write to the
+	 * *real* evtchn_pending_sel in its own guest vcpu_info struct.
+	 */
+	if (!test_and_set_bit(e->xen_evtchn.port, pending_bits) &&
+	    !test_bit(e->xen_evtchn.port, mask_bits) &&
+	    !test_and_set_bit(port_word_bit, &vcpu->arch.xen.evtchn_pending_sel)) {
+		kvm_make_request(KVM_REQ_EVENT, vcpu);
+		kvm_vcpu_kick(vcpu);
+	}
+
+	kvm_unmap_gfn(vcpu, &map, NULL, true, in_atomic);
+ out_rcu:
+	srcu_read_unlock(&kvm->srcu, idx);
+
+	if (mm_borrowed)
+		kthread_unuse_mm(kvm->mm);
+
+	return rc;
+}
+
+/* This is the version called from kvm_set_irq() as the .set function */
+static int evtchn_set_fn(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm,
+			 int irq_source_id, int level, bool line_status)
+{
+	if (!level)
+		return -1;
+
+	return kvm_xen_set_evtchn(e, kvm, false);
+}
+
+int kvm_xen_setup_evtchn(struct kvm *kvm,
+			 struct kvm_kernel_irq_routing_entry *e,
+			 const struct kvm_irq_routing_entry *ue)
+
+{
+	struct kvm_vcpu *vcpu;
+
+	if (kvm->arch.xen.shinfo_gfn == GPA_INVALID)
+		return -EINVAL;
+
+	if (ue->u.xen_evtchn.vcpu >= KVM_MAX_VCPUS)
+		return -EINVAL;
+
+	vcpu = kvm_get_vcpu_by_id(kvm, ue->u.xen_evtchn.vcpu);
+	if (!vcpu)
+		return -EINVAL;
+
+	if (!vcpu->arch.xen.vcpu_info_set)
+		return -EINVAL;
+
+	if (!kvm->arch.xen.upcall_vector)
+		return -EINVAL;
+
+	/* Once we support the per-vCPU LAPIC based vector we will permit
+	 * that here instead of the per-KVM upcall vector */
+
+	if (ue->u.xen_evtchn.port >= max_evtchn_port(kvm))
+		return -EINVAL;
+
+	/* We only support 2 level event channels for now */
+	if (ue->u.xen_evtchn.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL)
+		return -EINVAL;
+
+	e->xen_evtchn.port = ue->u.xen_evtchn.port;
+	e->xen_evtchn.vcpu = ue->u.xen_evtchn.vcpu;
+	e->xen_evtchn.priority = ue->u.xen_evtchn.priority;
+	e->set = evtchn_set_fn;
+
+	return 0;
+}
diff --git a/arch/x86/kvm/xen.h b/arch/x86/kvm/xen.h
index cc0cf5f37450..3e717947b928 100644
--- a/arch/x86/kvm/xen.h
+++ b/arch/x86/kvm/xen.h
@@ -24,6 +24,12 @@ int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc);
 void kvm_xen_init_vm(struct kvm *kvm);
 void kvm_xen_destroy_vm(struct kvm *kvm);
 
+int kvm_xen_set_evtchn(struct kvm_kernel_irq_routing_entry *e,
+		       struct kvm *kvm, bool in_atomic);
+int kvm_xen_setup_evtchn(struct kvm *kvm,
+			 struct kvm_kernel_irq_routing_entry *e,
+			 const struct kvm_irq_routing_entry *ue);
+
 static inline bool kvm_xen_msr_enabled(struct kvm *kvm)
 {
 	return static_branch_unlikely(&kvm_xen_enabled.key) &&
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 0f18df7fe874..9003fae1af9d 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -470,6 +470,12 @@ struct kvm_hv_sint {
 	u32 sint;
 };
 
+struct kvm_xen_evtchn {
+	u32 port;
+	u32 vcpu;
+	u32 priority;
+};
+
 struct kvm_kernel_irq_routing_entry {
 	u32 gsi;
 	u32 type;
@@ -490,6 +496,7 @@ struct kvm_kernel_irq_routing_entry {
 		} msi;
 		struct kvm_s390_adapter_int adapter;
 		struct kvm_hv_sint hv_sint;
+		struct kvm_xen_evtchn xen_evtchn;
 	};
 	struct hlist_node link;
 };
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index a067410ebea5..05391c80bb6a 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1143,11 +1143,20 @@ struct kvm_irq_routing_hv_sint {
 	__u32 sint;
 };
 
+struct kvm_irq_routing_xen_evtchn {
+	__u32 port;
+	__u32 vcpu;
+	__u32 priority;
+};
+
+#define KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL ((__u32)(-1))
+
 /* gsi routing entry types */
 #define KVM_IRQ_ROUTING_IRQCHIP 1
 #define KVM_IRQ_ROUTING_MSI 2
 #define KVM_IRQ_ROUTING_S390_ADAPTER 3
 #define KVM_IRQ_ROUTING_HV_SINT 4
+#define KVM_IRQ_ROUTING_XEN_EVTCHN 5
 
 struct kvm_irq_routing_entry {
 	__u32 gsi;
@@ -1159,6 +1168,7 @@ struct kvm_irq_routing_entry {
 		struct kvm_irq_routing_msi msi;
 		struct kvm_irq_routing_s390_adapter adapter;
 		struct kvm_irq_routing_hv_sint hv_sint;
+		struct kvm_irq_routing_xen_evtchn xen_evtchn;
 		__u32 pad[8];
 	} u;
 };
diff --git a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c
index a0699f00b3d6..514acf3ab73f 100644
--- a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c
+++ b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c
@@ -14,6 +14,9 @@
 #include <stdint.h>
 #include <time.h>
 #include <sched.h>
+#include <signal.h>
+
+#include <sys/eventfd.h>
 
 #define VCPU_ID		5
 
@@ -22,10 +25,12 @@
 #define SHINFO_REGION_SLOT	10
 #define PAGE_SIZE		4096
 
+#define SHINFO_ADDR	(SHINFO_REGION_GPA)
 #define PVTIME_ADDR	(SHINFO_REGION_GPA + PAGE_SIZE)
 #define RUNSTATE_ADDR	(SHINFO_REGION_GPA + PAGE_SIZE + 0x20)
 #define VCPU_INFO_ADDR	(SHINFO_REGION_GPA + 0x40)
 
+#define SHINFO_VADDR	(SHINFO_REGION_GVA)
 #define RUNSTATE_VADDR	(SHINFO_REGION_GVA + PAGE_SIZE + 0x20)
 #define VCPU_INFO_VADDR	(SHINFO_REGION_GVA + 0x40)
 
@@ -73,15 +78,30 @@ struct vcpu_info {
         struct pvclock_vcpu_time_info time;
 }; /* 64 bytes (x86) */
 
+struct shared_info {
+	struct vcpu_info vcpu_info[32];
+	unsigned long evtchn_pending[64];
+	unsigned long evtchn_mask[64];
+	struct pvclock_wall_clock wc;
+	uint32_t wc_sec_hi;
+	/* arch_shared_info here */
+};
+
 #define RUNSTATE_running  0
 #define RUNSTATE_runnable 1
 #define RUNSTATE_blocked  2
 #define RUNSTATE_offline  3
 
+struct {
+	struct kvm_irq_routing info;
+	struct kvm_irq_routing_entry entries[2];
+} irq_routes;
+
 static void evtchn_handler(struct ex_regs *regs)
 {
 	struct vcpu_info *vi = (void *)VCPU_INFO_VADDR;
 	vi->evtchn_upcall_pending = 0;
+	vi->evtchn_pending_sel = 0;
 
 	GUEST_SYNC(0x20);
 }
@@ -127,7 +147,19 @@ static void guest_code(void)
 	GUEST_SYNC(6);
 	GUEST_ASSERT(rs->time[RUNSTATE_runnable] >= MIN_STEAL_TIME);
 
-	GUEST_DONE();
+	/* Attempt to deliver a *masked* interrupt */
+	GUEST_SYNC(7);
+
+	/* Wait until we see the bit set */
+	struct shared_info *si = (void *)SHINFO_VADDR;
+	while (!si->evtchn_pending[0])
+		__asm__ __volatile__ ("rep nop" : : : "memory");
+
+	/* Now deliver an *unmasked* interrupt */
+	GUEST_SYNC(8);
+
+	for (;;)
+		__asm__ __volatile__ ("rep nop" : : : "memory");
 }
 
 static int cmp_timespec(struct timespec *a, struct timespec *b)
@@ -144,6 +176,11 @@ static int cmp_timespec(struct timespec *a, struct timespec *b)
 		return 0;
 }
 
+static void handle_alrm(int sig)
+{
+	TEST_FAIL("IRQ delivery timed out");
+}
+
 int main(int argc, char *argv[])
 {
 	struct timespec min_ts, max_ts, vm_ts;
@@ -155,6 +192,7 @@ int main(int argc, char *argv[])
 	}
 
 	bool do_runstate_tests = !!(xen_caps & KVM_XEN_HVM_CONFIG_RUNSTATE);
+	bool do_eventfd_tests = true;
 
 	clock_gettime(CLOCK_REALTIME, &min_ts);
 
@@ -214,6 +252,51 @@ int main(int argc, char *argv[])
 		vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_SET_ATTR, &st);
 	}
 
+	int irq_fd[2] = { -1, -1 };
+
+	if (do_eventfd_tests) {
+		irq_fd[0] = eventfd(0, 0);
+		irq_fd[1] = eventfd(0, 0);
+
+		/* Unexpected, but not a KVM failure */
+		if (irq_fd[0] == -1 || irq_fd[1] == -1)
+			do_eventfd_tests = false;
+	}
+
+	if (do_eventfd_tests) {
+		irq_routes.info.nr = 2;
+
+		irq_routes.entries[0].gsi = 32;
+		irq_routes.entries[0].type = KVM_IRQ_ROUTING_XEN_EVTCHN;
+		irq_routes.entries[0].u.xen_evtchn.port = 15;
+		irq_routes.entries[0].u.xen_evtchn.vcpu = VCPU_ID;
+		irq_routes.entries[0].u.xen_evtchn.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;
+
+		irq_routes.entries[1].gsi = 33;
+		irq_routes.entries[1].type = KVM_IRQ_ROUTING_XEN_EVTCHN;
+		irq_routes.entries[1].u.xen_evtchn.port = 66;
+		irq_routes.entries[1].u.xen_evtchn.vcpu = VCPU_ID;
+		irq_routes.entries[1].u.xen_evtchn.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;
+
+		vm_ioctl(vm, KVM_SET_GSI_ROUTING, &irq_routes);
+
+		struct kvm_irqfd ifd = { };
+
+		ifd.fd = irq_fd[0];
+		ifd.gsi = 32;
+		vm_ioctl(vm, KVM_IRQFD, &ifd);
+
+		ifd.fd = irq_fd[1];
+		ifd.gsi = 33;
+		vm_ioctl(vm, KVM_IRQFD, &ifd);
+
+		struct sigaction sa = { };
+		sa.sa_handler = handle_alrm;
+		sigaction(SIGALRM, &sa, NULL);
+	}
+
+	struct shared_info *shinfo = addr_gpa2hva(vm, SHINFO_VADDR);
+
 	struct vcpu_info *vinfo = addr_gpa2hva(vm, VCPU_INFO_VADDR);
 	vinfo->evtchn_upcall_pending = 0;
 
@@ -289,9 +372,23 @@ int main(int argc, char *argv[])
 					sched_yield();
 				} while (get_run_delay() < rundelay);
 				break;
+			case 7:
+				if (!do_eventfd_tests)
+					goto done;
+				shinfo->evtchn_mask[0] = 0x8000;
+				eventfd_write(irq_fd[0], 1UL);
+				alarm(1);
+				break;
+			case 8:
+				eventfd_write(irq_fd[1], 1UL);
+				evtchn_irq_expected = true;
+				break;
+
 			case 0x20:
 				TEST_ASSERT(evtchn_irq_expected, "Unexpected event channel IRQ");
 				evtchn_irq_expected = false;
+				if (shinfo->evtchn_pending[1])
+					goto done;
 				break;
 			}
 			break;
-- 
2.31.1


[-- Attachment #2: smime.p7s --]
[-- Type: application/pkcs7-signature, Size: 5174 bytes --]

^ permalink raw reply related	[flat|nested] 28+ messages in thread

* Re: [EXTERNAL] [PATCH] KVM: x86/xen: Fix runstate updates to be atomic when preempting vCPU
  2021-10-25 12:22       ` Paolo Bonzini
  2021-10-25 13:13         ` David Woodhouse
@ 2021-10-30  7:58         ` David Woodhouse
  2021-10-31  6:52           ` Paolo Bonzini
  1 sibling, 1 reply; 28+ messages in thread
From: David Woodhouse @ 2021-10-30  7:58 UTC (permalink / raw)
  To: Paolo Bonzini, kvm, Raslan, KarimAllah, Joao Martins, Ankur Arora
  Cc: jmattson, wanpengli, seanjc, vkuznets, mtosatti, joro

[-- Attachment #1: Type: text/plain, Size: 1543 bytes --]

On Mon, 2021-10-25 at 14:22 +0200, Paolo Bonzini wrote:
> On 25/10/21 14:19, David Woodhouse wrote:
> > So, with a fixed version of kvm_map_gfn() I suppose I could do the
> > same, but that's *two*  maps/unmaps for each interrupt? That's probably
> > worse than just bouncing out and letting userspace do it!
> 
> Absolutely!  The fixed version of kvm_map_gfn should not do any 
> map/unmap, it should do it eagerly on MMU notifier operations.

Staring at this some more... what *currently* protects a
gfn_to_pfn_cache when the page tables change — either because userspace
mmaps something else over the same HVA, or because the underlying page
is just swapped out and restored? 

I don't see that our MMU notifiers will do anything to invalidate e.g.
kvm->arch.st.cache, and thus record_steal_time() will just keep
kmapping and writing to the *original* underlying cached pfn, long
after that page no longer even belongs to the process running KVM?

I think I'd prefer to fix that instead of making excuses for it and
leaving it around as an example, but...

I think that *if* it's a standard page (not IOMEM) then we do hold a
reference on the page so it won't be *freed* and used by any other
process, and I think that even means it won't be swapped out?

So the only interesting failure modes are if the VMM explicitly mmaps
something else over the referenced page (don't do that then?) or if the
VMM is managing IOMEM pages explicitly and swaps out the page the guest
is using for steal time reporting?



[-- Attachment #2: smime.p7s --]
[-- Type: application/pkcs7-signature, Size: 5174 bytes --]

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [EXTERNAL] [PATCH] KVM: x86/xen: Fix runstate updates to be atomic when preempting vCPU
  2021-10-30  7:58         ` David Woodhouse
@ 2021-10-31  6:52           ` Paolo Bonzini
  2021-10-31  7:51             ` David Woodhouse
  2021-10-31  8:31             ` David Woodhouse
  0 siblings, 2 replies; 28+ messages in thread
From: Paolo Bonzini @ 2021-10-31  6:52 UTC (permalink / raw)
  To: David Woodhouse, kvm, Raslan, KarimAllah, Joao Martins, Ankur Arora
  Cc: jmattson, wanpengli, seanjc, vkuznets, mtosatti, joro

On 30/10/21 09:58, David Woodhouse wrote:
>> Absolutely!  The fixed version of kvm_map_gfn should not do any
>> map/unmap, it should do it eagerly on MMU notifier operations.
> Staring at this some more... what *currently* protects a
> gfn_to_pfn_cache when the page tables change — either because userspace
> mmaps something else over the same HVA, or because the underlying page
> is just swapped out and restored?

kvm_cache_gfn_to_pfn calls gfn_to_pfn_memslot, which pins the page.

Paolo


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [EXTERNAL] [PATCH] KVM: x86/xen: Fix runstate updates to be atomic when preempting vCPU
  2021-10-31  6:52           ` Paolo Bonzini
@ 2021-10-31  7:51             ` David Woodhouse
  2021-10-31  8:31             ` David Woodhouse
  1 sibling, 0 replies; 28+ messages in thread
From: David Woodhouse @ 2021-10-31  7:51 UTC (permalink / raw)
  To: Paolo Bonzini, kvm, Raslan, KarimAllah, Joao Martins, Ankur Arora
  Cc: jmattson, wanpengli, seanjc, vkuznets, mtosatti, joro



On 31 October 2021 06:52:33 GMT, Paolo Bonzini <pbonzini@redhat.com> wrote:
>On 30/10/21 09:58, David Woodhouse wrote:
>>> Absolutely!  The fixed version of kvm_map_gfn should not do any
>>> map/unmap, it should do it eagerly on MMU notifier operations.
>> Staring at this some more... what *currently* protects a
>> gfn_to_pfn_cache when the page tables change — either because userspace
>> mmaps something else over the same HVA, or because the underlying page
>> is just swapped out and restored?
>
>kvm_cache_gfn_to_pfn calls gfn_to_pfn_memslot, which pins the page.


It pins the underlying page but that doesn't guarantee that the page remains mapped at the HVA corresponding to that GFN, does it?

And I thought the whole point of the repeated map/unmap was *not* to pin the page, anyway?

-- 
Sent from my Android device with K-9 Mail. Please excuse my brevity.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [EXTERNAL] [PATCH] KVM: x86/xen: Fix runstate updates to be atomic when preempting vCPU
  2021-10-31  6:52           ` Paolo Bonzini
  2021-10-31  7:51             ` David Woodhouse
@ 2021-10-31  8:31             ` David Woodhouse
  2021-11-01  8:16               ` David Woodhouse
  1 sibling, 1 reply; 28+ messages in thread
From: David Woodhouse @ 2021-10-31  8:31 UTC (permalink / raw)
  To: Paolo Bonzini, kvm, Raslan, KarimAllah, Joao Martins, Ankur Arora
  Cc: jmattson, wanpengli, seanjc, vkuznets, mtosatti, joro

[-- Attachment #1: Type: text/plain, Size: 3415 bytes --]

On Sun, 2021-10-31 at 07:52 +0100, Paolo Bonzini wrote:
> On 30/10/21 09:58, David Woodhouse wrote:
> > > Absolutely!  The fixed version of kvm_map_gfn should not do any
> > > map/unmap, it should do it eagerly on MMU notifier operations.
> 
> > Staring at this some more... what *currently* protects a
> > gfn_to_pfn_cache when the page tables change — either because userspace
> > mmaps something else over the same HVA, or because the underlying page
> > is just swapped out and restored?
> 
> kvm_cache_gfn_to_pfn calls gfn_to_pfn_memslot, which pins the page.

Empirically, this breaks the test case in the series I sent out last
night, because the kernel is looking at the *wrong* shared info page
after userspace maps a new one at that HVA:

diff --git a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c
index c0a3b9cc2a86..5c41043d34d6 100644
--- a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c
+++ b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c
@@ -204,6 +204,11 @@ int main(int argc, char *argv[])
 				    SHINFO_REGION_GPA, SHINFO_REGION_SLOT, 2, 0);
 	virt_map(vm, SHINFO_REGION_GVA, SHINFO_REGION_GPA, 2);
 
+	struct shared_info *shinfo = addr_gpa2hva(vm, SHINFO_VADDR);
+
+	int zero_fd = open("/dev/zero", O_RDONLY);
+	TEST_ASSERT(zero_fd != -1, "Failed to open /dev/zero");
+
 	struct kvm_xen_hvm_config hvmc = {
 		.flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL,
 		.msr = XEN_HYPERCALL_MSR,
@@ -222,6 +227,9 @@ int main(int argc, char *argv[])
 	};
 	vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &ha);
 
+	void *m = mmap(shinfo, PAGE_SIZE, PROT_READ|PROT_WRITE, MAP_FIXED|MAP_PRIVATE, zero_fd, 0);
+	TEST_ASSERT(m == shinfo, "Failed to map /dev/zero over shared info");
+
 	struct kvm_xen_vcpu_attr vi = {
 		.type = KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO,
 		.u.gpa = VCPU_INFO_ADDR,
@@ -295,8 +303,6 @@ int main(int argc, char *argv[])
 		sigaction(SIGALRM, &sa, NULL);
 	}
 
-	struct shared_info *shinfo = addr_gpa2hva(vm, SHINFO_VADDR);
-
 	struct vcpu_info *vinfo = addr_gpa2hva(vm, VCPU_INFO_VADDR);
 	vinfo->evtchn_upcall_pending = 0;
 


And then this *fixes* it again, by spotting the unmap and invalidating
the cached mapping:

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 1a64ba5b9437..f7e94d8d21a6 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -25,6 +25,7 @@
 #include "kvm_emulate.h"
 #include "cpuid.h"
 #include "spte.h"
+#include "xen.h"
 
 #include <linux/kvm_host.h>
 #include <linux/types.h>
@@ -1588,6 +1589,20 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
 {
 	bool flush = false;
 
+	if (static_branch_unlikely(&kvm_xen_enabled.key) &&
+	    kvm->arch.xen.shinfo_set) {
+		printk("Invalidate %llx-%llx\n", range->start, range->end);
+		write_lock(&kvm->arch.xen.shinfo_lock);
+		if (kvm->arch.xen.shinfo_cache.gfn >= range->start &&
+		    kvm->arch.xen.shinfo_cache.gfn < range->end) {
+			kvm->arch.xen.shared_info = NULL;
+			/* We have to break the GFN to PFN cache too or it'll just
+			 * remap the old page anyway. XX: Do this on the xen.c side */
+			kvm->arch.xen.shinfo_cache.generation = -1;
+		}
+		write_unlock(&kvm->arch.xen.shinfo_lock);
+	}
+
 	if (kvm_memslots_have_rmaps(kvm))
 		flush = kvm_handle_gfn_range(kvm, range, kvm_unmap_rmapp);
 

[-- Attachment #2: smime.p7s --]
[-- Type: application/pkcs7-signature, Size: 5174 bytes --]

^ permalink raw reply related	[flat|nested] 28+ messages in thread

* Re: [EXTERNAL] [PATCH] KVM: x86/xen: Fix runstate updates to be atomic when preempting vCPU
  2021-10-31  8:31             ` David Woodhouse
@ 2021-11-01  8:16               ` David Woodhouse
  2021-11-01 14:18                   ` kernel test robot
  2021-11-01 20:35                   ` kernel test robot
  0 siblings, 2 replies; 28+ messages in thread
From: David Woodhouse @ 2021-11-01  8:16 UTC (permalink / raw)
  To: Paolo Bonzini, kvm, Raslan, KarimAllah, Joao Martins, Ankur Arora
  Cc: jmattson, wanpengli, seanjc, vkuznets, mtosatti, joro

[-- Attachment #1: Type: text/plain, Size: 9840 bytes --]

On Sun, 2021-10-31 at 08:31 +0000, David Woodhouse wrote:
> On Sun, 2021-10-31 at 07:52 +0100, Paolo Bonzini wrote:
> > On 30/10/21 09:58, David Woodhouse wrote:
> > > > Absolutely!  The fixed version of kvm_map_gfn should not do any
> > > > map/unmap, it should do it eagerly on MMU notifier operations.
> > 
> > > Staring at this some more... what *currently* protects a
> > > gfn_to_pfn_cache when the page tables change — either because userspace
> > > mmaps something else over the same HVA, or because the underlying page
> > > is just swapped out and restored?
> > 
> > kvm_cache_gfn_to_pfn calls gfn_to_pfn_memslot, which pins the page.
> 
> Empirically, this breaks the test case in the series I sent out last
> night, because the kernel is looking at the *wrong* shared info page
> after userspace maps a new one at that HVA...

The fun part now looks like this; if this is deemed sane I'll work up
something that fixes the steal time thing in a similar way. And maybe
turn it into a generic 'gfn_to_kva_cache'?

The full series, with the final patch showing how it gets used, is at
https://git.infradead.org/users/dwmw2/linux.git/shortlog/refs/heads/xen-evtchn
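
For context before the patch itself: the read side is expected to follow
the locking rules described in the commit message below, roughly as in
this sketch (not part of this patch; kvm_xen_do_set_evtchn() is a made-up
helper standing in for the real delivery code later in the branch):

static int kvm_xen_evtchn_deliver_fast(struct kvm *kvm, u32 port)
{
	void *shinfo;
	int ret = -EWOULDBLOCK;

	read_lock(&kvm->arch.xen.shinfo_lock);
	shinfo = kvm->arch.xen.shared_info;
	if (shinfo && shinfo != KVM_UNMAPPED_PAGE) {
		/* Kernel mapping is valid; safe to touch in atomic context */
		ret = kvm_xen_do_set_evtchn(kvm, shinfo, port);
	}
	read_unlock(&kvm->arch.xen.shinfo_lock);

	/*
	 * On -EWOULDBLOCK the caller retries from a context which can sleep,
	 * taking kvm->lock to refresh the mapping first.
	 */
	return ret;
}

The MMU notifier hook in the patch takes the same lock for write, so the
pointer can't be torn down between the check and the access.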

From a329c575aadc0b65bb7ebd97eaf9f9374508cea9 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw@amazon.co.uk>
Date: Sat, 30 Oct 2021 19:53:23 +0100
Subject: [PATCH 5/6] KVM: x86/xen: Maintain valid mapping of Xen shared_info
 page

In order to allow for event channel delivery, we would like to have a
kernel mapping of the shared_info page which can be accessed in atomic
context in the common case.

The gfn_to_pfn_cache only automatically handles invalidation when the
KVM memslots change; it doesn't handle a change in the userspace HVA
to host PFN mappings. So hook into the MMU notifiers to invalidate the
shared_info pointer on demand.

The shared_info can be accessed while holding the shinfo_lock, with a
slow path which takes the kvm->lock mutex to refresh the mapping.
I'd like to use RCU for the invalidation but I don't think we can
always sleep in the invalidate_range notifier. Having a true kernel
mapping of the page means that our access to it can be atomic anyway,
so holding a spinlock is OK.

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
---
 arch/x86/include/asm/kvm_host.h |  4 ++
 arch/x86/kvm/mmu/mmu.c          | 23 ++++++++++++
 arch/x86/kvm/xen.c              | 65 +++++++++++++++++++++++++++------
 include/linux/kvm_host.h        | 26 -------------
 include/linux/kvm_types.h       | 27 ++++++++++++++
 5 files changed, 107 insertions(+), 38 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 13f64654dfff..bb4868c3c06b 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1017,6 +1017,10 @@ struct kvm_xen {
 	bool long_mode;
 	u8 upcall_vector;
 	gfn_t shinfo_gfn;
+	rwlock_t shinfo_lock;
+	void *shared_info;
+	struct kvm_host_map shinfo_map;
+	struct gfn_to_pfn_cache shinfo_cache;
 };
 
 enum kvm_irqchip_mode {
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 0cc58901bf7a..429a4860d67a 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -25,6 +25,7 @@
 #include "kvm_emulate.h"
 #include "cpuid.h"
 #include "spte.h"
+#include "xen.h"
 
 #include <linux/kvm_host.h>
 #include <linux/types.h>
@@ -1588,6 +1589,28 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
 {
 	bool flush = false;
 
+	if (static_branch_unlikely(&kvm_xen_enabled.key)) {
+		write_lock(&kvm->arch.xen.shinfo_lock);
+
+		if (kvm->arch.xen.shared_info &&
+		    kvm->arch.xen.shinfo_gfn >= range->start &&
+		    kvm->arch.xen.shinfo_cache.gfn < range->end) {
+			/*
+			 * If kvm_xen_shared_info_init() had *finished* mapping the
+			 * page and assigned the pointer for real, then mark the page
+			 * dirty now instead of via the eventual cache teardown.
+			 */
+			if (kvm->arch.xen.shared_info != KVM_UNMAPPED_PAGE) {
+				kvm_set_pfn_dirty(kvm->arch.xen.shinfo_cache.pfn);
+				kvm->arch.xen.shinfo_cache.dirty = false;
+			}
+
+			kvm->arch.xen.shared_info = NULL;
+		}
+
+		write_unlock(&kvm->arch.xen.shinfo_lock);
+	}
+
 	if (kvm_memslots_have_rmaps(kvm))
 		flush = kvm_handle_gfn_range(kvm, range, kvm_unmap_rmapp);
 
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index 565da9c3853b..9d143bc7d769 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -21,18 +21,59 @@
 
 DEFINE_STATIC_KEY_DEFERRED_FALSE(kvm_xen_enabled, HZ);
 
-static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn)
+static void kvm_xen_shared_info_unmap(struct kvm *kvm)
+{
+	bool was_valid = false;
+
+	write_lock(&kvm->arch.xen.shinfo_lock);
+	if (kvm->arch.xen.shared_info)
+		was_valid = true;
+	kvm->arch.xen.shared_info = NULL;
+	kvm->arch.xen.shinfo_gfn = GPA_INVALID;
+	write_unlock(&kvm->arch.xen.shinfo_lock);
+
+	if (kvm_vcpu_mapped(&kvm->arch.xen.shinfo_map)) {
+		kvm_unmap_gfn(kvm, &kvm->arch.xen.shinfo_map,
+			      &kvm->arch.xen.shinfo_cache, was_valid, false);
+
+		/* If the MMU notifier invalidated it, the gfn_to_pfn_cache
+		 * may be invalid. Force it to notice */
+		if (!was_valid)
+			kvm->arch.xen.shinfo_cache.generation = -1;
+	}
+}
+
+static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn, bool update_clock)
 {
 	gpa_t gpa = gfn_to_gpa(gfn);
 	int wc_ofs, sec_hi_ofs;
 	int ret = 0;
 	int idx = srcu_read_lock(&kvm->srcu);
 
-	if (kvm_is_error_hva(gfn_to_hva(kvm, gfn))) {
-		ret = -EFAULT;
+	kvm_xen_shared_info_unmap(kvm);
+
+	if (gfn == GPA_INVALID)
 		goto out;
-	}
+
+	/* Let the MMU notifier know that we are in the process of mapping it */
+	write_lock(&kvm->arch.xen.shinfo_lock);
+	kvm->arch.xen.shared_info = KVM_UNMAPPED_PAGE;
 	kvm->arch.xen.shinfo_gfn = gfn;
+	write_unlock(&kvm->arch.xen.shinfo_lock);
+
+	ret = kvm_map_gfn(kvm, gfn, &kvm->arch.xen.shinfo_map,
+			  &kvm->arch.xen.shinfo_cache, false);
+	if (ret)
+		goto out;
+
+	write_lock(&kvm->arch.xen.shinfo_lock);
+	/* Unless the MMU notifier already invalidated it */
+	if (kvm->arch.xen.shared_info == KVM_UNMAPPED_PAGE)
+		kvm->arch.xen.shared_info = kvm->arch.xen.shinfo_map.hva;
+	write_unlock(&kvm->arch.xen.shinfo_lock);
+
+	if (!update_clock)
+		goto out;
 
 	/* Paranoia checks on the 32-bit struct layout */
 	BUILD_BUG_ON(offsetof(struct compat_shared_info, wc) != 0x900);
@@ -260,15 +301,9 @@ int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
 		break;
 
 	case KVM_XEN_ATTR_TYPE_SHARED_INFO:
-		if (data->u.shared_info.gfn == GPA_INVALID) {
-			kvm->arch.xen.shinfo_gfn = GPA_INVALID;
-			r = 0;
-			break;
-		}
-		r = kvm_xen_shared_info_init(kvm, data->u.shared_info.gfn);
+		r = kvm_xen_shared_info_init(kvm, data->u.shared_info.gfn, true);
 		break;
 
-
 	case KVM_XEN_ATTR_TYPE_UPCALL_VECTOR:
 		if (data->u.vector && data->u.vector < 0x10)
 			r = -EINVAL;
@@ -661,11 +696,17 @@ int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc)
 
 void kvm_xen_init_vm(struct kvm *kvm)
 {
-	kvm->arch.xen.shinfo_gfn = GPA_INVALID;
+	rwlock_init(&kvm->arch.xen.shinfo_lock);
 }
 
 void kvm_xen_destroy_vm(struct kvm *kvm)
 {
+	struct gfn_to_pfn_cache *cache = &kvm->arch.xen.shinfo_cache;
+
+	kvm_xen_shared_info_unmap(kvm);
+
+	kvm_release_pfn(cache->pfn, cache->dirty, cache);
+
 	if (kvm->arch.xen_hvm_config.msr)
 		static_branch_slow_dec_deferred(&kvm_xen_enabled);
 }
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 749cdc77fc4e..f0012d128aa5 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -251,32 +251,6 @@ enum {
 	READING_SHADOW_PAGE_TABLES,
 };
 
-#define KVM_UNMAPPED_PAGE	((void *) 0x500 + POISON_POINTER_DELTA)
-
-struct kvm_host_map {
-	/*
-	 * Only valid if the 'pfn' is managed by the host kernel (i.e. There is
-	 * a 'struct page' for it. When using mem= kernel parameter some memory
-	 * can be used as guest memory but they are not managed by host
-	 * kernel).
-	 * If 'pfn' is not managed by the host kernel, this field is
-	 * initialized to KVM_UNMAPPED_PAGE.
-	 */
-	struct page *page;
-	void *hva;
-	kvm_pfn_t pfn;
-	kvm_pfn_t gfn;
-};
-
-/*
- * Used to check if the mapping is valid or not. Never use 'kvm_host_map'
- * directly to check for that.
- */
-static inline bool kvm_vcpu_mapped(struct kvm_host_map *map)
-{
-	return !!map->hva;
-}
-
 static inline bool kvm_vcpu_can_poll(ktime_t cur, ktime_t stop)
 {
 	return single_task_running() && !need_resched() && ktime_before(cur, stop);
diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h
index 2237abb93ccd..2092f4ca156b 100644
--- a/include/linux/kvm_types.h
+++ b/include/linux/kvm_types.h
@@ -60,6 +60,33 @@ struct gfn_to_pfn_cache {
 	bool dirty;
 };
 
+#define KVM_UNMAPPED_PAGE	((void *) 0x500 + POISON_POINTER_DELTA)
+
+struct kvm_host_map {
+	/*
+	 * Only valid if the 'pfn' is managed by the host kernel (i.e. There is
+	 * a 'struct page' for it. When using mem= kernel parameter some memory
+	 * can be used as guest memory but they are not managed by host
+	 * kernel).
+	 * If 'pfn' is not managed by the host kernel, this field is
+	 * initialized to KVM_UNMAPPED_PAGE.
+	 */
+	struct page *page;
+	void *hva;
+	kvm_pfn_t pfn;
+	kvm_pfn_t gfn;
+};
+
+/*
+ * Used to check if the mapping is valid or not. Never use 'kvm_host_map'
+ * directly to check for that.
+ */
+static inline bool kvm_vcpu_mapped(struct kvm_host_map *map)
+{
+	return !!map->hva;
+}
+
+
 #ifdef KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE
 /*
  * Memory caches are used to preallocate memory ahead of various MMU flows,
-- 
2.31.1


[-- Attachment #2: smime.p7s --]
[-- Type: application/pkcs7-signature, Size: 5174 bytes --]

^ permalink raw reply related	[flat|nested] 28+ messages in thread

* Re: [PATCH 5/6] KVM: x86/xen: Maintain valid mapping of Xen shared_info page
  2021-11-01  8:16               ` David Woodhouse
@ 2021-11-01 14:18                   ` kernel test robot
  2021-11-01 20:35                   ` kernel test robot
  1 sibling, 0 replies; 28+ messages in thread
From: kernel test robot @ 2021-11-01 14:18 UTC (permalink / raw)
  To: David Woodhouse, Paolo Bonzini, kvm, Raslan, KarimAllah,
	Joao Martins, Ankur Arora
  Cc: llvm, kbuild-all, jmattson, wanpengli, seanjc, vkuznets, mtosatti

[-- Attachment #1: Type: text/plain, Size: 3415 bytes --]

Hi David,

I love your patch! Yet something to improve:

[auto build test ERROR on kvm/queue]
[also build test ERROR on mst-vhost/linux-next v5.15 next-20211101]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/David-Woodhouse/KVM-x86-xen-Maintain-valid-mapping-of-Xen-shared_inf/20211101-161824
base:   https://git.kernel.org/pub/scm/virt/kvm/kvm.git queue
config: x86_64-randconfig-a003-20211101 (attached as .config)
compiler: clang version 14.0.0 (https://github.com/llvm/llvm-project 82ed106567063ea269c6d5669278b733e173a42f)
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/0day-ci/linux/commit/5faac25097318b72a65ed637b6a46ec92353cadb
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review David-Woodhouse/KVM-x86-xen-Maintain-valid-mapping-of-Xen-shared_inf/20211101-161824
        git checkout 5faac25097318b72a65ed637b6a46ec92353cadb
        # save the attached .config to linux build tree
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 ARCH=x86_64 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

>> arch/x86/kvm/xen.c:36:17: error: incompatible pointer types passing 'struct kvm *' to parameter of type 'struct kvm_vcpu *' [-Werror,-Wincompatible-pointer-types]
                   kvm_unmap_gfn(kvm, &kvm->arch.xen.shinfo_map,
                                 ^~~
   include/linux/kvm_host.h:923:36: note: passing argument to parameter 'vcpu' here
   int kvm_unmap_gfn(struct kvm_vcpu *vcpu, struct kvm_host_map *map,
                                      ^
   arch/x86/kvm/xen.c:64:20: error: incompatible pointer types passing 'struct kvm *' to parameter of type 'struct kvm_vcpu *' [-Werror,-Wincompatible-pointer-types]
           ret = kvm_map_gfn(kvm, gfn, &kvm->arch.xen.shinfo_map,
                             ^~~
   include/linux/kvm_host.h:919:34: note: passing argument to parameter 'vcpu' here
   int kvm_map_gfn(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map,
                                    ^
   2 errors generated.


vim +36 arch/x86/kvm/xen.c

    23	
    24	static void kvm_xen_shared_info_unmap(struct kvm *kvm)
    25	{
    26		bool was_valid = false;
    27	
    28		write_lock(&kvm->arch.xen.shinfo_lock);
    29		if (kvm->arch.xen.shared_info)
    30			was_valid = true;
    31		kvm->arch.xen.shared_info = NULL;
    32		kvm->arch.xen.shinfo_gfn = GPA_INVALID;
    33		write_unlock(&kvm->arch.xen.shinfo_lock);
    34	
    35		if (kvm_vcpu_mapped(&kvm->arch.xen.shinfo_map)) {
  > 36			kvm_unmap_gfn(kvm, &kvm->arch.xen.shinfo_map,
    37				      &kvm->arch.xen.shinfo_cache, was_valid, false);
    38	
    39			/* If the MMU notifier invalidated it, the gfn_to_pfn_cache
    40			 * may be invalid. Force it to notice */
    41			if (!was_valid)
    42				kvm->arch.xen.shinfo_cache.generation = -1;
    43		}
    44	}
    45	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 31258 bytes --]

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 5/6] KVM: x86/xen: Maintain valid mapping of Xen shared_info page
  2021-11-01  8:16               ` David Woodhouse
@ 2021-11-01 20:35                   ` kernel test robot
  2021-11-01 20:35                   ` kernel test robot
  1 sibling, 0 replies; 28+ messages in thread
From: kernel test robot @ 2021-11-01 20:35 UTC (permalink / raw)
  To: David Woodhouse, Paolo Bonzini, kvm, Raslan, KarimAllah,
	Joao Martins, Ankur Arora
  Cc: llvm, kbuild-all, jmattson, wanpengli, seanjc, vkuznets, mtosatti

[-- Attachment #1: Type: text/plain, Size: 4303 bytes --]

Hi David,

I love your patch! Yet something to improve:

[auto build test ERROR on kvm/queue]
[also build test ERROR on mst-vhost/linux-next v5.15 next-20211101]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/David-Woodhouse/KVM-x86-xen-Maintain-valid-mapping-of-Xen-shared_inf/20211101-161824
base:   https://git.kernel.org/pub/scm/virt/kvm/kvm.git queue
config: i386-randconfig-a002-20211101 (attached as .config)
compiler: clang version 14.0.0 (https://github.com/llvm/llvm-project 264d3b6d4e08401c5b50a85bd76e80b3461d77e6)
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/0day-ci/linux/commit/5faac25097318b72a65ed637b6a46ec92353cadb
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review David-Woodhouse/KVM-x86-xen-Maintain-valid-mapping-of-Xen-shared_inf/20211101-161824
        git checkout 5faac25097318b72a65ed637b6a46ec92353cadb
        # save the attached .config to linux build tree
        mkdir build_dir
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=i386 SHELL=/bin/bash

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

>> arch/x86/kvm/mmu/mmu.c:1582:30: error: use of undeclared identifier 'kvm_xen_enabled'
           if (static_branch_unlikely(&kvm_xen_enabled.key)) {
                                       ^
>> arch/x86/kvm/mmu/mmu.c:1582:30: error: use of undeclared identifier 'kvm_xen_enabled'
>> arch/x86/kvm/mmu/mmu.c:1582:30: error: use of undeclared identifier 'kvm_xen_enabled'
>> arch/x86/kvm/mmu/mmu.c:1582:30: error: use of undeclared identifier 'kvm_xen_enabled'
>> arch/x86/kvm/mmu/mmu.c:1582:6: error: invalid argument type 'void' to unary expression
           if (static_branch_unlikely(&kvm_xen_enabled.key)) {
               ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   include/linux/jump_label.h:508:35: note: expanded from macro 'static_branch_unlikely'
   #define static_branch_unlikely(x)       unlikely_notrace(static_key_enabled(&(x)->key))
                                           ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   include/linux/compiler.h:80:30: note: expanded from macro 'unlikely_notrace'
   # define unlikely_notrace(x)    unlikely(x)
                                   ^~~~~~~~~~~
   include/linux/compiler.h:78:40: note: expanded from macro 'unlikely'
   # define unlikely(x)    __builtin_expect(!!(x), 0)
                                             ^~~~
   5 errors generated.


vim +/kvm_xen_enabled +1582 arch/x86/kvm/mmu/mmu.c

  1577	
  1578	bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
  1579	{
  1580		bool flush = false;
  1581	
> 1582		if (static_branch_unlikely(&kvm_xen_enabled.key)) {
  1583			write_lock(&kvm->arch.xen.shinfo_lock);
  1584	
  1585			if (kvm->arch.xen.shared_info &&
  1586			    kvm->arch.xen.shinfo_gfn >= range->start &&
  1587			    kvm->arch.xen.shinfo_cache.gfn < range->end) {
  1588				/*
  1589				 * If kvm_xen_shared_info_init() had *finished* mapping the
  1590				 * page and assigned the pointer for real, then mark the page
  1591				 * dirty now instead of via the eventual cache teardown.
  1592				 */
  1593				if (kvm->arch.xen.shared_info != KVM_UNMAPPED_PAGE) {
  1594					kvm_set_pfn_dirty(kvm->arch.xen.shinfo_cache.pfn);
  1595					kvm->arch.xen.shinfo_cache.dirty = false;
  1596				}
  1597	
  1598				kvm->arch.xen.shared_info = NULL;
  1599			}
  1600	
  1601			write_unlock(&kvm->arch.xen.shinfo_lock);
  1602		}
  1603	
  1604		if (kvm_memslots_have_rmaps(kvm))
  1605			flush = kvm_handle_gfn_range(kvm, range, kvm_unmap_rmapp);
  1606	
  1607		if (is_tdp_mmu_enabled(kvm))
  1608			flush |= kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush);
  1609	
  1610		return flush;
  1611	}
  1612	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 36389 bytes --]

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 5/6] KVM: x86/xen: Maintain valid mapping of Xen shared_inf=
  2021-11-01 20:35                   ` kernel test robot
@ 2021-11-01 20:43                     ` David Woodhouse
  -1 siblings, 0 replies; 28+ messages in thread
From: David Woodhouse @ 2021-11-01 20:43 UTC (permalink / raw)
  To: kernel test robot, Paolo Bonzini, kvm, Raslan, KarimAllah,
	Joao Martins, Ankur Arora
  Cc: llvm, kbuild-all, jmattson, wanpengli, seanjc, vkuznets, mtosatti

[-- Attachment #1: Type: text/plain, Size: 1643 bytes --]

On Tue, 2021-11-02 at 04:35 +0800, kernel test robot wrote:
> 
> All errors (new ones prefixed by >>):
> 
> >> arch/x86/kvm/mmu/mmu.c:1582:30: error: use of undeclared identifier 'kvm_xen_enabled'
>            if (static_branch_unlikely(&kvm_xen_enabled.key)) {
>                                        ^
> >> arch/x86/kvm/mmu/mmu.c:1582:30: error: use of undeclared identifier 'kvm_xen_enabled'
> >> arch/x86/kvm/mmu/mmu.c:1582:30: error: use of undeclared identifier 'kvm_xen_enabled'
> >> arch/x86/kvm/mmu/mmu.c:1582:30: error: use of undeclared identifier 'kvm_xen_enabled'
> >> arch/x86/kvm/mmu/mmu.c:1582:6: error: invalid argument type 'void' to unary expression
>            if (static_branch_unlikely(&kvm_xen_enabled.key)) {
>                ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
>    include/linux/jump_label.h:508:35: note: expanded from macro 'static_branch_unlikely'
>    #define static_branch_unlikely(x)       unlikely_notrace(static_key_enabled(&(x)->key))
>                                            ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
>    include/linux/compiler.h:80:30: note: expanded from macro 'unlikely_notrace'
>    # define unlikely_notrace(x)    unlikely(x)
>                                    ^~~~~~~~~~~
>    include/linux/compiler.h:78:40: note: expanded from macro 'unlikely'
>    # define unlikely(x)    __builtin_expect(!!(x), 0)
>                                              ^~~~
>    5 errors generated.


Oops, missing #ifdef CONFIG_KVM_XEN around that one. Fixed in 
https://git.infradead.org/users/dwmw2/linux.git/shortlog/refs/heads/xen-evtchn
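
For reference, the shape of the fix being described is roughly as below. This
is a sketch only: the function body and the Xen field names are taken from
the snippet in the build log above, and the actual commit on the xen-evtchn
branch may differ. The point is simply that kvm_xen_enabled only exists when
Xen support is built in, so the reference to it has to be compiled out for
CONFIG_KVM_XEN=n configs such as the i386 randconfig that failed:

bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
	bool flush = false;

#ifdef CONFIG_KVM_XEN
	/* Guard the shared_info invalidation hook so that builds without
	 * Xen support never see kvm_xen_enabled. */
	if (static_branch_unlikely(&kvm_xen_enabled.key)) {
		write_lock(&kvm->arch.xen.shinfo_lock);

		if (kvm->arch.xen.shared_info &&
		    kvm->arch.xen.shinfo_gfn >= range->start &&
		    kvm->arch.xen.shinfo_cache.gfn < range->end) {
			/* If the mapping had been completed, mark the page
			 * dirty now instead of via the cache teardown. */
			if (kvm->arch.xen.shared_info != KVM_UNMAPPED_PAGE) {
				kvm_set_pfn_dirty(kvm->arch.xen.shinfo_cache.pfn);
				kvm->arch.xen.shinfo_cache.dirty = false;
			}

			kvm->arch.xen.shared_info = NULL;
		}

		write_unlock(&kvm->arch.xen.shinfo_lock);
	}
#endif

	if (kvm_memslots_have_rmaps(kvm))
		flush = kvm_handle_gfn_range(kvm, range, kvm_unmap_rmapp);

	if (is_tdp_mmu_enabled(kvm))
		flush |= kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush);

	return flush;
}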

Thanks.

[-- Attachment #2: smime.p7s --]
[-- Type: application/pkcs7-signature, Size: 5174 bytes --]

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v2] KVM: x86/xen: Fix runstate updates to be atomic when preempting vCPU
  2021-10-25 13:29 ` [PATCH v2] " David Woodhouse
@ 2022-02-09 14:30   ` David Woodhouse
  0 siblings, 0 replies; 28+ messages in thread
From: David Woodhouse @ 2022-02-09 14:30 UTC (permalink / raw)
  To: kvm; +Cc: jmattson, pbonzini, wanpengli, seanjc, vkuznets, mtosatti, joro

[-- Attachment #1: Type: text/plain, Size: 1833 bytes --]

On Mon, 2021-10-25 at 14:29 +0100, David Woodhouse wrote:
> From: David Woodhouse <dwmw@amazon.co.uk>
> 
> There are circumstances when kvm_xen_update_runstate_guest() should not
> sleep because it ends up being called from __schedule() when the vCPU
> is preempted:
> 
> [  222.830825]  kvm_xen_update_runstate_guest+0x24/0x100
> [  222.830878]  kvm_arch_vcpu_put+0x14c/0x200
> [  222.830920]  kvm_sched_out+0x30/0x40
> [  222.830960]  __schedule+0x55c/0x9f0
> 
> To handle this, make it use the same trick as __kvm_xen_has_interrupt(),
> of using the hva from the gfn_to_hva_cache directly. Then it can use
> pagefault_disable() around the accesses and just bail out if the page
> is absent (which is unlikely).
> 
> I almost switched to using a gfn_to_pfn_cache here and bailing out if
> kvm_map_gfn() fails, like kvm_steal_time_set_preempted() does — but on
> closer inspection it looks like kvm_map_gfn() will *always* fail in
> atomic context for a page in IOMEM, which means it will silently fail
> to make the update every single time for such guests, AFAICT. So I
> didn't do it that way after all. And will probably fix that one too.
> 
> Cc: stable@vger.kernel.org
> Fixes: 30b5c851af79 ("KVM: x86/xen: Add support for vCPU runstate information")
> Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
> ---
> v2: Mark the page dirty after writing to it, add stable tag.

Hm, this got dropped I think.

I'm now working on other event channel stuff and I spotted it because
it's causing conflicts on backports to my internal tree (where it
didn't get dropped).

I am also working on converting to gfn_to_pfn_cache; the commit comment
above predates me actually *fixing* that. But this patch should
probably go in as-is for backporting to stable, and the conversions can
come on top.
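
For anyone carrying this in the meantime, the pattern described in the quoted
commit message looks roughly like the sketch below. This is illustrative only,
not the actual patch: runstate_update_atomic_sketch() is a made-up helper
name, the real code writes the whole runstate struct (state, entry time and
the times array) rather than just the state field, and it assumes the cached
hva was validated by kvm_gfn_to_hva_cache_init() when the runstate area was
registered:

static void runstate_update_atomic_sketch(struct kvm_vcpu *v, int state)
{
	struct gfn_to_hva_cache *ghc = &v->arch.xen.runstate_cache;
	/* Only the preemption path (RUNSTATE_runnable) is atomic context. */
	bool atomic = (state == RUNSTATE_runnable);
	int __user *user_state = (int __user *)ghc->hva;

	if (atomic)
		pagefault_disable();

	/*
	 * Use the hva from the gfn_to_hva_cache directly; with page faults
	 * disabled this cannot sleep, and if the page happens to be absent
	 * the write fails and the update is simply skipped.
	 */
	if (!__put_user(state, user_state))
		/* v2: mark the page dirty after writing to it. */
		mark_page_dirty_in_slot(v->kvm, ghc->memslot,
					ghc->gpa >> PAGE_SHIFT);

	if (atomic)
		pagefault_enable();
}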

[-- Attachment #2: smime.p7s --]
[-- Type: application/pkcs7-signature, Size: 5965 bytes --]

^ permalink raw reply	[flat|nested] 28+ messages in thread

end of thread, other threads:[~2022-02-09 14:30 UTC | newest]

Thread overview: 28+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-10-23 19:47 [PATCH] KVM: x86/xen: Fix runstate updates to be atomic when preempting vCPU Woodhouse, David
2021-10-24 11:15 ` David Woodhouse
2021-10-25 10:23 ` Paolo Bonzini
2021-10-25 10:39   ` [EXTERNAL] " David Woodhouse
2021-10-25 12:12     ` Paolo Bonzini
2021-10-25 12:19     ` David Woodhouse
2021-10-25 12:22       ` Paolo Bonzini
2021-10-25 13:13         ` David Woodhouse
2021-10-29 10:48           ` David Woodhouse
2021-10-30  7:58         ` David Woodhouse
2021-10-31  6:52           ` Paolo Bonzini
2021-10-31  7:51             ` David Woodhouse
2021-10-31  8:31             ` David Woodhouse
2021-11-01  8:16               ` David Woodhouse
2021-11-01 14:18                 ` [PATCH 5/6] KVM: x86/xen: Maintain valid mapping of Xen shared_inf= kernel test robot
2021-11-01 14:18                   ` kernel test robot
2021-11-01 20:35                 ` kernel test robot
2021-11-01 20:35                   ` kernel test robot
2021-11-01 20:43                   ` David Woodhouse
2021-11-01 20:43                     ` David Woodhouse
2021-10-28 22:22       ` [EXTERNAL] [PATCH] KVM: x86/xen: Fix runstate updates to be atomic when preempting vCPU David Woodhouse
2021-10-29 11:31         ` Joao Martins
2021-10-29 11:56           ` David Woodhouse
2021-10-29 17:10             ` David Woodhouse
2021-10-25 11:43 ` Paolo Bonzini
2021-10-25 13:30   ` Woodhouse, David
2021-10-25 13:29 ` [PATCH v2] " David Woodhouse
2022-02-09 14:30   ` David Woodhouse

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.