On Mon, 2021-10-25 at 13:19 +0100, David Woodhouse wrote:
> On Mon, 2021-10-25 at 11:39 +0100, David Woodhouse wrote:
> > > One possible solution (which I even have unfinished patches for) is to
> > > put all the gfn_to_pfn_caches on a list, and refresh them when the MMU
> > > notifier receives an invalidation.
> >
> > For this use case I'm not even sure why I'd *want* to cache the PFN and
> > explicitly kmap/memremap it, when surely by *definition* there's a
> > perfectly serviceable HVA which already points to it?
>
> That's indeed true for *this* use case, but my *next* use case is
> actually implementing the event channel delivery.
>
> What we have in-kernel already is everything we absolutely *need* in
> order to host Xen guests, but I really do want to fix the fact that
> even IPIs and timers are bouncing up through userspace.

Here's a completely untested attempt, in which all the complexity stems
from the fact that I can't just pin the pages as João and Ankur's
original did.

It adds a new KVM_IRQ_ROUTING_XEN_EVTCHN type with an ABI that allows
us to add FIFO event channels later, but for now only 2-level event
channels are supported.

In kvm_xen_set_evtchn() I currently use kvm_map_gfn() *without* a cache
at all, but I'll work something out for that. I think I can use a
gfn_to_hva_cache (like the one removed in commit 319afe685) and, in the
rare case that it's invalid, take kvm->lock to revalidate it.

It sets the bit in the global shared info but doesn't touch the target
vCPU's vcpu_info; instead it sets a bit in an *in-kernel* shadow of the
target's evtchn_pending_sel word, and kicks the vCPU. That shadow is
actually synced to the guest's vcpu_info struct in
kvm_xen_has_interrupt(). There's a little bit of fun asm there to set
the bits in the userspace struct and then clear the same set of bits in
the kernel shadow *if* the first op didn't fault. Or such is the
intent; I didn't hook up a test yet.

As things stand, I should be able to use this for delivery of PIRQs
from my VMM, where things like passed-through PCI MSIs get turned into
Xen event channels. As well as for KVM unit tests, of course.

The plan is then to hook up IPIs and timers, again based on the Oracle
code from before, but using eventfds for the actual evtchn delivery.
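
For illustration, here's roughly how I'd expect the VMM side to use the
new routing type: install a GSI route pointing at the event channel port
with KVM_SET_GSI_ROUTING, then bind an eventfd to that GSI with KVM_IRQFD
so the event can be raised without a round trip through userspace. This
snippet is just as untested as the patch, the vm_fd/GSI/port numbers are
made up, and note that KVM_SET_GSI_ROUTING replaces the whole routing
table, so a real VMM would include its existing entries too:

/* Hypothetical VMM-side sketch; needs the uapi additions from the patch below. */
#include <errno.h>
#include <stdlib.h>
#include <sys/eventfd.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

#define EVTCHN_GSI	32	/* example GSI number */
#define EVTCHN_PORT	3	/* example event channel port */

static int route_evtchn(int vm_fd, int *evtchn_fd)
{
	struct kvm_irq_routing *r;
	struct kvm_irqfd irqfd = { .gsi = EVTCHN_GSI };
	int ret;

	/* One entry: GSI 32 -> 2-level event channel port 3 on vCPU 0 */
	r = calloc(1, sizeof(*r) + sizeof(r->entries[0]));
	if (!r)
		return -ENOMEM;
	r->nr = 1;
	r->entries[0].gsi = EVTCHN_GSI;
	r->entries[0].type = KVM_IRQ_ROUTING_XEN_EVTCHN;
	r->entries[0].u.xen_evtchn.port = EVTCHN_PORT;
	r->entries[0].u.xen_evtchn.vcpu = 0;
	r->entries[0].u.xen_evtchn.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;

	ret = ioctl(vm_fd, KVM_SET_GSI_ROUTING, r);
	free(r);
	if (ret)
		return ret;

	/* Writing to this eventfd should then raise the port entirely in-kernel */
	*evtchn_fd = eventfd(0, EFD_CLOEXEC);
	if (*evtchn_fd < 0)
		return -errno;

	irqfd.fd = *evtchn_fd;
	return ioctl(vm_fd, KVM_IRQFD, &irqfd);
}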
From be4b79e54ed07bbd2e4310a6da9e990efa6fbc6e Mon Sep 17 00:00:00 2001
From: David Woodhouse
Date: Thu, 28 Oct 2021 23:10:31 +0100
Subject: [PATCH] KVM: x86/xen: First attempt at KVM_IRQ_ROUTING_XEN_EVTCHN

Signed-off-by: David Woodhouse
---
 arch/x86/include/asm/kvm_host.h |   1 +
 arch/x86/kvm/irq_comm.c         |  12 +++
 arch/x86/kvm/xen.c              | 176 +++++++++++++++++++++++++++++++-
 arch/x86/kvm/xen.h              |   6 ++
 include/linux/kvm_host.h        |   7 ++
 include/uapi/linux/kvm.h        |  10 ++
 6 files changed, 207 insertions(+), 5 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 70771376e246..e1a4521ae838 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -606,6 +606,7 @@ struct kvm_vcpu_xen {
 	u64 last_steal;
 	u64 runstate_entry_time;
 	u64 runstate_times[4];
+	unsigned long evtchn_pending_sel;
 };
 
 struct kvm_vcpu_arch {
diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c
index d5b72a08e566..6894f9a369f2 100644
--- a/arch/x86/kvm/irq_comm.c
+++ b/arch/x86/kvm/irq_comm.c
@@ -24,6 +24,7 @@
 
 #include "hyperv.h"
 #include "x86.h"
+#include "xen.h"
 
 static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e,
 			   struct kvm *kvm, int irq_source_id, int level,
@@ -175,6 +176,13 @@ int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e,
 			return r;
 		break;
 
+#ifdef CONFIG_KVM_XEN
+	case KVM_IRQ_ROUTING_XEN_EVTCHN:
+		if (!level)
+			return -1;
+
+		return kvm_xen_set_evtchn(e, kvm, true);
+#endif
 	default:
 		break;
 	}
@@ -310,6 +318,10 @@ int kvm_set_routing_entry(struct kvm *kvm,
 		e->hv_sint.vcpu = ue->u.hv_sint.vcpu;
 		e->hv_sint.sint = ue->u.hv_sint.sint;
 		break;
+#ifdef CONFIG_KVM_XEN
+	case KVM_IRQ_ROUTING_XEN_EVTCHN:
+		return kvm_xen_setup_evtchn(kvm, e, ue);
+#endif
 	default:
 		return -EINVAL;
 	}
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index c4bca001a7c9..bff5c458af96 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -207,6 +207,8 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
 
 int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
 {
+	unsigned long evtchn_pending_sel = READ_ONCE(v->arch.xen.evtchn_pending_sel);
+	bool atomic = in_atomic() || !task_is_running(current);
 	int err;
 	u8 rc = 0;
 
@@ -216,6 +218,9 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
 	 */
 	struct gfn_to_hva_cache *ghc = &v->arch.xen.vcpu_info_cache;
 	struct kvm_memslots *slots = kvm_memslots(v->kvm);
+	bool ghc_valid = slots->generation == ghc->generation &&
+		!kvm_is_error_hva(ghc->hva) && ghc->memslot;
+
 	unsigned int offset = offsetof(struct vcpu_info, evtchn_upcall_pending);
 
 	/* No need for compat handling here */
@@ -231,8 +236,7 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
 	 * cache in kvm_read_guest_offset_cached(), but just uses
 	 * __get_user() instead. And falls back to the slow path.
 	 */
-	if (likely(slots->generation == ghc->generation &&
-		   !kvm_is_error_hva(ghc->hva) && ghc->memslot)) {
+	if (!evtchn_pending_sel && ghc_valid) {
 		/* Fast path */
 		pagefault_disable();
 		err = __get_user(rc, (u8 __user *)ghc->hva + offset);
@@ -251,12 +255,72 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
 	 * and we'll end up getting called again from a context where we *can*
 	 * fault in the page and wait for it.
 	 */
-	if (in_atomic() || !task_is_running(current))
+	if (atomic)
 		return 1;
 
-	kvm_read_guest_offset_cached(v->kvm, ghc, &rc, offset,
-				     sizeof(rc));
+	if (!ghc_valid) {
+		err = kvm_gfn_to_hva_cache_init(v->kvm, ghc, ghc->gpa, ghc->len);
+		if (err && !ghc->memslot) {
+			/*
+			 * If this failed, userspace has screwed up the
+			 * vcpu_info mapping. No interrupts for you.
+			 */
+			return 0;
+		}
+	}
+
+	/*
+	 * Now we have a valid (protected by srcu) userspace HVA in
+	 * ghc->hva which points to the struct vcpu_info. If there
+	 * are any bits in the in-kernel evtchn_pending_sel then
+	 * we need to write those to the guest vcpu_info and set
+	 * its evtchn_upcall_pending flag. If there aren't any bits
+	 * to add, we only want to *check* evtchn_upcall_pending.
+	 */
+	if (evtchn_pending_sel) {
+		if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode) {
+			struct vcpu_info __user *vi = (void *)ghc->hva;
+
+			/* Attempt to set the evtchn_pending_sel bits in the
+			 * guest, and if that succeeds then clear the same
+			 * bits in the in-kernel version. */
+			asm volatile("1:\t" LOCK_PREFIX "orq %0, %1\n"
+				     "\tnotq %0\n"
+				     "\t" LOCK_PREFIX "andq %0, %2\n"
+				     "2:\n"
+				     "\t.section .fixup,\"ax\"\n"
+				     "3:\tjmp\t2b\n"
+				     "\t.previous\n"
+				     _ASM_EXTABLE_UA(1b, 3b)
+				     : "=r" (evtchn_pending_sel),
+				       "+m" (vi->evtchn_pending_sel),
+				       "+m" (v->arch.xen.evtchn_pending_sel)
+				     : "0" (evtchn_pending_sel));
+		} else {
+			struct compat_vcpu_info __user *vi = (void *)ghc->hva;
+			u32 evtchn_pending_sel32 = evtchn_pending_sel;
+
+			/* Attempt to set the evtchn_pending_sel bits in the
+			 * guest, and if that succeeds then clear the same
+			 * bits in the in-kernel version. */
+			asm volatile("1:\t" LOCK_PREFIX "orl %0, %1\n"
+				     "\tnotl %0\n"
+				     "\t" LOCK_PREFIX "andl %0, %2\n"
+				     "2:\n"
+				     "\t.section .fixup,\"ax\"\n"
+				     "3:\tjmp\t2b\n"
+				     "\t.previous\n"
+				     _ASM_EXTABLE_UA(1b, 3b)
+				     : "=r" (evtchn_pending_sel32),
+				       "+m" (vi->evtchn_pending_sel),
+				       "+m" (v->arch.xen.evtchn_pending_sel)
+				     : "0" (evtchn_pending_sel32));
+		}
+		rc = 1;
+		__put_user(rc, (u8 __user *)ghc->hva + offset);
+	} else {
+		__get_user(rc, (u8 __user *)ghc->hva + offset);
+	}
 
 	return rc;
 }
@@ -772,3 +836,105 @@ int kvm_xen_hypercall(struct kvm_vcpu *vcpu)
 
 	return 0;
 }
+
+static inline int max_evtchn_port(struct kvm *kvm)
+{
+	if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode)
+		return 4096;
+	else
+		return 1024;
+}
+
+int kvm_xen_set_evtchn(struct kvm_kernel_irq_routing_entry *e,
+		       struct kvm *kvm, bool in_atomic)
+{
+	struct kvm_vcpu *vcpu;
+	struct kvm_host_map map;
+	unsigned long *pending_bits, *mask_bits;
+	int port_word_bit;
+	int rc;
+
+	vcpu = kvm_get_vcpu_by_id(kvm, e->xen_evtchn.vcpu);
+	if (!vcpu)
+		return -EINVAL;
+
+	if (!vcpu->arch.xen.vcpu_info_set)
+		return -EINVAL;
+
+	if (e->xen_evtchn.port >= max_evtchn_port(kvm))
+		return -EINVAL;
+
+	/* With no cache this is *always* going to fail in the atomic case for now */
+	rc = kvm_map_gfn(vcpu, kvm->arch.xen.shinfo_gfn, &map, NULL, in_atomic);
+	if (rc < 0)
+		return in_atomic ? -EWOULDBLOCK : rc;
+
+	if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) {
+		struct shared_info *shinfo = map.hva;
+
+		pending_bits = (unsigned long *)&shinfo->evtchn_pending;
+		mask_bits = (unsigned long *)&shinfo->evtchn_mask;
+		port_word_bit = e->xen_evtchn.port / 64;
+	} else {
+		struct compat_shared_info *shinfo = map.hva;
+
+		pending_bits = (unsigned long *)&shinfo->evtchn_pending;
+		mask_bits = (unsigned long *)&shinfo->evtchn_mask;
+		port_word_bit = e->xen_evtchn.port / 32;
+	}
+
+	/*
+	 * If this port wasn't already set, and if it isn't masked, then
+	 * we try to set the corresponding bit in the in-kernel shadow of
+	 * evtchn_pending_sel for the target vCPU. And if *that* wasn't
+	 * already set, then we kick the vCPU in question to write to the
+	 * *real* evtchn_pending_sel in its own guest vcpu_info struct.
+	 */
+	if (!test_and_set_bit(e->xen_evtchn.port, pending_bits) &&
+	    !test_bit(e->xen_evtchn.port, mask_bits) &&
+	    !test_and_set_bit(port_word_bit, &vcpu->arch.xen.evtchn_pending_sel)) {
+		kvm_make_request(KVM_REQ_EVENT, vcpu);
+		kvm_vcpu_kick(vcpu);
+	}
+
+	kvm_unmap_gfn(vcpu, &map, NULL, true, in_atomic);
+	return rc;
+}
+
+int kvm_xen_setup_evtchn(struct kvm *kvm,
+			 struct kvm_kernel_irq_routing_entry *e,
+			 const struct kvm_irq_routing_entry *ue)
+{
+	struct kvm_vcpu *vcpu;
+
+	if (kvm->arch.xen.shinfo_gfn == GPA_INVALID)
+		return -EINVAL;
+
+	if (ue->u.xen_evtchn.vcpu >= KVM_MAX_VCPUS)
+		return -EINVAL;
+
+	vcpu = kvm_get_vcpu_by_id(kvm, ue->u.xen_evtchn.vcpu);
+	if (!vcpu)
+		return -EINVAL;
+
+	if (!vcpu->arch.xen.vcpu_info_set)
+		return -EINVAL;
+
+	if (!kvm->arch.xen.upcall_vector)
+		return -EINVAL;
+
+	/* Once we support the per-vCPU LAPIC based vector we will permit
+	 * that here instead of the per-KVM upcall vector */
+
+	if (ue->u.xen_evtchn.port >= max_evtchn_port(kvm))
+		return -EINVAL;
+
+	/* We only support 2 level event channels for now */
+	if (ue->u.xen_evtchn.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL)
+		return -EINVAL;
+
+	e->xen_evtchn.port = ue->u.xen_evtchn.port;
+	e->xen_evtchn.vcpu = ue->u.xen_evtchn.vcpu;
+	e->xen_evtchn.priority = ue->u.xen_evtchn.priority;
+
+	return 0;
+}
diff --git a/arch/x86/kvm/xen.h b/arch/x86/kvm/xen.h
index cc0cf5f37450..3e717947b928 100644
--- a/arch/x86/kvm/xen.h
+++ b/arch/x86/kvm/xen.h
@@ -24,6 +24,12 @@ int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc);
 void kvm_xen_init_vm(struct kvm *kvm);
 void kvm_xen_destroy_vm(struct kvm *kvm);
 
+int kvm_xen_set_evtchn(struct kvm_kernel_irq_routing_entry *e,
+		       struct kvm *kvm, bool in_atomic);
+int kvm_xen_setup_evtchn(struct kvm *kvm,
+			 struct kvm_kernel_irq_routing_entry *e,
+			 const struct kvm_irq_routing_entry *ue);
+
 static inline bool kvm_xen_msr_enabled(struct kvm *kvm)
 {
 	return static_branch_unlikely(&kvm_xen_enabled.key) &&
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 0f18df7fe874..9003fae1af9d 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -470,6 +470,12 @@ struct kvm_hv_sint {
 	u32 sint;
 };
 
+struct kvm_xen_evtchn {
+	u32 port;
+	u32 vcpu;
+	u32 priority;
+};
+
 struct kvm_kernel_irq_routing_entry {
 	u32 gsi;
 	u32 type;
@@ -490,6 +496,7 @@ struct kvm_kernel_irq_routing_entry {
 		} msi;
 		struct kvm_s390_adapter_int adapter;
 		struct kvm_hv_sint hv_sint;
+		struct kvm_xen_evtchn xen_evtchn;
 	};
 	struct hlist_node link;
 };
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index a067410ebea5..05391c80bb6a 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1143,11 +1143,20 @@ struct kvm_irq_routing_hv_sint {
 	__u32 sint;
 };
 
+struct kvm_irq_routing_xen_evtchn {
+	__u32 port;
+	__u32 vcpu;
+	__u32 priority;
+};
+
+#define KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL ((__u32)(-1))
+
 /* gsi routing entry types */
 #define KVM_IRQ_ROUTING_IRQCHIP 1
 #define KVM_IRQ_ROUTING_MSI 2
 #define KVM_IRQ_ROUTING_S390_ADAPTER 3
 #define KVM_IRQ_ROUTING_HV_SINT 4
+#define KVM_IRQ_ROUTING_XEN_EVTCHN 5
 
 struct kvm_irq_routing_entry {
 	__u32 gsi;
@@ -1159,6 +1168,7 @@ struct kvm_irq_routing_entry {
 		struct kvm_irq_routing_msi msi;
 		struct kvm_irq_routing_s390_adapter adapter;
 		struct kvm_irq_routing_hv_sint hv_sint;
+		struct kvm_irq_routing_xen_evtchn xen_evtchn;
 		__u32 pad[8];
 	} u;
 };
-- 
2.31.1