On Tue, Apr 09, 2019 at 04:13:47PM +0200, Cédric Le Goater wrote: > When the VM boots, the CAS negotiation process determines which > interrupt mode to use and invokes a machine reset. At that time, any > links to the previous KVM interrupt device should be 'destroyed' > before the new chosen one is created. > > To perform the necessary cleanups in KVM, we extend the KVM device > interface with a new 'release' operation which is called when the file > descriptor of the device is closed. > > Such operations are defined for the XICS-on-XIVE and the XIVE native > KVM devices. They clear the vCPU interrupt presenters that could be > attached and then destroy the device. > > Signed-off-by: Cédric Le Goater > --- > include/linux/kvm_host.h | 1 + > arch/powerpc/kvm/book3s_xive.c | 50 +++++++++++++++++++++++++-- > arch/powerpc/kvm/book3s_xive_native.c | 23 ++++++++++++ > virt/kvm/kvm_main.c | 13 +++++++ > 4 files changed, 85 insertions(+), 2 deletions(-) > > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h > index 831d963451d8..3b444620d8fc 100644 > --- a/include/linux/kvm_host.h > +++ b/include/linux/kvm_host.h > @@ -1246,6 +1246,7 @@ struct kvm_device_ops { > long (*ioctl)(struct kvm_device *dev, unsigned int ioctl, > unsigned long arg); > int (*mmap)(struct kvm_device *dev, struct vm_area_struct *vma); > + void (*release)(struct kvm_device *dev); > }; > > void kvm_device_get(struct kvm_device *dev); > diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c > index 4d4e1730de84..ba777db849d7 100644 > --- a/arch/powerpc/kvm/book3s_xive.c > +++ b/arch/powerpc/kvm/book3s_xive.c > @@ -1100,11 +1100,19 @@ void kvmppc_xive_disable_vcpu_interrupts(struct kvm_vcpu *vcpu) > void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu) > { > struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; > - struct kvmppc_xive *xive = xc->xive; > + struct kvmppc_xive *xive; > int i; > > + if (!kvmppc_xics_enabled(vcpu)) > + return; > + > + if (!xc) > + return; > + > pr_devel("cleanup_vcpu(cpu=%d)\n", xc->server_num); > > + xive = xc->xive; > + > /* Ensure no interrupt is still routed to that VP */ > xc->valid = false; > kvmppc_xive_disable_vcpu_interrupts(vcpu); > @@ -1141,6 +1149,10 @@ void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu) > } > /* Free the VP */ > kfree(xc); > + > + /* Cleanup the vcpu */ > + vcpu->arch.irq_type = KVMPPC_IRQ_DEFAULT; > + vcpu->arch.xive_vcpu = NULL; > } > > int kvmppc_xive_connect_vcpu(struct kvm_device *dev, > @@ -1158,7 +1170,7 @@ int kvmppc_xive_connect_vcpu(struct kvm_device *dev, > } > if (xive->kvm != vcpu->kvm) > return -EPERM; > - if (vcpu->arch.irq_type) > + if (vcpu->arch.irq_type != KVMPPC_IRQ_DEFAULT) > return -EBUSY; > if (kvmppc_xive_find_server(vcpu->kvm, cpu)) { > pr_devel("Duplicate !\n"); > @@ -1855,6 +1867,39 @@ static void kvmppc_xive_free(struct kvm_device *dev) > kfree(dev); > } > > +static void kvmppc_xive_release(struct kvm_device *dev) > +{ > + struct kvmppc_xive *xive = dev->private; > + struct kvm *kvm = xive->kvm; > + struct kvm_vcpu *vcpu; > + int i; > + > + pr_devel("Releasing xive device\n"); > + > + /* > + * When releasing the KVM device fd, the vCPUs can still be > + * running and we should clean up the vCPU interrupt > + * presenters first. > + */ > + if (atomic_read(&kvm->online_vcpus) != 0) { What prevents online_vcpus from becoming non-zero after this test, but before the kvmppc_xive_free()? Is the test actually necessary? The operations below should be safe even if there are no online cpus, yes? > + /* > + * call kick_all_cpus_sync() to ensure that all CPUs > + * have executed any pending interrupts > + */ > + if (is_kvmppc_hv_enabled(kvm)) > + kick_all_cpus_sync(); > + > + /* > + * TODO: There is still a race window with the early > + * checks in kvmppc_native_connect_vcpu() > + */ That's... not reassuring. What are the consequences of that race, and what do you plan to do about it? > + kvm_for_each_vcpu(i, vcpu, kvm) > + kvmppc_xive_cleanup_vcpu(vcpu); > + } > + > + kvmppc_xive_free(dev); > +} > + > struct kvmppc_xive *kvmppc_xive_get_device(struct kvm *kvm, u32 type) > { > struct kvmppc_xive *xive; > @@ -2043,6 +2088,7 @@ struct kvm_device_ops kvm_xive_ops = { > .name = "kvm-xive", > .create = kvmppc_xive_create, > .init = kvmppc_xive_init, > + .release = kvmppc_xive_release, > .destroy = kvmppc_xive_free, > .set_attr = xive_set_attr, > .get_attr = xive_get_attr, > diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c > index 092db0efe628..629da7bf2a89 100644 > --- a/arch/powerpc/kvm/book3s_xive_native.c > +++ b/arch/powerpc/kvm/book3s_xive_native.c > @@ -996,6 +996,28 @@ static void kvmppc_xive_native_free(struct kvm_device *dev) > kfree(dev); > } > > +static void kvmppc_xive_native_release(struct kvm_device *dev) > +{ > + struct kvmppc_xive *xive = dev->private; > + struct kvm *kvm = xive->kvm; > + struct kvm_vcpu *vcpu; > + int i; > + > + pr_devel("Releasing xive native device\n"); > + > + /* > + * When releasing the KVM device fd, the vCPUs can still be > + * running and we should clean up the vCPU interrupt > + * presenters first. > + */ > + if (atomic_read(&kvm->online_vcpus) != 0) { Likewise here. > + kvm_for_each_vcpu(i, vcpu, kvm) > + kvmppc_xive_native_cleanup_vcpu(vcpu); > + } > + > + kvmppc_xive_native_free(dev); > +} > + > static int kvmppc_xive_native_create(struct kvm_device *dev, u32 type) > { > struct kvmppc_xive *xive; > @@ -1187,6 +1209,7 @@ struct kvm_device_ops kvm_xive_native_ops = { > .name = "kvm-xive-native", > .create = kvmppc_xive_native_create, > .init = kvmppc_xive_native_init, > + .release = kvmppc_xive_native_release, > .destroy = kvmppc_xive_native_free, > .set_attr = kvmppc_xive_native_set_attr, > .get_attr = kvmppc_xive_native_get_attr, > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c > index ea2018ae1cd7..ea2619d5ca98 100644 > --- a/virt/kvm/kvm_main.c > +++ b/virt/kvm/kvm_main.c > @@ -2938,6 +2938,19 @@ static int kvm_device_release(struct inode *inode, struct file *filp) > struct kvm_device *dev = filp->private_data; > struct kvm *kvm = dev->kvm; > > + if (!dev) > + return -ENODEV; > + > + if (dev->kvm != kvm) > + return -EPERM; > + > + if (dev->ops->release) { > + mutex_lock(&kvm->lock); > + list_del(&dev->vm_node); > + dev->ops->release(dev); > + mutex_unlock(&kvm->lock); > + } > + Wasn't there a big comment that explained that release replaced destroy somewhere? > kvm_put_kvm(kvm); > return 0; > } -- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibson