All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] KVM: PPC: Book3S HV: XICS: Replace the 'destroy' method by a 'release' method
@ 2020-08-10 10:08 ` Greg Kurz
  0 siblings, 0 replies; 10+ messages in thread
From: Greg Kurz @ 2020-08-10 10:08 UTC (permalink / raw)
  To: Paul Mackerras; +Cc: David Gibson, Cédric Le Goater, kvm-ppc, kvm

Similarly to what was done with XICS-on-XIVE and XIVE native KVM devices
with commit 5422e95103cf ("KVM: PPC: Book3S HV: XIVE: Replace the 'destroy'
method by a 'release' method"), convert the historical XICS KVM device to
implement the 'release' method. This is needed to run nested guests with
an in-kernel IRQ chip. A typical POWER9 guest can select XICS or XIVE
during boot, which requires to be able to destroy and to re-create the
KVM device. Only the historical XICS KVM device is available under pseries
at the current time and it still uses the legacy 'destroy' method.

Switching to 'release' means that vCPUs might still be running when the
device is destroyed. In order to avoid potential use-after-free, the
kvmppc_xics structure is allocated on first usage and kept around until
the VM exits. The same pointer is used each time a KVM XICS device is
being created, but this is okay since we only have one per VM.

Clear the ICP of each vCPU with vcpu->mutex held. This ensures that the
next time the vCPU resumes execution, it won't be going into the XICS
code anymore.

Signed-off-by: Greg Kurz <groug@kaod.org>
---
 arch/powerpc/include/asm/kvm_host.h |    1 
 arch/powerpc/kvm/book3s.c           |    4 +-
 arch/powerpc/kvm/book3s_xics.c      |   86 ++++++++++++++++++++++++++++-------
 3 files changed, 72 insertions(+), 19 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index e020d269416d..974adda2ec94 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -325,6 +325,7 @@ struct kvm_arch {
 #endif
 #ifdef CONFIG_KVM_XICS
 	struct kvmppc_xics *xics;
+	struct kvmppc_xics *xics_device;
 	struct kvmppc_xive *xive;    /* Current XIVE device in use */
 	struct {
 		struct kvmppc_xive *native;
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 41fedec69ac3..56618c2770e1 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -878,13 +878,15 @@ void kvmppc_core_destroy_vm(struct kvm *kvm)
 
 #ifdef CONFIG_KVM_XICS
 	/*
-	 * Free the XIVE devices which are not directly freed by the
+	 * Free the XIVE and XICS devices which are not directly freed by the
 	 * device 'release' method
 	 */
 	kfree(kvm->arch.xive_devices.native);
 	kvm->arch.xive_devices.native = NULL;
 	kfree(kvm->arch.xive_devices.xics_on_xive);
 	kvm->arch.xive_devices.xics_on_xive = NULL;
+	kfree(kvm->arch.xics_device);
+	kvm->arch.xics_device = NULL;
 #endif /* CONFIG_KVM_XICS */
 }
 
diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c
index 381bf8dea193..5fee5a11550d 100644
--- a/arch/powerpc/kvm/book3s_xics.c
+++ b/arch/powerpc/kvm/book3s_xics.c
@@ -1334,47 +1334,97 @@ static int xics_has_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
 	return -ENXIO;
 }
 
-static void kvmppc_xics_free(struct kvm_device *dev)
+/*
+ * Called when device fd is closed. kvm->lock is held.
+ */
+static void kvmppc_xics_release(struct kvm_device *dev)
 {
 	struct kvmppc_xics *xics = dev->private;
 	int i;
 	struct kvm *kvm = xics->kvm;
+	struct kvm_vcpu *vcpu;
+
+	pr_devel("Releasing xics device\n");
+
+	/*
+	 * Since this is the device release function, we know that
+	 * userspace does not have any open fd referring to the
+	 * device.  Therefore there can not be any of the device
+	 * attribute set/get functions being executed concurrently,
+	 * and similarly, the connect_vcpu and set/clr_mapped
+	 * functions also cannot be being executed.
+	 */
 
 	debugfs_remove(xics->dentry);
 
+	/*
+	 * We should clean up the vCPU interrupt presenters first.
+	 */
+	kvm_for_each_vcpu(i, vcpu, kvm) {
+		/*
+		 * Take vcpu->mutex to ensure that no one_reg get/set ioctl
+		 * (i.e. kvmppc_xics_[gs]et_icp) can be done concurrently.
+		 * Holding the vcpu->mutex also means that execution is
+		 * excluded for the vcpu until the ICP was freed. When the vcpu
+		 * can execute again, vcpu->arch.icp and vcpu->arch.irq_type
+		 * have been cleared and the vcpu will not be going into the
+		 * XICS code anymore.
+		 */
+		mutex_lock(&vcpu->mutex);
+		kvmppc_xics_free_icp(vcpu);
+		mutex_unlock(&vcpu->mutex);
+	}
+
 	if (kvm)
 		kvm->arch.xics = NULL;
 
-	for (i = 0; i <= xics->max_icsid; i++)
+	for (i = 0; i <= xics->max_icsid; i++) {
 		kfree(xics->ics[i]);
-	kfree(xics);
+		xics->ics[i] = NULL;
+	}
+	/*
+	 * A reference of the kvmppc_xics pointer is now kept under
+	 * the xics_device pointer of the machine for reuse. It is
+	 * freed when the VM is destroyed for now until we fix all the
+	 * execution paths.
+	 */
 	kfree(dev);
 }
 
+static struct kvmppc_xics *kvmppc_xics_get_device(struct kvm *kvm)
+{
+	struct kvmppc_xics **kvm_xics_device = &kvm->arch.xics_device;
+	struct kvmppc_xics *xics = *kvm_xics_device;
+
+	if (!xics) {
+		xics = kzalloc(sizeof(*xics), GFP_KERNEL);
+		*kvm_xics_device = xics;
+	} else {
+		memset(xics, 0, sizeof(*xics));
+	}
+
+	return xics;
+}
+
 static int kvmppc_xics_create(struct kvm_device *dev, u32 type)
 {
 	struct kvmppc_xics *xics;
 	struct kvm *kvm = dev->kvm;
-	int ret = 0;
 
-	xics = kzalloc(sizeof(*xics), GFP_KERNEL);
+	pr_devel("Creating xics for partition\n");
+
+	/* Already there ? */
+	if (kvm->arch.xics)
+		return -EEXIST;
+
+	xics = kvmppc_xics_get_device(kvm);
 	if (!xics)
 		return -ENOMEM;
 
 	dev->private = xics;
 	xics->dev = dev;
 	xics->kvm = kvm;
-
-	/* Already there ? */
-	if (kvm->arch.xics)
-		ret = -EEXIST;
-	else
-		kvm->arch.xics = xics;
-
-	if (ret) {
-		kfree(xics);
-		return ret;
-	}
+	kvm->arch.xics = xics;
 
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
 	if (cpu_has_feature(CPU_FTR_ARCH_206) &&
@@ -1399,7 +1449,7 @@ struct kvm_device_ops kvm_xics_ops = {
 	.name = "kvm-xics",
 	.create = kvmppc_xics_create,
 	.init = kvmppc_xics_init,
-	.destroy = kvmppc_xics_free,
+	.release = kvmppc_xics_release,
 	.set_attr = xics_set_attr,
 	.get_attr = xics_get_attr,
 	.has_attr = xics_has_attr,
@@ -1415,7 +1465,7 @@ int kvmppc_xics_connect_vcpu(struct kvm_device *dev, struct kvm_vcpu *vcpu,
 		return -EPERM;
 	if (xics->kvm != vcpu->kvm)
 		return -EPERM;
-	if (vcpu->arch.irq_type)
+	if (vcpu->arch.irq_type != KVMPPC_IRQ_DEFAULT)
 		return -EBUSY;
 
 	r = kvmppc_xics_create_icp(vcpu, xcpu);



^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [PATCH] KVM: PPC: Book3S HV: XICS: Replace the 'destroy' method by a 'release' method
@ 2020-08-10 10:08 ` Greg Kurz
  0 siblings, 0 replies; 10+ messages in thread
From: Greg Kurz @ 2020-08-10 10:08 UTC (permalink / raw)
  To: Paul Mackerras; +Cc: David Gibson, Cédric Le Goater, kvm-ppc, kvm

Similarly to what was done with XICS-on-XIVE and XIVE native KVM devices
with commit 5422e95103cf ("KVM: PPC: Book3S HV: XIVE: Replace the 'destroy'
method by a 'release' method"), convert the historical XICS KVM device to
implement the 'release' method. This is needed to run nested guests with
an in-kernel IRQ chip. A typical POWER9 guest can select XICS or XIVE
during boot, which requires to be able to destroy and to re-create the
KVM device. Only the historical XICS KVM device is available under pseries
at the current time and it still uses the legacy 'destroy' method.

Switching to 'release' means that vCPUs might still be running when the
device is destroyed. In order to avoid potential use-after-free, the
kvmppc_xics structure is allocated on first usage and kept around until
the VM exits. The same pointer is used each time a KVM XICS device is
being created, but this is okay since we only have one per VM.

Clear the ICP of each vCPU with vcpu->mutex held. This ensures that the
next time the vCPU resumes execution, it won't be going into the XICS
code anymore.

Signed-off-by: Greg Kurz <groug@kaod.org>
---
 arch/powerpc/include/asm/kvm_host.h |    1 
 arch/powerpc/kvm/book3s.c           |    4 +-
 arch/powerpc/kvm/book3s_xics.c      |   86 ++++++++++++++++++++++++++++-------
 3 files changed, 72 insertions(+), 19 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index e020d269416d..974adda2ec94 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -325,6 +325,7 @@ struct kvm_arch {
 #endif
 #ifdef CONFIG_KVM_XICS
 	struct kvmppc_xics *xics;
+	struct kvmppc_xics *xics_device;
 	struct kvmppc_xive *xive;    /* Current XIVE device in use */
 	struct {
 		struct kvmppc_xive *native;
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 41fedec69ac3..56618c2770e1 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -878,13 +878,15 @@ void kvmppc_core_destroy_vm(struct kvm *kvm)
 
 #ifdef CONFIG_KVM_XICS
 	/*
-	 * Free the XIVE devices which are not directly freed by the
+	 * Free the XIVE and XICS devices which are not directly freed by the
 	 * device 'release' method
 	 */
 	kfree(kvm->arch.xive_devices.native);
 	kvm->arch.xive_devices.native = NULL;
 	kfree(kvm->arch.xive_devices.xics_on_xive);
 	kvm->arch.xive_devices.xics_on_xive = NULL;
+	kfree(kvm->arch.xics_device);
+	kvm->arch.xics_device = NULL;
 #endif /* CONFIG_KVM_XICS */
 }
 
diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c
index 381bf8dea193..5fee5a11550d 100644
--- a/arch/powerpc/kvm/book3s_xics.c
+++ b/arch/powerpc/kvm/book3s_xics.c
@@ -1334,47 +1334,97 @@ static int xics_has_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
 	return -ENXIO;
 }
 
-static void kvmppc_xics_free(struct kvm_device *dev)
+/*
+ * Called when device fd is closed. kvm->lock is held.
+ */
+static void kvmppc_xics_release(struct kvm_device *dev)
 {
 	struct kvmppc_xics *xics = dev->private;
 	int i;
 	struct kvm *kvm = xics->kvm;
+	struct kvm_vcpu *vcpu;
+
+	pr_devel("Releasing xics device\n");
+
+	/*
+	 * Since this is the device release function, we know that
+	 * userspace does not have any open fd referring to the
+	 * device.  Therefore there can not be any of the device
+	 * attribute set/get functions being executed concurrently,
+	 * and similarly, the connect_vcpu and set/clr_mapped
+	 * functions also cannot be being executed.
+	 */
 
 	debugfs_remove(xics->dentry);
 
+	/*
+	 * We should clean up the vCPU interrupt presenters first.
+	 */
+	kvm_for_each_vcpu(i, vcpu, kvm) {
+		/*
+		 * Take vcpu->mutex to ensure that no one_reg get/set ioctl
+		 * (i.e. kvmppc_xics_[gs]et_icp) can be done concurrently.
+		 * Holding the vcpu->mutex also means that execution is
+		 * excluded for the vcpu until the ICP was freed. When the vcpu
+		 * can execute again, vcpu->arch.icp and vcpu->arch.irq_type
+		 * have been cleared and the vcpu will not be going into the
+		 * XICS code anymore.
+		 */
+		mutex_lock(&vcpu->mutex);
+		kvmppc_xics_free_icp(vcpu);
+		mutex_unlock(&vcpu->mutex);
+	}
+
 	if (kvm)
 		kvm->arch.xics = NULL;
 
-	for (i = 0; i <= xics->max_icsid; i++)
+	for (i = 0; i <= xics->max_icsid; i++) {
 		kfree(xics->ics[i]);
-	kfree(xics);
+		xics->ics[i] = NULL;
+	}
+	/*
+	 * A reference of the kvmppc_xics pointer is now kept under
+	 * the xics_device pointer of the machine for reuse. It is
+	 * freed when the VM is destroyed for now until we fix all the
+	 * execution paths.
+	 */
 	kfree(dev);
 }
 
+static struct kvmppc_xics *kvmppc_xics_get_device(struct kvm *kvm)
+{
+	struct kvmppc_xics **kvm_xics_device = &kvm->arch.xics_device;
+	struct kvmppc_xics *xics = *kvm_xics_device;
+
+	if (!xics) {
+		xics = kzalloc(sizeof(*xics), GFP_KERNEL);
+		*kvm_xics_device = xics;
+	} else {
+		memset(xics, 0, sizeof(*xics));
+	}
+
+	return xics;
+}
+
 static int kvmppc_xics_create(struct kvm_device *dev, u32 type)
 {
 	struct kvmppc_xics *xics;
 	struct kvm *kvm = dev->kvm;
-	int ret = 0;
 
-	xics = kzalloc(sizeof(*xics), GFP_KERNEL);
+	pr_devel("Creating xics for partition\n");
+
+	/* Already there ? */
+	if (kvm->arch.xics)
+		return -EEXIST;
+
+	xics = kvmppc_xics_get_device(kvm);
 	if (!xics)
 		return -ENOMEM;
 
 	dev->private = xics;
 	xics->dev = dev;
 	xics->kvm = kvm;
-
-	/* Already there ? */
-	if (kvm->arch.xics)
-		ret = -EEXIST;
-	else
-		kvm->arch.xics = xics;
-
-	if (ret) {
-		kfree(xics);
-		return ret;
-	}
+	kvm->arch.xics = xics;
 
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
 	if (cpu_has_feature(CPU_FTR_ARCH_206) &&
@@ -1399,7 +1449,7 @@ struct kvm_device_ops kvm_xics_ops = {
 	.name = "kvm-xics",
 	.create = kvmppc_xics_create,
 	.init = kvmppc_xics_init,
-	.destroy = kvmppc_xics_free,
+	.release = kvmppc_xics_release,
 	.set_attr = xics_set_attr,
 	.get_attr = xics_get_attr,
 	.has_attr = xics_has_attr,
@@ -1415,7 +1465,7 @@ int kvmppc_xics_connect_vcpu(struct kvm_device *dev, struct kvm_vcpu *vcpu,
 		return -EPERM;
 	if (xics->kvm != vcpu->kvm)
 		return -EPERM;
-	if (vcpu->arch.irq_type)
+	if (vcpu->arch.irq_type != KVMPPC_IRQ_DEFAULT)
 		return -EBUSY;
 
 	r = kvmppc_xics_create_icp(vcpu, xcpu);


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH] KVM: PPC: Book3S HV: XICS: Replace the 'destroy' method by a 'release' method
  2020-08-10 10:08 ` Greg Kurz
@ 2020-08-31  8:03   ` Greg Kurz
  -1 siblings, 0 replies; 10+ messages in thread
From: Greg Kurz @ 2020-08-31  8:03 UTC (permalink / raw)
  To: Paul Mackerras; +Cc: David Gibson, Cédric Le Goater, kvm-ppc, kvm

On Mon, 10 Aug 2020 12:08:05 +0200
Greg Kurz <groug@kaod.org> wrote:

> Similarly to what was done with XICS-on-XIVE and XIVE native KVM devices
> with commit 5422e95103cf ("KVM: PPC: Book3S HV: XIVE: Replace the 'destroy'
> method by a 'release' method"), convert the historical XICS KVM device to
> implement the 'release' method. This is needed to run nested guests with
> an in-kernel IRQ chip. A typical POWER9 guest can select XICS or XIVE
> during boot, which requires to be able to destroy and to re-create the
> KVM device. Only the historical XICS KVM device is available under pseries
> at the current time and it still uses the legacy 'destroy' method.
> 
> Switching to 'release' means that vCPUs might still be running when the
> device is destroyed. In order to avoid potential use-after-free, the
> kvmppc_xics structure is allocated on first usage and kept around until
> the VM exits. The same pointer is used each time a KVM XICS device is
> being created, but this is okay since we only have one per VM.
> 
> Clear the ICP of each vCPU with vcpu->mutex held. This ensures that the
> next time the vCPU resumes execution, it won't be going into the XICS
> code anymore.
> 
> Signed-off-by: Greg Kurz <groug@kaod.org>
> ---

Any chance someone can review this ?

Cheers,

--
Greg

>  arch/powerpc/include/asm/kvm_host.h |    1 
>  arch/powerpc/kvm/book3s.c           |    4 +-
>  arch/powerpc/kvm/book3s_xics.c      |   86 ++++++++++++++++++++++++++++-------
>  3 files changed, 72 insertions(+), 19 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
> index e020d269416d..974adda2ec94 100644
> --- a/arch/powerpc/include/asm/kvm_host.h
> +++ b/arch/powerpc/include/asm/kvm_host.h
> @@ -325,6 +325,7 @@ struct kvm_arch {
>  #endif
>  #ifdef CONFIG_KVM_XICS
>  	struct kvmppc_xics *xics;
> +	struct kvmppc_xics *xics_device;
>  	struct kvmppc_xive *xive;    /* Current XIVE device in use */
>  	struct {
>  		struct kvmppc_xive *native;
> diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
> index 41fedec69ac3..56618c2770e1 100644
> --- a/arch/powerpc/kvm/book3s.c
> +++ b/arch/powerpc/kvm/book3s.c
> @@ -878,13 +878,15 @@ void kvmppc_core_destroy_vm(struct kvm *kvm)
>  
>  #ifdef CONFIG_KVM_XICS
>  	/*
> -	 * Free the XIVE devices which are not directly freed by the
> +	 * Free the XIVE and XICS devices which are not directly freed by the
>  	 * device 'release' method
>  	 */
>  	kfree(kvm->arch.xive_devices.native);
>  	kvm->arch.xive_devices.native = NULL;
>  	kfree(kvm->arch.xive_devices.xics_on_xive);
>  	kvm->arch.xive_devices.xics_on_xive = NULL;
> +	kfree(kvm->arch.xics_device);
> +	kvm->arch.xics_device = NULL;
>  #endif /* CONFIG_KVM_XICS */
>  }
>  
> diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c
> index 381bf8dea193..5fee5a11550d 100644
> --- a/arch/powerpc/kvm/book3s_xics.c
> +++ b/arch/powerpc/kvm/book3s_xics.c
> @@ -1334,47 +1334,97 @@ static int xics_has_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
>  	return -ENXIO;
>  }
>  
> -static void kvmppc_xics_free(struct kvm_device *dev)
> +/*
> + * Called when device fd is closed. kvm->lock is held.
> + */
> +static void kvmppc_xics_release(struct kvm_device *dev)
>  {
>  	struct kvmppc_xics *xics = dev->private;
>  	int i;
>  	struct kvm *kvm = xics->kvm;
> +	struct kvm_vcpu *vcpu;
> +
> +	pr_devel("Releasing xics device\n");
> +
> +	/*
> +	 * Since this is the device release function, we know that
> +	 * userspace does not have any open fd referring to the
> +	 * device.  Therefore there can not be any of the device
> +	 * attribute set/get functions being executed concurrently,
> +	 * and similarly, the connect_vcpu and set/clr_mapped
> +	 * functions also cannot be being executed.
> +	 */
>  
>  	debugfs_remove(xics->dentry);
>  
> +	/*
> +	 * We should clean up the vCPU interrupt presenters first.
> +	 */
> +	kvm_for_each_vcpu(i, vcpu, kvm) {
> +		/*
> +		 * Take vcpu->mutex to ensure that no one_reg get/set ioctl
> +		 * (i.e. kvmppc_xics_[gs]et_icp) can be done concurrently.
> +		 * Holding the vcpu->mutex also means that execution is
> +		 * excluded for the vcpu until the ICP was freed. When the vcpu
> +		 * can execute again, vcpu->arch.icp and vcpu->arch.irq_type
> +		 * have been cleared and the vcpu will not be going into the
> +		 * XICS code anymore.
> +		 */
> +		mutex_lock(&vcpu->mutex);
> +		kvmppc_xics_free_icp(vcpu);
> +		mutex_unlock(&vcpu->mutex);
> +	}
> +
>  	if (kvm)
>  		kvm->arch.xics = NULL;
>  
> -	for (i = 0; i <= xics->max_icsid; i++)
> +	for (i = 0; i <= xics->max_icsid; i++) {
>  		kfree(xics->ics[i]);
> -	kfree(xics);
> +		xics->ics[i] = NULL;
> +	}
> +	/*
> +	 * A reference of the kvmppc_xics pointer is now kept under
> +	 * the xics_device pointer of the machine for reuse. It is
> +	 * freed when the VM is destroyed for now until we fix all the
> +	 * execution paths.
> +	 */
>  	kfree(dev);
>  }
>  
> +static struct kvmppc_xics *kvmppc_xics_get_device(struct kvm *kvm)
> +{
> +	struct kvmppc_xics **kvm_xics_device = &kvm->arch.xics_device;
> +	struct kvmppc_xics *xics = *kvm_xics_device;
> +
> +	if (!xics) {
> +		xics = kzalloc(sizeof(*xics), GFP_KERNEL);
> +		*kvm_xics_device = xics;
> +	} else {
> +		memset(xics, 0, sizeof(*xics));
> +	}
> +
> +	return xics;
> +}
> +
>  static int kvmppc_xics_create(struct kvm_device *dev, u32 type)
>  {
>  	struct kvmppc_xics *xics;
>  	struct kvm *kvm = dev->kvm;
> -	int ret = 0;
>  
> -	xics = kzalloc(sizeof(*xics), GFP_KERNEL);
> +	pr_devel("Creating xics for partition\n");
> +
> +	/* Already there ? */
> +	if (kvm->arch.xics)
> +		return -EEXIST;
> +
> +	xics = kvmppc_xics_get_device(kvm);
>  	if (!xics)
>  		return -ENOMEM;
>  
>  	dev->private = xics;
>  	xics->dev = dev;
>  	xics->kvm = kvm;
> -
> -	/* Already there ? */
> -	if (kvm->arch.xics)
> -		ret = -EEXIST;
> -	else
> -		kvm->arch.xics = xics;
> -
> -	if (ret) {
> -		kfree(xics);
> -		return ret;
> -	}
> +	kvm->arch.xics = xics;
>  
>  #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
>  	if (cpu_has_feature(CPU_FTR_ARCH_206) &&
> @@ -1399,7 +1449,7 @@ struct kvm_device_ops kvm_xics_ops = {
>  	.name = "kvm-xics",
>  	.create = kvmppc_xics_create,
>  	.init = kvmppc_xics_init,
> -	.destroy = kvmppc_xics_free,
> +	.release = kvmppc_xics_release,
>  	.set_attr = xics_set_attr,
>  	.get_attr = xics_get_attr,
>  	.has_attr = xics_has_attr,
> @@ -1415,7 +1465,7 @@ int kvmppc_xics_connect_vcpu(struct kvm_device *dev, struct kvm_vcpu *vcpu,
>  		return -EPERM;
>  	if (xics->kvm != vcpu->kvm)
>  		return -EPERM;
> -	if (vcpu->arch.irq_type)
> +	if (vcpu->arch.irq_type != KVMPPC_IRQ_DEFAULT)
>  		return -EBUSY;
>  
>  	r = kvmppc_xics_create_icp(vcpu, xcpu);
> 
> 


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] KVM: PPC: Book3S HV: XICS: Replace the 'destroy' method by a 'release' method
@ 2020-08-31  8:03   ` Greg Kurz
  0 siblings, 0 replies; 10+ messages in thread
From: Greg Kurz @ 2020-08-31  8:03 UTC (permalink / raw)
  To: Paul Mackerras; +Cc: David Gibson, Cédric Le Goater, kvm-ppc, kvm

On Mon, 10 Aug 2020 12:08:05 +0200
Greg Kurz <groug@kaod.org> wrote:

> Similarly to what was done with XICS-on-XIVE and XIVE native KVM devices
> with commit 5422e95103cf ("KVM: PPC: Book3S HV: XIVE: Replace the 'destroy'
> method by a 'release' method"), convert the historical XICS KVM device to
> implement the 'release' method. This is needed to run nested guests with
> an in-kernel IRQ chip. A typical POWER9 guest can select XICS or XIVE
> during boot, which requires to be able to destroy and to re-create the
> KVM device. Only the historical XICS KVM device is available under pseries
> at the current time and it still uses the legacy 'destroy' method.
> 
> Switching to 'release' means that vCPUs might still be running when the
> device is destroyed. In order to avoid potential use-after-free, the
> kvmppc_xics structure is allocated on first usage and kept around until
> the VM exits. The same pointer is used each time a KVM XICS device is
> being created, but this is okay since we only have one per VM.
> 
> Clear the ICP of each vCPU with vcpu->mutex held. This ensures that the
> next time the vCPU resumes execution, it won't be going into the XICS
> code anymore.
> 
> Signed-off-by: Greg Kurz <groug@kaod.org>
> ---

Any chance someone can review this ?

Cheers,

--
Greg

>  arch/powerpc/include/asm/kvm_host.h |    1 
>  arch/powerpc/kvm/book3s.c           |    4 +-
>  arch/powerpc/kvm/book3s_xics.c      |   86 ++++++++++++++++++++++++++++-------
>  3 files changed, 72 insertions(+), 19 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
> index e020d269416d..974adda2ec94 100644
> --- a/arch/powerpc/include/asm/kvm_host.h
> +++ b/arch/powerpc/include/asm/kvm_host.h
> @@ -325,6 +325,7 @@ struct kvm_arch {
>  #endif
>  #ifdef CONFIG_KVM_XICS
>  	struct kvmppc_xics *xics;
> +	struct kvmppc_xics *xics_device;
>  	struct kvmppc_xive *xive;    /* Current XIVE device in use */
>  	struct {
>  		struct kvmppc_xive *native;
> diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
> index 41fedec69ac3..56618c2770e1 100644
> --- a/arch/powerpc/kvm/book3s.c
> +++ b/arch/powerpc/kvm/book3s.c
> @@ -878,13 +878,15 @@ void kvmppc_core_destroy_vm(struct kvm *kvm)
>  
>  #ifdef CONFIG_KVM_XICS
>  	/*
> -	 * Free the XIVE devices which are not directly freed by the
> +	 * Free the XIVE and XICS devices which are not directly freed by the
>  	 * device 'release' method
>  	 */
>  	kfree(kvm->arch.xive_devices.native);
>  	kvm->arch.xive_devices.native = NULL;
>  	kfree(kvm->arch.xive_devices.xics_on_xive);
>  	kvm->arch.xive_devices.xics_on_xive = NULL;
> +	kfree(kvm->arch.xics_device);
> +	kvm->arch.xics_device = NULL;
>  #endif /* CONFIG_KVM_XICS */
>  }
>  
> diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c
> index 381bf8dea193..5fee5a11550d 100644
> --- a/arch/powerpc/kvm/book3s_xics.c
> +++ b/arch/powerpc/kvm/book3s_xics.c
> @@ -1334,47 +1334,97 @@ static int xics_has_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
>  	return -ENXIO;
>  }
>  
> -static void kvmppc_xics_free(struct kvm_device *dev)
> +/*
> + * Called when device fd is closed. kvm->lock is held.
> + */
> +static void kvmppc_xics_release(struct kvm_device *dev)
>  {
>  	struct kvmppc_xics *xics = dev->private;
>  	int i;
>  	struct kvm *kvm = xics->kvm;
> +	struct kvm_vcpu *vcpu;
> +
> +	pr_devel("Releasing xics device\n");
> +
> +	/*
> +	 * Since this is the device release function, we know that
> +	 * userspace does not have any open fd referring to the
> +	 * device.  Therefore there can not be any of the device
> +	 * attribute set/get functions being executed concurrently,
> +	 * and similarly, the connect_vcpu and set/clr_mapped
> +	 * functions also cannot be being executed.
> +	 */
>  
>  	debugfs_remove(xics->dentry);
>  
> +	/*
> +	 * We should clean up the vCPU interrupt presenters first.
> +	 */
> +	kvm_for_each_vcpu(i, vcpu, kvm) {
> +		/*
> +		 * Take vcpu->mutex to ensure that no one_reg get/set ioctl
> +		 * (i.e. kvmppc_xics_[gs]et_icp) can be done concurrently.
> +		 * Holding the vcpu->mutex also means that execution is
> +		 * excluded for the vcpu until the ICP was freed. When the vcpu
> +		 * can execute again, vcpu->arch.icp and vcpu->arch.irq_type
> +		 * have been cleared and the vcpu will not be going into the
> +		 * XICS code anymore.
> +		 */
> +		mutex_lock(&vcpu->mutex);
> +		kvmppc_xics_free_icp(vcpu);
> +		mutex_unlock(&vcpu->mutex);
> +	}
> +
>  	if (kvm)
>  		kvm->arch.xics = NULL;
>  
> -	for (i = 0; i <= xics->max_icsid; i++)
> +	for (i = 0; i <= xics->max_icsid; i++) {
>  		kfree(xics->ics[i]);
> -	kfree(xics);
> +		xics->ics[i] = NULL;
> +	}
> +	/*
> +	 * A reference of the kvmppc_xics pointer is now kept under
> +	 * the xics_device pointer of the machine for reuse. It is
> +	 * freed when the VM is destroyed for now until we fix all the
> +	 * execution paths.
> +	 */
>  	kfree(dev);
>  }
>  
> +static struct kvmppc_xics *kvmppc_xics_get_device(struct kvm *kvm)
> +{
> +	struct kvmppc_xics **kvm_xics_device = &kvm->arch.xics_device;
> +	struct kvmppc_xics *xics = *kvm_xics_device;
> +
> +	if (!xics) {
> +		xics = kzalloc(sizeof(*xics), GFP_KERNEL);
> +		*kvm_xics_device = xics;
> +	} else {
> +		memset(xics, 0, sizeof(*xics));
> +	}
> +
> +	return xics;
> +}
> +
>  static int kvmppc_xics_create(struct kvm_device *dev, u32 type)
>  {
>  	struct kvmppc_xics *xics;
>  	struct kvm *kvm = dev->kvm;
> -	int ret = 0;
>  
> -	xics = kzalloc(sizeof(*xics), GFP_KERNEL);
> +	pr_devel("Creating xics for partition\n");
> +
> +	/* Already there ? */
> +	if (kvm->arch.xics)
> +		return -EEXIST;
> +
> +	xics = kvmppc_xics_get_device(kvm);
>  	if (!xics)
>  		return -ENOMEM;
>  
>  	dev->private = xics;
>  	xics->dev = dev;
>  	xics->kvm = kvm;
> -
> -	/* Already there ? */
> -	if (kvm->arch.xics)
> -		ret = -EEXIST;
> -	else
> -		kvm->arch.xics = xics;
> -
> -	if (ret) {
> -		kfree(xics);
> -		return ret;
> -	}
> +	kvm->arch.xics = xics;
>  
>  #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
>  	if (cpu_has_feature(CPU_FTR_ARCH_206) &&
> @@ -1399,7 +1449,7 @@ struct kvm_device_ops kvm_xics_ops = {
>  	.name = "kvm-xics",
>  	.create = kvmppc_xics_create,
>  	.init = kvmppc_xics_init,
> -	.destroy = kvmppc_xics_free,
> +	.release = kvmppc_xics_release,
>  	.set_attr = xics_set_attr,
>  	.get_attr = xics_get_attr,
>  	.has_attr = xics_has_attr,
> @@ -1415,7 +1465,7 @@ int kvmppc_xics_connect_vcpu(struct kvm_device *dev, struct kvm_vcpu *vcpu,
>  		return -EPERM;
>  	if (xics->kvm != vcpu->kvm)
>  		return -EPERM;
> -	if (vcpu->arch.irq_type)
> +	if (vcpu->arch.irq_type != KVMPPC_IRQ_DEFAULT)
>  		return -EBUSY;
>  
>  	r = kvmppc_xics_create_icp(vcpu, xcpu);
> 
> 

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] KVM: PPC: Book3S HV: XICS: Replace the 'destroy' method by a 'release' method
  2020-08-10 10:08 ` Greg Kurz
@ 2020-09-02  7:26   ` Cédric Le Goater
  -1 siblings, 0 replies; 10+ messages in thread
From: Cédric Le Goater @ 2020-09-02  7:26 UTC (permalink / raw)
  To: Greg Kurz, Paul Mackerras; +Cc: David Gibson, kvm-ppc, kvm

On 8/10/20 12:08 PM, Greg Kurz wrote:
> Similarly to what was done with XICS-on-XIVE and XIVE native KVM devices
> with commit 5422e95103cf ("KVM: PPC: Book3S HV: XIVE: Replace the 'destroy'
> method by a 'release' method"), convert the historical XICS KVM device to
> implement the 'release' method. This is needed to run nested guests with
> an in-kernel IRQ chip. A typical POWER9 guest can select XICS or XIVE
> during boot, which requires to be able to destroy and to re-create the
> KVM device. Only the historical XICS KVM device is available under pseries
> at the current time and it still uses the legacy 'destroy' method.
> 
> Switching to 'release' means that vCPUs might still be running when the
> device is destroyed. In order to avoid potential use-after-free, the
> kvmppc_xics structure is allocated on first usage and kept around until
> the VM exits. The same pointer is used each time a KVM XICS device is
> being created, but this is okay since we only have one per VM.
> 
> Clear the ICP of each vCPU with vcpu->mutex held. This ensures that the
> next time the vCPU resumes execution, it won't be going into the XICS
> code anymore.
> 
> Signed-off-by: Greg Kurz <groug@kaod.org>

Reviewed-by: Cédric Le Goater <clg@kaod.org>

and on a P8 host, 

Tested-by: Cédric Le Goater <clg@kaod.org>

Thanks,

C. 

> ---
>  arch/powerpc/include/asm/kvm_host.h |    1 
>  arch/powerpc/kvm/book3s.c           |    4 +-
>  arch/powerpc/kvm/book3s_xics.c      |   86 ++++++++++++++++++++++++++++-------
>  3 files changed, 72 insertions(+), 19 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
> index e020d269416d..974adda2ec94 100644
> --- a/arch/powerpc/include/asm/kvm_host.h
> +++ b/arch/powerpc/include/asm/kvm_host.h
> @@ -325,6 +325,7 @@ struct kvm_arch {
>  #endif
>  #ifdef CONFIG_KVM_XICS
>  	struct kvmppc_xics *xics;
> +	struct kvmppc_xics *xics_device;
>  	struct kvmppc_xive *xive;    /* Current XIVE device in use */
>  	struct {
>  		struct kvmppc_xive *native;
> diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
> index 41fedec69ac3..56618c2770e1 100644
> --- a/arch/powerpc/kvm/book3s.c
> +++ b/arch/powerpc/kvm/book3s.c
> @@ -878,13 +878,15 @@ void kvmppc_core_destroy_vm(struct kvm *kvm)
>  
>  #ifdef CONFIG_KVM_XICS
>  	/*
> -	 * Free the XIVE devices which are not directly freed by the
> +	 * Free the XIVE and XICS devices which are not directly freed by the
>  	 * device 'release' method
>  	 */
>  	kfree(kvm->arch.xive_devices.native);
>  	kvm->arch.xive_devices.native = NULL;
>  	kfree(kvm->arch.xive_devices.xics_on_xive);
>  	kvm->arch.xive_devices.xics_on_xive = NULL;
> +	kfree(kvm->arch.xics_device);
> +	kvm->arch.xics_device = NULL;
>  #endif /* CONFIG_KVM_XICS */
>  }
>  
> diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c
> index 381bf8dea193..5fee5a11550d 100644
> --- a/arch/powerpc/kvm/book3s_xics.c
> +++ b/arch/powerpc/kvm/book3s_xics.c
> @@ -1334,47 +1334,97 @@ static int xics_has_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
>  	return -ENXIO;
>  }
>  
> -static void kvmppc_xics_free(struct kvm_device *dev)
> +/*
> + * Called when device fd is closed. kvm->lock is held.
> + */
> +static void kvmppc_xics_release(struct kvm_device *dev)
>  {
>  	struct kvmppc_xics *xics = dev->private;
>  	int i;
>  	struct kvm *kvm = xics->kvm;
> +	struct kvm_vcpu *vcpu;
> +
> +	pr_devel("Releasing xics device\n");
> +
> +	/*
> +	 * Since this is the device release function, we know that
> +	 * userspace does not have any open fd referring to the
> +	 * device.  Therefore there can not be any of the device
> +	 * attribute set/get functions being executed concurrently,
> +	 * and similarly, the connect_vcpu and set/clr_mapped
> +	 * functions also cannot be being executed.
> +	 */
>  
>  	debugfs_remove(xics->dentry);
>  
> +	/*
> +	 * We should clean up the vCPU interrupt presenters first.
> +	 */
> +	kvm_for_each_vcpu(i, vcpu, kvm) {
> +		/*
> +		 * Take vcpu->mutex to ensure that no one_reg get/set ioctl
> +		 * (i.e. kvmppc_xics_[gs]et_icp) can be done concurrently.
> +		 * Holding the vcpu->mutex also means that execution is
> +		 * excluded for the vcpu until the ICP was freed. When the vcpu
> +		 * can execute again, vcpu->arch.icp and vcpu->arch.irq_type
> +		 * have been cleared and the vcpu will not be going into the
> +		 * XICS code anymore.
> +		 */
> +		mutex_lock(&vcpu->mutex);
> +		kvmppc_xics_free_icp(vcpu);
> +		mutex_unlock(&vcpu->mutex);
> +	}
> +
>  	if (kvm)
>  		kvm->arch.xics = NULL;
>  
> -	for (i = 0; i <= xics->max_icsid; i++)
> +	for (i = 0; i <= xics->max_icsid; i++) {
>  		kfree(xics->ics[i]);
> -	kfree(xics);
> +		xics->ics[i] = NULL;
> +	}
> +	/*
> +	 * A reference of the kvmppc_xics pointer is now kept under
> +	 * the xics_device pointer of the machine for reuse. It is
> +	 * freed when the VM is destroyed for now until we fix all the
> +	 * execution paths.
> +	 */
>  	kfree(dev);
>  }
>  
> +static struct kvmppc_xics *kvmppc_xics_get_device(struct kvm *kvm)
> +{
> +	struct kvmppc_xics **kvm_xics_device = &kvm->arch.xics_device;
> +	struct kvmppc_xics *xics = *kvm_xics_device;
> +
> +	if (!xics) {
> +		xics = kzalloc(sizeof(*xics), GFP_KERNEL);
> +		*kvm_xics_device = xics;
> +	} else {
> +		memset(xics, 0, sizeof(*xics));
> +	}
> +
> +	return xics;
> +}
> +
>  static int kvmppc_xics_create(struct kvm_device *dev, u32 type)
>  {
>  	struct kvmppc_xics *xics;
>  	struct kvm *kvm = dev->kvm;
> -	int ret = 0;
>  
> -	xics = kzalloc(sizeof(*xics), GFP_KERNEL);
> +	pr_devel("Creating xics for partition\n");
> +
> +	/* Already there ? */
> +	if (kvm->arch.xics)
> +		return -EEXIST;
> +
> +	xics = kvmppc_xics_get_device(kvm);
>  	if (!xics)
>  		return -ENOMEM;
>  
>  	dev->private = xics;
>  	xics->dev = dev;
>  	xics->kvm = kvm;
> -
> -	/* Already there ? */
> -	if (kvm->arch.xics)
> -		ret = -EEXIST;
> -	else
> -		kvm->arch.xics = xics;
> -
> -	if (ret) {
> -		kfree(xics);
> -		return ret;
> -	}
> +	kvm->arch.xics = xics;
>  
>  #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
>  	if (cpu_has_feature(CPU_FTR_ARCH_206) &&
> @@ -1399,7 +1449,7 @@ struct kvm_device_ops kvm_xics_ops = {
>  	.name = "kvm-xics",
>  	.create = kvmppc_xics_create,
>  	.init = kvmppc_xics_init,
> -	.destroy = kvmppc_xics_free,
> +	.release = kvmppc_xics_release,
>  	.set_attr = xics_set_attr,
>  	.get_attr = xics_get_attr,
>  	.has_attr = xics_has_attr,
> @@ -1415,7 +1465,7 @@ int kvmppc_xics_connect_vcpu(struct kvm_device *dev, struct kvm_vcpu *vcpu,
>  		return -EPERM;
>  	if (xics->kvm != vcpu->kvm)
>  		return -EPERM;
> -	if (vcpu->arch.irq_type)
> +	if (vcpu->arch.irq_type != KVMPPC_IRQ_DEFAULT)
>  		return -EBUSY;
>  
>  	r = kvmppc_xics_create_icp(vcpu, xcpu);
> 
> 


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] KVM: PPC: Book3S HV: XICS: Replace the 'destroy' method by a 'release' method
@ 2020-09-02  7:26   ` Cédric Le Goater
  0 siblings, 0 replies; 10+ messages in thread
From: Cédric Le Goater @ 2020-09-02  7:26 UTC (permalink / raw)
  To: Greg Kurz, Paul Mackerras; +Cc: David Gibson, kvm-ppc, kvm

On 8/10/20 12:08 PM, Greg Kurz wrote:
> Similarly to what was done with XICS-on-XIVE and XIVE native KVM devices
> with commit 5422e95103cf ("KVM: PPC: Book3S HV: XIVE: Replace the 'destroy'
> method by a 'release' method"), convert the historical XICS KVM device to
> implement the 'release' method. This is needed to run nested guests with
> an in-kernel IRQ chip. A typical POWER9 guest can select XICS or XIVE
> during boot, which requires to be able to destroy and to re-create the
> KVM device. Only the historical XICS KVM device is available under pseries
> at the current time and it still uses the legacy 'destroy' method.
> 
> Switching to 'release' means that vCPUs might still be running when the
> device is destroyed. In order to avoid potential use-after-free, the
> kvmppc_xics structure is allocated on first usage and kept around until
> the VM exits. The same pointer is used each time a KVM XICS device is
> being created, but this is okay since we only have one per VM.
> 
> Clear the ICP of each vCPU with vcpu->mutex held. This ensures that the
> next time the vCPU resumes execution, it won't be going into the XICS
> code anymore.
> 
> Signed-off-by: Greg Kurz <groug@kaod.org>

Reviewed-by: Cédric Le Goater <clg@kaod.org>

and on a P8 host, 

Tested-by: Cédric Le Goater <clg@kaod.org>

Thanks,

C. 

> ---
>  arch/powerpc/include/asm/kvm_host.h |    1 
>  arch/powerpc/kvm/book3s.c           |    4 +-
>  arch/powerpc/kvm/book3s_xics.c      |   86 ++++++++++++++++++++++++++++-------
>  3 files changed, 72 insertions(+), 19 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
> index e020d269416d..974adda2ec94 100644
> --- a/arch/powerpc/include/asm/kvm_host.h
> +++ b/arch/powerpc/include/asm/kvm_host.h
> @@ -325,6 +325,7 @@ struct kvm_arch {
>  #endif
>  #ifdef CONFIG_KVM_XICS
>  	struct kvmppc_xics *xics;
> +	struct kvmppc_xics *xics_device;
>  	struct kvmppc_xive *xive;    /* Current XIVE device in use */
>  	struct {
>  		struct kvmppc_xive *native;
> diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
> index 41fedec69ac3..56618c2770e1 100644
> --- a/arch/powerpc/kvm/book3s.c
> +++ b/arch/powerpc/kvm/book3s.c
> @@ -878,13 +878,15 @@ void kvmppc_core_destroy_vm(struct kvm *kvm)
>  
>  #ifdef CONFIG_KVM_XICS
>  	/*
> -	 * Free the XIVE devices which are not directly freed by the
> +	 * Free the XIVE and XICS devices which are not directly freed by the
>  	 * device 'release' method
>  	 */
>  	kfree(kvm->arch.xive_devices.native);
>  	kvm->arch.xive_devices.native = NULL;
>  	kfree(kvm->arch.xive_devices.xics_on_xive);
>  	kvm->arch.xive_devices.xics_on_xive = NULL;
> +	kfree(kvm->arch.xics_device);
> +	kvm->arch.xics_device = NULL;
>  #endif /* CONFIG_KVM_XICS */
>  }
>  
> diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c
> index 381bf8dea193..5fee5a11550d 100644
> --- a/arch/powerpc/kvm/book3s_xics.c
> +++ b/arch/powerpc/kvm/book3s_xics.c
> @@ -1334,47 +1334,97 @@ static int xics_has_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
>  	return -ENXIO;
>  }
>  
> -static void kvmppc_xics_free(struct kvm_device *dev)
> +/*
> + * Called when device fd is closed. kvm->lock is held.
> + */
> +static void kvmppc_xics_release(struct kvm_device *dev)
>  {
>  	struct kvmppc_xics *xics = dev->private;
>  	int i;
>  	struct kvm *kvm = xics->kvm;
> +	struct kvm_vcpu *vcpu;
> +
> +	pr_devel("Releasing xics device\n");
> +
> +	/*
> +	 * Since this is the device release function, we know that
> +	 * userspace does not have any open fd referring to the
> +	 * device.  Therefore there can not be any of the device
> +	 * attribute set/get functions being executed concurrently,
> +	 * and similarly, the connect_vcpu and set/clr_mapped
> +	 * functions also cannot be being executed.
> +	 */
>  
>  	debugfs_remove(xics->dentry);
>  
> +	/*
> +	 * We should clean up the vCPU interrupt presenters first.
> +	 */
> +	kvm_for_each_vcpu(i, vcpu, kvm) {
> +		/*
> +		 * Take vcpu->mutex to ensure that no one_reg get/set ioctl
> +		 * (i.e. kvmppc_xics_[gs]et_icp) can be done concurrently.
> +		 * Holding the vcpu->mutex also means that execution is
> +		 * excluded for the vcpu until the ICP was freed. When the vcpu
> +		 * can execute again, vcpu->arch.icp and vcpu->arch.irq_type
> +		 * have been cleared and the vcpu will not be going into the
> +		 * XICS code anymore.
> +		 */
> +		mutex_lock(&vcpu->mutex);
> +		kvmppc_xics_free_icp(vcpu);
> +		mutex_unlock(&vcpu->mutex);
> +	}
> +
>  	if (kvm)
>  		kvm->arch.xics = NULL;
>  
> -	for (i = 0; i <= xics->max_icsid; i++)
> +	for (i = 0; i <= xics->max_icsid; i++) {
>  		kfree(xics->ics[i]);
> -	kfree(xics);
> +		xics->ics[i] = NULL;
> +	}
> +	/*
> +	 * A reference of the kvmppc_xics pointer is now kept under
> +	 * the xics_device pointer of the machine for reuse. It is
> +	 * freed when the VM is destroyed for now until we fix all the
> +	 * execution paths.
> +	 */
>  	kfree(dev);
>  }
>  
> +static struct kvmppc_xics *kvmppc_xics_get_device(struct kvm *kvm)
> +{
> +	struct kvmppc_xics **kvm_xics_device = &kvm->arch.xics_device;
> +	struct kvmppc_xics *xics = *kvm_xics_device;
> +
> +	if (!xics) {
> +		xics = kzalloc(sizeof(*xics), GFP_KERNEL);
> +		*kvm_xics_device = xics;
> +	} else {
> +		memset(xics, 0, sizeof(*xics));
> +	}
> +
> +	return xics;
> +}
> +
>  static int kvmppc_xics_create(struct kvm_device *dev, u32 type)
>  {
>  	struct kvmppc_xics *xics;
>  	struct kvm *kvm = dev->kvm;
> -	int ret = 0;
>  
> -	xics = kzalloc(sizeof(*xics), GFP_KERNEL);
> +	pr_devel("Creating xics for partition\n");
> +
> +	/* Already there ? */
> +	if (kvm->arch.xics)
> +		return -EEXIST;
> +
> +	xics = kvmppc_xics_get_device(kvm);
>  	if (!xics)
>  		return -ENOMEM;
>  
>  	dev->private = xics;
>  	xics->dev = dev;
>  	xics->kvm = kvm;
> -
> -	/* Already there ? */
> -	if (kvm->arch.xics)
> -		ret = -EEXIST;
> -	else
> -		kvm->arch.xics = xics;
> -
> -	if (ret) {
> -		kfree(xics);
> -		return ret;
> -	}
> +	kvm->arch.xics = xics;
>  
>  #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
>  	if (cpu_has_feature(CPU_FTR_ARCH_206) &&
> @@ -1399,7 +1449,7 @@ struct kvm_device_ops kvm_xics_ops = {
>  	.name = "kvm-xics",
>  	.create = kvmppc_xics_create,
>  	.init = kvmppc_xics_init,
> -	.destroy = kvmppc_xics_free,
> +	.release = kvmppc_xics_release,
>  	.set_attr = xics_set_attr,
>  	.get_attr = xics_get_attr,
>  	.has_attr = xics_has_attr,
> @@ -1415,7 +1465,7 @@ int kvmppc_xics_connect_vcpu(struct kvm_device *dev, struct kvm_vcpu *vcpu,
>  		return -EPERM;
>  	if (xics->kvm != vcpu->kvm)
>  		return -EPERM;
> -	if (vcpu->arch.irq_type)
> +	if (vcpu->arch.irq_type != KVMPPC_IRQ_DEFAULT)
>  		return -EBUSY;
>  
>  	r = kvmppc_xics_create_icp(vcpu, xcpu);
> 
> 

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] KVM: PPC: Book3S HV: XICS: Replace the 'destroy' method by a 'release' method
  2020-09-02  7:26   ` Cédric Le Goater
@ 2020-09-02  7:31     ` Greg Kurz
  -1 siblings, 0 replies; 10+ messages in thread
From: Greg Kurz @ 2020-09-02  7:31 UTC (permalink / raw)
  To: Cédric Le Goater; +Cc: Paul Mackerras, David Gibson, kvm-ppc, kvm

On Wed, 2 Sep 2020 09:26:06 +0200
Cédric Le Goater <clg@kaod.org> wrote:

> On 8/10/20 12:08 PM, Greg Kurz wrote:
> > Similarly to what was done with XICS-on-XIVE and XIVE native KVM devices
> > with commit 5422e95103cf ("KVM: PPC: Book3S HV: XIVE: Replace the 'destroy'
> > method by a 'release' method"), convert the historical XICS KVM device to
> > implement the 'release' method. This is needed to run nested guests with
> > an in-kernel IRQ chip. A typical POWER9 guest can select XICS or XIVE
> > during boot, which requires to be able to destroy and to re-create the
> > KVM device. Only the historical XICS KVM device is available under pseries
> > at the current time and it still uses the legacy 'destroy' method.
> > 
> > Switching to 'release' means that vCPUs might still be running when the
> > device is destroyed. In order to avoid potential use-after-free, the
> > kvmppc_xics structure is allocated on first usage and kept around until
> > the VM exits. The same pointer is used each time a KVM XICS device is
> > being created, but this is okay since we only have one per VM.
> > 
> > Clear the ICP of each vCPU with vcpu->mutex held. This ensures that the
> > next time the vCPU resumes execution, it won't be going into the XICS
> > code anymore.
> > 
> > Signed-off-by: Greg Kurz <groug@kaod.org>
> 
> Reviewed-by: Cédric Le Goater <clg@kaod.org>
> 
> and on a P8 host, 
> 
> Tested-by: Cédric Le Goater <clg@kaod.org>
> 
> Thanks,
> 
> C. 
> 

Thanks for the review and testing !

Cheers,

--
Greg

> > ---
> >  arch/powerpc/include/asm/kvm_host.h |    1 
> >  arch/powerpc/kvm/book3s.c           |    4 +-
> >  arch/powerpc/kvm/book3s_xics.c      |   86 ++++++++++++++++++++++++++++-------
> >  3 files changed, 72 insertions(+), 19 deletions(-)
> > 
> > diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
> > index e020d269416d..974adda2ec94 100644
> > --- a/arch/powerpc/include/asm/kvm_host.h
> > +++ b/arch/powerpc/include/asm/kvm_host.h
> > @@ -325,6 +325,7 @@ struct kvm_arch {
> >  #endif
> >  #ifdef CONFIG_KVM_XICS
> >  	struct kvmppc_xics *xics;
> > +	struct kvmppc_xics *xics_device;
> >  	struct kvmppc_xive *xive;    /* Current XIVE device in use */
> >  	struct {
> >  		struct kvmppc_xive *native;
> > diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
> > index 41fedec69ac3..56618c2770e1 100644
> > --- a/arch/powerpc/kvm/book3s.c
> > +++ b/arch/powerpc/kvm/book3s.c
> > @@ -878,13 +878,15 @@ void kvmppc_core_destroy_vm(struct kvm *kvm)
> >  
> >  #ifdef CONFIG_KVM_XICS
> >  	/*
> > -	 * Free the XIVE devices which are not directly freed by the
> > +	 * Free the XIVE and XICS devices which are not directly freed by the
> >  	 * device 'release' method
> >  	 */
> >  	kfree(kvm->arch.xive_devices.native);
> >  	kvm->arch.xive_devices.native = NULL;
> >  	kfree(kvm->arch.xive_devices.xics_on_xive);
> >  	kvm->arch.xive_devices.xics_on_xive = NULL;
> > +	kfree(kvm->arch.xics_device);
> > +	kvm->arch.xics_device = NULL;
> >  #endif /* CONFIG_KVM_XICS */
> >  }
> >  
> > diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c
> > index 381bf8dea193..5fee5a11550d 100644
> > --- a/arch/powerpc/kvm/book3s_xics.c
> > +++ b/arch/powerpc/kvm/book3s_xics.c
> > @@ -1334,47 +1334,97 @@ static int xics_has_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
> >  	return -ENXIO;
> >  }
> >  
> > -static void kvmppc_xics_free(struct kvm_device *dev)
> > +/*
> > + * Called when device fd is closed. kvm->lock is held.
> > + */
> > +static void kvmppc_xics_release(struct kvm_device *dev)
> >  {
> >  	struct kvmppc_xics *xics = dev->private;
> >  	int i;
> >  	struct kvm *kvm = xics->kvm;
> > +	struct kvm_vcpu *vcpu;
> > +
> > +	pr_devel("Releasing xics device\n");
> > +
> > +	/*
> > +	 * Since this is the device release function, we know that
> > +	 * userspace does not have any open fd referring to the
> > +	 * device.  Therefore there can not be any of the device
> > +	 * attribute set/get functions being executed concurrently,
> > +	 * and similarly, the connect_vcpu and set/clr_mapped
> > +	 * functions also cannot be being executed.
> > +	 */
> >  
> >  	debugfs_remove(xics->dentry);
> >  
> > +	/*
> > +	 * We should clean up the vCPU interrupt presenters first.
> > +	 */
> > +	kvm_for_each_vcpu(i, vcpu, kvm) {
> > +		/*
> > +		 * Take vcpu->mutex to ensure that no one_reg get/set ioctl
> > +		 * (i.e. kvmppc_xics_[gs]et_icp) can be done concurrently.
> > +		 * Holding the vcpu->mutex also means that execution is
> > +		 * excluded for the vcpu until the ICP was freed. When the vcpu
> > +		 * can execute again, vcpu->arch.icp and vcpu->arch.irq_type
> > +		 * have been cleared and the vcpu will not be going into the
> > +		 * XICS code anymore.
> > +		 */
> > +		mutex_lock(&vcpu->mutex);
> > +		kvmppc_xics_free_icp(vcpu);
> > +		mutex_unlock(&vcpu->mutex);
> > +	}
> > +
> >  	if (kvm)
> >  		kvm->arch.xics = NULL;
> >  
> > -	for (i = 0; i <= xics->max_icsid; i++)
> > +	for (i = 0; i <= xics->max_icsid; i++) {
> >  		kfree(xics->ics[i]);
> > -	kfree(xics);
> > +		xics->ics[i] = NULL;
> > +	}
> > +	/*
> > +	 * A reference of the kvmppc_xics pointer is now kept under
> > +	 * the xics_device pointer of the machine for reuse. It is
> > +	 * freed when the VM is destroyed for now until we fix all the
> > +	 * execution paths.
> > +	 */
> >  	kfree(dev);
> >  }
> >  
> > +static struct kvmppc_xics *kvmppc_xics_get_device(struct kvm *kvm)
> > +{
> > +	struct kvmppc_xics **kvm_xics_device = &kvm->arch.xics_device;
> > +	struct kvmppc_xics *xics = *kvm_xics_device;
> > +
> > +	if (!xics) {
> > +		xics = kzalloc(sizeof(*xics), GFP_KERNEL);
> > +		*kvm_xics_device = xics;
> > +	} else {
> > +		memset(xics, 0, sizeof(*xics));
> > +	}
> > +
> > +	return xics;
> > +}
> > +
> >  static int kvmppc_xics_create(struct kvm_device *dev, u32 type)
> >  {
> >  	struct kvmppc_xics *xics;
> >  	struct kvm *kvm = dev->kvm;
> > -	int ret = 0;
> >  
> > -	xics = kzalloc(sizeof(*xics), GFP_KERNEL);
> > +	pr_devel("Creating xics for partition\n");
> > +
> > +	/* Already there ? */
> > +	if (kvm->arch.xics)
> > +		return -EEXIST;
> > +
> > +	xics = kvmppc_xics_get_device(kvm);
> >  	if (!xics)
> >  		return -ENOMEM;
> >  
> >  	dev->private = xics;
> >  	xics->dev = dev;
> >  	xics->kvm = kvm;
> > -
> > -	/* Already there ? */
> > -	if (kvm->arch.xics)
> > -		ret = -EEXIST;
> > -	else
> > -		kvm->arch.xics = xics;
> > -
> > -	if (ret) {
> > -		kfree(xics);
> > -		return ret;
> > -	}
> > +	kvm->arch.xics = xics;
> >  
> >  #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
> >  	if (cpu_has_feature(CPU_FTR_ARCH_206) &&
> > @@ -1399,7 +1449,7 @@ struct kvm_device_ops kvm_xics_ops = {
> >  	.name = "kvm-xics",
> >  	.create = kvmppc_xics_create,
> >  	.init = kvmppc_xics_init,
> > -	.destroy = kvmppc_xics_free,
> > +	.release = kvmppc_xics_release,
> >  	.set_attr = xics_set_attr,
> >  	.get_attr = xics_get_attr,
> >  	.has_attr = xics_has_attr,
> > @@ -1415,7 +1465,7 @@ int kvmppc_xics_connect_vcpu(struct kvm_device *dev, struct kvm_vcpu *vcpu,
> >  		return -EPERM;
> >  	if (xics->kvm != vcpu->kvm)
> >  		return -EPERM;
> > -	if (vcpu->arch.irq_type)
> > +	if (vcpu->arch.irq_type != KVMPPC_IRQ_DEFAULT)
> >  		return -EBUSY;
> >  
> >  	r = kvmppc_xics_create_icp(vcpu, xcpu);
> > 
> > 
> 


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] KVM: PPC: Book3S HV: XICS: Replace the 'destroy' method by a 'release' method
@ 2020-09-02  7:31     ` Greg Kurz
  0 siblings, 0 replies; 10+ messages in thread
From: Greg Kurz @ 2020-09-02  7:31 UTC (permalink / raw)
  To: Cédric Le Goater; +Cc: Paul Mackerras, David Gibson, kvm-ppc, kvm

On Wed, 2 Sep 2020 09:26:06 +0200
Cédric Le Goater <clg@kaod.org> wrote:

> On 8/10/20 12:08 PM, Greg Kurz wrote:
> > Similarly to what was done with XICS-on-XIVE and XIVE native KVM devices
> > with commit 5422e95103cf ("KVM: PPC: Book3S HV: XIVE: Replace the 'destroy'
> > method by a 'release' method"), convert the historical XICS KVM device to
> > implement the 'release' method. This is needed to run nested guests with
> > an in-kernel IRQ chip. A typical POWER9 guest can select XICS or XIVE
> > during boot, which requires to be able to destroy and to re-create the
> > KVM device. Only the historical XICS KVM device is available under pseries
> > at the current time and it still uses the legacy 'destroy' method.
> > 
> > Switching to 'release' means that vCPUs might still be running when the
> > device is destroyed. In order to avoid potential use-after-free, the
> > kvmppc_xics structure is allocated on first usage and kept around until
> > the VM exits. The same pointer is used each time a KVM XICS device is
> > being created, but this is okay since we only have one per VM.
> > 
> > Clear the ICP of each vCPU with vcpu->mutex held. This ensures that the
> > next time the vCPU resumes execution, it won't be going into the XICS
> > code anymore.
> > 
> > Signed-off-by: Greg Kurz <groug@kaod.org>
> 
> Reviewed-by: Cédric Le Goater <clg@kaod.org>
> 
> and on a P8 host, 
> 
> Tested-by: Cédric Le Goater <clg@kaod.org>
> 
> Thanks,
> 
> C. 
> 

Thanks for the review and testing !

Cheers,

--
Greg

> > ---
> >  arch/powerpc/include/asm/kvm_host.h |    1 
> >  arch/powerpc/kvm/book3s.c           |    4 +-
> >  arch/powerpc/kvm/book3s_xics.c      |   86 ++++++++++++++++++++++++++++-------
> >  3 files changed, 72 insertions(+), 19 deletions(-)
> > 
> > diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
> > index e020d269416d..974adda2ec94 100644
> > --- a/arch/powerpc/include/asm/kvm_host.h
> > +++ b/arch/powerpc/include/asm/kvm_host.h
> > @@ -325,6 +325,7 @@ struct kvm_arch {
> >  #endif
> >  #ifdef CONFIG_KVM_XICS
> >  	struct kvmppc_xics *xics;
> > +	struct kvmppc_xics *xics_device;
> >  	struct kvmppc_xive *xive;    /* Current XIVE device in use */
> >  	struct {
> >  		struct kvmppc_xive *native;
> > diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
> > index 41fedec69ac3..56618c2770e1 100644
> > --- a/arch/powerpc/kvm/book3s.c
> > +++ b/arch/powerpc/kvm/book3s.c
> > @@ -878,13 +878,15 @@ void kvmppc_core_destroy_vm(struct kvm *kvm)
> >  
> >  #ifdef CONFIG_KVM_XICS
> >  	/*
> > -	 * Free the XIVE devices which are not directly freed by the
> > +	 * Free the XIVE and XICS devices which are not directly freed by the
> >  	 * device 'release' method
> >  	 */
> >  	kfree(kvm->arch.xive_devices.native);
> >  	kvm->arch.xive_devices.native = NULL;
> >  	kfree(kvm->arch.xive_devices.xics_on_xive);
> >  	kvm->arch.xive_devices.xics_on_xive = NULL;
> > +	kfree(kvm->arch.xics_device);
> > +	kvm->arch.xics_device = NULL;
> >  #endif /* CONFIG_KVM_XICS */
> >  }
> >  
> > diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c
> > index 381bf8dea193..5fee5a11550d 100644
> > --- a/arch/powerpc/kvm/book3s_xics.c
> > +++ b/arch/powerpc/kvm/book3s_xics.c
> > @@ -1334,47 +1334,97 @@ static int xics_has_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
> >  	return -ENXIO;
> >  }
> >  
> > -static void kvmppc_xics_free(struct kvm_device *dev)
> > +/*
> > + * Called when device fd is closed. kvm->lock is held.
> > + */
> > +static void kvmppc_xics_release(struct kvm_device *dev)
> >  {
> >  	struct kvmppc_xics *xics = dev->private;
> >  	int i;
> >  	struct kvm *kvm = xics->kvm;
> > +	struct kvm_vcpu *vcpu;
> > +
> > +	pr_devel("Releasing xics device\n");
> > +
> > +	/*
> > +	 * Since this is the device release function, we know that
> > +	 * userspace does not have any open fd referring to the
> > +	 * device.  Therefore there can not be any of the device
> > +	 * attribute set/get functions being executed concurrently,
> > +	 * and similarly, the connect_vcpu and set/clr_mapped
> > +	 * functions also cannot be being executed.
> > +	 */
> >  
> >  	debugfs_remove(xics->dentry);
> >  
> > +	/*
> > +	 * We should clean up the vCPU interrupt presenters first.
> > +	 */
> > +	kvm_for_each_vcpu(i, vcpu, kvm) {
> > +		/*
> > +		 * Take vcpu->mutex to ensure that no one_reg get/set ioctl
> > +		 * (i.e. kvmppc_xics_[gs]et_icp) can be done concurrently.
> > +		 * Holding the vcpu->mutex also means that execution is
> > +		 * excluded for the vcpu until the ICP was freed. When the vcpu
> > +		 * can execute again, vcpu->arch.icp and vcpu->arch.irq_type
> > +		 * have been cleared and the vcpu will not be going into the
> > +		 * XICS code anymore.
> > +		 */
> > +		mutex_lock(&vcpu->mutex);
> > +		kvmppc_xics_free_icp(vcpu);
> > +		mutex_unlock(&vcpu->mutex);
> > +	}
> > +
> >  	if (kvm)
> >  		kvm->arch.xics = NULL;
> >  
> > -	for (i = 0; i <= xics->max_icsid; i++)
> > +	for (i = 0; i <= xics->max_icsid; i++) {
> >  		kfree(xics->ics[i]);
> > -	kfree(xics);
> > +		xics->ics[i] = NULL;
> > +	}
> > +	/*
> > +	 * A reference of the kvmppc_xics pointer is now kept under
> > +	 * the xics_device pointer of the machine for reuse. It is
> > +	 * freed when the VM is destroyed for now until we fix all the
> > +	 * execution paths.
> > +	 */
> >  	kfree(dev);
> >  }
> >  
> > +static struct kvmppc_xics *kvmppc_xics_get_device(struct kvm *kvm)
> > +{
> > +	struct kvmppc_xics **kvm_xics_device = &kvm->arch.xics_device;
> > +	struct kvmppc_xics *xics = *kvm_xics_device;
> > +
> > +	if (!xics) {
> > +		xics = kzalloc(sizeof(*xics), GFP_KERNEL);
> > +		*kvm_xics_device = xics;
> > +	} else {
> > +		memset(xics, 0, sizeof(*xics));
> > +	}
> > +
> > +	return xics;
> > +}
> > +
> >  static int kvmppc_xics_create(struct kvm_device *dev, u32 type)
> >  {
> >  	struct kvmppc_xics *xics;
> >  	struct kvm *kvm = dev->kvm;
> > -	int ret = 0;
> >  
> > -	xics = kzalloc(sizeof(*xics), GFP_KERNEL);
> > +	pr_devel("Creating xics for partition\n");
> > +
> > +	/* Already there ? */
> > +	if (kvm->arch.xics)
> > +		return -EEXIST;
> > +
> > +	xics = kvmppc_xics_get_device(kvm);
> >  	if (!xics)
> >  		return -ENOMEM;
> >  
> >  	dev->private = xics;
> >  	xics->dev = dev;
> >  	xics->kvm = kvm;
> > -
> > -	/* Already there ? */
> > -	if (kvm->arch.xics)
> > -		ret = -EEXIST;
> > -	else
> > -		kvm->arch.xics = xics;
> > -
> > -	if (ret) {
> > -		kfree(xics);
> > -		return ret;
> > -	}
> > +	kvm->arch.xics = xics;
> >  
> >  #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
> >  	if (cpu_has_feature(CPU_FTR_ARCH_206) &&
> > @@ -1399,7 +1449,7 @@ struct kvm_device_ops kvm_xics_ops = {
> >  	.name = "kvm-xics",
> >  	.create = kvmppc_xics_create,
> >  	.init = kvmppc_xics_init,
> > -	.destroy = kvmppc_xics_free,
> > +	.release = kvmppc_xics_release,
> >  	.set_attr = xics_set_attr,
> >  	.get_attr = xics_get_attr,
> >  	.has_attr = xics_has_attr,
> > @@ -1415,7 +1465,7 @@ int kvmppc_xics_connect_vcpu(struct kvm_device *dev, struct kvm_vcpu *vcpu,
> >  		return -EPERM;
> >  	if (xics->kvm != vcpu->kvm)
> >  		return -EPERM;
> > -	if (vcpu->arch.irq_type)
> > +	if (vcpu->arch.irq_type != KVMPPC_IRQ_DEFAULT)
> >  		return -EBUSY;
> >  
> >  	r = kvmppc_xics_create_icp(vcpu, xcpu);
> > 
> > 
> 

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] KVM: PPC: Book3S HV: XICS: Replace the 'destroy' method by a 'release' method
  2020-08-10 10:08 ` Greg Kurz
@ 2020-09-03 22:44   ` Paul Mackerras
  -1 siblings, 0 replies; 10+ messages in thread
From: Paul Mackerras @ 2020-09-03 22:44 UTC (permalink / raw)
  To: Greg Kurz; +Cc: David Gibson, Cédric Le Goater, kvm-ppc, kvm

On Mon, Aug 10, 2020 at 12:08:05PM +0200, Greg Kurz wrote:
> Similarly to what was done with XICS-on-XIVE and XIVE native KVM devices
> with commit 5422e95103cf ("KVM: PPC: Book3S HV: XIVE: Replace the 'destroy'
> method by a 'release' method"), convert the historical XICS KVM device to
> implement the 'release' method. This is needed to run nested guests with
> an in-kernel IRQ chip. A typical POWER9 guest can select XICS or XIVE
> during boot, which requires to be able to destroy and to re-create the
> KVM device. Only the historical XICS KVM device is available under pseries
> at the current time and it still uses the legacy 'destroy' method.
> 
> Switching to 'release' means that vCPUs might still be running when the
> device is destroyed. In order to avoid potential use-after-free, the
> kvmppc_xics structure is allocated on first usage and kept around until
> the VM exits. The same pointer is used each time a KVM XICS device is
> being created, but this is okay since we only have one per VM.
> 
> Clear the ICP of each vCPU with vcpu->mutex held. This ensures that the
> next time the vCPU resumes execution, it won't be going into the XICS
> code anymore.
> 
> Signed-off-by: Greg Kurz <groug@kaod.org>

Thanks, applied to my kvm-ppc-next branch.

Paul.

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] KVM: PPC: Book3S HV: XICS: Replace the 'destroy' method by a 'release' method
@ 2020-09-03 22:44   ` Paul Mackerras
  0 siblings, 0 replies; 10+ messages in thread
From: Paul Mackerras @ 2020-09-03 22:44 UTC (permalink / raw)
  To: Greg Kurz; +Cc: David Gibson, Cédric Le Goater, kvm-ppc, kvm

On Mon, Aug 10, 2020 at 12:08:05PM +0200, Greg Kurz wrote:
> Similarly to what was done with XICS-on-XIVE and XIVE native KVM devices
> with commit 5422e95103cf ("KVM: PPC: Book3S HV: XIVE: Replace the 'destroy'
> method by a 'release' method"), convert the historical XICS KVM device to
> implement the 'release' method. This is needed to run nested guests with
> an in-kernel IRQ chip. A typical POWER9 guest can select XICS or XIVE
> during boot, which requires to be able to destroy and to re-create the
> KVM device. Only the historical XICS KVM device is available under pseries
> at the current time and it still uses the legacy 'destroy' method.
> 
> Switching to 'release' means that vCPUs might still be running when the
> device is destroyed. In order to avoid potential use-after-free, the
> kvmppc_xics structure is allocated on first usage and kept around until
> the VM exits. The same pointer is used each time a KVM XICS device is
> being created, but this is okay since we only have one per VM.
> 
> Clear the ICP of each vCPU with vcpu->mutex held. This ensures that the
> next time the vCPU resumes execution, it won't be going into the XICS
> code anymore.
> 
> Signed-off-by: Greg Kurz <groug@kaod.org>

Thanks, applied to my kvm-ppc-next branch.

Paul.

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2020-09-03 22:44 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-08-10 10:08 [PATCH] KVM: PPC: Book3S HV: XICS: Replace the 'destroy' method by a 'release' method Greg Kurz
2020-08-10 10:08 ` Greg Kurz
2020-08-31  8:03 ` Greg Kurz
2020-08-31  8:03   ` Greg Kurz
2020-09-02  7:26 ` Cédric Le Goater
2020-09-02  7:26   ` Cédric Le Goater
2020-09-02  7:31   ` Greg Kurz
2020-09-02  7:31     ` Greg Kurz
2020-09-03 22:44 ` Paul Mackerras
2020-09-03 22:44   ` Paul Mackerras

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.