On Fri, Feb 22, 2019 at 12:28:34PM +0100, Cédric Le Goater wrote: > At a VCPU level, the state of the thread interrupt management > registers needs to be collected. These registers are cached under the > 'xive_saved_state.w01' field of the VCPU when the VCPU context is > pulled from the HW thread. An OPAL call retrieves the backup of the > IPB register in the underlying XIVE NVT structure and merges it in the > KVM state. > > The structures of the interface between QEMU and KVM provision some > extra room (two u64) for further extensions if more state needs to be > transferred back to QEMU. > > Signed-off-by: Cédric Le Goater > --- > arch/powerpc/include/asm/kvm_ppc.h | 11 +++ > arch/powerpc/include/uapi/asm/kvm.h | 2 + > arch/powerpc/kvm/book3s.c | 24 +++++++ > arch/powerpc/kvm/book3s_xive_native.c | 82 ++++++++++++++++++++++ > Documentation/virtual/kvm/devices/xive.txt | 19 +++++ > 5 files changed, 138 insertions(+) > > diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h > index 1e61877fe147..664c65051612 100644 > --- a/arch/powerpc/include/asm/kvm_ppc.h > +++ b/arch/powerpc/include/asm/kvm_ppc.h > @@ -272,6 +272,7 @@ union kvmppc_one_reg { > u64 addr; > u64 length; > } vpaval; > + u64 xive_timaval[4]; This is doubling the size of the userspace-visible one_reg union. Is that safe? 
> }; > > struct kvmppc_ops { > @@ -604,6 +605,10 @@ extern int kvmppc_xive_native_connect_vcpu(struct kvm_device *dev, > extern void kvmppc_xive_native_cleanup_vcpu(struct kvm_vcpu *vcpu); > extern void kvmppc_xive_native_init_module(void); > extern void kvmppc_xive_native_exit_module(void); > +extern int kvmppc_xive_native_get_vp(struct kvm_vcpu *vcpu, > + union kvmppc_one_reg *val); > +extern int kvmppc_xive_native_set_vp(struct kvm_vcpu *vcpu, > + union kvmppc_one_reg *val); > > #else > static inline int kvmppc_xive_set_xive(struct kvm *kvm, u32 irq, u32 server, > @@ -636,6 +641,12 @@ static inline int kvmppc_xive_native_connect_vcpu(struct kvm_device *dev, > static inline void kvmppc_xive_native_cleanup_vcpu(struct kvm_vcpu *vcpu) { } > static inline void kvmppc_xive_native_init_module(void) { } > static inline void kvmppc_xive_native_exit_module(void) { } > +static inline int kvmppc_xive_native_get_vp(struct kvm_vcpu *vcpu, > + union kvmppc_one_reg *val) > +{ return 0; } > +static inline int kvmppc_xive_native_set_vp(struct kvm_vcpu *vcpu, > + union kvmppc_one_reg *val) > +{ return -ENOENT; } > > #endif /* CONFIG_KVM_XIVE */ > > diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h > index cd78ad1020fe..42d4ef93ec2d 100644 > --- a/arch/powerpc/include/uapi/asm/kvm.h > +++ b/arch/powerpc/include/uapi/asm/kvm.h > @@ -480,6 +480,8 @@ struct kvm_ppc_cpu_char { > #define KVM_REG_PPC_ICP_PPRI_SHIFT 16 /* pending irq priority */ > #define KVM_REG_PPC_ICP_PPRI_MASK 0xff > > +#define KVM_REG_PPC_VP_STATE (KVM_REG_PPC | KVM_REG_SIZE_U256 | 0x8d) > + > /* Device control API: PPC-specific devices */ > #define KVM_DEV_MPIC_GRP_MISC 1 > #define KVM_DEV_MPIC_BASE_ADDR 0 /* 64-bit */ > diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c > index 96d43f091255..f85a9211f30c 100644 > --- a/arch/powerpc/kvm/book3s.c > +++ b/arch/powerpc/kvm/book3s.c > @@ -641,6 +641,18 @@ int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id, > *val = 
get_reg_val(id, kvmppc_xics_get_icp(vcpu)); > break; > #endif /* CONFIG_KVM_XICS */ > +#ifdef CONFIG_KVM_XIVE > + case KVM_REG_PPC_VP_STATE: > + if (!vcpu->arch.xive_vcpu) { > + r = -ENXIO; > + break; > + } > + if (xive_enabled()) > + r = kvmppc_xive_native_get_vp(vcpu, val); > + else > + r = -ENXIO; > + break; > +#endif /* CONFIG_KVM_XIVE */ > case KVM_REG_PPC_FSCR: > *val = get_reg_val(id, vcpu->arch.fscr); > break; > @@ -714,6 +726,18 @@ int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id, > r = kvmppc_xics_set_icp(vcpu, set_reg_val(id, *val)); > break; > #endif /* CONFIG_KVM_XICS */ > +#ifdef CONFIG_KVM_XIVE > + case KVM_REG_PPC_VP_STATE: > + if (!vcpu->arch.xive_vcpu) { > + r = -ENXIO; > + break; > + } > + if (xive_enabled()) > + r = kvmppc_xive_native_set_vp(vcpu, val); > + else > + r = -ENXIO; > + break; > +#endif /* CONFIG_KVM_XIVE */ > case KVM_REG_PPC_FSCR: > vcpu->arch.fscr = set_reg_val(id, *val); > break; > diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c > index 3debc876d5a0..132bff52d70a 100644 > --- a/arch/powerpc/kvm/book3s_xive_native.c > +++ b/arch/powerpc/kvm/book3s_xive_native.c > @@ -845,6 +845,88 @@ static int kvmppc_xive_native_create(struct kvm_device *dev, u32 type) > return ret; > } > > +/* > + * Interrupt Pending Buffer (IPB) offset > + */ > +#define TM_IPB_SHIFT 40 > +#define TM_IPB_MASK (((u64) 0xFF) << TM_IPB_SHIFT) > + > +int kvmppc_xive_native_get_vp(struct kvm_vcpu *vcpu, union kvmppc_one_reg *val) > +{ > + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; > + u64 opal_state; > + int rc; > + > + if (!kvmppc_xive_enabled(vcpu)) > + return -EPERM; > + > + if (!xc) > + return -ENOENT; > + > + /* Thread context registers. We only care about IPB and CPPR */ > + val->xive_timaval[0] = vcpu->arch.xive_saved_state.w01; > + > + /* > + * Return the OS CAM line to print out the VP identifier in > + * the QEMU monitor. This is not restored. 
> + */ > + val->xive_timaval[1] = vcpu->arch.xive_cam_word; I'm pretty dubious about this mixing of vital state information with what's basically debug information. Doubly so since it requires changing the ABI to increase the one_reg union's size. Might be better to have this control only return the 0th and 2nd u64s from the TIMA, with the CAM debug information returned via some other mechanism. > + > + /* Get the VP state from OPAL */ > + rc = xive_native_get_vp_state(xc->vp_id, &opal_state); > + if (rc) > + return rc; > + > + /* > + * Capture the backup of IPB register in the NVT structure and > + * merge it in our KVM VP state. > + */ > + val->xive_timaval[0] |= cpu_to_be64(opal_state & TM_IPB_MASK); > + > + pr_devel("%s NSR=%02x CPPR=%02x IBP=%02x PIPR=%02x w01=%016llx w2=%08x opal=%016llx\n", > + __func__, > + vcpu->arch.xive_saved_state.nsr, > + vcpu->arch.xive_saved_state.cppr, > + vcpu->arch.xive_saved_state.ipb, > + vcpu->arch.xive_saved_state.pipr, > + vcpu->arch.xive_saved_state.w01, > + (u32) vcpu->arch.xive_cam_word, opal_state); Hrm.. except you don't seem to be using the last half of the timaval field anyway. > + > + return 0; > +} > + > +int kvmppc_xive_native_set_vp(struct kvm_vcpu *vcpu, union kvmppc_one_reg *val) > +{ > + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; > + struct kvmppc_xive *xive = vcpu->kvm->arch.xive; > + > + pr_devel("%s w01=%016llx vp=%016llx\n", __func__, > + val->xive_timaval[0], val->xive_timaval[1]); > + > + if (!kvmppc_xive_enabled(vcpu)) > + return -EPERM; > + > + if (!xc || !xive) > + return -ENOENT; > + > + /* We can't update the state of a "pushed" VCPU */ > + if (WARN_ON(vcpu->arch.xive_pushed)) What prevents userspace from tripping this WARN_ON()? > + return -EIO; EBUSY might be more appropriate here. > + > + /* > + * Restore the thread context registers. IPB and CPPR should > + * be the only ones that matter. 
> + */ > + vcpu->arch.xive_saved_state.w01 = val->xive_timaval[0]; > + > + /* > + * There is no need to restore the XIVE internal state (IPB > + * stored in the NVT) as the IPB register was merged in KVM VP > + * state when captured. > + */ > + return 0; > +} > + > static int xive_native_debug_show(struct seq_file *m, void *private) > { > struct kvmppc_xive *xive = m->private; > diff --git a/Documentation/virtual/kvm/devices/xive.txt b/Documentation/virtual/kvm/devices/xive.txt > index a26be635cff9..1b8957c50c53 100644 > --- a/Documentation/virtual/kvm/devices/xive.txt > +++ b/Documentation/virtual/kvm/devices/xive.txt > @@ -102,6 +102,25 @@ the legacy interrupt mode, referred as XICS (POWER7/8). > -EINVAL: Not initialized source number, invalid priority or > invalid CPU number. > > +* VCPU state > + > + The XIVE IC maintains VP interrupt state in an internal structure > + called the NVT. When a VP is not dispatched on a HW processor > + thread, this structure can be updated by HW if the VP is the target > + of an event notification. > + > + It is important for migration to capture the cached IPB from the NVT > + as it synthesizes the priorities of the pending interrupts. We > + capture a bit more to report debug information. > + > + KVM_REG_PPC_VP_STATE (4 * 64bits) > + bits: | 63 .... 32 | 31 .... 0 | > + values: | TIMA word0 | TIMA word1 | > + bits: | 127 .......... 64 | > + values: | VP CAM Line | > + bits: | 255 .......... 128 | > + values: | unused | > + > * Migration: > > Saving the state of a VM using the XIVE native exploitation mode -- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibson