All of lore.kernel.org
 help / color / mirror / Atom feed
* [Qemu-devel] [PATCH 00/13] spapr: add KVM support to the XIVE interrupt mode
@ 2019-01-07 18:39 Cédric Le Goater
  2019-01-07 18:39 ` [Qemu-devel] [PATCH 01/13] linux-headers: update to 5.0 Cédric Le Goater
                   ` (12 more replies)
  0 siblings, 13 replies; 43+ messages in thread
From: Cédric Le Goater @ 2019-01-07 18:39 UTC (permalink / raw)
  To: David Gibson
  Cc: Benjamin Herrenschmidt, qemu-ppc, qemu-devel, Cédric Le Goater

Hello,

Following the 'dual' IRQ backend, this series adds KVM support to the
XIVE interrupt mode.

The first patches introduce the XIVE KVM device, state synchronization
and migration support under KVM. The second part of the patchset
modifies the XICS and XIVE interrupt models to add KVM support to the
'dual' IRQ backend.

This is a first round to check that the interfaces with Linux/KVM are
well in place. 

GitHub trees available here :
 
QEMU sPAPR:

  https://github.com/legoater/qemu/commits/xive-next
  
Linux/KVM:

  https://github.com/legoater/linux/commits/xive-5.0

OPAL:

  https://github.com/legoater/skiboot/commits/xive

Thanks,

C.

Cédric Le Goater (13):
  linux-headers: update to 5.0
  spapr/xive: add KVM support
  spapr/xive: add state synchronization with KVM
  spapr/xive: introduce a VM state change handler
  spapr/xive: add migration support for KVM
  spapr/xive: fix migration of the XiveTCTX under TCG
  ppc/xics: introduce a icp_kvm_connect() routine
  spapr/rtas: modify spapr_rtas_register() to remove RTAS handlers
  sysbus: add a sysbus_mmio_unmap() helper
  spapr: introduce routines to delete the KVM IRQ device
  spapr: check for the activation of the KVM IRQ device
  spapr/xics: ignore the lower 4K in the IRQ number space
  spapr: add KVM support to the 'dual' machine

 default-configs/ppc64-softmmu.mak |   1 +
 include/hw/ppc/spapr.h            |   4 +
 include/hw/ppc/spapr_xive.h       |  26 +
 include/hw/ppc/xics.h             |   1 +
 include/hw/ppc/xive.h             |  24 +
 include/hw/sysbus.h               |   1 +
 linux-headers/asm-powerpc/kvm.h   |  46 ++
 linux-headers/linux/kvm.h         |   9 +
 target/ppc/kvm_ppc.h              |   6 +
 hw/core/sysbus.c                  |  10 +
 hw/intc/spapr_xive.c              |  83 ++-
 hw/intc/spapr_xive_kvm.c          | 886 ++++++++++++++++++++++++++++++
 hw/intc/xics_kvm.c                | 154 +++++-
 hw/intc/xive.c                    |  45 +-
 hw/ppc/spapr_irq.c                | 114 +++-
 hw/ppc/spapr_rtas.c               |   2 +-
 target/ppc/kvm.c                  |   7 +
 hw/intc/Makefile.objs             |   1 +
 18 files changed, 1363 insertions(+), 57 deletions(-)
 create mode 100644 hw/intc/spapr_xive_kvm.c

-- 
2.20.1

^ permalink raw reply	[flat|nested] 43+ messages in thread

* [Qemu-devel] [PATCH 01/13] linux-headers: update to 5.0
  2019-01-07 18:39 [Qemu-devel] [PATCH 00/13] spapr: add KVM support to the XIVE interrupt mode Cédric Le Goater
@ 2019-01-07 18:39 ` Cédric Le Goater
  2019-01-07 18:39 ` [Qemu-devel] [PATCH 02/13] spapr/xive: add KVM support Cédric Le Goater
                   ` (11 subsequent siblings)
  12 siblings, 0 replies; 43+ messages in thread
From: Cédric Le Goater @ 2019-01-07 18:39 UTC (permalink / raw)
  To: David Gibson
  Cc: Benjamin Herrenschmidt, qemu-ppc, qemu-devel, Cédric Le Goater

These changes provide the interface to the KVM device implementing
the XIVE native exploitation interrupt mode. It is also used to retrieve
the state of the KVM device for the monitor and for migration.

Available from :

  https://github.com/legoater/linux/commits/xive-5.0

Signed-off-by: Cédric Le Goater <clg@kaod.org>
---
 linux-headers/asm-powerpc/kvm.h | 46 +++++++++++++++++++++++++++++++++
 linux-headers/linux/kvm.h       |  9 +++++++
 2 files changed, 55 insertions(+)

diff --git a/linux-headers/asm-powerpc/kvm.h b/linux-headers/asm-powerpc/kvm.h
index 8c876c166ef2..10fe86c21e8f 100644
--- a/linux-headers/asm-powerpc/kvm.h
+++ b/linux-headers/asm-powerpc/kvm.h
@@ -480,6 +480,8 @@ struct kvm_ppc_cpu_char {
 #define  KVM_REG_PPC_ICP_PPRI_SHIFT	16	/* pending irq priority */
 #define  KVM_REG_PPC_ICP_PPRI_MASK	0xff
 
+#define KVM_REG_PPC_NVT_STATE	(KVM_REG_PPC | KVM_REG_SIZE_U256 | 0x8d)
+
 /* Device control API: PPC-specific devices */
 #define KVM_DEV_MPIC_GRP_MISC		1
 #define   KVM_DEV_MPIC_BASE_ADDR	0	/* 64-bit */
@@ -675,4 +677,48 @@ struct kvm_ppc_cpu_char {
 #define  KVM_XICS_PRESENTED		(1ULL << 43)
 #define  KVM_XICS_QUEUED		(1ULL << 44)
 
+/* POWER9 XIVE Native Interrupt Controller */
+#define KVM_DEV_XIVE_GRP_CTRL		1
+#define   KVM_DEV_XIVE_GET_ESB_FD	1
+#define   KVM_DEV_XIVE_GET_TIMA_FD	2
+#define   KVM_DEV_XIVE_VC_BASE		3
+#define   KVM_DEV_XIVE_SAVE_EQ_PAGES	4
+#define KVM_DEV_XIVE_GRP_SOURCES	2	/* 64-bit source attributes */
+#define KVM_DEV_XIVE_GRP_SYNC		3	/* 64-bit source attributes */
+#define KVM_DEV_XIVE_GRP_EAS		4	/* 64-bit eas attributes */
+#define KVM_DEV_XIVE_GRP_EQ		5	/* 64-bit eq attributes */
+
+/* Layout of 64-bit XIVE source attribute values */
+#define KVM_XIVE_LEVEL_SENSITIVE	(1ULL << 0)
+#define KVM_XIVE_LEVEL_ASSERTED		(1ULL << 1)
+
+/* Layout of 64-bit eas attribute values */
+#define KVM_XIVE_EAS_PRIORITY_SHIFT	0
+#define KVM_XIVE_EAS_PRIORITY_MASK	0x7
+#define KVM_XIVE_EAS_SERVER_SHIFT	3
+#define KVM_XIVE_EAS_SERVER_MASK	0xfffffff8ULL
+#define KVM_XIVE_EAS_MASK_SHIFT		32
+#define KVM_XIVE_EAS_MASK_MASK		0x100000000ULL
+#define KVM_XIVE_EAS_EISN_SHIFT		33
+#define KVM_XIVE_EAS_EISN_MASK		0xfffffffe00000000ULL
+
+/* Layout of 64-bit eq attribute */
+#define KVM_XIVE_EQ_PRIORITY_SHIFT	0
+#define KVM_XIVE_EQ_PRIORITY_MASK	0x7
+#define KVM_XIVE_EQ_SERVER_SHIFT	3
+#define KVM_XIVE_EQ_SERVER_MASK		0xfffffff8ULL
+
+/* Layout of 64-bit eq attribute values */
+struct kvm_ppc_xive_eq {
+	__u32 flags;
+	__u32 qsize;
+	__u64 qpage;
+	__u32 qtoggle;
+	__u32 qindex;
+};
+
+#define KVM_XIVE_EQ_FLAG_ENABLED	0x00000001
+#define KVM_XIVE_EQ_FLAG_ALWAYS_NOTIFY	0x00000002
+#define KVM_XIVE_EQ_FLAG_ESCALATE	0x00000004
+
 #endif /* __LINUX_KVM_POWERPC_H */
diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h
index f11a7eb49cfa..7f476ad5e4e8 100644
--- a/linux-headers/linux/kvm.h
+++ b/linux-headers/linux/kvm.h
@@ -1,3 +1,4 @@
+
 /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
 #ifndef __LINUX_KVM_H
 #define __LINUX_KVM_H
@@ -965,6 +966,10 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_COALESCED_PIO 162
 #define KVM_CAP_HYPERV_ENLIGHTENED_VMCS 163
 #define KVM_CAP_EXCEPTION_PAYLOAD 164
+#define KVM_CAP_ARM_VM_IPA_SIZE 165
+#define KVM_CAP_MANUAL_DIRTY_LOG_PROTECT 166
+#define KVM_CAP_HYPERV_CPUID 167
+#define KVM_CAP_PPC_IRQ_XIVE 168
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -1188,6 +1193,8 @@ enum kvm_device_type {
 #define KVM_DEV_TYPE_ARM_VGIC_V3	KVM_DEV_TYPE_ARM_VGIC_V3
 	KVM_DEV_TYPE_ARM_VGIC_ITS,
 #define KVM_DEV_TYPE_ARM_VGIC_ITS	KVM_DEV_TYPE_ARM_VGIC_ITS
+	KVM_DEV_TYPE_XIVE,
+#define KVM_DEV_TYPE_XIVE		KVM_DEV_TYPE_XIVE
 	KVM_DEV_TYPE_MAX,
 };
 
@@ -1305,6 +1312,8 @@ struct kvm_s390_ucas_mapping {
 #define KVM_GET_DEVICE_ATTR	  _IOW(KVMIO,  0xe2, struct kvm_device_attr)
 #define KVM_HAS_DEVICE_ATTR	  _IOW(KVMIO,  0xe3, struct kvm_device_attr)
 
+#define KVM_DESTROY_DEVICE	  _IOWR(KVMIO,  0xf0, struct kvm_create_device)
+
 /*
  * ioctls for vcpu fds
  */
-- 
2.20.1

^ permalink raw reply related	[flat|nested] 43+ messages in thread

* [Qemu-devel] [PATCH 02/13] spapr/xive: add KVM support
  2019-01-07 18:39 [Qemu-devel] [PATCH 00/13] spapr: add KVM support to the XIVE interrupt mode Cédric Le Goater
  2019-01-07 18:39 ` [Qemu-devel] [PATCH 01/13] linux-headers: update to 5.0 Cédric Le Goater
@ 2019-01-07 18:39 ` Cédric Le Goater
  2019-02-06  2:39   ` David Gibson
  2019-01-07 18:39 ` [Qemu-devel] [PATCH 03/13] spapr/xive: add state synchronization with KVM Cédric Le Goater
                   ` (10 subsequent siblings)
  12 siblings, 1 reply; 43+ messages in thread
From: Cédric Le Goater @ 2019-01-07 18:39 UTC (permalink / raw)
  To: David Gibson
  Cc: Benjamin Herrenschmidt, qemu-ppc, qemu-devel, Cédric Le Goater

This introduces a set of helpers, used when KVM is in use, which create
the KVM XIVE device, initialize the interrupt sources at the KVM level
and connect the interrupt presenters to the vCPUs.

They also handle the initialization of the TIMA and the source ESB
memory regions of the controller. These have a different type under
KVM. They are 'ram device' memory mappings, similarly to VFIO, exposed
to the guest and the associated VMAs on the host are populated
dynamically with the appropriate pages using a fault handler.

Signed-off-by: Cédric Le Goater <clg@kaod.org>
---
 default-configs/ppc64-softmmu.mak |   1 +
 include/hw/ppc/spapr_xive.h       |  10 ++
 include/hw/ppc/xive.h             |  22 +++
 target/ppc/kvm_ppc.h              |   6 +
 hw/intc/spapr_xive.c              |  31 ++--
 hw/intc/spapr_xive_kvm.c          | 254 ++++++++++++++++++++++++++++++
 hw/intc/xive.c                    |  22 ++-
 hw/ppc/spapr_irq.c                |  11 +-
 target/ppc/kvm.c                  |   7 +
 hw/intc/Makefile.objs             |   1 +
 10 files changed, 349 insertions(+), 16 deletions(-)
 create mode 100644 hw/intc/spapr_xive_kvm.c

diff --git a/default-configs/ppc64-softmmu.mak b/default-configs/ppc64-softmmu.mak
index 7f34ad0528ed..c1bf5cd951f5 100644
--- a/default-configs/ppc64-softmmu.mak
+++ b/default-configs/ppc64-softmmu.mak
@@ -18,6 +18,7 @@ CONFIG_XICS_SPAPR=$(CONFIG_PSERIES)
 CONFIG_XICS_KVM=$(call land,$(CONFIG_PSERIES),$(CONFIG_KVM))
 CONFIG_XIVE=$(CONFIG_PSERIES)
 CONFIG_XIVE_SPAPR=$(CONFIG_PSERIES)
+CONFIG_XIVE_KVM=$(call land,$(CONFIG_PSERIES),$(CONFIG_KVM))
 CONFIG_MEM_DEVICE=y
 CONFIG_DIMM=y
 CONFIG_SPAPR_RNG=y
diff --git a/include/hw/ppc/spapr_xive.h b/include/hw/ppc/spapr_xive.h
index 7fdc25057420..24a0be478039 100644
--- a/include/hw/ppc/spapr_xive.h
+++ b/include/hw/ppc/spapr_xive.h
@@ -35,6 +35,10 @@ typedef struct sPAPRXive {
     /* TIMA mapping address */
     hwaddr        tm_base;
     MemoryRegion  tm_mmio;
+
+    /* KVM support */
+    int           fd;
+    void          *tm_mmap;
 } sPAPRXive;
 
 bool spapr_xive_irq_claim(sPAPRXive *xive, uint32_t lisn, bool lsi);
@@ -48,5 +52,11 @@ void spapr_dt_xive(sPAPRMachineState *spapr, uint32_t nr_servers, void *fdt,
                    uint32_t phandle);
 void spapr_xive_set_tctx_os_cam(XiveTCTX *tctx);
 void spapr_xive_mmio_set_enabled(sPAPRXive *xive, bool enable);
+void spapr_xive_map_mmio(sPAPRXive *xive);
+
+/*
+ * KVM XIVE device helpers
+ */
+void kvmppc_xive_connect(sPAPRXive *xive, Error **errp);
 
 #endif /* PPC_SPAPR_XIVE_H */
diff --git a/include/hw/ppc/xive.h b/include/hw/ppc/xive.h
index ec23253ba448..4bbba8d39a65 100644
--- a/include/hw/ppc/xive.h
+++ b/include/hw/ppc/xive.h
@@ -140,6 +140,7 @@
 #ifndef PPC_XIVE_H
 #define PPC_XIVE_H
 
+#include "sysemu/kvm.h"
 #include "hw/qdev-core.h"
 #include "hw/sysbus.h"
 #include "hw/ppc/xive_regs.h"
@@ -194,6 +195,9 @@ typedef struct XiveSource {
     uint32_t        esb_shift;
     MemoryRegion    esb_mmio;
 
+    /* KVM support */
+    void            *esb_mmap;
+
     XiveNotifier    *xive;
 } XiveSource;
 
@@ -421,4 +425,22 @@ static inline uint32_t xive_nvt_cam_line(uint8_t nvt_blk, uint32_t nvt_idx)
     return (nvt_blk << 19) | nvt_idx;
 }
 
+/*
+ * KVM XIVE device helpers
+ */
+
+/*
+ * True when KVM is in use and the machine allows an in-kernel irqchip.
+ * Keep inlined to discard compile of KVM code sections.
+ */
+static inline bool kvmppc_xive_enabled(void)
+{
+    return kvm_enabled() &&
+        machine_kernel_irqchip_allowed(MACHINE(qdev_get_machine()));
+}
+
+void kvmppc_xive_source_reset(XiveSource *xsrc, Error **errp);
+void kvmppc_xive_source_set_irq(void *opaque, int srcno, int val);
+void kvmppc_xive_cpu_connect(XiveTCTX *tctx, Error **errp);
+
 #endif /* PPC_XIVE_H */
diff --git a/target/ppc/kvm_ppc.h b/target/ppc/kvm_ppc.h
index bdfaa4e70a83..d2159660f9f2 100644
--- a/target/ppc/kvm_ppc.h
+++ b/target/ppc/kvm_ppc.h
@@ -59,6 +59,7 @@ bool kvmppc_has_cap_fixup_hcalls(void);
 bool kvmppc_has_cap_htm(void);
 bool kvmppc_has_cap_mmu_radix(void);
 bool kvmppc_has_cap_mmu_hash_v3(void);
+bool kvmppc_has_cap_xive(void);
 int kvmppc_get_cap_safe_cache(void);
 int kvmppc_get_cap_safe_bounds_check(void);
 int kvmppc_get_cap_safe_indirect_branch(void);
@@ -307,6 +308,11 @@ static inline bool kvmppc_has_cap_mmu_hash_v3(void)
     return false;
 }
 
+static inline bool kvmppc_has_cap_xive(void)
+{
+    return false;
+}
+
 static inline int kvmppc_get_cap_safe_cache(void)
 {
     return 0;
diff --git a/hw/intc/spapr_xive.c b/hw/intc/spapr_xive.c
index d391177ab81f..cf6d3a5f12e1 100644
--- a/hw/intc/spapr_xive.c
+++ b/hw/intc/spapr_xive.c
@@ -172,7 +172,7 @@ void spapr_xive_pic_print_info(sPAPRXive *xive, Monitor *mon)
     }
 }
 
-static void spapr_xive_map_mmio(sPAPRXive *xive)
+void spapr_xive_map_mmio(sPAPRXive *xive)
 {
     sysbus_mmio_map(SYS_BUS_DEVICE(xive), 0, xive->vc_base);
     sysbus_mmio_map(SYS_BUS_DEVICE(xive), 1, xive->end_base);
@@ -250,6 +250,9 @@ static void spapr_xive_instance_init(Object *obj)
                       TYPE_XIVE_END_SOURCE);
     object_property_add_child(obj, "end_source", OBJECT(&xive->end_source),
                               NULL);
+
+    /* Not connected to the KVM XIVE device */
+    xive->fd = -1;
 }
 
 static void spapr_xive_realize(DeviceState *dev, Error **errp)
@@ -304,17 +307,25 @@ static void spapr_xive_realize(DeviceState *dev, Error **errp)
     xive->eat = g_new0(XiveEAS, xive->nr_irqs);
     xive->endt = g_new0(XiveEND, xive->nr_ends);
 
-    /* TIMA initialization */
-    memory_region_init_io(&xive->tm_mmio, OBJECT(xive), &xive_tm_ops, xive,
-                          "xive.tima", 4ull << TM_SHIFT);
+    if (kvmppc_xive_enabled()) {
+        kvmppc_xive_connect(xive, &local_err);
+        if (local_err) {
+            error_propagate(errp, local_err);
+            return;
+        }
+    } else {
+        /* TIMA initialization */
+        memory_region_init_io(&xive->tm_mmio, OBJECT(xive), &xive_tm_ops, xive,
+                              "xive.tima", 4ull << TM_SHIFT);
 
-    /* Define all XIVE MMIO regions on SysBus */
-    sysbus_init_mmio(SYS_BUS_DEVICE(xive), &xsrc->esb_mmio);
-    sysbus_init_mmio(SYS_BUS_DEVICE(xive), &end_xsrc->esb_mmio);
-    sysbus_init_mmio(SYS_BUS_DEVICE(xive), &xive->tm_mmio);
+        /* Define all XIVE MMIO regions on SysBus */
+        sysbus_init_mmio(SYS_BUS_DEVICE(xive), &xsrc->esb_mmio);
+        sysbus_init_mmio(SYS_BUS_DEVICE(xive), &end_xsrc->esb_mmio);
+        sysbus_init_mmio(SYS_BUS_DEVICE(xive), &xive->tm_mmio);
 
-    /* Map all regions */
-    spapr_xive_map_mmio(xive);
+        /* Map all regions */
+        spapr_xive_map_mmio(xive);
+    }
 
     qemu_register_reset(spapr_xive_reset, dev);
 }
diff --git a/hw/intc/spapr_xive_kvm.c b/hw/intc/spapr_xive_kvm.c
new file mode 100644
index 000000000000..f96c66fa419d
--- /dev/null
+++ b/hw/intc/spapr_xive_kvm.c
@@ -0,0 +1,254 @@
+/*
+ * QEMU PowerPC sPAPR XIVE interrupt controller model
+ *
+ * Copyright (c) 2017-2019, IBM Corporation.
+ *
+ * This code is licensed under the GPL version 2 or later. See the
+ * COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/log.h"
+#include "qemu/error-report.h"
+#include "qapi/error.h"
+#include "target/ppc/cpu.h"
+#include "sysemu/cpus.h"
+#include "sysemu/kvm.h"
+#include "hw/ppc/spapr.h"
+#include "hw/ppc/spapr_xive.h"
+#include "hw/ppc/xive.h"
+#include "kvm_ppc.h"
+
+#include <sys/ioctl.h>
+
+/*
+ * Helpers for CPU hotplug: kvm_enabled_cpus lists the ids of the vCPUs
+ * which kvmppc_xive_cpu_connect() has already connected to the KVM device.
+ * TODO: make a common KVMEnabledCPU layer for XICS and XIVE
+ */
+typedef struct KVMEnabledCPU {
+    unsigned long vcpu_id;
+    QLIST_ENTRY(KVMEnabledCPU) node;
+} KVMEnabledCPU;
+
+static QLIST_HEAD(, KVMEnabledCPU)
+    kvm_enabled_cpus = QLIST_HEAD_INITIALIZER(&kvm_enabled_cpus);
+
+/* Tell whether the vCPU of @cs is already recorded in kvm_enabled_cpus. */
+static bool kvm_cpu_is_enabled(CPUState *cs)
+{
+    unsigned long wanted = kvm_arch_vcpu_id(cs);
+    KVMEnabledCPU *entry;
+    bool found = false;
+
+    QLIST_FOREACH(entry, &kvm_enabled_cpus, node) {
+        found = found || entry->vcpu_id == wanted;
+    }
+    return found;
+}
+
+/* Record the vCPU of @cs in the kvm_enabled_cpus list. */
+static void kvm_cpu_enable(CPUState *cs)
+{
+    KVMEnabledCPU *entry = g_malloc(sizeof(*entry));
+
+    /* Only the vCPU id is kept; kvm_cpu_is_enabled() matches on it */
+    entry->vcpu_id = kvm_arch_vcpu_id(cs);
+    QLIST_INSERT_HEAD(&kvm_enabled_cpus, entry, node);
+}
+
+/*
+ * XIVE Thread Interrupt Management context (KVM)
+ */
+
+/* Connect the vCPU of @tctx to the KVM XIVE device; errors go to @errp. */
+void kvmppc_xive_cpu_connect(XiveTCTX *tctx, Error **errp)
+{
+    sPAPRXive *xive = SPAPR_MACHINE(qdev_get_machine())->xive;
+    unsigned long vcpu_id = kvm_arch_vcpu_id(tctx->cs);
+    int ret;
+
+    /* A replugged CPU is already connected: do not enable the cap twice. */
+    if (kvm_cpu_is_enabled(tctx->cs)) {
+        return;
+    }
+
+    ret = kvm_vcpu_enable_cap(tctx->cs, KVM_CAP_PPC_IRQ_XIVE, 0, xive->fd,
+                              vcpu_id, 0);
+    if (ret < 0) {
+        /* ret carries the negative errno; errno itself may be stale */
+        error_setg(errp, "XIVE: unable to connect CPU%lu to KVM device: %s",
+                   vcpu_id, strerror(-ret));
+        return;
+    }
+
+    kvm_cpu_enable(tctx->cs);
+}
+
+/*
+ * XIVE Interrupt Source (KVM)
+ */
+
+/*
+ * At reset, the interrupt sources are simply created and MASKED. We
+ * only need to inform the KVM XIVE device about their type: LSI or
+ * MSI.
+ */
+/* Tell KVM, source by source, the LSI type and the asserted level. */
+void kvmppc_xive_source_reset(XiveSource *xsrc, Error **errp)
+{
+    sPAPRXive *xive = SPAPR_XIVE(xsrc->xive);
+    int srcno;
+
+    for (srcno = 0; srcno < xsrc->nr_irqs; srcno++) {
+        Error *local_err = NULL;
+        uint64_t attr = 0;
+
+        if (xive_source_irq_is_lsi(xsrc, srcno)) {
+            attr = KVM_XIVE_LEVEL_SENSITIVE;
+            attr |= (xsrc->status[srcno] & XIVE_STATUS_ASSERTED) ?
+                KVM_XIVE_LEVEL_ASSERTED : 0;
+        }
+
+        kvm_device_access(xive->fd, KVM_DEV_XIVE_GRP_SOURCES, srcno, &attr,
+                          true, &local_err);
+        if (local_err) {
+            error_propagate(errp, local_err);
+            return;
+        }
+    }
+}
+
+/* Forward the state @val of source @srcno to KVM with KVM_IRQ_LINE. */
+void kvmppc_xive_source_set_irq(void *opaque, int srcno, int val)
+{
+    XiveSource *xsrc = opaque;
+    struct kvm_irq_level args;
+    int rc;
+
+    args.irq = srcno;
+    if (!xive_source_irq_is_lsi(xsrc, srcno)) {
+        /* Not an LSI: a zero @val is ignored, nothing is sent to KVM */
+        if (!val) {
+            return;
+        }
+        args.level = KVM_INTERRUPT_SET;
+    } else if (val) {
+        xsrc->status[srcno] |= XIVE_STATUS_ASSERTED;
+        args.level = KVM_INTERRUPT_SET_LEVEL;
+    } else {
+        xsrc->status[srcno] &= ~XIVE_STATUS_ASSERTED;
+        args.level = KVM_INTERRUPT_UNSET;
+    }
+    rc = kvm_vm_ioctl(kvm_state, KVM_IRQ_LINE, &args);
+    if (rc < 0) {
+        error_report("XIVE: kvm_irq_line() failed : %s", strerror(-rc));
+    }
+}
+
+/*
+ * sPAPR XIVE interrupt controller (KVM)
+ */
+
+/* Map @len bytes of the fd KVM returns for @ctrl. Returns NULL on error. */
+static void *kvmppc_xive_mmap(sPAPRXive *xive, int ctrl, size_t len,
+                              Error **errp)
+{
+    Error *local_err = NULL;
+    void *addr;
+    int fd;
+
+    kvm_device_access(xive->fd, KVM_DEV_XIVE_GRP_CTRL, ctrl, &fd, false,
+                      &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        return NULL;
+    }
+
+    addr = mmap(NULL, len, PROT_WRITE | PROT_READ, MAP_SHARED, fd, 0);
+    if (addr == MAP_FAILED) {
+        error_setg_errno(errp, errno, "XIVE: unable to set memory mapping");
+        addr = NULL;
+    }
+    close(fd);  /* done last: close() is allowed to overwrite errno */
+    return addr;
+}
+
+/*
+ * Create the KVM XIVE device and back the ESB and TIMA regions with its
+ * mappings. NOTE(review): error paths leave xive->fd open - confirm intent.
+ */
+void kvmppc_xive_connect(sPAPRXive *xive, Error **errp)
+{
+    XiveSource *xsrc = &xive->source;
+    XiveENDSource *end_xsrc = &xive->end_source;
+    Error *local_err = NULL;
+    size_t esb_len;
+    size_t tima_len;
+
+    if (!kvm_enabled() || !kvmppc_has_cap_xive()) {
+        error_setg(errp, "IRQ_XIVE capability must be present for KVM");
+        return;
+    }
+
+    /* First, create the KVM XIVE device; a negative fd is an errno value */
+    xive->fd = kvm_create_device(kvm_state, KVM_DEV_TYPE_XIVE, false);
+    if (xive->fd < 0) {
+        error_setg_errno(errp, -xive->fd, "XIVE: error creating KVM device");
+        return;
+    }
+
+    /*
+     * Source ESBs KVM mapping
+     *
+     * Inform KVM where we will map the ESB pages. This is needed by
+     * the H_INT_GET_SOURCE_INFO hcall which returns the source
+     * characteristics, among which the ESB page address.
+     */
+    kvm_device_access(xive->fd, KVM_DEV_XIVE_GRP_CTRL, KVM_DEV_XIVE_VC_BASE,
+                      &xive->vc_base, true, &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        return;
+    }
+
+    esb_len = (1ull << xsrc->esb_shift) * xsrc->nr_irqs;
+    xsrc->esb_mmap = kvmppc_xive_mmap(xive, KVM_DEV_XIVE_GET_ESB_FD,
+                                      esb_len, &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        return;
+    }
+
+    memory_region_init_ram_device_ptr(&xsrc->esb_mmio, OBJECT(xsrc),
+                                      "xive.esb", esb_len, xsrc->esb_mmap);
+    sysbus_init_mmio(SYS_BUS_DEVICE(xive), &xsrc->esb_mmio);
+
+    /* END ESBs mapping: registered as is, no KVM mapping backs it */
+    sysbus_init_mmio(SYS_BUS_DEVICE(xive), &end_xsrc->esb_mmio);
+
+    /*
+     * TIMA KVM mapping
+     *
+     * We could also inform KVM where the TIMA will be mapped but as
+     * this is a fixed MMIO address for the system it does not seem
+     * necessary to provide a KVM ioctl to change it.
+     */
+    tima_len = 4ull << TM_SHIFT;
+    xive->tm_mmap = kvmppc_xive_mmap(xive, KVM_DEV_XIVE_GET_TIMA_FD,
+                                     tima_len, &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        return;
+    }
+    memory_region_init_ram_device_ptr(&xive->tm_mmio, OBJECT(xive),
+                                      "xive.tima", tima_len, xive->tm_mmap);
+    sysbus_init_mmio(SYS_BUS_DEVICE(xive), &xive->tm_mmio);
+
+    kvm_kernel_irqchip = true;
+    kvm_msi_via_irqfd_allowed = true;
+    kvm_gsi_direct_mapping = true;
+
+    /* Map all regions, in the order they were registered on the SysBus */
+    spapr_xive_map_mmio(xive);
+}
diff --git a/hw/intc/xive.c b/hw/intc/xive.c
index a3cb0cf0e348..9a2d7be283f8 100644
--- a/hw/intc/xive.c
+++ b/hw/intc/xive.c
@@ -15,6 +15,7 @@
 #include "sysemu/dma.h"
 #include "hw/qdev-properties.h"
 #include "monitor/monitor.h"
+#include "hw/boards.h"
 #include "hw/ppc/xive.h"
 #include "hw/ppc/xive_regs.h"
 
@@ -493,6 +494,15 @@ static void xive_tctx_realize(DeviceState *dev, Error **errp)
         return;
     }
 
+    /* Connect the presenter to the VCPU (required for CPU hotplug) */
+    if (kvmppc_xive_enabled()) {
+        kvmppc_xive_cpu_connect(tctx, &local_err);
+        if (local_err) {
+            error_propagate(errp, local_err);
+            return;
+        }
+    }
+
     qemu_register_reset(xive_tctx_reset, dev);
 }
 
@@ -895,6 +905,10 @@ static void xive_source_reset(void *dev)
 
     /* PQs are initialized to 0b01 (Q=1) which corresponds to "ints off" */
     memset(xsrc->status, XIVE_ESB_OFF, xsrc->nr_irqs);
+
+    if (kvmppc_xive_enabled()) {
+        kvmppc_xive_source_reset(xsrc, &error_fatal);
+    }
 }
 
 static void xive_source_realize(DeviceState *dev, Error **errp)
@@ -928,9 +942,11 @@ static void xive_source_realize(DeviceState *dev, Error **errp)
     xsrc->status = g_malloc0(xsrc->nr_irqs);
     xsrc->lsi_map = bitmap_new(xsrc->nr_irqs);
 
-    memory_region_init_io(&xsrc->esb_mmio, OBJECT(xsrc),
-                          &xive_source_esb_ops, xsrc, "xive.esb",
-                          (1ull << xsrc->esb_shift) * xsrc->nr_irqs);
+    if (!kvmppc_xive_enabled()) {
+        memory_region_init_io(&xsrc->esb_mmio, OBJECT(xsrc),
+                              &xive_source_esb_ops, xsrc, "xive.esb",
+                              (1ull << xsrc->esb_shift) * xsrc->nr_irqs);
+    }
 
     qemu_register_reset(xive_source_reset, dev);
 }
diff --git a/hw/ppc/spapr_irq.c b/hw/ppc/spapr_irq.c
index 5fce72fe0f6c..afbdabfa6543 100644
--- a/hw/ppc/spapr_irq.c
+++ b/hw/ppc/spapr_irq.c
@@ -15,6 +15,7 @@
 #include "hw/ppc/spapr_xive.h"
 #include "hw/ppc/xics.h"
 #include "sysemu/kvm.h"
+#include "kvm_ppc.h"
 
 #include "trace.h"
 
@@ -266,9 +267,9 @@ static void spapr_irq_init_xive(sPAPRMachineState *spapr, Error **errp)
     DeviceState *dev;
     int i;
 
-    /* KVM XIVE device not yet available */
     if (kvm_enabled()) {
-        if (machine_kernel_irqchip_required(machine)) {
+        if (machine_kernel_irqchip_required(machine) &&
+            !kvmppc_has_cap_xive()) {
             error_setg(errp, "kernel_irqchip requested. no KVM XIVE support");
             return;
         }
@@ -384,7 +385,11 @@ static void spapr_irq_set_irq_xive(void *opaque, int srcno, int val)
 {
     sPAPRMachineState *spapr = opaque;
 
-    xive_source_set_irq(&spapr->xive->source, srcno, val);
+    if (kvmppc_xive_enabled()) {
+        kvmppc_xive_source_set_irq(&spapr->xive->source, srcno, val);
+    } else {
+        xive_source_set_irq(&spapr->xive->source, srcno, val);
+    }
 }
 
 /*
diff --git a/target/ppc/kvm.c b/target/ppc/kvm.c
index ebbb48c42f25..88a470a73e7c 100644
--- a/target/ppc/kvm.c
+++ b/target/ppc/kvm.c
@@ -86,6 +86,7 @@ static int cap_fixup_hcalls;
 static int cap_htm;             /* Hardware transactional memory support */
 static int cap_mmu_radix;
 static int cap_mmu_hash_v3;
+static int cap_xive;
 static int cap_resize_hpt;
 static int cap_ppc_pvr_compat;
 static int cap_ppc_safe_cache;
@@ -149,6 +150,7 @@ int kvm_arch_init(MachineState *ms, KVMState *s)
     cap_htm = kvm_vm_check_extension(s, KVM_CAP_PPC_HTM);
     cap_mmu_radix = kvm_vm_check_extension(s, KVM_CAP_PPC_MMU_RADIX);
     cap_mmu_hash_v3 = kvm_vm_check_extension(s, KVM_CAP_PPC_MMU_HASH_V3);
+    cap_xive = kvm_vm_check_extension(s, KVM_CAP_PPC_IRQ_XIVE);
     cap_resize_hpt = kvm_vm_check_extension(s, KVM_CAP_SPAPR_RESIZE_HPT);
     kvmppc_get_cpu_characteristics(s);
     cap_ppc_nested_kvm_hv = kvm_vm_check_extension(s, KVM_CAP_PPC_NESTED_HV);
@@ -2389,6 +2391,11 @@ static int parse_cap_ppc_safe_indirect_branch(struct kvm_ppc_cpu_char c)
     return 0;
 }
 
+bool kvmppc_has_cap_xive(void)
+{
+    return cap_xive;
+}
+
 static void kvmppc_get_cpu_characteristics(KVMState *s)
 {
     struct kvm_ppc_cpu_char c;
diff --git a/hw/intc/Makefile.objs b/hw/intc/Makefile.objs
index 301a8e972d91..23126c199178 100644
--- a/hw/intc/Makefile.objs
+++ b/hw/intc/Makefile.objs
@@ -39,6 +39,7 @@ obj-$(CONFIG_XICS_SPAPR) += xics_spapr.o
 obj-$(CONFIG_XICS_KVM) += xics_kvm.o
 obj-$(CONFIG_XIVE) += xive.o
 obj-$(CONFIG_XIVE_SPAPR) += spapr_xive.o
+obj-$(CONFIG_XIVE_KVM) += spapr_xive_kvm.o
 obj-$(CONFIG_POWERNV) += xics_pnv.o
 obj-$(CONFIG_ALLWINNER_A10_PIC) += allwinner-a10-pic.o
 obj-$(CONFIG_S390_FLIC) += s390_flic.o
-- 
2.20.1

^ permalink raw reply related	[flat|nested] 43+ messages in thread

* [Qemu-devel] [PATCH 03/13] spapr/xive: add state synchronization with KVM
  2019-01-07 18:39 [Qemu-devel] [PATCH 00/13] spapr: add KVM support to the XIVE interrupt mode Cédric Le Goater
  2019-01-07 18:39 ` [Qemu-devel] [PATCH 01/13] linux-headers: update to 5.0 Cédric Le Goater
  2019-01-07 18:39 ` [Qemu-devel] [PATCH 02/13] spapr/xive: add KVM support Cédric Le Goater
@ 2019-01-07 18:39 ` Cédric Le Goater
  2019-02-06  2:42   ` David Gibson
  2019-01-07 18:39 ` [Qemu-devel] [PATCH 04/13] spapr/xive: introduce a VM state change handler Cédric Le Goater
                   ` (9 subsequent siblings)
  12 siblings, 1 reply; 43+ messages in thread
From: Cédric Le Goater @ 2019-01-07 18:39 UTC (permalink / raw)
  To: David Gibson
  Cc: Benjamin Herrenschmidt, qemu-ppc, qemu-devel, Cédric Le Goater

This extends the KVM XIVE device backend with 'synchronize_state'
methods used to retrieve the state from KVM. The HW state of the
sources, the KVM device and the thread interrupt contexts is
collected for use by the monitor and by migration.

These get operations rely on their KVM counterpart in the host kernel
which acts as a proxy for OPAL, the host firmware. The set operations
will be added for migration support later.

Signed-off-by: Cédric Le Goater <clg@kaod.org>
---
 include/hw/ppc/spapr_xive.h |   9 ++
 include/hw/ppc/xive.h       |   1 +
 hw/intc/spapr_xive.c        |  24 ++--
 hw/intc/spapr_xive_kvm.c    | 223 ++++++++++++++++++++++++++++++++++++
 hw/intc/xive.c              |  10 ++
 5 files changed, 260 insertions(+), 7 deletions(-)

diff --git a/include/hw/ppc/spapr_xive.h b/include/hw/ppc/spapr_xive.h
index 24a0be478039..02f2de20111c 100644
--- a/include/hw/ppc/spapr_xive.h
+++ b/include/hw/ppc/spapr_xive.h
@@ -44,6 +44,14 @@ typedef struct sPAPRXive {
 bool spapr_xive_irq_claim(sPAPRXive *xive, uint32_t lisn, bool lsi);
 bool spapr_xive_irq_free(sPAPRXive *xive, uint32_t lisn);
 void spapr_xive_pic_print_info(sPAPRXive *xive, Monitor *mon);
+bool spapr_xive_priority_is_reserved(uint8_t priority);
+
+void spapr_xive_cpu_to_nvt(PowerPCCPU *cpu,
+                           uint8_t *out_nvt_blk, uint32_t *out_nvt_idx);
+void spapr_xive_cpu_to_end(PowerPCCPU *cpu, uint8_t prio,
+                           uint8_t *out_end_blk, uint32_t *out_end_idx);
+int spapr_xive_target_to_end(uint32_t target, uint8_t prio,
+                             uint8_t *out_end_blk, uint32_t *out_end_idx);
 
 typedef struct sPAPRMachineState sPAPRMachineState;
 
@@ -58,5 +66,6 @@ void spapr_xive_map_mmio(sPAPRXive *xive);
  * KVM XIVE device helpers
  */
 void kvmppc_xive_connect(sPAPRXive *xive, Error **errp);
+void kvmppc_xive_synchronize_state(sPAPRXive *xive, Error **errp);
 
 #endif /* PPC_SPAPR_XIVE_H */
diff --git a/include/hw/ppc/xive.h b/include/hw/ppc/xive.h
index 4bbba8d39a65..2e48d75a22e0 100644
--- a/include/hw/ppc/xive.h
+++ b/include/hw/ppc/xive.h
@@ -442,5 +442,6 @@ static inline bool kvmppc_xive_enabled(void)
 void kvmppc_xive_source_reset(XiveSource *xsrc, Error **errp);
 void kvmppc_xive_source_set_irq(void *opaque, int srcno, int val);
 void kvmppc_xive_cpu_connect(XiveTCTX *tctx, Error **errp);
+void kvmppc_xive_cpu_synchronize_state(XiveTCTX *tctx, Error **errp);
 
 #endif /* PPC_XIVE_H */
diff --git a/hw/intc/spapr_xive.c b/hw/intc/spapr_xive.c
index cf6d3a5f12e1..50dd66707968 100644
--- a/hw/intc/spapr_xive.c
+++ b/hw/intc/spapr_xive.c
@@ -54,8 +54,8 @@ static uint32_t spapr_xive_nvt_to_target(uint8_t nvt_blk, uint32_t nvt_idx)
     return nvt_idx - SPAPR_XIVE_NVT_BASE;
 }
 
-static void spapr_xive_cpu_to_nvt(PowerPCCPU *cpu,
-                                  uint8_t *out_nvt_blk, uint32_t *out_nvt_idx)
+void spapr_xive_cpu_to_nvt(PowerPCCPU *cpu,
+                           uint8_t *out_nvt_blk, uint32_t *out_nvt_idx)
 {
     assert(cpu);
 
@@ -85,8 +85,8 @@ static int spapr_xive_target_to_nvt(uint32_t target,
  * sPAPR END indexing uses a simple mapping of the CPU vcpu_id, 8
  * priorities per CPU
  */
-static void spapr_xive_cpu_to_end(PowerPCCPU *cpu, uint8_t prio,
-                                  uint8_t *out_end_blk, uint32_t *out_end_idx)
+void spapr_xive_cpu_to_end(PowerPCCPU *cpu, uint8_t prio,
+                           uint8_t *out_end_blk, uint32_t *out_end_idx)
 {
     assert(cpu);
 
@@ -99,8 +99,8 @@ static void spapr_xive_cpu_to_end(PowerPCCPU *cpu, uint8_t prio,
     }
 }
 
-static int spapr_xive_target_to_end(uint32_t target, uint8_t prio,
-                                    uint8_t *out_end_blk, uint32_t *out_end_idx)
+int spapr_xive_target_to_end(uint32_t target, uint8_t prio,
+                             uint8_t *out_end_blk, uint32_t *out_end_idx)
 {
     PowerPCCPU *cpu = spapr_find_cpu(target);
 
@@ -139,6 +139,16 @@ void spapr_xive_pic_print_info(sPAPRXive *xive, Monitor *mon)
     XiveSource *xsrc = &xive->source;
     int i;
 
+    if (kvmppc_xive_enabled()) {
+        Error *local_err = NULL;
+
+        kvmppc_xive_synchronize_state(xive, &local_err);
+        if (local_err) {
+            error_report_err(local_err);
+            return;
+        }
+    }
+
     monitor_printf(mon, "  LSIN         PQ    EISN     CPU/PRIO EQ\n");
 
     for (i = 0; i < xive->nr_irqs; i++) {
@@ -529,7 +539,7 @@ bool spapr_xive_irq_free(sPAPRXive *xive, uint32_t lisn)
  * interrupts (DD2.X POWER9). So we only allow the guest to use
  * priorities [0..6].
  */
-static bool spapr_xive_priority_is_reserved(uint8_t priority)
+bool spapr_xive_priority_is_reserved(uint8_t priority)
 {
     switch (priority) {
     case 0 ... 6:
diff --git a/hw/intc/spapr_xive_kvm.c b/hw/intc/spapr_xive_kvm.c
index f96c66fa419d..f52bddc92a2a 100644
--- a/hw/intc/spapr_xive_kvm.c
+++ b/hw/intc/spapr_xive_kvm.c
@@ -60,6 +60,57 @@ static void kvm_cpu_enable(CPUState *cs)
 /*
  * XIVE Thread Interrupt Management context (KVM)
  */
+static void kvmppc_xive_cpu_get_state(XiveTCTX *tctx, Error **errp)
+{
+    uint64_t state[4] = { 0 };
+    int ret;
+
+    ret = kvm_get_one_reg(tctx->cs, KVM_REG_PPC_NVT_STATE, state);
+    if (ret != 0) {
+        error_setg_errno(errp, errno,
+                         "XIVE: could not capture KVM state of CPU %ld",
+                         kvm_arch_vcpu_id(tctx->cs));
+        return;
+    }
+
+    /* word0 and word1 of the OS ring. */
+    *((uint64_t *) &tctx->regs[TM_QW1_OS]) = state[0];
+
+    /*
+     * KVM also returns word2 containing the OS CAM line which is
+     * interesting to print out in the QEMU monitor.
+     */
+    *((uint64_t *) &tctx->regs[TM_QW1_OS + TM_WORD2]) = state[1];
+}
+
+typedef struct {
+    XiveTCTX *tctx;
+    Error *err;
+} XiveCpuGetState;
+
+static void kvmppc_xive_cpu_do_synchronize_state(CPUState *cpu,
+                                                 run_on_cpu_data arg)
+{
+    XiveCpuGetState *s = arg.host_ptr;
+
+    kvmppc_xive_cpu_get_state(s->tctx, &s->err);
+}
+
+void kvmppc_xive_cpu_synchronize_state(XiveTCTX *tctx, Error **errp)
+{
+    XiveCpuGetState s = {
+        .tctx = tctx,
+        .err = NULL,
+    };
+
+    run_on_cpu(tctx->cs, kvmppc_xive_cpu_do_synchronize_state,
+               RUN_ON_CPU_HOST_PTR(&s));
+
+    if (s.err) {
+        error_propagate(errp, s.err);
+        return;
+    }
+}
 
 void kvmppc_xive_cpu_connect(XiveTCTX *tctx, Error **errp)
 {
@@ -119,6 +170,34 @@ void kvmppc_xive_source_reset(XiveSource *xsrc, Error **errp)
     }
 }
 
+/*
+ * This is used to perform the magic loads on the ESB pages, described
+ * in xive.h.
+ */
+static uint8_t xive_esb_read(XiveSource *xsrc, int srcno, uint32_t offset)
+{
+    unsigned long addr = (unsigned long) xsrc->esb_mmap +
+        xive_source_esb_mgmt(xsrc, srcno) + offset;
+
+    /* Prevent the compiler from optimizing away the load */
+    volatile uint64_t value = *((uint64_t *) addr);
+
+    return be64_to_cpu(value) & 0x3;
+}
+
+static void kvmppc_xive_source_get_state(XiveSource *xsrc)
+{
+    int i;
+
+    for (i = 0; i < xsrc->nr_irqs; i++) {
+        /* Perform a load without side effect to retrieve the PQ bits */
+        uint8_t pq = xive_esb_read(xsrc, i, XIVE_ESB_GET);
+
+        /* and save PQ locally */
+        xive_source_esb_set(xsrc, i, pq);
+    }
+}
+
 void kvmppc_xive_source_set_irq(void *opaque, int srcno, int val)
 {
     XiveSource *xsrc = opaque;
@@ -149,6 +228,150 @@ void kvmppc_xive_source_set_irq(void *opaque, int srcno, int val)
 /*
  * sPAPR XIVE interrupt controller (KVM)
  */
+static int kvmppc_xive_get_eq_state(sPAPRXive *xive, CPUState *cs, Error **errp)
+{
+    unsigned long vcpu_id = kvm_arch_vcpu_id(cs);
+    int ret;
+    int i;
+
+    for (i = 0; i < XIVE_PRIORITY_MAX + 1; i++) {
+        Error *local_err = NULL;
+        struct kvm_ppc_xive_eq kvm_eq = { 0 };
+        uint64_t kvm_eq_idx;
+        XiveEND end = { 0 };
+        uint8_t end_blk, nvt_blk;
+        uint32_t end_idx, nvt_idx;
+
+        /* Skip priorities reserved for the hypervisor */
+        if (spapr_xive_priority_is_reserved(i)) {
+            continue;
+        }
+
+        /* Encode the tuple (server, prio) as a KVM EQ index */
+        kvm_eq_idx = i << KVM_XIVE_EQ_PRIORITY_SHIFT &
+            KVM_XIVE_EQ_PRIORITY_MASK;
+        kvm_eq_idx |= vcpu_id << KVM_XIVE_EQ_SERVER_SHIFT &
+            KVM_XIVE_EQ_SERVER_MASK;
+
+        ret = kvm_device_access(xive->fd, KVM_DEV_XIVE_GRP_EQ, kvm_eq_idx,
+                                &kvm_eq, false, &local_err);
+        if (local_err) {
+            error_propagate(errp, local_err);
+            return ret;
+        }
+
+        if (!(kvm_eq.flags & KVM_XIVE_EQ_FLAG_ENABLED)) {
+            continue;
+        }
+
+        /* Update the local END structure with the KVM input */
+        if (kvm_eq.flags & KVM_XIVE_EQ_FLAG_ENABLED) {
+            end.w0 |= cpu_to_be32(END_W0_VALID | END_W0_ENQUEUE);
+        }
+        if (kvm_eq.flags & KVM_XIVE_EQ_FLAG_ALWAYS_NOTIFY) {
+            end.w0 |= cpu_to_be32(END_W0_UCOND_NOTIFY);
+        }
+        if (kvm_eq.flags & KVM_XIVE_EQ_FLAG_ESCALATE) {
+            end.w0 |= cpu_to_be32(END_W0_ESCALATE_CTL);
+        }
+        end.w0 |= xive_set_field32(END_W0_QSIZE, 0ul, kvm_eq.qsize - 12);
+
+        end.w1 = xive_set_field32(END_W1_GENERATION, 0ul, kvm_eq.qtoggle) |
+            xive_set_field32(END_W1_PAGE_OFF, 0ul, kvm_eq.qindex);
+        end.w2 = cpu_to_be32((kvm_eq.qpage >> 32) & 0x0fffffff);
+        end.w3 = cpu_to_be32(kvm_eq.qpage & 0xffffffff);
+        end.w4 = 0;
+        end.w5 = 0;
+
+        spapr_xive_cpu_to_nvt(POWERPC_CPU(cs), &nvt_blk, &nvt_idx);
+
+        end.w6 = xive_set_field32(END_W6_NVT_BLOCK, 0ul, nvt_blk) |
+            xive_set_field32(END_W6_NVT_INDEX, 0ul, nvt_idx);
+        end.w7 = xive_set_field32(END_W7_F0_PRIORITY, 0ul, i);
+
+        spapr_xive_cpu_to_end(POWERPC_CPU(cs), i, &end_blk, &end_idx);
+
+        assert(end_idx < xive->nr_ends);
+        memcpy(&xive->endt[end_idx], &end, sizeof(XiveEND));
+    }
+
+    return 0;
+}
+
+static void kvmppc_xive_get_eas_state(sPAPRXive *xive, Error **errp)
+{
+    XiveSource *xsrc = &xive->source;
+    int i;
+
+    for (i = 0; i < xsrc->nr_irqs; i++) {
+        XiveEAS *eas = &xive->eat[i];
+        XiveEAS new_eas;
+        uint64_t kvm_eas;
+        uint8_t priority;
+        uint32_t server;
+        uint32_t end_idx;
+        uint8_t end_blk;
+        uint32_t eisn;
+        Error *local_err = NULL;
+
+        if (!xive_eas_is_valid(eas)) {
+            continue;
+        }
+
+        kvm_device_access(xive->fd, KVM_DEV_XIVE_GRP_EAS, i, &kvm_eas, false,
+                          &local_err);
+        if (local_err) {
+            error_propagate(errp, local_err);
+            return;
+        }
+
+        priority = (kvm_eas & KVM_XIVE_EAS_PRIORITY_MASK) >>
+            KVM_XIVE_EAS_PRIORITY_SHIFT;
+        server = (kvm_eas & KVM_XIVE_EAS_SERVER_MASK) >>
+            KVM_XIVE_EAS_SERVER_SHIFT;
+        eisn = (kvm_eas & KVM_XIVE_EAS_EISN_MASK) >> KVM_XIVE_EAS_EISN_SHIFT;
+
+        if (spapr_xive_target_to_end(server, priority, &end_blk, &end_idx)) {
+            error_setg(errp, "XIVE: invalid tuple CPU %d priority %d", server,
+                       priority);
+            return;
+        }
+
+        new_eas.w = cpu_to_be64(EAS_VALID);
+        if (kvm_eas & KVM_XIVE_EAS_MASK_MASK) {
+            new_eas.w |= cpu_to_be64(EAS_MASKED);
+        }
+
+        new_eas.w = xive_set_field64(EAS_END_INDEX, new_eas.w, end_idx);
+        new_eas.w = xive_set_field64(EAS_END_BLOCK, new_eas.w, end_blk);
+        new_eas.w = xive_set_field64(EAS_END_DATA, new_eas.w, eisn);
+
+        *eas = new_eas;
+    }
+}
+
+void kvmppc_xive_synchronize_state(sPAPRXive *xive, Error **errp)
+{
+    XiveSource *xsrc = &xive->source;
+    CPUState *cs;
+    Error *local_err = NULL;
+
+    kvmppc_xive_source_get_state(xsrc);
+
+    kvmppc_xive_get_eas_state(xive, &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        return;
+    }
+
+    CPU_FOREACH(cs) {
+        kvmppc_xive_get_eq_state(xive, cs, &local_err);
+        if (local_err) {
+            error_propagate(errp, local_err);
+            return;
+        }
+    }
+}
 
 static void *kvmppc_xive_mmap(sPAPRXive *xive, int ctrl, size_t len,
                                  Error **errp)
diff --git a/hw/intc/xive.c b/hw/intc/xive.c
index 9a2d7be283f8..596c29d8c826 100644
--- a/hw/intc/xive.c
+++ b/hw/intc/xive.c
@@ -434,6 +434,16 @@ void xive_tctx_pic_print_info(XiveTCTX *tctx, Monitor *mon)
     int cpu_index = tctx->cs ? tctx->cs->cpu_index : -1;
     int i;
 
+    if (kvmppc_xive_enabled()) {
+        Error *local_err = NULL;
+
+        kvmppc_xive_cpu_synchronize_state(tctx, &local_err);
+        if (local_err) {
+            error_report_err(local_err);
+            return;
+        }
+    }
+
     monitor_printf(mon, "CPU[%04x]:   QW   NSR CPPR IPB LSMFB ACK# INC AGE PIPR"
                    "  W2\n", cpu_index);
 
-- 
2.20.1

^ permalink raw reply related	[flat|nested] 43+ messages in thread

* [Qemu-devel] [PATCH 04/13] spapr/xive: introduce a VM state change handler
  2019-01-07 18:39 [Qemu-devel] [PATCH 00/13] spapr: add KVM support to the XIVE interrupt mode Cédric Le Goater
                   ` (2 preceding siblings ...)
  2019-01-07 18:39 ` [Qemu-devel] [PATCH 03/13] spapr/xive: add state synchronization with KVM Cédric Le Goater
@ 2019-01-07 18:39 ` Cédric Le Goater
  2019-02-06  2:49   ` David Gibson
  2019-01-07 18:39 ` [Qemu-devel] [PATCH 05/13] spapr/xive: add migration support for KVM Cédric Le Goater
                   ` (8 subsequent siblings)
  12 siblings, 1 reply; 43+ messages in thread
From: Cédric Le Goater @ 2019-01-07 18:39 UTC (permalink / raw)
  To: David Gibson
  Cc: Benjamin Herrenschmidt, qemu-ppc, qemu-devel, Cédric Le Goater

This handler is in charge of stabilizing the flow of event notifications
in the XIVE controller before migrating a guest. This is a requirement
before transferring the guest EQ pages to a destination.

When the VM is stopped, the handler masks the sources (PQ=01) to stop
the flow of events and saves their previous state. The XIVE controller
is then synced through KVM to flush any in-flight event notification
and to stabilize the EQs. At this stage, the EQ pages are marked dirty
to make sure the EQ pages are transferred if a migration sequence is
in progress.

The previous configuration of the sources is restored when the VM
resumes, after a migration or a stop.

Signed-off-by: Cédric Le Goater <clg@kaod.org>
---
 include/hw/ppc/spapr_xive.h |   1 +
 hw/intc/spapr_xive_kvm.c    | 111 +++++++++++++++++++++++++++++++++++-
 2 files changed, 111 insertions(+), 1 deletion(-)

diff --git a/include/hw/ppc/spapr_xive.h b/include/hw/ppc/spapr_xive.h
index 02f2de20111c..8815ed5aa372 100644
--- a/include/hw/ppc/spapr_xive.h
+++ b/include/hw/ppc/spapr_xive.h
@@ -39,6 +39,7 @@ typedef struct sPAPRXive {
     /* KVM support */
     int           fd;
     void          *tm_mmap;
+    VMChangeStateEntry *change;
 } sPAPRXive;
 
 bool spapr_xive_irq_claim(sPAPRXive *xive, uint32_t lisn, bool lsi);
diff --git a/hw/intc/spapr_xive_kvm.c b/hw/intc/spapr_xive_kvm.c
index f52bddc92a2a..c7639ffe7758 100644
--- a/hw/intc/spapr_xive_kvm.c
+++ b/hw/intc/spapr_xive_kvm.c
@@ -350,13 +350,119 @@ static void kvmppc_xive_get_eas_state(sPAPRXive *xive, Error **errp)
     }
 }
 
+/*
+ * Sync the XIVE controller through KVM to flush any in-flight event
+ * notification and stabilize the EQs.
+ */
+ static void kvmppc_xive_sync_all(sPAPRXive *xive, Error **errp)
+{
+    XiveSource *xsrc = &xive->source;
+    Error *local_err = NULL;
+    int i;
+
+    /* Sync the KVM source. This reaches the XIVE HW through OPAL */
+    for (i = 0; i < xsrc->nr_irqs; i++) {
+        XiveEAS *eas = &xive->eat[i];
+
+        if (!xive_eas_is_valid(eas)) {
+            continue;
+        }
+
+        kvm_device_access(xive->fd, KVM_DEV_XIVE_GRP_SYNC, i, NULL, true,
+                          &local_err);
+        if (local_err) {
+            error_propagate(errp, local_err);
+            return;
+        }
+    }
+}
+
+/*
+ * The primary goal of the XIVE VM change handler is to mark the EQ
+ * pages dirty when all XIVE event notifications have stopped.
+ *
+ * Whenever the VM is stopped, the VM change handler masks the sources
+ * (PQ=01) to stop the flow of events and saves the previous state in
+ * anticipation of a migration. The XIVE controller is then synced
+ * through KVM to flush any in-flight event notification and stabilize
+ * the EQs.
+ *
+ * At this stage, we can mark the EQ page dirty and let a migration
+ * sequence transfer the EQ pages to the destination, which is done
+ * just after the stop state.
+ *
+ * The previous configuration of the sources is restored when the VM
+ * runs again.
+ */
+static void kvmppc_xive_change_state_handler(void *opaque, int running,
+                                             RunState state)
+{
+    sPAPRXive *xive = opaque;
+    XiveSource *xsrc = &xive->source;
+    Error *local_err = NULL;
+    int i;
+
+    /*
+     * Restore the sources to their initial state. This is called when
+     * the VM resumes after a stop or a migration.
+     */
+    if (running) {
+        for (i = 0; i < xsrc->nr_irqs; i++) {
+            uint8_t pq = xive_source_esb_get(xsrc, i);
+            if (xive_esb_read(xsrc, i, XIVE_ESB_SET_PQ_00 + (pq << 8)) != 0x1) {
+                error_report("XIVE: IRQ %d has an invalid state", i);
+            }
+        }
+
+        return;
+    }
+
+    /*
+     * Mask the sources, to stop the flow of event notifications, and
+     * save the PQs locally in the XiveSource object. The XiveSource
+     * state will be collected later on by its vmstate handler if a
+     * migration is in progress.
+     */
+    for (i = 0; i < xsrc->nr_irqs; i++) {
+        uint8_t pq = xive_esb_read(xsrc, i, XIVE_ESB_SET_PQ_01);
+        xive_source_esb_set(xsrc, i, pq);
+    }
+
+    /*
+     * Sync the XIVE controller in KVM, to flush in-flight event
+     * notification that should be enqueued in the EQs.
+     */
+    kvmppc_xive_sync_all(xive, &local_err);
+    if (local_err) {
+        error_report_err(local_err);
+        return;
+    }
+
+    /*
+     * Mark the XIVE EQ pages dirty to collect all updates.
+     */
+    kvm_device_access(xive->fd, KVM_DEV_XIVE_GRP_CTRL,
+                      KVM_DEV_XIVE_SAVE_EQ_PAGES, NULL, true, &local_err);
+    if (local_err) {
+        error_report_err(local_err);
+    }
+}
+
 void kvmppc_xive_synchronize_state(sPAPRXive *xive, Error **errp)
 {
     XiveSource *xsrc = &xive->source;
     CPUState *cs;
     Error *local_err = NULL;
 
-    kvmppc_xive_source_get_state(xsrc);
+    /*
+     * When the VM is stopped, the sources are masked and the previous
+     * state is saved in anticipation of a migration. We should not
+     * synchronize the source state in that case else we will override
+     * the saved state.
+     */
+    if (runstate_is_running()) {
+        kvmppc_xive_source_get_state(xsrc);
+    }
 
     kvmppc_xive_get_eas_state(xive, &local_err);
     if (local_err) {
@@ -468,6 +574,9 @@ void kvmppc_xive_connect(sPAPRXive *xive, Error **errp)
                                       "xive.tima", tima_len, xive->tm_mmap);
     sysbus_init_mmio(SYS_BUS_DEVICE(xive), &xive->tm_mmio);
 
+    xive->change = qemu_add_vm_change_state_handler(
+        kvmppc_xive_change_state_handler, xive);
+
     kvm_kernel_irqchip = true;
     kvm_msi_via_irqfd_allowed = true;
     kvm_gsi_direct_mapping = true;
-- 
2.20.1

^ permalink raw reply related	[flat|nested] 43+ messages in thread

* [Qemu-devel] [PATCH 05/13] spapr/xive: add migration support for KVM
  2019-01-07 18:39 [Qemu-devel] [PATCH 00/13] spapr: add KVM support to the XIVE interrupt mode Cédric Le Goater
                   ` (3 preceding siblings ...)
  2019-01-07 18:39 ` [Qemu-devel] [PATCH 04/13] spapr/xive: introduce a VM state change handler Cédric Le Goater
@ 2019-01-07 18:39 ` Cédric Le Goater
  2019-02-07  3:41   ` David Gibson
  2019-01-07 18:39 ` [Qemu-devel] [PATCH 06/13] spapr/xive: fix migration of the XiveTCTX under TCG Cédric Le Goater
                   ` (7 subsequent siblings)
  12 siblings, 1 reply; 43+ messages in thread
From: Cédric Le Goater @ 2019-01-07 18:39 UTC (permalink / raw)
  To: David Gibson
  Cc: Benjamin Herrenschmidt, qemu-ppc, qemu-devel, Cédric Le Goater

When the VM is stopped, the VM state handler stabilizes the XIVE IC
and marks the EQ pages dirty. These are then transferred to the
destination before the transfer of the device vmstates starts.

The sPAPRXive interrupt controller model captures the XIVE internal
tables, EAT and ENDT, and the XiveTCTX model does the same for the
thread interrupt context registers.

At restart, the sPAPRXive 'post_load' method restores all the XIVE
states. It is called by the sPAPR machine 'post_load' method, when all
XIVE states have been transferred and loaded.

Finally, the source states are restored in the VM change state handler
when the machine reaches the running state.

Signed-off-by: Cédric Le Goater <clg@kaod.org>
---
 include/hw/ppc/spapr_xive.h |   5 +
 include/hw/ppc/xive.h       |   1 +
 hw/intc/spapr_xive.c        |  34 +++++++
 hw/intc/spapr_xive_kvm.c    | 187 +++++++++++++++++++++++++++++++++++-
 hw/intc/xive.c              |  17 ++++
 hw/ppc/spapr_irq.c          |   2 +-
 6 files changed, 244 insertions(+), 2 deletions(-)

diff --git a/include/hw/ppc/spapr_xive.h b/include/hw/ppc/spapr_xive.h
index 8815ed5aa372..52804516e909 100644
--- a/include/hw/ppc/spapr_xive.h
+++ b/include/hw/ppc/spapr_xive.h
@@ -46,6 +46,7 @@ bool spapr_xive_irq_claim(sPAPRXive *xive, uint32_t lisn, bool lsi);
 bool spapr_xive_irq_free(sPAPRXive *xive, uint32_t lisn);
 void spapr_xive_pic_print_info(sPAPRXive *xive, Monitor *mon);
 bool spapr_xive_priority_is_reserved(uint8_t priority);
+int spapr_xive_post_load(sPAPRXive *xive, int version_id);
 
 void spapr_xive_cpu_to_nvt(PowerPCCPU *cpu,
                            uint8_t *out_nvt_blk, uint32_t *out_nvt_idx);
@@ -53,6 +54,8 @@ void spapr_xive_cpu_to_end(PowerPCCPU *cpu, uint8_t prio,
                            uint8_t *out_end_blk, uint32_t *out_end_idx);
 int spapr_xive_target_to_end(uint32_t target, uint8_t prio,
                              uint8_t *out_end_blk, uint32_t *out_end_idx);
+int spapr_xive_end_to_target(uint8_t end_blk, uint32_t end_idx,
+                             uint32_t *out_server, uint8_t *out_prio);
 
 typedef struct sPAPRMachineState sPAPRMachineState;
 
@@ -68,5 +71,7 @@ void spapr_xive_map_mmio(sPAPRXive *xive);
  */
 void kvmppc_xive_connect(sPAPRXive *xive, Error **errp);
 void kvmppc_xive_synchronize_state(sPAPRXive *xive, Error **errp);
+int kvmppc_xive_pre_save(sPAPRXive *xive);
+int kvmppc_xive_post_load(sPAPRXive *xive, int version_id);
 
 #endif /* PPC_SPAPR_XIVE_H */
diff --git a/include/hw/ppc/xive.h b/include/hw/ppc/xive.h
index 2e48d75a22e0..8aa314f93ffd 100644
--- a/include/hw/ppc/xive.h
+++ b/include/hw/ppc/xive.h
@@ -443,5 +443,6 @@ void kvmppc_xive_source_reset(XiveSource *xsrc, Error **errp);
 void kvmppc_xive_source_set_irq(void *opaque, int srcno, int val);
 void kvmppc_xive_cpu_connect(XiveTCTX *tctx, Error **errp);
 void kvmppc_xive_cpu_synchronize_state(XiveTCTX *tctx, Error **errp);
+void kvmppc_xive_cpu_get_state(XiveTCTX *tctx, Error **errp);
 
 #endif /* PPC_XIVE_H */
diff --git a/hw/intc/spapr_xive.c b/hw/intc/spapr_xive.c
index 50dd66707968..21f3c1ef0901 100644
--- a/hw/intc/spapr_xive.c
+++ b/hw/intc/spapr_xive.c
@@ -85,6 +85,19 @@ static int spapr_xive_target_to_nvt(uint32_t target,
  * sPAPR END indexing uses a simple mapping of the CPU vcpu_id, 8
  * priorities per CPU
  */
+int spapr_xive_end_to_target(uint8_t end_blk, uint32_t end_idx,
+                             uint32_t *out_server, uint8_t *out_prio)
+{
+    if (out_server) {
+        *out_server = end_idx >> 3;
+    }
+
+    if (out_prio) {
+        *out_prio = end_idx & 0x7;
+    }
+    return 0;
+}
+
 void spapr_xive_cpu_to_end(PowerPCCPU *cpu, uint8_t prio,
                            uint8_t *out_end_blk, uint32_t *out_end_idx)
 {
@@ -438,10 +451,31 @@ static const VMStateDescription vmstate_spapr_xive_eas = {
     },
 };
 
+static int vmstate_spapr_xive_pre_save(void *opaque)
+{
+    if (kvmppc_xive_enabled()) {
+        return kvmppc_xive_pre_save(SPAPR_XIVE(opaque));
+    }
+
+    return 0;
+}
+
+/* Called by the sPAPR machine 'post_load' method */
+int spapr_xive_post_load(sPAPRXive *xive, int version_id)
+{
+    if (kvmppc_xive_enabled()) {
+        return kvmppc_xive_post_load(xive, version_id);
+    }
+
+    return 0;
+}
+
 static const VMStateDescription vmstate_spapr_xive = {
     .name = TYPE_SPAPR_XIVE,
     .version_id = 1,
     .minimum_version_id = 1,
+    .pre_save = vmstate_spapr_xive_pre_save,
+    .post_load = NULL, /* handled at the machine level */
     .fields = (VMStateField[]) {
         VMSTATE_UINT32_EQUAL(nr_irqs, sPAPRXive, NULL),
         VMSTATE_STRUCT_VARRAY_POINTER_UINT32(eat, sPAPRXive, nr_irqs,
diff --git a/hw/intc/spapr_xive_kvm.c b/hw/intc/spapr_xive_kvm.c
index c7639ffe7758..fe58a9ee32d3 100644
--- a/hw/intc/spapr_xive_kvm.c
+++ b/hw/intc/spapr_xive_kvm.c
@@ -60,7 +60,30 @@ static void kvm_cpu_enable(CPUState *cs)
 /*
  * XIVE Thread Interrupt Management context (KVM)
  */
-static void kvmppc_xive_cpu_get_state(XiveTCTX *tctx, Error **errp)
+
+static void kvmppc_xive_cpu_set_state(XiveTCTX *tctx, Error **errp)
+{
+    uint64_t state[4];
+    int ret;
+
+    /* word0 and word1 of the OS ring. */
+    state[0] = *((uint64_t *) &tctx->regs[TM_QW1_OS]);
+
+    /*
+     * OS CAM line. Used by KVM to print out the VP identifier. This
+     * is for debug only.
+     */
+    state[1] = *((uint64_t *) &tctx->regs[TM_QW1_OS + TM_WORD2]);
+
+    ret = kvm_set_one_reg(tctx->cs, KVM_REG_PPC_NVT_STATE, state);
+    if (ret != 0) {
+        error_setg_errno(errp, errno,
+                         "XIVE: could not restore KVM state of CPU %ld",
+                         kvm_arch_vcpu_id(tctx->cs));
+    }
+}
+
+void kvmppc_xive_cpu_get_state(XiveTCTX *tctx, Error **errp)
 {
     uint64_t state[4] = { 0 };
     int ret;
@@ -228,6 +251,58 @@ void kvmppc_xive_source_set_irq(void *opaque, int srcno, int val)
 /*
  * sPAPR XIVE interrupt controller (KVM)
  */
+static int kvmppc_xive_set_eq_state(sPAPRXive *xive, CPUState *cs, Error **errp)
+{
+    unsigned long vcpu_id = kvm_arch_vcpu_id(cs);
+    int ret;
+    int i;
+
+    for (i = 0; i < XIVE_PRIORITY_MAX + 1; i++) {
+        Error *local_err = NULL;
+        XiveEND *end;
+        uint8_t end_blk;
+        uint32_t end_idx;
+        struct kvm_ppc_xive_eq kvm_eq = { 0 };
+        uint64_t kvm_eq_idx;
+
+        if (spapr_xive_priority_is_reserved(i)) {
+            continue;
+        }
+
+        spapr_xive_cpu_to_end(POWERPC_CPU(cs), i, &end_blk, &end_idx);
+
+        assert(end_idx < xive->nr_ends);
+        end = &xive->endt[end_idx];
+
+        if (!xive_end_is_valid(end)) {
+            continue;
+        }
+
+        /* Build the KVM state from the local END structure */
+        kvm_eq.flags   = KVM_XIVE_EQ_FLAG_ALWAYS_NOTIFY;
+        kvm_eq.qsize   = xive_get_field32(END_W0_QSIZE, end->w0) + 12;
+        kvm_eq.qpage   = (uint64_t) be32_to_cpu(end->w2 & 0x0fffffff) << 32 |
+            be32_to_cpu(end->w3);
+        kvm_eq.qtoggle = xive_get_field32(END_W1_GENERATION, end->w1);
+        kvm_eq.qindex  = xive_get_field32(END_W1_PAGE_OFF, end->w1);
+
+        /* Encode the tuple (server, prio) as a KVM EQ index */
+        kvm_eq_idx = i << KVM_XIVE_EQ_PRIORITY_SHIFT &
+            KVM_XIVE_EQ_PRIORITY_MASK;
+        kvm_eq_idx |= vcpu_id << KVM_XIVE_EQ_SERVER_SHIFT &
+            KVM_XIVE_EQ_SERVER_MASK;
+
+        ret = kvm_device_access(xive->fd, KVM_DEV_XIVE_GRP_EQ, kvm_eq_idx,
+                                &kvm_eq, true, &local_err);
+        if (local_err) {
+            error_propagate(errp, local_err);
+            return ret;
+        }
+    }
+
+    return 0;
+}
+
 static int kvmppc_xive_get_eq_state(sPAPRXive *xive, CPUState *cs, Error **errp)
 {
     unsigned long vcpu_id = kvm_arch_vcpu_id(cs);
@@ -298,6 +373,48 @@ static int kvmppc_xive_get_eq_state(sPAPRXive *xive, CPUState *cs, Error **errp)
     return 0;
 }
 
+static void kvmppc_xive_set_eas_state(sPAPRXive *xive, Error **errp)
+{
+    XiveSource *xsrc = &xive->source;
+    int i;
+
+    for (i = 0; i < xsrc->nr_irqs; i++) {
+        XiveEAS *eas = &xive->eat[i];
+        uint32_t end_idx;
+        uint32_t end_blk;
+        uint32_t eisn;
+        uint8_t priority;
+        uint32_t server;
+        uint64_t kvm_eas;
+        Error *local_err = NULL;
+
+        /* No need to set MASKED EAS, this is the default state after reset */
+        if (!xive_eas_is_valid(eas) || xive_eas_is_masked(eas)) {
+            continue;
+        }
+
+        end_idx = xive_get_field64(EAS_END_INDEX, eas->w);
+        end_blk = xive_get_field64(EAS_END_BLOCK, eas->w);
+        eisn = xive_get_field64(EAS_END_DATA, eas->w);
+
+        spapr_xive_end_to_target(end_blk, end_idx, &server, &priority);
+
+        kvm_eas = priority << KVM_XIVE_EAS_PRIORITY_SHIFT &
+            KVM_XIVE_EAS_PRIORITY_MASK;
+        kvm_eas |= server << KVM_XIVE_EAS_SERVER_SHIFT &
+            KVM_XIVE_EAS_SERVER_MASK;
+        kvm_eas |= ((uint64_t)eisn << KVM_XIVE_EAS_EISN_SHIFT) &
+            KVM_XIVE_EAS_EISN_MASK;
+
+        kvm_device_access(xive->fd, KVM_DEV_XIVE_GRP_EAS, i, &kvm_eas, true,
+                          &local_err);
+        if (local_err) {
+            error_propagate(errp, local_err);
+            return;
+        }
+    }
+}
+
 static void kvmppc_xive_get_eas_state(sPAPRXive *xive, Error **errp)
 {
     XiveSource *xsrc = &xive->source;
@@ -448,6 +565,74 @@ static void kvmppc_xive_change_state_handler(void *opaque, int running,
     }
 }
 
+int kvmppc_xive_pre_save(sPAPRXive *xive)
+{
+    Error *local_err = NULL;
+    CPUState *cs;
+
+    /* Grab the EAT */
+    kvmppc_xive_get_eas_state(xive, &local_err);
+    if (local_err) {
+        error_report_err(local_err);
+        return -1;
+    }
+
+    /*
+     * Grab the ENDT. The EQ index and the toggle bit are what we want
+     * to capture.
+     */
+    CPU_FOREACH(cs) {
+        kvmppc_xive_get_eq_state(xive, cs, &local_err);
+        if (local_err) {
+            error_report_err(local_err);
+            return -1;
+        }
+    }
+
+    return 0;
+}
+
+/*
+ * The sPAPRXive 'post_load' method is called by the sPAPR machine
+ * 'post_load' method, when all XIVE states have been transferred and
+ * loaded.
+ */
+int kvmppc_xive_post_load(sPAPRXive *xive, int version_id)
+{
+    Error *local_err = NULL;
+    CPUState *cs;
+
+    /* Restore the ENDT first. The targeting depends on it. */
+    CPU_FOREACH(cs) {
+        kvmppc_xive_set_eq_state(xive, cs, &local_err);
+        if (local_err) {
+            error_report_err(local_err);
+            return -1;
+        }
+    }
+
+    /* Restore the EAT */
+    kvmppc_xive_set_eas_state(xive, &local_err);
+    if (local_err) {
+        error_report_err(local_err);
+        return -1;
+    }
+
+    /* Restore the thread interrupt contexts */
+    CPU_FOREACH(cs) {
+        PowerPCCPU *cpu = POWERPC_CPU(cs);
+
+        kvmppc_xive_cpu_set_state(cpu->tctx, &local_err);
+        if (local_err) {
+            error_report_err(local_err);
+            return -1;
+        }
+    }
+
+    /* The source states will be restored when the machine starts running */
+    return 0;
+}
+
 void kvmppc_xive_synchronize_state(sPAPRXive *xive, Error **errp)
 {
     XiveSource *xsrc = &xive->source;
diff --git a/hw/intc/xive.c b/hw/intc/xive.c
index 596c29d8c826..c5c2fbc3f8bc 100644
--- a/hw/intc/xive.c
+++ b/hw/intc/xive.c
@@ -521,10 +521,27 @@ static void xive_tctx_unrealize(DeviceState *dev, Error **errp)
     qemu_unregister_reset(xive_tctx_reset, dev);
 }
 
+static int vmstate_xive_tctx_pre_save(void *opaque)
+{
+    Error *local_err = NULL;
+
+    if (kvmppc_xive_enabled()) {
+        kvmppc_xive_cpu_get_state(XIVE_TCTX(opaque), &local_err);
+        if (local_err) {
+            error_report_err(local_err);
+            return -1;
+        }
+    }
+
+    return 0;
+}
+
 static const VMStateDescription vmstate_xive_tctx = {
     .name = TYPE_XIVE_TCTX,
     .version_id = 1,
     .minimum_version_id = 1,
+    .pre_save = vmstate_xive_tctx_pre_save,
+    .post_load = NULL, /* handled by the sPAPRxive model */
     .fields = (VMStateField[]) {
         VMSTATE_BUFFER(regs, XiveTCTX),
         VMSTATE_END_OF_LIST()
diff --git a/hw/ppc/spapr_irq.c b/hw/ppc/spapr_irq.c
index afbdabfa6543..233c97c5ecd9 100644
--- a/hw/ppc/spapr_irq.c
+++ b/hw/ppc/spapr_irq.c
@@ -363,7 +363,7 @@ static void spapr_irq_cpu_intc_create_xive(sPAPRMachineState *spapr,
 
 static int spapr_irq_post_load_xive(sPAPRMachineState *spapr, int version_id)
 {
-    return 0;
+    return spapr_xive_post_load(spapr->xive, version_id);
 }
 
 static void spapr_irq_reset_xive(sPAPRMachineState *spapr, Error **errp)
-- 
2.20.1

^ permalink raw reply related	[flat|nested] 43+ messages in thread

* [Qemu-devel] [PATCH 06/13] spapr/xive: fix migration of the XiveTCTX under TCG
  2019-01-07 18:39 [Qemu-devel] [PATCH 00/13] spapr: add KVM support to the XIVE interrupt mode Cédric Le Goater
                   ` (4 preceding siblings ...)
  2019-01-07 18:39 ` [Qemu-devel] [PATCH 05/13] spapr/xive: add migration support for KVM Cédric Le Goater
@ 2019-01-07 18:39 ` Cédric Le Goater
  2019-02-08  5:36   ` David Gibson
  2019-01-07 18:39 ` [Qemu-devel] [PATCH 07/13] ppc/xics: introduce a icp_kvm_connect() routine Cédric Le Goater
                   ` (6 subsequent siblings)
  12 siblings, 1 reply; 43+ messages in thread
From: Cédric Le Goater @ 2019-01-07 18:39 UTC (permalink / raw)
  To: David Gibson
  Cc: Benjamin Herrenschmidt, qemu-ppc, qemu-devel, Cédric Le Goater

When the thread interrupt management state is retrieved from the KVM
VCPU, word2 is saved under the QEMU XIVE thread context to print out
the OS CAM line under the QEMU monitor.

This breaks the migration of a TCG guest (and with KVM when
kernel_irqchip=off) because the matching algorithm of the presenter
relies on the OS CAM value. Fix with an extra reset of the thread
contexts to restore the expected value.

Signed-off-by: Cédric Le Goater <clg@kaod.org>
---
 hw/ppc/spapr_irq.c | 26 +++++++++++++++++++++++++-
 1 file changed, 25 insertions(+), 1 deletion(-)

diff --git a/hw/ppc/spapr_irq.c b/hw/ppc/spapr_irq.c
index 233c97c5ecd9..ba27d9d8e972 100644
--- a/hw/ppc/spapr_irq.c
+++ b/hw/ppc/spapr_irq.c
@@ -363,7 +363,31 @@ static void spapr_irq_cpu_intc_create_xive(sPAPRMachineState *spapr,
 
 static int spapr_irq_post_load_xive(sPAPRMachineState *spapr, int version_id)
 {
-    return spapr_xive_post_load(spapr->xive, version_id);
+    CPUState *cs;
+    int ret;
+
+    ret = spapr_xive_post_load(spapr->xive, version_id);
+    if (ret) {
+        return ret;
+    }
+
+    /*
+     * When the states are collected from the KVM XIVE device, word2
+     * of the XiveTCTX is set to print out the OS CAM line under the
+     * QEMU monitor.
+     *
+     * This breaks the migration on a TCG guest (or on KVM with
+     * kernel_irqchip=off) because the matching algorithm of the
+     * presenter relies on the OS CAM value. Fix with an extra reset
+     * of the thread contexts to restore the expected value.
+     */
+    CPU_FOREACH(cs) {
+        PowerPCCPU *cpu = POWERPC_CPU(cs);
+
+        /* (TCG) Set the OS CAM line of the thread interrupt context. */
+        spapr_xive_set_tctx_os_cam(cpu->tctx);
+    }
+    return 0;
 }
 
 static void spapr_irq_reset_xive(sPAPRMachineState *spapr, Error **errp)
-- 
2.20.1

^ permalink raw reply related	[flat|nested] 43+ messages in thread

* [Qemu-devel] [PATCH 07/13] ppc/xics: introduce a icp_kvm_connect() routine
  2019-01-07 18:39 [Qemu-devel] [PATCH 00/13] spapr: add KVM support to the XIVE interrupt mode Cédric Le Goater
                   ` (5 preceding siblings ...)
  2019-01-07 18:39 ` [Qemu-devel] [PATCH 06/13] spapr/xive: fix migration of the XiveTCTX under TCG Cédric Le Goater
@ 2019-01-07 18:39 ` Cédric Le Goater
  2019-01-07 18:39 ` [Qemu-devel] [PATCH 08/13] spapr/rtas: modify spapr_rtas_register() to remove RTAS handlers Cédric Le Goater
                   ` (5 subsequent siblings)
  12 siblings, 0 replies; 43+ messages in thread
From: Cédric Le Goater @ 2019-01-07 18:39 UTC (permalink / raw)
  To: David Gibson
  Cc: Benjamin Herrenschmidt, qemu-ppc, qemu-devel, Cédric Le Goater

This routine gathers all the KVM initialization of the XICS KVM
presenter. It will be useful when the initialization of the KVM XICS
device is moved to a global routine.

Signed-off-by: Cédric Le Goater <clg@kaod.org>
---
 hw/intc/xics_kvm.c | 29 ++++++++++++++++++++---------
 1 file changed, 20 insertions(+), 9 deletions(-)

diff --git a/hw/intc/xics_kvm.c b/hw/intc/xics_kvm.c
index ac94594b1919..2426e5b2f4ed 100644
--- a/hw/intc/xics_kvm.c
+++ b/hw/intc/xics_kvm.c
@@ -123,11 +123,8 @@ static void icp_kvm_reset(DeviceState *dev)
     icp_set_kvm_state(ICP(dev), 1);
 }
 
-static void icp_kvm_realize(DeviceState *dev, Error **errp)
+static void icp_kvm_connect(ICPState *icp, Error **errp)
 {
-    ICPState *icp = ICP(dev);
-    ICPStateClass *icpc = ICP_GET_CLASS(icp);
-    Error *local_err = NULL;
     CPUState *cs;
     KVMEnabledICP *enabled_icp;
     unsigned long vcpu_id;
@@ -135,11 +132,6 @@ static void icp_kvm_realize(DeviceState *dev, Error **errp)
 
     if (kernel_xics_fd == -1) {
         abort();
-    }
-
-    icpc->parent_realize(dev, &local_err);
-    if (local_err) {
-        error_propagate(errp, local_err);
         return;
     }
 
@@ -168,6 +160,25 @@ static void icp_kvm_realize(DeviceState *dev, Error **errp)
     QLIST_INSERT_HEAD(&kvm_enabled_icps, enabled_icp, node);
 }
 
+static void icp_kvm_realize(DeviceState *dev, Error **errp)
+{
+    ICPStateClass *icpc = ICP_GET_CLASS(dev);
+    Error *local_err = NULL;
+
+    icpc->parent_realize(dev, &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        return;
+    }
+
+    /* Connect the presenter to the VCPU (required for CPU hotplug) */
+    icp_kvm_connect(ICP(dev), &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        return;
+    }
+}
+
 static void icp_kvm_class_init(ObjectClass *klass, void *data)
 {
     DeviceClass *dc = DEVICE_CLASS(klass);
-- 
2.20.1

^ permalink raw reply related	[flat|nested] 43+ messages in thread

* [Qemu-devel] [PATCH 08/13] spapr/rtas: modify spapr_rtas_register() to remove RTAS handlers
  2019-01-07 18:39 [Qemu-devel] [PATCH 00/13] spapr: add KVM support to the XIVE interrupt mode Cédric Le Goater
                   ` (6 preceding siblings ...)
  2019-01-07 18:39 ` [Qemu-devel] [PATCH 07/13] ppc/xics: introduce a icp_kvm_connect() routine Cédric Le Goater
@ 2019-01-07 18:39 ` Cédric Le Goater
  2019-01-29  5:09   ` Alexey Kardashevskiy
  2019-01-07 18:39 ` [Qemu-devel] [PATCH 09/13] sysbus: add a sysbus_mmio_unmap() helper Cédric Le Goater
                   ` (4 subsequent siblings)
  12 siblings, 1 reply; 43+ messages in thread
From: Cédric Le Goater @ 2019-01-07 18:39 UTC (permalink / raw)
  To: David Gibson
  Cc: Benjamin Herrenschmidt, qemu-ppc, qemu-devel, Cédric Le Goater

Removing RTAS handlers will become necessary when the new pseries
machine supporting multiple interrupt modes is introduced.

Signed-off-by: Cédric Le Goater <clg@kaod.org>
---
 include/hw/ppc/spapr.h | 4 ++++
 hw/ppc/spapr_rtas.c    | 2 +-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
index 9e01a5a12e4a..9a6d015b9cf5 100644
--- a/include/hw/ppc/spapr.h
+++ b/include/hw/ppc/spapr.h
@@ -657,6 +657,10 @@ typedef void (*spapr_rtas_fn)(PowerPCCPU *cpu, sPAPRMachineState *sm,
                               uint32_t nargs, target_ulong args,
                               uint32_t nret, target_ulong rets);
 void spapr_rtas_register(int token, const char *name, spapr_rtas_fn fn);
+static inline void spapr_rtas_unregister(int token)
+{
+    spapr_rtas_register(token, NULL, NULL);
+}
 target_ulong spapr_rtas_call(PowerPCCPU *cpu, sPAPRMachineState *sm,
                              uint32_t token, uint32_t nargs, target_ulong args,
                              uint32_t nret, target_ulong rets);
diff --git a/hw/ppc/spapr_rtas.c b/hw/ppc/spapr_rtas.c
index d6a0952154ac..e005d5d08151 100644
--- a/hw/ppc/spapr_rtas.c
+++ b/hw/ppc/spapr_rtas.c
@@ -404,7 +404,7 @@ void spapr_rtas_register(int token, const char *name, spapr_rtas_fn fn)
 
     token -= RTAS_TOKEN_BASE;
 
-    assert(!rtas_table[token].name);
+    assert(!name || !rtas_table[token].name);
 
     rtas_table[token].name = name;
     rtas_table[token].fn = fn;
-- 
2.20.1

^ permalink raw reply related	[flat|nested] 43+ messages in thread

* [Qemu-devel] [PATCH 09/13] sysbus: add a sysbus_mmio_unmap() helper
  2019-01-07 18:39 [Qemu-devel] [PATCH 00/13] spapr: add KVM support to the XIVE interrupt mode Cédric Le Goater
                   ` (7 preceding siblings ...)
  2019-01-07 18:39 ` [Qemu-devel] [PATCH 08/13] spapr/rtas: modify spapr_rtas_register() to remove RTAS handlers Cédric Le Goater
@ 2019-01-07 18:39 ` Cédric Le Goater
  2019-01-07 18:39 ` [Qemu-devel] [PATCH 10/13] spapr: introduce routines to delete the KVM IRQ device Cédric Le Goater
                   ` (3 subsequent siblings)
  12 siblings, 0 replies; 43+ messages in thread
From: Cédric Le Goater @ 2019-01-07 18:39 UTC (permalink / raw)
  To: David Gibson
  Cc: Benjamin Herrenschmidt, qemu-ppc, qemu-devel, Cédric Le Goater

This will be used to remove the MMIO regions of the POWER9 XIVE
interrupt controller when the sPAPR machine is reset.

Signed-off-by: Cédric Le Goater <clg@kaod.org>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Cédric Le Goater <clg@kaod.org>
---
 include/hw/sysbus.h |  1 +
 hw/core/sysbus.c    | 10 ++++++++++
 2 files changed, 11 insertions(+)

diff --git a/include/hw/sysbus.h b/include/hw/sysbus.h
index 1aedcf05c92b..4c668fbbdc60 100644
--- a/include/hw/sysbus.h
+++ b/include/hw/sysbus.h
@@ -89,6 +89,7 @@ qemu_irq sysbus_get_connected_irq(SysBusDevice *dev, int n);
 void sysbus_mmio_map(SysBusDevice *dev, int n, hwaddr addr);
 void sysbus_mmio_map_overlap(SysBusDevice *dev, int n, hwaddr addr,
                              int priority);
+void sysbus_mmio_unmap(SysBusDevice *dev, int n);
 void sysbus_add_io(SysBusDevice *dev, hwaddr addr,
                    MemoryRegion *mem);
 MemoryRegion *sysbus_address_space(SysBusDevice *dev);
diff --git a/hw/core/sysbus.c b/hw/core/sysbus.c
index 9f9edbcab96f..f90d87b058c3 100644
--- a/hw/core/sysbus.c
+++ b/hw/core/sysbus.c
@@ -153,6 +153,16 @@ static void sysbus_mmio_map_common(SysBusDevice *dev, int n, hwaddr addr,
     }
 }
 
+void sysbus_mmio_unmap(SysBusDevice *dev, int n)
+{
+    assert(n >= 0 && n < dev->num_mmio);
+
+    if (dev->mmio[n].addr != (hwaddr)-1) {
+        memory_region_del_subregion(get_system_memory(), dev->mmio[n].memory);
+        dev->mmio[n].addr = (hwaddr)-1;
+    }
+}
+
 void sysbus_mmio_map(SysBusDevice *dev, int n, hwaddr addr)
 {
     sysbus_mmio_map_common(dev, n, addr, false, 0);
-- 
2.20.1

^ permalink raw reply related	[flat|nested] 43+ messages in thread

* [Qemu-devel] [PATCH 10/13] spapr: introduce routines to delete the KVM IRQ device
  2019-01-07 18:39 [Qemu-devel] [PATCH 00/13] spapr: add KVM support to the XIVE interrupt mode Cédric Le Goater
                   ` (8 preceding siblings ...)
  2019-01-07 18:39 ` [Qemu-devel] [PATCH 09/13] sysbus: add a sysbus_mmio_unmap() helper Cédric Le Goater
@ 2019-01-07 18:39 ` Cédric Le Goater
  2019-02-12  0:58   ` David Gibson
  2019-01-07 18:39 ` [Qemu-devel] [PATCH 11/13] spapr: check for the activation of " Cédric Le Goater
                   ` (2 subsequent siblings)
  12 siblings, 1 reply; 43+ messages in thread
From: Cédric Le Goater @ 2019-01-07 18:39 UTC (permalink / raw)
  To: David Gibson
  Cc: Benjamin Herrenschmidt, qemu-ppc, qemu-devel, Cédric Le Goater

If a new interrupt mode is chosen by CAS, the machine generates a
reset to reconfigure. At this point, the connection with the previous
KVM device needs to be closed and a new connection needs to be opened
with the KVM device operating the chosen interrupt mode.

New routines are introduced to destroy the XICS and the XIVE KVM
devices. They make use of a new KVM device ioctl which destroys the
device and also disconnects the IRQ presenters from the vCPUs.

Signed-off-by: Cédric Le Goater <clg@kaod.org>
---
 include/hw/ppc/spapr_xive.h |  1 +
 include/hw/ppc/xics.h       |  1 +
 hw/intc/spapr_xive_kvm.c    | 60 +++++++++++++++++++++++++++++++++++++
 hw/intc/xics_kvm.c          | 57 +++++++++++++++++++++++++++++++++++
 4 files changed, 119 insertions(+)

diff --git a/include/hw/ppc/spapr_xive.h b/include/hw/ppc/spapr_xive.h
index 52804516e909..f172fc20b650 100644
--- a/include/hw/ppc/spapr_xive.h
+++ b/include/hw/ppc/spapr_xive.h
@@ -70,6 +70,7 @@ void spapr_xive_map_mmio(sPAPRXive *xive);
  * KVM XIVE device helpers
  */
 void kvmppc_xive_connect(sPAPRXive *xive, Error **errp);
+void kvmppc_xive_disconnect(sPAPRXive *xive, Error **errp);
 void kvmppc_xive_synchronize_state(sPAPRXive *xive, Error **errp);
 int kvmppc_xive_pre_save(sPAPRXive *xive);
 int kvmppc_xive_post_load(sPAPRXive *xive, int version_id);
diff --git a/include/hw/ppc/xics.h b/include/hw/ppc/xics.h
index 07508cbd217e..75d4effb5c5f 100644
--- a/include/hw/ppc/xics.h
+++ b/include/hw/ppc/xics.h
@@ -205,6 +205,7 @@ typedef struct sPAPRMachineState sPAPRMachineState;
 void spapr_dt_xics(sPAPRMachineState *spapr, uint32_t nr_servers, void *fdt,
                    uint32_t phandle);
 int xics_kvm_init(sPAPRMachineState *spapr, Error **errp);
+int xics_kvm_disconnect(sPAPRMachineState *spapr, Error **errp);
 void xics_spapr_init(sPAPRMachineState *spapr);
 
 Object *icp_create(Object *cpu, const char *type, XICSFabric *xi,
diff --git a/hw/intc/spapr_xive_kvm.c b/hw/intc/spapr_xive_kvm.c
index fe58a9ee32d3..93ea8e71047a 100644
--- a/hw/intc/spapr_xive_kvm.c
+++ b/hw/intc/spapr_xive_kvm.c
@@ -57,6 +57,16 @@ static void kvm_cpu_enable(CPUState *cs)
     QLIST_INSERT_HEAD(&kvm_enabled_cpus, enabled_cpu, node);
 }
 
+static void kvm_cpu_disable_all(void)
+{
+    KVMEnabledCPU *enabled_cpu, *next;
+
+    QLIST_FOREACH_SAFE(enabled_cpu, &kvm_enabled_cpus, node, next) {
+        QLIST_REMOVE(enabled_cpu, node);
+        g_free(enabled_cpu);
+    }
+}
+
 /*
  * XIVE Thread Interrupt Management context (KVM)
  */
@@ -769,3 +779,53 @@ void kvmppc_xive_connect(sPAPRXive *xive, Error **errp)
     /* Map all regions */
     spapr_xive_map_mmio(xive);
 }
+
+void kvmppc_xive_disconnect(sPAPRXive *xive, Error **errp)
+{
+    XiveSource *xsrc;
+    struct kvm_create_device xive_destroy_device = { 0 };
+    size_t esb_len;
+    int rc;
+
+    if (!kvm_enabled() || !kvmppc_has_cap_xive()) {
+        error_setg(errp, "IRQ_XIVE capability must be present for KVM");
+        return;
+    }
+
+    /* The KVM XIVE device is not in use */
+    if (!xive || xive->fd == -1) {
+        return;
+    }
+
+    /* Clear the KVM mapping */
+    xsrc = &xive->source;
+    esb_len = (1ull << xsrc->esb_shift) * xsrc->nr_irqs;
+
+    sysbus_mmio_unmap(SYS_BUS_DEVICE(xive), 0);
+    munmap(xsrc->esb_mmap, esb_len);
+
+    sysbus_mmio_unmap(SYS_BUS_DEVICE(xive), 1);
+
+    sysbus_mmio_unmap(SYS_BUS_DEVICE(xive), 2);
+    munmap(xive->tm_mmap, 4ull << TM_SHIFT);
+
+    /* Destroy the KVM device. This also clears the VCPU presenters */
+    xive_destroy_device.fd = xive->fd;
+    xive_destroy_device.type = KVM_DEV_TYPE_XIVE;
+    rc = kvm_vm_ioctl(kvm_state, KVM_DESTROY_DEVICE, &xive_destroy_device);
+    if (rc < 0) {
+        error_setg_errno(errp, -rc, "Error on KVM_DESTROY_DEVICE for XIVE");
+    }
+    close(xive->fd);
+    xive->fd = -1;
+
+    kvm_kernel_irqchip = false;
+    kvm_msi_via_irqfd_allowed = false;
+    kvm_gsi_direct_mapping = false;
+
+    /* Clear the local list of presenter (hotplug) */
+    kvm_cpu_disable_all();
+
+    /* VM Change state handler is not needed anymore */
+    qemu_del_vm_change_state_handler(xive->change);
+}
diff --git a/hw/intc/xics_kvm.c b/hw/intc/xics_kvm.c
index 2426e5b2f4ed..da6a00bc88cc 100644
--- a/hw/intc/xics_kvm.c
+++ b/hw/intc/xics_kvm.c
@@ -50,6 +50,16 @@ typedef struct KVMEnabledICP {
 static QLIST_HEAD(, KVMEnabledICP)
     kvm_enabled_icps = QLIST_HEAD_INITIALIZER(&kvm_enabled_icps);
 
+static void kvm_disable_icps(void)
+{
+    KVMEnabledICP *enabled_icp, *next;
+
+    QLIST_FOREACH_SAFE(enabled_icp, &kvm_enabled_icps, node, next) {
+        QLIST_REMOVE(enabled_icp, node);
+        g_free(enabled_icp);
+    }
+}
+
 /*
  * ICP-KVM
  */
@@ -455,6 +465,53 @@ fail:
     return -1;
 }
 
+int xics_kvm_disconnect(sPAPRMachineState *spapr, Error **errp)
+{
+    int rc;
+    struct kvm_create_device xics_create_device = {
+        .fd = kernel_xics_fd,
+        .type = KVM_DEV_TYPE_XICS,
+        .flags = 0,
+    };
+
+    /* The KVM XICS device is not in use */
+    if (kernel_xics_fd == -1) {
+        return 0;
+    }
+
+    if (!kvm_enabled() || !kvm_check_extension(kvm_state, KVM_CAP_IRQ_XICS)) {
+        error_setg(errp,
+                   "KVM and IRQ_XICS capability must be present for KVM XICS device");
+        return -1;
+    }
+
+    rc = kvm_vm_ioctl(kvm_state, KVM_DESTROY_DEVICE, &xics_create_device);
+    if (rc < 0) {
+        error_setg_errno(errp, -rc, "Error on KVM_DESTROY_DEVICE for XICS");
+    }
+    close(kernel_xics_fd);
+    kernel_xics_fd = -1;
+
+    spapr_rtas_register(RTAS_IBM_SET_XIVE, NULL, 0);
+    spapr_rtas_register(RTAS_IBM_GET_XIVE, NULL, 0);
+    spapr_rtas_register(RTAS_IBM_INT_OFF, NULL, 0);
+    spapr_rtas_register(RTAS_IBM_INT_ON, NULL, 0);
+
+    kvmppc_define_rtas_kernel_token(0, "ibm,set-xive");
+    kvmppc_define_rtas_kernel_token(0, "ibm,get-xive");
+    kvmppc_define_rtas_kernel_token(0, "ibm,int-on");
+    kvmppc_define_rtas_kernel_token(0, "ibm,int-off");
+
+    kvm_kernel_irqchip = false;
+    kvm_msi_via_irqfd_allowed = false;
+    kvm_gsi_direct_mapping = false;
+
+    /* Clear the presenter from the VCPUs */
+    kvm_disable_icps();
+
+    return rc;
+}
+
 static void xics_kvm_register_types(void)
 {
     type_register_static(&ics_kvm_info);
-- 
2.20.1

^ permalink raw reply related	[flat|nested] 43+ messages in thread

* [Qemu-devel] [PATCH 11/13] spapr: check for the activation of the KVM IRQ device
  2019-01-07 18:39 [Qemu-devel] [PATCH 00/13] spapr: add KVM support to the XIVE interrupt mode Cédric Le Goater
                   ` (9 preceding siblings ...)
  2019-01-07 18:39 ` [Qemu-devel] [PATCH 10/13] spapr: introduce routines to delete the KVM IRQ device Cédric Le Goater
@ 2019-01-07 18:39 ` Cédric Le Goater
  2019-02-12  1:01   ` David Gibson
  2019-01-07 18:39 ` [Qemu-devel] [PATCH 12/13] spapr/xics: ignore the lower 4K in the IRQ number space Cédric Le Goater
  2019-01-07 18:39 ` [Qemu-devel] [PATCH 13/13] spapr: add KVM support to the 'dual' machine Cédric Le Goater
  12 siblings, 1 reply; 43+ messages in thread
From: Cédric Le Goater @ 2019-01-07 18:39 UTC (permalink / raw)
  To: David Gibson
  Cc: Benjamin Herrenschmidt, qemu-ppc, qemu-devel, Cédric Le Goater

The activation of the KVM IRQ device depends on the interrupt mode
chosen at CAS time by the machine and some methods used at reset or by
the migration need to be protected.

Signed-off-by: Cédric Le Goater <clg@kaod.org>
---
 hw/intc/spapr_xive_kvm.c | 28 ++++++++++++++++++++++++++++
 hw/intc/xics_kvm.c       | 25 ++++++++++++++++++++++++-
 2 files changed, 52 insertions(+), 1 deletion(-)

diff --git a/hw/intc/spapr_xive_kvm.c b/hw/intc/spapr_xive_kvm.c
index 93ea8e71047a..d35814c1992e 100644
--- a/hw/intc/spapr_xive_kvm.c
+++ b/hw/intc/spapr_xive_kvm.c
@@ -95,9 +95,15 @@ static void kvmppc_xive_cpu_set_state(XiveTCTX *tctx, Error **errp)
 
 void kvmppc_xive_cpu_get_state(XiveTCTX *tctx, Error **errp)
 {
+    sPAPRXive *xive = SPAPR_MACHINE(qdev_get_machine())->xive;
     uint64_t state[4] = { 0 };
     int ret;
 
+    /* The KVM XIVE device is not in use */
+    if (xive->fd == -1) {
+        return;
+    }
+
     ret = kvm_get_one_reg(tctx->cs, KVM_REG_PPC_NVT_STATE, state);
     if (ret != 0) {
         error_setg_errno(errp, errno,
@@ -151,6 +157,11 @@ void kvmppc_xive_cpu_connect(XiveTCTX *tctx, Error **errp)
     unsigned long vcpu_id;
     int ret;
 
+    /* The KVM XIVE device is not in use */
+    if (xive->fd == -1) {
+        return;
+    }
+
     /* Check if CPU was hot unplugged and replugged. */
     if (kvm_cpu_is_enabled(tctx->cs)) {
         return;
@@ -234,9 +245,13 @@ static void kvmppc_xive_source_get_state(XiveSource *xsrc)
 void kvmppc_xive_source_set_irq(void *opaque, int srcno, int val)
 {
     XiveSource *xsrc = opaque;
+    sPAPRXive *xive = SPAPR_XIVE(xsrc->xive);
     struct kvm_irq_level args;
     int rc;
 
+    /* The KVM XIVE device should be in use */
+    assert(xive->fd != -1);
+
     args.irq = srcno;
     if (!xive_source_irq_is_lsi(xsrc, srcno)) {
         if (!val) {
@@ -580,6 +595,11 @@ int kvmppc_xive_pre_save(sPAPRXive *xive)
     Error *local_err = NULL;
     CPUState *cs;
 
+    /* The KVM XIVE device is not in use */
+    if (xive->fd == -1) {
+        return 0;
+    }
+
     /* Grab the EAT */
     kvmppc_xive_get_eas_state(xive, &local_err);
     if (local_err) {
@@ -612,6 +632,9 @@ int kvmppc_xive_post_load(sPAPRXive *xive, int version_id)
     Error *local_err = NULL;
     CPUState *cs;
 
+    /* The KVM XIVE device should be in use */
+    assert(xive->fd != -1);
+
     /* Restore the ENDT first. The targetting depends on it. */
     CPU_FOREACH(cs) {
         kvmppc_xive_set_eq_state(xive, cs, &local_err);
@@ -649,6 +672,11 @@ void kvmppc_xive_synchronize_state(sPAPRXive *xive, Error **errp)
     CPUState *cs;
     Error *local_err = NULL;
 
+    /* The KVM XIVE device is not in use */
+    if (xive->fd == -1) {
+        return;
+    }
+
     /*
      * When the VM is stopped, the sources are masked and the previous
      * state is saved in anticipation of a migration. We should not
diff --git a/hw/intc/xics_kvm.c b/hw/intc/xics_kvm.c
index da6a00bc88cc..651bbfdf6966 100644
--- a/hw/intc/xics_kvm.c
+++ b/hw/intc/xics_kvm.c
@@ -68,6 +68,11 @@ static void icp_get_kvm_state(ICPState *icp)
     uint64_t state;
     int ret;
 
+    /* The KVM XICS device is not in use */
+    if (kernel_xics_fd == -1) {
+        return;
+    }
+
     /* ICP for this CPU thread is not in use, exiting */
     if (!icp->cs) {
         return;
@@ -104,6 +109,11 @@ static int icp_set_kvm_state(ICPState *icp, int version_id)
     uint64_t state;
     int ret;
 
+    /* The KVM XICS device is not in use */
+    if (kernel_xics_fd == -1) {
+        return 0;
+    }
+
     /* ICP for this CPU thread is not in use, exiting */
     if (!icp->cs) {
         return 0;
@@ -140,8 +150,8 @@ static void icp_kvm_connect(ICPState *icp, Error **errp)
     unsigned long vcpu_id;
     int ret;
 
+    /* The KVM XICS device is not in use */
     if (kernel_xics_fd == -1) {
-        abort();
         return;
     }
 
@@ -220,6 +230,11 @@ static void ics_get_kvm_state(ICSState *ics)
     uint64_t state;
     int i;
 
+    /* The KVM XICS device is not in use */
+    if (kernel_xics_fd == -1) {
+        return;
+    }
+
     for (i = 0; i < ics->nr_irqs; i++) {
         ICSIRQState *irq = &ics->irqs[i];
 
@@ -279,6 +294,11 @@ static int ics_set_kvm_state(ICSState *ics, int version_id)
     int i;
     Error *local_err = NULL;
 
+    /* The KVM XICS device is not in use */
+    if (kernel_xics_fd == -1) {
+        return 0;
+    }
+
     for (i = 0; i < ics->nr_irqs; i++) {
         ICSIRQState *irq = &ics->irqs[i];
         int ret;
@@ -325,6 +345,9 @@ void ics_kvm_set_irq(void *opaque, int srcno, int val)
     struct kvm_irq_level args;
     int rc;
 
+    /* The KVM XICS device should be in use */
+    assert(kernel_xics_fd != -1);
+
     args.irq = srcno + ics->offset;
     if (ics->irqs[srcno].flags & XICS_FLAGS_IRQ_MSI) {
         if (!val) {
-- 
2.20.1

^ permalink raw reply related	[flat|nested] 43+ messages in thread

* [Qemu-devel] [PATCH 12/13] spapr/xics: ignore the lower 4K in the IRQ number space
  2019-01-07 18:39 [Qemu-devel] [PATCH 00/13] spapr: add KVM support to the XIVE interrupt mode Cédric Le Goater
                   ` (10 preceding siblings ...)
  2019-01-07 18:39 ` [Qemu-devel] [PATCH 11/13] spapr: check for the activation of " Cédric Le Goater
@ 2019-01-07 18:39 ` Cédric Le Goater
  2019-02-12  1:06   ` David Gibson
  2019-01-07 18:39 ` [Qemu-devel] [PATCH 13/13] spapr: add KVM support to the 'dual' machine Cédric Le Goater
  12 siblings, 1 reply; 43+ messages in thread
From: Cédric Le Goater @ 2019-01-07 18:39 UTC (permalink / raw)
  To: David Gibson
  Cc: Benjamin Herrenschmidt, qemu-ppc, qemu-devel, Cédric Le Goater

The IRQ number spaces of the XIVE and XICS interrupt modes are aligned
when using the dual interrupt mode for the machine. This means that
the ICS offset is set to zero in QEMU and that the KVM XICS device
should be informed of this new value. Unfortunately, there is no way
to do so and KVM still maintains the XICS_IRQ_BASE (0x1000) offset.

Ignore the lower 4K which are not used under the XICS interrupt
mode. These IRQ numbers are only claimed by XIVE for the CPU IPIs.

Signed-off-by: Cédric Le Goater <clg@kaod.org>
---
 hw/intc/xics_kvm.c | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/hw/intc/xics_kvm.c b/hw/intc/xics_kvm.c
index 651bbfdf6966..1d21ff217b82 100644
--- a/hw/intc/xics_kvm.c
+++ b/hw/intc/xics_kvm.c
@@ -238,6 +238,15 @@ static void ics_get_kvm_state(ICSState *ics)
     for (i = 0; i < ics->nr_irqs; i++) {
         ICSIRQState *irq = &ics->irqs[i];
 
+        /*
+         * The KVM XICS device considers that the IRQ numbers should
+         * start at XICS_IRQ_BASE (0x1000). Ignore the lower 4K
+         * numbers (only claimed by XIVE for the CPU IPIs).
+         */
+        if (i + ics->offset < XICS_IRQ_BASE) {
+            continue;
+        }
+
         kvm_device_access(kernel_xics_fd, KVM_DEV_XICS_GRP_SOURCES,
                           i + ics->offset, &state, false, &error_fatal);
 
@@ -303,6 +312,15 @@ static int ics_set_kvm_state(ICSState *ics, int version_id)
         ICSIRQState *irq = &ics->irqs[i];
         int ret;
 
+        /*
+         * The KVM XICS device considers that the IRQ numbers should
+         * start at XICS_IRQ_BASE (0x1000). Ignore the lower 4K
+         * numbers (only claimed by XIVE for the CPU IPIs).
+         */
+        if (i + ics->offset < XICS_IRQ_BASE) {
+            continue;
+        }
+
         state = irq->server;
         state |= (uint64_t)(irq->saved_priority & KVM_XICS_PRIORITY_MASK)
             << KVM_XICS_PRIORITY_SHIFT;
-- 
2.20.1

^ permalink raw reply related	[flat|nested] 43+ messages in thread

* [Qemu-devel] [PATCH 13/13] spapr: add KVM support to the 'dual' machine
  2019-01-07 18:39 [Qemu-devel] [PATCH 00/13] spapr: add KVM support to the XIVE interrupt mode Cédric Le Goater
                   ` (11 preceding siblings ...)
  2019-01-07 18:39 ` [Qemu-devel] [PATCH 12/13] spapr/xics: ignore the lower 4K in the IRQ number space Cédric Le Goater
@ 2019-01-07 18:39 ` Cédric Le Goater
  2019-02-12  1:11   ` David Gibson
  12 siblings, 1 reply; 43+ messages in thread
From: Cédric Le Goater @ 2019-01-07 18:39 UTC (permalink / raw)
  To: David Gibson
  Cc: Benjamin Herrenschmidt, qemu-ppc, qemu-devel, Cédric Le Goater

The interrupt mode is chosen by the CAS negotiation process and
activated after a reset to take into account the required changes in
the machine. This brings new constraints on how the associated KVM IRQ
device is initialized.

Currently, each model takes care of the initialization of the KVM
device in their realize method but this is not possible anymore as the
initialization needs to be done globally when the interrupt mode is
known, i.e. when the machine is reset. It also means that we need a way
to delete a KVM device when another mode is chosen.

Also, to support migration, the QEMU objects holding the state to
transfer should always be available but not necessarily activated.

The overall approach of this proposal is to initialize both interrupt
modes at the QEMU level and keep the IRQ number space in sync to allow
switching from one mode to another. For the KVM side of things, the
whole initialization of the KVM device, sources and presenters, is
grouped in a single routine. The XICS and XIVE sPAPR IRQ reset
handlers are modified accordingly to handle the init and the delete
sequences of the KVM device.

As KVM is now initialized at reset, we lose the possibility to
fall back to the QEMU emulated mode in case of failure and failures
become fatal to the machine.

Signed-off-by: Cédric Le Goater <clg@kaod.org>
---
 hw/intc/spapr_xive.c     |  8 +---
 hw/intc/spapr_xive_kvm.c | 27 ++++++++++++++
 hw/intc/xics_kvm.c       | 25 +++++++++++++
 hw/intc/xive.c           |  4 --
 hw/ppc/spapr_irq.c       | 79 ++++++++++++++++++++++++++++------------
 5 files changed, 109 insertions(+), 34 deletions(-)

diff --git a/hw/intc/spapr_xive.c b/hw/intc/spapr_xive.c
index 21f3c1ef0901..0661aca35900 100644
--- a/hw/intc/spapr_xive.c
+++ b/hw/intc/spapr_xive.c
@@ -330,13 +330,7 @@ static void spapr_xive_realize(DeviceState *dev, Error **errp)
     xive->eat = g_new0(XiveEAS, xive->nr_irqs);
     xive->endt = g_new0(XiveEND, xive->nr_ends);
 
-    if (kvmppc_xive_enabled()) {
-        kvmppc_xive_connect(xive, &local_err);
-        if (local_err) {
-            error_propagate(errp, local_err);
-            return;
-        }
-    } else {
+    if (!kvmppc_xive_enabled()) {
         /* TIMA initialization */
         memory_region_init_io(&xive->tm_mmio, OBJECT(xive), &xive_tm_ops, xive,
                               "xive.tima", 4ull << TM_SHIFT);
diff --git a/hw/intc/spapr_xive_kvm.c b/hw/intc/spapr_xive_kvm.c
index d35814c1992e..3ebc947f2be7 100644
--- a/hw/intc/spapr_xive_kvm.c
+++ b/hw/intc/spapr_xive_kvm.c
@@ -737,6 +737,15 @@ void kvmppc_xive_connect(sPAPRXive *xive, Error **errp)
     Error *local_err = NULL;
     size_t esb_len;
     size_t tima_len;
+    CPUState *cs;
+
+    /*
+     * The KVM XIVE device already in use. This is the case when
+     * rebooting XIVE -> XIVE
+     */
+    if (xive->fd != -1) {
+        return;
+    }
 
     if (!kvm_enabled() || !kvmppc_has_cap_xive()) {
         error_setg(errp, "IRQ_XIVE capability must be present for KVM");
@@ -800,6 +809,24 @@ void kvmppc_xive_connect(sPAPRXive *xive, Error **errp)
     xive->change = qemu_add_vm_change_state_handler(
         kvmppc_xive_change_state_handler, xive);
 
+    /* Connect the presenters to the initial VCPUs of the machine */
+    CPU_FOREACH(cs) {
+        PowerPCCPU *cpu = POWERPC_CPU(cs);
+
+        kvmppc_xive_cpu_connect(cpu->tctx, &local_err);
+        if (local_err) {
+            error_propagate(errp, local_err);
+            return;
+        }
+    }
+
+    /* Update the KVM sources */
+    kvmppc_xive_source_reset(xsrc, &local_err);
+    if (local_err) {
+            error_propagate(errp, local_err);
+            return;
+    }
+
     kvm_kernel_irqchip = true;
     kvm_msi_via_irqfd_allowed = true;
     kvm_gsi_direct_mapping = true;
diff --git a/hw/intc/xics_kvm.c b/hw/intc/xics_kvm.c
index 1d21ff217b82..bfc35d71df7f 100644
--- a/hw/intc/xics_kvm.c
+++ b/hw/intc/xics_kvm.c
@@ -448,6 +448,16 @@ static void rtas_dummy(PowerPCCPU *cpu, sPAPRMachineState *spapr,
 int xics_kvm_init(sPAPRMachineState *spapr, Error **errp)
 {
     int rc;
+    CPUState *cs;
+    Error *local_err = NULL;
+
+    /*
+     * The KVM XICS device already in use. This is the case when
+     * rebooting XICS -> XICS
+     */
+    if (kernel_xics_fd != -1) {
+        return 0;
+    }
 
     if (!kvm_enabled() || !kvm_check_extension(kvm_state, KVM_CAP_IRQ_XICS)) {
         error_setg(errp,
@@ -496,6 +506,21 @@ int xics_kvm_init(sPAPRMachineState *spapr, Error **errp)
     kvm_msi_via_irqfd_allowed = true;
     kvm_gsi_direct_mapping = true;
 
+    /* Connect the presenters to the initial VCPUs of the machine */
+    CPU_FOREACH(cs) {
+        PowerPCCPU *cpu = POWERPC_CPU(cs);
+
+        icp_kvm_connect(cpu->icp, &local_err);
+        if (local_err) {
+            error_propagate(errp, local_err);
+            goto fail;
+        }
+        icp_set_kvm_state(cpu->icp, 1);
+    }
+
+    /* Update the KVM sources */
+    ics_set_kvm_state(ICS_KVM(spapr->ics), 1);
+
     return 0;
 
 fail:
diff --git a/hw/intc/xive.c b/hw/intc/xive.c
index c5c2fbc3f8bc..c166eab5b210 100644
--- a/hw/intc/xive.c
+++ b/hw/intc/xive.c
@@ -932,10 +932,6 @@ static void xive_source_reset(void *dev)
 
     /* PQs are initialized to 0b01 (Q=1) which corresponds to "ints off" */
     memset(xsrc->status, XIVE_ESB_OFF, xsrc->nr_irqs);
-
-    if (kvmppc_xive_enabled()) {
-        kvmppc_xive_source_reset(xsrc, &error_fatal);
-    }
 }
 
 static void xive_source_realize(DeviceState *dev, Error **errp)
diff --git a/hw/ppc/spapr_irq.c b/hw/ppc/spapr_irq.c
index ba27d9d8e972..5592eec3787b 100644
--- a/hw/ppc/spapr_irq.c
+++ b/hw/ppc/spapr_irq.c
@@ -98,20 +98,14 @@ static void spapr_irq_init_xics(sPAPRMachineState *spapr, Error **errp)
     int nr_irqs = spapr->irq->nr_irqs;
     Error *local_err = NULL;
 
-    if (kvm_enabled()) {
-        if (machine_kernel_irqchip_allowed(machine) &&
-            !xics_kvm_init(spapr, &local_err)) {
-            spapr->icp_type = TYPE_KVM_ICP;
-            spapr->ics = spapr_ics_create(spapr, TYPE_ICS_KVM, nr_irqs,
-                                          &local_err);
-        }
-        if (machine_kernel_irqchip_required(machine) && !spapr->ics) {
-            error_prepend(&local_err,
-                          "kernel_irqchip requested but unavailable: ");
-            goto error;
+    if (kvm_enabled() && machine_kernel_irqchip_allowed(machine)) {
+        spapr->icp_type = TYPE_KVM_ICP;
+        spapr->ics = spapr_ics_create(spapr, TYPE_ICS_KVM, nr_irqs,
+                                      &local_err);
+        if (local_err) {
+            error_propagate(errp, local_err);
+            return;
         }
-        error_free(local_err);
-        local_err = NULL;
     }
 
     if (!spapr->ics) {
@@ -119,10 +113,11 @@ static void spapr_irq_init_xics(sPAPRMachineState *spapr, Error **errp)
         spapr->icp_type = TYPE_ICP;
         spapr->ics = spapr_ics_create(spapr, TYPE_ICS_SIMPLE, nr_irqs,
                                       &local_err);
+        if (local_err) {
+            error_propagate(errp, local_err);
+            return;
+        }
     }
-
-error:
-    error_propagate(errp, local_err);
 }
 
 #define ICS_IRQ_FREE(ics, srcno)   \
@@ -233,7 +228,17 @@ static void spapr_irq_set_irq_xics(void *opaque, int srcno, int val)
 
 static void spapr_irq_reset_xics(sPAPRMachineState *spapr, Error **errp)
 {
-    /* TODO: create the KVM XICS device */
+    MachineState *machine = MACHINE(spapr);
+    Error *local_err = NULL;
+
+    if (kvm_enabled() && machine_kernel_irqchip_allowed(machine)) {
+        xics_kvm_init(spapr, &local_err);
+        if (local_err) {
+            error_propagate(errp, local_err);
+            error_prepend(errp, "KVM XICS connect failed: ");
+            return;
+        }
+    }
 }
 
 #define SPAPR_IRQ_XICS_NR_IRQS     0x1000
@@ -393,6 +398,7 @@ static int spapr_irq_post_load_xive(sPAPRMachineState *spapr, int version_id)
 static void spapr_irq_reset_xive(sPAPRMachineState *spapr, Error **errp)
 {
     CPUState *cs;
+    Error *local_err = NULL;
 
     CPU_FOREACH(cs) {
         PowerPCCPU *cpu = POWERPC_CPU(cs);
@@ -401,6 +407,15 @@ static void spapr_irq_reset_xive(sPAPRMachineState *spapr, Error **errp)
         spapr_xive_set_tctx_os_cam(cpu->tctx);
     }
 
+    if (kvmppc_xive_enabled()) {
+        kvmppc_xive_connect(spapr->xive, &local_err);
+        if (local_err) {
+            error_propagate(errp, local_err);
+            error_prepend(errp, "KVM XIVE connect failed: ");
+            return;
+        }
+    }
+
     /* Activate the XIVE MMIOs */
     spapr_xive_mmio_set_enabled(spapr->xive, true);
 }
@@ -462,14 +477,8 @@ static sPAPRIrq *spapr_irq_current(sPAPRMachineState *spapr)
 
 static void spapr_irq_init_dual(sPAPRMachineState *spapr, Error **errp)
 {
-    MachineState *machine = MACHINE(spapr);
     Error *local_err = NULL;
 
-    if (kvm_enabled() && machine_kernel_irqchip_allowed(machine)) {
-        error_setg(errp, "No KVM support for the 'dual' machine");
-        return;
-    }
-
     spapr_irq_xics.init(spapr, &local_err);
     if (local_err) {
         error_propagate(errp, local_err);
@@ -568,11 +577,16 @@ static void spapr_irq_cpu_intc_create_dual(sPAPRMachineState *spapr,
 
 static int spapr_irq_post_load_dual(sPAPRMachineState *spapr, int version_id)
 {
+    MachineState *machine = MACHINE(spapr);
+
     /*
      * Force a reset of the XIVE backend after migration. The machine
      * defaults to XICS at startup.
      */
     if (spapr_ovec_test(spapr->ov5_cas, OV5_XIVE_EXPLOIT)) {
+        if (kvm_enabled() && machine_kernel_irqchip_allowed(machine)) {
+            xics_kvm_disconnect(spapr, &error_fatal);
+        }
         spapr_irq_xive.reset(spapr, &error_fatal);
     }
 
@@ -581,12 +595,31 @@ static int spapr_irq_post_load_dual(sPAPRMachineState *spapr, int version_id)
 
 static void spapr_irq_reset_dual(sPAPRMachineState *spapr, Error **errp)
 {
+    MachineState *machine = MACHINE(spapr);
+    Error *local_err = NULL;
+
     /*
      * Deactivate the XIVE MMIOs. The XIVE backend will reenable them
      * if selected.
      */
     spapr_xive_mmio_set_enabled(spapr->xive, false);
 
+    /* Destroy all KVM devices */
+    if (kvm_enabled() && machine_kernel_irqchip_allowed(machine)) {
+        xics_kvm_disconnect(spapr, &local_err);
+        if (local_err) {
+            error_propagate(errp, local_err);
+            error_prepend(errp, "KVM XICS disconnect failed: ");
+            return;
+        }
+        kvmppc_xive_disconnect(spapr->xive, &local_err);
+        if (local_err) {
+            error_propagate(errp, local_err);
+            error_prepend(errp, "KVM XIVE disconnect failed: ");
+            return;
+        }
+    }
+
     spapr_irq_current(spapr)->reset(spapr, errp);
 }
 
-- 
2.20.1

^ permalink raw reply related	[flat|nested] 43+ messages in thread

* Re: [Qemu-devel] [PATCH 08/13] spapr/rtas: modify spapr_rtas_register() to remove RTAS handlers
  2019-01-07 18:39 ` [Qemu-devel] [PATCH 08/13] spapr/rtas: modify spapr_rtas_register() to remove RTAS handlers Cédric Le Goater
@ 2019-01-29  5:09   ` Alexey Kardashevskiy
  2019-01-29  7:20     ` Cédric Le Goater
  0 siblings, 1 reply; 43+ messages in thread
From: Alexey Kardashevskiy @ 2019-01-29  5:09 UTC (permalink / raw)
  To: Cédric Le Goater, David Gibson; +Cc: qemu-ppc, qemu-devel



On 08/01/2019 05:39, Cédric Le Goater wrote:
> Removing RTAS handlers will become necessary when the new pseries
> machine supporting multiple interrupt modes is introduced.
> 
> Signed-off-by: Cédric Le Goater <clg@kaod.org>
> ---
>  include/hw/ppc/spapr.h | 4 ++++
>  hw/ppc/spapr_rtas.c    | 2 +-
>  2 files changed, 5 insertions(+), 1 deletion(-)
> 
> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
> index 9e01a5a12e4a..9a6d015b9cf5 100644
> --- a/include/hw/ppc/spapr.h
> +++ b/include/hw/ppc/spapr.h
> @@ -657,6 +657,10 @@ typedef void (*spapr_rtas_fn)(PowerPCCPU *cpu, sPAPRMachineState *sm,
>                                uint32_t nargs, target_ulong args,
>                                uint32_t nret, target_ulong rets);
>  void spapr_rtas_register(int token, const char *name, spapr_rtas_fn fn);
> +static inline void spapr_rtas_unregister(int token)
> +{
> +    spapr_rtas_register(token, NULL, NULL);
> +}

The new helper is not used anywhere.


>  target_ulong spapr_rtas_call(PowerPCCPU *cpu, sPAPRMachineState *sm,
>                               uint32_t token, uint32_t nargs, target_ulong args,
>                               uint32_t nret, target_ulong rets);
> diff --git a/hw/ppc/spapr_rtas.c b/hw/ppc/spapr_rtas.c
> index d6a0952154ac..e005d5d08151 100644
> --- a/hw/ppc/spapr_rtas.c
> +++ b/hw/ppc/spapr_rtas.c
> @@ -404,7 +404,7 @@ void spapr_rtas_register(int token, const char *name, spapr_rtas_fn fn)
>  
>      token -= RTAS_TOKEN_BASE;
>  
> -    assert(!rtas_table[token].name);
> +    assert(!name || !rtas_table[token].name);


but allowing name==NULL is.


>  
>      rtas_table[token].name = name;
>      rtas_table[token].fn = fn;
> 

-- 
Alexey

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [Qemu-devel] [PATCH 08/13] spapr/rtas: modify spapr_rtas_register() to remove RTAS handlers
  2019-01-29  5:09   ` Alexey Kardashevskiy
@ 2019-01-29  7:20     ` Cédric Le Goater
  0 siblings, 0 replies; 43+ messages in thread
From: Cédric Le Goater @ 2019-01-29  7:20 UTC (permalink / raw)
  To: Alexey Kardashevskiy, David Gibson; +Cc: qemu-ppc, qemu-devel

On 1/29/19 6:09 AM, Alexey Kardashevskiy wrote:
> 
> 
> On 08/01/2019 05:39, Cédric Le Goater wrote:
>> Removing RTAS handlers will become necessary when the new pseries
>> machine supporting multiple interrupt modes is introduced.
>>
>> Signed-off-by: Cédric Le Goater <clg@kaod.org>
>> ---
>>  include/hw/ppc/spapr.h | 4 ++++
>>  hw/ppc/spapr_rtas.c    | 2 +-
>>  2 files changed, 5 insertions(+), 1 deletion(-)
>>
>> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
>> index 9e01a5a12e4a..9a6d015b9cf5 100644
>> --- a/include/hw/ppc/spapr.h
>> +++ b/include/hw/ppc/spapr.h
>> @@ -657,6 +657,10 @@ typedef void (*spapr_rtas_fn)(PowerPCCPU *cpu, sPAPRMachineState *sm,
>>                                uint32_t nargs, target_ulong args,
>>                                uint32_t nret, target_ulong rets);
>>  void spapr_rtas_register(int token, const char *name, spapr_rtas_fn fn);
>> +static inline void spapr_rtas_unregister(int token)
>> +{
>> +    spapr_rtas_register(token, NULL, NULL);
>> +}
> 
> The new helper is not used anywhere.

Ah yes. Patch 10 should be introducing the helper, for use in xics_kvm_disconnect().

I am fixing that.

Thanks,

C.

>>  target_ulong spapr_rtas_call(PowerPCCPU *cpu, sPAPRMachineState *sm,
>>                               uint32_t token, uint32_t nargs, target_ulong args,
>>                               uint32_t nret, target_ulong rets);
>> diff --git a/hw/ppc/spapr_rtas.c b/hw/ppc/spapr_rtas.c
>> index d6a0952154ac..e005d5d08151 100644
>> --- a/hw/ppc/spapr_rtas.c
>> +++ b/hw/ppc/spapr_rtas.c
>> @@ -404,7 +404,7 @@ void spapr_rtas_register(int token, const char *name, spapr_rtas_fn fn)
>>  
>>      token -= RTAS_TOKEN_BASE;
>>  
>> -    assert(!rtas_table[token].name);
>> +    assert(!name || !rtas_table[token].name);
> 
> 
> but allowing name==NULL is.
> 
> 
>>  
>>      rtas_table[token].name = name;
>>      rtas_table[token].fn = fn;
>>
> 

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [Qemu-devel] [PATCH 02/13] spapr/xive: add KVM support
  2019-01-07 18:39 ` [Qemu-devel] [PATCH 02/13] spapr/xive: add KVM support Cédric Le Goater
@ 2019-02-06  2:39   ` David Gibson
  0 siblings, 0 replies; 43+ messages in thread
From: David Gibson @ 2019-02-06  2:39 UTC (permalink / raw)
  To: Cédric Le Goater; +Cc: Benjamin Herrenschmidt, qemu-ppc, qemu-devel

[-- Attachment #1: Type: text/plain, Size: 20912 bytes --]

On Mon, Jan 07, 2019 at 07:39:35PM +0100, Cédric Le Goater wrote:
> This introduces a set of helpers, used when KVM is enabled, which create
> the KVM XIVE device, initialize the interrupt sources at the KVM level and
> connect the interrupt presenters to the vCPUs.
> 
> They also handle the initialization of the TIMA and the source ESB
> memory regions of the controller. These have a different type under
> KVM. They are 'ram device' memory mappings, similar to VFIO, which are
> exposed to the guest. The associated VMAs on the host are populated
> dynamically with the appropriate pages using a fault handler.
> 
> Signed-off-by: Cédric Le Goater <clg@kaod.org>

Looks fine apart from the details of how the KVM interface works.

> ---
>  default-configs/ppc64-softmmu.mak |   1 +
>  include/hw/ppc/spapr_xive.h       |  10 ++
>  include/hw/ppc/xive.h             |  22 +++
>  target/ppc/kvm_ppc.h              |   6 +
>  hw/intc/spapr_xive.c              |  31 ++--
>  hw/intc/spapr_xive_kvm.c          | 254 ++++++++++++++++++++++++++++++
>  hw/intc/xive.c                    |  22 ++-
>  hw/ppc/spapr_irq.c                |  11 +-
>  target/ppc/kvm.c                  |   7 +
>  hw/intc/Makefile.objs             |   1 +
>  10 files changed, 349 insertions(+), 16 deletions(-)
>  create mode 100644 hw/intc/spapr_xive_kvm.c
> 
> diff --git a/default-configs/ppc64-softmmu.mak b/default-configs/ppc64-softmmu.mak
> index 7f34ad0528ed..c1bf5cd951f5 100644
> --- a/default-configs/ppc64-softmmu.mak
> +++ b/default-configs/ppc64-softmmu.mak
> @@ -18,6 +18,7 @@ CONFIG_XICS_SPAPR=$(CONFIG_PSERIES)
>  CONFIG_XICS_KVM=$(call land,$(CONFIG_PSERIES),$(CONFIG_KVM))
>  CONFIG_XIVE=$(CONFIG_PSERIES)
>  CONFIG_XIVE_SPAPR=$(CONFIG_PSERIES)
> +CONFIG_XIVE_KVM=$(call land,$(CONFIG_PSERIES),$(CONFIG_KVM))
>  CONFIG_MEM_DEVICE=y
>  CONFIG_DIMM=y
>  CONFIG_SPAPR_RNG=y
> diff --git a/include/hw/ppc/spapr_xive.h b/include/hw/ppc/spapr_xive.h
> index 7fdc25057420..24a0be478039 100644
> --- a/include/hw/ppc/spapr_xive.h
> +++ b/include/hw/ppc/spapr_xive.h
> @@ -35,6 +35,10 @@ typedef struct sPAPRXive {
>      /* TIMA mapping address */
>      hwaddr        tm_base;
>      MemoryRegion  tm_mmio;
> +
> +    /* KVM support */
> +    int           fd;
> +    void          *tm_mmap;
>  } sPAPRXive;
>  
>  bool spapr_xive_irq_claim(sPAPRXive *xive, uint32_t lisn, bool lsi);
> @@ -48,5 +52,11 @@ void spapr_dt_xive(sPAPRMachineState *spapr, uint32_t nr_servers, void *fdt,
>                     uint32_t phandle);
>  void spapr_xive_set_tctx_os_cam(XiveTCTX *tctx);
>  void spapr_xive_mmio_set_enabled(sPAPRXive *xive, bool enable);
> +void spapr_xive_map_mmio(sPAPRXive *xive);
> +
> +/*
> + * KVM XIVE device helpers
> + */
> +void kvmppc_xive_connect(sPAPRXive *xive, Error **errp);
>  
>  #endif /* PPC_SPAPR_XIVE_H */
> diff --git a/include/hw/ppc/xive.h b/include/hw/ppc/xive.h
> index ec23253ba448..4bbba8d39a65 100644
> --- a/include/hw/ppc/xive.h
> +++ b/include/hw/ppc/xive.h
> @@ -140,6 +140,7 @@
>  #ifndef PPC_XIVE_H
>  #define PPC_XIVE_H
>  
> +#include "sysemu/kvm.h"
>  #include "hw/qdev-core.h"
>  #include "hw/sysbus.h"
>  #include "hw/ppc/xive_regs.h"
> @@ -194,6 +195,9 @@ typedef struct XiveSource {
>      uint32_t        esb_shift;
>      MemoryRegion    esb_mmio;
>  
> +    /* KVM support */
> +    void            *esb_mmap;
> +
>      XiveNotifier    *xive;
>  } XiveSource;
>  
> @@ -421,4 +425,22 @@ static inline uint32_t xive_nvt_cam_line(uint8_t nvt_blk, uint32_t nvt_idx)
>      return (nvt_blk << 19) | nvt_idx;
>  }
>  
> +/*
> + * KVM XIVE device helpers
> + */
> +
> +/* Keep inlined so that KVM code sections are discarded at compile time */
> +static inline bool kvmppc_xive_enabled(void)
> +{
> +    if (kvm_enabled()) {
> +        return machine_kernel_irqchip_allowed(MACHINE(qdev_get_machine()));
> +    } else {
> +        return false;
> +    }
> +}
> +
> +void kvmppc_xive_source_reset(XiveSource *xsrc, Error **errp);
> +void kvmppc_xive_source_set_irq(void *opaque, int srcno, int val);
> +void kvmppc_xive_cpu_connect(XiveTCTX *tctx, Error **errp);
> +
>  #endif /* PPC_XIVE_H */
> diff --git a/target/ppc/kvm_ppc.h b/target/ppc/kvm_ppc.h
> index bdfaa4e70a83..d2159660f9f2 100644
> --- a/target/ppc/kvm_ppc.h
> +++ b/target/ppc/kvm_ppc.h
> @@ -59,6 +59,7 @@ bool kvmppc_has_cap_fixup_hcalls(void);
>  bool kvmppc_has_cap_htm(void);
>  bool kvmppc_has_cap_mmu_radix(void);
>  bool kvmppc_has_cap_mmu_hash_v3(void);
> +bool kvmppc_has_cap_xive(void);
>  int kvmppc_get_cap_safe_cache(void);
>  int kvmppc_get_cap_safe_bounds_check(void);
>  int kvmppc_get_cap_safe_indirect_branch(void);
> @@ -307,6 +308,11 @@ static inline bool kvmppc_has_cap_mmu_hash_v3(void)
>      return false;
>  }
>  
> +static inline bool kvmppc_has_cap_xive(void)
> +{
> +    return false;
> +}
> +
>  static inline int kvmppc_get_cap_safe_cache(void)
>  {
>      return 0;
> diff --git a/hw/intc/spapr_xive.c b/hw/intc/spapr_xive.c
> index d391177ab81f..cf6d3a5f12e1 100644
> --- a/hw/intc/spapr_xive.c
> +++ b/hw/intc/spapr_xive.c
> @@ -172,7 +172,7 @@ void spapr_xive_pic_print_info(sPAPRXive *xive, Monitor *mon)
>      }
>  }
>  
> -static void spapr_xive_map_mmio(sPAPRXive *xive)
> +void spapr_xive_map_mmio(sPAPRXive *xive)
>  {
>      sysbus_mmio_map(SYS_BUS_DEVICE(xive), 0, xive->vc_base);
>      sysbus_mmio_map(SYS_BUS_DEVICE(xive), 1, xive->end_base);
> @@ -250,6 +250,9 @@ static void spapr_xive_instance_init(Object *obj)
>                        TYPE_XIVE_END_SOURCE);
>      object_property_add_child(obj, "end_source", OBJECT(&xive->end_source),
>                                NULL);
> +
> +    /* Not connected to the KVM XIVE device */
> +    xive->fd = -1;
>  }
>  
>  static void spapr_xive_realize(DeviceState *dev, Error **errp)
> @@ -304,17 +307,25 @@ static void spapr_xive_realize(DeviceState *dev, Error **errp)
>      xive->eat = g_new0(XiveEAS, xive->nr_irqs);
>      xive->endt = g_new0(XiveEND, xive->nr_ends);
>  
> -    /* TIMA initialization */
> -    memory_region_init_io(&xive->tm_mmio, OBJECT(xive), &xive_tm_ops, xive,
> -                          "xive.tima", 4ull << TM_SHIFT);
> +    if (kvmppc_xive_enabled()) {
> +        kvmppc_xive_connect(xive, &local_err);
> +        if (local_err) {
> +            error_propagate(errp, local_err);
> +            return;
> +        }
> +    } else {
> +        /* TIMA initialization */
> +        memory_region_init_io(&xive->tm_mmio, OBJECT(xive), &xive_tm_ops, xive,
> +                              "xive.tima", 4ull << TM_SHIFT);
>  
> -    /* Define all XIVE MMIO regions on SysBus */
> -    sysbus_init_mmio(SYS_BUS_DEVICE(xive), &xsrc->esb_mmio);
> -    sysbus_init_mmio(SYS_BUS_DEVICE(xive), &end_xsrc->esb_mmio);
> -    sysbus_init_mmio(SYS_BUS_DEVICE(xive), &xive->tm_mmio);
> +        /* Define all XIVE MMIO regions on SysBus */
> +        sysbus_init_mmio(SYS_BUS_DEVICE(xive), &xsrc->esb_mmio);
> +        sysbus_init_mmio(SYS_BUS_DEVICE(xive), &end_xsrc->esb_mmio);
> +        sysbus_init_mmio(SYS_BUS_DEVICE(xive), &xive->tm_mmio);
>  
> -    /* Map all regions */
> -    spapr_xive_map_mmio(xive);
> +        /* Map all regions */
> +        spapr_xive_map_mmio(xive);
> +    }
>  
>      qemu_register_reset(spapr_xive_reset, dev);
>  }
> diff --git a/hw/intc/spapr_xive_kvm.c b/hw/intc/spapr_xive_kvm.c
> new file mode 100644
> index 000000000000..f96c66fa419d
> --- /dev/null
> +++ b/hw/intc/spapr_xive_kvm.c
> @@ -0,0 +1,254 @@
> +/*
> + * QEMU PowerPC sPAPR XIVE interrupt controller model
> + *
> + * Copyright (c) 2017-2019, IBM Corporation.
> + *
> + * This code is licensed under the GPL version 2 or later. See the
> + * COPYING file in the top-level directory.
> + */
> +
> +#include "qemu/osdep.h"
> +#include "qemu/log.h"
> +#include "qemu/error-report.h"
> +#include "qapi/error.h"
> +#include "target/ppc/cpu.h"
> +#include "sysemu/cpus.h"
> +#include "sysemu/kvm.h"
> +#include "hw/ppc/spapr.h"
> +#include "hw/ppc/spapr_xive.h"
> +#include "hw/ppc/xive.h"
> +#include "kvm_ppc.h"
> +
> +#include <sys/ioctl.h>
> +
> +/*
> + * Helpers for CPU hotplug
> + *
> + * TODO: make a common KVMEnabledCPU layer for XICS and XIVE
> + */
> +typedef struct KVMEnabledCPU {
> +    unsigned long vcpu_id;
> +    QLIST_ENTRY(KVMEnabledCPU) node;
> +} KVMEnabledCPU;
> +
> +static QLIST_HEAD(, KVMEnabledCPU)
> +    kvm_enabled_cpus = QLIST_HEAD_INITIALIZER(&kvm_enabled_cpus);
> +
> +static bool kvm_cpu_is_enabled(CPUState *cs)
> +{
> +    KVMEnabledCPU *enabled_cpu;
> +    unsigned long vcpu_id = kvm_arch_vcpu_id(cs);
> +
> +    QLIST_FOREACH(enabled_cpu, &kvm_enabled_cpus, node) {
> +        if (enabled_cpu->vcpu_id == vcpu_id) {
> +            return true;
> +        }
> +    }
> +    return false;
> +}
> +
> +static void kvm_cpu_enable(CPUState *cs)
> +{
> +    KVMEnabledCPU *enabled_cpu;
> +    unsigned long vcpu_id = kvm_arch_vcpu_id(cs);
> +
> +    enabled_cpu = g_malloc(sizeof(*enabled_cpu));
> +    enabled_cpu->vcpu_id = vcpu_id;
> +    QLIST_INSERT_HEAD(&kvm_enabled_cpus, enabled_cpu, node);
> +}
> +
> +/*
> + * XIVE Thread Interrupt Management context (KVM)
> + */
> +
> +void kvmppc_xive_cpu_connect(XiveTCTX *tctx, Error **errp)
> +{
> +    sPAPRXive *xive = SPAPR_MACHINE(qdev_get_machine())->xive;
> +    unsigned long vcpu_id;
> +    int ret;
> +
> +    /* Check if CPU was hot unplugged and replugged. */
> +    if (kvm_cpu_is_enabled(tctx->cs)) {
> +        return;
> +    }
> +
> +    vcpu_id = kvm_arch_vcpu_id(tctx->cs);
> +
> +    ret = kvm_vcpu_enable_cap(tctx->cs, KVM_CAP_PPC_IRQ_XIVE, 0, xive->fd,
> +                              vcpu_id, 0);
> +    if (ret < 0) {
> +        error_setg(errp, "XIVE: unable to connect CPU%ld to KVM device: %s",
> +                   vcpu_id, strerror(errno));
> +        return;
> +    }
> +
> +    kvm_cpu_enable(tctx->cs);
> +}
> +
> +/*
> + * XIVE Interrupt Source (KVM)
> + */
> +
> +/*
> + * At reset, the interrupt sources are simply created and MASKED. We
> + * only need to inform the KVM XIVE device about their type: LSI or
> + * MSI.
> + */
> +void kvmppc_xive_source_reset(XiveSource *xsrc, Error **errp)
> +{
> +    sPAPRXive *xive = SPAPR_XIVE(xsrc->xive);
> +    int i;
> +
> +    for (i = 0; i < xsrc->nr_irqs; i++) {
> +        Error *local_err = NULL;
> +        uint64_t state = 0;
> +
> +        if (xive_source_irq_is_lsi(xsrc, i)) {
> +            state |= KVM_XIVE_LEVEL_SENSITIVE;
> +            if (xsrc->status[i] & XIVE_STATUS_ASSERTED) {
> +                state |= KVM_XIVE_LEVEL_ASSERTED;
> +            }
> +        }
> +
> +        kvm_device_access(xive->fd, KVM_DEV_XIVE_GRP_SOURCES, i, &state,
> +                          true, &local_err);
> +        if (local_err) {
> +            error_propagate(errp, local_err);
> +            return;
> +        }
> +    }
> +}
> +
> +void kvmppc_xive_source_set_irq(void *opaque, int srcno, int val)
> +{
> +    XiveSource *xsrc = opaque;
> +    struct kvm_irq_level args;
> +    int rc;
> +
> +    args.irq = srcno;
> +    if (!xive_source_irq_is_lsi(xsrc, srcno)) {
> +        if (!val) {
> +            return;
> +        }
> +        args.level = KVM_INTERRUPT_SET;
> +    } else {
> +        if (val) {
> +            xsrc->status[srcno] |= XIVE_STATUS_ASSERTED;
> +            args.level = KVM_INTERRUPT_SET_LEVEL;
> +        } else {
> +            xsrc->status[srcno] &= ~XIVE_STATUS_ASSERTED;
> +            args.level = KVM_INTERRUPT_UNSET;
> +        }
> +    }
> +    rc = kvm_vm_ioctl(kvm_state, KVM_IRQ_LINE, &args);
> +    if (rc < 0) {
> +        error_report("XIVE: kvm_irq_line() failed : %s", strerror(errno));
> +    }
> +}
> +
> +/*
> + * sPAPR XIVE interrupt controller (KVM)
> + */
> +
> +static void *kvmppc_xive_mmap(sPAPRXive *xive, int ctrl, size_t len,
> +                                 Error **errp)
> +{
> +    Error *local_err = NULL;
> +    void *addr;
> +    int fd;
> +
> +    kvm_device_access(xive->fd, KVM_DEV_XIVE_GRP_CTRL, ctrl, &fd, false,
> +                      &local_err);
> +    if (local_err) {
> +        error_propagate(errp, local_err);
> +        return NULL;
> +    }
> +
> +    addr = mmap(NULL, len, PROT_WRITE | PROT_READ, MAP_SHARED, fd, 0);
> +    close(fd);
> +    if (addr == MAP_FAILED) {
> +        error_setg_errno(errp, errno, "XIVE: unable to set memory mapping");
> +        return NULL;
> +    }
> +
> +    return addr;
> +}
> +
> +/*
> + * All the XIVE memory regions are now backed by mappings from the KVM
> + * XIVE device.
> + */
> +void kvmppc_xive_connect(sPAPRXive *xive, Error **errp)
> +{
> +    XiveSource *xsrc = &xive->source;
> +    XiveENDSource *end_xsrc = &xive->end_source;
> +    Error *local_err = NULL;
> +    size_t esb_len;
> +    size_t tima_len;
> +
> +    if (!kvm_enabled() || !kvmppc_has_cap_xive()) {
> +        error_setg(errp, "IRQ_XIVE capability must be present for KVM");
> +        return;
> +    }
> +
> +    /* First, create the KVM XIVE device */
> +    xive->fd = kvm_create_device(kvm_state, KVM_DEV_TYPE_XIVE, false);
> +    if (xive->fd < 0) {
> +        error_setg_errno(errp, -xive->fd, "XIVE: error creating KVM device");
> +        return;
> +    }
> +
> +    /*
> +     * Source ESBs KVM mapping
> +     *
> +     * Inform KVM where we will map the ESB pages. This is needed by
> +     * the H_INT_GET_SOURCE_INFO hcall which returns the source
> +     * characteristics, among which the ESB page address.
> +     */
> +    kvm_device_access(xive->fd, KVM_DEV_XIVE_GRP_CTRL, KVM_DEV_XIVE_VC_BASE,
> +                      &xive->vc_base, true, &local_err);
> +    if (local_err) {
> +        error_propagate(errp, local_err);
> +        return;
> +    }
> +
> +    esb_len = (1ull << xsrc->esb_shift) * xsrc->nr_irqs;
> +    xsrc->esb_mmap = kvmppc_xive_mmap(xive, KVM_DEV_XIVE_GET_ESB_FD,
> +                                      esb_len, &local_err);
> +    if (local_err) {
> +        error_propagate(errp, local_err);
> +        return;
> +    }
> +
> +    memory_region_init_ram_device_ptr(&xsrc->esb_mmio, OBJECT(xsrc),
> +                                      "xive.esb", esb_len, xsrc->esb_mmap);
> +    sysbus_init_mmio(SYS_BUS_DEVICE(xive), &xsrc->esb_mmio);
> +
> +    /* END ESBs mapping (No KVM) */
> +    sysbus_init_mmio(SYS_BUS_DEVICE(xive), &end_xsrc->esb_mmio);
> +
> +    /*
> +     * TIMA KVM mapping
> +     *
> +     * We could also inform KVM where the TIMA will be mapped but as
> +     * this is a fixed MMIO address for the system it does not seem
> +     * necessary to provide a KVM ioctl to change it.
> +     */
> +    tima_len = 4ull << TM_SHIFT;
> +    xive->tm_mmap = kvmppc_xive_mmap(xive, KVM_DEV_XIVE_GET_TIMA_FD,
> +                                     tima_len, &local_err);
> +    if (local_err) {
> +        error_propagate(errp, local_err);
> +        return;
> +    }
> +    memory_region_init_ram_device_ptr(&xive->tm_mmio, OBJECT(xive),
> +                                      "xive.tima", tima_len, xive->tm_mmap);
> +    sysbus_init_mmio(SYS_BUS_DEVICE(xive), &xive->tm_mmio);
> +
> +    kvm_kernel_irqchip = true;
> +    kvm_msi_via_irqfd_allowed = true;
> +    kvm_gsi_direct_mapping = true;
> +
> +    /* Map all regions */
> +    spapr_xive_map_mmio(xive);
> +}
> diff --git a/hw/intc/xive.c b/hw/intc/xive.c
> index a3cb0cf0e348..9a2d7be283f8 100644
> --- a/hw/intc/xive.c
> +++ b/hw/intc/xive.c
> @@ -15,6 +15,7 @@
>  #include "sysemu/dma.h"
>  #include "hw/qdev-properties.h"
>  #include "monitor/monitor.h"
> +#include "hw/boards.h"
>  #include "hw/ppc/xive.h"
>  #include "hw/ppc/xive_regs.h"
>  
> @@ -493,6 +494,15 @@ static void xive_tctx_realize(DeviceState *dev, Error **errp)
>          return;
>      }
>  
> +    /* Connect the presenter to the VCPU (required for CPU hotplug) */
> +    if (kvmppc_xive_enabled()) {
> +        kvmppc_xive_cpu_connect(tctx, &local_err);
> +        if (local_err) {
> +            error_propagate(errp, local_err);
> +            return;
> +        }
> +    }
> +
>      qemu_register_reset(xive_tctx_reset, dev);
>  }
>  
> @@ -895,6 +905,10 @@ static void xive_source_reset(void *dev)
>  
>      /* PQs are initialized to 0b01 (Q=1) which corresponds to "ints off" */
>      memset(xsrc->status, XIVE_ESB_OFF, xsrc->nr_irqs);
> +
> +    if (kvmppc_xive_enabled()) {
> +        kvmppc_xive_source_reset(xsrc, &error_fatal);
> +    }
>  }
>  
>  static void xive_source_realize(DeviceState *dev, Error **errp)
> @@ -928,9 +942,11 @@ static void xive_source_realize(DeviceState *dev, Error **errp)
>      xsrc->status = g_malloc0(xsrc->nr_irqs);
>      xsrc->lsi_map = bitmap_new(xsrc->nr_irqs);
>  
> -    memory_region_init_io(&xsrc->esb_mmio, OBJECT(xsrc),
> -                          &xive_source_esb_ops, xsrc, "xive.esb",
> -                          (1ull << xsrc->esb_shift) * xsrc->nr_irqs);
> +    if (!kvmppc_xive_enabled()) {
> +        memory_region_init_io(&xsrc->esb_mmio, OBJECT(xsrc),
> +                              &xive_source_esb_ops, xsrc, "xive.esb",
> +                              (1ull << xsrc->esb_shift) * xsrc->nr_irqs);
> +    }
>  
>      qemu_register_reset(xive_source_reset, dev);
>  }
> diff --git a/hw/ppc/spapr_irq.c b/hw/ppc/spapr_irq.c
> index 5fce72fe0f6c..afbdabfa6543 100644
> --- a/hw/ppc/spapr_irq.c
> +++ b/hw/ppc/spapr_irq.c
> @@ -15,6 +15,7 @@
>  #include "hw/ppc/spapr_xive.h"
>  #include "hw/ppc/xics.h"
>  #include "sysemu/kvm.h"
> +#include "kvm_ppc.h"
>  
>  #include "trace.h"
>  
> @@ -266,9 +267,9 @@ static void spapr_irq_init_xive(sPAPRMachineState *spapr, Error **errp)
>      DeviceState *dev;
>      int i;
>  
> -    /* KVM XIVE device not yet available */
>      if (kvm_enabled()) {
> -        if (machine_kernel_irqchip_required(machine)) {
> +        if (machine_kernel_irqchip_required(machine) &&
> +            !kvmppc_has_cap_xive()) {
>              error_setg(errp, "kernel_irqchip requested. no KVM XIVE support");
>              return;
>          }
> @@ -384,7 +385,11 @@ static void spapr_irq_set_irq_xive(void *opaque, int srcno, int val)
>  {
>      sPAPRMachineState *spapr = opaque;
>  
> -    xive_source_set_irq(&spapr->xive->source, srcno, val);
> +    if (kvmppc_xive_enabled()) {
> +        kvmppc_xive_source_set_irq(&spapr->xive->source, srcno, val);
> +    } else {
> +        xive_source_set_irq(&spapr->xive->source, srcno, val);
> +    }
>  }
>  
>  /*
> diff --git a/target/ppc/kvm.c b/target/ppc/kvm.c
> index ebbb48c42f25..88a470a73e7c 100644
> --- a/target/ppc/kvm.c
> +++ b/target/ppc/kvm.c
> @@ -86,6 +86,7 @@ static int cap_fixup_hcalls;
>  static int cap_htm;             /* Hardware transactional memory support */
>  static int cap_mmu_radix;
>  static int cap_mmu_hash_v3;
> +static int cap_xive;
>  static int cap_resize_hpt;
>  static int cap_ppc_pvr_compat;
>  static int cap_ppc_safe_cache;
> @@ -149,6 +150,7 @@ int kvm_arch_init(MachineState *ms, KVMState *s)
>      cap_htm = kvm_vm_check_extension(s, KVM_CAP_PPC_HTM);
>      cap_mmu_radix = kvm_vm_check_extension(s, KVM_CAP_PPC_MMU_RADIX);
>      cap_mmu_hash_v3 = kvm_vm_check_extension(s, KVM_CAP_PPC_MMU_HASH_V3);
> +    cap_xive = kvm_vm_check_extension(s, KVM_CAP_PPC_IRQ_XIVE);
>      cap_resize_hpt = kvm_vm_check_extension(s, KVM_CAP_SPAPR_RESIZE_HPT);
>      kvmppc_get_cpu_characteristics(s);
>      cap_ppc_nested_kvm_hv = kvm_vm_check_extension(s, KVM_CAP_PPC_NESTED_HV);
> @@ -2389,6 +2391,11 @@ static int parse_cap_ppc_safe_indirect_branch(struct kvm_ppc_cpu_char c)
>      return 0;
>  }
>  
> +bool kvmppc_has_cap_xive(void)
> +{
> +    return cap_xive;
> +}
> +
>  static void kvmppc_get_cpu_characteristics(KVMState *s)
>  {
>      struct kvm_ppc_cpu_char c;
> diff --git a/hw/intc/Makefile.objs b/hw/intc/Makefile.objs
> index 301a8e972d91..23126c199178 100644
> --- a/hw/intc/Makefile.objs
> +++ b/hw/intc/Makefile.objs
> @@ -39,6 +39,7 @@ obj-$(CONFIG_XICS_SPAPR) += xics_spapr.o
>  obj-$(CONFIG_XICS_KVM) += xics_kvm.o
>  obj-$(CONFIG_XIVE) += xive.o
>  obj-$(CONFIG_XIVE_SPAPR) += spapr_xive.o
> +obj-$(CONFIG_XIVE_KVM) += spapr_xive_kvm.o
>  obj-$(CONFIG_POWERNV) += xics_pnv.o
>  obj-$(CONFIG_ALLWINNER_A10_PIC) += allwinner-a10-pic.o
>  obj-$(CONFIG_S390_FLIC) += s390_flic.o

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [Qemu-devel] [PATCH 03/13] spapr/xive: add state synchronization with KVM
  2019-01-07 18:39 ` [Qemu-devel] [PATCH 03/13] spapr/xive: add state synchronization with KVM Cédric Le Goater
@ 2019-02-06  2:42   ` David Gibson
  0 siblings, 0 replies; 43+ messages in thread
From: David Gibson @ 2019-02-06  2:42 UTC (permalink / raw)
  To: Cédric Le Goater; +Cc: Benjamin Herrenschmidt, qemu-ppc, qemu-devel

[-- Attachment #1: Type: text/plain, Size: 14545 bytes --]

On Mon, Jan 07, 2019 at 07:39:36PM +0100, Cédric Le Goater wrote:
> This extends the KVM XIVE device backend with 'synchronize_state'
> methods used to retrieve the state from KVM. The HW state of the
> sources, the KVM device and the thread interrupt contexts are
> collected for use by the monitor and also for migration.
> 
> These get operations rely on their KVM counterpart in the host kernel
> which acts as a proxy for OPAL, the host firmware. The set operations
> will be added for migration support later.
> 
> Signed-off-by: Cédric Le Goater <clg@kaod.org>

Reviewed-by: David Gibson <david@gibson.dropbear.id.au>

> ---
>  include/hw/ppc/spapr_xive.h |   9 ++
>  include/hw/ppc/xive.h       |   1 +
>  hw/intc/spapr_xive.c        |  24 ++--
>  hw/intc/spapr_xive_kvm.c    | 223 ++++++++++++++++++++++++++++++++++++
>  hw/intc/xive.c              |  10 ++
>  5 files changed, 260 insertions(+), 7 deletions(-)
> 
> diff --git a/include/hw/ppc/spapr_xive.h b/include/hw/ppc/spapr_xive.h
> index 24a0be478039..02f2de20111c 100644
> --- a/include/hw/ppc/spapr_xive.h
> +++ b/include/hw/ppc/spapr_xive.h
> @@ -44,6 +44,14 @@ typedef struct sPAPRXive {
>  bool spapr_xive_irq_claim(sPAPRXive *xive, uint32_t lisn, bool lsi);
>  bool spapr_xive_irq_free(sPAPRXive *xive, uint32_t lisn);
>  void spapr_xive_pic_print_info(sPAPRXive *xive, Monitor *mon);
> +bool spapr_xive_priority_is_reserved(uint8_t priority);
> +
> +void spapr_xive_cpu_to_nvt(PowerPCCPU *cpu,
> +                           uint8_t *out_nvt_blk, uint32_t *out_nvt_idx);
> +void spapr_xive_cpu_to_end(PowerPCCPU *cpu, uint8_t prio,
> +                           uint8_t *out_end_blk, uint32_t *out_end_idx);
> +int spapr_xive_target_to_end(uint32_t target, uint8_t prio,
> +                             uint8_t *out_end_blk, uint32_t *out_end_idx);
>  
>  typedef struct sPAPRMachineState sPAPRMachineState;
>  
> @@ -58,5 +66,6 @@ void spapr_xive_map_mmio(sPAPRXive *xive);
>   * KVM XIVE device helpers
>   */
>  void kvmppc_xive_connect(sPAPRXive *xive, Error **errp);
> +void kvmppc_xive_synchronize_state(sPAPRXive *xive, Error **errp);
>  
>  #endif /* PPC_SPAPR_XIVE_H */
> diff --git a/include/hw/ppc/xive.h b/include/hw/ppc/xive.h
> index 4bbba8d39a65..2e48d75a22e0 100644
> --- a/include/hw/ppc/xive.h
> +++ b/include/hw/ppc/xive.h
> @@ -442,5 +442,6 @@ static inline bool kvmppc_xive_enabled(void)
>  void kvmppc_xive_source_reset(XiveSource *xsrc, Error **errp);
>  void kvmppc_xive_source_set_irq(void *opaque, int srcno, int val);
>  void kvmppc_xive_cpu_connect(XiveTCTX *tctx, Error **errp);
> +void kvmppc_xive_cpu_synchronize_state(XiveTCTX *tctx, Error **errp);
>  
>  #endif /* PPC_XIVE_H */
> diff --git a/hw/intc/spapr_xive.c b/hw/intc/spapr_xive.c
> index cf6d3a5f12e1..50dd66707968 100644
> --- a/hw/intc/spapr_xive.c
> +++ b/hw/intc/spapr_xive.c
> @@ -54,8 +54,8 @@ static uint32_t spapr_xive_nvt_to_target(uint8_t nvt_blk, uint32_t nvt_idx)
>      return nvt_idx - SPAPR_XIVE_NVT_BASE;
>  }
>  
> -static void spapr_xive_cpu_to_nvt(PowerPCCPU *cpu,
> -                                  uint8_t *out_nvt_blk, uint32_t *out_nvt_idx)
> +void spapr_xive_cpu_to_nvt(PowerPCCPU *cpu,
> +                           uint8_t *out_nvt_blk, uint32_t *out_nvt_idx)
>  {
>      assert(cpu);
>  
> @@ -85,8 +85,8 @@ static int spapr_xive_target_to_nvt(uint32_t target,
>   * sPAPR END indexing uses a simple mapping of the CPU vcpu_id, 8
>   * priorities per CPU
>   */
> -static void spapr_xive_cpu_to_end(PowerPCCPU *cpu, uint8_t prio,
> -                                  uint8_t *out_end_blk, uint32_t *out_end_idx)
> +void spapr_xive_cpu_to_end(PowerPCCPU *cpu, uint8_t prio,
> +                           uint8_t *out_end_blk, uint32_t *out_end_idx)
>  {
>      assert(cpu);
>  
> @@ -99,8 +99,8 @@ static void spapr_xive_cpu_to_end(PowerPCCPU *cpu, uint8_t prio,
>      }
>  }
>  
> -static int spapr_xive_target_to_end(uint32_t target, uint8_t prio,
> -                                    uint8_t *out_end_blk, uint32_t *out_end_idx)
> +int spapr_xive_target_to_end(uint32_t target, uint8_t prio,
> +                             uint8_t *out_end_blk, uint32_t *out_end_idx)
>  {
>      PowerPCCPU *cpu = spapr_find_cpu(target);
>  
> @@ -139,6 +139,16 @@ void spapr_xive_pic_print_info(sPAPRXive *xive, Monitor *mon)
>      XiveSource *xsrc = &xive->source;
>      int i;
>  
> +    if (kvmppc_xive_enabled()) {
> +        Error *local_err = NULL;
> +
> +        kvmppc_xive_synchronize_state(xive, &local_err);
> +        if (local_err) {
> +            error_report_err(local_err);
> +            return;
> +        }
> +    }
> +
>      monitor_printf(mon, "  LSIN         PQ    EISN     CPU/PRIO EQ\n");
>  
>      for (i = 0; i < xive->nr_irqs; i++) {
> @@ -529,7 +539,7 @@ bool spapr_xive_irq_free(sPAPRXive *xive, uint32_t lisn)
>   * interrupts (DD2.X POWER9). So we only allow the guest to use
>   * priorities [0..6].
>   */
> -static bool spapr_xive_priority_is_reserved(uint8_t priority)
> +bool spapr_xive_priority_is_reserved(uint8_t priority)
>  {
>      switch (priority) {
>      case 0 ... 6:
> diff --git a/hw/intc/spapr_xive_kvm.c b/hw/intc/spapr_xive_kvm.c
> index f96c66fa419d..f52bddc92a2a 100644
> --- a/hw/intc/spapr_xive_kvm.c
> +++ b/hw/intc/spapr_xive_kvm.c
> @@ -60,6 +60,57 @@ static void kvm_cpu_enable(CPUState *cs)
>  /*
>   * XIVE Thread Interrupt Management context (KVM)
>   */
> +static void kvmppc_xive_cpu_get_state(XiveTCTX *tctx, Error **errp)
> +{
> +    uint64_t state[4] = { 0 };
> +    int ret;
> +
> +    ret = kvm_get_one_reg(tctx->cs, KVM_REG_PPC_NVT_STATE, state);
> +    if (ret != 0) {
> +        error_setg_errno(errp, errno,
> +                         "XIVE: could not capture KVM state of CPU %ld",
> +                         kvm_arch_vcpu_id(tctx->cs));
> +        return;
> +    }
> +
> +    /* word0 and word1 of the OS ring. */
> +    *((uint64_t *) &tctx->regs[TM_QW1_OS]) = state[0];
> +
> +    /*
> +     * KVM also returns word2 containing the OS CAM line which is
> +     * interesting to print out in the QEMU monitor.
> +     */
> +    *((uint64_t *) &tctx->regs[TM_QW1_OS + TM_WORD2]) = state[1];
> +}
> +
> +typedef struct {
> +    XiveTCTX *tctx;
> +    Error *err;
> +} XiveCpuGetState;
> +
> +static void kvmppc_xive_cpu_do_synchronize_state(CPUState *cpu,
> +                                                 run_on_cpu_data arg)
> +{
> +    XiveCpuGetState *s = arg.host_ptr;
> +
> +    kvmppc_xive_cpu_get_state(s->tctx, &s->err);
> +}
> +
> +void kvmppc_xive_cpu_synchronize_state(XiveTCTX *tctx, Error **errp)
> +{
> +    XiveCpuGetState s = {
> +        .tctx = tctx,
> +        .err = NULL,
> +    };
> +
> +    run_on_cpu(tctx->cs, kvmppc_xive_cpu_do_synchronize_state,
> +               RUN_ON_CPU_HOST_PTR(&s));
> +
> +    if (s.err) {
> +        error_propagate(errp, s.err);
> +        return;
> +    }
> +}
>  
>  void kvmppc_xive_cpu_connect(XiveTCTX *tctx, Error **errp)
>  {
> @@ -119,6 +170,34 @@ void kvmppc_xive_source_reset(XiveSource *xsrc, Error **errp)
>      }
>  }
>  
> +/*
> + * This is used to perform the magic loads on the ESB pages, described
> + * in xive.h.
> + */
> +static uint8_t xive_esb_read(XiveSource *xsrc, int srcno, uint32_t offset)
> +{
> +    unsigned long addr = (unsigned long) xsrc->esb_mmap +
> +        xive_source_esb_mgmt(xsrc, srcno) + offset;
> +
> +    /* Prevent the compiler from optimizing away the load */
> +    volatile uint64_t value = *((uint64_t *) addr);
> +
> +    return be64_to_cpu(value) & 0x3;
> +}
> +
> +static void kvmppc_xive_source_get_state(XiveSource *xsrc)
> +{
> +    int i;
> +
> +    for (i = 0; i < xsrc->nr_irqs; i++) {
> +        /* Perform a load without side effect to retrieve the PQ bits */
> +        uint8_t pq = xive_esb_read(xsrc, i, XIVE_ESB_GET);
> +
> +        /* and save PQ locally */
> +        xive_source_esb_set(xsrc, i, pq);
> +    }
> +}
> +
>  void kvmppc_xive_source_set_irq(void *opaque, int srcno, int val)
>  {
>      XiveSource *xsrc = opaque;
> @@ -149,6 +228,150 @@ void kvmppc_xive_source_set_irq(void *opaque, int srcno, int val)
>  /*
>   * sPAPR XIVE interrupt controller (KVM)
>   */
> +static int kvmppc_xive_get_eq_state(sPAPRXive *xive, CPUState *cs, Error **errp)
> +{
> +    unsigned long vcpu_id = kvm_arch_vcpu_id(cs);
> +    int ret;
> +    int i;
> +
> +    for (i = 0; i < XIVE_PRIORITY_MAX + 1; i++) {
> +        Error *local_err = NULL;
> +        struct kvm_ppc_xive_eq kvm_eq = { 0 };
> +        uint64_t kvm_eq_idx;
> +        XiveEND end = { 0 };
> +        uint8_t end_blk, nvt_blk;
> +        uint32_t end_idx, nvt_idx;
> +
> +        /* Skip priorities reserved for the hypervisor */
> +        if (spapr_xive_priority_is_reserved(i)) {
> +            continue;
> +        }
> +
> +        /* Encode the tuple (server, prio) as a KVM EQ index */
> +        kvm_eq_idx = i << KVM_XIVE_EQ_PRIORITY_SHIFT &
> +            KVM_XIVE_EQ_PRIORITY_MASK;
> +        kvm_eq_idx |= vcpu_id << KVM_XIVE_EQ_SERVER_SHIFT &
> +            KVM_XIVE_EQ_SERVER_MASK;
> +
> +        ret = kvm_device_access(xive->fd, KVM_DEV_XIVE_GRP_EQ, kvm_eq_idx,
> +                                &kvm_eq, false, &local_err);
> +        if (local_err) {
> +            error_propagate(errp, local_err);
> +            return ret;
> +        }
> +
> +        if (!(kvm_eq.flags & KVM_XIVE_EQ_FLAG_ENABLED)) {
> +            continue;
> +        }
> +
> +        /* Update the local END structure with the KVM input */
> +        if (kvm_eq.flags & KVM_XIVE_EQ_FLAG_ENABLED) {
> +            end.w0 |= cpu_to_be32(END_W0_VALID | END_W0_ENQUEUE);
> +        }
> +        if (kvm_eq.flags & KVM_XIVE_EQ_FLAG_ALWAYS_NOTIFY) {
> +            end.w0 |= cpu_to_be32(END_W0_UCOND_NOTIFY);
> +        }
> +        if (kvm_eq.flags & KVM_XIVE_EQ_FLAG_ESCALATE) {
> +            end.w0 |= cpu_to_be32(END_W0_ESCALATE_CTL);
> +        }
> +        end.w0 |= xive_set_field32(END_W0_QSIZE, 0ul, kvm_eq.qsize - 12);
> +
> +        end.w1 = xive_set_field32(END_W1_GENERATION, 0ul, kvm_eq.qtoggle) |
> +            xive_set_field32(END_W1_PAGE_OFF, 0ul, kvm_eq.qindex);
> +        end.w2 = cpu_to_be32((kvm_eq.qpage >> 32) & 0x0fffffff);
> +        end.w3 = cpu_to_be32(kvm_eq.qpage & 0xffffffff);
> +        end.w4 = 0;
> +        end.w5 = 0;
> +
> +        spapr_xive_cpu_to_nvt(POWERPC_CPU(cs), &nvt_blk, &nvt_idx);
> +
> +        end.w6 = xive_set_field32(END_W6_NVT_BLOCK, 0ul, nvt_blk) |
> +            xive_set_field32(END_W6_NVT_INDEX, 0ul, nvt_idx);
> +        end.w7 = xive_set_field32(END_W7_F0_PRIORITY, 0ul, i);
> +
> +        spapr_xive_cpu_to_end(POWERPC_CPU(cs), i, &end_blk, &end_idx);
> +
> +        assert(end_idx < xive->nr_ends);
> +        memcpy(&xive->endt[end_idx], &end, sizeof(XiveEND));
> +    }
> +
> +    return 0;
> +}
> +
> +static void kvmppc_xive_get_eas_state(sPAPRXive *xive, Error **errp)
> +{
> +    XiveSource *xsrc = &xive->source;
> +    int i;
> +
> +    for (i = 0; i < xsrc->nr_irqs; i++) {
> +        XiveEAS *eas = &xive->eat[i];
> +        XiveEAS new_eas;
> +        uint64_t kvm_eas;
> +        uint8_t priority;
> +        uint32_t server;
> +        uint32_t end_idx;
> +        uint8_t end_blk;
> +        uint32_t eisn;
> +        Error *local_err = NULL;
> +
> +        if (!xive_eas_is_valid(eas)) {
> +            continue;
> +        }
> +
> +        kvm_device_access(xive->fd, KVM_DEV_XIVE_GRP_EAS, i, &kvm_eas, false,
> +                          &local_err);
> +        if (local_err) {
> +            error_propagate(errp, local_err);
> +            return;
> +        }
> +
> +        priority = (kvm_eas & KVM_XIVE_EAS_PRIORITY_MASK) >>
> +            KVM_XIVE_EAS_PRIORITY_SHIFT;
> +        server = (kvm_eas & KVM_XIVE_EAS_SERVER_MASK) >>
> +            KVM_XIVE_EAS_SERVER_SHIFT;
> +        eisn = (kvm_eas & KVM_XIVE_EAS_EISN_MASK) >> KVM_XIVE_EAS_EISN_SHIFT;
> +
> +        if (spapr_xive_target_to_end(server, priority, &end_blk, &end_idx)) {
> +            error_setg(errp, "XIVE: invalid tuple CPU %d priority %d", server,
> +                       priority);
> +            return;
> +        }
> +
> +        new_eas.w = cpu_to_be64(EAS_VALID);
> +        if (kvm_eas & KVM_XIVE_EAS_MASK_MASK) {
> +            new_eas.w |= cpu_to_be64(EAS_MASKED);
> +        }
> +
> +        new_eas.w = xive_set_field64(EAS_END_INDEX, new_eas.w, end_idx);
> +        new_eas.w = xive_set_field64(EAS_END_BLOCK, new_eas.w, end_blk);
> +        new_eas.w = xive_set_field64(EAS_END_DATA, new_eas.w, eisn);
> +
> +        *eas = new_eas;
> +    }
> +}
> +
> +void kvmppc_xive_synchronize_state(sPAPRXive *xive, Error **errp)
> +{
> +    XiveSource *xsrc = &xive->source;
> +    CPUState *cs;
> +    Error *local_err = NULL;
> +
> +    kvmppc_xive_source_get_state(xsrc);
> +
> +    kvmppc_xive_get_eas_state(xive, &local_err);
> +    if (local_err) {
> +        error_propagate(errp, local_err);
> +        return;
> +    }
> +
> +    CPU_FOREACH(cs) {
> +        kvmppc_xive_get_eq_state(xive, cs, &local_err);
> +        if (local_err) {
> +            error_propagate(errp, local_err);
> +            return;
> +        }
> +    }
> +}
>  
>  static void *kvmppc_xive_mmap(sPAPRXive *xive, int ctrl, size_t len,
>                                   Error **errp)
> diff --git a/hw/intc/xive.c b/hw/intc/xive.c
> index 9a2d7be283f8..596c29d8c826 100644
> --- a/hw/intc/xive.c
> +++ b/hw/intc/xive.c
> @@ -434,6 +434,16 @@ void xive_tctx_pic_print_info(XiveTCTX *tctx, Monitor *mon)
>      int cpu_index = tctx->cs ? tctx->cs->cpu_index : -1;
>      int i;
>  
> +    if (kvmppc_xive_enabled()) {
> +        Error *local_err = NULL;
> +
> +        kvmppc_xive_cpu_synchronize_state(tctx, &local_err);
> +        if (local_err) {
> +            error_report_err(local_err);
> +            return;
> +        }
> +    }
> +
>      monitor_printf(mon, "CPU[%04x]:   QW   NSR CPPR IPB LSMFB ACK# INC AGE PIPR"
>                     "  W2\n", cpu_index);
>  

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [Qemu-devel] [PATCH 04/13] spapr/xive: introduce a VM state change handler
  2019-01-07 18:39 ` [Qemu-devel] [PATCH 04/13] spapr/xive: introduce a VM state change handler Cédric Le Goater
@ 2019-02-06  2:49   ` David Gibson
  0 siblings, 0 replies; 43+ messages in thread
From: David Gibson @ 2019-02-06  2:49 UTC (permalink / raw)
  To: Cédric Le Goater; +Cc: Benjamin Herrenschmidt, qemu-ppc, qemu-devel

[-- Attachment #1: Type: text/plain, Size: 6595 bytes --]

On Mon, Jan 07, 2019 at 07:39:37PM +0100, Cédric Le Goater wrote:
> This handler is in charge of stabilizing the flow of event notifications
> in the XIVE controller before migrating a guest. This is a requirement
> before transferring the guest EQ pages to a destination.
> 
> When the VM is stopped, the handler masks the sources (PQ=01) to stop
> the flow of events and saves their previous state. The XIVE controller
> is then synced through KVM to flush any in-flight event notification
> and to stabilize the EQs. At this stage, the EQ pages are marked dirty
> to make sure the EQ pages are transferred if a migration sequence is
> in progress.
> 
> The previous configuration of the sources is restored when the VM
> resumes, after a migration or a stop.
> 
> Signed-off-by: Cédric Le Goater <clg@kaod.org>

Looks fine apart from interface details discussed on the kernel series.

> ---
>  include/hw/ppc/spapr_xive.h |   1 +
>  hw/intc/spapr_xive_kvm.c    | 111 +++++++++++++++++++++++++++++++++++-
>  2 files changed, 111 insertions(+), 1 deletion(-)
> 
> diff --git a/include/hw/ppc/spapr_xive.h b/include/hw/ppc/spapr_xive.h
> index 02f2de20111c..8815ed5aa372 100644
> --- a/include/hw/ppc/spapr_xive.h
> +++ b/include/hw/ppc/spapr_xive.h
> @@ -39,6 +39,7 @@ typedef struct sPAPRXive {
>      /* KVM support */
>      int           fd;
>      void          *tm_mmap;
> +    VMChangeStateEntry *change;
>  } sPAPRXive;
>  
>  bool spapr_xive_irq_claim(sPAPRXive *xive, uint32_t lisn, bool lsi);
> diff --git a/hw/intc/spapr_xive_kvm.c b/hw/intc/spapr_xive_kvm.c
> index f52bddc92a2a..c7639ffe7758 100644
> --- a/hw/intc/spapr_xive_kvm.c
> +++ b/hw/intc/spapr_xive_kvm.c
> @@ -350,13 +350,119 @@ static void kvmppc_xive_get_eas_state(sPAPRXive *xive, Error **errp)
>      }
>  }
>  
> +/*
> + * Sync the XIVE controller through KVM to flush any in-flight event
> + * notification and stabilize the EQs.
> + */
> + static void kvmppc_xive_sync_all(sPAPRXive *xive, Error **errp)
> +{
> +    XiveSource *xsrc = &xive->source;
> +    Error *local_err = NULL;
> +    int i;
> +
> +    /* Sync the KVM source. This reaches the XIVE HW through OPAL */
> +    for (i = 0; i < xsrc->nr_irqs; i++) {
> +        XiveEAS *eas = &xive->eat[i];
> +
> +        if (!xive_eas_is_valid(eas)) {
> +            continue;
> +        }
> +
> +        kvm_device_access(xive->fd, KVM_DEV_XIVE_GRP_SYNC, i, NULL, true,
> +                          &local_err);
> +        if (local_err) {
> +            error_propagate(errp, local_err);
> +            return;
> +        }
> +    }
> +}
> +
> +/*
> + * The primary goal of the XIVE VM change handler is to mark the EQ
> + * pages dirty when all XIVE event notifications have stopped.
> + *
> + * Whenever the VM is stopped, the VM change handler masks the sources
> + * (PQ=01) to stop the flow of events and saves the previous state in
> + * anticipation of a migration. The XIVE controller is then synced
> + * through KVM to flush any in-flight event notification and stabilize
> + * the EQs.
> + *
> + * At this stage, we can mark the EQ page dirty and let a migration
> + * sequence transfer the EQ pages to the destination, which is done
> + * just after the stop state.
> + *
> + * The previous configuration of the sources is restored when the VM
> + * runs again.
> + */
> +static void kvmppc_xive_change_state_handler(void *opaque, int running,
> +                                             RunState state)
> +{
> +    sPAPRXive *xive = opaque;
> +    XiveSource *xsrc = &xive->source;
> +    Error *local_err = NULL;
> +    int i;
> +
> +    /*
> +     * Restore the sources to their initial state. This is called when
> +     * the VM resumes after a stop or a migration.
> +     */
> +    if (running) {
> +        for (i = 0; i < xsrc->nr_irqs; i++) {
> +            uint8_t pq = xive_source_esb_get(xsrc, i);
> +            if (xive_esb_read(xsrc, i, XIVE_ESB_SET_PQ_00 + (pq << 8)) != 0x1) {
> +                error_report("XIVE: IRQ %d has an invalid state", i);
> +            }
> +        }
> +
> +        return;
> +    }
> +
> +    /*
> +     * Mask the sources, to stop the flow of event notifications, and
> +     * save the PQs locally in the XiveSource object. The XiveSource
> +     * state will be collected later on by its vmstate handler if a
> +     * migration is in progress.
> +     */
> +    for (i = 0; i < xsrc->nr_irqs; i++) {
> +        uint8_t pq = xive_esb_read(xsrc, i, XIVE_ESB_SET_PQ_01);
> +        xive_source_esb_set(xsrc, i, pq);
> +    }
> +
> +    /*
> +     * Sync the XIVE controller in KVM, to flush in-flight event
> +     * notification that should be enqueued in the EQs.
> +     */
> +    kvmppc_xive_sync_all(xive, &local_err);
> +    if (local_err) {
> +        error_report_err(local_err);
> +        return;
> +    }
> +
> +    /*
> +     * Mark the XIVE EQ pages dirty to collect all updates.
> +     */
> +    kvm_device_access(xive->fd, KVM_DEV_XIVE_GRP_CTRL,
> +                      KVM_DEV_XIVE_SAVE_EQ_PAGES, NULL, true, &local_err);
> +    if (local_err) {
> +        error_report_err(local_err);
> +    }
> +}
> +
>  void kvmppc_xive_synchronize_state(sPAPRXive *xive, Error **errp)
>  {
>      XiveSource *xsrc = &xive->source;
>      CPUState *cs;
>      Error *local_err = NULL;
>  
> -    kvmppc_xive_source_get_state(xsrc);
> +    /*
> +     * When the VM is stopped, the sources are masked and the previous
> +     * state is saved in anticipation of a migration. We should not
> +     * synchronize the source state in that case else we will override
> +     * the saved state.
> +     */
> +    if (runstate_is_running()) {
> +        kvmppc_xive_source_get_state(xsrc);
> +    }
>  
>      kvmppc_xive_get_eas_state(xive, &local_err);
>      if (local_err) {
> @@ -468,6 +574,9 @@ void kvmppc_xive_connect(sPAPRXive *xive, Error **errp)
>                                        "xive.tima", tima_len, xive->tm_mmap);
>      sysbus_init_mmio(SYS_BUS_DEVICE(xive), &xive->tm_mmio);
>  
> +    xive->change = qemu_add_vm_change_state_handler(
> +        kvmppc_xive_change_state_handler, xive);
> +
>      kvm_kernel_irqchip = true;
>      kvm_msi_via_irqfd_allowed = true;
>      kvm_gsi_direct_mapping = true;

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [Qemu-devel] [PATCH 05/13] spapr/xive: add migration support for KVM
  2019-01-07 18:39 ` [Qemu-devel] [PATCH 05/13] spapr/xive: add migration support for KVM Cédric Le Goater
@ 2019-02-07  3:41   ` David Gibson
  0 siblings, 0 replies; 43+ messages in thread
From: David Gibson @ 2019-02-07  3:41 UTC (permalink / raw)
  To: Cédric Le Goater; +Cc: Benjamin Herrenschmidt, qemu-ppc, qemu-devel

[-- Attachment #1: Type: text/plain, Size: 14156 bytes --]

On Mon, Jan 07, 2019 at 07:39:38PM +0100, Cédric Le Goater wrote:
> When the VM is stopped, the VM state handler stabilizes the XIVE IC
> and marks the EQ pages dirty. These are then transferred to the
> destination before the transfer of the device vmstates starts.
> 
> The sPAPRXive interrupt controller model captures the XIVE internal
> tables, EAT and ENDT and the XiveTCTX model does the same for the
> thread interrupt context registers.
> 
> At restart, the sPAPRXive 'post_load' method restores all the XIVE
> states. It is called by the sPAPR machine 'post_load' method, when all
> XIVE states have been transferred and loaded.
> 
> Finally, the source states are restored in the VM change state handler
> when the machine reaches the running state.
> 
> Signed-off-by: Cédric Le Goater <clg@kaod.org>

Looks fine modulo possible changes in the KVM interface.

> ---
>  include/hw/ppc/spapr_xive.h |   5 +
>  include/hw/ppc/xive.h       |   1 +
>  hw/intc/spapr_xive.c        |  34 +++++++
>  hw/intc/spapr_xive_kvm.c    | 187 +++++++++++++++++++++++++++++++++++-
>  hw/intc/xive.c              |  17 ++++
>  hw/ppc/spapr_irq.c          |   2 +-
>  6 files changed, 244 insertions(+), 2 deletions(-)
> 
> diff --git a/include/hw/ppc/spapr_xive.h b/include/hw/ppc/spapr_xive.h
> index 8815ed5aa372..52804516e909 100644
> --- a/include/hw/ppc/spapr_xive.h
> +++ b/include/hw/ppc/spapr_xive.h
> @@ -46,6 +46,7 @@ bool spapr_xive_irq_claim(sPAPRXive *xive, uint32_t lisn, bool lsi);
>  bool spapr_xive_irq_free(sPAPRXive *xive, uint32_t lisn);
>  void spapr_xive_pic_print_info(sPAPRXive *xive, Monitor *mon);
>  bool spapr_xive_priority_is_reserved(uint8_t priority);
> +int spapr_xive_post_load(sPAPRXive *xive, int version_id);
>  
>  void spapr_xive_cpu_to_nvt(PowerPCCPU *cpu,
>                             uint8_t *out_nvt_blk, uint32_t *out_nvt_idx);
> @@ -53,6 +54,8 @@ void spapr_xive_cpu_to_end(PowerPCCPU *cpu, uint8_t prio,
>                             uint8_t *out_end_blk, uint32_t *out_end_idx);
>  int spapr_xive_target_to_end(uint32_t target, uint8_t prio,
>                               uint8_t *out_end_blk, uint32_t *out_end_idx);
> +int spapr_xive_end_to_target(uint8_t end_blk, uint32_t end_idx,
> +                             uint32_t *out_server, uint8_t *out_prio);
>  
>  typedef struct sPAPRMachineState sPAPRMachineState;
>  
> @@ -68,5 +71,7 @@ void spapr_xive_map_mmio(sPAPRXive *xive);
>   */
>  void kvmppc_xive_connect(sPAPRXive *xive, Error **errp);
>  void kvmppc_xive_synchronize_state(sPAPRXive *xive, Error **errp);
> +int kvmppc_xive_pre_save(sPAPRXive *xive);
> +int kvmppc_xive_post_load(sPAPRXive *xive, int version_id);
>  
>  #endif /* PPC_SPAPR_XIVE_H */
> diff --git a/include/hw/ppc/xive.h b/include/hw/ppc/xive.h
> index 2e48d75a22e0..8aa314f93ffd 100644
> --- a/include/hw/ppc/xive.h
> +++ b/include/hw/ppc/xive.h
> @@ -443,5 +443,6 @@ void kvmppc_xive_source_reset(XiveSource *xsrc, Error **errp);
>  void kvmppc_xive_source_set_irq(void *opaque, int srcno, int val);
>  void kvmppc_xive_cpu_connect(XiveTCTX *tctx, Error **errp);
>  void kvmppc_xive_cpu_synchronize_state(XiveTCTX *tctx, Error **errp);
> +void kvmppc_xive_cpu_get_state(XiveTCTX *tctx, Error **errp);
>  
>  #endif /* PPC_XIVE_H */
> diff --git a/hw/intc/spapr_xive.c b/hw/intc/spapr_xive.c
> index 50dd66707968..21f3c1ef0901 100644
> --- a/hw/intc/spapr_xive.c
> +++ b/hw/intc/spapr_xive.c
> @@ -85,6 +85,19 @@ static int spapr_xive_target_to_nvt(uint32_t target,
>   * sPAPR END indexing uses a simple mapping of the CPU vcpu_id, 8
>   * priorities per CPU
>   */
> +int spapr_xive_end_to_target(uint8_t end_blk, uint32_t end_idx,
> +                             uint32_t *out_server, uint8_t *out_prio)
> +{
> +    if (out_server) {
> +        *out_server = end_idx >> 3;
> +    }
> +
> +    if (out_prio) {
> +        *out_prio = end_idx & 0x7;
> +    }
> +    return 0;
> +}
> +
>  void spapr_xive_cpu_to_end(PowerPCCPU *cpu, uint8_t prio,
>                             uint8_t *out_end_blk, uint32_t *out_end_idx)
>  {
> @@ -438,10 +451,31 @@ static const VMStateDescription vmstate_spapr_xive_eas = {
>      },
>  };
>  
> +static int vmstate_spapr_xive_pre_save(void *opaque)
> +{
> +    if (kvmppc_xive_enabled()) {
> +        return kvmppc_xive_pre_save(SPAPR_XIVE(opaque));
> +    }
> +
> +    return 0;
> +}
> +
> +/* Called by the sPAPR machine 'post_load' method */
> +int spapr_xive_post_load(sPAPRXive *xive, int version_id)
> +{
> +    if (kvmppc_xive_enabled()) {
> +        return kvmppc_xive_post_load(xive, version_id);
> +    }
> +
> +    return 0;
> +}
> +
>  static const VMStateDescription vmstate_spapr_xive = {
>      .name = TYPE_SPAPR_XIVE,
>      .version_id = 1,
>      .minimum_version_id = 1,
> +    .pre_save = vmstate_spapr_xive_pre_save,
> +    .post_load = NULL, /* handled at the machine level */
>      .fields = (VMStateField[]) {
>          VMSTATE_UINT32_EQUAL(nr_irqs, sPAPRXive, NULL),
>          VMSTATE_STRUCT_VARRAY_POINTER_UINT32(eat, sPAPRXive, nr_irqs,
> diff --git a/hw/intc/spapr_xive_kvm.c b/hw/intc/spapr_xive_kvm.c
> index c7639ffe7758..fe58a9ee32d3 100644
> --- a/hw/intc/spapr_xive_kvm.c
> +++ b/hw/intc/spapr_xive_kvm.c
> @@ -60,7 +60,30 @@ static void kvm_cpu_enable(CPUState *cs)
>  /*
>   * XIVE Thread Interrupt Management context (KVM)
>   */
> -static void kvmppc_xive_cpu_get_state(XiveTCTX *tctx, Error **errp)
> +
> +static void kvmppc_xive_cpu_set_state(XiveTCTX *tctx, Error **errp)
> +{
> +    uint64_t state[4];
> +    int ret;
> +
> +    /* word0 and word1 of the OS ring. */
> +    state[0] = *((uint64_t *) &tctx->regs[TM_QW1_OS]);
> +
> +    /*
> +     * OS CAM line. Used by KVM to print out the VP identifier. This
> +     * is for debug only.
> +     */
> +    state[1] = *((uint64_t *) &tctx->regs[TM_QW1_OS + TM_WORD2]);
> +
> +    ret = kvm_set_one_reg(tctx->cs, KVM_REG_PPC_NVT_STATE, state);
> +    if (ret != 0) {
> +        error_setg_errno(errp, errno,
> +                         "XIVE: could not restore KVM state of CPU %ld",
> +                         kvm_arch_vcpu_id(tctx->cs));
> +    }
> +}
> +
> +void kvmppc_xive_cpu_get_state(XiveTCTX *tctx, Error **errp)
>  {
>      uint64_t state[4] = { 0 };
>      int ret;
> @@ -228,6 +251,58 @@ void kvmppc_xive_source_set_irq(void *opaque, int srcno, int val)
>  /*
>   * sPAPR XIVE interrupt controller (KVM)
>   */
> +static int kvmppc_xive_set_eq_state(sPAPRXive *xive, CPUState *cs, Error **errp)
> +{
> +    unsigned long vcpu_id = kvm_arch_vcpu_id(cs);
> +    int ret;
> +    int i;
> +
> +    for (i = 0; i < XIVE_PRIORITY_MAX + 1; i++) {
> +        Error *local_err = NULL;
> +        XiveEND *end;
> +        uint8_t end_blk;
> +        uint32_t end_idx;
> +        struct kvm_ppc_xive_eq kvm_eq = { 0 };
> +        uint64_t kvm_eq_idx;
> +
> +        if (spapr_xive_priority_is_reserved(i)) {
> +            continue;
> +        }
> +
> +        spapr_xive_cpu_to_end(POWERPC_CPU(cs), i, &end_blk, &end_idx);
> +
> +        assert(end_idx < xive->nr_ends);
> +        end = &xive->endt[end_idx];
> +
> +        if (!xive_end_is_valid(end)) {
> +            continue;
> +        }
> +
> +        /* Build the KVM state from the local END structure */
> +        kvm_eq.flags   = KVM_XIVE_EQ_FLAG_ALWAYS_NOTIFY;
> +        kvm_eq.qsize   = xive_get_field32(END_W0_QSIZE, end->w0) + 12;
> +        kvm_eq.qpage   = (uint64_t) be32_to_cpu(end->w2 & 0x0fffffff) << 32 |
> +            be32_to_cpu(end->w3);
> +        kvm_eq.qtoggle = xive_get_field32(END_W1_GENERATION, end->w1);
> +        kvm_eq.qindex  = xive_get_field32(END_W1_PAGE_OFF, end->w1);
> +
> +        /* Encode the tuple (server, prio) as a KVM EQ index */
> +        kvm_eq_idx = i << KVM_XIVE_EQ_PRIORITY_SHIFT &
> +            KVM_XIVE_EQ_PRIORITY_MASK;
> +        kvm_eq_idx |= vcpu_id << KVM_XIVE_EQ_SERVER_SHIFT &
> +            KVM_XIVE_EQ_SERVER_MASK;
> +
> +        ret = kvm_device_access(xive->fd, KVM_DEV_XIVE_GRP_EQ, kvm_eq_idx,
> +                                &kvm_eq, true, &local_err);
> +        if (local_err) {
> +            error_propagate(errp, local_err);
> +            return ret;
> +        }
> +    }
> +
> +    return 0;
> +}
> +
>  static int kvmppc_xive_get_eq_state(sPAPRXive *xive, CPUState *cs, Error **errp)
>  {
>      unsigned long vcpu_id = kvm_arch_vcpu_id(cs);
> @@ -298,6 +373,48 @@ static int kvmppc_xive_get_eq_state(sPAPRXive *xive, CPUState *cs, Error **errp)
>      return 0;
>  }
>  
> +static void kvmppc_xive_set_eas_state(sPAPRXive *xive, Error **errp)
> +{
> +    XiveSource *xsrc = &xive->source;
> +    int i;
> +
> +    for (i = 0; i < xsrc->nr_irqs; i++) {
> +        XiveEAS *eas = &xive->eat[i];
> +        uint32_t end_idx;
> +        uint32_t end_blk;
> +        uint32_t eisn;
> +        uint8_t priority;
> +        uint32_t server;
> +        uint64_t kvm_eas;
> +        Error *local_err = NULL;
> +
> +        /* No need to set MASKED EAS, this is the default state after reset */
> +        if (!xive_eas_is_valid(eas) || xive_eas_is_masked(eas)) {
> +            continue;
> +        }
> +
> +        end_idx = xive_get_field64(EAS_END_INDEX, eas->w);
> +        end_blk = xive_get_field64(EAS_END_BLOCK, eas->w);
> +        eisn = xive_get_field64(EAS_END_DATA, eas->w);
> +
> +        spapr_xive_end_to_target(end_blk, end_idx, &server, &priority);
> +
> +        kvm_eas = priority << KVM_XIVE_EAS_PRIORITY_SHIFT &
> +            KVM_XIVE_EAS_PRIORITY_MASK;
> +        kvm_eas |= server << KVM_XIVE_EAS_SERVER_SHIFT &
> +            KVM_XIVE_EAS_SERVER_MASK;
> +        kvm_eas |= ((uint64_t)eisn << KVM_XIVE_EAS_EISN_SHIFT) &
> +            KVM_XIVE_EAS_EISN_MASK;
> +
> +        kvm_device_access(xive->fd, KVM_DEV_XIVE_GRP_EAS, i, &kvm_eas, true,
> +                          &local_err);
> +        if (local_err) {
> +            error_propagate(errp, local_err);
> +            return;
> +        }
> +    }
> +}
> +
>  static void kvmppc_xive_get_eas_state(sPAPRXive *xive, Error **errp)
>  {
>      XiveSource *xsrc = &xive->source;
> @@ -448,6 +565,74 @@ static void kvmppc_xive_change_state_handler(void *opaque, int running,
>      }
>  }
>  
> +int kvmppc_xive_pre_save(sPAPRXive *xive)
> +{
> +    Error *local_err = NULL;
> +    CPUState *cs;
> +
> +    /* Grab the EAT */
> +    kvmppc_xive_get_eas_state(xive, &local_err);
> +    if (local_err) {
> +        error_report_err(local_err);
> +        return -1;
> +    }
> +
> +    /*
> +     * Grab the ENDT. The EQ index and the toggle bit are what we want
> +     * to capture.
> +     */
> +    CPU_FOREACH(cs) {
> +        kvmppc_xive_get_eq_state(xive, cs, &local_err);
> +        if (local_err) {
> +            error_report_err(local_err);
> +            return -1;
> +        }
> +    }
> +
> +    return 0;
> +}
> +
> +/*
> + * The sPAPRXive 'post_load' method is called by the sPAPR machine
> + * 'post_load' method, when all XIVE states have been transferred and
> + * loaded.
> + */
> +int kvmppc_xive_post_load(sPAPRXive *xive, int version_id)
> +{
> +    Error *local_err = NULL;
> +    CPUState *cs;
> +
> +    /* Restore the ENDT first. The targetting depends on it. */
> +    CPU_FOREACH(cs) {
> +        kvmppc_xive_set_eq_state(xive, cs, &local_err);
> +        if (local_err) {
> +            error_report_err(local_err);
> +            return -1;
> +        }
> +    }
> +
> +    /* Restore the EAT */
> +    kvmppc_xive_set_eas_state(xive, &local_err);
> +    if (local_err) {
> +        error_report_err(local_err);
> +        return -1;
> +    }
> +
> +    /* Restore the thread interrupt contexts */
> +    CPU_FOREACH(cs) {
> +        PowerPCCPU *cpu = POWERPC_CPU(cs);
> +
> +        kvmppc_xive_cpu_set_state(cpu->tctx, &local_err);
> +        if (local_err) {
> +            error_report_err(local_err);
> +            return -1;
> +        }
> +    }
> +
> +    /* The source states will be restored when the machine starts running */
> +    return 0;
> +}
> +
>  void kvmppc_xive_synchronize_state(sPAPRXive *xive, Error **errp)
>  {
>      XiveSource *xsrc = &xive->source;
> diff --git a/hw/intc/xive.c b/hw/intc/xive.c
> index 596c29d8c826..c5c2fbc3f8bc 100644
> --- a/hw/intc/xive.c
> +++ b/hw/intc/xive.c
> @@ -521,10 +521,27 @@ static void xive_tctx_unrealize(DeviceState *dev, Error **errp)
>      qemu_unregister_reset(xive_tctx_reset, dev);
>  }
>  
> +static int vmstate_xive_tctx_pre_save(void *opaque)
> +{
> +    Error *local_err = NULL;
> +
> +    if (kvmppc_xive_enabled()) {
> +        kvmppc_xive_cpu_get_state(XIVE_TCTX(opaque), &local_err);
> +        if (local_err) {
> +            error_report_err(local_err);
> +            return -1;
> +        }
> +    }
> +
> +    return 0;
> +}
> +
>  static const VMStateDescription vmstate_xive_tctx = {
>      .name = TYPE_XIVE_TCTX,
>      .version_id = 1,
>      .minimum_version_id = 1,
> +    .pre_save = vmstate_xive_tctx_pre_save,
> +    .post_load = NULL, /* handled by the sPAPRxive model */
>      .fields = (VMStateField[]) {
>          VMSTATE_BUFFER(regs, XiveTCTX),
>          VMSTATE_END_OF_LIST()
> diff --git a/hw/ppc/spapr_irq.c b/hw/ppc/spapr_irq.c
> index afbdabfa6543..233c97c5ecd9 100644
> --- a/hw/ppc/spapr_irq.c
> +++ b/hw/ppc/spapr_irq.c
> @@ -363,7 +363,7 @@ static void spapr_irq_cpu_intc_create_xive(sPAPRMachineState *spapr,
>  
>  static int spapr_irq_post_load_xive(sPAPRMachineState *spapr, int version_id)
>  {
> -    return 0;
> +    return spapr_xive_post_load(spapr->xive, version_id);
>  }
>  
>  static void spapr_irq_reset_xive(sPAPRMachineState *spapr, Error **errp)

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [Qemu-devel] [PATCH 06/13] spapr/xive: fix migration of the XiveTCTX under TCG
  2019-01-07 18:39 ` [Qemu-devel] [PATCH 06/13] spapr/xive: fix migration of the XiveTCTX under TCG Cédric Le Goater
@ 2019-02-08  5:36   ` David Gibson
  2019-02-08  7:12     ` Cédric Le Goater
  0 siblings, 1 reply; 43+ messages in thread
From: David Gibson @ 2019-02-08  5:36 UTC (permalink / raw)
  To: Cédric Le Goater; +Cc: Benjamin Herrenschmidt, qemu-ppc, qemu-devel

[-- Attachment #1: Type: text/plain, Size: 2327 bytes --]

On Mon, Jan 07, 2019 at 07:39:39PM +0100, Cédric Le Goater wrote:
> When the thread interrupt management state is retrieved from the KVM
> VCPU, word2 is saved under the QEMU XIVE thread context to print out
> the OS CAM line under the QEMU monitor.
> 
> This breaks the migration of a TCG guest (and with KVM when
> kernel_irqchip=off) because the matching algorithm of the presenter
> relies on the OS CAM value. Fix with an extra reset of the thread
> contexts to restore the expected value.
> 
> Signed-off-by: Cédric Le Goater <clg@kaod.org>

Why is the CAM value you get from KVM different from the one you
expect in qemu?

> ---
>  hw/ppc/spapr_irq.c | 26 +++++++++++++++++++++++++-
>  1 file changed, 25 insertions(+), 1 deletion(-)
> 
> diff --git a/hw/ppc/spapr_irq.c b/hw/ppc/spapr_irq.c
> index 233c97c5ecd9..ba27d9d8e972 100644
> --- a/hw/ppc/spapr_irq.c
> +++ b/hw/ppc/spapr_irq.c
> @@ -363,7 +363,31 @@ static void spapr_irq_cpu_intc_create_xive(sPAPRMachineState *spapr,
>  
>  static int spapr_irq_post_load_xive(sPAPRMachineState *spapr, int version_id)
>  {
> -    return spapr_xive_post_load(spapr->xive, version_id);
> +    CPUState *cs;
> +    int ret;
> +
> +    ret = spapr_xive_post_load(spapr->xive, version_id);
> +    if (ret) {
> +        return ret;
> +    }
> +
> +    /*
> +     * When the states are collected from the KVM XIVE device, word2
> +     * of the XiveTCTX is set to print out the OS CAM line under the
> +     * QEMU monitor.
> +     *
> +     * This breaks the migration on a TCG guest (or on KVM with
> +     * kernel_irqchip=off) because the matching algorithm of the
> +     * presenter relies on the OS CAM value. Fix with an extra reset
> +     * of the thread contexts to restore the expected value.
> +     */
> +    CPU_FOREACH(cs) {
> +        PowerPCCPU *cpu = POWERPC_CPU(cs);
> +
> +        /* (TCG) Set the OS CAM line of the thread interrupt context. */
> +        spapr_xive_set_tctx_os_cam(cpu->tctx);
> +    }
> +    return 0;
>  }
>  
>  static void spapr_irq_reset_xive(sPAPRMachineState *spapr, Error **errp)

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [Qemu-devel] [PATCH 06/13] spapr/xive: fix migration of the XiveTCTX under TCG
  2019-02-08  5:36   ` David Gibson
@ 2019-02-08  7:12     ` Cédric Le Goater
  2019-02-12  0:22       ` David Gibson
  0 siblings, 1 reply; 43+ messages in thread
From: Cédric Le Goater @ 2019-02-08  7:12 UTC (permalink / raw)
  To: David Gibson; +Cc: Benjamin Herrenschmidt, qemu-ppc, qemu-devel

On 2/8/19 6:36 AM, David Gibson wrote:
> On Mon, Jan 07, 2019 at 07:39:39PM +0100, Cédric Le Goater wrote:
>> When the thread interrupt management state is retrieved from the KVM
>> VCPU, word2 is saved under the QEMU XIVE thread context to print out
>> the OS CAM line under the QEMU monitor.
>>
>> This breaks the migration of a TCG guest (and with KVM when
>> kernel_irqchip=off) because the matching algorithm of the presenter
>> relies on the OS CAM value. Fix with an extra reset of the thread
>> contexts to restore the expected value.
>>
>> Signed-off-by: Cédric Le Goater <clg@kaod.org>
> 
> Why is the CAM value you get from KVM different from the one you
> expect in qemu?

An NVT base identifier is allocated for each VM at the OPAL level
and each vCPU gets an increment of this value. It is pushed in the 
OS CAM line when the vCPU is scheduled to run.
 
KVM XIVE names this identifier a VP id. 

The QEMU emulation of XIVE uses a fixed value for the NVT base 
identifier.

C.

 
>> ---
>>  hw/ppc/spapr_irq.c | 26 +++++++++++++++++++++++++-
>>  1 file changed, 25 insertions(+), 1 deletion(-)
>>
>> diff --git a/hw/ppc/spapr_irq.c b/hw/ppc/spapr_irq.c
>> index 233c97c5ecd9..ba27d9d8e972 100644
>> --- a/hw/ppc/spapr_irq.c
>> +++ b/hw/ppc/spapr_irq.c
>> @@ -363,7 +363,31 @@ static void spapr_irq_cpu_intc_create_xive(sPAPRMachineState *spapr,
>>  
>>  static int spapr_irq_post_load_xive(sPAPRMachineState *spapr, int version_id)
>>  {
>> -    return spapr_xive_post_load(spapr->xive, version_id);
>> +    CPUState *cs;
>> +    int ret;
>> +
>> +    ret = spapr_xive_post_load(spapr->xive, version_id);
>> +    if (ret) {
>> +        return ret;
>> +    }
>> +
>> +    /*
>> +     * When the states are collected from the KVM XIVE device, word2
>> +     * of the XiveTCTX is set to print out the OS CAM line under the
>> +     * QEMU monitor.
>> +     *
>> +     * This breaks the migration on a TCG guest (or on KVM with
>> +     * kernel_irqchip=off) because the matching algorithm of the
>> +     * presenter relies on the OS CAM value. Fix with an extra reset
>> +     * of the thread contexts to restore the expected value.
>> +     */
>> +    CPU_FOREACH(cs) {
>> +        PowerPCCPU *cpu = POWERPC_CPU(cs);
>> +
>> +        /* (TCG) Set the OS CAM line of the thread interrupt context. */
>> +        spapr_xive_set_tctx_os_cam(cpu->tctx);
>> +    }
>> +    return 0;
>>  }
>>  
>>  static void spapr_irq_reset_xive(sPAPRMachineState *spapr, Error **errp)
> 

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [Qemu-devel] [PATCH 06/13] spapr/xive: fix migration of the XiveTCTX under TCG
  2019-02-08  7:12     ` Cédric Le Goater
@ 2019-02-12  0:22       ` David Gibson
  2019-02-12  6:58         ` Cédric Le Goater
  0 siblings, 1 reply; 43+ messages in thread
From: David Gibson @ 2019-02-12  0:22 UTC (permalink / raw)
  To: Cédric Le Goater; +Cc: Benjamin Herrenschmidt, qemu-ppc, qemu-devel

[-- Attachment #1: Type: text/plain, Size: 1583 bytes --]

On Fri, Feb 08, 2019 at 08:12:12AM +0100, Cédric Le Goater wrote:
> On 2/8/19 6:36 AM, David Gibson wrote:
> > On Mon, Jan 07, 2019 at 07:39:39PM +0100, Cédric Le Goater wrote:
> >> When the thread interrupt management state is retrieved from the KVM
> >> VCPU, word2 is saved under the QEMU XIVE thread context to print out
> >> the OS CAM line under the QEMU monitor.
> >>
> >> This breaks the migration of a TCG guest (and with KVM when
> >> kernel_irqchip=off) because the matching algorithm of the presenter
> >> relies on the OS CAM value. Fix with an extra reset of the thread
> >> contexts to restore the expected value.
> >>
> >> Signed-off-by: Cédric Le Goater <clg@kaod.org>
> > 
> > Why is the CAM value you get from KVM different from the one you
> > expect in qemu?
> 
> An NVT base identifier is allocated for each VM at the OPAL level
> > and each vCPU gets an increment of this value. It is pushed in the 
> OS CAM line when the vCPU is scheduled to run.
>  
> KVM XIVE names this identifier a VP id. 
> 
> The QEMU emulation of XIVE uses a fixed value for the NVT base 
> identifier.

Ah, I see.

Hmm.  Really this highlights why I'm uneasy migrating the whole TCTX
as just a blob of registers, even though only some of them are really
runtime state, and others are machine configuration that can be worked
out separately at the two ends.

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [Qemu-devel] [PATCH 10/13] spapr: introduce routines to delete the KVM IRQ device
  2019-01-07 18:39 ` [Qemu-devel] [PATCH 10/13] spapr: introduce routines to delete the KVM IRQ device Cédric Le Goater
@ 2019-02-12  0:58   ` David Gibson
  0 siblings, 0 replies; 43+ messages in thread
From: David Gibson @ 2019-02-12  0:58 UTC (permalink / raw)
  To: Cédric Le Goater; +Cc: Benjamin Herrenschmidt, qemu-ppc, qemu-devel

[-- Attachment #1: Type: text/plain, Size: 7136 bytes --]

On Mon, Jan 07, 2019 at 07:39:43PM +0100, Cédric Le Goater wrote:
> If a new interrupt mode is chosen by CAS, the machine generates a
> reset to reconfigure. At this point, the connection with the previous
> KVM device needs to be closed and a new connection needs to be opened
> with the KVM device operating the chosen interrupt mode.
> 
> New routines are introduced to destroy the XICS and the XIVE KVM
> devices. They make use of a new KVM device ioctl which destroys the
> device and also disconnects the IRQ presenters from the vCPUs.
> 
> Signed-off-by: Cédric Le Goater <clg@kaod.org>

Looks reasonable.

> ---
>  include/hw/ppc/spapr_xive.h |  1 +
>  include/hw/ppc/xics.h       |  1 +
>  hw/intc/spapr_xive_kvm.c    | 60 +++++++++++++++++++++++++++++++++++++
>  hw/intc/xics_kvm.c          | 57 +++++++++++++++++++++++++++++++++++
>  4 files changed, 119 insertions(+)
> 
> diff --git a/include/hw/ppc/spapr_xive.h b/include/hw/ppc/spapr_xive.h
> index 52804516e909..f172fc20b650 100644
> --- a/include/hw/ppc/spapr_xive.h
> +++ b/include/hw/ppc/spapr_xive.h
> @@ -70,6 +70,7 @@ void spapr_xive_map_mmio(sPAPRXive *xive);
>   * KVM XIVE device helpers
>   */
>  void kvmppc_xive_connect(sPAPRXive *xive, Error **errp);
> +void kvmppc_xive_disconnect(sPAPRXive *xive, Error **errp);
>  void kvmppc_xive_synchronize_state(sPAPRXive *xive, Error **errp);
>  int kvmppc_xive_pre_save(sPAPRXive *xive);
>  int kvmppc_xive_post_load(sPAPRXive *xive, int version_id);
> diff --git a/include/hw/ppc/xics.h b/include/hw/ppc/xics.h
> index 07508cbd217e..75d4effb5c5f 100644
> --- a/include/hw/ppc/xics.h
> +++ b/include/hw/ppc/xics.h
> @@ -205,6 +205,7 @@ typedef struct sPAPRMachineState sPAPRMachineState;
>  void spapr_dt_xics(sPAPRMachineState *spapr, uint32_t nr_servers, void *fdt,
>                     uint32_t phandle);
>  int xics_kvm_init(sPAPRMachineState *spapr, Error **errp);
> +int xics_kvm_disconnect(sPAPRMachineState *spapr, Error **errp);
>  void xics_spapr_init(sPAPRMachineState *spapr);
>  
>  Object *icp_create(Object *cpu, const char *type, XICSFabric *xi,
> diff --git a/hw/intc/spapr_xive_kvm.c b/hw/intc/spapr_xive_kvm.c
> index fe58a9ee32d3..93ea8e71047a 100644
> --- a/hw/intc/spapr_xive_kvm.c
> +++ b/hw/intc/spapr_xive_kvm.c
> @@ -57,6 +57,16 @@ static void kvm_cpu_enable(CPUState *cs)
>      QLIST_INSERT_HEAD(&kvm_enabled_cpus, enabled_cpu, node);
>  }
>  
> +static void kvm_cpu_disable_all(void)
> +{
> +    KVMEnabledCPU *enabled_cpu, *next;
> +
> +    QLIST_FOREACH_SAFE(enabled_cpu, &kvm_enabled_cpus, node, next) {
> +        QLIST_REMOVE(enabled_cpu, node);
> +        g_free(enabled_cpu);
> +    }
> +}
> +
>  /*
>   * XIVE Thread Interrupt Management context (KVM)
>   */
> @@ -769,3 +779,53 @@ void kvmppc_xive_connect(sPAPRXive *xive, Error **errp)
>      /* Map all regions */
>      spapr_xive_map_mmio(xive);
>  }
> +
> +void kvmppc_xive_disconnect(sPAPRXive *xive, Error **errp)
> +{
> +    XiveSource *xsrc;
> +    struct kvm_create_device xive_destroy_device = { 0 };
> +    size_t esb_len;
> +    int rc;
> +
> +    if (!kvm_enabled() || !kvmppc_has_cap_xive()) {
> +        error_setg(errp, "IRQ_XIVE capability must be present for KVM");
> +        return;
> +    }
> +
> +    /* The KVM XIVE device is not in use */
> +    if (!xive || xive->fd == -1) {
> +        return;
> +    }
> +
> +    /* Clear the KVM mapping */
> +    xsrc = &xive->source;
> +    esb_len = (1ull << xsrc->esb_shift) * xsrc->nr_irqs;
> +
> +    sysbus_mmio_unmap(SYS_BUS_DEVICE(xive), 0);
> +    munmap(xsrc->esb_mmap, esb_len);
> +
> +    sysbus_mmio_unmap(SYS_BUS_DEVICE(xive), 1);
> +
> +    sysbus_mmio_unmap(SYS_BUS_DEVICE(xive), 2);
> +    munmap(xive->tm_mmap, 4ull << TM_SHIFT);
> +
> +    /* Destroy the KVM device. This also clears the VCPU presenters */
> +    xive_destroy_device.fd = xive->fd;
> +    xive_destroy_device.type = KVM_DEV_TYPE_XIVE;
> +    rc = kvm_vm_ioctl(kvm_state, KVM_DESTROY_DEVICE, &xive_destroy_device);
> +    if (rc < 0) {
> +        error_setg_errno(errp, -rc, "Error on KVM_DESTROY_DEVICE for XIVE");
> +    }
> +    close(xive->fd);
> +    xive->fd = -1;
> +
> +    kvm_kernel_irqchip = false;
> +    kvm_msi_via_irqfd_allowed = false;
> +    kvm_gsi_direct_mapping = false;
> +
> +    /* Clear the local list of presenter (hotplug) */
> +    kvm_cpu_disable_all();
> +
> +    /* VM Change state handler is not needed anymore */
> +    qemu_del_vm_change_state_handler(xive->change);
> +}
> diff --git a/hw/intc/xics_kvm.c b/hw/intc/xics_kvm.c
> index 2426e5b2f4ed..da6a00bc88cc 100644
> --- a/hw/intc/xics_kvm.c
> +++ b/hw/intc/xics_kvm.c
> @@ -50,6 +50,16 @@ typedef struct KVMEnabledICP {
>  static QLIST_HEAD(, KVMEnabledICP)
>      kvm_enabled_icps = QLIST_HEAD_INITIALIZER(&kvm_enabled_icps);
>  
> +static void kvm_disable_icps(void)
> +{
> +    KVMEnabledICP *enabled_icp, *next;
> +
> +    QLIST_FOREACH_SAFE(enabled_icp, &kvm_enabled_icps, node, next) {
> +        QLIST_REMOVE(enabled_icp, node);
> +        g_free(enabled_icp);
> +    }
> +}
> +
>  /*
>   * ICP-KVM
>   */
> @@ -455,6 +465,53 @@ fail:
>      return -1;
>  }
>  
> +int xics_kvm_disconnect(sPAPRMachineState *spapr, Error **errp)
> +{
> +    int rc;
> +    struct kvm_create_device xics_create_device = {
> +        .fd = kernel_xics_fd,
> +        .type = KVM_DEV_TYPE_XICS,
> +        .flags = 0,
> +    };
> +
> +    /* The KVM XICS device is not in use */
> +    if (kernel_xics_fd == -1) {
> +        return 0;
> +    }
> +
> +    if (!kvm_enabled() || !kvm_check_extension(kvm_state, KVM_CAP_IRQ_XICS)) {
> +        error_setg(errp,
> +                   "KVM and IRQ_XICS capability must be present for KVM XICS device");
> +        return -1;
> +    }
> +
> +    rc = kvm_vm_ioctl(kvm_state, KVM_DESTROY_DEVICE, &xics_create_device);
> +    if (rc < 0) {
> +        error_setg_errno(errp, -rc, "Error on KVM_DESTROY_DEVICE for XICS");
> +    }
> +    close(kernel_xics_fd);
> +    kernel_xics_fd = -1;
> +
> +    spapr_rtas_register(RTAS_IBM_SET_XIVE, NULL, 0);
> +    spapr_rtas_register(RTAS_IBM_GET_XIVE, NULL, 0);
> +    spapr_rtas_register(RTAS_IBM_INT_OFF, NULL, 0);
> +    spapr_rtas_register(RTAS_IBM_INT_ON, NULL, 0);
> +
> +    kvmppc_define_rtas_kernel_token(0, "ibm,set-xive");
> +    kvmppc_define_rtas_kernel_token(0, "ibm,get-xive");
> +    kvmppc_define_rtas_kernel_token(0, "ibm,int-on");
> +    kvmppc_define_rtas_kernel_token(0, "ibm,int-off");
> +
> +    kvm_kernel_irqchip = false;
> +    kvm_msi_via_irqfd_allowed = false;
> +    kvm_gsi_direct_mapping = false;
> +
> +    /* Clear the presenter from the VCPUs */
> +    kvm_disable_icps();
> +
> +    return rc;
> +}
> +
>  static void xics_kvm_register_types(void)
>  {
>      type_register_static(&ics_kvm_info);

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [Qemu-devel] [PATCH 11/13] spapr: check for the activation of the KVM IRQ device
  2019-01-07 18:39 ` [Qemu-devel] [PATCH 11/13] spapr: check for the activation of " Cédric Le Goater
@ 2019-02-12  1:01   ` David Gibson
  2019-02-12  7:12     ` Cédric Le Goater
  0 siblings, 1 reply; 43+ messages in thread
From: David Gibson @ 2019-02-12  1:01 UTC (permalink / raw)
  To: Cédric Le Goater; +Cc: Benjamin Herrenschmidt, qemu-ppc, qemu-devel

[-- Attachment #1: Type: text/plain, Size: 5793 bytes --]

On Mon, Jan 07, 2019 at 07:39:44PM +0100, Cédric Le Goater wrote:
> The activation of the KVM IRQ device depends on the interrupt mode
> chosen at CAS time by the machine and some methods used at reset or by
> the migration need to be protected.
> 
> Signed-off-by: Cédric Le Goater <clg@kaod.org>
> ---
>  hw/intc/spapr_xive_kvm.c | 28 ++++++++++++++++++++++++++++
>  hw/intc/xics_kvm.c       | 25 ++++++++++++++++++++++++-
>  2 files changed, 52 insertions(+), 1 deletion(-)
> 
> diff --git a/hw/intc/spapr_xive_kvm.c b/hw/intc/spapr_xive_kvm.c
> index 93ea8e71047a..d35814c1992e 100644
> --- a/hw/intc/spapr_xive_kvm.c
> +++ b/hw/intc/spapr_xive_kvm.c
> @@ -95,9 +95,15 @@ static void kvmppc_xive_cpu_set_state(XiveTCTX *tctx, Error **errp)
>  
>  void kvmppc_xive_cpu_get_state(XiveTCTX *tctx, Error **errp)
>  {
> +    sPAPRXive *xive = SPAPR_MACHINE(qdev_get_machine())->xive;
>      uint64_t state[4] = { 0 };
>      int ret;
>  
> +    /* The KVM XIVE device is not in use */
> +    if (xive->fd == -1) {
> +        return;
> +    }
> +
>      ret = kvm_get_one_reg(tctx->cs, KVM_REG_PPC_NVT_STATE, state);
>      if (ret != 0) {
>          error_setg_errno(errp, errno,
> @@ -151,6 +157,11 @@ void kvmppc_xive_cpu_connect(XiveTCTX *tctx, Error **errp)
>      unsigned long vcpu_id;
>      int ret;
>  
> +    /* The KVM XIVE device is not in use */
> +    if (xive->fd == -1) {
> +        return;
> +    }
> +
>      /* Check if CPU was hot unplugged and replugged. */
>      if (kvm_cpu_is_enabled(tctx->cs)) {
>          return;
> @@ -234,9 +245,13 @@ static void kvmppc_xive_source_get_state(XiveSource *xsrc)
>  void kvmppc_xive_source_set_irq(void *opaque, int srcno, int val)
>  {
>      XiveSource *xsrc = opaque;
> +    sPAPRXive *xive = SPAPR_XIVE(xsrc->xive);
>      struct kvm_irq_level args;
>      int rc;
>  
> +    /* The KVM XIVE device should be in use */
> +    assert(xive->fd != -1);
> +
>      args.irq = srcno;
>      if (!xive_source_irq_is_lsi(xsrc, srcno)) {
>          if (!val) {
> @@ -580,6 +595,11 @@ int kvmppc_xive_pre_save(sPAPRXive *xive)
>      Error *local_err = NULL;
>      CPUState *cs;
>  
> +    /* The KVM XIVE device is not in use */
> +    if (xive->fd == -1) {
> +        return 0;
> +    }
> +
>      /* Grab the EAT */
>      kvmppc_xive_get_eas_state(xive, &local_err);
>      if (local_err) {
> @@ -612,6 +632,9 @@ int kvmppc_xive_post_load(sPAPRXive *xive, int version_id)
>      Error *local_err = NULL;
>      CPUState *cs;
>  
> +    /* The KVM XIVE device should be in use */
> +    assert(xive->fd != -1);

I'm guessing this is an assert() because the handler shouldn't be
registered when we're not in KVM mode.  But wouldn't that also be true
of the pre_save hook, which errors out rather than asserting?

>      /* Restore the ENDT first. The targetting depends on it. */
>      CPU_FOREACH(cs) {
>          kvmppc_xive_set_eq_state(xive, cs, &local_err);
> @@ -649,6 +672,11 @@ void kvmppc_xive_synchronize_state(sPAPRXive *xive, Error **errp)
>      CPUState *cs;
>      Error *local_err = NULL;
>  
> +    /* The KVM XIVE device is not in use */
> +    if (xive->fd == -1) {
> +        return;
> +    }
> +
>      /*
>       * When the VM is stopped, the sources are masked and the previous
>       * state is saved in anticipation of a migration. We should not
> diff --git a/hw/intc/xics_kvm.c b/hw/intc/xics_kvm.c
> index da6a00bc88cc..651bbfdf6966 100644
> --- a/hw/intc/xics_kvm.c
> +++ b/hw/intc/xics_kvm.c
> @@ -68,6 +68,11 @@ static void icp_get_kvm_state(ICPState *icp)
>      uint64_t state;
>      int ret;
>  
> +    /* The KVM XICS device is not in use */
> +    if (kernel_xics_fd == -1) {
> +        return;
> +    }
> +
>      /* ICP for this CPU thread is not in use, exiting */
>      if (!icp->cs) {
>          return;
> @@ -104,6 +109,11 @@ static int icp_set_kvm_state(ICPState *icp, int version_id)
>      uint64_t state;
>      int ret;
>  
> +    /* The KVM XICS device is not in use */
> +    if (kernel_xics_fd == -1) {
> +        return 0;
> +    }
> +
>      /* ICP for this CPU thread is not in use, exiting */
>      if (!icp->cs) {
>          return 0;
> @@ -140,8 +150,8 @@ static void icp_kvm_connect(ICPState *icp, Error **errp)
>      unsigned long vcpu_id;
>      int ret;
>  
> +    /* The KVM XICS device is not in use */
>      if (kernel_xics_fd == -1) {
> -        abort();
>          return;
>      }
>  
> @@ -220,6 +230,11 @@ static void ics_get_kvm_state(ICSState *ics)
>      uint64_t state;
>      int i;
>  
> +    /* The KVM XICS device is not in use */
> +    if (kernel_xics_fd == -1) {
> +        return;
> +    }
> +
>      for (i = 0; i < ics->nr_irqs; i++) {
>          ICSIRQState *irq = &ics->irqs[i];
>  
> @@ -279,6 +294,11 @@ static int ics_set_kvm_state(ICSState *ics, int version_id)
>      int i;
>      Error *local_err = NULL;
>  
> +    /* The KVM XICS device is not in use */
> +    if (kernel_xics_fd == -1) {
> +        return 0;
> +    }
> +
>      for (i = 0; i < ics->nr_irqs; i++) {
>          ICSIRQState *irq = &ics->irqs[i];
>          int ret;
> @@ -325,6 +345,9 @@ void ics_kvm_set_irq(void *opaque, int srcno, int val)
>      struct kvm_irq_level args;
>      int rc;
>  
> +    /* The KVM XICS device should be in use */
> +    assert(kernel_xics_fd != -1);
> +
>      args.irq = srcno + ics->offset;
>      if (ics->irqs[srcno].flags & XICS_FLAGS_IRQ_MSI) {
>          if (!val) {

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [Qemu-devel] [PATCH 12/13] spapr/xics: ignore the lower 4K in the IRQ number space
  2019-01-07 18:39 ` [Qemu-devel] [PATCH 12/13] spapr/xics: ignore the lower 4K in the IRQ number space Cédric Le Goater
@ 2019-02-12  1:06   ` David Gibson
  2019-02-12  7:05     ` Cédric Le Goater
  0 siblings, 1 reply; 43+ messages in thread
From: David Gibson @ 2019-02-12  1:06 UTC (permalink / raw)
  To: Cédric Le Goater; +Cc: Benjamin Herrenschmidt, qemu-ppc, qemu-devel

[-- Attachment #1: Type: text/plain, Size: 2587 bytes --]

On Mon, Jan 07, 2019 at 07:39:45PM +0100, Cédric Le Goater wrote:
> The IRQ number space of the XIVE and XICS interrupt mode are aligned
> when using the dual interrupt mode for the machine. This means that
> the ICS offset is set to zero in QEMU and that the KVM XICS device
> should be informed of this new value. Unfortunately, there is no way
> to do so and KVM still maintains the XICS_IRQ_BASE (0x1000) offset.
> 
> Ignore the lower 4K which are not used under the XICS interrupt
> mode. These IRQ numbers are only claimed by XIVE for the CPU IPIs.
> 
> Signed-off-by: Cédric Le Goater <clg@kaod.org>
> ---
>  hw/intc/xics_kvm.c | 18 ++++++++++++++++++
>  1 file changed, 18 insertions(+)
> 
> diff --git a/hw/intc/xics_kvm.c b/hw/intc/xics_kvm.c
> index 651bbfdf6966..1d21ff217b82 100644
> --- a/hw/intc/xics_kvm.c
> +++ b/hw/intc/xics_kvm.c
> @@ -238,6 +238,15 @@ static void ics_get_kvm_state(ICSState *ics)
>      for (i = 0; i < ics->nr_irqs; i++) {
>          ICSIRQState *irq = &ics->irqs[i];
>  
> +        /*
> +         * The KVM XICS device considers that the IRQ numbers should
> +         * start at XICS_IRQ_BASE (0x1000). Ignore the lower 4K
> +         * numbers (only claimed by XIVE for the CPU IPIs).
> +         */
> +        if (i + ics->offset < XICS_IRQ_BASE) {
> +            continue;
> +        }
> +

This seems bogus to me.  The guest-visible irq numbers need to line up
between xics and xive mode, yes, but that doesn't mean we need to keep
around a great big array of unused ICS irq states, even in
TCG mode.

>          kvm_device_access(kernel_xics_fd, KVM_DEV_XICS_GRP_SOURCES,
>                            i + ics->offset, &state, false, &error_fatal);
>  
> @@ -303,6 +312,15 @@ static int ics_set_kvm_state(ICSState *ics, int version_id)
>          ICSIRQState *irq = &ics->irqs[i];
>          int ret;
>  
> +        /*
> +         * The KVM XICS device considers that the IRQ numbers should
> +         * start at XICS_IRQ_BASE (0x1000). Ignore the lower 4K
> +         * numbers (only claimed by XIVE for the CPU IPIs).
> +         */
> +        if (i + ics->offset < XICS_IRQ_BASE) {
> +            continue;
> +        }
> +
>          state = irq->server;
>          state |= (uint64_t)(irq->saved_priority & KVM_XICS_PRIORITY_MASK)
>              << KVM_XICS_PRIORITY_SHIFT;

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [Qemu-devel] [PATCH 13/13] spapr: add KVM support to the 'dual' machine
  2019-01-07 18:39 ` [Qemu-devel] [PATCH 13/13] spapr: add KVM support to the 'dual' machine Cédric Le Goater
@ 2019-02-12  1:11   ` David Gibson
  2019-02-12  7:18     ` Cédric Le Goater
  0 siblings, 1 reply; 43+ messages in thread
From: David Gibson @ 2019-02-12  1:11 UTC (permalink / raw)
  To: Cédric Le Goater; +Cc: Benjamin Herrenschmidt, qemu-ppc, qemu-devel

[-- Attachment #1: Type: text/plain, Size: 12242 bytes --]

On Mon, Jan 07, 2019 at 07:39:46PM +0100, Cédric Le Goater wrote:
> The interrupt mode is chosen by the CAS negotiation process and
> activated after a reset to take into account the required changes in
> the machine. This brings new constraints on how the associated KVM IRQ
> device is initialized.
> 
> Currently, each model takes care of the initialization of the KVM
> device in its realize method but this is not possible anymore as the
> initialization needs to be done globally when the interrupt mode is
> known, i.e. when the machine is reset. It also means that we need a way
> to delete a KVM device when another mode is chosen.
> 
> Also, to support migration, the QEMU objects holding the state to
> transfer should always be available but not necessarily activated.
> 
> The overall approach of this proposal is to initialize both interrupt
> modes at the QEMU level and keep the IRQ number space in sync to allow
> switching from one mode to another. For the KVM side of things, the
> whole initialization of the KVM device, sources and presenters, is
> grouped in a single routine. The XICS and XIVE sPAPR IRQ reset
> handlers are modified accordingly to handle the init and the delete
> sequences of the KVM device.
> 
> As KVM is now initialized at reset, we lose the possibility to
> fall back to the QEMU emulated mode in case of failure and failures
> become fatal to the machine.
> 
> Signed-off-by: Cédric Le Goater <clg@kaod.org>
> ---
>  hw/intc/spapr_xive.c     |  8 +---
>  hw/intc/spapr_xive_kvm.c | 27 ++++++++++++++
>  hw/intc/xics_kvm.c       | 25 +++++++++++++
>  hw/intc/xive.c           |  4 --
>  hw/ppc/spapr_irq.c       | 79 ++++++++++++++++++++++++++++------------
>  5 files changed, 109 insertions(+), 34 deletions(-)
> 
> diff --git a/hw/intc/spapr_xive.c b/hw/intc/spapr_xive.c
> index 21f3c1ef0901..0661aca35900 100644
> --- a/hw/intc/spapr_xive.c
> +++ b/hw/intc/spapr_xive.c
> @@ -330,13 +330,7 @@ static void spapr_xive_realize(DeviceState *dev, Error **errp)
>      xive->eat = g_new0(XiveEAS, xive->nr_irqs);
>      xive->endt = g_new0(XiveEND, xive->nr_ends);
>  
> -    if (kvmppc_xive_enabled()) {
> -        kvmppc_xive_connect(xive, &local_err);
> -        if (local_err) {
> -            error_propagate(errp, local_err);
> -            return;
> -        }
> -    } else {
> +    if (!kvmppc_xive_enabled()) {
>          /* TIMA initialization */
>          memory_region_init_io(&xive->tm_mmio, OBJECT(xive), &xive_tm_ops, xive,
>                                "xive.tima", 4ull << TM_SHIFT);
> diff --git a/hw/intc/spapr_xive_kvm.c b/hw/intc/spapr_xive_kvm.c
> index d35814c1992e..3ebc947f2be7 100644
> --- a/hw/intc/spapr_xive_kvm.c
> +++ b/hw/intc/spapr_xive_kvm.c
> @@ -737,6 +737,15 @@ void kvmppc_xive_connect(sPAPRXive *xive, Error **errp)
>      Error *local_err = NULL;
>      size_t esb_len;
>      size_t tima_len;
> +    CPUState *cs;
> +
> +    /*
> +     * The KVM XIVE device already in use. This is the case when
> +     * rebooting XIVE -> XIVE

Can this case actually occur?  Further down you appear to
unconditionally destroy both KVM devices at reset time.

> +     */
> +    if (xive->fd != -1) {
> +        return;
> +    }
>  
>      if (!kvm_enabled() || !kvmppc_has_cap_xive()) {
>          error_setg(errp, "IRQ_XIVE capability must be present for KVM");
> @@ -800,6 +809,24 @@ void kvmppc_xive_connect(sPAPRXive *xive, Error **errp)
>      xive->change = qemu_add_vm_change_state_handler(
>          kvmppc_xive_change_state_handler, xive);
>  
> +    /* Connect the presenters to the initial VCPUs of the machine */
> +    CPU_FOREACH(cs) {
> +        PowerPCCPU *cpu = POWERPC_CPU(cs);
> +
> +        kvmppc_xive_cpu_connect(cpu->tctx, &local_err);
> +        if (local_err) {
> +            error_propagate(errp, local_err);
> +            return;
> +        }
> +    }
> +
> +    /* Update the KVM sources */
> +    kvmppc_xive_source_reset(xsrc, &local_err);
> +    if (local_err) {
> +            error_propagate(errp, local_err);
> +            return;
> +    }
> +
>      kvm_kernel_irqchip = true;
>      kvm_msi_via_irqfd_allowed = true;
>      kvm_gsi_direct_mapping = true;
> diff --git a/hw/intc/xics_kvm.c b/hw/intc/xics_kvm.c
> index 1d21ff217b82..bfc35d71df7f 100644
> --- a/hw/intc/xics_kvm.c
> +++ b/hw/intc/xics_kvm.c
> @@ -448,6 +448,16 @@ static void rtas_dummy(PowerPCCPU *cpu, sPAPRMachineState *spapr,
>  int xics_kvm_init(sPAPRMachineState *spapr, Error **errp)
>  {
>      int rc;
> +    CPUState *cs;
> +    Error *local_err = NULL;
> +
> +    /*
> +     * The KVM XICS device already in use. This is the case when
> +     * rebooting XICS -> XICS
> +     */
> +    if (kernel_xics_fd != -1) {
> +        return 0;
> +    }
>  
>      if (!kvm_enabled() || !kvm_check_extension(kvm_state, KVM_CAP_IRQ_XICS)) {
>          error_setg(errp,
> @@ -496,6 +506,21 @@ int xics_kvm_init(sPAPRMachineState *spapr, Error **errp)
>      kvm_msi_via_irqfd_allowed = true;
>      kvm_gsi_direct_mapping = true;
>  
> +    /* Connect the presenters to the initial VCPUs of the machine */
> +    CPU_FOREACH(cs) {
> +        PowerPCCPU *cpu = POWERPC_CPU(cs);
> +
> +        icp_kvm_connect(cpu->icp, &local_err);
> +        if (local_err) {
> +            error_propagate(errp, local_err);
> +            goto fail;
> +        }
> +        icp_set_kvm_state(cpu->icp, 1);
> +    }
> +
> +    /* Update the KVM sources */
> +    ics_set_kvm_state(ICS_KVM(spapr->ics), 1);
> +
>      return 0;
>  
>  fail:
> diff --git a/hw/intc/xive.c b/hw/intc/xive.c
> index c5c2fbc3f8bc..c166eab5b210 100644
> --- a/hw/intc/xive.c
> +++ b/hw/intc/xive.c
> @@ -932,10 +932,6 @@ static void xive_source_reset(void *dev)
>  
>      /* PQs are initialized to 0b01 (Q=1) which corresponds to "ints off" */
>      memset(xsrc->status, XIVE_ESB_OFF, xsrc->nr_irqs);
> -
> -    if (kvmppc_xive_enabled()) {
> -        kvmppc_xive_source_reset(xsrc, &error_fatal);
> -    }
>  }
>  
>  static void xive_source_realize(DeviceState *dev, Error **errp)
> diff --git a/hw/ppc/spapr_irq.c b/hw/ppc/spapr_irq.c
> index ba27d9d8e972..5592eec3787b 100644
> --- a/hw/ppc/spapr_irq.c
> +++ b/hw/ppc/spapr_irq.c
> @@ -98,20 +98,14 @@ static void spapr_irq_init_xics(sPAPRMachineState *spapr, Error **errp)
>      int nr_irqs = spapr->irq->nr_irqs;
>      Error *local_err = NULL;
>  
> -    if (kvm_enabled()) {
> -        if (machine_kernel_irqchip_allowed(machine) &&
> -            !xics_kvm_init(spapr, &local_err)) {
> -            spapr->icp_type = TYPE_KVM_ICP;
> -            spapr->ics = spapr_ics_create(spapr, TYPE_ICS_KVM, nr_irqs,
> -                                          &local_err);
> -        }
> -        if (machine_kernel_irqchip_required(machine) && !spapr->ics) {
> -            error_prepend(&local_err,
> -                          "kernel_irqchip requested but unavailable: ");
> -            goto error;

I don't see anything that replaces the irqchip_required logic, which
doesn't seem right.

> +    if (kvm_enabled() && machine_kernel_irqchip_allowed(machine)) {
> +        spapr->icp_type = TYPE_KVM_ICP;
> +        spapr->ics = spapr_ics_create(spapr, TYPE_ICS_KVM, nr_irqs,
> +                                      &local_err);
> +        if (local_err) {
> +            error_propagate(errp, local_err);
> +            return;
>          }
> -        error_free(local_err);
> -        local_err = NULL;
>      }
>  
>      if (!spapr->ics) {
> @@ -119,10 +113,11 @@ static void spapr_irq_init_xics(sPAPRMachineState *spapr, Error **errp)
>          spapr->icp_type = TYPE_ICP;
>          spapr->ics = spapr_ics_create(spapr, TYPE_ICS_SIMPLE, nr_irqs,
>                                        &local_err);
> +        if (local_err) {
> +            error_propagate(errp, local_err);
> +            return;
> +        }
>      }
> -
> -error:
> -    error_propagate(errp, local_err);
>  }
>  
>  #define ICS_IRQ_FREE(ics, srcno)   \
> @@ -233,7 +228,17 @@ static void spapr_irq_set_irq_xics(void *opaque, int srcno, int val)
>  
>  static void spapr_irq_reset_xics(sPAPRMachineState *spapr, Error **errp)
>  {
> -    /* TODO: create the KVM XICS device */
> +    MachineState *machine = MACHINE(spapr);
> +    Error *local_err = NULL;
> +
> +    if (kvm_enabled() && machine_kernel_irqchip_allowed(machine)) {
> +        xics_kvm_init(spapr, &local_err);
> +        if (local_err) {
> +            error_propagate(errp, local_err);
> +            error_prepend(errp, "KVM XICS connect failed: ");
> +            return;
> +        }
> +    }
>  }
>  
>  #define SPAPR_IRQ_XICS_NR_IRQS     0x1000
> @@ -393,6 +398,7 @@ static int spapr_irq_post_load_xive(sPAPRMachineState *spapr, int version_id)
>  static void spapr_irq_reset_xive(sPAPRMachineState *spapr, Error **errp)
>  {
>      CPUState *cs;
> +    Error *local_err = NULL;
>  
>      CPU_FOREACH(cs) {
>          PowerPCCPU *cpu = POWERPC_CPU(cs);
> @@ -401,6 +407,15 @@ static void spapr_irq_reset_xive(sPAPRMachineState *spapr, Error **errp)
>          spapr_xive_set_tctx_os_cam(cpu->tctx);
>      }
>  
> +    if (kvmppc_xive_enabled()) {
> +        kvmppc_xive_connect(spapr->xive, &local_err);
> +        if (local_err) {
> +            error_propagate(errp, local_err);
> +            error_prepend(errp, "KVM XIVE connect failed: ");
> +            return;
> +        }
> +    }
> +
>      /* Activate the XIVE MMIOs */
>      spapr_xive_mmio_set_enabled(spapr->xive, true);
>  }
> @@ -462,14 +477,8 @@ static sPAPRIrq *spapr_irq_current(sPAPRMachineState *spapr)
>  
>  static void spapr_irq_init_dual(sPAPRMachineState *spapr, Error **errp)
>  {
> -    MachineState *machine = MACHINE(spapr);
>      Error *local_err = NULL;
>  
> -    if (kvm_enabled() && machine_kernel_irqchip_allowed(machine)) {
> -        error_setg(errp, "No KVM support for the 'dual' machine");
> -        return;
> -    }
> -
>      spapr_irq_xics.init(spapr, &local_err);
>      if (local_err) {
>          error_propagate(errp, local_err);
> @@ -568,11 +577,16 @@ static void spapr_irq_cpu_intc_create_dual(sPAPRMachineState *spapr,
>  
>  static int spapr_irq_post_load_dual(sPAPRMachineState *spapr, int version_id)
>  {
> +    MachineState *machine = MACHINE(spapr);
> +
>      /*
>       * Force a reset of the XIVE backend after migration. The machine
>       * defaults to XICS at startup.
>       */
>      if (spapr_ovec_test(spapr->ov5_cas, OV5_XIVE_EXPLOIT)) {
> +        if (kvm_enabled() && machine_kernel_irqchip_allowed(machine)) {
> +            xics_kvm_disconnect(spapr, &error_fatal);
> +        }
>          spapr_irq_xive.reset(spapr, &error_fatal);
>      }
>  
> @@ -581,12 +595,31 @@ static int spapr_irq_post_load_dual(sPAPRMachineState *spapr, int version_id)
>  
>  static void spapr_irq_reset_dual(sPAPRMachineState *spapr, Error **errp)
>  {
> +    MachineState *machine = MACHINE(spapr);
> +    Error *local_err = NULL;
> +
>      /*
>       * Deactivate the XIVE MMIOs. The XIVE backend will reenable them
>       * if selected.
>       */
>      spapr_xive_mmio_set_enabled(spapr->xive, false);
>  
> +    /* Destroy all KVM devices */
> +    if (kvm_enabled() && machine_kernel_irqchip_allowed(machine)) {
> +        xics_kvm_disconnect(spapr, &local_err);
> +        if (local_err) {
> +            error_propagate(errp, local_err);
> +            error_prepend(errp, "KVM XICS disconnect failed: ");
> +            return;
> +        }
> +        kvmppc_xive_disconnect(spapr->xive, &local_err);
> +        if (local_err) {
> +            error_propagate(errp, local_err);
> +            error_prepend(errp, "KVM XIVE disconnect failed: ");
> +            return;
> +        }
> +    }
> +
>      spapr_irq_current(spapr)->reset(spapr, errp);
>  }
>  

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [Qemu-devel] [PATCH 06/13] spapr/xive: fix migration of the XiveTCTX under TCG
  2019-02-12  0:22       ` David Gibson
@ 2019-02-12  6:58         ` Cédric Le Goater
  0 siblings, 0 replies; 43+ messages in thread
From: Cédric Le Goater @ 2019-02-12  6:58 UTC (permalink / raw)
  To: David Gibson; +Cc: Benjamin Herrenschmidt, qemu-ppc, qemu-devel

On 2/12/19 1:22 AM, David Gibson wrote:
> On Fri, Feb 08, 2019 at 08:12:12AM +0100, Cédric Le Goater wrote:
>> On 2/8/19 6:36 AM, David Gibson wrote:
>>> On Mon, Jan 07, 2019 at 07:39:39PM +0100, Cédric Le Goater wrote:
>>>> When the thread interrupt management state is retrieved from the KVM
>>>> VCPU, word2 is saved under the QEMU XIVE thread context to print out
>>>> the OS CAM line under the QEMU monitor.
>>>>
>>>> This breaks the migration of a TCG guest (and with KVM when
>>>> kernel_irqchip=off) because the matching algorithm of the presenter
>>>> relies on the OS CAM value. Fix with an extra reset of the thread
>>>> contexts to restore the expected value.
>>>>
>>>> Signed-off-by: Cédric Le Goater <clg@kaod.org>
>>>
>>> Why is the CAM value you get from KVM different from the one you
>>> expect in qemu?
>>
>> An NVT base identifier is allocated for each VM at the OPAL level
>> and each vCPU gets an increment of this value. It is pushed in the
>> OS CAM line when the vCPU is scheduled to run.
>>  
>> KVM XIVE names this identifier a VP id. 
>>
>> The QEMU emulation of XIVE uses a fixed value for the NVT base 
>> identifier.
> 
> Ah, I see.
> 
> Hmm.  Really this highlights why I'm uneasy migrating the whole TCTX
> as just a blob of registers, even though only some of them are really
> runtime state, and others are machine configuration that can be worked
> out separately at the two ends.

This is really a special case :

1. migration kernel_irqchip=on -> kernel_irqchip=off
2. debug info from KVM is squashing TCG state

We could ignore the VP id configured by KVM but it seems interesting
to retrieve. 

C. 

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [Qemu-devel] [PATCH 12/13] spapr/xics: ignore the lower 4K in the IRQ number space
  2019-02-12  1:06   ` David Gibson
@ 2019-02-12  7:05     ` Cédric Le Goater
  2019-02-13  1:33       ` David Gibson
  0 siblings, 1 reply; 43+ messages in thread
From: Cédric Le Goater @ 2019-02-12  7:05 UTC (permalink / raw)
  To: David Gibson; +Cc: Benjamin Herrenschmidt, qemu-ppc, qemu-devel

On 2/12/19 2:06 AM, David Gibson wrote:
> On Mon, Jan 07, 2019 at 07:39:45PM +0100, Cédric Le Goater wrote:
>> The IRQ number spaces of the XIVE and XICS interrupt modes are aligned
>> when using the dual interrupt mode for the machine. This means that
>> the ICS offset is set to zero in QEMU and that the KVM XICS device
>> should be informed of this new value. Unfortunately, there is no way
>> to do so and KVM still maintains the XICS_IRQ_BASE (0x1000) offset.
>>
>> Ignore the lower 4K which are not used under the XICS interrupt
>> mode. These IRQ numbers are only claimed by XIVE for the CPU IPIs.
>>
>> Signed-off-by: Cédric Le Goater <clg@kaod.org>
>> ---
>>  hw/intc/xics_kvm.c | 18 ++++++++++++++++++
>>  1 file changed, 18 insertions(+)
>>
>> diff --git a/hw/intc/xics_kvm.c b/hw/intc/xics_kvm.c
>> index 651bbfdf6966..1d21ff217b82 100644
>> --- a/hw/intc/xics_kvm.c
>> +++ b/hw/intc/xics_kvm.c
>> @@ -238,6 +238,15 @@ static void ics_get_kvm_state(ICSState *ics)
>>      for (i = 0; i < ics->nr_irqs; i++) {
>>          ICSIRQState *irq = &ics->irqs[i];
>>  
>> +        /*
>> +         * The KVM XICS device considers that the IRQ numbers should
>> +         * start at XICS_IRQ_BASE (0x1000). Ignore the lower 4K
>> +         * numbers (only claimed by XIVE for the CPU IPIs).
>> +         */
>> +        if (i + ics->offset < XICS_IRQ_BASE) {
>> +            continue;
>> +        }
>> +
> 
> This seems bogus to me.  The guest-visible irq numbers need to line up
> between xics and xive mode, yes, but that doesn't mean we need to keep
> around a great big array of unused ICS irq states, even in
> TCG mode.

This is because the qirqs[] array is under the machine and shared between 
both interrupt modes, xics and xive.

C.

> 
>>          kvm_device_access(kernel_xics_fd, KVM_DEV_XICS_GRP_SOURCES,
>>                            i + ics->offset, &state, false, &error_fatal);
>>  
>> @@ -303,6 +312,15 @@ static int ics_set_kvm_state(ICSState *ics, int version_id)
>>          ICSIRQState *irq = &ics->irqs[i];
>>          int ret;
>>  
>> +        /*
>> +         * The KVM XICS device considers that the IRQ numbers should
>> +         * start at XICS_IRQ_BASE (0x1000). Ignore the lower 4K
>> +         * numbers (only claimed by XIVE for the CPU IPIs).
>> +         */
>> +        if (i + ics->offset < XICS_IRQ_BASE) {
>> +            continue;
>> +        }
>> +
>>          state = irq->server;
>>          state |= (uint64_t)(irq->saved_priority & KVM_XICS_PRIORITY_MASK)
>>              << KVM_XICS_PRIORITY_SHIFT;
> 

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [Qemu-devel] [PATCH 11/13] spapr: check for the activation of the KVM IRQ device
  2019-02-12  1:01   ` David Gibson
@ 2019-02-12  7:12     ` Cédric Le Goater
  2019-02-13  0:17       ` David Gibson
  0 siblings, 1 reply; 43+ messages in thread
From: Cédric Le Goater @ 2019-02-12  7:12 UTC (permalink / raw)
  To: David Gibson; +Cc: Benjamin Herrenschmidt, qemu-ppc, qemu-devel

On 2/12/19 2:01 AM, David Gibson wrote:
> On Mon, Jan 07, 2019 at 07:39:44PM +0100, Cédric Le Goater wrote:
>> The activation of the KVM IRQ device depends on the interrupt mode
>> chosen at CAS time by the machine and some methods used at reset or by
>> the migration need to be protected.
>>
>> Signed-off-by: Cédric Le Goater <clg@kaod.org>
>> ---
>>  hw/intc/spapr_xive_kvm.c | 28 ++++++++++++++++++++++++++++
>>  hw/intc/xics_kvm.c       | 25 ++++++++++++++++++++++++-
>>  2 files changed, 52 insertions(+), 1 deletion(-)
>>
>> diff --git a/hw/intc/spapr_xive_kvm.c b/hw/intc/spapr_xive_kvm.c
>> index 93ea8e71047a..d35814c1992e 100644
>> --- a/hw/intc/spapr_xive_kvm.c
>> +++ b/hw/intc/spapr_xive_kvm.c
>> @@ -95,9 +95,15 @@ static void kvmppc_xive_cpu_set_state(XiveTCTX *tctx, Error **errp)
>>  
>>  void kvmppc_xive_cpu_get_state(XiveTCTX *tctx, Error **errp)
>>  {
>> +    sPAPRXive *xive = SPAPR_MACHINE(qdev_get_machine())->xive;
>>      uint64_t state[4] = { 0 };
>>      int ret;
>>  
>> +    /* The KVM XIVE device is not in use */
>> +    if (xive->fd == -1) {
>> +        return;
>> +    }
>> +
>>      ret = kvm_get_one_reg(tctx->cs, KVM_REG_PPC_NVT_STATE, state);
>>      if (ret != 0) {
>>          error_setg_errno(errp, errno,
>> @@ -151,6 +157,11 @@ void kvmppc_xive_cpu_connect(XiveTCTX *tctx, Error **errp)
>>      unsigned long vcpu_id;
>>      int ret;
>>  
>> +    /* The KVM XIVE device is not in use */
>> +    if (xive->fd == -1) {
>> +        return;
>> +    }
>> +
>>      /* Check if CPU was hot unplugged and replugged. */
>>      if (kvm_cpu_is_enabled(tctx->cs)) {
>>          return;
>> @@ -234,9 +245,13 @@ static void kvmppc_xive_source_get_state(XiveSource *xsrc)
>>  void kvmppc_xive_source_set_irq(void *opaque, int srcno, int val)
>>  {
>>      XiveSource *xsrc = opaque;
>> +    sPAPRXive *xive = SPAPR_XIVE(xsrc->xive);
>>      struct kvm_irq_level args;
>>      int rc;
>>  
>> +    /* The KVM XIVE device should be in use */
>> +    assert(xive->fd != -1);
>> +
>>      args.irq = srcno;
>>      if (!xive_source_irq_is_lsi(xsrc, srcno)) {
>>          if (!val) {
>> @@ -580,6 +595,11 @@ int kvmppc_xive_pre_save(sPAPRXive *xive)
>>      Error *local_err = NULL;
>>      CPUState *cs;
>>  
>> +    /* The KVM XIVE device is not in use */
>> +    if (xive->fd == -1) {
>> +        return 0;
>> +    }
>> +
>>      /* Grab the EAT */
>>      kvmppc_xive_get_eas_state(xive, &local_err);
>>      if (local_err) {
>> @@ -612,6 +632,9 @@ int kvmppc_xive_post_load(sPAPRXive *xive, int version_id)
>>      Error *local_err = NULL;
>>      CPUState *cs;
>>  
>> +    /* The KVM XIVE device should be in use */
>> +    assert(xive->fd != -1);
> 
> I'm guessing this is an assert() because the handler shouldn't be
> registered when we're not in KVM mode.  But wouldn't that also be true
> of the pre_save hook, which errors out rather than asserting?

The handlers are not symmetric.

The pre_save is registered in the vmstate of the sPAPRXive model and the
post_load is handled at the machine level after all XIVE state has been
transferred.

C.

> 
>>      /* Restore the ENDT first. The targetting depends on it. */
>>      CPU_FOREACH(cs) {
>>          kvmppc_xive_set_eq_state(xive, cs, &local_err);
>> @@ -649,6 +672,11 @@ void kvmppc_xive_synchronize_state(sPAPRXive *xive, Error **errp)
>>      CPUState *cs;
>>      Error *local_err = NULL;
>>  
>> +    /* The KVM XIVE device is not in use */
>> +    if (xive->fd == -1) {
>> +        return;
>> +    }
>> +
>>      /*
>>       * When the VM is stopped, the sources are masked and the previous
>>       * state is saved in anticipation of a migration. We should not
>> diff --git a/hw/intc/xics_kvm.c b/hw/intc/xics_kvm.c
>> index da6a00bc88cc..651bbfdf6966 100644
>> --- a/hw/intc/xics_kvm.c
>> +++ b/hw/intc/xics_kvm.c
>> @@ -68,6 +68,11 @@ static void icp_get_kvm_state(ICPState *icp)
>>      uint64_t state;
>>      int ret;
>>  
>> +    /* The KVM XICS device is not in use */
>> +    if (kernel_xics_fd == -1) {
>> +        return;
>> +    }
>> +
>>      /* ICP for this CPU thread is not in use, exiting */
>>      if (!icp->cs) {
>>          return;
>> @@ -104,6 +109,11 @@ static int icp_set_kvm_state(ICPState *icp, int version_id)
>>      uint64_t state;
>>      int ret;
>>  
>> +    /* The KVM XICS device is not in use */
>> +    if (kernel_xics_fd == -1) {
>> +        return 0;
>> +    }
>> +
>>      /* ICP for this CPU thread is not in use, exiting */
>>      if (!icp->cs) {
>>          return 0;
>> @@ -140,8 +150,8 @@ static void icp_kvm_connect(ICPState *icp, Error **errp)
>>      unsigned long vcpu_id;
>>      int ret;
>>  
>> +    /* The KVM XICS device is not in use */
>>      if (kernel_xics_fd == -1) {
>> -        abort();
>>          return;
>>      }
>>  
>> @@ -220,6 +230,11 @@ static void ics_get_kvm_state(ICSState *ics)
>>      uint64_t state;
>>      int i;
>>  
>> +    /* The KVM XICS device is not in use */
>> +    if (kernel_xics_fd == -1) {
>> +        return;
>> +    }
>> +
>>      for (i = 0; i < ics->nr_irqs; i++) {
>>          ICSIRQState *irq = &ics->irqs[i];
>>  
>> @@ -279,6 +294,11 @@ static int ics_set_kvm_state(ICSState *ics, int version_id)
>>      int i;
>>      Error *local_err = NULL;
>>  
>> +    /* The KVM XICS device is not in use */
>> +    if (kernel_xics_fd == -1) {
>> +        return 0;
>> +    }
>> +
>>      for (i = 0; i < ics->nr_irqs; i++) {
>>          ICSIRQState *irq = &ics->irqs[i];
>>          int ret;
>> @@ -325,6 +345,9 @@ void ics_kvm_set_irq(void *opaque, int srcno, int val)
>>      struct kvm_irq_level args;
>>      int rc;
>>  
>> +    /* The KVM XICS device should be in use */
>> +    assert(kernel_xics_fd != -1);
>> +
>>      args.irq = srcno + ics->offset;
>>      if (ics->irqs[srcno].flags & XICS_FLAGS_IRQ_MSI) {
>>          if (!val) {
> 

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [Qemu-devel] [PATCH 13/13] spapr: add KVM support to the 'dual' machine
  2019-02-12  1:11   ` David Gibson
@ 2019-02-12  7:18     ` Cédric Le Goater
  2019-02-13  1:32       ` David Gibson
  0 siblings, 1 reply; 43+ messages in thread
From: Cédric Le Goater @ 2019-02-12  7:18 UTC (permalink / raw)
  To: David Gibson; +Cc: Benjamin Herrenschmidt, qemu-ppc, qemu-devel

On 2/12/19 2:11 AM, David Gibson wrote:
> On Mon, Jan 07, 2019 at 07:39:46PM +0100, Cédric Le Goater wrote:
>> The interrupt mode is chosen by the CAS negotiation process and
>> activated after a reset to take into account the required changes in
>> the machine. This brings new constraints on how the associated KVM IRQ
>> device is initialized.
>>
>> Currently, each model takes care of the initialization of the KVM
>> device in its realize method but this is not possible anymore as the
>> initialization needs to be done globally when the interrupt mode is
>> known, i.e. when the machine is reset. It also means that we need a way
>> to delete a KVM device when another mode is chosen.
>>
>> Also, to support migration, the QEMU objects holding the state to
>> transfer should always be available but not necessarily activated.
>>
>> The overall approach of this proposal is to initialize both interrupt
>> modes at the QEMU level and keep the IRQ number space in sync to allow
>> switching from one mode to another. For the KVM side of things, the
>> whole initialization of the KVM device, sources and presenters, is
>> grouped in a single routine. The XICS and XIVE sPAPR IRQ reset
>> handlers are modified accordingly to handle the init and the delete
>> sequences of the KVM device.
>>
>> As KVM is now initialized at reset, we lose the possibility to
>> fall back to the QEMU emulated mode in case of failure and failures
>> become fatal to the machine.
>>
>> Signed-off-by: Cédric Le Goater <clg@kaod.org>
>> ---
>>  hw/intc/spapr_xive.c     |  8 +---
>>  hw/intc/spapr_xive_kvm.c | 27 ++++++++++++++
>>  hw/intc/xics_kvm.c       | 25 +++++++++++++
>>  hw/intc/xive.c           |  4 --
>>  hw/ppc/spapr_irq.c       | 79 ++++++++++++++++++++++++++++------------
>>  5 files changed, 109 insertions(+), 34 deletions(-)
>>
>> diff --git a/hw/intc/spapr_xive.c b/hw/intc/spapr_xive.c
>> index 21f3c1ef0901..0661aca35900 100644
>> --- a/hw/intc/spapr_xive.c
>> +++ b/hw/intc/spapr_xive.c
>> @@ -330,13 +330,7 @@ static void spapr_xive_realize(DeviceState *dev, Error **errp)
>>      xive->eat = g_new0(XiveEAS, xive->nr_irqs);
>>      xive->endt = g_new0(XiveEND, xive->nr_ends);
>>  
>> -    if (kvmppc_xive_enabled()) {
>> -        kvmppc_xive_connect(xive, &local_err);
>> -        if (local_err) {
>> -            error_propagate(errp, local_err);
>> -            return;
>> -        }
>> -    } else {
>> +    if (!kvmppc_xive_enabled()) {
>>          /* TIMA initialization */
>>          memory_region_init_io(&xive->tm_mmio, OBJECT(xive), &xive_tm_ops, xive,
>>                                "xive.tima", 4ull << TM_SHIFT);
>> diff --git a/hw/intc/spapr_xive_kvm.c b/hw/intc/spapr_xive_kvm.c
>> index d35814c1992e..3ebc947f2be7 100644
>> --- a/hw/intc/spapr_xive_kvm.c
>> +++ b/hw/intc/spapr_xive_kvm.c
>> @@ -737,6 +737,15 @@ void kvmppc_xive_connect(sPAPRXive *xive, Error **errp)
>>      Error *local_err = NULL;
>>      size_t esb_len;
>>      size_t tima_len;
>> +    CPUState *cs;
>> +
>> +    /*
>> +     * The KVM XIVE device already in use. This is the case when
>> +     * rebooting XIVE -> XIVE
> 
> Can this case actually occur?  Further down you appear to
> unconditionally destroy both KVM devices at reset time.

I guess you are right. I will check.

>> +     */
>> +    if (xive->fd != -1) {
>> +        return;
>> +    }
>>  
>>      if (!kvm_enabled() || !kvmppc_has_cap_xive()) {
>>          error_setg(errp, "IRQ_XIVE capability must be present for KVM");
>> @@ -800,6 +809,24 @@ void kvmppc_xive_connect(sPAPRXive *xive, Error **errp)
>>      xive->change = qemu_add_vm_change_state_handler(
>>          kvmppc_xive_change_state_handler, xive);
>>  
>> +    /* Connect the presenters to the initial VCPUs of the machine */
>> +    CPU_FOREACH(cs) {
>> +        PowerPCCPU *cpu = POWERPC_CPU(cs);
>> +
>> +        kvmppc_xive_cpu_connect(cpu->tctx, &local_err);
>> +        if (local_err) {
>> +            error_propagate(errp, local_err);
>> +            return;
>> +        }
>> +    }
>> +
>> +    /* Update the KVM sources */
>> +    kvmppc_xive_source_reset(xsrc, &local_err);
>> +    if (local_err) {
>> +            error_propagate(errp, local_err);
>> +            return;
>> +    }
>> +
>>      kvm_kernel_irqchip = true;
>>      kvm_msi_via_irqfd_allowed = true;
>>      kvm_gsi_direct_mapping = true;
>> diff --git a/hw/intc/xics_kvm.c b/hw/intc/xics_kvm.c
>> index 1d21ff217b82..bfc35d71df7f 100644
>> --- a/hw/intc/xics_kvm.c
>> +++ b/hw/intc/xics_kvm.c
>> @@ -448,6 +448,16 @@ static void rtas_dummy(PowerPCCPU *cpu, sPAPRMachineState *spapr,
>>  int xics_kvm_init(sPAPRMachineState *spapr, Error **errp)
>>  {
>>      int rc;
>> +    CPUState *cs;
>> +    Error *local_err = NULL;
>> +
>> +    /*
>> +     * The KVM XICS device already in use. This is the case when
>> +     * rebooting XICS -> XICS
>> +     */
>> +    if (kernel_xics_fd != -1) {
>> +        return 0;
>> +    }
>>  
>>      if (!kvm_enabled() || !kvm_check_extension(kvm_state, KVM_CAP_IRQ_XICS)) {
>>          error_setg(errp,
>> @@ -496,6 +506,21 @@ int xics_kvm_init(sPAPRMachineState *spapr, Error **errp)
>>      kvm_msi_via_irqfd_allowed = true;
>>      kvm_gsi_direct_mapping = true;
>>  
>> +    /* Connect the presenters to the initial VCPUs of the machine */
>> +    CPU_FOREACH(cs) {
>> +        PowerPCCPU *cpu = POWERPC_CPU(cs);
>> +
>> +        icp_kvm_connect(cpu->icp, &local_err);
>> +        if (local_err) {
>> +            error_propagate(errp, local_err);
>> +            goto fail;
>> +        }
>> +        icp_set_kvm_state(cpu->icp, 1);
>> +    }
>> +
>> +    /* Update the KVM sources */
>> +    ics_set_kvm_state(ICS_KVM(spapr->ics), 1);
>> +
>>      return 0;
>>  
>>  fail:
>> diff --git a/hw/intc/xive.c b/hw/intc/xive.c
>> index c5c2fbc3f8bc..c166eab5b210 100644
>> --- a/hw/intc/xive.c
>> +++ b/hw/intc/xive.c
>> @@ -932,10 +932,6 @@ static void xive_source_reset(void *dev)
>>  
>>      /* PQs are initialized to 0b01 (Q=1) which corresponds to "ints off" */
>>      memset(xsrc->status, XIVE_ESB_OFF, xsrc->nr_irqs);
>> -
>> -    if (kvmppc_xive_enabled()) {
>> -        kvmppc_xive_source_reset(xsrc, &error_fatal);
>> -    }
>>  }
>>  
>>  static void xive_source_realize(DeviceState *dev, Error **errp)
>> diff --git a/hw/ppc/spapr_irq.c b/hw/ppc/spapr_irq.c
>> index ba27d9d8e972..5592eec3787b 100644
>> --- a/hw/ppc/spapr_irq.c
>> +++ b/hw/ppc/spapr_irq.c
>> @@ -98,20 +98,14 @@ static void spapr_irq_init_xics(sPAPRMachineState *spapr, Error **errp)
>>      int nr_irqs = spapr->irq->nr_irqs;
>>      Error *local_err = NULL;
>>  
>> -    if (kvm_enabled()) {
>> -        if (machine_kernel_irqchip_allowed(machine) &&
>> -            !xics_kvm_init(spapr, &local_err)) {
>> -            spapr->icp_type = TYPE_KVM_ICP;
>> -            spapr->ics = spapr_ics_create(spapr, TYPE_ICS_KVM, nr_irqs,
>> -                                          &local_err);
>> -        }
>> -        if (machine_kernel_irqchip_required(machine) && !spapr->ics) {
>> -            error_prepend(&local_err,
>> -                          "kernel_irqchip requested but unavailable: ");
>> -            goto error;
> 
> I don't see anything that replaces the irqchip_required logic, which
> doesn't seem right.

Yes. We do lose the ability to fall back to the emulated device in case
of failure. It is not impossible to do but it will require more changes
to check what the KVM capabilities are before starting the machine.

Nevertheless, any failure in reset when setting the KVM backend will
result in a machine abort.

C.       

> 
>> +    if (kvm_enabled() && machine_kernel_irqchip_allowed(machine)) {
>> +        spapr->icp_type = TYPE_KVM_ICP;
>> +        spapr->ics = spapr_ics_create(spapr, TYPE_ICS_KVM, nr_irqs,
>> +                                      &local_err);
>> +        if (local_err) {
>> +            error_propagate(errp, local_err);
>> +            return;
>>          }
>> -        error_free(local_err);
>> -        local_err = NULL;
>>      }
>>  
>>      if (!spapr->ics) {
>> @@ -119,10 +113,11 @@ static void spapr_irq_init_xics(sPAPRMachineState *spapr, Error **errp)
>>          spapr->icp_type = TYPE_ICP;
>>          spapr->ics = spapr_ics_create(spapr, TYPE_ICS_SIMPLE, nr_irqs,
>>                                        &local_err);
>> +        if (local_err) {
>> +            error_propagate(errp, local_err);
>> +            return;
>> +        }
>>      }
>> -
>> -error:
>> -    error_propagate(errp, local_err);
>>  }
>>  
>>  #define ICS_IRQ_FREE(ics, srcno)   \
>> @@ -233,7 +228,17 @@ static void spapr_irq_set_irq_xics(void *opaque, int srcno, int val)
>>  
>>  static void spapr_irq_reset_xics(sPAPRMachineState *spapr, Error **errp)
>>  {
>> -    /* TODO: create the KVM XICS device */
>> +    MachineState *machine = MACHINE(spapr);
>> +    Error *local_err = NULL;
>> +
>> +    if (kvm_enabled() && machine_kernel_irqchip_allowed(machine)) {
>> +        xics_kvm_init(spapr, &local_err);
>> +        if (local_err) {
>> +            error_propagate(errp, local_err);
>> +            error_prepend(errp, "KVM XICS connect failed: ");
>> +            return;
>> +        }
>> +    }
>>  }
>>  
>>  #define SPAPR_IRQ_XICS_NR_IRQS     0x1000
>> @@ -393,6 +398,7 @@ static int spapr_irq_post_load_xive(sPAPRMachineState *spapr, int version_id)
>>  static void spapr_irq_reset_xive(sPAPRMachineState *spapr, Error **errp)
>>  {
>>      CPUState *cs;
>> +    Error *local_err = NULL;
>>  
>>      CPU_FOREACH(cs) {
>>          PowerPCCPU *cpu = POWERPC_CPU(cs);
>> @@ -401,6 +407,15 @@ static void spapr_irq_reset_xive(sPAPRMachineState *spapr, Error **errp)
>>          spapr_xive_set_tctx_os_cam(cpu->tctx);
>>      }
>>  
>> +    if (kvmppc_xive_enabled()) {
>> +        kvmppc_xive_connect(spapr->xive, &local_err);
>> +        if (local_err) {
>> +            error_propagate(errp, local_err);
>> +            error_prepend(errp, "KVM XIVE connect failed: ");
>> +            return;
>> +        }
>> +    }
>> +
>>      /* Activate the XIVE MMIOs */
>>      spapr_xive_mmio_set_enabled(spapr->xive, true);
>>  }
>> @@ -462,14 +477,8 @@ static sPAPRIrq *spapr_irq_current(sPAPRMachineState *spapr)
>>  
>>  static void spapr_irq_init_dual(sPAPRMachineState *spapr, Error **errp)
>>  {
>> -    MachineState *machine = MACHINE(spapr);
>>      Error *local_err = NULL;
>>  
>> -    if (kvm_enabled() && machine_kernel_irqchip_allowed(machine)) {
>> -        error_setg(errp, "No KVM support for the 'dual' machine");
>> -        return;
>> -    }
>> -
>>      spapr_irq_xics.init(spapr, &local_err);
>>      if (local_err) {
>>          error_propagate(errp, local_err);
>> @@ -568,11 +577,16 @@ static void spapr_irq_cpu_intc_create_dual(sPAPRMachineState *spapr,
>>  
>>  static int spapr_irq_post_load_dual(sPAPRMachineState *spapr, int version_id)
>>  {
>> +    MachineState *machine = MACHINE(spapr);
>> +
>>      /*
>>       * Force a reset of the XIVE backend after migration. The machine
>>       * defaults to XICS at startup.
>>       */
>>      if (spapr_ovec_test(spapr->ov5_cas, OV5_XIVE_EXPLOIT)) {
>> +        if (kvm_enabled() && machine_kernel_irqchip_allowed(machine)) {
>> +            xics_kvm_disconnect(spapr, &error_fatal);
>> +        }
>>          spapr_irq_xive.reset(spapr, &error_fatal);
>>      }
>>  
>> @@ -581,12 +595,31 @@ static int spapr_irq_post_load_dual(sPAPRMachineState *spapr, int version_id)
>>  
>>  static void spapr_irq_reset_dual(sPAPRMachineState *spapr, Error **errp)
>>  {
>> +    MachineState *machine = MACHINE(spapr);
>> +    Error *local_err = NULL;
>> +
>>      /*
>>       * Deactivate the XIVE MMIOs. The XIVE backend will reenable them
>>       * if selected.
>>       */
>>      spapr_xive_mmio_set_enabled(spapr->xive, false);
>>  
>> +    /* Destroy all KVM devices */
>> +    if (kvm_enabled() && machine_kernel_irqchip_allowed(machine)) {
>> +        xics_kvm_disconnect(spapr, &local_err);
>> +        if (local_err) {
>> +            error_propagate(errp, local_err);
>> +            error_prepend(errp, "KVM XICS disconnect failed: ");
>> +            return;
>> +        }
>> +        kvmppc_xive_disconnect(spapr->xive, &local_err);
>> +        if (local_err) {
>> +            error_propagate(errp, local_err);
>> +            error_prepend(errp, "KVM XIVE disconnect failed: ");
>> +            return;
>> +        }
>> +    }
>> +
>>      spapr_irq_current(spapr)->reset(spapr, errp);
>>  }
>>  
> 

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [Qemu-devel] [PATCH 11/13] spapr: check for the activation of the KVM IRQ device
  2019-02-12  7:12     ` Cédric Le Goater
@ 2019-02-13  0:17       ` David Gibson
  0 siblings, 0 replies; 43+ messages in thread
From: David Gibson @ 2019-02-13  0:17 UTC (permalink / raw)
  To: Cédric Le Goater; +Cc: Benjamin Herrenschmidt, qemu-ppc, qemu-devel

[-- Attachment #1: Type: text/plain, Size: 3681 bytes --]

On Tue, Feb 12, 2019 at 08:12:28AM +0100, Cédric Le Goater wrote:
> On 2/12/19 2:01 AM, David Gibson wrote:
> > On Mon, Jan 07, 2019 at 07:39:44PM +0100, Cédric Le Goater wrote:
> >> The activation of the KVM IRQ device depends on the interrupt mode
> >> chosen at CAS time by the machine and some methods used at reset or by
> >> the migration need to be protected.
> >>
> >> Signed-off-by: Cédric Le Goater <clg@kaod.org>
> >> ---
> >>  hw/intc/spapr_xive_kvm.c | 28 ++++++++++++++++++++++++++++
> >>  hw/intc/xics_kvm.c       | 25 ++++++++++++++++++++++++-
> >>  2 files changed, 52 insertions(+), 1 deletion(-)
> >>
> >> diff --git a/hw/intc/spapr_xive_kvm.c b/hw/intc/spapr_xive_kvm.c
> >> index 93ea8e71047a..d35814c1992e 100644
> >> --- a/hw/intc/spapr_xive_kvm.c
> >> +++ b/hw/intc/spapr_xive_kvm.c
> >> @@ -95,9 +95,15 @@ static void kvmppc_xive_cpu_set_state(XiveTCTX *tctx, Error **errp)
> >>  
> >>  void kvmppc_xive_cpu_get_state(XiveTCTX *tctx, Error **errp)
> >>  {
> >> +    sPAPRXive *xive = SPAPR_MACHINE(qdev_get_machine())->xive;
> >>      uint64_t state[4] = { 0 };
> >>      int ret;
> >>  
> >> +    /* The KVM XIVE device is not in use */
> >> +    if (xive->fd == -1) {
> >> +        return;
> >> +    }
> >> +
> >>      ret = kvm_get_one_reg(tctx->cs, KVM_REG_PPC_NVT_STATE, state);
> >>      if (ret != 0) {
> >>          error_setg_errno(errp, errno,
> >> @@ -151,6 +157,11 @@ void kvmppc_xive_cpu_connect(XiveTCTX *tctx, Error **errp)
> >>      unsigned long vcpu_id;
> >>      int ret;
> >>  
> >> +    /* The KVM XIVE device is not in use */
> >> +    if (xive->fd == -1) {
> >> +        return;
> >> +    }
> >> +
> >>      /* Check if CPU was hot unplugged and replugged. */
> >>      if (kvm_cpu_is_enabled(tctx->cs)) {
> >>          return;
> >> @@ -234,9 +245,13 @@ static void kvmppc_xive_source_get_state(XiveSource *xsrc)
> >>  void kvmppc_xive_source_set_irq(void *opaque, int srcno, int val)
> >>  {
> >>      XiveSource *xsrc = opaque;
> >> +    sPAPRXive *xive = SPAPR_XIVE(xsrc->xive);
> >>      struct kvm_irq_level args;
> >>      int rc;
> >>  
> >> +    /* The KVM XIVE device should be in use */
> >> +    assert(xive->fd != -1);
> >> +
> >>      args.irq = srcno;
> >>      if (!xive_source_irq_is_lsi(xsrc, srcno)) {
> >>          if (!val) {
> >> @@ -580,6 +595,11 @@ int kvmppc_xive_pre_save(sPAPRXive *xive)
> >>      Error *local_err = NULL;
> >>      CPUState *cs;
> >>  
> >> +    /* The KVM XIVE device is not in use */
> >> +    if (xive->fd == -1) {
> >> +        return 0;
> >> +    }
> >> +
> >>      /* Grab the EAT */
> >>      kvmppc_xive_get_eas_state(xive, &local_err);
> >>      if (local_err) {
> >> @@ -612,6 +632,9 @@ int kvmppc_xive_post_load(sPAPRXive *xive, int version_id)
> >>      Error *local_err = NULL;
> >>      CPUState *cs;
> >>  
> >> +    /* The KVM XIVE device should be in use */
> >> +    assert(xive->fd != -1);
> > 
> > I'm guessing this is an assert() because the handler shouldn't be
> > registered when we're not in KVM mode.  But wouldn't that also be true
> > of the pre_save hook, which errors out rather than asserting?
> 
> > The handlers are not symmetric.
> 
> > The pre_save is registered in the vmstate of the sPAPRXive model and the
> > post_load is handled at the machine level after all XIVE state has been
> > transferred.

Ah, ok.  Some comments on site explaining why that's so would be useful.

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [Qemu-devel] [PATCH 13/13] spapr: add KVM support to the 'dual' machine
  2019-02-12  7:18     ` Cédric Le Goater
@ 2019-02-13  1:32       ` David Gibson
  2019-02-13  8:22         ` Cédric Le Goater
  2019-02-22 12:36         ` Cédric Le Goater
  0 siblings, 2 replies; 43+ messages in thread
From: David Gibson @ 2019-02-13  1:32 UTC (permalink / raw)
  To: Cédric Le Goater; +Cc: Benjamin Herrenschmidt, qemu-ppc, qemu-devel

[-- Attachment #1: Type: text/plain, Size: 13885 bytes --]

On Tue, Feb 12, 2019 at 08:18:19AM +0100, Cédric Le Goater wrote:
> On 2/12/19 2:11 AM, David Gibson wrote:
> > On Mon, Jan 07, 2019 at 07:39:46PM +0100, Cédric Le Goater wrote:
> >> The interrupt mode is chosen by the CAS negotiation process and
> >> activated after a reset to take into account the required changes in
> >> the machine. This brings new constraints on how the associated KVM IRQ
> >> device is initialized.
> >>
> >> Currently, each model takes care of the initialization of the KVM
> >> device in their realize method but this is not possible anymore as the
> > >> initialization needs to be done globally when the interrupt mode is
> > >> known, i.e. when the machine is reset. It also means that we need a way
> >> to delete a KVM device when another mode is chosen.
> >>
> >> Also, to support migration, the QEMU objects holding the state to
> >> transfer should always be available but not necessarily activated.
> >>
> >> The overall approach of this proposal is to initialize both interrupt
> >> mode at the QEMU level and keep the IRQ number space in sync to allow
> >> switching from one mode to another. For the KVM side of things, the
> >> whole initialization of the KVM device, sources and presenters, is
> >> grouped in a single routine. The XICS and XIVE sPAPR IRQ reset
> >> handlers are modified accordingly to handle the init and the delete
> >> sequences of the KVM device.
> >>
> > >> As KVM is now initialized at reset, we lose the possibility to
> > >> fall back to the QEMU emulated mode in case of failure and failures
> > >> become fatal to the machine.
> >>
> >> Signed-off-by: Cédric Le Goater <clg@kaod.org>
> >> ---
> >>  hw/intc/spapr_xive.c     |  8 +---
> >>  hw/intc/spapr_xive_kvm.c | 27 ++++++++++++++
> >>  hw/intc/xics_kvm.c       | 25 +++++++++++++
> >>  hw/intc/xive.c           |  4 --
> >>  hw/ppc/spapr_irq.c       | 79 ++++++++++++++++++++++++++++------------
> >>  5 files changed, 109 insertions(+), 34 deletions(-)
> >>
> >> diff --git a/hw/intc/spapr_xive.c b/hw/intc/spapr_xive.c
> >> index 21f3c1ef0901..0661aca35900 100644
> >> --- a/hw/intc/spapr_xive.c
> >> +++ b/hw/intc/spapr_xive.c
> >> @@ -330,13 +330,7 @@ static void spapr_xive_realize(DeviceState *dev, Error **errp)
> >>      xive->eat = g_new0(XiveEAS, xive->nr_irqs);
> >>      xive->endt = g_new0(XiveEND, xive->nr_ends);
> >>  
> >> -    if (kvmppc_xive_enabled()) {
> >> -        kvmppc_xive_connect(xive, &local_err);
> >> -        if (local_err) {
> >> -            error_propagate(errp, local_err);
> >> -            return;
> >> -        }
> >> -    } else {
> >> +    if (!kvmppc_xive_enabled()) {
> >>          /* TIMA initialization */
> >>          memory_region_init_io(&xive->tm_mmio, OBJECT(xive), &xive_tm_ops, xive,
> >>                                "xive.tima", 4ull << TM_SHIFT);
> >> diff --git a/hw/intc/spapr_xive_kvm.c b/hw/intc/spapr_xive_kvm.c
> >> index d35814c1992e..3ebc947f2be7 100644
> >> --- a/hw/intc/spapr_xive_kvm.c
> >> +++ b/hw/intc/spapr_xive_kvm.c
> >> @@ -737,6 +737,15 @@ void kvmppc_xive_connect(sPAPRXive *xive, Error **errp)
> >>      Error *local_err = NULL;
> >>      size_t esb_len;
> >>      size_t tima_len;
> >> +    CPUState *cs;
> >> +
> >> +    /*
> >> +     * The KVM XIVE device already in use. This is the case when
> >> +     * rebooting XIVE -> XIVE
> > 
> > Can this case actually occur?  Further down you appear to
> > unconditionally destroy both KVM devices at reset time.
> 
> I guess you are right. I will check.
> 
> >> +     */
> >> +    if (xive->fd != -1) {
> >> +        return;
> >> +    }
> >>  
> >>      if (!kvm_enabled() || !kvmppc_has_cap_xive()) {
> >>          error_setg(errp, "IRQ_XIVE capability must be present for KVM");
> >> @@ -800,6 +809,24 @@ void kvmppc_xive_connect(sPAPRXive *xive, Error **errp)
> >>      xive->change = qemu_add_vm_change_state_handler(
> >>          kvmppc_xive_change_state_handler, xive);
> >>  
> >> +    /* Connect the presenters to the initial VCPUs of the machine */
> >> +    CPU_FOREACH(cs) {
> >> +        PowerPCCPU *cpu = POWERPC_CPU(cs);
> >> +
> >> +        kvmppc_xive_cpu_connect(cpu->tctx, &local_err);
> >> +        if (local_err) {
> >> +            error_propagate(errp, local_err);
> >> +            return;
> >> +        }
> >> +    }
> >> +
> >> +    /* Update the KVM sources */
> >> +    kvmppc_xive_source_reset(xsrc, &local_err);
> >> +    if (local_err) {
> >> +            error_propagate(errp, local_err);
> >> +            return;
> >> +    }
> >> +
> >>      kvm_kernel_irqchip = true;
> >>      kvm_msi_via_irqfd_allowed = true;
> >>      kvm_gsi_direct_mapping = true;
> >> diff --git a/hw/intc/xics_kvm.c b/hw/intc/xics_kvm.c
> >> index 1d21ff217b82..bfc35d71df7f 100644
> >> --- a/hw/intc/xics_kvm.c
> >> +++ b/hw/intc/xics_kvm.c
> >> @@ -448,6 +448,16 @@ static void rtas_dummy(PowerPCCPU *cpu, sPAPRMachineState *spapr,
> >>  int xics_kvm_init(sPAPRMachineState *spapr, Error **errp)
> >>  {
> >>      int rc;
> >> +    CPUState *cs;
> >> +    Error *local_err = NULL;
> >> +
> >> +    /*
> >> +     * The KVM XICS device already in use. This is the case when
> >> +     * rebooting XICS -> XICS
> >> +     */
> >> +    if (kernel_xics_fd != -1) {
> >> +        return 0;
> >> +    }
> >>  
> >>      if (!kvm_enabled() || !kvm_check_extension(kvm_state, KVM_CAP_IRQ_XICS)) {
> >>          error_setg(errp,
> >> @@ -496,6 +506,21 @@ int xics_kvm_init(sPAPRMachineState *spapr, Error **errp)
> >>      kvm_msi_via_irqfd_allowed = true;
> >>      kvm_gsi_direct_mapping = true;
> >>  
> >> +    /* Connect the presenters to the initial VCPUs of the machine */
> >> +    CPU_FOREACH(cs) {
> >> +        PowerPCCPU *cpu = POWERPC_CPU(cs);
> >> +
> >> +        icp_kvm_connect(cpu->icp, &local_err);
> >> +        if (local_err) {
> >> +            error_propagate(errp, local_err);
> >> +            goto fail;
> >> +        }
> >> +        icp_set_kvm_state(cpu->icp, 1);
> >> +    }
> >> +
> >> +    /* Update the KVM sources */
> >> +    ics_set_kvm_state(ICS_KVM(spapr->ics), 1);
> >> +
> >>      return 0;
> >>  
> >>  fail:
> >> diff --git a/hw/intc/xive.c b/hw/intc/xive.c
> >> index c5c2fbc3f8bc..c166eab5b210 100644
> >> --- a/hw/intc/xive.c
> >> +++ b/hw/intc/xive.c
> >> @@ -932,10 +932,6 @@ static void xive_source_reset(void *dev)
> >>  
> >>      /* PQs are initialized to 0b01 (Q=1) which corresponds to "ints off" */
> >>      memset(xsrc->status, XIVE_ESB_OFF, xsrc->nr_irqs);
> >> -
> >> -    if (kvmppc_xive_enabled()) {
> >> -        kvmppc_xive_source_reset(xsrc, &error_fatal);
> >> -    }
> >>  }
> >>  
> >>  static void xive_source_realize(DeviceState *dev, Error **errp)
> >> diff --git a/hw/ppc/spapr_irq.c b/hw/ppc/spapr_irq.c
> >> index ba27d9d8e972..5592eec3787b 100644
> >> --- a/hw/ppc/spapr_irq.c
> >> +++ b/hw/ppc/spapr_irq.c
> >> @@ -98,20 +98,14 @@ static void spapr_irq_init_xics(sPAPRMachineState *spapr, Error **errp)
> >>      int nr_irqs = spapr->irq->nr_irqs;
> >>      Error *local_err = NULL;
> >>  
> >> -    if (kvm_enabled()) {
> >> -        if (machine_kernel_irqchip_allowed(machine) &&
> >> -            !xics_kvm_init(spapr, &local_err)) {
> >> -            spapr->icp_type = TYPE_KVM_ICP;
> >> -            spapr->ics = spapr_ics_create(spapr, TYPE_ICS_KVM, nr_irqs,
> >> -                                          &local_err);
> >> -        }
> >> -        if (machine_kernel_irqchip_required(machine) && !spapr->ics) {
> >> -            error_prepend(&local_err,
> >> -                          "kernel_irqchip requested but unavailable: ");
> >> -            goto error;
> > 
> > I don't see anything that replaces the irqchip_required logic, which
> > doesn't seem right.
> 
> > Yes. We do lose the ability to fall back to the emulated device in case
> > of failure. It is not impossible to do but it will require more changes
> > to check what the KVM capabilities are before starting the machine.

Uh... it seems more like it's the other way around.  We'll always fall
back to emulated, even if we've explicitly said on the command line
that we don't want that.

> Nevertheless, any failure in reset when setting the KVM backend will
> result in machine abort.
> 
> C.       
> 
> > 
> >> +    if (kvm_enabled() && machine_kernel_irqchip_allowed(machine)) {
> >> +        spapr->icp_type = TYPE_KVM_ICP;
> >> +        spapr->ics = spapr_ics_create(spapr, TYPE_ICS_KVM, nr_irqs,
> >> +                                      &local_err);
> >> +        if (local_err) {
> >> +            error_propagate(errp, local_err);
> >> +            return;
> >>          }
> >> -        error_free(local_err);
> >> -        local_err = NULL;
> >>      }
> >>  
> >>      if (!spapr->ics) {
> >> @@ -119,10 +113,11 @@ static void spapr_irq_init_xics(sPAPRMachineState *spapr, Error **errp)
> >>          spapr->icp_type = TYPE_ICP;
> >>          spapr->ics = spapr_ics_create(spapr, TYPE_ICS_SIMPLE, nr_irqs,
> >>                                        &local_err);
> >> +        if (local_err) {
> >> +            error_propagate(errp, local_err);
> >> +            return;
> >> +        }
> >>      }
> >> -
> >> -error:
> >> -    error_propagate(errp, local_err);
> >>  }
> >>  
> >>  #define ICS_IRQ_FREE(ics, srcno)   \
> >> @@ -233,7 +228,17 @@ static void spapr_irq_set_irq_xics(void *opaque, int srcno, int val)
> >>  
> >>  static void spapr_irq_reset_xics(sPAPRMachineState *spapr, Error **errp)
> >>  {
> >> -    /* TODO: create the KVM XICS device */
> >> +    MachineState *machine = MACHINE(spapr);
> >> +    Error *local_err = NULL;
> >> +
> >> +    if (kvm_enabled() && machine_kernel_irqchip_allowed(machine)) {
> >> +        xics_kvm_init(spapr, &local_err);
> >> +        if (local_err) {
> >> +            error_propagate(errp, local_err);
> >> +            error_prepend(errp, "KVM XICS connect failed: ");
> >> +            return;
> >> +        }
> >> +    }
> >>  }
> >>  
> >>  #define SPAPR_IRQ_XICS_NR_IRQS     0x1000
> >> @@ -393,6 +398,7 @@ static int spapr_irq_post_load_xive(sPAPRMachineState *spapr, int version_id)
> >>  static void spapr_irq_reset_xive(sPAPRMachineState *spapr, Error **errp)
> >>  {
> >>      CPUState *cs;
> >> +    Error *local_err = NULL;
> >>  
> >>      CPU_FOREACH(cs) {
> >>          PowerPCCPU *cpu = POWERPC_CPU(cs);
> >> @@ -401,6 +407,15 @@ static void spapr_irq_reset_xive(sPAPRMachineState *spapr, Error **errp)
> >>          spapr_xive_set_tctx_os_cam(cpu->tctx);
> >>      }
> >>  
> >> +    if (kvmppc_xive_enabled()) {
> >> +        kvmppc_xive_connect(spapr->xive, &local_err);
> >> +        if (local_err) {
> >> +            error_propagate(errp, local_err);
> >> +            error_prepend(errp, "KVM XIVE connect failed: ");
> >> +            return;
> >> +        }
> >> +    }
> >> +
> >>      /* Activate the XIVE MMIOs */
> >>      spapr_xive_mmio_set_enabled(spapr->xive, true);
> >>  }
> >> @@ -462,14 +477,8 @@ static sPAPRIrq *spapr_irq_current(sPAPRMachineState *spapr)
> >>  
> >>  static void spapr_irq_init_dual(sPAPRMachineState *spapr, Error **errp)
> >>  {
> >> -    MachineState *machine = MACHINE(spapr);
> >>      Error *local_err = NULL;
> >>  
> >> -    if (kvm_enabled() && machine_kernel_irqchip_allowed(machine)) {
> >> -        error_setg(errp, "No KVM support for the 'dual' machine");
> >> -        return;
> >> -    }
> >> -
> >>      spapr_irq_xics.init(spapr, &local_err);
> >>      if (local_err) {
> >>          error_propagate(errp, local_err);
> >> @@ -568,11 +577,16 @@ static void spapr_irq_cpu_intc_create_dual(sPAPRMachineState *spapr,
> >>  
> >>  static int spapr_irq_post_load_dual(sPAPRMachineState *spapr, int version_id)
> >>  {
> >> +    MachineState *machine = MACHINE(spapr);
> >> +
> >>      /*
> >>       * Force a reset of the XIVE backend after migration. The machine
> >>       * defaults to XICS at startup.
> >>       */
> >>      if (spapr_ovec_test(spapr->ov5_cas, OV5_XIVE_EXPLOIT)) {
> >> +        if (kvm_enabled() && machine_kernel_irqchip_allowed(machine)) {
> >> +            xics_kvm_disconnect(spapr, &error_fatal);
> >> +        }
> >>          spapr_irq_xive.reset(spapr, &error_fatal);
> >>      }
> >>  
> >> @@ -581,12 +595,31 @@ static int spapr_irq_post_load_dual(sPAPRMachineState *spapr, int version_id)
> >>  
> >>  static void spapr_irq_reset_dual(sPAPRMachineState *spapr, Error **errp)
> >>  {
> >> +    MachineState *machine = MACHINE(spapr);
> >> +    Error *local_err = NULL;
> >> +
> >>      /*
> >>       * Deactivate the XIVE MMIOs. The XIVE backend will reenable them
> >>       * if selected.
> >>       */
> >>      spapr_xive_mmio_set_enabled(spapr->xive, false);
> >>  
> >> +    /* Destroy all KVM devices */
> >> +    if (kvm_enabled() && machine_kernel_irqchip_allowed(machine)) {
> >> +        xics_kvm_disconnect(spapr, &local_err);
> >> +        if (local_err) {
> >> +            error_propagate(errp, local_err);
> >> +            error_prepend(errp, "KVM XICS disconnect failed: ");
> >> +            return;
> >> +        }
> >> +        kvmppc_xive_disconnect(spapr->xive, &local_err);
> >> +        if (local_err) {
> >> +            error_propagate(errp, local_err);
> >> +            error_prepend(errp, "KVM XIVE disconnect failed: ");
> >> +            return;
> >> +        }
> >> +    }
> >> +
> >>      spapr_irq_current(spapr)->reset(spapr, errp);
> >>  }
> >>  
> > 
> 

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [Qemu-devel] [PATCH 12/13] spapr/xics: ignore the lower 4K in the IRQ number space
  2019-02-12  7:05     ` Cédric Le Goater
@ 2019-02-13  1:33       ` David Gibson
  2019-02-13  8:03         ` Cédric Le Goater
  0 siblings, 1 reply; 43+ messages in thread
From: David Gibson @ 2019-02-13  1:33 UTC (permalink / raw)
  To: Cédric Le Goater; +Cc: Benjamin Herrenschmidt, qemu-ppc, qemu-devel

[-- Attachment #1: Type: text/plain, Size: 3200 bytes --]

On Tue, Feb 12, 2019 at 08:05:53AM +0100, Cédric Le Goater wrote:
> On 2/12/19 2:06 AM, David Gibson wrote:
> > On Mon, Jan 07, 2019 at 07:39:45PM +0100, Cédric Le Goater wrote:
> >> The IRQ number space of the XIVE and XICS interrupt mode are aligned
> >> when using the dual interrupt mode for the machine. This means that
> >> the ICS offset is set to zero in QEMU and that the KVM XICS device
> > >> should be informed of this new value. Unfortunately, there is no way
> > >> to do so and KVM still maintains the XICS_IRQ_BASE (0x1000) offset.
> >>
> >> Ignore the lower 4K which are not used under the XICS interrupt
> >> mode. These IRQ numbers are only claimed by XIVE for the CPU IPIs.
> >>
> >> Signed-off-by: Cédric Le Goater <clg@kaod.org>
> >> ---
> >>  hw/intc/xics_kvm.c | 18 ++++++++++++++++++
> >>  1 file changed, 18 insertions(+)
> >>
> >> diff --git a/hw/intc/xics_kvm.c b/hw/intc/xics_kvm.c
> >> index 651bbfdf6966..1d21ff217b82 100644
> >> --- a/hw/intc/xics_kvm.c
> >> +++ b/hw/intc/xics_kvm.c
> >> @@ -238,6 +238,15 @@ static void ics_get_kvm_state(ICSState *ics)
> >>      for (i = 0; i < ics->nr_irqs; i++) {
> >>          ICSIRQState *irq = &ics->irqs[i];
> >>  
> >> +        /*
> >> +         * The KVM XICS device considers that the IRQ numbers should
> >> +         * start at XICS_IRQ_BASE (0x1000). Ignore the lower 4K
> >> +         * numbers (only claimed by XIVE for the CPU IPIs).
> >> +         */
> >> +        if (i + ics->offset < XICS_IRQ_BASE) {
> >> +            continue;
> >> +        }
> >> +
> > 
> > > This seems bogus to me.  The guest-visible irq numbers need to line up
> > > between xics and xive mode, yes, but that doesn't mean we need to keep
> > > around a great big array of unused ICS irq states, even in
> > > TCG mode.
> 
> This is because the qirqs[] array is under the machine and shared between 
> both interrupt modes, xics and xive.

I don't see how that follows.  ICSIRQState is indexed in terms of the
ICS source number, not the global irq number, so I don't see why it
has to match up with the qirq array.

> 
> C.
> 
> > 
> >>          kvm_device_access(kernel_xics_fd, KVM_DEV_XICS_GRP_SOURCES,
> >>                            i + ics->offset, &state, false, &error_fatal);
> >>  
> >> @@ -303,6 +312,15 @@ static int ics_set_kvm_state(ICSState *ics, int version_id)
> >>          ICSIRQState *irq = &ics->irqs[i];
> >>          int ret;
> >>  
> >> +        /*
> >> +         * The KVM XICS device considers that the IRQ numbers should
> >> +         * start at XICS_IRQ_BASE (0x1000). Ignore the lower 4K
> >> +         * numbers (only claimed by XIVE for the CPU IPIs).
> >> +         */
> >> +        if (i + ics->offset < XICS_IRQ_BASE) {
> >> +            continue;
> >> +        }
> >> +
> >>          state = irq->server;
> >>          state |= (uint64_t)(irq->saved_priority & KVM_XICS_PRIORITY_MASK)
> >>              << KVM_XICS_PRIORITY_SHIFT;
> > 
> 

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [Qemu-devel] [PATCH 12/13] spapr/xics: ignore the lower 4K in the IRQ number space
  2019-02-13  1:33       ` David Gibson
@ 2019-02-13  8:03         ` Cédric Le Goater
  2019-02-13 11:27           ` [Qemu-devel] [Qemu-ppc] " Greg Kurz
  0 siblings, 1 reply; 43+ messages in thread
From: Cédric Le Goater @ 2019-02-13  8:03 UTC (permalink / raw)
  To: David Gibson; +Cc: Benjamin Herrenschmidt, qemu-ppc, qemu-devel

On 2/13/19 2:33 AM, David Gibson wrote:
> On Tue, Feb 12, 2019 at 08:05:53AM +0100, Cédric Le Goater wrote:
>> On 2/12/19 2:06 AM, David Gibson wrote:
>>> On Mon, Jan 07, 2019 at 07:39:45PM +0100, Cédric Le Goater wrote:
>>>> The IRQ number space of the XIVE and XICS interrupt mode are aligned
>>>> when using the dual interrupt mode for the machine. This means that
>>>> the ICS offset is set to zero in QEMU and that the KVM XICS device
>>>> should be informed of this new value. Unfortunately, there is no way
>>>> to do so and KVM still maintains the XICS_IRQ_BASE (0x1000) offset.
>>>>
>>>> Ignore the lower 4K which are not used under the XICS interrupt
>>>> mode. These IRQ numbers are only claimed by XIVE for the CPU IPIs.
>>>>
>>>> Signed-off-by: Cédric Le Goater <clg@kaod.org>
>>>> ---
>>>>  hw/intc/xics_kvm.c | 18 ++++++++++++++++++
>>>>  1 file changed, 18 insertions(+)
>>>>
>>>> diff --git a/hw/intc/xics_kvm.c b/hw/intc/xics_kvm.c
>>>> index 651bbfdf6966..1d21ff217b82 100644
>>>> --- a/hw/intc/xics_kvm.c
>>>> +++ b/hw/intc/xics_kvm.c
>>>> @@ -238,6 +238,15 @@ static void ics_get_kvm_state(ICSState *ics)
>>>>      for (i = 0; i < ics->nr_irqs; i++) {
>>>>          ICSIRQState *irq = &ics->irqs[i];
>>>>  
>>>> +        /*
>>>> +         * The KVM XICS device considers that the IRQ numbers should
>>>> +         * start at XICS_IRQ_BASE (0x1000). Ignore the lower 4K
>>>> +         * numbers (only claimed by XIVE for the CPU IPIs).
>>>> +         */
>>>> +        if (i + ics->offset < XICS_IRQ_BASE) {
>>>> +            continue;
>>>> +        }
>>>> +
>>>
>>> This seems bogus to me.  The guest-visible irq numbers need to line up
>>> between xics and xive mode, yes, but that doesn't mean we need to keep
>>> around a great big array of unused ICS irq states, even in
>>> TCG mode.
>>
>> This is because the qirqs[] array is under the machine and shared between 
>> both interrupt modes, xics and xive.
> 
> I don't see how that follows.  ICSIRQState is indexed in terms of the
> ICS source number, not the global irq number, so I don't see why it
> has to match up with the qirq array.

The root cause is the use of spapr->irq->nr_irqs to initialize the ICS
and sPAPRXive objects. In the case of the 'dual' backend, it covers the full
XIVE IRQ number space (0x2000 today) but XICS only needs 0x1000.

I think we can fix the offset issue by using the appropriate nr_irqs,
which for the XICS backend should be: spapr->irq->nr_irqs - ics->offset


I keep in mind the XIVE support for nested guests and I think we will
need to extend the IRQ number space in L1 and have the L2 use a portion
of it (using an offset).     

C.
 
>>>
>>>>          kvm_device_access(kernel_xics_fd, KVM_DEV_XICS_GRP_SOURCES,
>>>>                            i + ics->offset, &state, false, &error_fatal);
>>>>  
>>>> @@ -303,6 +312,15 @@ static int ics_set_kvm_state(ICSState *ics, int version_id)
>>>>          ICSIRQState *irq = &ics->irqs[i];
>>>>          int ret;
>>>>  
>>>> +        /*
>>>> +         * The KVM XICS device considers that the IRQ numbers should
>>>> +         * start at XICS_IRQ_BASE (0x1000). Ignore the lower 4K
>>>> +         * numbers (only claimed by XIVE for the CPU IPIs).
>>>> +         */
>>>> +        if (i + ics->offset < XICS_IRQ_BASE) {
>>>> +            continue;
>>>> +        }
>>>> +
>>>>          state = irq->server;
>>>>          state |= (uint64_t)(irq->saved_priority & KVM_XICS_PRIORITY_MASK)
>>>>              << KVM_XICS_PRIORITY_SHIFT;
>>>
>>
> 

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [Qemu-devel] [PATCH 13/13] spapr: add KVM support to the 'dual' machine
  2019-02-13  1:32       ` David Gibson
@ 2019-02-13  8:22         ` Cédric Le Goater
  2019-02-13 10:07           ` [Qemu-devel] [Qemu-ppc] " Greg Kurz
  2019-02-14  3:29           ` [Qemu-devel] " David Gibson
  2019-02-22 12:36         ` Cédric Le Goater
  1 sibling, 2 replies; 43+ messages in thread
From: Cédric Le Goater @ 2019-02-13  8:22 UTC (permalink / raw)
  To: David Gibson; +Cc: Benjamin Herrenschmidt, qemu-ppc, qemu-devel

On 2/13/19 2:32 AM, David Gibson wrote:
> On Tue, Feb 12, 2019 at 08:18:19AM +0100, Cédric Le Goater wrote:
>> On 2/12/19 2:11 AM, David Gibson wrote:
>>> On Mon, Jan 07, 2019 at 07:39:46PM +0100, Cédric Le Goater wrote:
>>>> The interrupt mode is chosen by the CAS negotiation process and
>>>> activated after a reset to take into account the required changes in
>>>> the machine. This brings new constraints on how the associated KVM IRQ
>>>> device is initialized.
>>>>
>>>> Currently, each model takes care of the initialization of the KVM
>>>> device in their realize method but this is not possible anymore as the
>>>> initialization needs to be done globaly when the interrupt mode is
>>>> known, i.e. when machine is reseted. It also means that we need a way
>>>> to delete a KVM device when another mode is chosen.
>>>>
>>>> Also, to support migration, the QEMU objects holding the state to
>>>> transfer should always be available but not necessarily activated.
>>>>
>>>> The overall approach of this proposal is to initialize both interrupt
>>>> mode at the QEMU level and keep the IRQ number space in sync to allow
>>>> switching from one mode to another. For the KVM side of things, the
>>>> whole initialization of the KVM device, sources and presenters, is
>>>> grouped in a single routine. The XICS and XIVE sPAPR IRQ reset
>>>> handlers are modified accordingly to handle the init and the delete
>>>> sequences of the KVM device.
>>>>
>>>> As KVM is now initialized at reset, we loose the possiblity to
>>>> fallback to the QEMU emulated mode in case of failure and failures
>>>> become fatal to the machine.
>>>>
>>>> Signed-off-by: Cédric Le Goater <clg@kaod.org>
>>>> ---
>>>>  hw/intc/spapr_xive.c     |  8 +---
>>>>  hw/intc/spapr_xive_kvm.c | 27 ++++++++++++++
>>>>  hw/intc/xics_kvm.c       | 25 +++++++++++++
>>>>  hw/intc/xive.c           |  4 --
>>>>  hw/ppc/spapr_irq.c       | 79 ++++++++++++++++++++++++++++------------
>>>>  5 files changed, 109 insertions(+), 34 deletions(-)
>>>>
>>>> diff --git a/hw/intc/spapr_xive.c b/hw/intc/spapr_xive.c
>>>> index 21f3c1ef0901..0661aca35900 100644
>>>> --- a/hw/intc/spapr_xive.c
>>>> +++ b/hw/intc/spapr_xive.c
>>>> @@ -330,13 +330,7 @@ static void spapr_xive_realize(DeviceState *dev, Error **errp)
>>>>      xive->eat = g_new0(XiveEAS, xive->nr_irqs);
>>>>      xive->endt = g_new0(XiveEND, xive->nr_ends);
>>>>  
>>>> -    if (kvmppc_xive_enabled()) {
>>>> -        kvmppc_xive_connect(xive, &local_err);
>>>> -        if (local_err) {
>>>> -            error_propagate(errp, local_err);
>>>> -            return;
>>>> -        }
>>>> -    } else {
>>>> +    if (!kvmppc_xive_enabled()) {
>>>>          /* TIMA initialization */
>>>>          memory_region_init_io(&xive->tm_mmio, OBJECT(xive), &xive_tm_ops, xive,
>>>>                                "xive.tima", 4ull << TM_SHIFT);
>>>> diff --git a/hw/intc/spapr_xive_kvm.c b/hw/intc/spapr_xive_kvm.c
>>>> index d35814c1992e..3ebc947f2be7 100644
>>>> --- a/hw/intc/spapr_xive_kvm.c
>>>> +++ b/hw/intc/spapr_xive_kvm.c
>>>> @@ -737,6 +737,15 @@ void kvmppc_xive_connect(sPAPRXive *xive, Error **errp)
>>>>      Error *local_err = NULL;
>>>>      size_t esb_len;
>>>>      size_t tima_len;
>>>> +    CPUState *cs;
>>>> +
>>>> +    /*
>>>> +     * The KVM XIVE device already in use. This is the case when
>>>> +     * rebooting XIVE -> XIVE
>>>
>>> Can this case actually occur?  Further down you appear to
>>> unconditionally destroy both KVM devices at reset time.
>>
>> I guess you are right. I will check.
>>
>>>> +     */
>>>> +    if (xive->fd != -1) {
>>>> +        return;
>>>> +    }
>>>>  
>>>>      if (!kvm_enabled() || !kvmppc_has_cap_xive()) {
>>>>          error_setg(errp, "IRQ_XIVE capability must be present for KVM");
>>>> @@ -800,6 +809,24 @@ void kvmppc_xive_connect(sPAPRXive *xive, Error **errp)
>>>>      xive->change = qemu_add_vm_change_state_handler(
>>>>          kvmppc_xive_change_state_handler, xive);
>>>>  
>>>> +    /* Connect the presenters to the initial VCPUs of the machine */
>>>> +    CPU_FOREACH(cs) {
>>>> +        PowerPCCPU *cpu = POWERPC_CPU(cs);
>>>> +
>>>> +        kvmppc_xive_cpu_connect(cpu->tctx, &local_err);
>>>> +        if (local_err) {
>>>> +            error_propagate(errp, local_err);
>>>> +            return;
>>>> +        }
>>>> +    }
>>>> +
>>>> +    /* Update the KVM sources */
>>>> +    kvmppc_xive_source_reset(xsrc, &local_err);
>>>> +    if (local_err) {
>>>> +            error_propagate(errp, local_err);
>>>> +            return;
>>>> +    }
>>>> +
>>>>      kvm_kernel_irqchip = true;
>>>>      kvm_msi_via_irqfd_allowed = true;
>>>>      kvm_gsi_direct_mapping = true;
>>>> diff --git a/hw/intc/xics_kvm.c b/hw/intc/xics_kvm.c
>>>> index 1d21ff217b82..bfc35d71df7f 100644
>>>> --- a/hw/intc/xics_kvm.c
>>>> +++ b/hw/intc/xics_kvm.c
>>>> @@ -448,6 +448,16 @@ static void rtas_dummy(PowerPCCPU *cpu, sPAPRMachineState *spapr,
>>>>  int xics_kvm_init(sPAPRMachineState *spapr, Error **errp)
>>>>  {
>>>>      int rc;
>>>> +    CPUState *cs;
>>>> +    Error *local_err = NULL;
>>>> +
>>>> +    /*
>>>> +     * The KVM XICS device already in use. This is the case when
>>>> +     * rebooting XICS -> XICS
>>>> +     */
>>>> +    if (kernel_xics_fd != -1) {
>>>> +        return 0;
>>>> +    }
>>>>  
>>>>      if (!kvm_enabled() || !kvm_check_extension(kvm_state, KVM_CAP_IRQ_XICS)) {
>>>>          error_setg(errp,
>>>> @@ -496,6 +506,21 @@ int xics_kvm_init(sPAPRMachineState *spapr, Error **errp)
>>>>      kvm_msi_via_irqfd_allowed = true;
>>>>      kvm_gsi_direct_mapping = true;
>>>>  
>>>> +    /* Connect the presenters to the initial VCPUs of the machine */
>>>> +    CPU_FOREACH(cs) {
>>>> +        PowerPCCPU *cpu = POWERPC_CPU(cs);
>>>> +
>>>> +        icp_kvm_connect(cpu->icp, &local_err);
>>>> +        if (local_err) {
>>>> +            error_propagate(errp, local_err);
>>>> +            goto fail;
>>>> +        }
>>>> +        icp_set_kvm_state(cpu->icp, 1);
>>>> +    }
>>>> +
>>>> +    /* Update the KVM sources */
>>>> +    ics_set_kvm_state(ICS_KVM(spapr->ics), 1);
>>>> +
>>>>      return 0;
>>>>  
>>>>  fail:
>>>> diff --git a/hw/intc/xive.c b/hw/intc/xive.c
>>>> index c5c2fbc3f8bc..c166eab5b210 100644
>>>> --- a/hw/intc/xive.c
>>>> +++ b/hw/intc/xive.c
>>>> @@ -932,10 +932,6 @@ static void xive_source_reset(void *dev)
>>>>  
>>>>      /* PQs are initialized to 0b01 (Q=1) which corresponds to "ints off" */
>>>>      memset(xsrc->status, XIVE_ESB_OFF, xsrc->nr_irqs);
>>>> -
>>>> -    if (kvmppc_xive_enabled()) {
>>>> -        kvmppc_xive_source_reset(xsrc, &error_fatal);
>>>> -    }
>>>>  }
>>>>  
>>>>  static void xive_source_realize(DeviceState *dev, Error **errp)
>>>> diff --git a/hw/ppc/spapr_irq.c b/hw/ppc/spapr_irq.c
>>>> index ba27d9d8e972..5592eec3787b 100644
>>>> --- a/hw/ppc/spapr_irq.c
>>>> +++ b/hw/ppc/spapr_irq.c
>>>> @@ -98,20 +98,14 @@ static void spapr_irq_init_xics(sPAPRMachineState *spapr, Error **errp)
>>>>      int nr_irqs = spapr->irq->nr_irqs;
>>>>      Error *local_err = NULL;
>>>>  
>>>> -    if (kvm_enabled()) {
>>>> -        if (machine_kernel_irqchip_allowed(machine) &&
>>>> -            !xics_kvm_init(spapr, &local_err)) {
>>>> -            spapr->icp_type = TYPE_KVM_ICP;
>>>> -            spapr->ics = spapr_ics_create(spapr, TYPE_ICS_KVM, nr_irqs,
>>>> -                                          &local_err);
>>>> -        }
>>>> -        if (machine_kernel_irqchip_required(machine) && !spapr->ics) {
>>>> -            error_prepend(&local_err,
>>>> -                          "kernel_irqchip requested but unavailable: ");
>>>> -            goto error;
>>>
>>> I don't see anything that replaces the irqchip_required logic, which
>>> doesn't seem right.
>>
>> Yes. We do loose the ability to fall back to the emulated device in case
>> of failure. It is not impossible to do but it will require more changes
>> to check what are the KVM capabilities before starting the machine.
> 
> Uh... it seems more like it's the other way around.  We'll always fall
> back to emulated, even if we've explicitly said on the command line
> that we don't want that.

Ah yes. The init function might also be broken.

XICS mode is a bit more difficult to handle than XIVE because we have
different object types for the KVM device and the QEMU emulated device,
and with the 'dual' mode, we activate the device at CAS reset time.

Since failures are handled at reset time, should we keep the same logic
and abort the machine at reset if the kernel irqchip is required?

But we won't be able to fall back on the QEMU emulated device if KVM
XICS fails and the kernel irqchip is only allowed (not required). It
should work for XIVE though.

Thanks,

C.


>> Nevertheless, any failure in reset when setting the KVM backend will
>> result in machine abort.
>>
>> C.       
>>
>>>
>>>> +    if (kvm_enabled() && machine_kernel_irqchip_allowed(machine)) {
>>>> +        spapr->icp_type = TYPE_KVM_ICP;
>>>> +        spapr->ics = spapr_ics_create(spapr, TYPE_ICS_KVM, nr_irqs,
>>>> +                                      &local_err);
>>>> +        if (local_err) {
>>>> +            error_propagate(errp, local_err);
>>>> +            return;
>>>>          }
>>>> -        error_free(local_err);
>>>> -        local_err = NULL;
>>>>      }
>>>>  
>>>>      if (!spapr->ics) {
>>>> @@ -119,10 +113,11 @@ static void spapr_irq_init_xics(sPAPRMachineState *spapr, Error **errp)
>>>>          spapr->icp_type = TYPE_ICP;
>>>>          spapr->ics = spapr_ics_create(spapr, TYPE_ICS_SIMPLE, nr_irqs,
>>>>                                        &local_err);
>>>> +        if (local_err) {
>>>> +            error_propagate(errp, local_err);
>>>> +            return;
>>>> +        }
>>>>      }
>>>> -
>>>> -error:
>>>> -    error_propagate(errp, local_err);
>>>>  }
>>>>  
>>>>  #define ICS_IRQ_FREE(ics, srcno)   \
>>>> @@ -233,7 +228,17 @@ static void spapr_irq_set_irq_xics(void *opaque, int srcno, int val)
>>>>  
>>>>  static void spapr_irq_reset_xics(sPAPRMachineState *spapr, Error **errp)
>>>>  {
>>>> -    /* TODO: create the KVM XICS device */
>>>> +    MachineState *machine = MACHINE(spapr);
>>>> +    Error *local_err = NULL;
>>>> +
>>>> +    if (kvm_enabled() && machine_kernel_irqchip_allowed(machine)) {
>>>> +        xics_kvm_init(spapr, &local_err);
>>>> +        if (local_err) {
>>>> +            error_propagate(errp, local_err);
>>>> +            error_prepend(errp, "KVM XICS connect failed: ");
>>>> +            return;
>>>> +        }
>>>> +    }
>>>>  }
>>>>  
>>>>  #define SPAPR_IRQ_XICS_NR_IRQS     0x1000
>>>> @@ -393,6 +398,7 @@ static int spapr_irq_post_load_xive(sPAPRMachineState *spapr, int version_id)
>>>>  static void spapr_irq_reset_xive(sPAPRMachineState *spapr, Error **errp)
>>>>  {
>>>>      CPUState *cs;
>>>> +    Error *local_err = NULL;
>>>>  
>>>>      CPU_FOREACH(cs) {
>>>>          PowerPCCPU *cpu = POWERPC_CPU(cs);
>>>> @@ -401,6 +407,15 @@ static void spapr_irq_reset_xive(sPAPRMachineState *spapr, Error **errp)
>>>>          spapr_xive_set_tctx_os_cam(cpu->tctx);
>>>>      }
>>>>  
>>>> +    if (kvmppc_xive_enabled()) {
>>>> +        kvmppc_xive_connect(spapr->xive, &local_err);
>>>> +        if (local_err) {
>>>> +            error_propagate(errp, local_err);
>>>> +            error_prepend(errp, "KVM XIVE connect failed: ");
>>>> +            return;
>>>> +        }
>>>> +    }
>>>> +
>>>>      /* Activate the XIVE MMIOs */
>>>>      spapr_xive_mmio_set_enabled(spapr->xive, true);
>>>>  }
>>>> @@ -462,14 +477,8 @@ static sPAPRIrq *spapr_irq_current(sPAPRMachineState *spapr)
>>>>  
>>>>  static void spapr_irq_init_dual(sPAPRMachineState *spapr, Error **errp)
>>>>  {
>>>> -    MachineState *machine = MACHINE(spapr);
>>>>      Error *local_err = NULL;
>>>>  
>>>> -    if (kvm_enabled() && machine_kernel_irqchip_allowed(machine)) {
>>>> -        error_setg(errp, "No KVM support for the 'dual' machine");
>>>> -        return;
>>>> -    }
>>>> -
>>>>      spapr_irq_xics.init(spapr, &local_err);
>>>>      if (local_err) {
>>>>          error_propagate(errp, local_err);
>>>> @@ -568,11 +577,16 @@ static void spapr_irq_cpu_intc_create_dual(sPAPRMachineState *spapr,
>>>>  
>>>>  static int spapr_irq_post_load_dual(sPAPRMachineState *spapr, int version_id)
>>>>  {
>>>> +    MachineState *machine = MACHINE(spapr);
>>>> +
>>>>      /*
>>>>       * Force a reset of the XIVE backend after migration. The machine
>>>>       * defaults to XICS at startup.
>>>>       */
>>>>      if (spapr_ovec_test(spapr->ov5_cas, OV5_XIVE_EXPLOIT)) {
>>>> +        if (kvm_enabled() && machine_kernel_irqchip_allowed(machine)) {
>>>> +            xics_kvm_disconnect(spapr, &error_fatal);
>>>> +        }
>>>>          spapr_irq_xive.reset(spapr, &error_fatal);
>>>>      }
>>>>  
>>>> @@ -581,12 +595,31 @@ static int spapr_irq_post_load_dual(sPAPRMachineState *spapr, int version_id)
>>>>  
>>>>  static void spapr_irq_reset_dual(sPAPRMachineState *spapr, Error **errp)
>>>>  {
>>>> +    MachineState *machine = MACHINE(spapr);
>>>> +    Error *local_err = NULL;
>>>> +
>>>>      /*
>>>>       * Deactivate the XIVE MMIOs. The XIVE backend will reenable them
>>>>       * if selected.
>>>>       */
>>>>      spapr_xive_mmio_set_enabled(spapr->xive, false);
>>>>  
>>>> +    /* Destroy all KVM devices */
>>>> +    if (kvm_enabled() && machine_kernel_irqchip_allowed(machine)) {
>>>> +        xics_kvm_disconnect(spapr, &local_err);
>>>> +        if (local_err) {
>>>> +            error_propagate(errp, local_err);
>>>> +            error_prepend(errp, "KVM XICS disconnect failed: ");
>>>> +            return;
>>>> +        }
>>>> +        kvmppc_xive_disconnect(spapr->xive, &local_err);
>>>> +        if (local_err) {
>>>> +            error_propagate(errp, local_err);
>>>> +            error_prepend(errp, "KVM XIVE disconnect failed: ");
>>>> +            return;
>>>> +        }
>>>> +    }
>>>> +
>>>>      spapr_irq_current(spapr)->reset(spapr, errp);
>>>>  }
>>>>  
>>>
>>
> 

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [Qemu-devel] [Qemu-ppc] [PATCH 13/13] spapr: add KVM support to the 'dual' machine
  2019-02-13  8:22         ` Cédric Le Goater
@ 2019-02-13 10:07           ` Greg Kurz
  2019-02-14  3:35             ` David Gibson
  2019-02-14  3:29           ` [Qemu-devel] " David Gibson
  1 sibling, 1 reply; 43+ messages in thread
From: Greg Kurz @ 2019-02-13 10:07 UTC (permalink / raw)
  To: Cédric Le Goater; +Cc: David Gibson, qemu-ppc, qemu-devel

On Wed, 13 Feb 2019 09:22:46 +0100
Cédric Le Goater <clg@kaod.org> wrote:

> On 2/13/19 2:32 AM, David Gibson wrote:
> > On Tue, Feb 12, 2019 at 08:18:19AM +0100, Cédric Le Goater wrote:  
> >> On 2/12/19 2:11 AM, David Gibson wrote:  
> >>> On Mon, Jan 07, 2019 at 07:39:46PM +0100, Cédric Le Goater wrote:  
> >>>> The interrupt mode is chosen by the CAS negotiation process and
> >>>> activated after a reset to take into account the required changes in
> >>>> the machine. This brings new constraints on how the associated KVM IRQ
> >>>> device is initialized.
> >>>>
> >>>> Currently, each model takes care of the initialization of the KVM
> >>>> device in their realize method but this is not possible anymore as the
> >>>> initialization needs to be done globaly when the interrupt mode is
> >>>> known, i.e. when machine is reseted. It also means that we need a way
> >>>> to delete a KVM device when another mode is chosen.
> >>>>
> >>>> Also, to support migration, the QEMU objects holding the state to
> >>>> transfer should always be available but not necessarily activated.
> >>>>
> >>>> The overall approach of this proposal is to initialize both interrupt
> >>>> mode at the QEMU level and keep the IRQ number space in sync to allow
> >>>> switching from one mode to another. For the KVM side of things, the
> >>>> whole initialization of the KVM device, sources and presenters, is
> >>>> grouped in a single routine. The XICS and XIVE sPAPR IRQ reset
> >>>> handlers are modified accordingly to handle the init and the delete
> >>>> sequences of the KVM device.
> >>>>
> >>>> As KVM is now initialized at reset, we loose the possiblity to
> >>>> fallback to the QEMU emulated mode in case of failure and failures
> >>>> become fatal to the machine.
> >>>>
> >>>> Signed-off-by: Cédric Le Goater <clg@kaod.org>
> >>>> ---
> >>>>  hw/intc/spapr_xive.c     |  8 +---
> >>>>  hw/intc/spapr_xive_kvm.c | 27 ++++++++++++++
> >>>>  hw/intc/xics_kvm.c       | 25 +++++++++++++
> >>>>  hw/intc/xive.c           |  4 --
> >>>>  hw/ppc/spapr_irq.c       | 79 ++++++++++++++++++++++++++++------------
> >>>>  5 files changed, 109 insertions(+), 34 deletions(-)
> >>>>
> >>>> diff --git a/hw/intc/spapr_xive.c b/hw/intc/spapr_xive.c
> >>>> index 21f3c1ef0901..0661aca35900 100644
> >>>> --- a/hw/intc/spapr_xive.c
> >>>> +++ b/hw/intc/spapr_xive.c
> >>>> @@ -330,13 +330,7 @@ static void spapr_xive_realize(DeviceState *dev, Error **errp)
> >>>>      xive->eat = g_new0(XiveEAS, xive->nr_irqs);
> >>>>      xive->endt = g_new0(XiveEND, xive->nr_ends);
> >>>>  
> >>>> -    if (kvmppc_xive_enabled()) {
> >>>> -        kvmppc_xive_connect(xive, &local_err);
> >>>> -        if (local_err) {
> >>>> -            error_propagate(errp, local_err);
> >>>> -            return;
> >>>> -        }
> >>>> -    } else {
> >>>> +    if (!kvmppc_xive_enabled()) {
> >>>>          /* TIMA initialization */
> >>>>          memory_region_init_io(&xive->tm_mmio, OBJECT(xive), &xive_tm_ops, xive,
> >>>>                                "xive.tima", 4ull << TM_SHIFT);
> >>>> diff --git a/hw/intc/spapr_xive_kvm.c b/hw/intc/spapr_xive_kvm.c
> >>>> index d35814c1992e..3ebc947f2be7 100644
> >>>> --- a/hw/intc/spapr_xive_kvm.c
> >>>> +++ b/hw/intc/spapr_xive_kvm.c
> >>>> @@ -737,6 +737,15 @@ void kvmppc_xive_connect(sPAPRXive *xive, Error **errp)
> >>>>      Error *local_err = NULL;
> >>>>      size_t esb_len;
> >>>>      size_t tima_len;
> >>>> +    CPUState *cs;
> >>>> +
> >>>> +    /*
> >>>> +     * The KVM XIVE device already in use. This is the case when
> >>>> +     * rebooting XIVE -> XIVE  
> >>>
> >>> Can this case actually occur?  Further down you appear to
> >>> unconditionally destroy both KVM devices at reset time.  
> >>
> >> I guess you are right. I will check.
> >>  
> >>>> +     */
> >>>> +    if (xive->fd != -1) {
> >>>> +        return;
> >>>> +    }
> >>>>  
> >>>>      if (!kvm_enabled() || !kvmppc_has_cap_xive()) {
> >>>>          error_setg(errp, "IRQ_XIVE capability must be present for KVM");
> >>>> @@ -800,6 +809,24 @@ void kvmppc_xive_connect(sPAPRXive *xive, Error **errp)
> >>>>      xive->change = qemu_add_vm_change_state_handler(
> >>>>          kvmppc_xive_change_state_handler, xive);
> >>>>  
> >>>> +    /* Connect the presenters to the initial VCPUs of the machine */
> >>>> +    CPU_FOREACH(cs) {
> >>>> +        PowerPCCPU *cpu = POWERPC_CPU(cs);
> >>>> +
> >>>> +        kvmppc_xive_cpu_connect(cpu->tctx, &local_err);
> >>>> +        if (local_err) {
> >>>> +            error_propagate(errp, local_err);
> >>>> +            return;
> >>>> +        }
> >>>> +    }
> >>>> +
> >>>> +    /* Update the KVM sources */
> >>>> +    kvmppc_xive_source_reset(xsrc, &local_err);
> >>>> +    if (local_err) {
> >>>> +            error_propagate(errp, local_err);
> >>>> +            return;
> >>>> +    }
> >>>> +
> >>>>      kvm_kernel_irqchip = true;
> >>>>      kvm_msi_via_irqfd_allowed = true;
> >>>>      kvm_gsi_direct_mapping = true;
> >>>> diff --git a/hw/intc/xics_kvm.c b/hw/intc/xics_kvm.c
> >>>> index 1d21ff217b82..bfc35d71df7f 100644
> >>>> --- a/hw/intc/xics_kvm.c
> >>>> +++ b/hw/intc/xics_kvm.c
> >>>> @@ -448,6 +448,16 @@ static void rtas_dummy(PowerPCCPU *cpu, sPAPRMachineState *spapr,
> >>>>  int xics_kvm_init(sPAPRMachineState *spapr, Error **errp)
> >>>>  {
> >>>>      int rc;
> >>>> +    CPUState *cs;
> >>>> +    Error *local_err = NULL;
> >>>> +
> >>>> +    /*
> >>>> +     * The KVM XICS device already in use. This is the case when
> >>>> +     * rebooting XICS -> XICS
> >>>> +     */
> >>>> +    if (kernel_xics_fd != -1) {
> >>>> +        return 0;
> >>>> +    }
> >>>>  
> >>>>      if (!kvm_enabled() || !kvm_check_extension(kvm_state, KVM_CAP_IRQ_XICS)) {
> >>>>          error_setg(errp,
> >>>> @@ -496,6 +506,21 @@ int xics_kvm_init(sPAPRMachineState *spapr, Error **errp)
> >>>>      kvm_msi_via_irqfd_allowed = true;
> >>>>      kvm_gsi_direct_mapping = true;
> >>>>  
> >>>> +    /* Connect the presenters to the initial VCPUs of the machine */
> >>>> +    CPU_FOREACH(cs) {
> >>>> +        PowerPCCPU *cpu = POWERPC_CPU(cs);
> >>>> +
> >>>> +        icp_kvm_connect(cpu->icp, &local_err);
> >>>> +        if (local_err) {
> >>>> +            error_propagate(errp, local_err);
> >>>> +            goto fail;
> >>>> +        }
> >>>> +        icp_set_kvm_state(cpu->icp, 1);
> >>>> +    }
> >>>> +
> >>>> +    /* Update the KVM sources */
> >>>> +    ics_set_kvm_state(ICS_KVM(spapr->ics), 1);
> >>>> +
> >>>>      return 0;
> >>>>  
> >>>>  fail:
> >>>> diff --git a/hw/intc/xive.c b/hw/intc/xive.c
> >>>> index c5c2fbc3f8bc..c166eab5b210 100644
> >>>> --- a/hw/intc/xive.c
> >>>> +++ b/hw/intc/xive.c
> >>>> @@ -932,10 +932,6 @@ static void xive_source_reset(void *dev)
> >>>>  
> >>>>      /* PQs are initialized to 0b01 (Q=1) which corresponds to "ints off" */
> >>>>      memset(xsrc->status, XIVE_ESB_OFF, xsrc->nr_irqs);
> >>>> -
> >>>> -    if (kvmppc_xive_enabled()) {
> >>>> -        kvmppc_xive_source_reset(xsrc, &error_fatal);
> >>>> -    }
> >>>>  }
> >>>>  
> >>>>  static void xive_source_realize(DeviceState *dev, Error **errp)
> >>>> diff --git a/hw/ppc/spapr_irq.c b/hw/ppc/spapr_irq.c
> >>>> index ba27d9d8e972..5592eec3787b 100644
> >>>> --- a/hw/ppc/spapr_irq.c
> >>>> +++ b/hw/ppc/spapr_irq.c
> >>>> @@ -98,20 +98,14 @@ static void spapr_irq_init_xics(sPAPRMachineState *spapr, Error **errp)
> >>>>      int nr_irqs = spapr->irq->nr_irqs;
> >>>>      Error *local_err = NULL;
> >>>>  
> >>>> -    if (kvm_enabled()) {
> >>>> -        if (machine_kernel_irqchip_allowed(machine) &&
> >>>> -            !xics_kvm_init(spapr, &local_err)) {
> >>>> -            spapr->icp_type = TYPE_KVM_ICP;
> >>>> -            spapr->ics = spapr_ics_create(spapr, TYPE_ICS_KVM, nr_irqs,
> >>>> -                                          &local_err);
> >>>> -        }
> >>>> -        if (machine_kernel_irqchip_required(machine) && !spapr->ics) {
> >>>> -            error_prepend(&local_err,
> >>>> -                          "kernel_irqchip requested but unavailable: ");
> >>>> -            goto error;  
> >>>
> >>> I don't see anything that replaces the irqchip_required logic, which
> >>> doesn't seem right.  
> >>
> >> Yes. We do loose the ability to fall back to the emulated device in case
> >> of failure. It is not impossible to do but it will require more changes
> >> to check what are the KVM capabilities before starting the machine.  
> > 
> > Uh... it seems more like it's the other way around.  We'll always fall
> > back to emulated, even if we've explicitly said on the command line
> > that we don't want that.  
> 
> Ah yes. The init function might be also broken. 
> 
> XICS mode is a bit more difficult to handle than XIVE because we have 
> different object type for the KVM device and the QEMU emulated device, 

This is indeed a bit unfortunate, but I think there's still room for
improvement. Let's look at the base classes:

struct ICPStateClass {
    DeviceClass parent_class;

    DeviceRealize parent_realize;
    DeviceReset parent_reset;

    void (*pre_save)(ICPState *icp);
    int (*post_load)(ICPState *icp, int version_id);
    void (*synchronize_state)(ICPState *icp);
};

struct ICSStateClass {
    DeviceClass parent_class;

    DeviceRealize parent_realize;
    DeviceReset parent_reset;

    void (*pre_save)(ICSState *s);
    int (*post_load)(ICSState *s, int version_id);
    void (*reject)(ICSState *s, uint32_t irq);
    void (*resend)(ICSState *s);
    void (*eoi)(ICSState *s, uint32_t irq);
    void (*synchronize_state)(ICSState *s);
};

The pre_save and post_load callbacks are only used with
the KVM device. They could be explicitly called from
the corresponding VMStateDescription callbacks with a
kvm_enabled() && kvm_irqchip_in_kernel() check.

The same goes for the synchronize_state callbacks, which are only
needed for 'info pic'.

The reject, resend and eoi callbacks are only called by code that
belongs to the QEMU emulated device: either from the RTAS/hypercalls
or from the machine code, with explicit checks like:

static void spapr_irq_set_irq_xics(void *opaque, int srcno, int val)
{
    sPAPRMachineState *spapr = opaque;
    MachineState *machine = MACHINE(opaque);

    if (kvm_enabled() && machine_kernel_irqchip_allowed(machine)) {
        ics_kvm_set_irq(spapr->ics, srcno, val);
    } else {
        ics_simple_set_irq(spapr->ics, srcno, val);
    }
}

or

static int spapr_irq_post_load_xics(sPAPRMachineState *spapr, int version_id)
{
    if (!object_dynamic_cast(OBJECT(spapr->ics), TYPE_ICS_KVM)) {
        CPUState *cs;
        CPU_FOREACH(cs) {
            PowerPCCPU *cpu = POWERPC_CPU(cs);
            icp_resend(spapr_cpu_state(cpu)->icp);
        }
    }
    return 0;
}

Unless I'm missing something, the reject, resend and eoi callbacks could
simply be removed. This would allow us to unify KVM and QEMU emulation in
the same ICP and ICS object types.

If this makes sense to you, I can have a look (already started actually ;-)

> and with the 'dual' mode, we activate the device at CAS reset time.
> 
> Failures being handled at reset time, should we keep the same logic and  
> abort the machine at reset if the kernel irqchip is required ? 
> 

If the user passed ic-mode=dual,kernel-irqchip=on, we should at least make
sure KVM supports both XICS and XIVE devices during machine init. Then,
if something goes wrong with KVM during reset, it seems OK to abort.

If the user didn't pass kernel-irqchip, i.e., kernel_irqchip_allowed is true
and kernel_irqchip_required is false, the current behavior for XICS is
to try KVM first and fall back to QEMU emulation. I guess it could be the
same for XIVE.

> But we won't be able to fall back on the QEMU emulated device if KVM 
> XICS fails and if the kernel irqchip is only allowed. It should work for 
> XIVE though.
> 
> Thanks,
> 
> C.
> 
> 
> >> Nevertheless, any failure in reset when setting the KVM backend will
> >> result in machine abort.
> >>
> >> C.       
> >>  
> >>>  
> >>>> +    if (kvm_enabled() && machine_kernel_irqchip_allowed(machine)) {
> >>>> +        spapr->icp_type = TYPE_KVM_ICP;
> >>>> +        spapr->ics = spapr_ics_create(spapr, TYPE_ICS_KVM, nr_irqs,
> >>>> +                                      &local_err);
> >>>> +        if (local_err) {
> >>>> +            error_propagate(errp, local_err);
> >>>> +            return;
> >>>>          }
> >>>> -        error_free(local_err);
> >>>> -        local_err = NULL;
> >>>>      }
> >>>>  
> >>>>      if (!spapr->ics) {
> >>>> @@ -119,10 +113,11 @@ static void spapr_irq_init_xics(sPAPRMachineState *spapr, Error **errp)
> >>>>          spapr->icp_type = TYPE_ICP;
> >>>>          spapr->ics = spapr_ics_create(spapr, TYPE_ICS_SIMPLE, nr_irqs,
> >>>>                                        &local_err);
> >>>> +        if (local_err) {
> >>>> +            error_propagate(errp, local_err);
> >>>> +            return;
> >>>> +        }
> >>>>      }
> >>>> -
> >>>> -error:
> >>>> -    error_propagate(errp, local_err);
> >>>>  }
> >>>>  
> >>>>  #define ICS_IRQ_FREE(ics, srcno)   \
> >>>> @@ -233,7 +228,17 @@ static void spapr_irq_set_irq_xics(void *opaque, int srcno, int val)
> >>>>  
> >>>>  static void spapr_irq_reset_xics(sPAPRMachineState *spapr, Error **errp)
> >>>>  {
> >>>> -    /* TODO: create the KVM XICS device */
> >>>> +    MachineState *machine = MACHINE(spapr);
> >>>> +    Error *local_err = NULL;
> >>>> +
> >>>> +    if (kvm_enabled() && machine_kernel_irqchip_allowed(machine)) {
> >>>> +        xics_kvm_init(spapr, &local_err);
> >>>> +        if (local_err) {
> >>>> +            error_propagate(errp, local_err);
> >>>> +            error_prepend(errp, "KVM XICS connect failed: ");
> >>>> +            return;
> >>>> +        }
> >>>> +    }
> >>>>  }
> >>>>  
> >>>>  #define SPAPR_IRQ_XICS_NR_IRQS     0x1000
> >>>> @@ -393,6 +398,7 @@ static int spapr_irq_post_load_xive(sPAPRMachineState *spapr, int version_id)
> >>>>  static void spapr_irq_reset_xive(sPAPRMachineState *spapr, Error **errp)
> >>>>  {
> >>>>      CPUState *cs;
> >>>> +    Error *local_err = NULL;
> >>>>  
> >>>>      CPU_FOREACH(cs) {
> >>>>          PowerPCCPU *cpu = POWERPC_CPU(cs);
> >>>> @@ -401,6 +407,15 @@ static void spapr_irq_reset_xive(sPAPRMachineState *spapr, Error **errp)
> >>>>          spapr_xive_set_tctx_os_cam(cpu->tctx);
> >>>>      }
> >>>>  
> >>>> +    if (kvmppc_xive_enabled()) {
> >>>> +        kvmppc_xive_connect(spapr->xive, &local_err);
> >>>> +        if (local_err) {
> >>>> +            error_propagate(errp, local_err);
> >>>> +            error_prepend(errp, "KVM XIVE connect failed: ");
> >>>> +            return;
> >>>> +        }
> >>>> +    }
> >>>> +
> >>>>      /* Activate the XIVE MMIOs */
> >>>>      spapr_xive_mmio_set_enabled(spapr->xive, true);
> >>>>  }
> >>>> @@ -462,14 +477,8 @@ static sPAPRIrq *spapr_irq_current(sPAPRMachineState *spapr)
> >>>>  
> >>>>  static void spapr_irq_init_dual(sPAPRMachineState *spapr, Error **errp)
> >>>>  {
> >>>> -    MachineState *machine = MACHINE(spapr);
> >>>>      Error *local_err = NULL;
> >>>>  
> >>>> -    if (kvm_enabled() && machine_kernel_irqchip_allowed(machine)) {
> >>>> -        error_setg(errp, "No KVM support for the 'dual' machine");
> >>>> -        return;
> >>>> -    }
> >>>> -
> >>>>      spapr_irq_xics.init(spapr, &local_err);
> >>>>      if (local_err) {
> >>>>          error_propagate(errp, local_err);
> >>>> @@ -568,11 +577,16 @@ static void spapr_irq_cpu_intc_create_dual(sPAPRMachineState *spapr,
> >>>>  
> >>>>  static int spapr_irq_post_load_dual(sPAPRMachineState *spapr, int version_id)
> >>>>  {
> >>>> +    MachineState *machine = MACHINE(spapr);
> >>>> +
> >>>>      /*
> >>>>       * Force a reset of the XIVE backend after migration. The machine
> >>>>       * defaults to XICS at startup.
> >>>>       */
> >>>>      if (spapr_ovec_test(spapr->ov5_cas, OV5_XIVE_EXPLOIT)) {
> >>>> +        if (kvm_enabled() && machine_kernel_irqchip_allowed(machine)) {
> >>>> +            xics_kvm_disconnect(spapr, &error_fatal);
> >>>> +        }
> >>>>          spapr_irq_xive.reset(spapr, &error_fatal);
> >>>>      }
> >>>>  
> >>>> @@ -581,12 +595,31 @@ static int spapr_irq_post_load_dual(sPAPRMachineState *spapr, int version_id)
> >>>>  
> >>>>  static void spapr_irq_reset_dual(sPAPRMachineState *spapr, Error **errp)
> >>>>  {
> >>>> +    MachineState *machine = MACHINE(spapr);
> >>>> +    Error *local_err = NULL;
> >>>> +
> >>>>      /*
> >>>>       * Deactivate the XIVE MMIOs. The XIVE backend will reenable them
> >>>>       * if selected.
> >>>>       */
> >>>>      spapr_xive_mmio_set_enabled(spapr->xive, false);
> >>>>  
> >>>> +    /* Destroy all KVM devices */
> >>>> +    if (kvm_enabled() && machine_kernel_irqchip_allowed(machine)) {
> >>>> +        xics_kvm_disconnect(spapr, &local_err);
> >>>> +        if (local_err) {
> >>>> +            error_propagate(errp, local_err);
> >>>> +            error_prepend(errp, "KVM XICS disconnect failed: ");
> >>>> +            return;
> >>>> +        }
> >>>> +        kvmppc_xive_disconnect(spapr->xive, &local_err);
> >>>> +        if (local_err) {
> >>>> +            error_propagate(errp, local_err);
> >>>> +            error_prepend(errp, "KVM XIVE disconnect failed: ");
> >>>> +            return;
> >>>> +        }
> >>>> +    }
> >>>> +
> >>>>      spapr_irq_current(spapr)->reset(spapr, errp);
> >>>>  }
> >>>>    
> >>>  
> >>  
> >   
> 
> 

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [Qemu-devel] [Qemu-ppc] [PATCH 12/13] spapr/xics: ignore the lower 4K in the IRQ number space
  2019-02-13  8:03         ` Cédric Le Goater
@ 2019-02-13 11:27           ` Greg Kurz
  2019-02-13 12:11             ` Greg Kurz
  0 siblings, 1 reply; 43+ messages in thread
From: Greg Kurz @ 2019-02-13 11:27 UTC (permalink / raw)
  To: Cédric Le Goater; +Cc: David Gibson, qemu-ppc, qemu-devel

On Wed, 13 Feb 2019 09:03:33 +0100
Cédric Le Goater <clg@kaod.org> wrote:

> On 2/13/19 2:33 AM, David Gibson wrote:
> > On Tue, Feb 12, 2019 at 08:05:53AM +0100, Cédric Le Goater wrote:  
> >> On 2/12/19 2:06 AM, David Gibson wrote:  
> >>> On Mon, Jan 07, 2019 at 07:39:45PM +0100, Cédric Le Goater wrote:  
> >>>> The IRQ number space of the XIVE and XICS interrupt mode are aligned
> >>>> when using the dual interrupt mode for the machine. This means that
> >>>> the ICS offset is set to zero in QEMU and that the KVM XICS device
> >>>> should be informed of this new value. Unfortunately, there is no way
> >>>> to do so and KVM still maintains the XICS_IRQ_BASE (0x1000) offset.
> >>>>
> >>>> Ignore the lower 4K which are not used under the XICS interrupt
> >>>> mode. These IRQ numbers are only claimed by XIVE for the CPU IPIs.
> >>>>
> >>>> Signed-off-by: Cédric Le Goater <clg@kaod.org>
> >>>> ---
> >>>>  hw/intc/xics_kvm.c | 18 ++++++++++++++++++
> >>>>  1 file changed, 18 insertions(+)
> >>>>
> >>>> diff --git a/hw/intc/xics_kvm.c b/hw/intc/xics_kvm.c
> >>>> index 651bbfdf6966..1d21ff217b82 100644
> >>>> --- a/hw/intc/xics_kvm.c
> >>>> +++ b/hw/intc/xics_kvm.c
> >>>> @@ -238,6 +238,15 @@ static void ics_get_kvm_state(ICSState *ics)
> >>>>      for (i = 0; i < ics->nr_irqs; i++) {
> >>>>          ICSIRQState *irq = &ics->irqs[i];
> >>>>  
> >>>> +        /*
> >>>> +         * The KVM XICS device considers that the IRQ numbers should
> >>>> +         * start at XICS_IRQ_BASE (0x1000). Ignore the lower 4K
> >>>> +         * numbers (only claimed by XIVE for the CPU IPIs).
> >>>> +         */
> >>>> +        if (i + ics->offset < XICS_IRQ_BASE) {
> >>>> +            continue;
> >>>> +        }
> >>>> +  
> >>>
> >>> This seems bogus to me.  The guest-visible irq numbers need to line up
> >>> between xics and xive mode, yes, but that doesn't mean we need to keep
> >>> around a great big array of unused array of ICS irq states, even in
> >>> TCG mode.  
> >>
> >> This is because the qirqs[] array is under the machine and shared between 
> >> both interrupt modes, xics and xive.  
> > 
> > I don't see how that follows.  ICSIRQState is indexed in terms of the
> > ICS source number, not the global irq number, so I don't see why it
> > has to match up with the qirq array.  
> 
> The root cause is the use of spapr->irq->nr_irqs to initialize the ICS 
> and sPAPRXive object. In case of the 'dual' backend, it covers the full 
> XIVE IRQ number space (0x2000 today) but XICS only needs 0x1000.
> 
> I think we can fix the offset issue by using the appropriate nr_irqs 
> which should be for the XICS backend : spapr->irq->nr_irqs - ics->offset
> 

Since the root cause is that the value of spapr->irq->nr_irqs should
be different in XIVE and XICS, what about fixing it during reset ?

Something like:

static void spapr_irq_reset_dual(sPAPRMachineState *spapr, Error **errp)
{
    [...]

    spapr->irq->nr_irqs = spapr_irq_current(spapr)->nr_irqs;

    spapr_irq_current(spapr)->reset(spapr, errp);
}

> 
> I keep in mind the XIVE support for nested guests and I think we will
> need to extend the IRQ number space in L1 and have the L2 use a portion
> of it (using an offset).     
> 
> C.
>  
> >>>  
> >>>>          kvm_device_access(kernel_xics_fd, KVM_DEV_XICS_GRP_SOURCES,
> >>>>                            i + ics->offset, &state, false, &error_fatal);
> >>>>  
> >>>> @@ -303,6 +312,15 @@ static int ics_set_kvm_state(ICSState *ics, int version_id)
> >>>>          ICSIRQState *irq = &ics->irqs[i];
> >>>>          int ret;
> >>>>  
> >>>> +        /*
> >>>> +         * The KVM XICS device considers that the IRQ numbers should
> >>>> +         * start at XICS_IRQ_BASE (0x1000). Ignore the lower 4K
> >>>> +         * numbers (only claimed by XIVE for the CPU IPIs).
> >>>> +         */
> >>>> +        if (i + ics->offset < XICS_IRQ_BASE) {
> >>>> +            continue;
> >>>> +        }
> >>>> +
> >>>>          state = irq->server;
> >>>>          state |= (uint64_t)(irq->saved_priority & KVM_XICS_PRIORITY_MASK)
> >>>>              << KVM_XICS_PRIORITY_SHIFT;  
> >>>  
> >>  
> >   
> 
> 

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [Qemu-devel] [Qemu-ppc] [PATCH 12/13] spapr/xics: ignore the lower 4K in the IRQ number space
  2019-02-13 11:27           ` [Qemu-devel] [Qemu-ppc] " Greg Kurz
@ 2019-02-13 12:11             ` Greg Kurz
  0 siblings, 0 replies; 43+ messages in thread
From: Greg Kurz @ 2019-02-13 12:11 UTC (permalink / raw)
  To: Cédric Le Goater; +Cc: qemu-ppc, qemu-devel, David Gibson

On Wed, 13 Feb 2019 12:27:13 +0100
Greg Kurz <groug@kaod.org> wrote:

> On Wed, 13 Feb 2019 09:03:33 +0100
> Cédric Le Goater <clg@kaod.org> wrote:
> 
> > On 2/13/19 2:33 AM, David Gibson wrote:  
> > > On Tue, Feb 12, 2019 at 08:05:53AM +0100, Cédric Le Goater wrote:    
> > >> On 2/12/19 2:06 AM, David Gibson wrote:    
> > >>> On Mon, Jan 07, 2019 at 07:39:45PM +0100, Cédric Le Goater wrote:    
> > >>>> The IRQ number space of the XIVE and XICS interrupt mode are aligned
> > >>>> when using the dual interrupt mode for the machine. This means that
> > >>>> the ICS offset is set to zero in QEMU and that the KVM XICS device
> > >>>> should be informed of this new value. Unfortunately, there is no way
> > >>>> to do so and KVM still maintains the XICS_IRQ_BASE (0x1000) offset.
> > >>>>
> > >>>> Ignore the lower 4K which are not used under the XICS interrupt
> > >>>> mode. These IRQ numbers are only claimed by XIVE for the CPU IPIs.
> > >>>>
> > >>>> Signed-off-by: Cédric Le Goater <clg@kaod.org>
> > >>>> ---
> > >>>>  hw/intc/xics_kvm.c | 18 ++++++++++++++++++
> > >>>>  1 file changed, 18 insertions(+)
> > >>>>
> > >>>> diff --git a/hw/intc/xics_kvm.c b/hw/intc/xics_kvm.c
> > >>>> index 651bbfdf6966..1d21ff217b82 100644
> > >>>> --- a/hw/intc/xics_kvm.c
> > >>>> +++ b/hw/intc/xics_kvm.c
> > >>>> @@ -238,6 +238,15 @@ static void ics_get_kvm_state(ICSState *ics)
> > >>>>      for (i = 0; i < ics->nr_irqs; i++) {
> > >>>>          ICSIRQState *irq = &ics->irqs[i];
> > >>>>  
> > >>>> +        /*
> > >>>> +         * The KVM XICS device considers that the IRQ numbers should
> > >>>> +         * start at XICS_IRQ_BASE (0x1000). Ignore the lower 4K
> > >>>> +         * numbers (only claimed by XIVE for the CPU IPIs).
> > >>>> +         */
> > >>>> +        if (i + ics->offset < XICS_IRQ_BASE) {
> > >>>> +            continue;
> > >>>> +        }
> > >>>> +    
> > >>>
> > >>> This seems bogus to me.  The guest-visible irq numbers need to line up
> > >>> between xics and xive mode, yes, but that doesn't mean we need to keep
> > >>> around a great big array of unused array of ICS irq states, even in
> > >>> TCG mode.    
> > >>
> > >> This is because the qirqs[] array is under the machine and shared between 
> > >> both interrupt modes, xics and xive.    
> > > 
> > > I don't see how that follows.  ICSIRQState is indexed in terms of the
> > > ICS source number, not the global irq number, so I don't see why it
> > > has to match up with the qirq array.    
> > 
> > The root cause is the use of spapr->irq->nr_irqs to initialize the ICS 
> > and sPAPRXive object. In case of the 'dual' backend, it covers the full 
> > XIVE IRQ number space (0x2000 today) but XICS only needs 0x1000.
> > 
> > I think we can fix the offset issue by using the appropriate nr_irqs 
> > which should be for the XICS backend : spapr->irq->nr_irqs - ics->offset
> >   
> 
> Since the root cause is that the value of spapr->irq->nr_irqs should
> be different in XIVE and XICS, what about fixing it during reset ?
> 

Nah this doesn't make sense :)

But if XICS always needs 0x1000, why not just change spapr_irq_init_xics()
to use SPAPR_IRQ_XICS_NR_IRQS instead of spapr->irq->nr_irqs ?

> Something like:
> 
> static void spapr_irq_reset_dual(sPAPRMachineState *spapr, Error **errp)
> {
>     [...]
> 
>     spapr->irq->nr_irqs = spapr_irq_current(spapr)->nr_irqs;
> 
>     spapr_irq_current(spapr)->reset(spapr, errp);
> }
> 
> > 
> > I keep in mind the XIVE support for nested guests and I think we will
> > need to extend the IRQ number space in L1 and have the L2 use a portion
> > of it (using an offset).     
> > 
> > C.
> >    
> > >>>    
> > >>>>          kvm_device_access(kernel_xics_fd, KVM_DEV_XICS_GRP_SOURCES,
> > >>>>                            i + ics->offset, &state, false, &error_fatal);
> > >>>>  
> > >>>> @@ -303,6 +312,15 @@ static int ics_set_kvm_state(ICSState *ics, int version_id)
> > >>>>          ICSIRQState *irq = &ics->irqs[i];
> > >>>>          int ret;
> > >>>>  
> > >>>> +        /*
> > >>>> +         * The KVM XICS device considers that the IRQ numbers should
> > >>>> +         * start at XICS_IRQ_BASE (0x1000). Ignore the lower 4K
> > >>>> +         * numbers (only claimed by XIVE for the CPU IPIs).
> > >>>> +         */
> > >>>> +        if (i + ics->offset < XICS_IRQ_BASE) {
> > >>>> +            continue;
> > >>>> +        }
> > >>>> +
> > >>>>          state = irq->server;
> > >>>>          state |= (uint64_t)(irq->saved_priority & KVM_XICS_PRIORITY_MASK)
> > >>>>              << KVM_XICS_PRIORITY_SHIFT;    
> > >>>    
> > >>    
> > >     
> > 
> >   
> 
> 

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [Qemu-devel] [PATCH 13/13] spapr: add KVM support to the 'dual' machine
  2019-02-13  8:22         ` Cédric Le Goater
  2019-02-13 10:07           ` [Qemu-devel] [Qemu-ppc] " Greg Kurz
@ 2019-02-14  3:29           ` David Gibson
  1 sibling, 0 replies; 43+ messages in thread
From: David Gibson @ 2019-02-14  3:29 UTC (permalink / raw)
  To: Cédric Le Goater; +Cc: Benjamin Herrenschmidt, qemu-ppc, qemu-devel

[-- Attachment #1: Type: text/plain, Size: 9717 bytes --]

On Wed, Feb 13, 2019 at 09:22:46AM +0100, Cédric Le Goater wrote:
> On 2/13/19 2:32 AM, David Gibson wrote:
> > On Tue, Feb 12, 2019 at 08:18:19AM +0100, Cédric Le Goater wrote:
> >> On 2/12/19 2:11 AM, David Gibson wrote:
> >>> On Mon, Jan 07, 2019 at 07:39:46PM +0100, Cédric Le Goater wrote:
> >>>> The interrupt mode is chosen by the CAS negotiation process and
> >>>> activated after a reset to take into account the required changes in
> >>>> the machine. This brings new constraints on how the associated KVM IRQ
> >>>> device is initialized.
> >>>>
> >>>> Currently, each model takes care of the initialization of the KVM
> >>>> device in their realize method but this is not possible anymore as the
> >>>> initialization needs to be done globally when the interrupt mode is
> >>>> known, i.e. when the machine is reset. It also means that we need a way
> >>>> to delete a KVM device when another mode is chosen.
> >>>>
> >>>> Also, to support migration, the QEMU objects holding the state to
> >>>> transfer should always be available but not necessarily activated.
> >>>>
> >>>> The overall approach of this proposal is to initialize both interrupt
> >>>> mode at the QEMU level and keep the IRQ number space in sync to allow
> >>>> switching from one mode to another. For the KVM side of things, the
> >>>> whole initialization of the KVM device, sources and presenters, is
> >>>> grouped in a single routine. The XICS and XIVE sPAPR IRQ reset
> >>>> handlers are modified accordingly to handle the init and the delete
> >>>> sequences of the KVM device.
> >>>>
> >>>> As KVM is now initialized at reset, we lose the possibility to
> >>>> fall back to the QEMU emulated mode in case of failure and failures
> >>>> become fatal to the machine.
> >>>>
> >>>> Signed-off-by: Cédric Le Goater <clg@kaod.org>
> >>>> ---
> >>>>  hw/intc/spapr_xive.c     |  8 +---
> >>>>  hw/intc/spapr_xive_kvm.c | 27 ++++++++++++++
> >>>>  hw/intc/xics_kvm.c       | 25 +++++++++++++
> >>>>  hw/intc/xive.c           |  4 --
> >>>>  hw/ppc/spapr_irq.c       | 79 ++++++++++++++++++++++++++++------------
> >>>>  5 files changed, 109 insertions(+), 34 deletions(-)
> >>>>
> >>>> diff --git a/hw/intc/spapr_xive.c b/hw/intc/spapr_xive.c
> >>>> index 21f3c1ef0901..0661aca35900 100644
> >>>> --- a/hw/intc/spapr_xive.c
> >>>> +++ b/hw/intc/spapr_xive.c
> >>>> @@ -330,13 +330,7 @@ static void spapr_xive_realize(DeviceState *dev, Error **errp)
> >>>>      xive->eat = g_new0(XiveEAS, xive->nr_irqs);
> >>>>      xive->endt = g_new0(XiveEND, xive->nr_ends);
> >>>>  
> >>>> -    if (kvmppc_xive_enabled()) {
> >>>> -        kvmppc_xive_connect(xive, &local_err);
> >>>> -        if (local_err) {
> >>>> -            error_propagate(errp, local_err);
> >>>> -            return;
> >>>> -        }
> >>>> -    } else {
> >>>> +    if (!kvmppc_xive_enabled()) {
> >>>>          /* TIMA initialization */
> >>>>          memory_region_init_io(&xive->tm_mmio, OBJECT(xive), &xive_tm_ops, xive,
> >>>>                                "xive.tima", 4ull << TM_SHIFT);
> >>>> diff --git a/hw/intc/spapr_xive_kvm.c b/hw/intc/spapr_xive_kvm.c
> >>>> index d35814c1992e..3ebc947f2be7 100644
> >>>> --- a/hw/intc/spapr_xive_kvm.c
> >>>> +++ b/hw/intc/spapr_xive_kvm.c
> >>>> @@ -737,6 +737,15 @@ void kvmppc_xive_connect(sPAPRXive *xive, Error **errp)
> >>>>      Error *local_err = NULL;
> >>>>      size_t esb_len;
> >>>>      size_t tima_len;
> >>>> +    CPUState *cs;
> >>>> +
> >>>> +    /*
> >>>> +     * The KVM XIVE device already in use. This is the case when
> >>>> +     * rebooting XIVE -> XIVE
> >>>
> >>> Can this case actually occur?  Further down you appear to
> >>> unconditionally destroy both KVM devices at reset time.
> >>
> >> I guess you are right. I will check.
> >>
> >>>> +     */
> >>>> +    if (xive->fd != -1) {
> >>>> +        return;
> >>>> +    }
> >>>>  
> >>>>      if (!kvm_enabled() || !kvmppc_has_cap_xive()) {
> >>>>          error_setg(errp, "IRQ_XIVE capability must be present for KVM");
> >>>> @@ -800,6 +809,24 @@ void kvmppc_xive_connect(sPAPRXive *xive, Error **errp)
> >>>>      xive->change = qemu_add_vm_change_state_handler(
> >>>>          kvmppc_xive_change_state_handler, xive);
> >>>>  
> >>>> +    /* Connect the presenters to the initial VCPUs of the machine */
> >>>> +    CPU_FOREACH(cs) {
> >>>> +        PowerPCCPU *cpu = POWERPC_CPU(cs);
> >>>> +
> >>>> +        kvmppc_xive_cpu_connect(cpu->tctx, &local_err);
> >>>> +        if (local_err) {
> >>>> +            error_propagate(errp, local_err);
> >>>> +            return;
> >>>> +        }
> >>>> +    }
> >>>> +
> >>>> +    /* Update the KVM sources */
> >>>> +    kvmppc_xive_source_reset(xsrc, &local_err);
> >>>> +    if (local_err) {
> >>>> +            error_propagate(errp, local_err);
> >>>> +            return;
> >>>> +    }
> >>>> +
> >>>>      kvm_kernel_irqchip = true;
> >>>>      kvm_msi_via_irqfd_allowed = true;
> >>>>      kvm_gsi_direct_mapping = true;
> >>>> diff --git a/hw/intc/xics_kvm.c b/hw/intc/xics_kvm.c
> >>>> index 1d21ff217b82..bfc35d71df7f 100644
> >>>> --- a/hw/intc/xics_kvm.c
> >>>> +++ b/hw/intc/xics_kvm.c
> >>>> @@ -448,6 +448,16 @@ static void rtas_dummy(PowerPCCPU *cpu, sPAPRMachineState *spapr,
> >>>>  int xics_kvm_init(sPAPRMachineState *spapr, Error **errp)
> >>>>  {
> >>>>      int rc;
> >>>> +    CPUState *cs;
> >>>> +    Error *local_err = NULL;
> >>>> +
> >>>> +    /*
> >>>> +     * The KVM XICS device already in use. This is the case when
> >>>> +     * rebooting XICS -> XICS
> >>>> +     */
> >>>> +    if (kernel_xics_fd != -1) {
> >>>> +        return 0;
> >>>> +    }
> >>>>  
> >>>>      if (!kvm_enabled() || !kvm_check_extension(kvm_state, KVM_CAP_IRQ_XICS)) {
> >>>>          error_setg(errp,
> >>>> @@ -496,6 +506,21 @@ int xics_kvm_init(sPAPRMachineState *spapr, Error **errp)
> >>>>      kvm_msi_via_irqfd_allowed = true;
> >>>>      kvm_gsi_direct_mapping = true;
> >>>>  
> >>>> +    /* Connect the presenters to the initial VCPUs of the machine */
> >>>> +    CPU_FOREACH(cs) {
> >>>> +        PowerPCCPU *cpu = POWERPC_CPU(cs);
> >>>> +
> >>>> +        icp_kvm_connect(cpu->icp, &local_err);
> >>>> +        if (local_err) {
> >>>> +            error_propagate(errp, local_err);
> >>>> +            goto fail;
> >>>> +        }
> >>>> +        icp_set_kvm_state(cpu->icp, 1);
> >>>> +    }
> >>>> +
> >>>> +    /* Update the KVM sources */
> >>>> +    ics_set_kvm_state(ICS_KVM(spapr->ics), 1);
> >>>> +
> >>>>      return 0;
> >>>>  
> >>>>  fail:
> >>>> diff --git a/hw/intc/xive.c b/hw/intc/xive.c
> >>>> index c5c2fbc3f8bc..c166eab5b210 100644
> >>>> --- a/hw/intc/xive.c
> >>>> +++ b/hw/intc/xive.c
> >>>> @@ -932,10 +932,6 @@ static void xive_source_reset(void *dev)
> >>>>  
> >>>>      /* PQs are initialized to 0b01 (Q=1) which corresponds to "ints off" */
> >>>>      memset(xsrc->status, XIVE_ESB_OFF, xsrc->nr_irqs);
> >>>> -
> >>>> -    if (kvmppc_xive_enabled()) {
> >>>> -        kvmppc_xive_source_reset(xsrc, &error_fatal);
> >>>> -    }
> >>>>  }
> >>>>  
> >>>>  static void xive_source_realize(DeviceState *dev, Error **errp)
> >>>> diff --git a/hw/ppc/spapr_irq.c b/hw/ppc/spapr_irq.c
> >>>> index ba27d9d8e972..5592eec3787b 100644
> >>>> --- a/hw/ppc/spapr_irq.c
> >>>> +++ b/hw/ppc/spapr_irq.c
> >>>> @@ -98,20 +98,14 @@ static void spapr_irq_init_xics(sPAPRMachineState *spapr, Error **errp)
> >>>>      int nr_irqs = spapr->irq->nr_irqs;
> >>>>      Error *local_err = NULL;
> >>>>  
> >>>> -    if (kvm_enabled()) {
> >>>> -        if (machine_kernel_irqchip_allowed(machine) &&
> >>>> -            !xics_kvm_init(spapr, &local_err)) {
> >>>> -            spapr->icp_type = TYPE_KVM_ICP;
> >>>> -            spapr->ics = spapr_ics_create(spapr, TYPE_ICS_KVM, nr_irqs,
> >>>> -                                          &local_err);
> >>>> -        }
> >>>> -        if (machine_kernel_irqchip_required(machine) && !spapr->ics) {
> >>>> -            error_prepend(&local_err,
> >>>> -                          "kernel_irqchip requested but unavailable: ");
> >>>> -            goto error;
> >>>
> >>> I don't see anything that replaces the irqchip_required logic, which
> >>> doesn't seem right.
> >>
> >> Yes. We do lose the ability to fall back to the emulated device in case
> >> of failure. It is not impossible to do but it will require more changes
> >> to check what are the KVM capabilities before starting the machine.
> > 
> > Uh... it seems more like it's the other way around.  We'll always fall
> > back to emulated, even if we've explicitly said on the command line
> > that we don't want that.
> 
> Ah yes. The init function might be also broken. 
> 
> XICS mode is a bit more difficult to handle than XIVE because we have 
> different object type for the KVM device and the QEMU emulated device, 
> and with the 'dual' mode, we activate the device at CAS reset time.

Yeah.. we should probably fix that.

> Failures being handled at reset time, should we keep the same logic and  
> abort the machine at reset if the kernel irqchip is required ? 
> 
> But we won't be able to fall back on the QEMU emulated device if KVM 
> XICS fails and if the kernel irqchip is only allowed. It should work for 
> XIVE though.

That's fine.  If we've said that kernel irqchip is required, we
shouldn't fall back to emulation.

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [Qemu-devel] [Qemu-ppc] [PATCH 13/13] spapr: add KVM support to the 'dual' machine
  2019-02-13 10:07           ` [Qemu-devel] [Qemu-ppc] " Greg Kurz
@ 2019-02-14  3:35             ` David Gibson
  2019-02-14  7:13               ` Cédric Le Goater
  0 siblings, 1 reply; 43+ messages in thread
From: David Gibson @ 2019-02-14  3:35 UTC (permalink / raw)
  To: Greg Kurz; +Cc: Cédric Le Goater, qemu-ppc, qemu-devel

[-- Attachment #1: Type: text/plain, Size: 13102 bytes --]

On Wed, Feb 13, 2019 at 11:07:49AM +0100, Greg Kurz wrote:
> On Wed, 13 Feb 2019 09:22:46 +0100
> Cédric Le Goater <clg@kaod.org> wrote:
> 
> > On 2/13/19 2:32 AM, David Gibson wrote:
> > > On Tue, Feb 12, 2019 at 08:18:19AM +0100, Cédric Le Goater wrote:  
> > >> On 2/12/19 2:11 AM, David Gibson wrote:  
> > >>> On Mon, Jan 07, 2019 at 07:39:46PM +0100, Cédric Le Goater wrote:  
> > >>>> The interrupt mode is chosen by the CAS negotiation process and
> > >>>> activated after a reset to take into account the required changes in
> > >>>> the machine. This brings new constraints on how the associated KVM IRQ
> > >>>> device is initialized.
> > >>>>
> > >>>> Currently, each model takes care of the initialization of the KVM
> > >>>> device in their realize method but this is not possible anymore as the
> > >>>> initialization needs to be done globally when the interrupt mode is
> > >>>> known, i.e. when the machine is reset. It also means that we need a way
> > >>>> to delete a KVM device when another mode is chosen.
> > >>>>
> > >>>> Also, to support migration, the QEMU objects holding the state to
> > >>>> transfer should always be available but not necessarily activated.
> > >>>>
> > >>>> The overall approach of this proposal is to initialize both interrupt
> > >>>> mode at the QEMU level and keep the IRQ number space in sync to allow
> > >>>> switching from one mode to another. For the KVM side of things, the
> > >>>> whole initialization of the KVM device, sources and presenters, is
> > >>>> grouped in a single routine. The XICS and XIVE sPAPR IRQ reset
> > >>>> handlers are modified accordingly to handle the init and the delete
> > >>>> sequences of the KVM device.
> > >>>>
> > >>>> As KVM is now initialized at reset, we lose the possibility to
> > >>>> fall back to the QEMU emulated mode in case of failure and failures
> > >>>> become fatal to the machine.
> > >>>>
> > >>>> Signed-off-by: Cédric Le Goater <clg@kaod.org>
> > >>>> ---
> > >>>>  hw/intc/spapr_xive.c     |  8 +---
> > >>>>  hw/intc/spapr_xive_kvm.c | 27 ++++++++++++++
> > >>>>  hw/intc/xics_kvm.c       | 25 +++++++++++++
> > >>>>  hw/intc/xive.c           |  4 --
> > >>>>  hw/ppc/spapr_irq.c       | 79 ++++++++++++++++++++++++++++------------
> > >>>>  5 files changed, 109 insertions(+), 34 deletions(-)
> > >>>>
> > >>>> diff --git a/hw/intc/spapr_xive.c b/hw/intc/spapr_xive.c
> > >>>> index 21f3c1ef0901..0661aca35900 100644
> > >>>> --- a/hw/intc/spapr_xive.c
> > >>>> +++ b/hw/intc/spapr_xive.c
> > >>>> @@ -330,13 +330,7 @@ static void spapr_xive_realize(DeviceState *dev, Error **errp)
> > >>>>      xive->eat = g_new0(XiveEAS, xive->nr_irqs);
> > >>>>      xive->endt = g_new0(XiveEND, xive->nr_ends);
> > >>>>  
> > >>>> -    if (kvmppc_xive_enabled()) {
> > >>>> -        kvmppc_xive_connect(xive, &local_err);
> > >>>> -        if (local_err) {
> > >>>> -            error_propagate(errp, local_err);
> > >>>> -            return;
> > >>>> -        }
> > >>>> -    } else {
> > >>>> +    if (!kvmppc_xive_enabled()) {
> > >>>>          /* TIMA initialization */
> > >>>>          memory_region_init_io(&xive->tm_mmio, OBJECT(xive), &xive_tm_ops, xive,
> > >>>>                                "xive.tima", 4ull << TM_SHIFT);
> > >>>> diff --git a/hw/intc/spapr_xive_kvm.c b/hw/intc/spapr_xive_kvm.c
> > >>>> index d35814c1992e..3ebc947f2be7 100644
> > >>>> --- a/hw/intc/spapr_xive_kvm.c
> > >>>> +++ b/hw/intc/spapr_xive_kvm.c
> > >>>> @@ -737,6 +737,15 @@ void kvmppc_xive_connect(sPAPRXive *xive, Error **errp)
> > >>>>      Error *local_err = NULL;
> > >>>>      size_t esb_len;
> > >>>>      size_t tima_len;
> > >>>> +    CPUState *cs;
> > >>>> +
> > >>>> +    /*
> > >>>> +     * The KVM XIVE device already in use. This is the case when
> > >>>> +     * rebooting XIVE -> XIVE  
> > >>>
> > >>> Can this case actually occur?  Further down you appear to
> > >>> unconditionally destroy both KVM devices at reset time.  
> > >>
> > >> I guess you are right. I will check.
> > >>  
> > >>>> +     */
> > >>>> +    if (xive->fd != -1) {
> > >>>> +        return;
> > >>>> +    }
> > >>>>  
> > >>>>      if (!kvm_enabled() || !kvmppc_has_cap_xive()) {
> > >>>>          error_setg(errp, "IRQ_XIVE capability must be present for KVM");
> > >>>> @@ -800,6 +809,24 @@ void kvmppc_xive_connect(sPAPRXive *xive, Error **errp)
> > >>>>      xive->change = qemu_add_vm_change_state_handler(
> > >>>>          kvmppc_xive_change_state_handler, xive);
> > >>>>  
> > >>>> +    /* Connect the presenters to the initial VCPUs of the machine */
> > >>>> +    CPU_FOREACH(cs) {
> > >>>> +        PowerPCCPU *cpu = POWERPC_CPU(cs);
> > >>>> +
> > >>>> +        kvmppc_xive_cpu_connect(cpu->tctx, &local_err);
> > >>>> +        if (local_err) {
> > >>>> +            error_propagate(errp, local_err);
> > >>>> +            return;
> > >>>> +        }
> > >>>> +    }
> > >>>> +
> > >>>> +    /* Update the KVM sources */
> > >>>> +    kvmppc_xive_source_reset(xsrc, &local_err);
> > >>>> +    if (local_err) {
> > >>>> +            error_propagate(errp, local_err);
> > >>>> +            return;
> > >>>> +    }
> > >>>> +
> > >>>>      kvm_kernel_irqchip = true;
> > >>>>      kvm_msi_via_irqfd_allowed = true;
> > >>>>      kvm_gsi_direct_mapping = true;
> > >>>> diff --git a/hw/intc/xics_kvm.c b/hw/intc/xics_kvm.c
> > >>>> index 1d21ff217b82..bfc35d71df7f 100644
> > >>>> --- a/hw/intc/xics_kvm.c
> > >>>> +++ b/hw/intc/xics_kvm.c
> > >>>> @@ -448,6 +448,16 @@ static void rtas_dummy(PowerPCCPU *cpu, sPAPRMachineState *spapr,
> > >>>>  int xics_kvm_init(sPAPRMachineState *spapr, Error **errp)
> > >>>>  {
> > >>>>      int rc;
> > >>>> +    CPUState *cs;
> > >>>> +    Error *local_err = NULL;
> > >>>> +
> > >>>> +    /*
> > >>>> +     * The KVM XICS device already in use. This is the case when
> > >>>> +     * rebooting XICS -> XICS
> > >>>> +     */
> > >>>> +    if (kernel_xics_fd != -1) {
> > >>>> +        return 0;
> > >>>> +    }
> > >>>>  
> > >>>>      if (!kvm_enabled() || !kvm_check_extension(kvm_state, KVM_CAP_IRQ_XICS)) {
> > >>>>          error_setg(errp,
> > >>>> @@ -496,6 +506,21 @@ int xics_kvm_init(sPAPRMachineState *spapr, Error **errp)
> > >>>>      kvm_msi_via_irqfd_allowed = true;
> > >>>>      kvm_gsi_direct_mapping = true;
> > >>>>  
> > >>>> +    /* Connect the presenters to the initial VCPUs of the machine */
> > >>>> +    CPU_FOREACH(cs) {
> > >>>> +        PowerPCCPU *cpu = POWERPC_CPU(cs);
> > >>>> +
> > >>>> +        icp_kvm_connect(cpu->icp, &local_err);
> > >>>> +        if (local_err) {
> > >>>> +            error_propagate(errp, local_err);
> > >>>> +            goto fail;
> > >>>> +        }
> > >>>> +        icp_set_kvm_state(cpu->icp, 1);
> > >>>> +    }
> > >>>> +
> > >>>> +    /* Update the KVM sources */
> > >>>> +    ics_set_kvm_state(ICS_KVM(spapr->ics), 1);
> > >>>> +
> > >>>>      return 0;
> > >>>>  
> > >>>>  fail:
> > >>>> diff --git a/hw/intc/xive.c b/hw/intc/xive.c
> > >>>> index c5c2fbc3f8bc..c166eab5b210 100644
> > >>>> --- a/hw/intc/xive.c
> > >>>> +++ b/hw/intc/xive.c
> > >>>> @@ -932,10 +932,6 @@ static void xive_source_reset(void *dev)
> > >>>>  
> > >>>>      /* PQs are initialized to 0b01 (Q=1) which corresponds to "ints off" */
> > >>>>      memset(xsrc->status, XIVE_ESB_OFF, xsrc->nr_irqs);
> > >>>> -
> > >>>> -    if (kvmppc_xive_enabled()) {
> > >>>> -        kvmppc_xive_source_reset(xsrc, &error_fatal);
> > >>>> -    }
> > >>>>  }
> > >>>>  
> > >>>>  static void xive_source_realize(DeviceState *dev, Error **errp)
> > >>>> diff --git a/hw/ppc/spapr_irq.c b/hw/ppc/spapr_irq.c
> > >>>> index ba27d9d8e972..5592eec3787b 100644
> > >>>> --- a/hw/ppc/spapr_irq.c
> > >>>> +++ b/hw/ppc/spapr_irq.c
> > >>>> @@ -98,20 +98,14 @@ static void spapr_irq_init_xics(sPAPRMachineState *spapr, Error **errp)
> > >>>>      int nr_irqs = spapr->irq->nr_irqs;
> > >>>>      Error *local_err = NULL;
> > >>>>  
> > >>>> -    if (kvm_enabled()) {
> > >>>> -        if (machine_kernel_irqchip_allowed(machine) &&
> > >>>> -            !xics_kvm_init(spapr, &local_err)) {
> > >>>> -            spapr->icp_type = TYPE_KVM_ICP;
> > >>>> -            spapr->ics = spapr_ics_create(spapr, TYPE_ICS_KVM, nr_irqs,
> > >>>> -                                          &local_err);
> > >>>> -        }
> > >>>> -        if (machine_kernel_irqchip_required(machine) && !spapr->ics) {
> > >>>> -            error_prepend(&local_err,
> > >>>> -                          "kernel_irqchip requested but unavailable: ");
> > >>>> -            goto error;  
> > >>>
> > >>> I don't see anything that replaces the irqchip_required logic, which
> > >>> doesn't seem right.  
> > >>
> > >> Yes. We do lose the ability to fall back to the emulated device in case
> > >> of failure. It is not impossible to do but it will require more changes
> > >> to check what are the KVM capabilities before starting the machine.  
> > > 
> > > Uh... it seems more like it's the other way around.  We'll always fall
> > > back to emulated, even if we've explicitly said on the command line
> > > that we don't want that.  
> > 
> > Ah yes. The init function might be also broken. 
> > 
> > XICS mode is a bit more difficult to handle than XIVE because we have 
> > different object type for the KVM device and the QEMU emulated device, 
> 
> This is indeed a bit unfortunate, but I think there's still room for
> improvement. Let's look at the base classes:
> 
> struct ICPStateClass {
>     DeviceClass parent_class;
> 
>     DeviceRealize parent_realize;
>     DeviceReset parent_reset;
> 
>     void (*pre_save)(ICPState *icp);
>     int (*post_load)(ICPState *icp, int version_id);
>     void (*synchronize_state)(ICPState *icp);
> };
> 
> struct ICSStateClass {
>     DeviceClass parent_class;
> 
>     DeviceRealize parent_realize;
>     DeviceReset parent_reset;
> 
>     void (*pre_save)(ICSState *s);
>     int (*post_load)(ICSState *s, int version_id);
>     void (*reject)(ICSState *s, uint32_t irq);
>     void (*resend)(ICSState *s);
>     void (*eoi)(ICSState *s, uint32_t irq);
>     void (*synchronize_state)(ICSState *s);
> };
> 
> The pre_save and post_load callbacks are only used with
> the KVM device. They could be explicitly called from
> the corresponding VMStateDescription callbacks with a
> kvm_enabled() && kvm_irqchip_in_kernel() check.
> 
> Same goes for the synchronize_state callbacks, which are only
> needed for 'info pic'.
> 
> The reject, resend and eoi callbacks are only called by code that
> belongs to the QEMU emulated device. Either the RTAS/hypercalls
> or from the machine code with explicit checks like:
> 
> static void spapr_irq_set_irq_xics(void *opaque, int srcno, int val)
> {
>     sPAPRMachineState *spapr = opaque;
>     MachineState *machine = MACHINE(opaque);
> 
>     if (kvm_enabled() && machine_kernel_irqchip_allowed(machine)) {
>         ics_kvm_set_irq(spapr->ics, srcno, val);
>     } else {
>         ics_simple_set_irq(spapr->ics, srcno, val);
>     }
> }
> 
> or
> 
> static int spapr_irq_post_load_xics(sPAPRMachineState *spapr, int version_id)
> {
>     if (!object_dynamic_cast(OBJECT(spapr->ics), TYPE_ICS_KVM)) {
>         CPUState *cs;
>         CPU_FOREACH(cs) {
>             PowerPCCPU *cpu = POWERPC_CPU(cs);
>             icp_resend(spapr_cpu_state(cpu)->icp);
>         }
>     }
>     return 0;
> }
> 
> Unless I'm missing something, the reject, resend and eoi callbacks could
> simply be removed. This would allow to unify KVM and QEMU emulation in
> the same ICP and ICS object types.
> 
> If this makes sense to you, I can have a look (already started actually ;-)

Please do.  The use of different object types was something that
seemed like a good idea at the time, but in hindsight, wasn't.  In
general different device types should represent guest-visibly
different objects, not just implementation differences.

> > and with the 'dual' mode, we activate the device at CAS reset time.
> > 
> > Failures being handled at reset time, should we keep the same logic and  
> > abort the machine at reset if the kernel irqchip is required ? 
> > 
> 
> If the user passed ic-mode=dual,kernel-irqchip=on, we should at least make
> sure KVM supports both XICS and XIVE devices during machine init. Then
> during reset if something goes wrong with KVM, it seems ok to abort.
> 
> If the user didn't pass kernel-irqchip, ie, kernel_irqchip_allowed is true
> and kernel_irqchip_required is false, the current behavior for XICS is
> to try KVM first and fallback to QEMU emulation. I guess it could be the
> same for XIVE.

Yes, I think that's the behaviour we want, on all counts.

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [Qemu-devel] [Qemu-ppc] [PATCH 13/13] spapr: add KVM support to the 'dual' machine
  2019-02-14  3:35             ` David Gibson
@ 2019-02-14  7:13               ` Cédric Le Goater
  0 siblings, 0 replies; 43+ messages in thread
From: Cédric Le Goater @ 2019-02-14  7:13 UTC (permalink / raw)
  To: David Gibson, Greg Kurz; +Cc: qemu-ppc, qemu-devel

On 2/14/19 4:35 AM, David Gibson wrote:
> On Wed, Feb 13, 2019 at 11:07:49AM +0100, Greg Kurz wrote:
>> On Wed, 13 Feb 2019 09:22:46 +0100
>> Cédric Le Goater <clg@kaod.org> wrote:
>>
>>> On 2/13/19 2:32 AM, David Gibson wrote:
>>>> On Tue, Feb 12, 2019 at 08:18:19AM +0100, Cédric Le Goater wrote:  
>>>>> On 2/12/19 2:11 AM, David Gibson wrote:  
>>>>>> On Mon, Jan 07, 2019 at 07:39:46PM +0100, Cédric Le Goater wrote:  
>>>>>>> The interrupt mode is chosen by the CAS negotiation process and
>>>>>>> activated after a reset to take into account the required changes in
>>>>>>> the machine. This brings new constraints on how the associated KVM IRQ
>>>>>>> device is initialized.
>>>>>>>
>>>>>>> Currently, each model takes care of the initialization of the KVM
>>>>>>> device in their realize method but this is not possible anymore as the
>>>>>>> initialization needs to be done globally when the interrupt mode is
>>>>>>> known, i.e. when the machine is reset. It also means that we need a way
>>>>>>> to delete a KVM device when another mode is chosen.
>>>>>>>
>>>>>>> Also, to support migration, the QEMU objects holding the state to
>>>>>>> transfer should always be available but not necessarily activated.
>>>>>>>
>>>>>>> The overall approach of this proposal is to initialize both interrupt
>>>>>>> mode at the QEMU level and keep the IRQ number space in sync to allow
>>>>>>> switching from one mode to another. For the KVM side of things, the
>>>>>>> whole initialization of the KVM device, sources and presenters, is
>>>>>>> grouped in a single routine. The XICS and XIVE sPAPR IRQ reset
>>>>>>> handlers are modified accordingly to handle the init and the delete
>>>>>>> sequences of the KVM device.
>>>>>>>
>>>>>>> As KVM is now initialized at reset, we lose the possibility to
>>>>>>> fall back to the QEMU emulated mode in case of failure and failures
>>>>>>> become fatal to the machine.
>>>>>>>
>>>>>>> Signed-off-by: Cédric Le Goater <clg@kaod.org>
>>>>>>> ---
>>>>>>>  hw/intc/spapr_xive.c     |  8 +---
>>>>>>>  hw/intc/spapr_xive_kvm.c | 27 ++++++++++++++
>>>>>>>  hw/intc/xics_kvm.c       | 25 +++++++++++++
>>>>>>>  hw/intc/xive.c           |  4 --
>>>>>>>  hw/ppc/spapr_irq.c       | 79 ++++++++++++++++++++++++++++------------
>>>>>>>  5 files changed, 109 insertions(+), 34 deletions(-)
>>>>>>>
>>>>>>> diff --git a/hw/intc/spapr_xive.c b/hw/intc/spapr_xive.c
>>>>>>> index 21f3c1ef0901..0661aca35900 100644
>>>>>>> --- a/hw/intc/spapr_xive.c
>>>>>>> +++ b/hw/intc/spapr_xive.c
>>>>>>> @@ -330,13 +330,7 @@ static void spapr_xive_realize(DeviceState *dev, Error **errp)
>>>>>>>      xive->eat = g_new0(XiveEAS, xive->nr_irqs);
>>>>>>>      xive->endt = g_new0(XiveEND, xive->nr_ends);
>>>>>>>  
>>>>>>> -    if (kvmppc_xive_enabled()) {
>>>>>>> -        kvmppc_xive_connect(xive, &local_err);
>>>>>>> -        if (local_err) {
>>>>>>> -            error_propagate(errp, local_err);
>>>>>>> -            return;
>>>>>>> -        }
>>>>>>> -    } else {
>>>>>>> +    if (!kvmppc_xive_enabled()) {
>>>>>>>          /* TIMA initialization */
>>>>>>>          memory_region_init_io(&xive->tm_mmio, OBJECT(xive), &xive_tm_ops, xive,
>>>>>>>                                "xive.tima", 4ull << TM_SHIFT);
>>>>>>> diff --git a/hw/intc/spapr_xive_kvm.c b/hw/intc/spapr_xive_kvm.c
>>>>>>> index d35814c1992e..3ebc947f2be7 100644
>>>>>>> --- a/hw/intc/spapr_xive_kvm.c
>>>>>>> +++ b/hw/intc/spapr_xive_kvm.c
>>>>>>> @@ -737,6 +737,15 @@ void kvmppc_xive_connect(sPAPRXive *xive, Error **errp)
>>>>>>>      Error *local_err = NULL;
>>>>>>>      size_t esb_len;
>>>>>>>      size_t tima_len;
>>>>>>> +    CPUState *cs;
>>>>>>> +
>>>>>>> +    /*
>>>>>>> +     * The KVM XIVE device already in use. This is the case when
>>>>>>> +     * rebooting XIVE -> XIVE  
>>>>>>
>>>>>> Can this case actually occur?  Further down you appear to
>>>>>> unconditionally destroy both KVM devices at reset time.  
>>>>>
>>>>> I guess you are right. I will check.
>>>>>  
>>>>>>> +     */
>>>>>>> +    if (xive->fd != -1) {
>>>>>>> +        return;
>>>>>>> +    }
>>>>>>>  
>>>>>>>      if (!kvm_enabled() || !kvmppc_has_cap_xive()) {
>>>>>>>          error_setg(errp, "IRQ_XIVE capability must be present for KVM");
>>>>>>> @@ -800,6 +809,24 @@ void kvmppc_xive_connect(sPAPRXive *xive, Error **errp)
>>>>>>>      xive->change = qemu_add_vm_change_state_handler(
>>>>>>>          kvmppc_xive_change_state_handler, xive);
>>>>>>>  
>>>>>>> +    /* Connect the presenters to the initial VCPUs of the machine */
>>>>>>> +    CPU_FOREACH(cs) {
>>>>>>> +        PowerPCCPU *cpu = POWERPC_CPU(cs);
>>>>>>> +
>>>>>>> +        kvmppc_xive_cpu_connect(cpu->tctx, &local_err);
>>>>>>> +        if (local_err) {
>>>>>>> +            error_propagate(errp, local_err);
>>>>>>> +            return;
>>>>>>> +        }
>>>>>>> +    }
>>>>>>> +
>>>>>>> +    /* Update the KVM sources */
>>>>>>> +    kvmppc_xive_source_reset(xsrc, &local_err);
>>>>>>> +    if (local_err) {
>>>>>>> +            error_propagate(errp, local_err);
>>>>>>> +            return;
>>>>>>> +    }
>>>>>>> +
>>>>>>>      kvm_kernel_irqchip = true;
>>>>>>>      kvm_msi_via_irqfd_allowed = true;
>>>>>>>      kvm_gsi_direct_mapping = true;
>>>>>>> diff --git a/hw/intc/xics_kvm.c b/hw/intc/xics_kvm.c
>>>>>>> index 1d21ff217b82..bfc35d71df7f 100644
>>>>>>> --- a/hw/intc/xics_kvm.c
>>>>>>> +++ b/hw/intc/xics_kvm.c
>>>>>>> @@ -448,6 +448,16 @@ static void rtas_dummy(PowerPCCPU *cpu, sPAPRMachineState *spapr,
>>>>>>>  int xics_kvm_init(sPAPRMachineState *spapr, Error **errp)
>>>>>>>  {
>>>>>>>      int rc;
>>>>>>> +    CPUState *cs;
>>>>>>> +    Error *local_err = NULL;
>>>>>>> +
>>>>>>> +    /*
>>>>>>> +     * The KVM XICS device already in use. This is the case when
>>>>>>> +     * rebooting XICS -> XICS
>>>>>>> +     */
>>>>>>> +    if (kernel_xics_fd != -1) {
>>>>>>> +        return 0;
>>>>>>> +    }
>>>>>>>  
>>>>>>>      if (!kvm_enabled() || !kvm_check_extension(kvm_state, KVM_CAP_IRQ_XICS)) {
>>>>>>>          error_setg(errp,
>>>>>>> @@ -496,6 +506,21 @@ int xics_kvm_init(sPAPRMachineState *spapr, Error **errp)
>>>>>>>      kvm_msi_via_irqfd_allowed = true;
>>>>>>>      kvm_gsi_direct_mapping = true;
>>>>>>>  
>>>>>>> +    /* Connect the presenters to the initial VCPUs of the machine */
>>>>>>> +    CPU_FOREACH(cs) {
>>>>>>> +        PowerPCCPU *cpu = POWERPC_CPU(cs);
>>>>>>> +
>>>>>>> +        icp_kvm_connect(cpu->icp, &local_err);
>>>>>>> +        if (local_err) {
>>>>>>> +            error_propagate(errp, local_err);
>>>>>>> +            goto fail;
>>>>>>> +        }
>>>>>>> +        icp_set_kvm_state(cpu->icp, 1);
>>>>>>> +    }
>>>>>>> +
>>>>>>> +    /* Update the KVM sources */
>>>>>>> +    ics_set_kvm_state(ICS_KVM(spapr->ics), 1);
>>>>>>> +
>>>>>>>      return 0;
>>>>>>>  
>>>>>>>  fail:
>>>>>>> diff --git a/hw/intc/xive.c b/hw/intc/xive.c
>>>>>>> index c5c2fbc3f8bc..c166eab5b210 100644
>>>>>>> --- a/hw/intc/xive.c
>>>>>>> +++ b/hw/intc/xive.c
>>>>>>> @@ -932,10 +932,6 @@ static void xive_source_reset(void *dev)
>>>>>>>  
>>>>>>>      /* PQs are initialized to 0b01 (Q=1) which corresponds to "ints off" */
>>>>>>>      memset(xsrc->status, XIVE_ESB_OFF, xsrc->nr_irqs);
>>>>>>> -
>>>>>>> -    if (kvmppc_xive_enabled()) {
>>>>>>> -        kvmppc_xive_source_reset(xsrc, &error_fatal);
>>>>>>> -    }
>>>>>>>  }
>>>>>>>  
>>>>>>>  static void xive_source_realize(DeviceState *dev, Error **errp)
>>>>>>> diff --git a/hw/ppc/spapr_irq.c b/hw/ppc/spapr_irq.c
>>>>>>> index ba27d9d8e972..5592eec3787b 100644
>>>>>>> --- a/hw/ppc/spapr_irq.c
>>>>>>> +++ b/hw/ppc/spapr_irq.c
>>>>>>> @@ -98,20 +98,14 @@ static void spapr_irq_init_xics(sPAPRMachineState *spapr, Error **errp)
>>>>>>>      int nr_irqs = spapr->irq->nr_irqs;
>>>>>>>      Error *local_err = NULL;
>>>>>>>  
>>>>>>> -    if (kvm_enabled()) {
>>>>>>> -        if (machine_kernel_irqchip_allowed(machine) &&
>>>>>>> -            !xics_kvm_init(spapr, &local_err)) {
>>>>>>> -            spapr->icp_type = TYPE_KVM_ICP;
>>>>>>> -            spapr->ics = spapr_ics_create(spapr, TYPE_ICS_KVM, nr_irqs,
>>>>>>> -                                          &local_err);
>>>>>>> -        }
>>>>>>> -        if (machine_kernel_irqchip_required(machine) && !spapr->ics) {
>>>>>>> -            error_prepend(&local_err,
>>>>>>> -                          "kernel_irqchip requested but unavailable: ");
>>>>>>> -            goto error;  
>>>>>>
>>>>>> I don't see anything that replaces the irqchip_required logic, which
>>>>>> doesn't seem right.  
>>>>>
>>>>> Yes. We do lose the ability to fall back to the emulated device in case
>>>>> of failure. It is not impossible to do but it will require more changes
>>>>> to check what are the KVM capabilities before starting the machine.  
>>>>
>>>> Uh... it seems more like it's the other way around.  We'll always fall
>>>> back to emulated, even if we've explicitly said on the command line
>>>> that we don't want that.  
>>>
>>> Ah yes. The init function might be also broken. 
>>>
>>> XICS mode is a bit more difficult to handle than XIVE because we have 
>>> different object type for the KVM device and the QEMU emulated device, 
>>
>> This is indeed a bit unfortunate, but I think there's still room for
>> improvement. Let's look at the base classes:
>>
>> struct ICPStateClass {
>>     DeviceClass parent_class;
>>
>>     DeviceRealize parent_realize;
>>     DeviceReset parent_reset;
>>
>>     void (*pre_save)(ICPState *icp);
>>     int (*post_load)(ICPState *icp, int version_id);
>>     void (*synchronize_state)(ICPState *icp);
>> };
>>
>> struct ICSStateClass {
>>     DeviceClass parent_class;
>>
>>     DeviceRealize parent_realize;
>>     DeviceReset parent_reset;
>>
>>     void (*pre_save)(ICSState *s);
>>     int (*post_load)(ICSState *s, int version_id);
>>     void (*reject)(ICSState *s, uint32_t irq);
>>     void (*resend)(ICSState *s);
>>     void (*eoi)(ICSState *s, uint32_t irq);
>>     void (*synchronize_state)(ICSState *s);
>> };
>>
>> The pre_save and post_load callbacks are only used with
>> the KVM device. They could be explicitly called from
>> the corresponding VMStateDescription callbacks with a
>> kvm_enabled() && kvm_irqchip_in_kernel() check.

yes.
>> Same goes for the synchronize_state callbacks, which are only
>> needed for 'info pic'.

yes, like we do for XIVE.

>> The reject, resend and eoi callbacks are only called by code that
>> belongs to the QEMU emulated device. Either the RTAS/hypercalls
>> or from the machine code with explicit checks like:
>>
>> static void spapr_irq_set_irq_xics(void *opaque, int srcno, int val)
>> {
>>     sPAPRMachineState *spapr = opaque;
>>     MachineState *machine = MACHINE(opaque);
>>
>>     if (kvm_enabled() && machine_kernel_irqchip_allowed(machine)) {
>>         ics_kvm_set_irq(spapr->ics, srcno, val);
>>     } else {
>>         ics_simple_set_irq(spapr->ics, srcno, val);
>>     }
>> }
>>
>> or
>>
>> static int spapr_irq_post_load_xics(sPAPRMachineState *spapr, int version_id)
>> {
>>     if (!object_dynamic_cast(OBJECT(spapr->ics), TYPE_ICS_KVM)) {
>>         CPUState *cs;
>>         CPU_FOREACH(cs) {
>>             PowerPCCPU *cpu = POWERPC_CPU(cs);
>>             icp_resend(spapr_cpu_state(cpu)->icp);
>>         }
>>     }
>>     return 0;
>> }
>>
>> Unless I'm missing something, the reject, resend and eoi callbacks could
>> simply be removed. This would allow to unify KVM and QEMU emulation in
>> the same ICP and ICS object types.

yes.  

>> If this makes sense to you, I can have a look (already started actually ;-)
> 
> Please do.  The use of different object types was something that
> seemed like a good idea at the time, but in hindsight, wasn't.  In
> general different device types should represent guest-visibly
> different objects, not just implementation differences.

yes. The direction we took to add KVM support in XIVE proved to be 
much simpler.

>>> and with the 'dual' mode, we activate the device at CAS reset time.
>>>
>>> Failures being handled at reset time, should we keep the same logic and  
>>> abort the machine at reset if the kernel irqchip is required ? 
>>>
>>
>> If the user passed ic-mode=dual,kernel-irqchip=on, we should at least make
>> sure KVM supports both XICS and XIVE devices during machine init. Then
>> during reset if something goes wrong with KVM, it seems ok to abort.
>>
>> If the user didn't pass kernel-irqchip, ie, kernel_irqchip_allowed is true
>> and kernel_irqchip_required is false, the current behavior for XICS is
>> to try KVM first and fallback to QEMU emulation. I guess it could be the
>> same for XIVE.
> 
> Yes, I think that's the behaviour we want, on all counts.
 
Thanks,

C. 

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [Qemu-devel] [PATCH 13/13] spapr: add KVM support to the 'dual' machine
  2019-02-13  1:32       ` David Gibson
  2019-02-13  8:22         ` Cédric Le Goater
@ 2019-02-22 12:36         ` Cédric Le Goater
  1 sibling, 0 replies; 43+ messages in thread
From: Cédric Le Goater @ 2019-02-22 12:36 UTC (permalink / raw)
  To: David Gibson; +Cc: Benjamin Herrenschmidt, qemu-ppc, qemu-devel

On 2/13/19 2:32 AM, David Gibson wrote:
> On Tue, Feb 12, 2019 at 08:18:19AM +0100, Cédric Le Goater wrote:
>> On 2/12/19 2:11 AM, David Gibson wrote:
>>> On Mon, Jan 07, 2019 at 07:39:46PM +0100, Cédric Le Goater wrote:
>>>> The interrupt mode is chosen by the CAS negotiation process and
>>>> activated after a reset to take into account the required changes in
>>>> the machine. This brings new constraints on how the associated KVM IRQ
>>>> device is initialized.
>>>>
>>>> Currently, each model takes care of the initialization of the KVM
>>>> device in their realize method but this is not possible anymore as the
>>>> initialization needs to be done globally when the interrupt mode is
>>>> known, i.e. when the machine is reset. It also means that we need a way
>>>> to delete a KVM device when another mode is chosen.
>>>>
>>>> Also, to support migration, the QEMU objects holding the state to
>>>> transfer should always be available but not necessarily activated.
>>>>
>>>> The overall approach of this proposal is to initialize both interrupt
>>>> mode at the QEMU level and keep the IRQ number space in sync to allow
>>>> switching from one mode to another. For the KVM side of things, the
>>>> whole initialization of the KVM device, sources and presenters, is
>>>> grouped in a single routine. The XICS and XIVE sPAPR IRQ reset
>>>> handlers are modified accordingly to handle the init and the delete
>>>> sequences of the KVM device.
>>>>
>>>> As KVM is now initialized at reset, we lose the possibility to
>>>> fall back to the QEMU emulated mode in case of failure and failures
>>>> become fatal to the machine.
>>>>
>>>> Signed-off-by: Cédric Le Goater <clg@kaod.org>
>>>> ---
>>>>  hw/intc/spapr_xive.c     |  8 +---
>>>>  hw/intc/spapr_xive_kvm.c | 27 ++++++++++++++
>>>>  hw/intc/xics_kvm.c       | 25 +++++++++++++
>>>>  hw/intc/xive.c           |  4 --
>>>>  hw/ppc/spapr_irq.c       | 79 ++++++++++++++++++++++++++++------------
>>>>  5 files changed, 109 insertions(+), 34 deletions(-)
>>>>
>>>> diff --git a/hw/intc/spapr_xive.c b/hw/intc/spapr_xive.c
>>>> index 21f3c1ef0901..0661aca35900 100644
>>>> --- a/hw/intc/spapr_xive.c
>>>> +++ b/hw/intc/spapr_xive.c
>>>> @@ -330,13 +330,7 @@ static void spapr_xive_realize(DeviceState *dev, Error **errp)
>>>>      xive->eat = g_new0(XiveEAS, xive->nr_irqs);
>>>>      xive->endt = g_new0(XiveEND, xive->nr_ends);
>>>>  
>>>> -    if (kvmppc_xive_enabled()) {
>>>> -        kvmppc_xive_connect(xive, &local_err);
>>>> -        if (local_err) {
>>>> -            error_propagate(errp, local_err);
>>>> -            return;
>>>> -        }
>>>> -    } else {
>>>> +    if (!kvmppc_xive_enabled()) {
>>>>          /* TIMA initialization */
>>>>          memory_region_init_io(&xive->tm_mmio, OBJECT(xive), &xive_tm_ops, xive,
>>>>                                "xive.tima", 4ull << TM_SHIFT);
>>>> diff --git a/hw/intc/spapr_xive_kvm.c b/hw/intc/spapr_xive_kvm.c
>>>> index d35814c1992e..3ebc947f2be7 100644
>>>> --- a/hw/intc/spapr_xive_kvm.c
>>>> +++ b/hw/intc/spapr_xive_kvm.c
>>>> @@ -737,6 +737,15 @@ void kvmppc_xive_connect(sPAPRXive *xive, Error **errp)
>>>>      Error *local_err = NULL;
>>>>      size_t esb_len;
>>>>      size_t tima_len;
>>>> +    CPUState *cs;
>>>> +
>>>> +    /*
>>>> +     * The KVM XIVE device already in use. This is the case when
>>>> +     * rebooting XIVE -> XIVE
>>>
>>> Can this case actually occur?  Further down you appear to
>>> unconditionally destroy both KVM devices at reset time.
>>
>> I guess you are right. I will check.

So we have to keep this check for ic-mode=xive.

The number of test combinations is exploding. We now have:

3 hypervisors              KVM, KVM nested, QEMU TCG
3 interrupt modes          xics, xive, dual (xics+xive)
3 kernel irqchip modes     off, allow, required


C.
  

^ permalink raw reply	[flat|nested] 43+ messages in thread

end of thread, other threads:[~2019-02-22 12:37 UTC | newest]

Thread overview: 43+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-01-07 18:39 [Qemu-devel] [PATCH 00/13] spapr: add KVM support to the XIVE interrupt mode Cédric Le Goater
2019-01-07 18:39 ` [Qemu-devel] [PATCH 01/13] linux-headers: update to 5.0 Cédric Le Goater
2019-01-07 18:39 ` [Qemu-devel] [PATCH 02/13] spapr/xive: add KVM support Cédric Le Goater
2019-02-06  2:39   ` David Gibson
2019-01-07 18:39 ` [Qemu-devel] [PATCH 03/13] spapr/xive: add state synchronization with KVM Cédric Le Goater
2019-02-06  2:42   ` David Gibson
2019-01-07 18:39 ` [Qemu-devel] [PATCH 04/13] spapr/xive: introduce a VM state change handler Cédric Le Goater
2019-02-06  2:49   ` David Gibson
2019-01-07 18:39 ` [Qemu-devel] [PATCH 05/13] spapr/xive: add migration support for KVM Cédric Le Goater
2019-02-07  3:41   ` David Gibson
2019-01-07 18:39 ` [Qemu-devel] [PATCH 06/13] spapr/xive: fix migration of the XiveTCTX under TCG Cédric Le Goater
2019-02-08  5:36   ` David Gibson
2019-02-08  7:12     ` Cédric Le Goater
2019-02-12  0:22       ` David Gibson
2019-02-12  6:58         ` Cédric Le Goater
2019-01-07 18:39 ` [Qemu-devel] [PATCH 07/13] ppc/xics: introduce a icp_kvm_connect() routine Cédric Le Goater
2019-01-07 18:39 ` [Qemu-devel] [PATCH 08/13] spapr/rtas: modify spapr_rtas_register() to remove RTAS handlers Cédric Le Goater
2019-01-29  5:09   ` Alexey Kardashevskiy
2019-01-29  7:20     ` Cédric Le Goater
2019-01-07 18:39 ` [Qemu-devel] [PATCH 09/13] sysbus: add a sysbus_mmio_unmap() helper Cédric Le Goater
2019-01-07 18:39 ` [Qemu-devel] [PATCH 10/13] spapr: introduce routines to delete the KVM IRQ device Cédric Le Goater
2019-02-12  0:58   ` David Gibson
2019-01-07 18:39 ` [Qemu-devel] [PATCH 11/13] spapr: check for the activation of " Cédric Le Goater
2019-02-12  1:01   ` David Gibson
2019-02-12  7:12     ` Cédric Le Goater
2019-02-13  0:17       ` David Gibson
2019-01-07 18:39 ` [Qemu-devel] [PATCH 12/13] spapr/xics: ignore the lower 4K in the IRQ number space Cédric Le Goater
2019-02-12  1:06   ` David Gibson
2019-02-12  7:05     ` Cédric Le Goater
2019-02-13  1:33       ` David Gibson
2019-02-13  8:03         ` Cédric Le Goater
2019-02-13 11:27           ` [Qemu-devel] [Qemu-ppc] " Greg Kurz
2019-02-13 12:11             ` Greg Kurz
2019-01-07 18:39 ` [Qemu-devel] [PATCH 13/13] spapr: add KVM support to the 'dual' machine Cédric Le Goater
2019-02-12  1:11   ` David Gibson
2019-02-12  7:18     ` Cédric Le Goater
2019-02-13  1:32       ` David Gibson
2019-02-13  8:22         ` Cédric Le Goater
2019-02-13 10:07           ` [Qemu-devel] [Qemu-ppc] " Greg Kurz
2019-02-14  3:35             ` David Gibson
2019-02-14  7:13               ` Cédric Le Goater
2019-02-14  3:29           ` [Qemu-devel] " David Gibson
2019-02-22 12:36         ` Cédric Le Goater

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.