* [Qemu-devel] [RFC 0/3] pci_expander_bridge: Put pxb host bridge into separate pci domain
@ 2018-05-20  7:28 Zihan Yang
  2018-05-20  7:28 ` [Qemu-devel] [RFC 1/3] pci_expander_bridge: reserve enough mcfg space for pxb host Zihan Yang
                   ` (3 more replies)
  0 siblings, 4 replies; 46+ messages in thread
From: Zihan Yang @ 2018-05-20  7:28 UTC (permalink / raw)
  To: qemu-devel; +Cc: Zihan Yang

Currently only the q35 host bridge is allocated an item in the MCFG table; all pxb
host bridges stay within PCI domain 0. This series of patches puts each pxb
host bridge in a separate PCI domain, allocating a new MCFG table item for it.
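
For reference, each MCFG entry describes one ECAM window and the PCI segment
(domain) it belongs to. A minimal C sketch of the per-entry layout defined by
the PCI Firmware specification (struct and field names here are illustrative,
not the ones used in the QEMU tree):

    #include <stdint.h>

    /* One MCFG allocation entry: 16 bytes per PCI segment group */
    struct mcfg_allocation {
        uint64_t base_address;       /* ECAM base for this segment group */
        uint16_t pci_segment_group;  /* the PCI domain number */
        uint8_t  start_bus_number;
        uint8_t  end_bus_number;
        uint32_t reserved;
    };

Putting each pxb host bridge in its own domain therefore means appending one
such entry per pxb host, instead of the single entry emitted today.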

The pxb host does not have an MCH in it, as the q35 host already has one. All
pxb host bridges just serve as expander bridges in QEMU, which makes them
a little simpler than the q35 host bridge.

NOTE: These patches are just an initial proof of concept, asking for suggestions.
      The AML part and SeaBIOS part are not finished at all. This means the
      series still behaves as before: all PCI expander bridges stay in the
      same PCI domain as the q35 host.

Zihan Yang (3):
  pci_expander_bridge: reserve enough mcfg space for pxb host
  pci: Link pci_host_bridges with QTAILQ
  acpi-build: allocate mcfg for multiple host bridges

 hw/i386/acpi-build.c                        | 83 ++++++++++++++++++-------
 hw/i386/pc.c                                |  5 ++
 hw/pci-bridge/pci_expander_bridge.c         | 96 ++++++++++++++++++++++++++++-
 hw/pci/pci.c                                |  9 +--
 include/hw/pci-bridge/pci_expander_bridge.h |  7 +++
 include/hw/pci/pci_host.h                   |  2 +-
 6 files changed, 175 insertions(+), 27 deletions(-)
 create mode 100644 include/hw/pci-bridge/pci_expander_bridge.h

-- 
2.7.4

^ permalink raw reply	[flat|nested] 46+ messages in thread

* [Qemu-devel] [RFC 1/3] pci_expander_bridge: reserve enough mcfg space for pxb host
  2018-05-20  7:28 [Qemu-devel] [RFC 0/3] pci_expander_bridge: Put pxb host bridge into separate pci domain Zihan Yang
@ 2018-05-20  7:28 ` Zihan Yang
  2018-05-21 11:03   ` Marcel Apfelbaum
  2018-05-20  7:28 ` [Qemu-devel] [RFC 2/3] pci: Link pci_host_bridges with QTAILQ Zihan Yang
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 46+ messages in thread
From: Zihan Yang @ 2018-05-20  7:28 UTC (permalink / raw)
  To: qemu-devel
  Cc: Zihan Yang, Paolo Bonzini, Richard Henderson, Eduardo Habkost,
	Michael S. Tsirkin, Marcel Apfelbaum

To put each pxb into a separate PCI domain, we need to reserve enough MCFG space
for each pxb host in main memory. First try to put them under 4G, before
pci_hole_start; if there are too many hosts, try to put them into main memory
above 4G, before pci_hole64_start. We should check that there is enough memory
to reserve for them.

Signed-off-by: Zihan Yang <whois.zihan.yang@gmail.com>
---
 hw/i386/pc.c                                |  5 ++
 hw/pci-bridge/pci_expander_bridge.c         | 96 ++++++++++++++++++++++++++++-
 include/hw/pci-bridge/pci_expander_bridge.h |  7 +++
 3 files changed, 107 insertions(+), 1 deletion(-)
 create mode 100644 include/hw/pci-bridge/pci_expander_bridge.h

diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index d768930..98097fd 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -34,6 +34,7 @@
 #include "hw/ide.h"
 #include "hw/pci/pci.h"
 #include "hw/pci/pci_bus.h"
+#include "hw/pci-bridge/pci_expander_bridge.h"
 #include "hw/nvram/fw_cfg.h"
 #include "hw/timer/hpet.h"
 #include "hw/smbios/smbios.h"
@@ -1466,6 +1467,7 @@ uint64_t pc_pci_hole64_start(void)
     PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms);
     MachineState *ms = MACHINE(pcms);
     uint64_t hole64_start = 0;
+    int pxb_hosts;
 
     if (pcmc->has_reserved_memory && ms->device_memory->base) {
         hole64_start = ms->device_memory->base;
@@ -1473,6 +1475,9 @@ uint64_t pc_pci_hole64_start(void)
             hole64_start += memory_region_size(&ms->device_memory->mr);
         }
     } else {
+        /* make sure enough space is left for pxb host, otherwise fail */
+        pxb_hosts = pxb_get_expander_hosts();
+        g_assert (pxb_hosts <= 0 || pcms->above_4g_mem_size >= (pxb_hosts << 28ULL));
         hole64_start = 0x100000000ULL + pcms->above_4g_mem_size;
     }
 
diff --git a/hw/pci-bridge/pci_expander_bridge.c b/hw/pci-bridge/pci_expander_bridge.c
index e62de42..8409c87 100644
--- a/hw/pci-bridge/pci_expander_bridge.c
+++ b/hw/pci-bridge/pci_expander_bridge.c
@@ -12,10 +12,13 @@
 
 #include "qemu/osdep.h"
 #include "qapi/error.h"
+#include "qapi/visitor.h"
 #include "hw/pci/pci.h"
 #include "hw/pci/pci_bus.h"
 #include "hw/pci/pci_host.h"
 #include "hw/pci/pci_bridge.h"
+#include "hw/pci-host/q35.h"
+#include "hw/pci-bridge/pci_expander_bridge.h"
 #include "qemu/range.h"
 #include "qemu/error-report.h"
 #include "sysemu/numa.h"
@@ -57,7 +60,13 @@ static PXBDev *convert_to_pxb(PCIDevice *dev)
 
 static GList *pxb_dev_list;
 
+typedef struct PXBPCIHost {
+    PCIExpressHost parent_obj;
+} PXBPCIHost;
+
 #define TYPE_PXB_HOST "pxb-host"
+#define PXB_HOST_DEVICE(obj) \
+     OBJECT_CHECK(PXBPCIHost, (obj), TYPE_PXB_HOST)
 
 static int pxb_bus_num(PCIBus *bus)
 {
@@ -111,6 +120,14 @@ static const char *pxb_host_root_bus_path(PCIHostState *host_bridge,
     return bus->bus_path;
 }
 
+static void pxb_host_get_mmcfg_size(Object *obj, Visitor *v, const char *name,
+                                    void *opaque, Error **errp)
+{
+    PCIExpressHost *e = PCIE_HOST_BRIDGE(obj);
+
+    visit_type_uint64(v, name, &e->size, errp);
+}
+
 static char *pxb_host_ofw_unit_address(const SysBusDevice *dev)
 {
     const PCIHostState *pxb_host;
@@ -142,6 +159,80 @@ static char *pxb_host_ofw_unit_address(const SysBusDevice *dev)
     return NULL;
 }
 
+static Object *pxb_get_i386_pci_host(void)
+{
+    PCIHostState *host;
+
+    host = OBJECT_CHECK(PCIHostState,
+                        object_resolve_path("/machine/i440fx", NULL),
+                        TYPE_PCI_HOST_BRIDGE);
+    if (!host) {
+        host = OBJECT_CHECK(PCIHostState,
+                            object_resolve_path("/machine/q35", NULL),
+                            TYPE_PCI_HOST_BRIDGE);
+    }
+
+    return OBJECT(host);
+}
+
+int pxhb_cnt = 0;
+
+/* -1 means to exclude q35 host */
+#define MCFG_IN_PCI_HOLE (((IO_APIC_DEFAULT_ADDRESS - MCH_HOST_BRIDGE_PCIEXBAR_DEFAULT) >> 28) - 1)
+
+int pxb_get_expander_hosts(void)
+{
+    return pxhb_cnt - MCFG_IN_PCI_HOLE;
+}
+
+/* Dirty workaround */
+static void modify_q35_pci_hole(void)
+{
+    Object *pci_host;
+    Q35PCIHost *s;
+
+    pci_host = pxb_get_i386_pci_host();
+    g_assert(pci_host);
+    s = Q35_HOST_DEVICE(pci_host);
+
+    ++pxhb_cnt;
+    if (pxhb_cnt <= MCFG_IN_PCI_HOLE) {
+        range_set_bounds(&s->mch.pci_hole,
+            MCH_HOST_BRIDGE_PCIEXBAR_DEFAULT + ((1 + pxhb_cnt) << 28),
+            IO_APIC_DEFAULT_ADDRESS - 1);
+    }
+
+    // leave pci hole64 to acpi build part
+}
+
+static void pxb_host_initfn(Object *obj)
+{
+    PCIHostState *phb = PCI_HOST_BRIDGE(obj);
+
+    memory_region_init_io(&phb->conf_mem, obj, &pci_host_conf_le_ops, phb,
+                          "pci-conf-idx", 4);
+    memory_region_init_io(&phb->data_mem, obj, &pci_host_data_le_ops, phb,
+                          "pci-conf-data", 4);
+
+    object_property_add(obj, PCIE_HOST_MCFG_SIZE, "uint64",
+                         pxb_host_get_mmcfg_size,
+                         NULL, NULL, NULL, NULL);
+
+    /* Leave enough space for the biggest MCFG BAR */
+    /* TODO. Since pxb host is just an expander bridge without an mch,
+     * we modify the range in q35 host. It should be workable as it is
+     * before acpi build, although it is dirty
+     */
+    modify_q35_pci_hole();
+}
+
+/* default value does not matter as guest firmware will overwrite it */
+static Property pxb_host_props[] = {
+    DEFINE_PROP_UINT64(PCIE_HOST_MCFG_BASE, PXBPCIHost, parent_obj.base_addr,
+                        MCH_HOST_BRIDGE_PCIEXBAR_DEFAULT),
+    DEFINE_PROP_END_OF_LIST(),
+};
+
 static void pxb_host_class_init(ObjectClass *class, void *data)
 {
     DeviceClass *dc = DEVICE_CLASS(class);
@@ -149,6 +240,7 @@ static void pxb_host_class_init(ObjectClass *class, void *data)
     PCIHostBridgeClass *hc = PCI_HOST_BRIDGE_CLASS(class);
 
     dc->fw_name = "pci";
+    dc->props = pxb_host_props;
     /* Reason: Internal part of the pxb/pxb-pcie device, not usable by itself */
     dc->user_creatable = false;
     sbc->explicit_ofw_unit_address = pxb_host_ofw_unit_address;
@@ -157,7 +249,9 @@ static void pxb_host_class_init(ObjectClass *class, void *data)
 
 static const TypeInfo pxb_host_info = {
     .name          = TYPE_PXB_HOST,
-    .parent        = TYPE_PCI_HOST_BRIDGE,
+    .parent        = TYPE_PCIE_HOST_BRIDGE,
+    .instance_size = sizeof(PXBPCIHost),
+    .instance_init = pxb_host_initfn,
     .class_init    = pxb_host_class_init,
 };
 
diff --git a/include/hw/pci-bridge/pci_expander_bridge.h b/include/hw/pci-bridge/pci_expander_bridge.h
new file mode 100644
index 0000000..d48ddb1
--- /dev/null
+++ b/include/hw/pci-bridge/pci_expander_bridge.h
@@ -0,0 +1,7 @@
+#ifndef HW_PCI_EXPANDER_H
+#define HW_PCI_EXPANDER_H
+
+/* return the number of pxb hosts that resides in the main memory above 4G */
+int pxb_get_expander_hosts(void);
+
+#endif
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 46+ messages in thread

* [Qemu-devel] [RFC 2/3] pci: Link pci_host_bridges with QTAILQ
  2018-05-20  7:28 [Qemu-devel] [RFC 0/3] pci_expander_bridge: Put pxb host bridge into separate pci domain Zihan Yang
  2018-05-20  7:28 ` [Qemu-devel] [RFC 1/3] pci_expander_bridge: reserve enough mcfg space for pxb host Zihan Yang
@ 2018-05-20  7:28 ` Zihan Yang
  2018-05-21 11:05   ` Marcel Apfelbaum
  2018-05-20  7:28 ` [Qemu-devel] [RFC 3/3] acpi-build: allocate mcfg for multiple host bridges Zihan Yang
  2018-05-21 15:23 ` [Qemu-devel] [RFC 0/3] pci_expander_bridge: Put pxb host bridge into separate pci domain Marcel Apfelbaum
  3 siblings, 1 reply; 46+ messages in thread
From: Zihan Yang @ 2018-05-20  7:28 UTC (permalink / raw)
  To: qemu-devel; +Cc: Zihan Yang, Michael S. Tsirkin, Marcel Apfelbaum

QLIST places the original q35 host bridge at the end of the list because it is
added first. Replace it with QTAILQ so that q35 stays at the head of the queue,
which keeps things convenient and compatible when there are pxb hosts in
addition to the q35 host.

Signed-off-by: Zihan Yang <whois.zihan.yang@gmail.com>
---
 hw/pci/pci.c              | 9 +++++----
 include/hw/pci/pci_host.h | 2 +-
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/hw/pci/pci.c b/hw/pci/pci.c
index 80bc459..ddc27ba 100644
--- a/hw/pci/pci.c
+++ b/hw/pci/pci.c
@@ -196,7 +196,8 @@ static void pci_del_option_rom(PCIDevice *pdev);
 static uint16_t pci_default_sub_vendor_id = PCI_SUBVENDOR_ID_REDHAT_QUMRANET;
 static uint16_t pci_default_sub_device_id = PCI_SUBDEVICE_ID_QEMU;
 
-static QLIST_HEAD(, PCIHostState) pci_host_bridges;
+static QTAILQ_HEAD(, PCIHostState) pci_host_bridges =
+    QTAILQ_HEAD_INITIALIZER(pci_host_bridges);
 
 int pci_bar(PCIDevice *d, int reg)
 {
@@ -330,7 +331,7 @@ static void pci_host_bus_register(DeviceState *host)
 {
     PCIHostState *host_bridge = PCI_HOST_BRIDGE(host);
 
-    QLIST_INSERT_HEAD(&pci_host_bridges, host_bridge, next);
+    QTAILQ_INSERT_TAIL(&pci_host_bridges, host_bridge, next);
 }
 
 PCIBus *pci_device_root_bus(const PCIDevice *d)
@@ -1798,7 +1799,7 @@ PciInfoList *qmp_query_pci(Error **errp)
     PciInfoList *info, *head = NULL, *cur_item = NULL;
     PCIHostState *host_bridge;
 
-    QLIST_FOREACH(host_bridge, &pci_host_bridges, next) {
+    QTAILQ_FOREACH(host_bridge, &pci_host_bridges, next) {
         info = g_malloc0(sizeof(*info));
         info->value = qmp_query_pci_bus(host_bridge->bus,
                                         pci_bus_num(host_bridge->bus));
@@ -2493,7 +2494,7 @@ int pci_qdev_find_device(const char *id, PCIDevice **pdev)
     PCIHostState *host_bridge;
     int rc = -ENODEV;
 
-    QLIST_FOREACH(host_bridge, &pci_host_bridges, next) {
+    QTAILQ_FOREACH(host_bridge, &pci_host_bridges, next) {
         int tmp = pci_qdev_find_recursive(host_bridge->bus, id, pdev);
         if (!tmp) {
             rc = 0;
diff --git a/include/hw/pci/pci_host.h b/include/hw/pci/pci_host.h
index ba31595..a5617cf 100644
--- a/include/hw/pci/pci_host.h
+++ b/include/hw/pci/pci_host.h
@@ -47,7 +47,7 @@ struct PCIHostState {
     uint32_t config_reg;
     PCIBus *bus;
 
-    QLIST_ENTRY(PCIHostState) next;
+    QTAILQ_ENTRY(PCIHostState) next;
 };
 
 typedef struct PCIHostBridgeClass {
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 46+ messages in thread

* [Qemu-devel] [RFC 3/3] acpi-build: allocate mcfg for multiple host bridges
  2018-05-20  7:28 [Qemu-devel] [RFC 0/3] pci_expander_bridge: Put pxb host bridge into separate pci domain Zihan Yang
  2018-05-20  7:28 ` [Qemu-devel] [RFC 1/3] pci_expander_bridge: reserve enough mcfg space for pxb host Zihan Yang
  2018-05-20  7:28 ` [Qemu-devel] [RFC 2/3] pci: Link pci_host_bridges with QTAILQ Zihan Yang
@ 2018-05-20  7:28 ` Zihan Yang
  2018-05-21 11:53   ` Marcel Apfelbaum
  2018-05-21 15:23 ` [Qemu-devel] [RFC 0/3] pci_expander_bridge: Put pxb host bridge into separate pci domain Marcel Apfelbaum
  3 siblings, 1 reply; 46+ messages in thread
From: Zihan Yang @ 2018-05-20  7:28 UTC (permalink / raw)
  To: qemu-devel
  Cc: Zihan Yang, Michael S. Tsirkin, Igor Mammedov, Marcel Apfelbaum,
	Paolo Bonzini, Richard Henderson, Eduardo Habkost

Currently only the q35 host bridge is allocated space in the MCFG table. To put
pxb hosts into separate PCI domains, each of them should have its own
configuration space entry in the MCFG table.

Signed-off-by: Zihan Yang <whois.zihan.yang@gmail.com>
---
 hw/i386/acpi-build.c | 83 +++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 62 insertions(+), 21 deletions(-)

diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
index 9bc6d97..808d815 100644
--- a/hw/i386/acpi-build.c
+++ b/hw/i386/acpi-build.c
@@ -89,6 +89,7 @@
 typedef struct AcpiMcfgInfo {
     uint64_t mcfg_base;
     uint32_t mcfg_size;
+    struct AcpiMcfgInfo *next;
 } AcpiMcfgInfo;
 
 typedef struct AcpiPmInfo {
@@ -2427,14 +2428,15 @@ build_mcfg_q35(GArray *table_data, BIOSLinker *linker, AcpiMcfgInfo *info)
 {
     AcpiTableMcfg *mcfg;
     const char *sig;
-    int len = sizeof(*mcfg) + 1 * sizeof(mcfg->allocation[0]);
+    int len, count = 0;
+    AcpiMcfgInfo *cfg = info;
 
+    while (cfg) {
+        ++count;
+        cfg = cfg->next;
+    }
+    len = sizeof(*mcfg) + count * sizeof(mcfg->allocation[0]);
     mcfg = acpi_data_push(table_data, len);
-    mcfg->allocation[0].address = cpu_to_le64(info->mcfg_base);
-    /* Only a single allocation so no need to play with segments */
-    mcfg->allocation[0].pci_segment = cpu_to_le16(0);
-    mcfg->allocation[0].start_bus_number = 0;
-    mcfg->allocation[0].end_bus_number = PCIE_MMCFG_BUS(info->mcfg_size - 1);
 
     /* MCFG is used for ECAM which can be enabled or disabled by guest.
      * To avoid table size changes (which create migration issues),
@@ -2448,6 +2450,17 @@ build_mcfg_q35(GArray *table_data, BIOSLinker *linker, AcpiMcfgInfo *info)
     } else {
         sig = "MCFG";
     }
+
+    count = 0;
+    while (info) {
+        mcfg[count].allocation[0].address = cpu_to_le64(info->mcfg_base);
+        /* Only a single allocation so no need to play with segments */
+        mcfg[count].allocation[0].pci_segment = cpu_to_le16(count);
+        mcfg[count].allocation[0].start_bus_number = 0;
+        mcfg[count++].allocation[0].end_bus_number = PCIE_MMCFG_BUS(info->mcfg_size - 1);
+        info = info->next;
+    }
+
     build_header(linker, table_data, (void *)mcfg, sig, len, 1, NULL, NULL);
 }
 
@@ -2602,26 +2615,52 @@ struct AcpiBuildState {
     MemoryRegion *linker_mr;
 } AcpiBuildState;
 
-static bool acpi_get_mcfg(AcpiMcfgInfo *mcfg)
+static inline void cleanup_mcfg(AcpiMcfgInfo *mcfg)
+{
+    AcpiMcfgInfo *tmp;
+    while (mcfg) {
+        tmp = mcfg->next;
+        g_free(mcfg);
+        mcfg = tmp;
+    }
+}
+
+static AcpiMcfgInfo *acpi_get_mcfg(void)
 {
     Object *pci_host;
     QObject *o;
+    AcpiMcfgInfo *head = NULL, *tail, *mcfg;
 
     pci_host = acpi_get_i386_pci_host();
     g_assert(pci_host);
 
-    o = object_property_get_qobject(pci_host, PCIE_HOST_MCFG_BASE, NULL);
-    if (!o) {
-        return false;
+    while (pci_host) {
+        mcfg = g_new0(AcpiMcfgInfo, 1);
+        mcfg->next = NULL;
+        if (!head) {
+            tail = head = mcfg;
+        } else {
+            tail->next = mcfg;
+            tail = mcfg;
+        }
+
+        o = object_property_get_qobject(pci_host, PCIE_HOST_MCFG_BASE, NULL);
+        if (!o) {
+            cleanup_mcfg(head);
+            g_free(mcfg);
+            return NULL;
+        }
+        mcfg->mcfg_base = qnum_get_uint(qobject_to(QNum, o));
+        qobject_unref(o);
+
+        o = object_property_get_qobject(pci_host, PCIE_HOST_MCFG_SIZE, NULL);
+        assert(o);
+        mcfg->mcfg_size = qnum_get_uint(qobject_to(QNum, o));
+        qobject_unref(o);
+
+        pci_host = OBJECT(QTAILQ_NEXT(PCI_HOST_BRIDGE(pci_host), next));
     }
-    mcfg->mcfg_base = qnum_get_uint(qobject_to(QNum, o));
-    qobject_unref(o);
-
-    o = object_property_get_qobject(pci_host, PCIE_HOST_MCFG_SIZE, NULL);
-    assert(o);
-    mcfg->mcfg_size = qnum_get_uint(qobject_to(QNum, o));
-    qobject_unref(o);
-    return true;
+    return head;
 }
 
 static
@@ -2633,7 +2672,7 @@ void acpi_build(AcpiBuildTables *tables, MachineState *machine)
     unsigned facs, dsdt, rsdt, fadt;
     AcpiPmInfo pm;
     AcpiMiscInfo misc;
-    AcpiMcfgInfo mcfg;
+    AcpiMcfgInfo *mcfg;
     Range pci_hole, pci_hole64;
     uint8_t *u;
     size_t aml_len = 0;
@@ -2714,10 +2753,12 @@ void acpi_build(AcpiBuildTables *tables, MachineState *machine)
             build_slit(tables_blob, tables->linker);
         }
     }
-    if (acpi_get_mcfg(&mcfg)) {
+    if ((mcfg = acpi_get_mcfg()) != NULL) {
         acpi_add_table(table_offsets, tables_blob);
-        build_mcfg_q35(tables_blob, tables->linker, &mcfg);
+        build_mcfg_q35(tables_blob, tables->linker, mcfg);
     }
+    cleanup_mcfg(mcfg);
+
     if (x86_iommu_get_default()) {
         IommuType IOMMUType = x86_iommu_get_type();
         if (IOMMUType == TYPE_AMD) {
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 46+ messages in thread

* Re: [Qemu-devel] [RFC 1/3] pci_expander_bridge: reserve enough mcfg space for pxb host
  2018-05-20  7:28 ` [Qemu-devel] [RFC 1/3] pci_expander_bridge: reserve enough mcfg space for pxb host Zihan Yang
@ 2018-05-21 11:03   ` Marcel Apfelbaum
  2018-05-22  5:59     ` Zihan Yang
  0 siblings, 1 reply; 46+ messages in thread
From: Marcel Apfelbaum @ 2018-05-21 11:03 UTC (permalink / raw)
  To: Zihan Yang, qemu-devel
  Cc: Paolo Bonzini, Richard Henderson, Eduardo Habkost, Michael S. Tsirkin

Hi Zihan,
On 05/20/2018 10:28 AM, Zihan Yang wrote:


Thank you for posting the patches!
For the next version please add a cover letter so we can discuss cross-patch
ideas.

> To put each pxb into a separate PCI domain, we need to reserve enough MCFG space
> for each pxb host in main memory.

Right

>   First try to put them under 4G, before
> pci_hole_start; if there are too many hosts, try to put them into main memory
> above 4G, before pci_hole64_start. We should check that there is enough memory
> to reserve for them.

I don't think we should try to place the MMCFGs before 4G even if there
is enough room. It is better to always place them after 4G.

>
> Signed-off-by: Zihan Yang <whois.zihan.yang@gmail.com>
> ---
>   hw/i386/pc.c                                |  5 ++
>   hw/pci-bridge/pci_expander_bridge.c         | 96 ++++++++++++++++++++++++++++-
>   include/hw/pci-bridge/pci_expander_bridge.h |  7 +++
>   3 files changed, 107 insertions(+), 1 deletion(-)
>   create mode 100644 include/hw/pci-bridge/pci_expander_bridge.h
>
> diff --git a/hw/i386/pc.c b/hw/i386/pc.c
> index d768930..98097fd 100644
> --- a/hw/i386/pc.c
> +++ b/hw/i386/pc.c
> @@ -34,6 +34,7 @@
>   #include "hw/ide.h"
>   #include "hw/pci/pci.h"
>   #include "hw/pci/pci_bus.h"
> +#include "hw/pci-bridge/pci_expander_bridge.h"
>   #include "hw/nvram/fw_cfg.h"
>   #include "hw/timer/hpet.h"
>   #include "hw/smbios/smbios.h"
> @@ -1466,6 +1467,7 @@ uint64_t pc_pci_hole64_start(void)
>       PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms);
>       MachineState *ms = MACHINE(pcms);
>       uint64_t hole64_start = 0;
> +    int pxb_hosts;
>   
>       if (pcmc->has_reserved_memory && ms->device_memory->base) {
>           hole64_start = ms->device_memory->base;
> @@ -1473,6 +1475,9 @@ uint64_t pc_pci_hole64_start(void)
>               hole64_start += memory_region_size(&ms->device_memory->mr);
>           }
>       } else {
> +        /* make sure enough space is left for pxb host, otherwise fail */
> +        pxb_hosts = pxb_get_expander_hosts();
> +        g_assert (pxb_hosts <= 0 || pcms->above_4g_mem_size >= (pxb_hosts << 28ULL));

"above_4g_mem" PCI hole it is reserved for PCI devices hotplug. We 
cannot use if for
MMCFGs. What I think we can do is to "move" the 64 bit PCI hole after 
the MMCFGs.
So the layout of over 4G space will be:

[RAM hotplug][MMCFGs][PCI hotplug]...
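
A rough sketch of the arithmetic this layout implies (the helper name and the
fixed 256 MiB-per-host window are assumptions for illustration only, not
existing QEMU code):

    #include <stdint.h>
    #include <stdio.h>

    #define MiB (1024ULL * 1024)

    /* With [RAM hotplug][MMCFGs][PCI hotplug] above 4G, the 64-bit PCI hole
     * starts right after the last MMCFG window. Assume each pxb host gets a
     * 256 MiB ECAM window, i.e. a full 256-bus segment. */
    uint64_t sketch_pci_hole64_start(uint64_t ram_above_4g_end,
                                     unsigned n_pxb_hosts)
    {
        uint64_t mmcfg_total = (uint64_t)n_pxb_hosts * 256 * MiB;
        return ram_above_4g_end + mmcfg_total;
    }

    int main(void)
    {
        /* Example: 4 GiB of RAM above the 4G boundary and two pxb hosts */
        uint64_t start = sketch_pci_hole64_start(0x100000000ULL + 4 * 1024 * MiB, 2);
        printf("pci_hole64 starts at 0x%llx\n", (unsigned long long)start);
        return 0;
    }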

>           hole64_start = 0x100000000ULL + pcms->above_4g_mem_size;
>       }
>   
> diff --git a/hw/pci-bridge/pci_expander_bridge.c b/hw/pci-bridge/pci_expander_bridge.c
> index e62de42..8409c87 100644
> --- a/hw/pci-bridge/pci_expander_bridge.c
> +++ b/hw/pci-bridge/pci_expander_bridge.c
> @@ -12,10 +12,13 @@
>   
>   #include "qemu/osdep.h"
>   #include "qapi/error.h"
> +#include "qapi/visitor.h"
>   #include "hw/pci/pci.h"
>   #include "hw/pci/pci_bus.h"
>   #include "hw/pci/pci_host.h"
>   #include "hw/pci/pci_bridge.h"
> +#include "hw/pci-host/q35.h"
> +#include "hw/pci-bridge/pci_expander_bridge.h"
>   #include "qemu/range.h"
>   #include "qemu/error-report.h"
>   #include "sysemu/numa.h"
> @@ -57,7 +60,13 @@ static PXBDev *convert_to_pxb(PCIDevice *dev)
>   
>   static GList *pxb_dev_list;
>   
> +typedef struct PXBPCIHost {
> +    PCIExpressHost parent_obj;
> +} PXBPCIHost;
> +
>   #define TYPE_PXB_HOST "pxb-host"
> +#define PXB_HOST_DEVICE(obj) \
> +     OBJECT_CHECK(PXBPCIHost, (obj), TYPE_PXB_HOST)
>   
>   static int pxb_bus_num(PCIBus *bus)
>   {
> @@ -111,6 +120,14 @@ static const char *pxb_host_root_bus_path(PCIHostState *host_bridge,
>       return bus->bus_path;
>   }
>   
> +static void pxb_host_get_mmcfg_size(Object *obj, Visitor *v, const char *name,
> +                                    void *opaque, Error **errp)
> +{
> +    PCIExpressHost *e = PCIE_HOST_BRIDGE(obj);
> +
> +    visit_type_uint64(v, name, &e->size, errp);
> +}
> +
>   static char *pxb_host_ofw_unit_address(const SysBusDevice *dev)
>   {
>       const PCIHostState *pxb_host;
> @@ -142,6 +159,80 @@ static char *pxb_host_ofw_unit_address(const SysBusDevice *dev)
>       return NULL;
>   }
>   
> +static Object *pxb_get_i386_pci_host(void)
> +{
> +    PCIHostState *host;
> +
> +    host = OBJECT_CHECK(PCIHostState,
> +                        object_resolve_path("/machine/i440fx", NULL),
> +                        TYPE_PCI_HOST_BRIDGE);
> +    if (!host) {
> +        host = OBJECT_CHECK(PCIHostState,
> +                            object_resolve_path("/machine/q35", NULL),
> +                            TYPE_PCI_HOST_BRIDGE);
> +    }
> +
> +    return OBJECT(host);
> +}
> +

The above function also appears in hw/i386/acpi-build.c; you may want to
re-use it.

> +int pxhb_cnt = 0;
> +
> +/* -1 means to exclude q35 host */
> +#define MCFG_IN_PCI_HOLE (((IO_APIC_DEFAULT_ADDRESS - MCH_HOST_BRIDGE_PCIEXBAR_DEFAULT) >> 28) - 1)
> +
I don't understand the above. If you need  a single MMCFG size you can 
use MCH_HOST_BRIDGE_PCIEXBAR_MAX.

> +int pxb_get_expander_hosts(void)
> +{
> +    return pxhb_cnt - MCFG_IN_PCI_HOLE;
> +}

Do you need the number of existing expander hosts? We have pxb_dev_list;
just query it.

> +
> +/* Dirty workaround */
> +static void modify_q35_pci_hole(void)
> +{
> +    Object *pci_host;
> +    Q35PCIHost *s;
> +
> +    pci_host = pxb_get_i386_pci_host();
> +    g_assert(pci_host);
> +    s = Q35_HOST_DEVICE(pci_host);
> +
> +    ++pxhb_cnt;
> +    if (pxhb_cnt <= MCFG_IN_PCI_HOLE) {
> +        range_set_bounds(&s->mch.pci_hole,
> +            MCH_HOST_BRIDGE_PCIEXBAR_DEFAULT + ((1 + pxhb_cnt) << 28),
> +            IO_APIC_DEFAULT_ADDRESS - 1);
> +    }
> +
> +    // leave pci hole64 to acpi build part
> +}
> +
> +static void pxb_host_initfn(Object *obj)
> +{
> +    PCIHostState *phb = PCI_HOST_BRIDGE(obj);
> +
> +    memory_region_init_io(&phb->conf_mem, obj, &pci_host_conf_le_ops, phb,
> +                          "pci-conf-idx", 4);
> +    memory_region_init_io(&phb->data_mem, obj, &pci_host_data_le_ops, phb,
> +                          "pci-conf-data", 4);
> +
> +    object_property_add(obj, PCIE_HOST_MCFG_SIZE, "uint64",
> +                         pxb_host_get_mmcfg_size,
> +                         NULL, NULL, NULL, NULL);
> +
> +    /* Leave enough space for the biggest MCFG BAR */
> +    /* TODO. Since pxb host is just an expander bridge without an mch,
> +     * we modify the range in q35 host. It should be workable as it is
> +     * before acpi build, although it is dirty
> +     */
> +    modify_q35_pci_hole();

The above will need to change. We move the PCI hole, not resize it.
I am not sure this is the right place to handle it; maybe we add a new
property right beside the pci_hole ones (extra-mmcfg?) and default it to 0.

> +}
> +
> +/* default value does not matter as guest firmware will overwrite it */
> +static Property pxb_host_props[] = {
> +    DEFINE_PROP_UINT64(PCIE_HOST_MCFG_BASE, PXBPCIHost, parent_obj.base_addr,
> +                        MCH_HOST_BRIDGE_PCIEXBAR_DEFAULT),

You cannot use the MCH_HOST_BRIDGE_PCIEXBAR_DEFAULT as it is in use
by the "main" root complex MMCFG. Actually I don't think we can come up
with a valid default.


> +    DEFINE_PROP_END_OF_LIST(),
> +};
> +
>   static void pxb_host_class_init(ObjectClass *class, void *data)
>   {
>       DeviceClass *dc = DEVICE_CLASS(class);
> @@ -149,6 +240,7 @@ static void pxb_host_class_init(ObjectClass *class, void *data)
>       PCIHostBridgeClass *hc = PCI_HOST_BRIDGE_CLASS(class);
>   
>       dc->fw_name = "pci";
> +    dc->props = pxb_host_props;
>       /* Reason: Internal part of the pxb/pxb-pcie device, not usable by itself */
>       dc->user_creatable = false;
>       sbc->explicit_ofw_unit_address = pxb_host_ofw_unit_address;
> @@ -157,7 +249,9 @@ static void pxb_host_class_init(ObjectClass *class, void *data)
>   
>   static const TypeInfo pxb_host_info = {
>       .name          = TYPE_PXB_HOST,
> -    .parent        = TYPE_PCI_HOST_BRIDGE,
> +    .parent        = TYPE_PCIE_HOST_BRIDGE,

Be aware this is used by both pxb and pxb-pcie devices. I think you should
split the type for each one and leave the pxb one as before.

> +    .instance_size = sizeof(PXBPCIHost),
> +    .instance_init = pxb_host_initfn,
>       .class_init    = pxb_host_class_init,
>   };
>   
> diff --git a/include/hw/pci-bridge/pci_expander_bridge.h b/include/hw/pci-bridge/pci_expander_bridge.h
> new file mode 100644
> index 0000000..d48ddb1
> --- /dev/null
> +++ b/include/hw/pci-bridge/pci_expander_bridge.h
> @@ -0,0 +1,7 @@
> +#ifndef HW_PCI_EXPANDER_H
> +#define HW_PCI_EXPANDER_H
> +
> +/* return the number of pxb hosts that resides in the main memory above 4G */
> +int pxb_get_expander_hosts(void);
> +
> +#endif

Thanks,
Marcel

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [Qemu-devel] [RFC 2/3] pci: Link pci_host_bridges with QTAILQ
  2018-05-20  7:28 ` [Qemu-devel] [RFC 2/3] pci: Link pci_host_bridges with QTAILQ Zihan Yang
@ 2018-05-21 11:05   ` Marcel Apfelbaum
  2018-05-22  5:59     ` Zihan Yang
  0 siblings, 1 reply; 46+ messages in thread
From: Marcel Apfelbaum @ 2018-05-21 11:05 UTC (permalink / raw)
  To: Zihan Yang, qemu-devel; +Cc: Michael S. Tsirkin



On 05/20/2018 10:28 AM, Zihan Yang wrote:
> QLIST places the original q35 host bridge at the end of the list because it is
> added first. Replace it with QTAILQ so that q35 stays at the head of the queue,
> which keeps things convenient and compatible when there are pxb hosts in
> addition to the q35 host.

I have no objection here, we'll see later how the modification helps.

Thanks,
Marcel

> Signed-off-by: Zihan Yang <whois.zihan.yang@gmail.com>
> ---
>   hw/pci/pci.c              | 9 +++++----
>   include/hw/pci/pci_host.h | 2 +-
>   2 files changed, 6 insertions(+), 5 deletions(-)
>
> diff --git a/hw/pci/pci.c b/hw/pci/pci.c
> index 80bc459..ddc27ba 100644
> --- a/hw/pci/pci.c
> +++ b/hw/pci/pci.c
> @@ -196,7 +196,8 @@ static void pci_del_option_rom(PCIDevice *pdev);
>   static uint16_t pci_default_sub_vendor_id = PCI_SUBVENDOR_ID_REDHAT_QUMRANET;
>   static uint16_t pci_default_sub_device_id = PCI_SUBDEVICE_ID_QEMU;
>   
> -static QLIST_HEAD(, PCIHostState) pci_host_bridges;
> +static QTAILQ_HEAD(, PCIHostState) pci_host_bridges =
> +    QTAILQ_HEAD_INITIALIZER(pci_host_bridges);
>   
>   int pci_bar(PCIDevice *d, int reg)
>   {
> @@ -330,7 +331,7 @@ static void pci_host_bus_register(DeviceState *host)
>   {
>       PCIHostState *host_bridge = PCI_HOST_BRIDGE(host);
>   
> -    QLIST_INSERT_HEAD(&pci_host_bridges, host_bridge, next);
> +    QTAILQ_INSERT_TAIL(&pci_host_bridges, host_bridge, next);
>   }
>   
>   PCIBus *pci_device_root_bus(const PCIDevice *d)
> @@ -1798,7 +1799,7 @@ PciInfoList *qmp_query_pci(Error **errp)
>       PciInfoList *info, *head = NULL, *cur_item = NULL;
>       PCIHostState *host_bridge;
>   
> -    QLIST_FOREACH(host_bridge, &pci_host_bridges, next) {
> +    QTAILQ_FOREACH(host_bridge, &pci_host_bridges, next) {
>           info = g_malloc0(sizeof(*info));
>           info->value = qmp_query_pci_bus(host_bridge->bus,
>                                           pci_bus_num(host_bridge->bus));
> @@ -2493,7 +2494,7 @@ int pci_qdev_find_device(const char *id, PCIDevice **pdev)
>       PCIHostState *host_bridge;
>       int rc = -ENODEV;
>   
> -    QLIST_FOREACH(host_bridge, &pci_host_bridges, next) {
> +    QTAILQ_FOREACH(host_bridge, &pci_host_bridges, next) {
>           int tmp = pci_qdev_find_recursive(host_bridge->bus, id, pdev);
>           if (!tmp) {
>               rc = 0;
> diff --git a/include/hw/pci/pci_host.h b/include/hw/pci/pci_host.h
> index ba31595..a5617cf 100644
> --- a/include/hw/pci/pci_host.h
> +++ b/include/hw/pci/pci_host.h
> @@ -47,7 +47,7 @@ struct PCIHostState {
>       uint32_t config_reg;
>       PCIBus *bus;
>   
> -    QLIST_ENTRY(PCIHostState) next;
> +    QTAILQ_ENTRY(PCIHostState) next;
>   };
>   
>   typedef struct PCIHostBridgeClass {

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [Qemu-devel] [RFC 3/3] acpi-build: allocate mcfg for multiple host bridges
  2018-05-20  7:28 ` [Qemu-devel] [RFC 3/3] acpi-build: allocate mcfg for multiple host bridges Zihan Yang
@ 2018-05-21 11:53   ` Marcel Apfelbaum
  2018-05-22  6:03     ` Zihan Yang
  2018-05-22  9:52     ` Laszlo Ersek
  0 siblings, 2 replies; 46+ messages in thread
From: Marcel Apfelbaum @ 2018-05-21 11:53 UTC (permalink / raw)
  To: Zihan Yang, qemu-devel
  Cc: Michael S. Tsirkin, Igor Mammedov, Paolo Bonzini,
	Richard Henderson, Eduardo Habkost, Laszlo Ersek



On 05/20/2018 10:28 AM, Zihan Yang wrote:
> Currently only the q35 host bridge is allocated space in the MCFG table. To put
> pxb hosts into separate PCI domains, each of them should have its own
> configuration space entry in the MCFG table.
>
> Signed-off-by: Zihan Yang <whois.zihan.yang@gmail.com>
> ---
>   hw/i386/acpi-build.c | 83 +++++++++++++++++++++++++++++++++++++++-------------
>   1 file changed, 62 insertions(+), 21 deletions(-)
>
> diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
> index 9bc6d97..808d815 100644
> --- a/hw/i386/acpi-build.c
> +++ b/hw/i386/acpi-build.c
> @@ -89,6 +89,7 @@
>   typedef struct AcpiMcfgInfo {
>       uint64_t mcfg_base;
>       uint32_t mcfg_size;
> +    struct AcpiMcfgInfo *next;
>   } AcpiMcfgInfo;
>   
>   typedef struct AcpiPmInfo {
> @@ -2427,14 +2428,15 @@ build_mcfg_q35(GArray *table_data, BIOSLinker *linker, AcpiMcfgInfo *info)
>   {
>       AcpiTableMcfg *mcfg;
>       const char *sig;
> -    int len = sizeof(*mcfg) + 1 * sizeof(mcfg->allocation[0]);
> +    int len, count = 0;
> +    AcpiMcfgInfo *cfg = info;
>   
> +    while (cfg) {
> +        ++count;
> +        cfg = cfg->next;
> +    }
> +    len = sizeof(*mcfg) + count * sizeof(mcfg->allocation[0]);
>       mcfg = acpi_data_push(table_data, len);
> -    mcfg->allocation[0].address = cpu_to_le64(info->mcfg_base);
> -    /* Only a single allocation so no need to play with segments */
> -    mcfg->allocation[0].pci_segment = cpu_to_le16(0);
> -    mcfg->allocation[0].start_bus_number = 0;
> -    mcfg->allocation[0].end_bus_number = PCIE_MMCFG_BUS(info->mcfg_size - 1);
>   
>       /* MCFG is used for ECAM which can be enabled or disabled by guest.
>        * To avoid table size changes (which create migration issues),
> @@ -2448,6 +2450,17 @@ build_mcfg_q35(GArray *table_data, BIOSLinker *linker, AcpiMcfgInfo *info)
>       } else {
>           sig = "MCFG";
>       }
> +
> +    count = 0;
> +    while (info) {
> +        mcfg[count].allocation[0].address = cpu_to_le64(info->mcfg_base);
> +        /* Only a single allocation so no need to play with segments */
> +        mcfg[count].allocation[0].pci_segment = cpu_to_le16(count);
> +        mcfg[count].allocation[0].start_bus_number = 0;
> +        mcfg[count++].allocation[0].end_bus_number = PCIE_MMCFG_BUS(info->mcfg_size - 1);

An interesting point is whether we want to limit the MMCFG size for PXBs, as
we may not be interested in using all the buses in a specific domain. For each
bus we require some address space that remains unused.

> +        info = info->next;
> +    }
> +
>       build_header(linker, table_data, (void *)mcfg, sig, len, 1, NULL, NULL);
>   }
>   
> @@ -2602,26 +2615,52 @@ struct AcpiBuildState {
>       MemoryRegion *linker_mr;
>   } AcpiBuildState;
>   
> -static bool acpi_get_mcfg(AcpiMcfgInfo *mcfg)
> +static inline void cleanup_mcfg(AcpiMcfgInfo *mcfg)
> +{
> +    AcpiMcfgInfo *tmp;
> +    while (mcfg) {
> +        tmp = mcfg->next;
> +        g_free(mcfg);
> +        mcfg = tmp;
> +    }
> +}
> +
> +static AcpiMcfgInfo *acpi_get_mcfg(void)
>   {
>       Object *pci_host;
>       QObject *o;
> +    AcpiMcfgInfo *head = NULL, *tail, *mcfg;
>   
>       pci_host = acpi_get_i386_pci_host();
>       g_assert(pci_host);
>   
> -    o = object_property_get_qobject(pci_host, PCIE_HOST_MCFG_BASE, NULL);
> -    if (!o) {
> -        return false;
> +    while (pci_host) {
> +        mcfg = g_new0(AcpiMcfgInfo, 1);
> +        mcfg->next = NULL;
> +        if (!head) {
> +            tail = head = mcfg;
> +        } else {
> +            tail->next = mcfg;
> +            tail = mcfg;
> +        }
> +
> +        o = object_property_get_qobject(pci_host, PCIE_HOST_MCFG_BASE, NULL);
> +        if (!o) {
> +            cleanup_mcfg(head);
> +            g_free(mcfg);
> +            return NULL;
> +        }
> +        mcfg->mcfg_base = qnum_get_uint(qobject_to(QNum, o));
> +        qobject_unref(o);
> +
> +        o = object_property_get_qobject(pci_host, PCIE_HOST_MCFG_SIZE, NULL);

I'll let Igor comment on the ACPI bits, but the idea of querying each PCI host
for the MMCFG details and putting them into a different table sounds good to me.

[Adding Laszlo for his insights]
Looking forward to see the SeaBIOS patches :)

Thanks!
Marcel

> +        assert(o);
> +        mcfg->mcfg_size = qnum_get_uint(qobject_to(QNum, o));
> +        qobject_unref(o);
> +
> +        pci_host = OBJECT(QTAILQ_NEXT(PCI_HOST_BRIDGE(pci_host), next));
>       }
> -    mcfg->mcfg_base = qnum_get_uint(qobject_to(QNum, o));
> -    qobject_unref(o);
> -
> -    o = object_property_get_qobject(pci_host, PCIE_HOST_MCFG_SIZE, NULL);
> -    assert(o);
> -    mcfg->mcfg_size = qnum_get_uint(qobject_to(QNum, o));
> -    qobject_unref(o);
> -    return true;
> +    return head;
>   }
>   
>   static
> @@ -2633,7 +2672,7 @@ void acpi_build(AcpiBuildTables *tables, MachineState *machine)
>       unsigned facs, dsdt, rsdt, fadt;
>       AcpiPmInfo pm;
>       AcpiMiscInfo misc;
> -    AcpiMcfgInfo mcfg;
> +    AcpiMcfgInfo *mcfg;
>       Range pci_hole, pci_hole64;
>       uint8_t *u;
>       size_t aml_len = 0;
> @@ -2714,10 +2753,12 @@ void acpi_build(AcpiBuildTables *tables, MachineState *machine)
>               build_slit(tables_blob, tables->linker);
>           }
>       }
> -    if (acpi_get_mcfg(&mcfg)) {
> +    if ((mcfg = acpi_get_mcfg()) != NULL) {
>           acpi_add_table(table_offsets, tables_blob);
> -        build_mcfg_q35(tables_blob, tables->linker, &mcfg);
> +        build_mcfg_q35(tables_blob, tables->linker, mcfg);
>       }
> +    cleanup_mcfg(mcfg);
> +
>       if (x86_iommu_get_default()) {
>           IommuType IOMMUType = x86_iommu_get_type();
>           if (IOMMUType == TYPE_AMD) {

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [Qemu-devel] [RFC 0/3] pci_expander_bridge: Put pxb host bridge into separate pci domain
  2018-05-20  7:28 [Qemu-devel] [RFC 0/3] pci_expander_bridge: Put pxb host bridge into separate pci domain Zihan Yang
                   ` (2 preceding siblings ...)
  2018-05-20  7:28 ` [Qemu-devel] [RFC 3/3] acpi-build: allocate mcfg for multiple host bridges Zihan Yang
@ 2018-05-21 15:23 ` Marcel Apfelbaum
  2018-05-22  6:04   ` Zihan Yang
  3 siblings, 1 reply; 46+ messages in thread
From: Marcel Apfelbaum @ 2018-05-21 15:23 UTC (permalink / raw)
  To: Zihan Yang, qemu-devel

Hi Zihan

On 05/20/2018 10:28 AM, Zihan Yang wrote:

You do have a patch 0, sorry for not seeing it; please disregard my previous
comment.

> Currently only the q35 host bridge is allocated an item in the MCFG table; all pxb
> host bridges stay within PCI domain 0. This series of patches puts each pxb
> host bridge in a separate PCI domain, allocating a new MCFG table item for it.

Maybe we should add a parameter to the pxb-pci device specifying the PCI
domain instead of adding it on the fly. It will help ensure the PCI domain
will not change over time due to code changes. In this case "bus sharing"
should be off. For the moment the pxb devices get the bus range from the
same PCI domain.
> The pxb host does not have an MCH in it, as the q35 host already has one. All
> pxb host bridges just serve as expander bridges in QEMU, which makes them
> a little simpler than the q35 host bridge.
Agreed

> NOTE: These patches are just an initial proof of concept, asking for suggestions.
>        The AML part and SeaBIOS part are not finished at all. This means the
>        series still behaves as before: all PCI expander bridges stay in the
>        same PCI domain as the q35 host.

Looks promising!

Thanks,
Marcel

> Zihan Yang (3):
>    pci_expander_bridge: reserve enough mcfg space for pxb host
>    pci: Link pci_host_bridges with QTAILQ
>    acpi-build: allocate mcfg for multiple host bridges
>
>   hw/i386/acpi-build.c                        | 83 ++++++++++++++++++-------
>   hw/i386/pc.c                                |  5 ++
>   hw/pci-bridge/pci_expander_bridge.c         | 96 ++++++++++++++++++++++++++++-
>   hw/pci/pci.c                                |  9 +--
>   include/hw/pci-bridge/pci_expander_bridge.h |  7 +++
>   include/hw/pci/pci_host.h                   |  2 +-
>   6 files changed, 175 insertions(+), 27 deletions(-)
>   create mode 100644 include/hw/pci-bridge/pci_expander_bridge.h
>

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [Qemu-devel] [RFC 1/3] pci_expander_bridge: reserve enough mcfg space for pxb host
  2018-05-21 11:03   ` Marcel Apfelbaum
@ 2018-05-22  5:59     ` Zihan Yang
  2018-05-22 18:47       ` Marcel Apfelbaum
  0 siblings, 1 reply; 46+ messages in thread
From: Zihan Yang @ 2018-05-22  5:59 UTC (permalink / raw)
  To: qemu-devel, Marcel Apfelbaum
  Cc: Paolo Bonzini, Richard Henderson, Eduardo Habkost, Michael S. Tsirkin

Hi Marcel,

Thanks a lot for your feedback.

> I don't think we should try to place the MMCFGs before 4G even if there
> is enough room. It is better to always place them after 4G.
>
> "above_4g_mem" PCI hole it is reserved for PCI devices hotplug. We cannot
use if for
> MMCFGs. What I think we can do is to "move" the 64 bit PCI hole after the
MMCFGs.
> So the layout of over 4G space will be:
>
> [RAM hotplug][MMCFGs][PCI hotplug]...

OK, I will reorganize the memory layout. Should the number of MMCFGs be
limited, as there might be insufficient memory above 4G?

> Do you need the number of existing expander hosts? We have pxb_dev_list;
> just query it.

Great, I think I missed that list.

> The above will need to change. We move the PCI hole, not resize it.
> I am not sure this is the right place to handle it; maybe we add a new
> property right beside the pci_hole ones (extra-mmcfg?) and default it to 0.

That sounds good; then we just need to check this range when setting up
the MCFG table instead of traversing the host bridge list.

> You cannot use the MCH_HOST_BRIDGE_PCIEXBAR_DEFAULT as it is in use
> by the "main" root complex MMCFG. Actually I don't think we can come up
> with a valid default.

I see, I'll replace it with unmapped then.

> Be aware this is used by both pxb and pxb-pcie devices. I think you should
> split the type for each one and leave the pxb one as before.

OK, I had thought it would make the code simpler, as TYPE_PCIE_HOST_BRIDGE
is also a child of TYPE_PCI_HOST_BRIDGE, but I did forget about the pxb
devices. I'll split it in v2.

Thanks
Zihan

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [Qemu-devel] [RFC 2/3] pci: Link pci_host_bridges with QTAILQ
  2018-05-21 11:05   ` Marcel Apfelbaum
@ 2018-05-22  5:59     ` Zihan Yang
  2018-05-22 18:39       ` Marcel Apfelbaum
  0 siblings, 1 reply; 46+ messages in thread
From: Zihan Yang @ 2018-05-22  5:59 UTC (permalink / raw)
  To: qemu-devel, Marcel Apfelbaum; +Cc: Michael S. Tsirkin

> I have no objection here, we'll see later how the modification helps.

The purpose is to place the q35 host at the start of the queue. In the original
QLIST, when a new pxb host is added, the q35 host is bumped to the end of the
list.

By replacing it with QTAILQ, we can always get the q35 host bridge first, so
that we can make sure the q35 host stays in PCI domain 0. If there is no pxb
host, the code does not need to change because the 'next' field will be NULL.
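
A minimal standalone sketch of the ordering this buys us, written with the
analogous sys/queue.h TAILQ macros rather than QEMU's qemu/queue.h wrappers
(the host names are made up for illustration):

    #include <stdio.h>
    #include <sys/queue.h>

    struct host {
        const char *name;
        TAILQ_ENTRY(host) next;
    };

    int main(void)
    {
        TAILQ_HEAD(, host) hosts = TAILQ_HEAD_INITIALIZER(hosts);
        struct host q35 = { .name = "q35" }, pxb = { .name = "pxb-0" };
        struct host *h;

        /* q35 is registered first; inserting at the tail keeps it first,
         * whereas inserting at the head (as QLIST did) would push it
         * behind pxb-0. */
        TAILQ_INSERT_TAIL(&hosts, &q35, next);
        TAILQ_INSERT_TAIL(&hosts, &pxb, next);

        TAILQ_FOREACH(h, &hosts, next) {
            printf("%s\n", h->name);   /* prints q35, then pxb-0 */
        }
        return 0;
    }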

I'm not sure whether it is necessary to keep the q35 host in PCI domain 0,
but I think I'd better preserve the original behavior when I don't know the
potential impact.

Thanks
Zihan

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [Qemu-devel] [RFC 3/3] acpi-build: allocate mcfg for multiple host bridges
  2018-05-21 11:53   ` Marcel Apfelbaum
@ 2018-05-22  6:03     ` Zihan Yang
  2018-05-22 18:43       ` Marcel Apfelbaum
  2018-05-22  9:52     ` Laszlo Ersek
  1 sibling, 1 reply; 46+ messages in thread
From: Zihan Yang @ 2018-05-22  6:03 UTC (permalink / raw)
  To: qemu-devel, Marcel Apfelbaum
  Cc: Michael S. Tsirkin, Igor Mammedov, Paolo Bonzini,
	Richard Henderson, Eduardo Habkost, lersek

> An interesting point is whether we want to limit the MMCFG size for PXBs, as
> we may not be interested in using all the buses in a specific domain.

OK, perhaps provide an option for the user to specify the desired number
of buses?

> For each bus we require some address space that remains unused.

Does this mean we should reserve some slots in a bus, or reduce
the number of buses used in each domain?

> [Adding Laszlo for his insights]
> Looking forward to see the SeaBIOS patches :)

I appreciate your reviews; I'll write the rest as soon as possible.

Thanks

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [Qemu-devel] [RFC 0/3] pci_expander_bridge: Put pxb host bridge into separate pci domain
  2018-05-21 15:23 ` [Qemu-devel] [RFC 0/3] pci_expander_bridge: Put pxb host bridge into separate pci domain Marcel Apfelbaum
@ 2018-05-22  6:04   ` Zihan Yang
  0 siblings, 0 replies; 46+ messages in thread
From: Zihan Yang @ 2018-05-22  6:04 UTC (permalink / raw)
  To: qemu-devel, Marcel Apfelbaum

> Maybe we should add a parameter to the pxb-pci device specifying the PCI
> domain instead of adding it on the fly. It will help ensure the PCI domain
> will not change over time due to code changes. In this case "bus sharing"
> should be off. For the moment the pxb devices get the bus range from the
> same PCI domain.

That's good advice. I'll do that in v2.

Thanks,
Zihan

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [Qemu-devel] [RFC 3/3] acpi-build: allocate mcfg for multiple host bridges
  2018-05-21 11:53   ` Marcel Apfelbaum
  2018-05-22  6:03     ` Zihan Yang
@ 2018-05-22  9:52     ` Laszlo Ersek
  2018-05-22 19:01       ` Marcel Apfelbaum
  1 sibling, 1 reply; 46+ messages in thread
From: Laszlo Ersek @ 2018-05-22  9:52 UTC (permalink / raw)
  To: Marcel Apfelbaum, Zihan Yang, qemu-devel
  Cc: Michael S. Tsirkin, Igor Mammedov, Paolo Bonzini,
	Richard Henderson, Eduardo Habkost

On 05/21/18 13:53, Marcel Apfelbaum wrote:
> 
> 
> On 05/20/2018 10:28 AM, Zihan Yang wrote:
>> Currently only the q35 host bridge is allocated space in the MCFG table. To
>> put pxb hosts into separate PCI domains, each of them should have its own
>> configuration space entry in the MCFG table.
>>
>> Signed-off-by: Zihan Yang <whois.zihan.yang@gmail.com>
>> ---
>>   hw/i386/acpi-build.c | 83
>> +++++++++++++++++++++++++++++++++++++++-------------
>>   1 file changed, 62 insertions(+), 21 deletions(-)
>>
>> diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
>> index 9bc6d97..808d815 100644
>> --- a/hw/i386/acpi-build.c
>> +++ b/hw/i386/acpi-build.c
>> @@ -89,6 +89,7 @@
>>   typedef struct AcpiMcfgInfo {
>>       uint64_t mcfg_base;
>>       uint32_t mcfg_size;
>> +    struct AcpiMcfgInfo *next;
>>   } AcpiMcfgInfo;
>>     typedef struct AcpiPmInfo {
>> @@ -2427,14 +2428,15 @@ build_mcfg_q35(GArray *table_data, BIOSLinker
>> *linker, AcpiMcfgInfo *info)
>>   {
>>       AcpiTableMcfg *mcfg;
>>       const char *sig;
>> -    int len = sizeof(*mcfg) + 1 * sizeof(mcfg->allocation[0]);
>> +    int len, count = 0;
>> +    AcpiMcfgInfo *cfg = info;
>>   +    while (cfg) {
>> +        ++count;
>> +        cfg = cfg->next;
>> +    }
>> +    len = sizeof(*mcfg) + count * sizeof(mcfg->allocation[0]);
>>       mcfg = acpi_data_push(table_data, len);
>> -    mcfg->allocation[0].address = cpu_to_le64(info->mcfg_base);
>> -    /* Only a single allocation so no need to play with segments */
>> -    mcfg->allocation[0].pci_segment = cpu_to_le16(0);
>> -    mcfg->allocation[0].start_bus_number = 0;
>> -    mcfg->allocation[0].end_bus_number =
>> PCIE_MMCFG_BUS(info->mcfg_size - 1);
>>         /* MCFG is used for ECAM which can be enabled or disabled by
>> guest.
>>        * To avoid table size changes (which create migration issues),
>> @@ -2448,6 +2450,17 @@ build_mcfg_q35(GArray *table_data, BIOSLinker
>> *linker, AcpiMcfgInfo *info)
>>       } else {
>>           sig = "MCFG";
>>       }
>> +
>> +    count = 0;
>> +    while (info) {
>> +        mcfg[count].allocation[0].address =
>> cpu_to_le64(info->mcfg_base);
>> +        /* Only a single allocation so no need to play with segments */
>> +        mcfg[count].allocation[0].pci_segment = cpu_to_le16(count);
>> +        mcfg[count].allocation[0].start_bus_number = 0;
>> +        mcfg[count++].allocation[0].end_bus_number =
>> PCIE_MMCFG_BUS(info->mcfg_size - 1);
> 
> An interesting point is whether we want to limit the MMCFG size for PXBs, as
> we may not be interested in using all the buses in a specific domain. For each
> bus we require some address space that remains unused.
> 
>> +        info = info->next;
>> +    }
>> +
>>       build_header(linker, table_data, (void *)mcfg, sig, len, 1,
>> NULL, NULL);
>>   }
>>   @@ -2602,26 +2615,52 @@ struct AcpiBuildState {
>>       MemoryRegion *linker_mr;
>>   } AcpiBuildState;
>>   -static bool acpi_get_mcfg(AcpiMcfgInfo *mcfg)
>> +static inline void cleanup_mcfg(AcpiMcfgInfo *mcfg)
>> +{
>> +    AcpiMcfgInfo *tmp;
>> +    while (mcfg) {
>> +        tmp = mcfg->next;
>> +        g_free(mcfg);
>> +        mcfg = tmp;
>> +    }
>> +}
>> +
>> +static AcpiMcfgInfo *acpi_get_mcfg(void)
>>   {
>>       Object *pci_host;
>>       QObject *o;
>> +    AcpiMcfgInfo *head = NULL, *tail, *mcfg;
>>         pci_host = acpi_get_i386_pci_host();
>>       g_assert(pci_host);
>>   -    o = object_property_get_qobject(pci_host, PCIE_HOST_MCFG_BASE,
>> NULL);
>> -    if (!o) {
>> -        return false;
>> +    while (pci_host) {
>> +        mcfg = g_new0(AcpiMcfgInfo, 1);
>> +        mcfg->next = NULL;
>> +        if (!head) {
>> +            tail = head = mcfg;
>> +        } else {
>> +            tail->next = mcfg;
>> +            tail = mcfg;
>> +        }
>> +
>> +        o = object_property_get_qobject(pci_host,
>> PCIE_HOST_MCFG_BASE, NULL);
>> +        if (!o) {
>> +            cleanup_mcfg(head);
>> +            g_free(mcfg);
>> +            return NULL;
>> +        }
>> +        mcfg->mcfg_base = qnum_get_uint(qobject_to(QNum, o));
>> +        qobject_unref(o);
>> +
>> +        o = object_property_get_qobject(pci_host,
>> PCIE_HOST_MCFG_SIZE, NULL);
> 
> I'll let Igor comment on the ACPI bits, but the idea of querying each PCI host
> for the MMCFG details and putting them into a different table sounds good to me.
> 
> [Adding Laszlo for his insights]

Thanks for the CC -- I don't have many (positive) insights here to
offer, I'm afraid. First, the patch set (including the blurb) doesn't
seem to explain *why* multiple host bridges / ECAM areas are a good
idea. Second, supporting such would take very intrusive patches / large
feature development for OVMF (and that's not just for OvmfPkg and
ArmVirtPkg platform code, but also core MdeModulePkg code).

Obviously I'm not saying this feature should not be implemented for QEMU
+ SeaBIOS, just don't expect edk2 support for it anytime soon.
Without a convincing use case, even the review impact seems hard to justify.

Thanks,
Laszlo

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [Qemu-devel] [RFC 2/3] pci: Link pci_host_bridges with QTAILQ
  2018-05-22  5:59     ` Zihan Yang
@ 2018-05-22 18:39       ` Marcel Apfelbaum
  0 siblings, 0 replies; 46+ messages in thread
From: Marcel Apfelbaum @ 2018-05-22 18:39 UTC (permalink / raw)
  To: Zihan Yang, qemu-devel; +Cc: Michael S. Tsirkin

Hi Zihan,

On 05/22/2018 08:59 AM, Zihan Yang wrote:
> > I have no objection here, we'll see later how the modification helps.
>
> The purpose is to place the q35 host at the start of the queue. In the
> original QLIST, when a new pxb host is added, the q35 host is bumped to the
> end of the list.
>
> By replacing it with QTAILQ, we can always get the q35 host bridge first, so
> that we can make sure the q35 host stays in PCI domain 0. If there is no pxb
> host, the code does not need to change because the 'next' field will be NULL.
>

We can use the pxb list we keep, but I don't see an issue here.

> I'm not sure whether it is necessary to keep the q35 host in PCI domain 0,
> but I think I'd better preserve the original behavior when I don't know the
> potential impact.
>

Q35 should always reside in PCI domain 0.

Thanks,
Marcel

> Thanks
> Zihan
>

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [Qemu-devel] [RFC 3/3] acpi-build: allocate mcfg for multiple host bridges
  2018-05-22  6:03     ` Zihan Yang
@ 2018-05-22 18:43       ` Marcel Apfelbaum
  0 siblings, 0 replies; 46+ messages in thread
From: Marcel Apfelbaum @ 2018-05-22 18:43 UTC (permalink / raw)
  To: Zihan Yang, qemu-devel
  Cc: Michael S. Tsirkin, Igor Mammedov, Paolo Bonzini,
	Richard Henderson, Eduardo Habkost, lersek



On 05/22/2018 09:03 AM, Zihan Yang wrote:
> > An interesting point is whether we want to limit the MMCFG size for PXBs,
> > as we may not be interested in using all the buses in a specific domain.
>
> OK, perhaps provide an option for the user to specify the desired number
> of buses?
>

Right, specifying the number of buses that can be used (e.g. max_busses)
will greatly decrease the amount of address space we have to reserve.
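
For a sense of scale, a quick back-of-the-envelope in C: ECAM needs 4 KiB of
config space per function, 8 functions per device, 32 devices per bus, so
1 MiB per bus; the bus counts below are only examples:

    #include <stdio.h>

    int main(void)
    {
        const unsigned long ecam_per_bus = 4096UL * 8 * 32;  /* 1 MiB per bus */
        unsigned long full_domain = 256 * ecam_per_bus;      /* all 256 buses */
        unsigned long limited = 16 * ecam_per_bus;           /* e.g. max_busses=16 */

        printf("full domain: %lu MiB, limited: %lu MiB\n",
               full_domain >> 20, limited >> 20);            /* 256 vs 16 */
        return 0;
    }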

> > For each bus we require some address space that remains unused.
>
> Does this mean we should reserve some slots in a bus, or reduce
> the number of buses used in each domain?
>

Limit the number of buses per PCI domain (excluding PCI domain 0, of course)

> > [Adding Laszlo for his insights]
> > Looking forward to see the SeaBIOS patches :)
>
> I appreciate your reviews; I'll write the rest as soon as possible.
>

Thanks,
Marcel

> Thanks
>

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [Qemu-devel] [RFC 1/3] pci_expander_bridge: reserve enough mcfg space for pxb host
  2018-05-22  5:59     ` Zihan Yang
@ 2018-05-22 18:47       ` Marcel Apfelbaum
  0 siblings, 0 replies; 46+ messages in thread
From: Marcel Apfelbaum @ 2018-05-22 18:47 UTC (permalink / raw)
  To: Zihan Yang, qemu-devel
  Cc: Paolo Bonzini, Richard Henderson, Eduardo Habkost, Michael S. Tsirkin



On 05/22/2018 08:59 AM, Zihan Yang wrote:
> Hi Marcel,
>
> Thanks a lot for your feedback.
>
> > I don't think we should try to place the MMCFGs before 4G even if there
> > is enough room. It is better to always place them after 4G.
> >
> > "above_4g_mem" PCI hole it is reserved for PCI devices hotplug. We 
> cannot use if for
> > MMCFGs. What I think we can do is to "move" the 64 bit PCI hole 
> after the MMCFGs.
> > So the layout of over 4G space will be:
> >
> > [RAM hotplug][MMCFGs][PCI hotplug]...
>
> OK, I will reorganize the memory layout. Should the number of MMCFG be 
> limited,
> as there might be insufficient memory above 4G?
>

It is not about the memory, it is about the address space that can be reached
by the CPU (CPU addressable bits).
The total address space used by all MMCFGs should be CPU addressable.
But it is too early at this stage to discuss that, just keep it in mind.
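
A trivial sketch of that constraint (the helper and its parameters are
illustrative, not QEMU code):

    #include <stdbool.h>
    #include <stdint.h>

    /* All MMCFG windows must end below the CPU's physical address limit,
     * e.g. 1ULL << 40 for a guest with 40 physical address bits
     * (assumes cpu_phys_bits < 64). */
    bool mmcfg_cpu_addressable(uint64_t last_mmcfg_end, unsigned cpu_phys_bits)
    {
        return last_mmcfg_end <= (1ULL << cpu_phys_bits);
    }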

> > Do you need the number of existing expander hosts? We have pxb_dev_list;
> > just query it.
>
> Great, I think I missed that list.
>
> > The above will need to change. We move the PCI hole, not resize it.
> > I am not sure this is the right place to handle it; maybe we add a new
> > property right beside the pci_hole ones (extra-mmcfg?) and default it to 0.
>
> That sounds good; then we just need to check this range when setting up
> the MCFG table instead of traversing the host bridge list.
>
> > You cannot use the MCH_HOST_BRIDGE_PCIEXBAR_DEFAULT as it is in use
> > by the "main" root complex MMCFG. Actually I don't think we can come up
> > with a valid default.
>
> I see, I'll replace it with unmapped then.
>
> > Be aware this is used by both pxb and pxb-pcie devices. I think you should
> > split the type for each one and leave the pxb one as before.
>
> OK, I had thought it would make the code simpler, as TYPE_PCIE_HOST_BRIDGE
> is also a child of TYPE_PCI_HOST_BRIDGE, but I did forget about the pxb
> devices. I'll split it in v2.
>

Thanks,
Marcel

> Thanks
> Zihan

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [Qemu-devel] [RFC 3/3] acpi-build: allocate mcfg for multiple host bridges
  2018-05-22  9:52     ` Laszlo Ersek
@ 2018-05-22 19:01       ` Marcel Apfelbaum
  2018-05-22 19:51         ` Laszlo Ersek
  0 siblings, 1 reply; 46+ messages in thread
From: Marcel Apfelbaum @ 2018-05-22 19:01 UTC (permalink / raw)
  To: Laszlo Ersek, Zihan Yang, qemu-devel
  Cc: Michael S. Tsirkin, Igor Mammedov, Alex Williamson

Hi Laszlo,

On 05/22/2018 12:52 PM, Laszlo Ersek wrote:
> On 05/21/18 13:53, Marcel Apfelbaum wrote:
>>
>> On 05/20/2018 10:28 AM, Zihan Yang wrote:
>>> Currently only the q35 host bridge is allocated space in the MCFG table. To
>>> put pxb hosts into separate PCI domains, each of them should have its own
>>> configuration space entry in the MCFG table.
>>>
>>> Signed-off-by: Zihan Yang <whois.zihan.yang@gmail.com>
>>> ---
>>>    hw/i386/acpi-build.c | 83
>>> +++++++++++++++++++++++++++++++++++++++-------------
>>>    1 file changed, 62 insertions(+), 21 deletions(-)
>>>
>>> diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
>>> index 9bc6d97..808d815 100644
>>> --- a/hw/i386/acpi-build.c
>>> +++ b/hw/i386/acpi-build.c
>>> @@ -89,6 +89,7 @@
>>>    typedef struct AcpiMcfgInfo {
>>>        uint64_t mcfg_base;
>>>        uint32_t mcfg_size;
>>> +    struct AcpiMcfgInfo *next;
>>>    } AcpiMcfgInfo;
>>>      typedef struct AcpiPmInfo {
>>> @@ -2427,14 +2428,15 @@ build_mcfg_q35(GArray *table_data, BIOSLinker
>>> *linker, AcpiMcfgInfo *info)
>>>    {
>>>        AcpiTableMcfg *mcfg;
>>>        const char *sig;
>>> -    int len = sizeof(*mcfg) + 1 * sizeof(mcfg->allocation[0]);
>>> +    int len, count = 0;
>>> +    AcpiMcfgInfo *cfg = info;
>>>    +    while (cfg) {
>>> +        ++count;
>>> +        cfg = cfg->next;
>>> +    }
>>> +    len = sizeof(*mcfg) + count * sizeof(mcfg->allocation[0]);
>>>        mcfg = acpi_data_push(table_data, len);
>>> -    mcfg->allocation[0].address = cpu_to_le64(info->mcfg_base);
>>> -    /* Only a single allocation so no need to play with segments */
>>> -    mcfg->allocation[0].pci_segment = cpu_to_le16(0);
>>> -    mcfg->allocation[0].start_bus_number = 0;
>>> -    mcfg->allocation[0].end_bus_number =
>>> PCIE_MMCFG_BUS(info->mcfg_size - 1);
>>>          /* MCFG is used for ECAM which can be enabled or disabled by
>>> guest.
>>>         * To avoid table size changes (which create migration issues),
>>> @@ -2448,6 +2450,17 @@ build_mcfg_q35(GArray *table_data, BIOSLinker
>>> *linker, AcpiMcfgInfo *info)
>>>        } else {
>>>            sig = "MCFG";
>>>        }
>>> +
>>> +    count = 0;
>>> +    while (info) {
>>> +        mcfg[count].allocation[0].address =
>>> cpu_to_le64(info->mcfg_base);
>>> +        /* Only a single allocation so no need to play with segments */
>>> +        mcfg[count].allocation[0].pci_segment = cpu_to_le16(count);
>>> +        mcfg[count].allocation[0].start_bus_number = 0;
>>> +        mcfg[count++].allocation[0].end_bus_number =
>>> PCIE_MMCFG_BUS(info->mcfg_size - 1);
>> An interesting point is whether we want to limit the MMCFG size for
>> PXBs, as we may not be interested in using all the buses in a specific
>> domain. For each bus we require some address space that remains unused.
>>
>>> +        info = info->next;
>>> +    }
>>> +
>>>        build_header(linker, table_data, (void *)mcfg, sig, len, 1,
>>> NULL, NULL);
>>>    }
>>>    @@ -2602,26 +2615,52 @@ struct AcpiBuildState {
>>>        MemoryRegion *linker_mr;
>>>    } AcpiBuildState;
>>>    -static bool acpi_get_mcfg(AcpiMcfgInfo *mcfg)
>>> +static inline void cleanup_mcfg(AcpiMcfgInfo *mcfg)
>>> +{
>>> +    AcpiMcfgInfo *tmp;
>>> +    while (mcfg) {
>>> +        tmp = mcfg->next;
>>> +        g_free(mcfg);
>>> +        mcfg = tmp;
>>> +    }
>>> +}
>>> +
>>> +static AcpiMcfgInfo *acpi_get_mcfg(void)
>>>    {
>>>        Object *pci_host;
>>>        QObject *o;
>>> +    AcpiMcfgInfo *head = NULL, *tail, *mcfg;
>>>          pci_host = acpi_get_i386_pci_host();
>>>        g_assert(pci_host);
>>>    -    o = object_property_get_qobject(pci_host, PCIE_HOST_MCFG_BASE,
>>> NULL);
>>> -    if (!o) {
>>> -        return false;
>>> +    while (pci_host) {
>>> +        mcfg = g_new0(AcpiMcfgInfo, 1);
>>> +        mcfg->next = NULL;
>>> +        if (!head) {
>>> +            tail = head = mcfg;
>>> +        } else {
>>> +            tail->next = mcfg;
>>> +            tail = mcfg;
>>> +        }
>>> +
>>> +        o = object_property_get_qobject(pci_host,
>>> PCIE_HOST_MCFG_BASE, NULL);
>>> +        if (!o) {
>>> +            cleanup_mcfg(head);
>>> +            g_free(mcfg);
>>> +            return NULL;
>>> +        }
>>> +        mcfg->mcfg_base = qnum_get_uint(qobject_to(QNum, o));
>>> +        qobject_unref(o);
>>> +
>>> +        o = object_property_get_qobject(pci_host,
>>> PCIE_HOST_MCFG_SIZE, NULL);
>> I'll let Igor comment on the ACPI bits, but the idea of querying each
>> PCI host for the MMCFG details and putting it into a different table
>> sounds good to me.
>>
>> [Adding Laszlo for his insights]
> Thanks for the CC -- I don't have many (positive) insights here to
> offer, I'm afraid. First, the patch set (including the blurb) doesn't
> seem to explain *why* multiple host bridges / ECAM areas are a good
> idea.

The issue we want to solve is the hard limitation of 256 PCIe devices
that can be used in a Q35 machine. We can use "PCI functions" to increase
the number, but then we lose the hot-plugging granularity.

Currently pxb-pcie host bridges share the same PCI domain (0)
bus numbers, so adding more of them will not result in more usable
devices (each PCIe device resides on its own bus),
but putting each of them on a different domain removes
that limitation.
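
For example (the base addresses below are only placeholders, the pxb one
would be picked by the firmware), with a single pxb-pcie host in its own
domain the MCFG table ends up with two allocations roughly like:

    segment 0: base 0xb0000000, buses 0..255   <- q35 MCH, as today
    segment 1: base <fw chosen>, buses 0..255  <- pxb-pcie host

and a device behind the pxb-pcie host shows up in the guest as
0001:xx:yy.z instead of consuming domain 0 bus numbers.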

Another possible use (there is a good chance I am wrong, adding Alex to
correct me) may be the "modeling" of a multi-function assigned device.
Currently all the functions of an assigned device will be placed on
different PCIe Root Ports, "losing" the connection between them.

However, if we put all the functions of the same assigned device in the
same PCI domain we may be able to "re-connect" them.

>   Second, supporting such would take very intrusive patches / large
> feature development for OVMF (and that's not just for OvmfPkg and
> ArmVirtPkg platform code, but also core MdeModulePkg code).
>
> Obviously I'm not saying this feature should not be implemented for QEMU
> + SeaBIOS, just that don't expect edk2 support for it anytime soon.
> Without a convincing use case, even the review impact seems hard to justify.

I understand, thank you for the feedback, the point was to be sure
we don't decide something that *can't be done* in OVMF.

Thanks,
Marcel

> Thanks,
> Laszlo

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [Qemu-devel] [RFC 3/3] acpi-build: allocate mcfg for multiple host bridges
  2018-05-22 19:01       ` Marcel Apfelbaum
@ 2018-05-22 19:51         ` Laszlo Ersek
  2018-05-22 20:58           ` Michael S. Tsirkin
                             ` (2 more replies)
  0 siblings, 3 replies; 46+ messages in thread
From: Laszlo Ersek @ 2018-05-22 19:51 UTC (permalink / raw)
  To: Marcel Apfelbaum, Zihan Yang, qemu-devel
  Cc: Michael S. Tsirkin, Igor Mammedov, Alex Williamson, Eric Auger,
	Drew Jones, Wei Huang

On 05/22/18 21:01, Marcel Apfelbaum wrote:
> Hi Laszlo,
> 
> On 05/22/2018 12:52 PM, Laszlo Ersek wrote:
>> On 05/21/18 13:53, Marcel Apfelbaum wrote:
>>>
>>> On 05/20/2018 10:28 AM, Zihan Yang wrote:
>>>> Currently only the q35 host bridge is allocated space in the MCFG
>>>> table. To put each pxb host into a separate pci domain, each of them
>>>> should have its own configuration space in the MCFG table.
>>>>
>>>> Signed-off-by: Zihan Yang <whois.zihan.yang@gmail.com>
>>>> ---
>>>>    hw/i386/acpi-build.c | 83
>>>> +++++++++++++++++++++++++++++++++++++++-------------
>>>>    1 file changed, 62 insertions(+), 21 deletions(-)
>>>>
>>>> diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
>>>> index 9bc6d97..808d815 100644
>>>> --- a/hw/i386/acpi-build.c
>>>> +++ b/hw/i386/acpi-build.c
>>>> @@ -89,6 +89,7 @@
>>>>    typedef struct AcpiMcfgInfo {
>>>>        uint64_t mcfg_base;
>>>>        uint32_t mcfg_size;
>>>> +    struct AcpiMcfgInfo *next;
>>>>    } AcpiMcfgInfo;
>>>>      typedef struct AcpiPmInfo {
>>>> @@ -2427,14 +2428,15 @@ build_mcfg_q35(GArray *table_data, BIOSLinker
>>>> *linker, AcpiMcfgInfo *info)
>>>>    {
>>>>        AcpiTableMcfg *mcfg;
>>>>        const char *sig;
>>>> -    int len = sizeof(*mcfg) + 1 * sizeof(mcfg->allocation[0]);
>>>> +    int len, count = 0;
>>>> +    AcpiMcfgInfo *cfg = info;
>>>>    +    while (cfg) {
>>>> +        ++count;
>>>> +        cfg = cfg->next;
>>>> +    }
>>>> +    len = sizeof(*mcfg) + count * sizeof(mcfg->allocation[0]);
>>>>        mcfg = acpi_data_push(table_data, len);
>>>> -    mcfg->allocation[0].address = cpu_to_le64(info->mcfg_base);
>>>> -    /* Only a single allocation so no need to play with segments */
>>>> -    mcfg->allocation[0].pci_segment = cpu_to_le16(0);
>>>> -    mcfg->allocation[0].start_bus_number = 0;
>>>> -    mcfg->allocation[0].end_bus_number =
>>>> PCIE_MMCFG_BUS(info->mcfg_size - 1);
>>>>          /* MCFG is used for ECAM which can be enabled or disabled by
>>>> guest.
>>>>         * To avoid table size changes (which create migration issues),
>>>> @@ -2448,6 +2450,17 @@ build_mcfg_q35(GArray *table_data, BIOSLinker
>>>> *linker, AcpiMcfgInfo *info)
>>>>        } else {
>>>>            sig = "MCFG";
>>>>        }
>>>> +
>>>> +    count = 0;
>>>> +    while (info) {
>>>> +        mcfg[count].allocation[0].address =
>>>> cpu_to_le64(info->mcfg_base);
>>>> +        /* Only a single allocation so no need to play with
>>>> segments */
>>>> +        mcfg[count].allocation[0].pci_segment = cpu_to_le16(count);
>>>> +        mcfg[count].allocation[0].start_bus_number = 0;
>>>> +        mcfg[count++].allocation[0].end_bus_number =
>>>> PCIE_MMCFG_BUS(info->mcfg_size - 1);
>>> An interesting point is whether we want to limit the MMCFG size for
>>> PXBs, as we may not be interested in using all the buses in a specific
>>> domain. For each bus we require some address space that remains unused.
>>>
>>>> +        info = info->next;
>>>> +    }
>>>> +
>>>>        build_header(linker, table_data, (void *)mcfg, sig, len, 1,
>>>> NULL, NULL);
>>>>    }
>>>>    @@ -2602,26 +2615,52 @@ struct AcpiBuildState {
>>>>        MemoryRegion *linker_mr;
>>>>    } AcpiBuildState;
>>>>    -static bool acpi_get_mcfg(AcpiMcfgInfo *mcfg)
>>>> +static inline void cleanup_mcfg(AcpiMcfgInfo *mcfg)
>>>> +{
>>>> +    AcpiMcfgInfo *tmp;
>>>> +    while (mcfg) {
>>>> +        tmp = mcfg->next;
>>>> +        g_free(mcfg);
>>>> +        mcfg = tmp;
>>>> +    }
>>>> +}
>>>> +
>>>> +static AcpiMcfgInfo *acpi_get_mcfg(void)
>>>>    {
>>>>        Object *pci_host;
>>>>        QObject *o;
>>>> +    AcpiMcfgInfo *head = NULL, *tail, *mcfg;
>>>>          pci_host = acpi_get_i386_pci_host();
>>>>        g_assert(pci_host);
>>>>    -    o = object_property_get_qobject(pci_host, PCIE_HOST_MCFG_BASE,
>>>> NULL);
>>>> -    if (!o) {
>>>> -        return false;
>>>> +    while (pci_host) {
>>>> +        mcfg = g_new0(AcpiMcfgInfo, 1);
>>>> +        mcfg->next = NULL;
>>>> +        if (!head) {
>>>> +            tail = head = mcfg;
>>>> +        } else {
>>>> +            tail->next = mcfg;
>>>> +            tail = mcfg;
>>>> +        }
>>>> +
>>>> +        o = object_property_get_qobject(pci_host,
>>>> PCIE_HOST_MCFG_BASE, NULL);
>>>> +        if (!o) {
>>>> +            cleanup_mcfg(head);
>>>> +            g_free(mcfg);
>>>> +            return NULL;
>>>> +        }
>>>> +        mcfg->mcfg_base = qnum_get_uint(qobject_to(QNum, o));
>>>> +        qobject_unref(o);
>>>> +
>>>> +        o = object_property_get_qobject(pci_host,
>>>> PCIE_HOST_MCFG_SIZE, NULL);
>>> I'll let Igor comment on the ACPI bits, but the idea of querying each
>>> PCI host for the MMCFG details and putting it into a different table
>>> sounds good to me.
>>>
>>> [Adding Laszlo for his insights]
>> Thanks for the CC -- I don't have many (positive) insights here to
>> offer, I'm afraid. First, the patch set (including the blurb) doesn't
>> seem to explain *why* multiple host bridges / ECAM areas are a good
>> idea.
> 
> The issue we want to solve is the hard limitation of 256 PCIe devices
> that can be used in a Q35 machine.

Implying that there are use cases for which ~256 PCIe devices aren't
enough. I remain unconvinced until proved wrong :)

Anyway, a significant source of waste comes from the restriction that we
can only put 1 device (with up to 8 functions) on each non-root-complex
PCI Express bus (such as root ports and downstream ports). This forfeits
a huge portion of the ECAM area (about 31/32th) that we already have.
Rather than spending more MMIO guest-phys address range on new
discontiguous ECAM ranges, I'd prefer if we could look into ARI. I seem
to recall from your earlier presentation that ARI could recover that
lost address space (meaning both ECAM ranges and PCIe B/D/F address space).
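
(Spelling out that 31/32 figure, with my arithmetic rather than anything
from the patches:

    1 bus number = 256 devfn x 4 KiB = 1 MiB of ECAM
    1 port with a single device = at most 8 functions x 4 KiB = 32 KiB used

i.e. only 1/32 of each such bus's ECAM window is ever usable.)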

There are signs that the edk2 core supports ARI if the underlying
platform supports it. (Which is not the case with multiple PCIe domains
/ multiple ECAM ranges.)

ARI support could also help aarch64/virt. Eric (CC'd) has been working
on raising the max PCIe bus count from *16* to 256 for aarch64/virt, and
AFAIR one of the challenges there has been finding a good spot for the
larger ECAM in guest-phys address space. Fiddling with such address maps
is always a pain.

Back to x86, the guest-phys address space is quite crowded too. The
32-bit PCI MMIO aperture (with the neighboring ECAM and platform MMIO
areas such as LAPIC, IO-APIC, HPET, TPM, pflash, ...) is always a scarce
resource. Plus, reaching agreement between OVMF and QEMU on the exact
location of the 32-bit PCI MMIO aperture has always been a huge pain; so
you'd likely shoot for 64-bit.

But 64-bit is ill-partitioned and/or crowded too: first you have the
cold-plugged >4GB DRAM (whose size the firmware can interrogate), then
the hotplug DIMM area up to "etc/reserved-memory-end" (ditto), then the
64-bit PCI MMIO aperture (whose size the firmware decides itself,
because QEMU cannot be asked about it presently). Placing the additional
MMCFGs somewhere would need extending the total order between these
things at the design level, more fw_cfg files, more sanity checks in
platform code in the firmware, and, quite importantly, adding support to
multiple discontiguous MMCFGs to core edk2 code.
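
(I.e. the total order the firmware assumes today over the >4GB space is
roughly

    [cold-plugged RAM][DIMM hotplug, up to etc/reserved-memory-end][64-bit PCI MMIO aperture]

and any additional MMCFGs would have to be slotted into that order at
some agreed-upon point.)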

I don't know much about ARI but it already looks a whole lot more
attractive to me.

> We can use "PCI functions" to increase
> the number, but then we lose the hot-plugging granularity.
> 
> Currently pxb-pcie host bridges share the same PCI domain (0)
> bus numbers, so adding more of them will not result in more usable
> devices (each PCIe device resides on its own bus),
> but putting each of them on a different domain removes
> that limitation.
> 
> Another possible use (there is a good chance I am wrong, adding Alex to
> correct me),
> may be the "modeling" of a multi-function assigned device.
> Currently all the functions of an assigned device will be placed on
> different PCIe
> Root Ports "losing" the connection between them.
> 
> However, if we put all the functions of the same assigned device in the
> same
> PCI domain we may be able to "re-connect" them.

OK -- why is that useful? What's the current practical problem with the
splitting you describe?

>>   Second, supporting such would take very intrusive patches / large
>> feature development for OVMF (and that's not just for OvmfPkg and
>> ArmVirtPkg platform code, but also core MdeModulePkg code).
>>
>> Obviously I'm not saying this feature should not be implemented for QEMU
>> + SeaBIOS, just that don't expect edk2 support for it anytime soon.
>> Without a convincing use case, even the review impact seems hard to
>> justify.
> 
> I understand, thank you for the feedback, the point was to be sure
> we don't decide something that *can't be done* in OVMF.

Semantics :) Obviously, everything "can be done" in software; that's why
it's *soft*ware. Who is going to write it in practice? It had taken
years until the edk2 core gained a universal PciHostBridgeDxe driver
with a well-defined platform customization interface, and that interface
doesn't support multiple domains / segments. It had also taken years
until the same edk2 core driver gained support for nonzero MMIO address
translation (i.e. where the CPU view of system RAM addresses differs
from the PCI device view of the same, for DMA purposes) -- and that's
just a linear translation, not even about IOMMUs. The PCI core
maintainers in edk2 are always very busy, and upstreaming such changes
tends to take forever.

Again, I'm not opposing this feature per se. I just see any potential
support for it in edk2 very difficult. I remain unconvinced that ~256
PCIe devices aren't enough. Even assuming I'm proved plain wrong there,
I'd still much prefer ARI (although I don't know much about it at this
point), because (a) the edk2 core already shows signs that it intends to
support ARI, (b) ARI would help aarch64/virt with nearly the same
effort, (c) it seems that ARI would address the problem (namely, wasting
MMCFG) head-on, rather than throwing yet more MMCFG at it.

My apologies if I'm completely wrong about ARI (although that wouldn't
change anything about the difficulties that are foreseeable in edk2 for
supporting multiple host bridges).

Thanks,
Laszlo

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [Qemu-devel] [RFC 3/3] acpi-build: allocate mcfg for multiple host bridges
  2018-05-22 19:51         ` Laszlo Ersek
@ 2018-05-22 20:58           ` Michael S. Tsirkin
  2018-05-22 21:36             ` Alex Williamson
  2018-05-22 21:17           ` Alex Williamson
  2018-05-22 22:42           ` Laszlo Ersek
  2 siblings, 1 reply; 46+ messages in thread
From: Michael S. Tsirkin @ 2018-05-22 20:58 UTC (permalink / raw)
  To: Laszlo Ersek
  Cc: Marcel Apfelbaum, Zihan Yang, qemu-devel, Igor Mammedov,
	Alex Williamson, Eric Auger, Drew Jones, Wei Huang

On Tue, May 22, 2018 at 09:51:47PM +0200, Laszlo Ersek wrote:
> On 05/22/18 21:01, Marcel Apfelbaum wrote:
> > Hi Laszlo,
> > 
> > On 05/22/2018 12:52 PM, Laszlo Ersek wrote:
> >> On 05/21/18 13:53, Marcel Apfelbaum wrote:
> >>>
> >>> On 05/20/2018 10:28 AM, Zihan Yang wrote:
> >>>> Currently only the q35 host bridge is allocated space in the MCFG
> >>>> table. To put each pxb host into a separate pci domain, each of them
> >>>> should have its own configuration space in the MCFG table.
> >>>>
> >>>> Signed-off-by: Zihan Yang <whois.zihan.yang@gmail.com>
> >>>> ---
> >>>>    hw/i386/acpi-build.c | 83
> >>>> +++++++++++++++++++++++++++++++++++++++-------------
> >>>>    1 file changed, 62 insertions(+), 21 deletions(-)
> >>>>
> >>>> diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
> >>>> index 9bc6d97..808d815 100644
> >>>> --- a/hw/i386/acpi-build.c
> >>>> +++ b/hw/i386/acpi-build.c
> >>>> @@ -89,6 +89,7 @@
> >>>>    typedef struct AcpiMcfgInfo {
> >>>>        uint64_t mcfg_base;
> >>>>        uint32_t mcfg_size;
> >>>> +    struct AcpiMcfgInfo *next;
> >>>>    } AcpiMcfgInfo;
> >>>>      typedef struct AcpiPmInfo {
> >>>> @@ -2427,14 +2428,15 @@ build_mcfg_q35(GArray *table_data, BIOSLinker
> >>>> *linker, AcpiMcfgInfo *info)
> >>>>    {
> >>>>        AcpiTableMcfg *mcfg;
> >>>>        const char *sig;
> >>>> -    int len = sizeof(*mcfg) + 1 * sizeof(mcfg->allocation[0]);
> >>>> +    int len, count = 0;
> >>>> +    AcpiMcfgInfo *cfg = info;
> >>>>    +    while (cfg) {
> >>>> +        ++count;
> >>>> +        cfg = cfg->next;
> >>>> +    }
> >>>> +    len = sizeof(*mcfg) + count * sizeof(mcfg->allocation[0]);
> >>>>        mcfg = acpi_data_push(table_data, len);
> >>>> -    mcfg->allocation[0].address = cpu_to_le64(info->mcfg_base);
> >>>> -    /* Only a single allocation so no need to play with segments */
> >>>> -    mcfg->allocation[0].pci_segment = cpu_to_le16(0);
> >>>> -    mcfg->allocation[0].start_bus_number = 0;
> >>>> -    mcfg->allocation[0].end_bus_number =
> >>>> PCIE_MMCFG_BUS(info->mcfg_size - 1);
> >>>>          /* MCFG is used for ECAM which can be enabled or disabled by
> >>>> guest.
> >>>>         * To avoid table size changes (which create migration issues),
> >>>> @@ -2448,6 +2450,17 @@ build_mcfg_q35(GArray *table_data, BIOSLinker
> >>>> *linker, AcpiMcfgInfo *info)
> >>>>        } else {
> >>>>            sig = "MCFG";
> >>>>        }
> >>>> +
> >>>> +    count = 0;
> >>>> +    while (info) {
> >>>> +        mcfg[count].allocation[0].address =
> >>>> cpu_to_le64(info->mcfg_base);
> >>>> +        /* Only a single allocation so no need to play with
> >>>> segments */
> >>>> +        mcfg[count].allocation[0].pci_segment = cpu_to_le16(count);
> >>>> +        mcfg[count].allocation[0].start_bus_number = 0;
> >>>> +        mcfg[count++].allocation[0].end_bus_number =
> >>>> PCIE_MMCFG_BUS(info->mcfg_size - 1);
> >>> An interesting point is whether we want to limit the MMCFG size for
> >>> PXBs, as we may not be interested in using all the buses in a specific
> >>> domain. For each bus we require some address space that remains unused.
> >>>
> >>>> +        info = info->next;
> >>>> +    }
> >>>> +
> >>>>        build_header(linker, table_data, (void *)mcfg, sig, len, 1,
> >>>> NULL, NULL);
> >>>>    }
> >>>>    @@ -2602,26 +2615,52 @@ struct AcpiBuildState {
> >>>>        MemoryRegion *linker_mr;
> >>>>    } AcpiBuildState;
> >>>>    -static bool acpi_get_mcfg(AcpiMcfgInfo *mcfg)
> >>>> +static inline void cleanup_mcfg(AcpiMcfgInfo *mcfg)
> >>>> +{
> >>>> +    AcpiMcfgInfo *tmp;
> >>>> +    while (mcfg) {
> >>>> +        tmp = mcfg->next;
> >>>> +        g_free(mcfg);
> >>>> +        mcfg = tmp;
> >>>> +    }
> >>>> +}
> >>>> +
> >>>> +static AcpiMcfgInfo *acpi_get_mcfg(void)
> >>>>    {
> >>>>        Object *pci_host;
> >>>>        QObject *o;
> >>>> +    AcpiMcfgInfo *head = NULL, *tail, *mcfg;
> >>>>          pci_host = acpi_get_i386_pci_host();
> >>>>        g_assert(pci_host);
> >>>>    -    o = object_property_get_qobject(pci_host, PCIE_HOST_MCFG_BASE,
> >>>> NULL);
> >>>> -    if (!o) {
> >>>> -        return false;
> >>>> +    while (pci_host) {
> >>>> +        mcfg = g_new0(AcpiMcfgInfo, 1);
> >>>> +        mcfg->next = NULL;
> >>>> +        if (!head) {
> >>>> +            tail = head = mcfg;
> >>>> +        } else {
> >>>> +            tail->next = mcfg;
> >>>> +            tail = mcfg;
> >>>> +        }
> >>>> +
> >>>> +        o = object_property_get_qobject(pci_host,
> >>>> PCIE_HOST_MCFG_BASE, NULL);
> >>>> +        if (!o) {
> >>>> +            cleanup_mcfg(head);
> >>>> +            g_free(mcfg);
> >>>> +            return NULL;
> >>>> +        }
> >>>> +        mcfg->mcfg_base = qnum_get_uint(qobject_to(QNum, o));
> >>>> +        qobject_unref(o);
> >>>> +
> >>>> +        o = object_property_get_qobject(pci_host,
> >>>> PCIE_HOST_MCFG_SIZE, NULL);
> >>> I'll let Igor comment on the ACPI bits, but the idea of querying each
> >>> PCI host for the MMCFG details and putting it into a different table
> >>> sounds good to me.
> >>>
> >>> [Adding Laszlo for his insights]
> >> Thanks for the CC -- I don't have many (positive) insights here to
> >> offer, I'm afraid. First, the patch set (including the blurb) doesn't
> >> seem to explain *why* multiple host bridges / ECAM areas are a good
> >> idea.
> > 
> > The issue we want to solve is the hard limitation of 256 PCIe devices
> > that can be used in a Q35 machine.
> 
> Implying that there are use cases for which ~256 PCIe devices aren't
> enough. I remain unconvinced until proved wrong :)
> 
> Anyway, a significant source of waste comes from the restriction that we
> can only put 1 device (with up to 8 functions) on each non-root-complex
> PCI Express bus (such as root ports and downstream ports). This forfeits
> a huge portion of the ECAM area (about 31/32th) that we already have.
> Rather than spending more MMIO guest-phys address range on new
> discontiguous ECAM ranges, I'd prefer if we could look into ARI. I seem
> to recall from your earlier presentation that ARI could recover that
> lost address space (meaning both ECAM ranges and PCIe B/D/F address space).

Well ARI just adds more functions. I do think it's a useful feature
but individual functions remain non-hotpluggable with the standard
PCIe controller.


> There are signs that the edk2 core supports ARI if the underlying
> platform supports it. (Which is not the case with multiple PCIe domains
> / multiple ECAM ranges.)
> 
> ARI support could also help aarch64/virt. Eric (CC'd) has been working
> on raising the max PCIe bus count from *16* to 256 for aarch64/virt, and
> AFAIR one of the challenges there has been finding a good spot for the
> larger ECAM in guest-phys address space. Fiddling with such address maps
> is always a pain.
> 
> Back to x86, the guest-phys address space is quite crowded too. The
> 32-bit PCI MMIO aperture (with the neighboring ECAM and platform MMIO
> areas such as LAPIC, IO-APIC, HPET, TPM, pflash, ...) is always a scarce
> resource. Plus, reaching agreement between OVMF and QEMU on the exact
> location of the 32-bit PCI MMIO aperture has always been a huge pain; so
> you'd likely shoot for 64-bit.
> 
> But 64-bit is ill-partitioned and/or crowded too: first you have the
> cold-plugged >4GB DRAM (whose size the firmware can interrogate), then
> the hotplug DIMM area up to "etc/reserved-memory-end" (ditto), then the
> 64-bit PCI MMIO aperture (whose size the firmware decides itself,
> because QEMU cannot be asked about it presently). Placing the additional
> MMCFGs somewhere would need extending the total order between these
> things at the design level, more fw_cfg files, more sanity checks in
> platform code in the firmware, and, quite importantly, adding support to
> multiple discontiguous MMCFGs to core edk2 code.

In fact, last I looked the proposed patches left the layout
completely to the firmware. So firmware can just select where
it wants these things to be, and program hardware accordingly -
there should not be a need for new fw cfg files.

> I don't know much about ARI but it already looks a whole lot more
> attractive to me.
> 
> > We can use "PCI functions" to increase
> > the number, but then we lose the hot-plugging granularity.
> > 
> > Currently pxb-pcie host bridges share the same PCI domain (0)
> > bus numbers, so adding more of them will not result in more usable
> > devices (each PCIe device resides on its own bus),
> > but putting each of them on a different domain removes
> > that limitation.
> > 
> > Another possible use (there is a good chance I am wrong, adding Alex to
> > correct me),
> > may be the "modeling" of a multi-function assigned device.
> > Currently all the functions of an assigned device will be placed on
> > different PCIe
> > Root Ports "losing" the connection between them.
> > 
> > However, if we put all the functions of the same assigned device in the
> > same
> > PCI domain we may be able to "re-connect" them.
> 
> OK -- why is that useful? What's the current practical problem with the
> splitting you describe?
> 
> >>   Second, supporting such would take very intrusive patches / large
> >> feature development for OVMF (and that's not just for OvmfPkg and
> >> ArmVirtPkg platform code, but also core MdeModulePkg code).

So there certainly exist hardware platforms with multiple pci root
complexes. It's not a PV technology. So it's probably worth it to
support in OVMF. The pxb hack I'm less convinced about: it was
apparently designed to make pci support very easy for seabios, and if
it's not doing that for pcie then we should do something else. And
certainly, the fact that it doesn't allow >256 devices is not a plus.


> >> Obviously I'm not saying this feature should not be implemented for QEMU
> >> + SeaBIOS, just that don't expect edk2 support for it anytime soon.
> >> Without a convincing use case, even the review impact seems hard to
> >> justify.
> > 
> > I understand, thank you for the feedback, the point was to be sure
> > we don't decide something that *can't be done* in OVMF.
> 
> Semantics :) Obviously, everything "can be done" in software; that's why
> it's *soft*ware. Who is going to write it in practice? It had taken
> years until the edk2 core gained a universal PciHostBridgeDxe driver
> with a well-defined platform customization interface, and that interface
> doesn't support multiple domains / segments. It had also taken years
> until the same edk2 core driver gained support for nonzero MMIO address
> translation (i.e. where the CPU view of system RAM addresses differs
> from the PCI device view of the same, for DMA purposes) -- and that's
> just a linear translation, not even about IOMMUs. The PCI core
> maintainers in edk2 are always very busy, and upstreaming such changes
> tends to take forever.
> 
> Again, I'm not opposing this feature per se. I just see any potential
> support for it in edk2 very difficult. I remain unconvinced that ~256
> PCIe devices aren't enough. Even assuming I'm proved plain wrong there,
> I'd still much prefer ARI (although I don't know much about it at this
> point), because (a) the edk2 core already shows signs that it intends to
> support ARI, (b) ARI would help aarch64/virt with nearly the same
> effort, (c) it seems that ARI would address the problem (namely, wasting
> MMCFG) head-on, rather than throwing yet more MMCFG at it.
> 
> My apologies if I'm completely wrong about ARI (although that wouldn't
> change anything about the difficulties that are foreseeable in edk2 for
> supporting multiple host bridges).
> 
> Thanks,
> Laszlo

It's not hard to think of a use-case where >256 devices
are helpful, for example a nested virt scenario where
each device is passed on to a different nested guest.

But I think the main feature this is needed for is numa modeling.
Guests seem to assume a numa node per PCI root, ergo we need more PCI
roots.
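
To be clear about what exists today versus what the series adds: the
per-node association itself is already what pxb-pcie provides inside
domain 0, with something like

    -device pxb-pcie,id=pcie.1,bus_nr=0x80,numa_node=1

(from memory, double-check the exact property names); what it cannot
provide is more than 256 bus numbers in total, which is where the
separate domains come in.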

-- 
MST

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [Qemu-devel] [RFC 3/3] acpi-build: allocate mcfg for multiple host bridges
  2018-05-22 19:51         ` Laszlo Ersek
  2018-05-22 20:58           ` Michael S. Tsirkin
@ 2018-05-22 21:17           ` Alex Williamson
  2018-05-22 21:22             ` Michael S. Tsirkin
                               ` (2 more replies)
  2018-05-22 22:42           ` Laszlo Ersek
  2 siblings, 3 replies; 46+ messages in thread
From: Alex Williamson @ 2018-05-22 21:17 UTC (permalink / raw)
  To: Laszlo Ersek
  Cc: Marcel Apfelbaum, Zihan Yang, qemu-devel, Wei Huang, Drew Jones,
	Michael S. Tsirkin, Eric Auger, Igor Mammedov

On Tue, 22 May 2018 21:51:47 +0200
Laszlo Ersek <lersek@redhat.com> wrote:

> On 05/22/18 21:01, Marcel Apfelbaum wrote:
> > Hi Laszlo,
> > 
> > On 05/22/2018 12:52 PM, Laszlo Ersek wrote:  
> >> On 05/21/18 13:53, Marcel Apfelbaum wrote:  
> >>>
> >>> On 05/20/2018 10:28 AM, Zihan Yang wrote:  
> >>>> Currently only the q35 host bridge is allocated space in the MCFG
> >>>> table. To put each pxb host into a separate pci domain, each of them
> >>>> should have its own configuration space in the MCFG table.
> >>>>
> >>>> Signed-off-by: Zihan Yang <whois.zihan.yang@gmail.com>
> >>>> ---
> >>>>    hw/i386/acpi-build.c | 83
> >>>> +++++++++++++++++++++++++++++++++++++++-------------
> >>>>    1 file changed, 62 insertions(+), 21 deletions(-)
> >>>>
> >>>> diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
> >>>> index 9bc6d97..808d815 100644
> >>>> --- a/hw/i386/acpi-build.c
> >>>> +++ b/hw/i386/acpi-build.c
> >>>> @@ -89,6 +89,7 @@
> >>>>    typedef struct AcpiMcfgInfo {
> >>>>        uint64_t mcfg_base;
> >>>>        uint32_t mcfg_size;
> >>>> +    struct AcpiMcfgInfo *next;
> >>>>    } AcpiMcfgInfo;
> >>>>      typedef struct AcpiPmInfo {
> >>>> @@ -2427,14 +2428,15 @@ build_mcfg_q35(GArray *table_data, BIOSLinker
> >>>> *linker, AcpiMcfgInfo *info)
> >>>>    {
> >>>>        AcpiTableMcfg *mcfg;
> >>>>        const char *sig;
> >>>> -    int len = sizeof(*mcfg) + 1 * sizeof(mcfg->allocation[0]);
> >>>> +    int len, count = 0;
> >>>> +    AcpiMcfgInfo *cfg = info;
> >>>>    +    while (cfg) {
> >>>> +        ++count;
> >>>> +        cfg = cfg->next;
> >>>> +    }
> >>>> +    len = sizeof(*mcfg) + count * sizeof(mcfg->allocation[0]);
> >>>>        mcfg = acpi_data_push(table_data, len);
> >>>> -    mcfg->allocation[0].address = cpu_to_le64(info->mcfg_base);
> >>>> -    /* Only a single allocation so no need to play with segments */
> >>>> -    mcfg->allocation[0].pci_segment = cpu_to_le16(0);
> >>>> -    mcfg->allocation[0].start_bus_number = 0;
> >>>> -    mcfg->allocation[0].end_bus_number =
> >>>> PCIE_MMCFG_BUS(info->mcfg_size - 1);
> >>>>          /* MCFG is used for ECAM which can be enabled or disabled by
> >>>> guest.
> >>>>         * To avoid table size changes (which create migration issues),
> >>>> @@ -2448,6 +2450,17 @@ build_mcfg_q35(GArray *table_data, BIOSLinker
> >>>> *linker, AcpiMcfgInfo *info)
> >>>>        } else {
> >>>>            sig = "MCFG";
> >>>>        }
> >>>> +
> >>>> +    count = 0;
> >>>> +    while (info) {
> >>>> +        mcfg[count].allocation[0].address =
> >>>> cpu_to_le64(info->mcfg_base);
> >>>> +        /* Only a single allocation so no need to play with
> >>>> segments */
> >>>> +        mcfg[count].allocation[0].pci_segment = cpu_to_le16(count);
> >>>> +        mcfg[count].allocation[0].start_bus_number = 0;
> >>>> +        mcfg[count++].allocation[0].end_bus_number =
> >>>> PCIE_MMCFG_BUS(info->mcfg_size - 1);  
> >>> An interesting point is whether we want to limit the MMCFG size for
> >>> PXBs, as we may not be interested in using all the buses in a specific
> >>> domain. For each bus we require some address space that remains unused.
> >>>  
> >>>> +        info = info->next;
> >>>> +    }
> >>>> +
> >>>>        build_header(linker, table_data, (void *)mcfg, sig, len, 1,
> >>>> NULL, NULL);
> >>>>    }
> >>>>    @@ -2602,26 +2615,52 @@ struct AcpiBuildState {
> >>>>        MemoryRegion *linker_mr;
> >>>>    } AcpiBuildState;
> >>>>    -static bool acpi_get_mcfg(AcpiMcfgInfo *mcfg)
> >>>> +static inline void cleanup_mcfg(AcpiMcfgInfo *mcfg)
> >>>> +{
> >>>> +    AcpiMcfgInfo *tmp;
> >>>> +    while (mcfg) {
> >>>> +        tmp = mcfg->next;
> >>>> +        g_free(mcfg);
> >>>> +        mcfg = tmp;
> >>>> +    }
> >>>> +}
> >>>> +
> >>>> +static AcpiMcfgInfo *acpi_get_mcfg(void)
> >>>>    {
> >>>>        Object *pci_host;
> >>>>        QObject *o;
> >>>> +    AcpiMcfgInfo *head = NULL, *tail, *mcfg;
> >>>>          pci_host = acpi_get_i386_pci_host();
> >>>>        g_assert(pci_host);
> >>>>    -    o = object_property_get_qobject(pci_host, PCIE_HOST_MCFG_BASE,
> >>>> NULL);
> >>>> -    if (!o) {
> >>>> -        return false;
> >>>> +    while (pci_host) {
> >>>> +        mcfg = g_new0(AcpiMcfgInfo, 1);
> >>>> +        mcfg->next = NULL;
> >>>> +        if (!head) {
> >>>> +            tail = head = mcfg;
> >>>> +        } else {
> >>>> +            tail->next = mcfg;
> >>>> +            tail = mcfg;
> >>>> +        }
> >>>> +
> >>>> +        o = object_property_get_qobject(pci_host,
> >>>> PCIE_HOST_MCFG_BASE, NULL);
> >>>> +        if (!o) {
> >>>> +            cleanup_mcfg(head);
> >>>> +            g_free(mcfg);
> >>>> +            return NULL;
> >>>> +        }
> >>>> +        mcfg->mcfg_base = qnum_get_uint(qobject_to(QNum, o));
> >>>> +        qobject_unref(o);
> >>>> +
> >>>> +        o = object_property_get_qobject(pci_host,
> >>>> PCIE_HOST_MCFG_SIZE, NULL);  
> >>> I'll let Igor comment on the ACPI bits, but the idea of querying each
> >>> PCI host for the MMCFG details and putting it into a different table
> >>> sounds good to me.
> >>>
> >>> [Adding Laszlo for his insights]  
> >> Thanks for the CC -- I don't have many (positive) insights here to
> >> offer, I'm afraid. First, the patch set (including the blurb) doesn't
> >> seem to explain *why* multiple host bridges / ECAM areas are a good
> >> idea.  
> > 
> > The issue we want to solve is the hard limitation of 256 PCIe devices
> > that can be used in a Q35 machine.

Isn't it interesting that conventional PCI can easily support so many
more devices?  Sorta makes you wonder why we care that virtual devices
are express rather than conventional for a high density configuration...

> Implying that there are use cases for which ~256 PCIe devices aren't
> enough. I remain unconvinced until proved wrong :)
> 
> Anyway, a significant source of waste comes from the restriction that we
> can only put 1 device (with up to 8 functions) on each non-root-complex
> PCI Express bus (such as root ports and downstream ports). This forfeits
> a huge portion of the ECAM area (about 31/32th) that we already have.
> Rather than spending more MMIO guest-phys address range on new
> discontiguous ECAM ranges, I'd prefer if we could look into ARI. I seem
> to recall from your earlier presentation that ARI could recover that
> lost address space (meaning both ECAM ranges and PCIe B/D/F address space).

How does ARI solve the hotplug problem?  ARI is effectively
multifunction on steroids, the ARI capability in each function points
to the next function number so that we don't need to scan the entire
devfn address space per bus (an inefficiency we don't care about when
there are only 8 functions).  So yeah, we can fill an entire bus with
devices with ARI, but they're all rooted at 00.0.
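
For reference, a rough sketch of how an enumerator walks such a bus
(register offsets as in the PCIe spec / Linux pci_regs.h; cfg_read32()
and find_ext_cap() are placeholders for whatever config accessors the
firmware or OS actually uses):

    #include <stdint.h>

    #define PCI_EXT_CAP_ID_ARI  0x0e
    #define PCI_ARI_CAP         0x04                 /* ARI Capability Register */
    #define PCI_ARI_CAP_NFN(x)  (((x) >> 8) & 0xff)  /* Next Function Number    */

    extern uint32_t cfg_read32(uint8_t bus, uint8_t fn, uint16_t off);
    extern uint16_t find_ext_cap(uint8_t bus, uint8_t fn, uint16_t cap_id);

    static void ari_walk_bus(uint8_t bus)
    {
        uint8_t fn = 0;                  /* the chain is rooted at function 0 */

        do {
            uint16_t pos = find_ext_cap(bus, fn, PCI_EXT_CAP_ID_ARI);
            if (!pos) {
                break;                   /* not ARI-capable, stop             */
            }
            /* ... probe/configure function (bus, fn) here ... */
            fn = PCI_ARI_CAP_NFN(cfg_read32(bus, fn, pos + PCI_ARI_CAP));
        } while (fn);                    /* NFN == 0 terminates the chain     */
    }

So you get up to 256 functions on the bus, but it is still one
hotpluggable slot.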
 
> There are signs that the edk2 core supports ARI if the underlying
> platform supports it. (Which is not the case with multiple PCIe domains
> / multiple ECAM ranges.)

It's pretty surprising to me that edk2 wouldn't already have support
for multiple PCIe domains, they're really not *that* uncommon.  Some
architectures make almost gratuitous use of PCIe domains.  I certainly
know there were UEFI ia64 platforms with multiple domains.

> ARI support could also help aarch64/virt. Eric (CC'd) has been working
> on raising the max PCIe bus count from *16* to 256 for aarch64/virt, and
> AFAIR one of the challenges there has been finding a good spot for the
> larger ECAM in guest-phys address space. Fiddling with such address maps
> is always a pain.
> 
> Back to x86, the guest-phys address space is quite crowded too. The
> 32-bit PCI MMIO aperture (with the neighboring ECAM and platform MMIO
> areas such as LAPIC, IO-APIC, HPET, TPM, pflash, ...) is always a scarce
> resource. Plus, reaching agreement between OVMF and QEMU on the exact
> location of the 32-bit PCI MMIO aperture has always been a huge pain; so
> you'd likely shoot for 64-bit.

Why do we need to equally split 32-bit MMIO space between domains?  Put
it on domain 0 and require devices installed into non-zero domains have
no 32-bit dependencies.

> But 64-bit is ill-partitioned and/or crowded too: first you have the
> cold-plugged >4GB DRAM (whose size the firmware can interrogate), then
> the hotplug DIMM area up to "etc/reserved-memory-end" (ditto), then the
> 64-bit PCI MMIO aperture (whose size the firmware decides itself,
> because QEMU cannot be asked about it presently). Placing the additional
> MMCFGs somewhere would need extending the total order between these
> things at the design level, more fw_cfg files, more sanity checks in
> platform code in the firmware, and, quite importantly, adding support to
> multiple discontiguous MMCFGs to core edk2 code.

Hmm, we're doing something wrong if we can't figure out some standards
within QEMU for placing per domain 64-bit MMIO and MMCFG ranges.

> I don't know much about ARI but it already looks a whole lot more
> attractive to me.
> 
> > We can use "PCI functions" to increase
> > the number, but then we lose the hot-plugging granularity.
> > 
> > Currently pxb-pcie host bridges share the same PCI domain (0)
> > bus numbers, so adding more of them will not result in more usable
> > devices (each PCIe device resides on its own bus),
> > but putting each of them on a different domain removes
> > that limitation.
> > 
> > Another possible use (there is a good chance I am wrong, adding Alex to
> > correct me),
> > may be the "modeling" of a multi-function assigned device.
> > Currently all the functions of an assigned device will be placed on
> > different PCIe
> > Root Ports "losing" the connection between them.
> > 
> > However, if we put all the functions of the same assigned device in the
> > same
> > PCI domain we may be able to "re-connect" them.  
> 
> OK -- why is that useful? What's the current practical problem with the
> splitting you describe?

There are very few cases where this is useful, things like associating
USB companion devices or translating a guest bus reset to host bus
reset when functions are split to separate virtual buses.  That said, I
have no idea how multiple domains plays a factor here.  Regardless of
how many PCIe domains a VM supports, we can easily use existing
multifunction within a single domain to expose multifunction assigned
devices in a way that better resembles the physical hardware.
Multifunction PCIe endpoints cannot span PCIe domains.

In fact, IOMMUs generally cannot span domains either, so we better also
be thinking about at least a VT-d DRHD or vIOMMU per PCIe domain.
 
> >>   Second, supporting such would take very intrusive patches / large
> >> feature development for OVMF (and that's not just for OvmfPkg and
> >> ArmVirtPkg platform code, but also core MdeModulePkg code).
> >>
> >> Obviously I'm not saying this feature should not be implemented for QEMU
> >> + SeaBIOS, just that don't expect edk2 support for it anytime soon.
> >> Without a convincing use case, even the review impact seems hard to
> >> justify.  
> > 
> > I understand, thank you for the feedback, the point was to be sure
> > we don't decide something that *can't be done* in OVMF.  
> 
> Semantics :) Obviously, everything "can be done" in software; that's why
> it's *soft*ware. Who is going to write it in practice? It had taken
> years until the edk2 core gained a universal PciHostBridgeDxe driver
> with a well-defined platform customization interface, and that interface
> doesn't support multiple domains / segments. It had also taken years
> until the same edk2 core driver gained support for nonzero MMIO address
> translation (i.e. where the CPU view of system RAM addresses differs
> from the PCI device view of the same, for DMA purposes) -- and that's
> just a linear translation, not even about IOMMUs. The PCI core
> maintainers in edk2 are always very busy, and upstreaming such changes
> tends to take forever.

Wow, that's surprising, ia64 was doing all of that on UEFI platforms a
decade ago.
 
> Again, I'm not opposing this feature per se. I just see any potential
> support for it in edk2 very difficult. I remain unconvinced that ~256
> PCIe devices aren't enough. Even assuming I'm proved plain wrong there,
> I'd still much prefer ARI (although I don't know much about it at this
> point), because (a) the edk2 core already shows signs that it intends to
> support ARI, (b) ARI would help aarch64/virt with nearly the same
> effort, (c) it seems that ARI would address the problem (namely, wasting
> MMCFG) head-on, rather than throwing yet more MMCFG at it.

-ENOHOTPLUG  From an assigned device perspective, this also implies
that we're potentially adding a PCIe extended capability that may not
exist on the physical device to the virtual view of the device.
There's no guarantee that we can place that capability in the devices
extended config space without stepping on registers that the hardware
has hidden between the capabilities (for real, GPUs have registers in
extended config space that aren't covered by capabilities) or the
unlikely event that there's no more room in extended config space.
Thanks,

Alex

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [Qemu-devel] [RFC 3/3] acpi-build: allocate mcfg for multiple host bridges
  2018-05-22 21:17           ` Alex Williamson
@ 2018-05-22 21:22             ` Michael S. Tsirkin
  2018-05-22 21:58               ` Laszlo Ersek
  2018-05-22 21:50             ` Laszlo Ersek
  2018-05-23 17:00             ` Marcel Apfelbaum
  2 siblings, 1 reply; 46+ messages in thread
From: Michael S. Tsirkin @ 2018-05-22 21:22 UTC (permalink / raw)
  To: Alex Williamson
  Cc: Laszlo Ersek, Marcel Apfelbaum, Zihan Yang, qemu-devel,
	Wei Huang, Drew Jones, Eric Auger, Igor Mammedov

On Tue, May 22, 2018 at 03:17:32PM -0600, Alex Williamson wrote:
> On Tue, 22 May 2018 21:51:47 +0200
> Laszlo Ersek <lersek@redhat.com> wrote:
> 
> > On 05/22/18 21:01, Marcel Apfelbaum wrote:
> > > Hi Laszlo,
> > > 
> > > On 05/22/2018 12:52 PM, Laszlo Ersek wrote:  
> > >> On 05/21/18 13:53, Marcel Apfelbaum wrote:  
> > >>>
> > >>> On 05/20/2018 10:28 AM, Zihan Yang wrote:  
> > >>>> Currently only the q35 host bridge is allocated space in the MCFG
> > >>>> table. To put each pxb host into a separate pci domain, each of them
> > >>>> should have its own configuration space in the MCFG table.
> > >>>>
> > >>>> Signed-off-by: Zihan Yang <whois.zihan.yang@gmail.com>
> > >>>> ---
> > >>>>    hw/i386/acpi-build.c | 83
> > >>>> +++++++++++++++++++++++++++++++++++++++-------------
> > >>>>    1 file changed, 62 insertions(+), 21 deletions(-)
> > >>>>
> > >>>> diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
> > >>>> index 9bc6d97..808d815 100644
> > >>>> --- a/hw/i386/acpi-build.c
> > >>>> +++ b/hw/i386/acpi-build.c
> > >>>> @@ -89,6 +89,7 @@
> > >>>>    typedef struct AcpiMcfgInfo {
> > >>>>        uint64_t mcfg_base;
> > >>>>        uint32_t mcfg_size;
> > >>>> +    struct AcpiMcfgInfo *next;
> > >>>>    } AcpiMcfgInfo;
> > >>>>      typedef struct AcpiPmInfo {
> > >>>> @@ -2427,14 +2428,15 @@ build_mcfg_q35(GArray *table_data, BIOSLinker
> > >>>> *linker, AcpiMcfgInfo *info)
> > >>>>    {
> > >>>>        AcpiTableMcfg *mcfg;
> > >>>>        const char *sig;
> > >>>> -    int len = sizeof(*mcfg) + 1 * sizeof(mcfg->allocation[0]);
> > >>>> +    int len, count = 0;
> > >>>> +    AcpiMcfgInfo *cfg = info;
> > >>>>    +    while (cfg) {
> > >>>> +        ++count;
> > >>>> +        cfg = cfg->next;
> > >>>> +    }
> > >>>> +    len = sizeof(*mcfg) + count * sizeof(mcfg->allocation[0]);
> > >>>>        mcfg = acpi_data_push(table_data, len);
> > >>>> -    mcfg->allocation[0].address = cpu_to_le64(info->mcfg_base);
> > >>>> -    /* Only a single allocation so no need to play with segments */
> > >>>> -    mcfg->allocation[0].pci_segment = cpu_to_le16(0);
> > >>>> -    mcfg->allocation[0].start_bus_number = 0;
> > >>>> -    mcfg->allocation[0].end_bus_number =
> > >>>> PCIE_MMCFG_BUS(info->mcfg_size - 1);
> > >>>>          /* MCFG is used for ECAM which can be enabled or disabled by
> > >>>> guest.
> > >>>>         * To avoid table size changes (which create migration issues),
> > >>>> @@ -2448,6 +2450,17 @@ build_mcfg_q35(GArray *table_data, BIOSLinker
> > >>>> *linker, AcpiMcfgInfo *info)
> > >>>>        } else {
> > >>>>            sig = "MCFG";
> > >>>>        }
> > >>>> +
> > >>>> +    count = 0;
> > >>>> +    while (info) {
> > >>>> +        mcfg[count].allocation[0].address =
> > >>>> cpu_to_le64(info->mcfg_base);
> > >>>> +        /* Only a single allocation so no need to play with
> > >>>> segments */
> > >>>> +        mcfg[count].allocation[0].pci_segment = cpu_to_le16(count);
> > >>>> +        mcfg[count].allocation[0].start_bus_number = 0;
> > >>>> +        mcfg[count++].allocation[0].end_bus_number =
> > >>>> PCIE_MMCFG_BUS(info->mcfg_size - 1);  
> > >>> An interesting point is whether we want to limit the MMCFG size for
> > >>> PXBs, as we may not be interested in using all the buses in a specific
> > >>> domain. For each bus we require some address space that remains unused.
> > >>>  
> > >>>> +        info = info->next;
> > >>>> +    }
> > >>>> +
> > >>>>        build_header(linker, table_data, (void *)mcfg, sig, len, 1,
> > >>>> NULL, NULL);
> > >>>>    }
> > >>>>    @@ -2602,26 +2615,52 @@ struct AcpiBuildState {
> > >>>>        MemoryRegion *linker_mr;
> > >>>>    } AcpiBuildState;
> > >>>>    -static bool acpi_get_mcfg(AcpiMcfgInfo *mcfg)
> > >>>> +static inline void cleanup_mcfg(AcpiMcfgInfo *mcfg)
> > >>>> +{
> > >>>> +    AcpiMcfgInfo *tmp;
> > >>>> +    while (mcfg) {
> > >>>> +        tmp = mcfg->next;
> > >>>> +        g_free(mcfg);
> > >>>> +        mcfg = tmp;
> > >>>> +    }
> > >>>> +}
> > >>>> +
> > >>>> +static AcpiMcfgInfo *acpi_get_mcfg(void)
> > >>>>    {
> > >>>>        Object *pci_host;
> > >>>>        QObject *o;
> > >>>> +    AcpiMcfgInfo *head = NULL, *tail, *mcfg;
> > >>>>          pci_host = acpi_get_i386_pci_host();
> > >>>>        g_assert(pci_host);
> > >>>>    -    o = object_property_get_qobject(pci_host, PCIE_HOST_MCFG_BASE,
> > >>>> NULL);
> > >>>> -    if (!o) {
> > >>>> -        return false;
> > >>>> +    while (pci_host) {
> > >>>> +        mcfg = g_new0(AcpiMcfgInfo, 1);
> > >>>> +        mcfg->next = NULL;
> > >>>> +        if (!head) {
> > >>>> +            tail = head = mcfg;
> > >>>> +        } else {
> > >>>> +            tail->next = mcfg;
> > >>>> +            tail = mcfg;
> > >>>> +        }
> > >>>> +
> > >>>> +        o = object_property_get_qobject(pci_host,
> > >>>> PCIE_HOST_MCFG_BASE, NULL);
> > >>>> +        if (!o) {
> > >>>> +            cleanup_mcfg(head);
> > >>>> +            g_free(mcfg);
> > >>>> +            return NULL;
> > >>>> +        }
> > >>>> +        mcfg->mcfg_base = qnum_get_uint(qobject_to(QNum, o));
> > >>>> +        qobject_unref(o);
> > >>>> +
> > >>>> +        o = object_property_get_qobject(pci_host,
> > >>>> PCIE_HOST_MCFG_SIZE, NULL);  
> > >>> I'll let Igor comment on the ACPI bits, but the idea of querying each
> > >>> PCI host for the MMCFG details and putting it into a different table
> > >>> sounds good to me.
> > >>>
> > >>> [Adding Laszlo for his insights]  
> > >> Thanks for the CC -- I don't have many (positive) insights here to
> > >> offer, I'm afraid. First, the patch set (including the blurb) doesn't
> > >> seem to explain *why* multiple host bridges / ECAM areas are a good
> > >> idea.  
> > > 
> > > The issue we want to solve is the hard limitation of 256 PCIe devices
> > > that can be used in a Q35 machine.
> 
> Isn't it interesting that conventional PCI can easily support so many
> more devices?  Sorta makes you wonder why we care that virtual devices
> are express rather than conventional for a high density configuration...

Because they are express in real life?

> > Implying that there are use cases for which ~256 PCIe devices aren't
> > enough. I remain unconvinced until proved wrong :)
> > 
> > Anyway, a significant source of waste comes from the restriction that we
> > can only put 1 device (with up to 8 functions) on each non-root-complex
> > PCI Express bus (such as root ports and downstream ports). This forfeits
> > a huge portion of the ECAM area (about 31/32th) that we already have.
> > Rather than spending more MMIO guest-phys address range on new
> > discontiguous ECAM ranges, I'd prefer if we could look into ARI. I seem
> > to recall from your earlier presentation that ARI could recover that
> > lost address space (meaning both ECAM ranges and PCIe B/D/F address space).
> 
> How does ARI solve the hotplug problem?  ARI is effectively
> multifunction on steroids, the ARI capability in each function points
> to the next function number so that we don't need to scan the entire
> devfn address space per bus (an inefficiency we don't care about when
> there are only 8 functions).  So yeah, we can fill an entire bus with
> devices with ARI, but they're all rooted at 00.0.
>  
> > There are signs that the edk2 core supports ARI if the underlying
> > platform supports it. (Which is not the case with multiple PCIe domains
> > / multiple ECAM ranges.)
> 
> It's pretty surprising to me that edk2 wouldn't already have support
> for multiple PCIe domains, they're really not *that* uncommon.  Some
> architectures make almost gratuitous use of PCIe domains.  I certainly
> know there were UEFI ia64 platforms with multiple domains.
> 
> > ARI support could also help aarch64/virt. Eric (CC'd) has been working
> > on raising the max PCIe bus count from *16* to 256 for aarch64/virt, and
> > AFAIR one of the challenges there has been finding a good spot for the
> > larger ECAM in guest-phys address space. Fiddling with such address maps
> > is always a pain.
> > 
> > Back to x86, the guest-phys address space is quite crowded too. The
> > 32-bit PCI MMIO aperture (with the neighboring ECAM and platform MMIO
> > areas such as LAPIC, IO-APIC, HPET, TPM, pflash, ...) is always a scarce
> > resource. Plus, reaching agreement between OVMF and QEMU on the exact
> > location of the 32-bit PCI MMIO aperture has always been a huge pain; so
> > you'd likely shoot for 64-bit.
> 
> Why do we need to equally split 32-bit MMIO space between domains?  Put
> it on domain 0 and require devices installed into non-zero domains have
> no 32-bit dependencies.

+1

> > But 64-bit is ill-partitioned and/or crowded too: first you have the
> > cold-plugged >4GB DRAM (whose size the firmware can interrogate), then
> > the hotplug DIMM area up to "etc/reserved-memory-end" (ditto), then the
> > 64-bit PCI MMIO aperture (whose size the firmware decides itself,
> > because QEMU cannot be asked about it presently). Placing the additional
> > MMCFGs somewhere would need extending the total order between these
> > things at the design level, more fw_cfg files, more sanity checks in
> > platform code in the firmware, and, quite importantly, adding support to
> > multiple discontiguous MMCFGs to core edk2 code.
> 
> Hmm, we're doing something wrong if we can't figure out some standards
> within QEMU for placing per domain 64-bit MMIO and MMCFG ranges.

I thought in this patch it is done by firmware.

> > I don't know much about ARI but it already looks a whole lot more
> > attractive to me.
> > 
> > > We can use "PCI functions" to increase
> > > the number, but then we lose the hot-plugging granularity.
> > > 
> > > Currently pxb-pcie host bridges share the same PCI domain (0)
> > > bus numbers, so adding more of them will not result in more usable
> > > devices (each PCIe device resides on its own bus),
> > > but putting each of them on a different domain removes
> > > that limitation.
> > > 
> > > Another possible use (there is a good chance I am wrong, adding Alex to
> > > correct me),
> > > may be the "modeling" of a multi-function assigned device.
> > > Currently all the functions of an assigned device will be placed on
> > > different PCIe
> > > Root Ports "losing" the connection between them.
> > > 
> > > However, if we put all the functions of the same assigned device in the
> > > same
> > > PCI domain we may be able to "re-connect" them.  
> > 
> > OK -- why is that useful? What's the current practical problem with the
> > splitting you describe?
> 
> There are very few cases where this is useful, things like associating
> USB companion devices or translating a guest bus reset to host bus
> reset when functions are split to separate virtual buses.  That said, I
> have no idea how multiple domains plays a factor here.  Regardless of
> how many PCIe domains a VM supports, we can easily use existing
> multifunction within a single domain to expose multifunction assigned
> devices in a way that better resembles the physical hardware.
> Multifunction PCIe endpoints cannot span PCIe domains.
> 
> In fact, IOMMUs generally cannot span domains either, so we better also
> be thinking about at least a VT-d DRHD or vIOMMU per PCIe domain.
>  
> > >>   Second, supporting such would take very intrusive patches / large
> > >> feature development for OVMF (and that's not just for OvmfPkg and
> > >> ArmVirtPkg platform code, but also core MdeModulePkg code).
> > >>
> > >> Obviously I'm not saying this feature should not be implemented for QEMU
> > >> + SeaBIOS, just that don't expect edk2 support for it anytime soon.
> > >> Without a convincing use case, even the review impact seems hard to
> > >> justify.  
> > > 
> > > I understand, thank you for the feedback, the point was to be sure
> > > we don't decide something that *can't be done* in OVMF.  
> > 
> > Semantics :) Obviously, everything "can be done" in software; that's why
> > it's *soft*ware. Who is going to write it in practice? It had taken
> > years until the edk2 core gained a universal PciHostBridgeDxe driver
> > with a well-defined platform customization interface, and that interface
> > doesn't support multiple domains / segments. It had also taken years
> > until the same edk2 core driver gained support for nonzero MMIO address
> > translation (i.e. where the CPU view of system RAM addresses differs
> > from the PCI device view of the same, for DMA purposes) -- and that's
> > just a linear translation, not even about IOMMUs. The PCI core
> > maintainers in edk2 are always very busy, and upstreaming such changes
> > tends to take forever.
> 
> Wow, that's surprising, ia64 was doing all of that on UEFI platforms a
> decade ago.
>  
> > Again, I'm not opposing this feature per se. I just see any potential
> > support for it in edk2 very difficult. I remain unconvinced that ~256
> > PCIe devices aren't enough. Even assuming I'm proved plain wrong there,
> > I'd still much prefer ARI (although I don't know much about it at this
> > point), because (a) the edk2 core already shows signs that it intends to
> > support ARI, (b) ARI would help aarch64/virt with nearly the same
> > effort, (c) it seems that ARI would address the problem (namely, wasting
> > MMCFG) head-on, rather than throwing yet more MMCFG at it.
> 
> -ENOHOTPLUG  From an assigned device perspective, this also implies
> that we're potentially adding a PCIe extended capability that may not
> exist on the physical device to the virtual view of the device.
> There's no guarantee that we can place that capability in the devices
> extended config space without stepping on registers that the hardware
> has hidden between the capabilities (for real, GPUs have registers in
> extended config space that aren't covered by capabilities) or the
> unlikely event that there's no more room in extended config space.
> Thanks,
> 
> Alex

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [Qemu-devel] [RFC 3/3] acpi-build: allocate mcfg for multiple host bridges
  2018-05-22 20:58           ` Michael S. Tsirkin
@ 2018-05-22 21:36             ` Alex Williamson
  2018-05-22 21:44               ` Michael S. Tsirkin
  0 siblings, 1 reply; 46+ messages in thread
From: Alex Williamson @ 2018-05-22 21:36 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Laszlo Ersek, Marcel Apfelbaum, Zihan Yang, qemu-devel,
	Igor Mammedov, Eric Auger, Drew Jones, Wei Huang

On Tue, 22 May 2018 23:58:30 +0300
"Michael S. Tsirkin" <mst@redhat.com> wrote: 
>
> It's not hard to think of a use-case where >256 devices
> are helpful, for example a nested virt scenario where
> each device is passed on to a different nested guest.
>
> But I think the main feature this is needed for is numa modeling.
> Guests seem to assume a numa node per PCI root, ergo we need more PCI
> roots.

But even if we have NUMA affinity per PCI host bridge, a PCI host
bridge does not necessarily imply a new PCIe domain.  Nearly any Intel
multi-socket system proves this.  Don't get me wrong, I'd like to see
PCIe domain support and I'm surprised edk2 is so far from supporting
it, but other than >256 hotpluggable slots, I'm having trouble coming
up with use cases.  Maybe hotpluggable PCI root hierarchies are easier
with separate domains?  Thanks,

Alex

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [Qemu-devel] [RFC 3/3] acpi-build: allocate mcfg for multiple host bridges
  2018-05-22 21:36             ` Alex Williamson
@ 2018-05-22 21:44               ` Michael S. Tsirkin
  2018-05-22 21:47                 ` Alex Williamson
  0 siblings, 1 reply; 46+ messages in thread
From: Michael S. Tsirkin @ 2018-05-22 21:44 UTC (permalink / raw)
  To: Alex Williamson
  Cc: Laszlo Ersek, Marcel Apfelbaum, Zihan Yang, qemu-devel,
	Igor Mammedov, Eric Auger, Drew Jones, Wei Huang

On Tue, May 22, 2018 at 03:36:59PM -0600, Alex Williamson wrote:
> On Tue, 22 May 2018 23:58:30 +0300
> "Michael S. Tsirkin" <mst@redhat.com> wrote: 
> >
> > It's not hard to think of a use-case where >256 devices
> > are helpful, for example a nested virt scenario where
> > each device is passed on to a different nested guest.
> >
> > But I think the main feature this is needed for is numa modeling.
> > Guests seem to assume a numa node per PCI root, ergo we need more PCI
> > roots.
> 
> But even if we have NUMA affinity per PCI host bridge, a PCI host
> bridge does not necessarily imply a new PCIe domain.

What are you calling a PCIe domain?

>  Nearly any Intel
> multi-socket system proves this.  Don't get me wrong, I'd like to see
> PCIe domain support and I'm surprised edk2 is so far from supporting
> it, but other than >256 hotpluggable slots, I'm having trouble coming
> up with use cases.  Maybe hotpluggable PCI root hierarchies are easier
> with separate domains?  Thanks,
> 
> Alex

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [Qemu-devel] [RFC 3/3] acpi-build: allocate mcfg for multiple host bridges
  2018-05-22 21:44               ` Michael S. Tsirkin
@ 2018-05-22 21:47                 ` Alex Williamson
  2018-05-22 22:00                   ` Laszlo Ersek
  2018-05-22 23:38                   ` Michael S. Tsirkin
  0 siblings, 2 replies; 46+ messages in thread
From: Alex Williamson @ 2018-05-22 21:47 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Laszlo Ersek, Marcel Apfelbaum, Zihan Yang, qemu-devel,
	Igor Mammedov, Eric Auger, Drew Jones, Wei Huang

On Wed, 23 May 2018 00:44:22 +0300
"Michael S. Tsirkin" <mst@redhat.com> wrote:

> On Tue, May 22, 2018 at 03:36:59PM -0600, Alex Williamson wrote:
> > On Tue, 22 May 2018 23:58:30 +0300
> > "Michael S. Tsirkin" <mst@redhat.com> wrote:   
> > >
> > > It's not hard to think of a use-case where >256 devices
> > > are helpful, for example a nested virt scenario where
> > > each device is passed on to a different nested guest.
> > >
> > > But I think the main feature this is needed for is numa modeling.
> > > Guests seem to assume a numa node per PCI root, ergo we need more PCI
> > > roots.  
> > 
> > But even if we have NUMA affinity per PCI host bridge, a PCI host
> > bridge does not necessarily imply a new PCIe domain.  
> 
> What are you calling a PCIe domain?

Domain/segment

0000:00:00.0
^^^^ This

Isn't that the only reason we'd need a new MCFG section and the reason
we're limited to 256 buses?  Thanks,

Alex

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [Qemu-devel] [RFC 3/3] acpi-build: allocate mcfg for multiple host bridges
  2018-05-22 21:17           ` Alex Williamson
  2018-05-22 21:22             ` Michael S. Tsirkin
@ 2018-05-22 21:50             ` Laszlo Ersek
  2018-05-23 17:00             ` Marcel Apfelbaum
  2 siblings, 0 replies; 46+ messages in thread
From: Laszlo Ersek @ 2018-05-22 21:50 UTC (permalink / raw)
  To: Alex Williamson
  Cc: Marcel Apfelbaum, Zihan Yang, qemu-devel, Wei Huang, Drew Jones,
	Michael S. Tsirkin, Eric Auger, Igor Mammedov

On 05/22/18 23:17, Alex Williamson wrote:
> On Tue, 22 May 2018 21:51:47 +0200
> Laszlo Ersek <lersek@redhat.com> wrote:

Thanks Michael and Alex for the education on ARI.

I'd just like to comment on one sub-topic:

>> There are signs that the edk2 core supports ARI if the underlying
>> platform supports it. (Which is not the case with multiple PCIe domains
>> / multiple ECAM ranges.)
> 
> It's pretty surprising to me that edk2 wouldn't already have support
> for multiple PCIe domains, they're really not *that* uncommon.  Some
> architectures make almost gratuitous use of PCIe domains.  I certainly
> know there were UEFI ia64 platforms with multiple domains.

>> Semantics :) Obviously, everything "can be done" in software; that's why
>> it's *soft*ware. Who is going to write it in practice? It had taken
>> years until the edk2 core gained a universal PciHostBridgeDxe driver
>> with a well-defined platform customization interface, and that interface
>> doesn't support multiple domains / segments. It had also taken years
>> until the same edk2 core driver gained support for nonzero MMIO address
>> translation (i.e. where the CPU view of system RAM addresses differs
>> from the PCI device view of the same, for DMA purposes) -- and that's
>> just a linear translation, not even about IOMMUs. The PCI core
>> maintainers in edk2 are always very busy, and upstreaming such changes
>> tends to take forever.
> 
> Wow, that's surprising, ia64 was doing all of that on UEFI platforms a
> decade ago.

Plenty of physical PI/UEFI platforms support multiple PCIe domains
(similarly to how plenty of physical PI/UEFI platforms support CPU
hotplug with SMM).

None of that is open source, to my knowledge, so it might as well not
exist. EDK2 purposely does not accept any contributions under the GPL.
Convincing proprietary downstreams to open up and upstream their code is
nigh impossible, and even when it succeeds, it takes absolutely forever.

Technically it also happens that a proprietary downstream contributes an
independent (likely more limited) open source variant of a feature that
their physical platforms support -- assuming they see (or are shown)
value in allocating resources to such a contribution. This is very rare,
and I'm including it for technical correctness.

For open source edk2 this would be a development from zero; it would
even conflict with some assumptions in edk2.

(On May 4th I submitted a small library to core edk2 that parses the
capabilities lists of a device and puts the capabilities into an
associative array (basically a dictionary). So that we wouldn't have to
open-code yet more caplist parsing loops when looking for specific
capabilities. In 18 days I've pinged once and haven't gotten any
technical feedback. I'm not even annoyed because I know they simply
don't have time for reviewing larger than halfway trivial patches.)

Thanks,
Laszlo

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [Qemu-devel] [RFC 3/3] acpi-build: allocate mcfg for multiple host bridges
  2018-05-22 21:22             ` Michael S. Tsirkin
@ 2018-05-22 21:58               ` Laszlo Ersek
  0 siblings, 0 replies; 46+ messages in thread
From: Laszlo Ersek @ 2018-05-22 21:58 UTC (permalink / raw)
  To: Michael S. Tsirkin, Alex Williamson
  Cc: Marcel Apfelbaum, Zihan Yang, qemu-devel, Wei Huang, Drew Jones,
	Eric Auger, Igor Mammedov

On 05/22/18 23:22, Michael S. Tsirkin wrote:
> On Tue, May 22, 2018 at 03:17:32PM -0600, Alex Williamson wrote:
>> On Tue, 22 May 2018 21:51:47 +0200
>> Laszlo Ersek <lersek@redhat.com> wrote:

>>> But 64-bit is ill-partitioned and/or crowded too: first you have the
>>> cold-plugged >4GB DRAM (whose size the firmware can interrogate), then
>>> the hotplug DIMM area up to "etc/reserved-memory-end" (ditto), then the
>>> 64-bit PCI MMIO aperture (whose size the firmware decides itself,
>>> because QEMU cannot be asked about it presently). Placing the additional
>>> MMCFGs somewhere would need extending the total order between these
>>> things at the design level, more fw_cfg files, more sanity checks in
>>> platform code in the firmware, and, quite importantly, adding support to
>>> multiple discontiguous MMCFGs to core edk2 code.
>>
>> Hmm, we're doing something wrong if we can't figure out some standards
>> within QEMU for placing per domain 64-bit MMIO and MMCFG ranges.
> 
> I thought in this patch it is done by firmware.

Sure, the firmware programs the exbar registers already, but it needs to
rely on some information to figure out the values it should program.
(Also, the information should not arrive in the form of *just* an ACPI
table; that's too hard and/or too late to parse for the firmware.) Right
now the "information" is a hard-coded constant that's known not to
conflict with other parts.

Furthermore, I didn't highlight it earlier, but we can't go arbitrarily
high -- if EPT is enabled, then the physical address width of the host
CPU limits the guest-physical memory address space (incl. memory mapped
devices).

Thanks
Laszlo
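
For concreteness, a minimal standalone sketch of the kind of placement
sanity check this implies; the 40-bit width and the base/size values are
made-up numbers, and a real implementation would query the host CPU's
physical address bits rather than hard-code them:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Reject an MMCFG placement the guest could not address when EPT limits
 * guest-physical width to the host CPU's physical address bits. */
static bool mmcfg_placement_ok(uint64_t base, uint64_t size, unsigned phys_bits)
{
    uint64_t limit = 1ULL << phys_bits;        /* first unaddressable byte */
    return size > 0 && base + size > base      /* no 64-bit wrap-around    */
        && base + size <= limit;
}

int main(void)
{
    /* Made-up example: a 256 MiB ECAM window proposed at 768 GiB,
     * on a host reporting 40 physical address bits. */
    uint64_t base = 768ULL << 30;
    uint64_t size = 256ULL << 20;
    printf("placement ok: %d\n", mmcfg_placement_ok(base, size, 40));
    return 0;
}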

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [Qemu-devel] [RFC 3/3] acpi-build: allocate mcfg for multiple host bridges
  2018-05-22 21:47                 ` Alex Williamson
@ 2018-05-22 22:00                   ` Laszlo Ersek
  2018-05-22 23:38                   ` Michael S. Tsirkin
  1 sibling, 0 replies; 46+ messages in thread
From: Laszlo Ersek @ 2018-05-22 22:00 UTC (permalink / raw)
  To: Alex Williamson, Michael S. Tsirkin
  Cc: Marcel Apfelbaum, Zihan Yang, qemu-devel, Igor Mammedov,
	Eric Auger, Drew Jones, Wei Huang

On 05/22/18 23:47, Alex Williamson wrote:
> On Wed, 23 May 2018 00:44:22 +0300
> "Michael S. Tsirkin" <mst@redhat.com> wrote:
> 
>> On Tue, May 22, 2018 at 03:36:59PM -0600, Alex Williamson wrote:
>>> On Tue, 22 May 2018 23:58:30 +0300
>>> "Michael S. Tsirkin" <mst@redhat.com> wrote:   
>>>>
>>>> It's not hard to think of a use-case where >256 devices
>>>> are helpful, for example a nested virt scenario where
>>>> each device is passed on to a different nested guest.
>>>>
>>>> But I think the main feature this is needed for is numa modeling.
>>>> Guests seem to assume a numa node per PCI root, ergo we need more PCI
>>>> roots.  
>>>
>>> But even if we have NUMA affinity per PCI host bridge, a PCI host
>>> bridge does not necessarily imply a new PCIe domain.  
>>
>> What are you calling a PCIe domain?
> 
> Domain/segment
> 
> 0000:00:00.0
> ^^^^ This
> 
> Isn't that the only reason we'd need a new MCFG section and the reason
> we're limited to 256 buses?  Thanks,

(Just to confirm: this matches my understanding of the thread as well.)

Laszlo

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [Qemu-devel] [RFC 3/3] acpi-build: allocate mcfg for multiple host bridges
  2018-05-22 19:51         ` Laszlo Ersek
  2018-05-22 20:58           ` Michael S. Tsirkin
  2018-05-22 21:17           ` Alex Williamson
@ 2018-05-22 22:42           ` Laszlo Ersek
  2018-05-22 23:40             ` Michael S. Tsirkin
  2 siblings, 1 reply; 46+ messages in thread
From: Laszlo Ersek @ 2018-05-22 22:42 UTC (permalink / raw)
  To: Marcel Apfelbaum, Zihan Yang, qemu-devel
  Cc: Michael S. Tsirkin, Igor Mammedov, Alex Williamson, Eric Auger,
	Drew Jones, Wei Huang

Hold on,

On 05/22/18 21:51, Laszlo Ersek wrote:

> It had taken years until the edk2 core gained a universal
> PciHostBridgeDxe driver with a well-defined platform customization
> interface, and that interface doesn't support multiple domains /
> segments.

after doing a bit more research: I was wrong about this. What I
remembered was not the current state. Edk2 does seem to support multiple
domains, with different segment numbers, ECAM base addresses, and bus
number ranges. If we figure out a placement strategy or an easy to
consume representation of these data for the firmware, it might be
possible for OVMF to hook them into the edk2 core (although not in the
earliest firmware phases, such as SEC and PEI).

In retrospect, I'm honestly surprised that so much multi-segment support
has been upstreamed to the edk2 core. Sorry about the FUD. (My general
points remain valid, for the record... But perhaps they no longer matter
for this discussion.)

(I meant to send this message soon after
<http://mid.mail-archive.com/fc603491-7c41-e862-a583-2bae6f165b5a@redhat.com>,
but my internet connection had to die right then.)

Thanks
Laszlo

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [Qemu-devel] [RFC 3/3] acpi-build: allocate mcfg for multiple host bridges
  2018-05-22 21:47                 ` Alex Williamson
  2018-05-22 22:00                   ` Laszlo Ersek
@ 2018-05-22 23:38                   ` Michael S. Tsirkin
  2018-05-23  4:28                     ` Alex Williamson
  1 sibling, 1 reply; 46+ messages in thread
From: Michael S. Tsirkin @ 2018-05-22 23:38 UTC (permalink / raw)
  To: Alex Williamson
  Cc: Laszlo Ersek, Marcel Apfelbaum, Zihan Yang, qemu-devel,
	Igor Mammedov, Eric Auger, Drew Jones, Wei Huang

On Tue, May 22, 2018 at 03:47:41PM -0600, Alex Williamson wrote:
> On Wed, 23 May 2018 00:44:22 +0300
> "Michael S. Tsirkin" <mst@redhat.com> wrote:
> 
> > On Tue, May 22, 2018 at 03:36:59PM -0600, Alex Williamson wrote:
> > > On Tue, 22 May 2018 23:58:30 +0300
> > > "Michael S. Tsirkin" <mst@redhat.com> wrote:   
> > > >
> > > > It's not hard to think of a use-case where >256 devices
> > > > are helpful, for example a nested virt scenario where
> > > > each device is passed on to a different nested guest.
> > > >
> > > > But I think the main feature this is needed for is numa modeling.
> > > > Guests seem to assume a numa node per PCI root, ergo we need more PCI
> > > > roots.  
> > > 
> > > But even if we have NUMA affinity per PCI host bridge, a PCI host
> > > bridge does not necessarily imply a new PCIe domain.  
> > 
> > What are you calling a PCIe domain?
> 
> Domain/segment
> 
> 0000:00:00.0
> ^^^^ This

Right. So we can thinkably have PCIe root complexes share an ACPI segment.
I don't see what this buys us by itself.

> Isn't that the only reason we'd need a new MCFG section and the reason
> we're limited to 256 buses?  Thanks,
> 
> Alex

I don't know whether a single MCFG section can describe multiple roots.
I think it would be certainly unusual.

-- 
MST

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [Qemu-devel] [RFC 3/3] acpi-build: allocate mcfg for multiple host bridges
  2018-05-22 22:42           ` Laszlo Ersek
@ 2018-05-22 23:40             ` Michael S. Tsirkin
  2018-05-23  7:32               ` Laszlo Ersek
  0 siblings, 1 reply; 46+ messages in thread
From: Michael S. Tsirkin @ 2018-05-22 23:40 UTC (permalink / raw)
  To: Laszlo Ersek
  Cc: Marcel Apfelbaum, Zihan Yang, qemu-devel, Igor Mammedov,
	Alex Williamson, Eric Auger, Drew Jones, Wei Huang

On Wed, May 23, 2018 at 12:42:09AM +0200, Laszlo Ersek wrote:
> Hold on,
> 
> On 05/22/18 21:51, Laszlo Ersek wrote:
> 
> > It had taken years until the edk2 core gained a universal
> > PciHostBridgeDxe driver with a well-defined platform customization
> > interface, and that interface doesn't support multiple domains /
> > segments.
> 
> after doing a bit more research: I was wrong about this. What I
> remembered was not the current state. Edk2 does seem to support multiple
> domains, with different segment numbers, ECAM base addresses, and bus
> number ranges. If we figure out a placement strategy or an easy to
> consume representation of these data for the firmware, it might be
> possible for OVMF to hook them into the edk2 core (although not in the
> earliest firmware phases, such as SEC and PEI).
> 
> In retrospect, I'm honestly surprised that so much multi-segment support
> has been upstreamed to the edk2 core. Sorry about the FUD. (My general
> points remain valid, for the record... But perhaps they no longer matter
> for this discussion.)
> 
> (I meant to send this message soon after
> <http://mid.mail-archive.com/fc603491-7c41-e862-a583-2bae6f165b5a@redhat.com>,
> but my internet connection had to die right then.)
> 
> Thanks
> Laszlo

Is there support for any hardware which we could emulate?

-- 
MST

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [Qemu-devel] [RFC 3/3] acpi-build: allocate mcfg for multiple host bridges
  2018-05-22 23:38                   ` Michael S. Tsirkin
@ 2018-05-23  4:28                     ` Alex Williamson
  2018-05-23 14:25                       ` Michael S. Tsirkin
  0 siblings, 1 reply; 46+ messages in thread
From: Alex Williamson @ 2018-05-23  4:28 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Laszlo Ersek, Marcel Apfelbaum, Zihan Yang, qemu-devel,
	Igor Mammedov, Eric Auger, Drew Jones, Wei Huang

On Wed, 23 May 2018 02:38:52 +0300
"Michael S. Tsirkin" <mst@redhat.com> wrote:

> On Tue, May 22, 2018 at 03:47:41PM -0600, Alex Williamson wrote:
> > On Wed, 23 May 2018 00:44:22 +0300
> > "Michael S. Tsirkin" <mst@redhat.com> wrote:
> >   
> > > On Tue, May 22, 2018 at 03:36:59PM -0600, Alex Williamson wrote:  
> > > > On Tue, 22 May 2018 23:58:30 +0300
> > > > "Michael S. Tsirkin" <mst@redhat.com> wrote:     
> > > > >
> > > > > It's not hard to think of a use-case where >256 devices
> > > > > are helpful, for example a nested virt scenario where
> > > > > each device is passed on to a different nested guest.
> > > > >
> > > > > But I think the main feature this is needed for is numa modeling.
> > > > > Guests seem to assume a numa node per PCI root, ergo we need more PCI
> > > > > roots.    
> > > > 
> > > > But even if we have NUMA affinity per PCI host bridge, a PCI host
> > > > bridge does not necessarily imply a new PCIe domain.    
> > > 
> > > What are you calling a PCIe domain?  
> > 
> > Domain/segment
> > 
> > 0000:00:00.0
> > ^^^^ This  
> 
> Right. So we can thinkably have PCIe root complexes share an ACPI segment.
> I don't see what this buys us by itself.

The ability to define NUMA locality for a PCI sub-hierarchy while
maintaining compatibility with non-segment aware OSes (and firmware).

> > Isn't that the only reason we'd need a new MCFG section and the reason
> > we're limited to 256 buses?  Thanks,
> > 
> > Alex  
> 
> I don't know whether a single MCFG section can describe multiple roots.
> I think it would be certainly unusual.

I'm not sure here if you're referring to the actual MCFG ACPI table or
the MMCONFIG range, aka the ECAM.  Neither of these describe PCI host
bridges.  The MCFG table can describe one or more ECAM ranges, which
provides the ECAM base address, the PCI segment associated with that
ECAM and the start and end bus numbers to know the offset and extent of
the ECAM range.  PCI host bridges would then theoretically be separate
ACPI objects with _SEG and _BBN methods to associate them to the
correct ECAM range by segment number and base bus number.  So it seems
that tooling exists that an ECAM/MMCONFIG range could be provided per
PCI host bridge, even if they exist within the same domain, but in
practice what I see on systems I have access to is a single MMCONFIG
range supporting all of the host bridges.  It also seems there are
numerous ways to describe the MMCONFIG range and I haven't actually
found an example that seems to use the MCFG table.  Two have MCFG
tables (that don't seem terribly complete) and the kernel claims to
find the MMCONFIG via e820, another doesn't even have an MCFG table and
the kernel claims to find MMCONFIG via an ACPI motherboard resource.
I'm not sure if I can enable PCI segments on anything to see how the
firmware changes.  Thanks,

Alex
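
For concreteness, a minimal standalone sketch of how one MCFG allocation
entry (ECAM base, segment, start/end bus) translates a seg:bus:dev.fn into
a config-space address; the struct, its field names and the example base
address are illustrative, not QEMU code:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative only: mirrors one MCFG allocation entry as described above
 * (ECAM base address, PCI segment, start/end bus numbers). */
typedef struct {
    uint64_t ecam_base;
    uint16_t segment;      /* matches the host bridge's _SEG */
    uint8_t  start_bus;    /* matches the host bridge's _BBN */
    uint8_t  end_bus;
} McfgAlloc;

/* ECAM layout: 4 KiB per function, 8 functions per device, 32 devices per bus. */
static uint64_t ecam_cfg_addr(const McfgAlloc *m, uint8_t bus,
                              uint8_t dev, uint8_t fn, uint16_t reg)
{
    return m->ecam_base
         + ((uint64_t)(bus - m->start_bus) << 20)
         + ((uint64_t)(dev & 0x1f) << 15)
         + ((uint64_t)(fn  & 0x7)  << 12)
         + (reg & 0xfff);
}

int main(void)
{
    /* Hypothetical second segment with its own 256 MiB ECAM window. */
    McfgAlloc seg1 = { .ecam_base = 0x280000000ULL, .segment = 1,
                       .start_bus = 0x00, .end_bus = 0xff };
    /* Config address of 0001:00:02.0, register 0x10 (BAR0). */
    printf("0x%" PRIx64 "\n", ecam_cfg_addr(&seg1, 0x00, 0x02, 0x0, 0x10));
    return 0;
}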

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [Qemu-devel] [RFC 3/3] acpi-build: allocate mcfg for multiple host bridges
  2018-05-22 23:40             ` Michael S. Tsirkin
@ 2018-05-23  7:32               ` Laszlo Ersek
  2018-05-23 11:11                 ` Zihan Yang
                                   ` (2 more replies)
  0 siblings, 3 replies; 46+ messages in thread
From: Laszlo Ersek @ 2018-05-23  7:32 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Marcel Apfelbaum, Zihan Yang, qemu-devel, Igor Mammedov,
	Alex Williamson, Eric Auger, Drew Jones, Wei Huang

On 05/23/18 01:40, Michael S. Tsirkin wrote:
> On Wed, May 23, 2018 at 12:42:09AM +0200, Laszlo Ersek wrote:
>> Hold on,
>>
>> On 05/22/18 21:51, Laszlo Ersek wrote:
>>
>>> It had taken years until the edk2 core gained a universal
>>> PciHostBridgeDxe driver with a well-defined platform customization
>>> interface, and that interface doesn't support multiple domains /
>>> segments.
>>
>> after doing a bit more research: I was wrong about this. What I
>> remembered was not the current state. Edk2 does seem to support multiple
>> domains, with different segment numbers, ECAM base addresses, and bus
>> number ranges. If we figure out a placement strategy or an easy to
>> consume representation of these data for the firmware, it might be
>> possible for OVMF to hook them into the edk2 core (although not in the
>> earliest firmware phases, such as SEC and PEI).
>>
>> In retrospect, I'm honestly surprised that so much multi-segment support
>> has been upstreamed to the edk2 core. Sorry about the FUD. (My general
>> points remain valid, for the record... But perhaps they no longer matter
>> for this discussion.)
>>
>> (I meant to send this message soon after
>> <http://mid.mail-archive.com/fc603491-7c41-e862-a583-2bae6f165b5a@redhat.com>,
>> but my internet connection had to die right then.)
>>
>> Thanks
>> Laszlo
> 
> Is there support for any hardware which we could emulate?

I don't see any actual hw support in the edk2 project, but I'll ask.

Thanks
Laszlo

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [Qemu-devel] [RFC 3/3] acpi-build: allocate mcfg for multiple host bridges
  2018-05-23  7:32               ` Laszlo Ersek
@ 2018-05-23 11:11                 ` Zihan Yang
  2018-05-23 12:28                   ` Laszlo Ersek
  2018-05-23 17:33                   ` Marcel Apfelbaum
  2018-05-23 17:11                 ` Marcel Apfelbaum
  2018-05-28 11:02                 ` Laszlo Ersek
  2 siblings, 2 replies; 46+ messages in thread
From: Zihan Yang @ 2018-05-23 11:11 UTC (permalink / raw)
  To: lersek, qemu-devel
  Cc: Marcel Apfelbaum, Igor Mammedov, alex.williamson, eauger, drjones, wei

Hi all,

Thanks for all your comments and suggestions, I wasn't expecting so many
professional reviewers. Some of the things you mentioned are beyond my
knowledge right now. Please correct me if I'm wrong below.

The original purpose was just to support multiple segments in the Intel
Q35 architecture for PCIe topology, which makes bus numbers a less scarce
resource. The patches are very primitive and many things are left for
firmware to finish (the initial plan was to implement it in SeaBIOS), and
the AML part in QEMU is not finished either. I'm not familiar with OVMF
or edk2, so there is no plan to touch it yet, but it seems not necessary
since it already supports multi-segment in the end.

Also, in this patch the assumption is one domain per host bridge,
described by '_SEG()' in AML, which means an ECAM range per host bridge,
but that should be configurable if the user prefers to stay in the same
domain, I guess?

I'd like to list a few things you've discussed, to confirm I don't get
them wrong:

* ARI enlarges the number of functions, but it does not solve the
  hot-plug issue. Multifunction PCIe endpoints cannot span PCIe domains.
* IOMMUs cannot span domains either, so introducing new domains brings
  the need for a VT-d DRHD or vIOMMU per PCIe domain.
* 64-bit space is crowded and there are no standards within QEMU for
  placing per-domain 64-bit MMIO and MMCFG ranges.
* NUMA modeling seems to be a stronger motivation than the 256 bus number
  limitation, i.e. each NUMA node holds its own PCI(e) sub-hierarchy.
* We cannot put ECAM arbitrarily high because the guest's PA width is
  limited by the host's when EPT is enabled.
* There are compatibility issues on platforms that have no MCFG table at
  all (perhaps we limit this to q35 for now, where MCFG is present).

Based on your discussions, I guess this proposal is still worth doing
overall, but it seems many restrictions should be imposed to be
compatible with some complicated situations.

Please correct me if I got anything wrong or missed anything.

Thanks
Zihan
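
For what it's worth, a rough sketch of the shape the unfinished AML part
could take -- one PNP0A08 host bridge object per extra segment, with its
own _SEG/_BBN -- using QEMU's aml_*() helpers. This is an assumption
about how such code might eventually look, not something taken from these
patches, and _CRS (bus number / MMIO range) generation is omitted:

#include "hw/acpi/aml-build.h"   /* QEMU-internal AML helpers */

/* Hypothetical helper, not from the RFC patches: declare one extra PCIe
 * host bridge object with its own segment, in the style of acpi-build.c. */
static void build_pxb_host_aml(Aml *dsdt, int seg, int root_bus_nr)
{
    Aml *dev = aml_device("PC%.02X", root_bus_nr);

    aml_append(dev, aml_name_decl("_HID", aml_string("PNP0A08"))); /* PCIe root  */
    aml_append(dev, aml_name_decl("_CID", aml_string("PNP0A03")));
    aml_append(dev, aml_name_decl("_SEG", aml_int(seg)));   /* new PCI segment   */
    aml_append(dev, aml_name_decl("_BBN", aml_int(0)));     /* base bus in it    */
    aml_append(dev, aml_name_decl("_UID", aml_int(seg)));
    /* _CRS with the bus number / MMIO ranges for this segment would go here. */

    aml_append(dsdt, dev);
}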

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [Qemu-devel] [RFC 3/3] acpi-build: allocate mcfg for multiple host bridges
  2018-05-23 11:11                 ` Zihan Yang
@ 2018-05-23 12:28                   ` Laszlo Ersek
  2018-05-23 17:23                     ` Marcel Apfelbaum
  2018-05-24  9:57                     ` Zihan Yang
  2018-05-23 17:33                   ` Marcel Apfelbaum
  1 sibling, 2 replies; 46+ messages in thread
From: Laszlo Ersek @ 2018-05-23 12:28 UTC (permalink / raw)
  To: Zihan Yang, qemu-devel
  Cc: Marcel Apfelbaum, Igor Mammedov, alex.williamson, eauger, drjones, wei

On 05/23/18 13:11, Zihan Yang wrote:
> Hi all,

> The original purpose was just to support multiple segments in Intel
> Q35 architecture for PCIe topology, which makes bus numbers a less scarce
> resource. The patches are very primitive and many things are left for
> firmware to finish (the initial plan was to implement it in SeaBIOS),
> the AML part in QEMU is not finished either. I'm not familiar with
> OVMF or edk2, so there is no plan to touch it yet, but it seems not
> necessary since it already supports multi-segment in the end.

That's incorrect. EDK2 stands for "EFI Development Kit II", and it is a
collection of "universal" (= platform- and ISA-independent) modules
(drivers and libraries), and platform- and/or ISA-dependent modules
(drivers and libraries). The OVMF firmware is built from a subset of
these modules; the final firmware image includes modules from both
categories -- universal modules, and modules specific to the i440fx and
Q35 QEMU boards. The first category generally lives under MdePkg/,
MdeModulePkg/, UefiCpuPkg/, NetworkPkg/, PcAtChipsetPkg, etc; while the
second category lives under OvmfPkg/.

(The exact same applies to the ArmVirtQemu firmware, with the second
category consisting of ArmVirtPkg/ and OvmfPkg/ modules.)

When we discuss anything PCI-related in edk2, it usually affects both
categories:

(a) the universal/core modules, such as

  - the PCI host bridge / root bridge driver at
    "MdeModulePkg/Bus/Pci/PciHostBridgeDxe",

  - the PCI bus driver at "MdeModulePkg/Bus/Pci/PciBusDxe",

(b) and the platform-specific modules, such as

  - "OvmfPkg/IncompatiblePciDeviceSupportDxe" which causes PciBusDxe to
    allocate 64-bit MMIO BARs above 4 GB, regardless of option ROM
    availability (as long as a CSM is not present), conserving 32-bit
    MMIO aperture for 32-bit BARs,

  - "OvmfPkg/PciHotPlugInitDxe", which implements support for QEMU's
    resource reservation hints, so that we can avoid IO space exhaustion
    with many PCIe root ports, and so that we can reserve MMIO aperture
    for hot-plugging devices with large MMIO BARs,

  - "OvmfPkg/Library/DxePciLibI440FxQ35", which is a low-level PCI
    config space access library, usable in the DXE and later phases,
    that plugs into several drivers, and uses 0xCF8/0xCFC on i440x, and
    ECAM on Q35,

  - "OvmfPkg/Library/PciHostBridgeLib", which plugs into
    "PciHostBridgeDxe" above, exposing the various resource apertures to
    said host bridge / root bridge driver, and implementing support for
    the PXB / PXBe devices,

  - "OvmfPkg/PlatformPei", which is an early (PEI phase) module with a
    grab-bag of platform support code; e.g. it informs
    "DxePciLibI440FxQ35" above about the QEMU board being Q35 vs.
    i440fx, it configures the ECAM (exbar) registers on Q35, it
    determines where the 32-bit and 64-bit PCI MMIO apertures should be;

  - "ArmVirtPkg/Library/BaseCachingPciExpressLib", which is the
    aarch64/virt counterpart of "DxePciLibI440FxQ35" above,

  - "ArmVirtPkg/Library/FdtPciHostBridgeLib", which is the aarch64/virt
    counterpart of "PciHostBridgeLib", consuming the DTB exposed by
    qemu-system-aarch64,

  - "ArmVirtPkg/Library/FdtPciPcdProducerLib", which is an internal
    library that turns parts of the DTB that is exposed by
    qemu-system-aarch64 into various PCI-related, firmware-wide, scalar
    variables (called "PCDs"), upon which both
    "BaseCachingPciExpressLib" and "FdtPciHostBridgeLib" rely.

The point is that any PCI feature in any edk2 platform firmware comes
together from (a) core module support for the feature, and (b) platform
integration between the core code and the QEMU board in question.

If (a) is missing, that implies a very painful uphill battle, which is
why I'd been loudly whining, initially, in this thread, until I realized
that the core support was there in edk2, for PCIe segments.

However, (b) is required as well -- i.e., platform integration under
OvmfPkg/ and perhaps ArmVirtPkg/, between the QEMU boards and the core
edk2 code --, and that definitely doesn't exist for the PCIe segments
feature.

If (a) exists and is flexible enough, then we at least have a chance at
writing the platform support code (b) for it. So that's why I've stopped
whining. Writing (b) is never easy -- in this case, a great many of the
platform modules that I've listed above, under OvmfPkg/ pathnames, could
be affected, or even be eligible for replacement -- but (b) is at least
imaginable practice. Modules in category (a) are shipped *in* -- not
"on" -- every single physical UEFI platform that you can buy today,
which is one reason why it's hugely difficult to implement nontrivial
changes for them.

In brief: your statement is incorrect because category (b) is missing.
And that requires dedicated QEMU support, similarly to how
"OvmfPkg/PciHotPlugInitDxe" requires the vendor-specific resource
reservation capability, and how "OvmfPkg/Library/PciHostBridgeLib"
consumes the "etc/extra-pci-roots" fw_cfg file, and how most everything
that ArmVirtQemu does for PCI(e) originates from QEMU's DTB.

> * 64-bit space is crowded and there are no standards within QEMU for
>   placing per domain 64-bit MMIO and MMCFG ranges
> * We cannot put ECAM arbitrarily high because guest's PA width is
>   limited by host's when EPT is enabled.

That's right. One argument is that firmware can lay out these apertures
and ECAM ranges internally. But that argument breaks down when you hit
the PCPU physical address width, and would like the management stack,
such as libvirtd, to warn you in advance. For that, either libvirtd or
QEMU has to know, or direct, the layout.

> * NUMA modeling seems to be a stronger motivation than the limitation
>   of 256 but nubmers, that each NUMA node holds its own PCI(e)
>   sub-hierarchy

I'd also like to get more information about this -- I thought pxb-pci(e)
was already motivated by supporting NUMA locality. And, to my knowledge,
pxb-pci(e) actually *solved* this problem. Am I wrong? Let's say you
have 16 NUMA nodes (which seems pretty large to me); is it really
insufficient to assign ~16 devices to each node?

Thanks
Laszlo

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [Qemu-devel] [RFC 3/3] acpi-build: allocate mcfg for multiple host bridges
  2018-05-23  4:28                     ` Alex Williamson
@ 2018-05-23 14:25                       ` Michael S. Tsirkin
  2018-05-23 14:57                         ` Alex Williamson
  2018-05-23 16:50                         ` Marcel Apfelbaum
  0 siblings, 2 replies; 46+ messages in thread
From: Michael S. Tsirkin @ 2018-05-23 14:25 UTC (permalink / raw)
  To: Alex Williamson
  Cc: Laszlo Ersek, Marcel Apfelbaum, Zihan Yang, qemu-devel,
	Igor Mammedov, Eric Auger, Drew Jones, Wei Huang

On Tue, May 22, 2018 at 10:28:56PM -0600, Alex Williamson wrote:
> On Wed, 23 May 2018 02:38:52 +0300
> "Michael S. Tsirkin" <mst@redhat.com> wrote:
> 
> > On Tue, May 22, 2018 at 03:47:41PM -0600, Alex Williamson wrote:
> > > On Wed, 23 May 2018 00:44:22 +0300
> > > "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > >   
> > > > On Tue, May 22, 2018 at 03:36:59PM -0600, Alex Williamson wrote:  
> > > > > On Tue, 22 May 2018 23:58:30 +0300
> > > > > "Michael S. Tsirkin" <mst@redhat.com> wrote:     
> > > > > >
> > > > > > It's not hard to think of a use-case where >256 devices
> > > > > > are helpful, for example a nested virt scenario where
> > > > > > each device is passed on to a different nested guest.
> > > > > >
> > > > > > But I think the main feature this is needed for is numa modeling.
> > > > > > Guests seem to assume a numa node per PCI root, ergo we need more PCI
> > > > > > roots.    
> > > > > 
> > > > > But even if we have NUMA affinity per PCI host bridge, a PCI host
> > > > > bridge does not necessarily imply a new PCIe domain.    
> > > > 
> > > > What are you calling a PCIe domain?  
> > > 
> > > Domain/segment
> > > 
> > > 0000:00:00.0
> > > ^^^^ This  
> > 
> > Right. So we can thinkably have PCIe root complexes share an ACPI segment.
> > I don't see what this buys us by itself.
> 
> The ability to define NUMA locality for a PCI sub-hierarchy while
> maintaining compatibility with non-segment aware OSes (and firmware).

For sure, but NUMA is a kind of advanced topic, MCFG has been around for
longer than various NUMA tables. Are there really non-segment aware
guests that also know how to make use of NUMA?


> > > Isn't that the only reason we'd need a new MCFG section and the reason
> > > we're limited to 256 buses?  Thanks,
> > > 
> > > Alex  
> > 
> > I don't know whether a single MCFG section can describe multiple roots.
> > I think it would be certainly unusual.
> 
> I'm not sure here if you're referring to the actual MCFG ACPI table or
> the MMCONFIG range, aka the ECAM.  Neither of these describe PCI host
> bridges.  The MCFG table can describe one or more ECAM ranges, which
> provides the ECAM base address, the PCI segment associated with that
> ECAM and the start and end bus numbers to know the offset and extent of
> the ECAM range.  PCI host bridges would then theoretically be separate
> ACPI objects with _SEG and _BBN methods to associate them to the
> correct ECAM range by segment number and base bus number.  So it seems
> that tooling exists that an ECAM/MMCONFIG range could be provided per
> PCI host bridge, even if they exist within the same domain, but in
> practice what I see on systems I have access to is a single MMCONFIG
> range supporting all of the host bridges.  It also seems there are
> numerous ways to describe the MMCONFIG range and I haven't actually
> found an example that seems to use the MCFG table.  Two have MCFG
> tables (that don't seem terribly complete) and the kernel claims to
> find the MMCONFIG via e820, another doesn't even have an MCFG table and
> the kernel claims to find MMCONFIG via an ACPI motherboard resource.
> I'm not sure if I can enable PCI segments on anything to see how the
> firmware changes.  Thanks,
> 
> Alex

Let me clarify.  So MCFG have base address allocation structures.
Each maps a segment and a range of bus numbers into memory.
This structure is what I meant.

IIUC you are saying on your systems everything is within a single
segment, right? Multiple pci hosts map into a single segment?

If you do this you can do NUMA, but do not gain > 256 devices.

Are we on the same page then?

-- 
MST

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [Qemu-devel] [RFC 3/3] acpi-build: allocate mcfg for multiple host bridges
  2018-05-23 14:25                       ` Michael S. Tsirkin
@ 2018-05-23 14:57                         ` Alex Williamson
  2018-05-23 15:01                           ` Michael S. Tsirkin
  2018-05-23 16:50                         ` Marcel Apfelbaum
  1 sibling, 1 reply; 46+ messages in thread
From: Alex Williamson @ 2018-05-23 14:57 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Laszlo Ersek, Marcel Apfelbaum, Zihan Yang, qemu-devel,
	Igor Mammedov, Eric Auger, Drew Jones, Wei Huang

On Wed, 23 May 2018 17:25:32 +0300
"Michael S. Tsirkin" <mst@redhat.com> wrote:

> On Tue, May 22, 2018 at 10:28:56PM -0600, Alex Williamson wrote:
> > On Wed, 23 May 2018 02:38:52 +0300
> > "Michael S. Tsirkin" <mst@redhat.com> wrote:
> >   
> > > On Tue, May 22, 2018 at 03:47:41PM -0600, Alex Williamson wrote:  
> > > > On Wed, 23 May 2018 00:44:22 +0300
> > > > "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > > >     
> > > > > On Tue, May 22, 2018 at 03:36:59PM -0600, Alex Williamson wrote:    
> > > > > > On Tue, 22 May 2018 23:58:30 +0300
> > > > > > "Michael S. Tsirkin" <mst@redhat.com> wrote:       
> > > > > > >
> > > > > > > It's not hard to think of a use-case where >256 devices
> > > > > > > are helpful, for example a nested virt scenario where
> > > > > > > each device is passed on to a different nested guest.
> > > > > > >
> > > > > > > But I think the main feature this is needed for is numa modeling.
> > > > > > > Guests seem to assume a numa node per PCI root, ergo we need more PCI
> > > > > > > roots.      
> > > > > > 
> > > > > > But even if we have NUMA affinity per PCI host bridge, a PCI host
> > > > > > bridge does not necessarily imply a new PCIe domain.      
> > > > > 
> > > > > What are you calling a PCIe domain?    
> > > > 
> > > > Domain/segment
> > > > 
> > > > 0000:00:00.0
> > > > ^^^^ This    
> > > 
> > > Right. So we can thinkably have PCIe root complexes share an ACPI segment.
> > > I don't see what this buys us by itself.  
> > 
> > The ability to define NUMA locality for a PCI sub-hierarchy while
> > maintaining compatibility with non-segment aware OSes (and firmware).  
> 
> Fur sure, but NUMA is a kind of advanced topic, MCFG has been around for
> longer than various NUMA tables. Are there really non-segment aware
> guests that also know how to make use of NUMA?

I can't answer that question, but I assume that multi-segment PCI
support is perhaps not as pervasive as we may think considering hardware
OEMs tend to avoid it for their default configurations with multiple
host bridges.

> > > > Isn't that the only reason we'd need a new MCFG section and the reason
> > > > we're limited to 256 buses?  Thanks,
> > > > 
> > > > Alex    
> > > 
> > > I don't know whether a single MCFG section can describe multiple roots.
> > > I think it would be certainly unusual.  
> > 
> > I'm not sure here if you're referring to the actual MCFG ACPI table or
> > the MMCONFIG range, aka the ECAM.  Neither of these describe PCI host
> > bridges.  The MCFG table can describe one or more ECAM ranges, which
> > provides the ECAM base address, the PCI segment associated with that
> > ECAM and the start and end bus numbers to know the offset and extent of
> > the ECAM range.  PCI host bridges would then theoretically be separate
> > ACPI objects with _SEG and _BBN methods to associate them to the
> > correct ECAM range by segment number and base bus number.  So it seems
> > that tooling exists that an ECAM/MMCONFIG range could be provided per
> > PCI host bridge, even if they exist within the same domain, but in
> > practice what I see on systems I have access to is a single MMCONFIG
> > range supporting all of the host bridges.  It also seems there are
> > numerous ways to describe the MMCONFIG range and I haven't actually
> > found an example that seems to use the MCFG table.  Two have MCFG
> > tables (that don't seem terribly complete) and the kernel claims to
> > find the MMCONFIG via e820, another doesn't even have an MCFG table and
> > the kernel claims to find MMCONFIG via an ACPI motherboard resource.
> > I'm not sure if I can enable PCI segments on anything to see how the
> > firmware changes.  Thanks,
> > 
> > Alex  
> 
> Let me clarify.  So MCFG have base address allocation structures.
> Each maps a segment and a range of bus numbers into memory.
> This structure is what I meant.

Ok, so this is the  ECAM/MMCONFIG range through which we do config
accesses, which is described by MCFG, among other options.

> IIUC you are saying on your systems everything is within a single
> segment, right? Multiple pci hosts map into a single segment?

Yes, for instance a single MMCONFIG range handles bus number ranges
0x00-0x7f within segment 0x0 and the system has host bridges with base
bus numbers of 0x00 and 0x40, each with different NUMA locality.

> If you do this you can do NUMA, but do not gain > 256 devices.

Correct, but let's also clarify that we're not limited to 256 devices,
a segment is limited to 256 buses and each PCIe slot is a bus, so the
limitation is number of hotpluggable slots.  "Devices" implies that it
includes multi-function, ARI, and SR-IOV devices as well, but we can
have 256 of those per bus, we just don't have the desired hotplug
granularity for those.

> Are we on the same page then?

Seems so.  Thanks,

Alex
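
To put rough numbers on the above (illustrative arithmetic only): each
function gets 4 KiB of ECAM, so one bus costs 1 MiB and a fully covered
segment costs 256 MiB of MMCFG, while hotplug granularity is bounded by
the roughly 255 non-root bus numbers available per segment:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    const uint64_t per_fn  = 4096;              /* 4 KiB config space per function  */
    const uint64_t per_bus = per_fn * 8 * 32;   /* 8 functions x 32 devices = 1 MiB */
    const uint64_t per_seg = per_bus * 256;     /* 256 buses = 256 MiB of MMCFG     */

    /* Rough upper bound on hotpluggable slots per segment: every slot behind
     * a root port or downstream port needs its own bus number, and one bus
     * number is taken by the root bus itself. */
    const unsigned max_slots = 256 - 1;

    printf("ECAM: %llu MiB per bus, %llu MiB per segment, ~%u hotpluggable slots\n",
           (unsigned long long)(per_bus >> 20),
           (unsigned long long)(per_seg >> 20),
           max_slots);
    return 0;
}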

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [Qemu-devel] [RFC 3/3] acpi-build: allocate mcfg for multiple host bridges
  2018-05-23 14:57                         ` Alex Williamson
@ 2018-05-23 15:01                           ` Michael S. Tsirkin
  0 siblings, 0 replies; 46+ messages in thread
From: Michael S. Tsirkin @ 2018-05-23 15:01 UTC (permalink / raw)
  To: Alex Williamson
  Cc: Laszlo Ersek, Marcel Apfelbaum, Zihan Yang, qemu-devel,
	Igor Mammedov, Eric Auger, Drew Jones, Wei Huang

On Wed, May 23, 2018 at 08:57:51AM -0600, Alex Williamson wrote:
> On Wed, 23 May 2018 17:25:32 +0300
> "Michael S. Tsirkin" <mst@redhat.com> wrote:
> 
> > On Tue, May 22, 2018 at 10:28:56PM -0600, Alex Williamson wrote:
> > > On Wed, 23 May 2018 02:38:52 +0300
> > > "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > >   
> > > > On Tue, May 22, 2018 at 03:47:41PM -0600, Alex Williamson wrote:  
> > > > > On Wed, 23 May 2018 00:44:22 +0300
> > > > > "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > > > >     
> > > > > > On Tue, May 22, 2018 at 03:36:59PM -0600, Alex Williamson wrote:    
> > > > > > > On Tue, 22 May 2018 23:58:30 +0300
> > > > > > > "Michael S. Tsirkin" <mst@redhat.com> wrote:       
> > > > > > > >
> > > > > > > > It's not hard to think of a use-case where >256 devices
> > > > > > > > are helpful, for example a nested virt scenario where
> > > > > > > > each device is passed on to a different nested guest.
> > > > > > > >
> > > > > > > > But I think the main feature this is needed for is numa modeling.
> > > > > > > > Guests seem to assume a numa node per PCI root, ergo we need more PCI
> > > > > > > > roots.      
> > > > > > > 
> > > > > > > But even if we have NUMA affinity per PCI host bridge, a PCI host
> > > > > > > bridge does not necessarily imply a new PCIe domain.      
> > > > > > 
> > > > > > What are you calling a PCIe domain?    
> > > > > 
> > > > > Domain/segment
> > > > > 
> > > > > 0000:00:00.0
> > > > > ^^^^ This    
> > > > 
> > > > Right. So we can thinkably have PCIe root complexes share an ACPI segment.
> > > > I don't see what this buys us by itself.  
> > > 
> > > The ability to define NUMA locality for a PCI sub-hierarchy while
> > > maintaining compatibility with non-segment aware OSes (and firmware).  
> > 
> > Fur sure, but NUMA is a kind of advanced topic, MCFG has been around for
> > longer than various NUMA tables. Are there really non-segment aware
> > guests that also know how to make use of NUMA?
> 
> I can't answer that question, but I assume that multi-segment PCI
> support is perhaps not as pervasive as we may think considering hardware
> OEMs tend to avoid it for their default configurations with multiple
> host bridges.
> 
> > > > > Isn't that the only reason we'd need a new MCFG section and the reason
> > > > > we're limited to 256 buses?  Thanks,
> > > > > 
> > > > > Alex    
> > > > 
> > > > I don't know whether a single MCFG section can describe multiple roots.
> > > > I think it would be certainly unusual.  
> > > 
> > > I'm not sure here if you're referring to the actual MCFG ACPI table or
> > > the MMCONFIG range, aka the ECAM.  Neither of these describe PCI host
> > > bridges.  The MCFG table can describe one or more ECAM ranges, which
> > > provides the ECAM base address, the PCI segment associated with that
> > > ECAM and the start and end bus numbers to know the offset and extent of
> > > the ECAM range.  PCI host bridges would then theoretically be separate
> > > ACPI objects with _SEG and _BBN methods to associate them to the
> > > correct ECAM range by segment number and base bus number.  So it seems
> > > that tooling exists that an ECAM/MMCONFIG range could be provided per
> > > PCI host bridge, even if they exist within the same domain, but in
> > > practice what I see on systems I have access to is a single MMCONFIG
> > > range supporting all of the host bridges.  It also seems there are
> > > numerous ways to describe the MMCONFIG range and I haven't actually
> > > found an example that seems to use the MCFG table.  Two have MCFG
> > > tables (that don't seem terribly complete) and the kernel claims to
> > > find the MMCONFIG via e820, another doesn't even have an MCFG table and
> > > the kernel claims to find MMCONFIG via an ACPI motherboard resource.
> > > I'm not sure if I can enable PCI segments on anything to see how the
> > > firmware changes.  Thanks,
> > > 
> > > Alex  
> > 
> > Let me clarify.  So MCFG have base address allocation structures.
> > Each maps a segment and a range of bus numbers into memory.
> > This structure is what I meant.
> 
> Ok, so this is the  ECAM/MMCONFIG range through which we do config
> accesses, which is described by MCFG, among other options.
> 
> > IIUC you are saying on your systems everything is within a single
> > segment, right? Multiple pci hosts map into a single segment?
> 
> Yes, for instance a single MMCONFIG range handles bus number ranges
> 0x00-0x7f within segment 0x0 and the system has host bridges with base
> bus numbers of 0x00 and 0x40, each with different NUMA locality.
> 
> > If you do this you can do NUMA, but do not gain > 256 devices.
> 
> Correct, but let's also clarify that we're not limited to 256 devices,
> a segment is limited to 256 buses and each PCIe slot is a bus, so the
> limitation is number of hotpluggable slots.  "Devices" implies that it
> includes multi-function, ARI, and SR-IOV devices as well, but we can
> have 256 of those per bus, we just don't have the desired hotplug
> granularity for those.

Right, I consider a PF and all its VFs a single device, and all
functions in a multi-function device a single device, for this purpose.

> > Are we on the same page then?
> 
> Seems so.  Thanks,
> 
> Alex

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [Qemu-devel] [RFC 3/3] acpi-build: allocate mcfg for multiple host bridges
  2018-05-23 14:25                       ` Michael S. Tsirkin
  2018-05-23 14:57                         ` Alex Williamson
@ 2018-05-23 16:50                         ` Marcel Apfelbaum
  1 sibling, 0 replies; 46+ messages in thread
From: Marcel Apfelbaum @ 2018-05-23 16:50 UTC (permalink / raw)
  To: Michael S. Tsirkin, Alex Williamson
  Cc: Laszlo Ersek, Zihan Yang, qemu-devel, Igor Mammedov, Eric Auger,
	Drew Jones, Wei Huang



On 05/23/2018 05:25 PM, Michael S. Tsirkin wrote:
> On Tue, May 22, 2018 at 10:28:56PM -0600, Alex Williamson wrote:
>> On Wed, 23 May 2018 02:38:52 +0300
>> "Michael S. Tsirkin" <mst@redhat.com> wrote:
>>
>>> On Tue, May 22, 2018 at 03:47:41PM -0600, Alex Williamson wrote:
>>>> On Wed, 23 May 2018 00:44:22 +0300
>>>> "Michael S. Tsirkin" <mst@redhat.com> wrote:
>>>>    
>>>>> On Tue, May 22, 2018 at 03:36:59PM -0600, Alex Williamson wrote:
>>>>>> On Tue, 22 May 2018 23:58:30 +0300
>>>>>> "Michael S. Tsirkin" <mst@redhat.com> wrote:
>>>>>>> It's not hard to think of a use-case where >256 devices
>>>>>>> are helpful, for example a nested virt scenario where
>>>>>>> each device is passed on to a different nested guest.
>>>>>>>
>>>>>>> But I think the main feature this is needed for is numa modeling.
>>>>>>> Guests seem to assume a numa node per PCI root, ergo we need more PCI
>>>>>>> roots.
>>>>>> But even if we have NUMA affinity per PCI host bridge, a PCI host
>>>>>> bridge does not necessarily imply a new PCIe domain.
>>>>> What are you calling a PCIe domain?
>>>> Domain/segment
>>>>
>>>> 0000:00:00.0
>>>> ^^^^ This
>>> Right. So we can thinkably have PCIe root complexes share an ACPI segment.
>>> I don't see what this buys us by itself.
>> The ability to define NUMA locality for a PCI sub-hierarchy while
>> maintaining compatibility with non-segment aware OSes (and firmware).
> For sure, but NUMA is a kind of advanced topic, MCFG has been around for
> longer than various NUMA tables. Are there really non-segment aware
> guests that also know how to make use of NUMA?
>

Yes, the current pxb devices accomplish exactly that. Multiple NUMA nodes
while sharing PCI domain 0.

Thanks,
Marcel

>>>> Isn't that the only reason we'd need a new MCFG section and the reason
>>>> we're limited to 256 buses?  Thanks,
>>>>
>>>> Alex
>>> I don't know whether a single MCFG section can describe multiple roots.
>>> I think it would be certainly unusual.
>> I'm not sure here if you're referring to the actual MCFG ACPI table or
>> the MMCONFIG range, aka the ECAM.  Neither of these describe PCI host
>> bridges.  The MCFG table can describe one or more ECAM ranges, which
>> provides the ECAM base address, the PCI segment associated with that
>> ECAM and the start and end bus numbers to know the offset and extent of
>> the ECAM range.  PCI host bridges would then theoretically be separate
>> ACPI objects with _SEG and _BBN methods to associate them to the
>> correct ECAM range by segment number and base bus number.  So it seems
>> that tooling exists that an ECAM/MMCONFIG range could be provided per
>> PCI host bridge, even if they exist within the same domain, but in
>> practice what I see on systems I have access to is a single MMCONFIG
>> range supporting all of the host bridges.  It also seems there are
>> numerous ways to describe the MMCONFIG range and I haven't actually
>> found an example that seems to use the MCFG table.  Two have MCFG
>> tables (that don't seem terribly complete) and the kernel claims to
>> find the MMCONFIG via e820, another doesn't even have an MCFG table and
>> the kernel claims to find MMCONFIG via an ACPI motherboard resource.
>> I'm not sure if I can enable PCI segments on anything to see how the
>> firmware changes.  Thanks,
>>
>> Alex
> Let me clarify.  So MCFG have base address allocation structures.
> Each maps a segment and a range of bus numbers into memory.
> This structure is what I meant.
>
> IIUC you are saying on your systems everything is within a single
> segment, right? Multiple pci hosts map into a single segment?
>
> If you do this you can do NUMA, but do not gain > 256 devices.
>
> Are we on the same page then?
>

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [Qemu-devel] [RFC 3/3] acpi-build: allocate mcfg for multiple host bridges
  2018-05-22 21:17           ` Alex Williamson
  2018-05-22 21:22             ` Michael S. Tsirkin
  2018-05-22 21:50             ` Laszlo Ersek
@ 2018-05-23 17:00             ` Marcel Apfelbaum
  2 siblings, 0 replies; 46+ messages in thread
From: Marcel Apfelbaum @ 2018-05-23 17:00 UTC (permalink / raw)
  To: Alex Williamson, Laszlo Ersek
  Cc: Zihan Yang, qemu-devel, Wei Huang, Drew Jones,
	Michael S. Tsirkin, Eric Auger, Igor Mammedov

Hi Alex,

On 05/23/2018 12:17 AM, Alex Williamson wrote:
> On Tue, 22 May 2018 21:51:47 +0200
> Laszlo Ersek <lersek@redhat.com> wrote:
>
>> On 05/22/18 21:01, Marcel Apfelbaum wrote:
>>> Hi Laszlo,
>>>
>>> On 05/22/2018 12:52 PM, Laszlo Ersek wrote:
>>>> On 05/21/18 13:53, Marcel Apfelbaum wrote:
>>>>> On 05/20/2018 10:28 AM, Zihan Yang wrote:
>>>>>> Currently only q35 host bridge is allocated space in MCFG table. To
>>>>>> put pxb host
>>>>>> into separate pci domain, each of them should have its own
>>>>>> configuration space
>>>>>> in MCFG table
>>>>>>
>>>>>> Signed-off-by: Zihan Yang <whois.zihan.yang@gmail.com>
>>>>>> ---
>>>>>>     hw/i386/acpi-build.c | 83
>>>>>> +++++++++++++++++++++++++++++++++++++++-------------
>>>>>>     1 file changed, 62 insertions(+), 21 deletions(-)
>>>>>>
>>>>>> diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
>>>>>> index 9bc6d97..808d815 100644
>>>>>> --- a/hw/i386/acpi-build.c
>>>>>> +++ b/hw/i386/acpi-build.c
>>>>>> @@ -89,6 +89,7 @@
>>>>>>     typedef struct AcpiMcfgInfo {
>>>>>>         uint64_t mcfg_base;
>>>>>>         uint32_t mcfg_size;
>>>>>> +    struct AcpiMcfgInfo *next;
>>>>>>     } AcpiMcfgInfo;
>>>>>>       typedef struct AcpiPmInfo {
>>>>>> @@ -2427,14 +2428,15 @@ build_mcfg_q35(GArray *table_data, BIOSLinker
>>>>>> *linker, AcpiMcfgInfo *info)
>>>>>>     {
>>>>>>         AcpiTableMcfg *mcfg;
>>>>>>         const char *sig;
>>>>>> -    int len = sizeof(*mcfg) + 1 * sizeof(mcfg->allocation[0]);
>>>>>> +    int len, count = 0;
>>>>>> +    AcpiMcfgInfo *cfg = info;
>>>>>>     +    while (cfg) {
>>>>>> +        ++count;
>>>>>> +        cfg = cfg->next;
>>>>>> +    }
>>>>>> +    len = sizeof(*mcfg) + count * sizeof(mcfg->allocation[0]);
>>>>>>         mcfg = acpi_data_push(table_data, len);
>>>>>> -    mcfg->allocation[0].address = cpu_to_le64(info->mcfg_base);
>>>>>> -    /* Only a single allocation so no need to play with segments */
>>>>>> -    mcfg->allocation[0].pci_segment = cpu_to_le16(0);
>>>>>> -    mcfg->allocation[0].start_bus_number = 0;
>>>>>> -    mcfg->allocation[0].end_bus_number =
>>>>>> PCIE_MMCFG_BUS(info->mcfg_size - 1);
>>>>>>           /* MCFG is used for ECAM which can be enabled or disabled by
>>>>>> guest.
>>>>>>          * To avoid table size changes (which create migration issues),
>>>>>> @@ -2448,6 +2450,17 @@ build_mcfg_q35(GArray *table_data, BIOSLinker
>>>>>> *linker, AcpiMcfgInfo *info)
>>>>>>         } else {
>>>>>>             sig = "MCFG";
>>>>>>         }
>>>>>> +
>>>>>> +    count = 0;
>>>>>> +    while (info) {
>>>>>> +        mcfg[count].allocation[0].address =
>>>>>> cpu_to_le64(info->mcfg_base);
>>>>>> +        /* Only a single allocation so no need to play with
>>>>>> segments */
>>>>>> +        mcfg[count].allocation[0].pci_segment = cpu_to_le16(count);
>>>>>> +        mcfg[count].allocation[0].start_bus_number = 0;
>>>>>> +        mcfg[count++].allocation[0].end_bus_number =
>>>>>> PCIE_MMCFG_BUS(info->mcfg_size - 1);
>>>>> An interesting point is whether we want to limit the MMCFG size for
>>>>> PXBs, as we may not be interested in using all the buses in a specific
>>>>> domain. For each bus we require some address space that remains unused.
>>>>>   
>>>>>> +        info = info->next;
>>>>>> +    }
>>>>>> +
>>>>>>         build_header(linker, table_data, (void *)mcfg, sig, len, 1,
>>>>>> NULL, NULL);
>>>>>>     }
>>>>>>     @@ -2602,26 +2615,52 @@ struct AcpiBuildState {
>>>>>>         MemoryRegion *linker_mr;
>>>>>>     } AcpiBuildState;
>>>>>>     -static bool acpi_get_mcfg(AcpiMcfgInfo *mcfg)
>>>>>> +static inline void cleanup_mcfg(AcpiMcfgInfo *mcfg)
>>>>>> +{
>>>>>> +    AcpiMcfgInfo *tmp;
>>>>>> +    while (mcfg) {
>>>>>> +        tmp = mcfg->next;
>>>>>> +        g_free(mcfg);
>>>>>> +        mcfg = tmp;
>>>>>> +    }
>>>>>> +}
>>>>>> +
>>>>>> +static AcpiMcfgInfo *acpi_get_mcfg(void)
>>>>>>     {
>>>>>>         Object *pci_host;
>>>>>>         QObject *o;
>>>>>> +    AcpiMcfgInfo *head = NULL, *tail, *mcfg;
>>>>>>           pci_host = acpi_get_i386_pci_host();
>>>>>>         g_assert(pci_host);
>>>>>>     -    o = object_property_get_qobject(pci_host, PCIE_HOST_MCFG_BASE,
>>>>>> NULL);
>>>>>> -    if (!o) {
>>>>>> -        return false;
>>>>>> +    while (pci_host) {
>>>>>> +        mcfg = g_new0(AcpiMcfgInfo, 1);
>>>>>> +        mcfg->next = NULL;
>>>>>> +        if (!head) {
>>>>>> +            tail = head = mcfg;
>>>>>> +        } else {
>>>>>> +            tail->next = mcfg;
>>>>>> +            tail = mcfg;
>>>>>> +        }
>>>>>> +
>>>>>> +        o = object_property_get_qobject(pci_host,
>>>>>> PCIE_HOST_MCFG_BASE, NULL);
>>>>>> +        if (!o) {
>>>>>> +            cleanup_mcfg(head);
>>>>>> +            g_free(mcfg);
>>>>>> +            return NULL;
>>>>>> +        }
>>>>>> +        mcfg->mcfg_base = qnum_get_uint(qobject_to(QNum, o));
>>>>>> +        qobject_unref(o);
>>>>>> +
>>>>>> +        o = object_property_get_qobject(pci_host,
>>>>>> PCIE_HOST_MCFG_SIZE, NULL);
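
A related detail: each new AcpiMcfgInfo node is linked into the list before
its properties are read, so on the error path above cleanup_mcfg(head)
already frees mcfg and the following g_free(mcfg) frees it a second time.
A sketch of a single-free error path, illustrative only and reusing the same
identifiers:

    o = object_property_get_qobject(pci_host, PCIE_HOST_MCFG_BASE, NULL);
    if (!o) {
        cleanup_mcfg(head);     /* mcfg is already linked into the list */
        return NULL;
    }
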
>>>>> I'll let Igor comment on the ACPI bits, but the idea of querying each
>>>>> PCI host for the MMCFG details and putting them into a different table
>>>>> sounds good to me.
>>>>>
>>>>> [Adding Laszlo for his insights]
>>>> Thanks for the CC -- I don't have many (positive) insights here to
>>>> offer, I'm afraid. First, the patch set (including the blurb) doesn't
>>>> seem to explain *why* multiple host bridges / ECAM areas are a good
>>>> idea.
>>> The issue we want to solve is the hard limitation of 256 PCIe devices
>>> that can be used in a Q35 machine.
> Isn't it interesting that conventional PCI can easily support so many
> more devices?  Sorta makes you wonder why we care that virtual devices
> are express rather than conventional for a high density configuration...
>
>> Implying that there are use cases for which ~256 PCIe devices aren't
>> enough. I remain unconvinced until proved wrong :)
>>
>> Anyway, a significant source of waste comes from the restriction that we
>> can only put 1 device (with up to 8 functions) on each non-root-complex
>> PCI Express bus (such as root ports and downstream ports). This forfeits
>> a huge portion of the ECAM area (about 31/32th) that we already have.
>> Rather than spending more MMIO guest-phys address range on new
>> discontiguous ECAM ranges, I'd prefer if we could look into ARI. I seem
>> to recall from your earlier presentation that ARI could recover that
>> lost address space (meaning both ECAM ranges and PCIe B/D/F address space).
> How does ARI solve the hotplug problem?  ARI is effectively
> multifunction on steroids,

+1

>   the ARI capability in each function points
> to the next function number so that we don't need to scan the entire
> devfn address space per bus (an inefficiency we don't care about when
> there are only 8 functions).  So yeah, we can fill an entire bus with
> devices with ARI, but they're all rooted at 00.0.
>   
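
To make the "next function number" chain concrete, a minimal enumeration
sketch could look like the following; pci_config_read32() and find_ext_cap()
are hypothetical helpers (not real QEMU or SeaBIOS APIs), and the offsets
follow the PCIe ARI extended capability (ID 0x000e, Next Function Number in
bits 15:8 of the ARI Capability register):

    #include <stdint.h>

    #define PCI_EXT_CAP_ID_ARI  0x0e
    #define ARI_CAP_REG_OFF     0x04     /* ARI Capability register */

    /* hypothetical config-space helpers */
    extern uint32_t pci_config_read32(uint8_t bus, uint8_t fn, uint16_t off);
    extern uint16_t find_ext_cap(uint8_t bus, uint8_t fn, uint16_t cap_id);

    /* With ARI the device number is always 0, so 'fn' is the full 8-bit
     * function number and doubles as the devfn on that bus. */
    static void ari_scan_bus(uint8_t bus)
    {
        uint8_t fn = 0;                  /* scanning always starts at fn 0 */

        do {
            uint16_t ari = find_ext_cap(bus, fn, PCI_EXT_CAP_ID_ARI);
            if (!ari) {
                break;                   /* no ARI: fall back to a legacy scan */
            }
            /* ... probe function 'fn' here ... */
            uint32_t val = pci_config_read32(bus, fn, ari + ARI_CAP_REG_OFF);
            fn = (val >> 8) & 0xff;      /* Next Function Number; 0 ends it */
        } while (fn != 0);
    }

All of the functions found this way still hang off device 00 of the bus,
which is the point being made here: ARI recovers the wasted function numbers,
not per-device hotplug granularity.
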
>> There are signs that the edk2 core supports ARI if the underlying
>> platform supports it. (Which is not the case with multiple PCIe domains
>> / multiple ECAM ranges.)
> It's pretty surprising to me that edk2 wouldn't already have support
> for multiple PCIe domains, they're really not *that* uncommon.  Some
> architectures make almost gratuitous use of PCIe domains.  I certainly
> know there were UEFI ia64 platforms with multiple domains.
>
>> ARI support could also help aarch64/virt. Eric (CC'd) has been working
>> on raising the max PCIe bus count from *16* to 256 for aarch64/virt, and
>> AFAIR one of the challenges there has been finding a good spot for the
>> larger ECAM in guest-phys address space. Fiddling with such address maps
>> is always a pain.
>>
>> Back to x86, the guest-phys address space is quite crowded too. The
>> 32-bit PCI MMIO aperture (with the neighboring ECAM and platform MMIO
>> areas such as LAPIC, IO-APIC, HPET, TPM, pflash, ...) is always a scarce
>> resource. Plus, reaching agreement between OVMF and QEMU on the exact
>> location of the 32-bit PCI MMIO aperture has always been a huge pain; so
>> you'd likely shoot for 64-bit.
> Why do we need to equally split 32-bit MMIO space between domains?  Put
> it on domain 0 and require that devices installed into non-zero domains have
> no 32-bit dependencies.
>
>> But 64-bit is ill-partitioned and/or crowded too: first you have the
>> cold-plugged >4GB DRAM (whose size the firmware can interrogate), then
>> the hotplug DIMM area up to "etc/reserved-memory-end" (ditto), then the
>> 64-bit PCI MMIO aperture (whose size the firmware decides itself,
>> because QEMU cannot be asked about it presently). Placing the additional
>> MMCFGs somewhere would need extending the total order between these
>> things at the design level, more fw_cfg files, more sanity checks in
>> platform code in the firmware, and, quite importantly, adding support to
>> multiple discontiguous MMCFGs to core edk2 code.
> Hmm, we're doing something wrong if we can't figure out some standards
> within QEMU for placing per domain 64-bit MMIO and MMCFG ranges.
>
>> I don't know much about ARI but it already looks a whole lot more
>> attractive to me.
>>
>>> We can use "PCI functions" to increase
>>> the number, but then we lose the hot-plugging granularity.
>>>
>>> Currently pxb-pcie host bridges share the same PCI domain (0)
>>> bus numbers, so adding more of them will not result in more usable
>>> devices (each PCIe device resides on its own bus),
>>> but putting each of them on a different domain removes
>>> that limitation.
>>>
>>> Another possible use (there is a good chance I am wrong, adding Alex to
>>> correct me) may be the "modeling" of a multi-function assigned device.
>>> Currently all the functions of an assigned device will be placed on
>>> different PCIe Root Ports, "losing" the connection between them.
>>>
>>> However, if we put all the functions of the same assigned device in the
>>> same PCI domain, we may be able to "re-connect" them.
>> OK -- why is that useful? What's the current practical problem with the
>> splitting you describe?
> There are very few cases where this is useful, things like associating
> USB companion devices or translating a guest bus reset to host bus
> reset when functions are split to separate virtual buses.  That said, I
> have no idea how multiple domains plays a factor here.  Regardless of
> how many PCIe domains a VM supports, we can easily use existing
> multifunction within a single domain to expose multifunction assigned
> devices in a way that better resembles the physical hardware.
> Multifunction PCIe endpoints cannot span PCIe domains.

Sorry for the misunderstanding.  I was referring to the complete opposite.
We want to assign two functions of the same phys device to the same VM.
They will land on different PCIe Root Ports and we may have issues with
the data flow between them.

I was wondering if assigning them both to the same PCI domain (different
from 0) will help us avoid implementing ACS for the PCIe Root Ports.
But again, I may be way off here.

> In fact, IOMMUs generally cannot span domains either, so we better also
> be thinking about at least a VT-d DRHD or vIOMMU per PCIe domain.
>   

Right


Thanks,
Marcel


[...]

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [Qemu-devel] [RFC 3/3] acpi-build: allocate mcfg for multiple host bridges
  2018-05-23  7:32               ` Laszlo Ersek
  2018-05-23 11:11                 ` Zihan Yang
@ 2018-05-23 17:11                 ` Marcel Apfelbaum
  2018-05-23 17:25                   ` Laszlo Ersek
  2018-05-28 11:02                 ` Laszlo Ersek
  2 siblings, 1 reply; 46+ messages in thread
From: Marcel Apfelbaum @ 2018-05-23 17:11 UTC (permalink / raw)
  To: Laszlo Ersek, Michael S. Tsirkin
  Cc: Zihan Yang, qemu-devel, Igor Mammedov, Alex Williamson,
	Eric Auger, Drew Jones, Wei Huang



On 05/23/2018 10:32 AM, Laszlo Ersek wrote:
> On 05/23/18 01:40, Michael S. Tsirkin wrote:
>> On Wed, May 23, 2018 at 12:42:09AM +0200, Laszlo Ersek wrote:
>>> Hold on,
>>>
>>> On 05/22/18 21:51, Laszlo Ersek wrote:
>>>
>>>> It had taken years until the edk2 core gained a universal
>>>> PciHostBridgeDxe driver with a well-defined platform customization
>>>> interface, and that interface doesn't support multiple domains /
>>>> segments.
>>> after doing a bit more research: I was wrong about this. What I
>>> remembered was not the current state. Edk2 does seem to support multiple
>>> domains, with different segment numbers, ECAM base addresses, and bus
>>> number ranges.

Good news!

>>>   If we figure out a placement strategy or an easy to
>>> consume representation of these data for the firmware, it might be
>>> possible for OVMF to hook them into the edk2 core (although not in the
>>> earliest firmware phases, such as SEC and PEI).

Can you please remind me how OVMF places the 64-bit PCI hotplug window?
We may do something similar.
Let me emphasize, I am not implying that you or anybody else should work on
that :), I just want to be on the same page if/when the time comes.
For the moment we are looking for a POC, nothing more.

>>> In retrospect, I'm honestly surprised that so much multi-segment support
>>> has been upstreamed to the edk2 core. Sorry about the FUD. (My general
>>> points remain valid, for the record... But perhaps they no longer matter
>>> for this discussion.)
>>>
>>> (I meant to send this message soon after
>>> <http://mid.mail-archive.com/fc603491-7c41-e862-a583-2bae6f165b5a@redhat.com>,
>>> but my internet connection had to die right then.)
>>>
>>> Thanks
>>> Laszlo
>> Is there support for any hardware which we could emulate?
> I don't see any actual hw support in the edk2 project, but I'll ask.

I think we may be able to succeed with "standard" ACPI declarations of
the PCI segments + placing the extra MMCONFIG ranges before the 64-bit 
PCI hotplug area.

Thanks,
Marcel

> Thanks
> Laszlo

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [Qemu-devel] [RFC 3/3] acpi-build: allocate mcfg for multiple host bridges
  2018-05-23 12:28                   ` Laszlo Ersek
@ 2018-05-23 17:23                     ` Marcel Apfelbaum
  2018-05-24  9:57                     ` Zihan Yang
  1 sibling, 0 replies; 46+ messages in thread
From: Marcel Apfelbaum @ 2018-05-23 17:23 UTC (permalink / raw)
  To: Laszlo Ersek, Zihan Yang, qemu-devel
  Cc: Igor Mammedov, alex.williamson, eauger, drjones, wei



On 05/23/2018 03:28 PM, Laszlo Ersek wrote:
> On 05/23/18 13:11, Zihan Yang wrote:
>> Hi all,
>> The original purpose was just to support multiple segments in Intel
>> Q35 architecture for PCIe topology, which makes bus numbers a less scarce
>> resource. The patches are very primitive and many things are left for
>> firmware to finish (the initial plan was to implement it in SeaBIOS),
>> the AML part in QEMU is not finished either. I'm not familiar with
>> OVMF or edk2, so there is no plan to touch it yet, but it seems not
>> necessary since it already supports multi-segment in the end.
> That's incorrect. EDK2 stands for "EFI Development Kit II", and it is a
> collection of "universal" (= platform- and ISA-independent) modules
> (drivers and libraries), and platform- and/or ISA-dependent modules
> (drivers and libraries). The OVMF firmware is built from a subset of
> these modules; the final firmware image includes modules from both
> categories -- universal modules, and modules specific to the i440fx and
> Q35 QEMU boards. The first category generally lives under MdePkg/,
> MdeModulePkg/, UefiCpuPkg/, NetworkPkg/, PcAtChipsetPkg, etc; while the
> second category lives under OvmfPkg/.
>
> (The exact same applies to the ArmVirtQemu firmware, with the second
> category consisting of ArmVirtPkg/ and OvmfPkg/ modules.)
>
> When we discuss anything PCI-related in edk2, it usually affects both
> categories:
>
> (a) the universal/core modules, such as
>
>    - the PCI host bridge / root bridge driver at
>      "MdeModulePkg/Bus/Pci/PciHostBridgeDxe",
>
>    - the PCI bus driver at "MdeModulePkg/Bus/Pci/PciBusDxe",
>
> (b) and the platform-specific modules, such as
>
>    - "OvmfPkg/IncompatiblePciDeviceSupportDxe" which causes PciBusDxe to
>      allocate 64-bit MMIO BARs above 4 GB, regardless of option ROM
>      availability (as long as a CSM is not present), conserving 32-bit
>      MMIO aperture for 32-bit BARs,
>
>    - "OvmfPkg/PciHotPlugInitDxe", which implements support for QEMU's
>      resource reservation hints, so that we can avoid IO space exhaustion
>      with many PCIe root ports, and so that we can reserve MMIO aperture
>      for hot-plugging devices with large MMIO BARs,
>
>    - "OvmfPkg/Library/DxePciLibI440FxQ35", which is a low-level PCI
>      config space access library, usable in the DXE and later phases,
>      that plugs into several drivers, and uses 0xCF8/0xCFC on i440fx, and
>      ECAM on Q35,
>
>    - "OvmfPkg/Library/PciHostBridgeLib", which plugs into
>      "PciHostBridgeDxe" above, exposing the various resource apertures to
>      said host bridge / root bridge driver, and implementing support for
>      the PXB / PXBe devices,
>
>    - "OvmfPkg/PlatformPei", which is an early (PEI phase) module with a
>      grab-bag of platform support code; e.g. it informs
>      "DxePciLibI440FxQ35" above about the QEMU board being Q35 vs.
>      i440fx, it configures the ECAM (exbar) registers on Q35, it
>      determines where the 32-bit and 64-bit PCI MMIO apertures should be;
>
>    - "ArmVirtPkg/Library/BaseCachingPciExpressLib", which is the
>      aarch64/virt counterpart of "DxePciLibI440FxQ35" above,
>
>    - "ArmVirtPkg/Library/FdtPciHostBridgeLib", which is the aarch64/virt
>      counterpart of "PciHostBridgeLib", consuming the DTB exposed by
>      qemu-system-aarch64,
>
>    - "ArmVirtPkg/Library/FdtPciPcdProducerLib", which is an internal
>      library that turns parts of the DTB that is exposed by
>      qemu-system-aarch64 into various PCI-related, firmware-wide, scalar
>      variables (called "PCDs"), upon which both
>      "BaseCachingPciExpressLib" and "FdtPciHostBridgeLib" rely.
>
> The point is that any PCI feature in any edk2 platform firmware comes
> together from (a) core module support for the feature, and (b) platform
> integration between the core code and the QEMU board in question.
>
> If (a) is missing, that implies a very painful uphill battle, which is
> why I'd been loudly whining, initially, in this thread, until I realized
> that the core support was there in edk2, for PCIe segments.
>
> However, (b) is required as well -- i.e., platform integration under
> OvmfPkg/ and perhaps ArmVirtPkg/, between the QEMU boards and the core
> edk2 code --, and that definitely doesn't exist for the PCIe segments
> feature.
>
> If (a) exists and is flexible enough, then we at least have a chance at
> writing the platform support code (b) for it. So that's why I've stopped
> whining. Writing (b) is never easy -- in this case, a great many of the
> platform modules that I've listed above, under OvmfPkg/ pathnames, could
> be affected, or even be eligible for replacement -- but (b) is at least
> imaginable practice. Modules in category (a) are shipped *in* -- not
> "on" -- every single physical UEFI platform that you can buy today,
> which is one reason why it's hugely difficult to implement nontrivial
> changes for them.
>
> In brief: your statement is incorrect because category (b) is missing.
> And that requires dedicated QEMU support, similarly to how
> "OvmfPkg/PciHotPlugInitDxe" requires the vendor-specific resource
> reservation capability, and how "OvmfPkg/Library/PciHostBridgeLib"
> consumes the "etc/extra-pci-roots" fw_cfg file, and how most everything
> that ArmVirtQemu does for PCI(e) originates from QEMU's DTB.
>
>> * 64-bit space is crowded and there are no standards within QEMU for
>>    placing per domain 64-bit MMIO and MMCFG ranges
>> * We cannot put ECAM arbitrarily high because guest's PA width is
>>    limited by host's when EPT is enabled.
> That's right. One argument is that firmware can lay out these apertures
> and ECAM ranges internally. But that argument breaks down when you hit
> the PCPU physical address width, and would like the management stack,
> such as libvirtd, to warn you in advance. For that, either libvirtd or
> QEMU has to know, or direct, the layout.
>
>> * NUMA modeling seems to be a stronger motivation than the limitation
>>    of 256 bus numbers, that each NUMA node holds its own PCI(e)
>>    sub-hierarchy

NUMA modeling is not the motivation; the motivation is that each PCI
domain can have up to 256 buses and the PCI Express architecture
dictates one PCI device per bus.

The limitation we have with NUMA is that a PCI Host-Bridge can
belong to a single NUMA node.
> I'd also like to get more information about this -- I thought pxb-pci(e)
> was already motivated by supporting NUMA locality.

Right
>   And, to my knowledge,
> pxb-pci(e) actually *solved* this problem. Am I wrong?
You are right.
>   Let's say you
> have 16 NUMA nodes (which seems pretty large to me); is it really
> insufficient to assign ~16 devices to each node?
Is not about "Per Node limitation", it is about several scenarios:
  - We have Ray from Intel trying to use 1000 virtio-net devices (God 
knows why :) ).
  - We may have a VM managing some backups (tapes), we may have a lot of 
these.
  - We may want indeed to create a nested solution as Michael mentioned.
The "main/hidden" issue: At some point we will switch to Q35 as the 
default X86 machine (QEMU 3.0 :) ?)
and then we don't want people to be disappointed by such a "regression".

Thanks for your time, Laszlo, and sorry for putting you in the spotlight.
Marcel

>
> Thanks
> Laszlo

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [Qemu-devel] [RFC 3/3] acpi-build: allocate mcfg for multiple host bridges
  2018-05-23 17:11                 ` Marcel Apfelbaum
@ 2018-05-23 17:25                   ` Laszlo Ersek
  0 siblings, 0 replies; 46+ messages in thread
From: Laszlo Ersek @ 2018-05-23 17:25 UTC (permalink / raw)
  To: Marcel Apfelbaum, Michael S. Tsirkin
  Cc: Zihan Yang, qemu-devel, Igor Mammedov, Alex Williamson,
	Eric Auger, Drew Jones, Wei Huang

On 05/23/18 19:11, Marcel Apfelbaum wrote:
> On 05/23/2018 10:32 AM, Laszlo Ersek wrote:
>> On 05/23/18 01:40, Michael S. Tsirkin wrote:
>>> On Wed, May 23, 2018 at 12:42:09AM +0200, Laszlo Ersek wrote:

>>>> If we figure out a placement strategy or an easy to consume
>>>> representation of these data for the firmware, it might be possible
>>>> for OVMF to hook them into the edk2 core (although not in the
>>>> earliest firmware phases, such as SEC and PEI).
>
> Can you please remind me how OVMF places the 64-bit PCI hotplug
> window?

If you mean the 64-bit PCI MMIO aperture, I described it here in detail:

  https://bugzilla.redhat.com/show_bug.cgi?id=1353591#c8

I'll also quote it inline, before returning to your email:

On 03/26/18 16:10, bugzilla@redhat.com wrote:
> https://bugzilla.redhat.com/show_bug.cgi?id=1353591
>
> Laszlo Ersek <lersek@redhat.com> changed:
>
>            What    |Removed                     |Added
> ----------------------------------------------------------------------------
>               Flags|needinfo?(lersek@redhat.com |
>                    |)                           |
>
>
>
> --- Comment #8 from Laszlo Ersek <lersek@redhat.com> ---
> Sure, I can attempt :) The function to look at is GetFirstNonAddress()
> in "OvmfPkg/PlatformPei/MemDetect.c". I'll try to write it up here in
> natural language (although I commented the function heavily as well).
>
> As an introduction, the "number of address bits" is a quantity that
> the firmware itself needs to know, so that in the DXE phase page
> tables exist that actually map that address space. The
> GetFirstNonAddress() function (in the PEI phase) calculates the
> highest *exclusive* address that the firmware might want or need to
> use (in the DXE phase).
>
> (1) First we get the highest exclusive cold-plugged RAM address.
> (There are two methods for this, the more robust one is to read QEMU's
> E820 map, the older / less robust one is to calculate it from the
> CMOS.) If the result would be <4GB, then we take exactly 4GB from this
> step, because the firmware always needs to be able to address up to
> 4GB. Note that this is already somewhat non-intuitive; for example, if
> you have 4GB of RAM (as in, *amount*), it will go up to 6GB in the
> guest-phys address space (because [0x8000_0000..0xFFFF_FFFF] is not
> RAM but MMIO on q35).
>
> (2) If the DXE phase is 32-bit, then we're done. (No addresses >=4GB
> can be accessed, either for RAM or MMIO.) For RHEL this is never the
> case.
>
> (3) Grab the size of the 64-bit PCI MMIO aperture. This defaults to
> 32GB, but a custom (OVMF specific) fw_cfg file (from the QEMU command
> line) can resize it or even disable it. This aperture is relevant
> because it's going to be the top of the address space that the
> firmware is interested in. If the aperture is disabled (on the QEMU
> cmdline), then we're done, and only the value from point (1) matters
> -- that determines the address width we need.
>
> (4) OK, so we have a 64-bit PCI MMIO aperture (for allocating BARs out
> of, later); we have to place it somewhere. The base cannot match the
> value from (1) directly, because that would not leave room for the
> DIMM hotplug area. So the end of that area is read from the fw_cfg
> file "etc/reserved-memory-end". DIMM hotplug is enabled iff
> "etc/reserved-memory-end" exists. If "etc/reserved-memory-end" exists,
> then it is guaranteed to be larger than the value from (1) -- i.e.,
> top of cold-plugged RAM.
>
> (5) We round up the size of the 64-bit PCI aperture to 1GB. We also
> round up the base of the same -- i.e., from (4) or (1), as appropriate
> -- to 1GB. This is inspired by SeaBIOS, because this lets the host map
> the aperture with 1GB hugepages.
>
> (6) The base address of the aperture is then rounded up so that it
> ends up aligned "naturally". "Natural" alignment means that we take
> the largest whole power of two (i.e., BAR size) that can fit *within*
> the aperture (whose size comes from (3) and (5)) and use that BAR size
> as alignment requirement. This is because the PciBusDxe driver sorts
> the BARs in decreasing size order (and equivalently, decreasing
> alignment order), for allocation in increasing address order, so if
> our aperture base is aligned sufficiently for the largest BAR that can
> theoretically fit into the aperture, then the base will be aligned
> correctly for *any* other BAR that fits.
>
> For example, if you have a 32GB aperture size, then the largest BAR
> that can fit is 32GB, so the alignment requirement in step (6) will be
> 32GB. Whereas, if the user configures a 48GB aperture size in (3),
> then your alignment will remain 32GB in step (6), because a 64GB BAR
> would not fit, and a 32GB BAR (which fits) dictates a 32GB alignment.
>
> Thus we have the following "ladder" of ranges:
>
> (a) cold-plugged RAM (low, <2GB)
> (b) 32-bit PCI MMIO aperture, ECAM/MMCONFIG, APIC, pflash, etc (<4GB)
> (c) cold-plugged RAM (high, >=4GB)
> (d) DIMM hot-plug area
> (e) padding up to 1GB alignment (for hugepages)
> (f) padding up to the natural alignment of the 64-bit PCI MMIO
>    aperture size (32GB by default)
> (g) 64-bit PCI MMIO aperture
>
> To my understanding, "maxmem" determines the end of (d). And, the
> address width is dictated by the end of (g).
>
> Two more examples.
>
> - If you have 36 phys address bits, that doesn't let you use
>   maxmem=32G. This is because maxmem=32G puts the end of the DIMM
>   hotplug area (d) strictly *above* 32GB (due to the "RAM gap" (b)),
>   and then the padding (f) places the 64-bit PCI MMIO aperture at
>   64GB. So 36 phys address bits don't suffice.
>
> - On the other hand, if you have 37 phys address bits, that *should*
>   let you use maxmem=64G. While the DIMM hot-plug area will end
>   strictly above 64GB, the 64-bit PCI MMIO aperture (of size 32GB) can
>   be placed at 96GB, so it will all fit into 128GB (i.e. 37 address
>   bits).
>
> Sorry if this is confusing, I got very little sleep last night.
>
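
Condensing the quoted description into code, a rough sketch of the placement
arithmetic follows. This is illustrative only: the helper names and types are
made up, the step numbers, the fw_cfg file name and the 32 GB default come
from the comment above, the real logic lives in GetFirstNonAddress() in
OvmfPkg/PlatformPei/MemDetect.c, and the 32-bit DXE case of step (2) is
omitted.

    #include <stdint.h>

    #define SIZE_1GB  (1ULL << 30)

    static uint64_t align_up(uint64_t addr, uint64_t align)
    {
        return (addr + align - 1) & ~(align - 1);
    }

    /* Largest power of two that still fits inside the aperture: the
     * alignment needed by the biggest BAR PciBusDxe could place there. */
    static uint64_t natural_alignment(uint64_t aperture_size)
    {
        uint64_t align = SIZE_1GB;

        while (align * 2 <= aperture_size) {
            align *= 2;
        }
        return align;
    }

    uint64_t first_non_address(uint64_t ram_end,          /* (1) top of cold-plugged RAM */
                               uint64_t reserved_mem_end, /* (4) etc/reserved-memory-end, 0 if absent */
                               uint64_t aperture_size)    /* (3) 32 GB by default */
    {
        uint64_t base;

        if (ram_end < 4 * SIZE_1GB) {
            ram_end = 4 * SIZE_1GB;     /* firmware always addresses up to 4 GB */
        }
        if (aperture_size == 0) {
            return ram_end;             /* aperture disabled on the QEMU cmdline */
        }

        /* (4) leave room for the DIMM hotplug area */
        base = reserved_mem_end ? reserved_mem_end : ram_end;
        /* (5) round size and base up to 1 GB (hugepage-friendly) */
        aperture_size = align_up(aperture_size, SIZE_1GB);
        base = align_up(base, SIZE_1GB);
        /* (6) align the base "naturally" for the largest BAR that can fit */
        base = align_up(base, natural_alignment(aperture_size));

        /* end of (g): this is what dictates the required address width */
        return base + aperture_size;
    }

With the 32 GB default this matches the two examples above: a DIMM area
ending just above 32 GB pushes the aperture to 64 GB (needing 37 address
bits), and one ending just above 64 GB places the aperture at 96 GB, still
within 128 GB.
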

Back to your email:

On 05/23/18 19:11, Marcel Apfelbaum wrote:
> I think we may be able to succeed with "standard" ACPI declarations of
> the PCI segments + placing the extra MMCONFIG ranges before the 64-bit
> PCI hotplug area.

That idea could work, but firmware will need hints about it.

Thanks!
Laszlo

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [Qemu-devel] [RFC 3/3] acpi-build: allocate mcfg for multiple host bridges
  2018-05-23 11:11                 ` Zihan Yang
  2018-05-23 12:28                   ` Laszlo Ersek
@ 2018-05-23 17:33                   ` Marcel Apfelbaum
  2018-05-24 10:00                     ` Zihan Yang
  1 sibling, 1 reply; 46+ messages in thread
From: Marcel Apfelbaum @ 2018-05-23 17:33 UTC (permalink / raw)
  To: Zihan Yang, lersek, qemu-devel
  Cc: Igor Mammedov, alex.williamson, eauger, drjones, wei



On 05/23/2018 02:11 PM, Zihan Yang wrote:
> Hi all,
>
> Thanks for all your comments and suggestions, I wasn't expecting so 
> many professional
> reviewers. Some of the things you mentioned are beyond my knowledge 
> right now.
> Please correct me if I'm wrong below.
>
> The original purpose was just to support multiple segments in Intel
> Q35 architecture for PCIe topology, which makes bus numbers a less scarce
> resource. The patches are very primitive and many things are left for
> firmware to finish (the initial plan was to implement it in SeaBIOS);
> the AML part in QEMU is not finished either. I'm not familiar with
> OVMF or edk2, so there is no plan to touch it yet, but it seems not
> necessary since it already supports multi-segment in the end.
>
> Also, in this patch the assumption is one domain per host bridge,
> described by '_SEG()' in AML, which means an ECAM range per host bridge,
> but that should be configurable if the user prefers to stay in the same
> domain, I guess?

Yes.

>
> I'd like to list a few things you've discussed to confirm I don't get 
> it wrong
>
> * ARI enlarges the number of functions, but it does not solve the
>   hot-plug issue. Multifunction PCIe endpoints cannot span PCIe domains

Right

> * IOMMUs cannot span domains either, so bringing new domains introduces
>   the need to add a VT-d DRHD or vIOMMU per PCIe domain

Not really, you may have PCI domains not associated with a vIOMMU. As a
first step, you should not deal with it. The IOMMU can't span multiple
domains, yes.

>> * 64-bit space is crowded and there are no standards within QEMU for
>>   placing per-domain 64-bit MMIO and MMCFG ranges

Yes, but we do have some layout for the "over 4G" area and we can continue
building on it.

> * NUMA modeling seems to be a stronger motivation than the limitation
>   of 256 bus numbers, that each NUMA node holds its own PCI(e) sub-hierarchy

No, the 256 devices limitation is the biggest issue we are trying to solve.

>> * We cannot put ECAM arbitrarily high because the guest's PA width is
>>   limited by the host's when EPT is enabled.

Indeed, we should be careful that the size of the MMCFGs
does not exceed the CPU-addressable bits.
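
A sketch of the kind of sanity check this implies, purely illustrative
(mcfg_base and mcfg_size are the per-segment values from the patch;
cpu_phys_bits and segment stand in for whatever QEMU ends up using for the
guest's physical address width and the segment number):

    if (mcfg_base + mcfg_size > (1ULL << cpu_phys_bits)) {
        error_report("MCFG for segment %u does not fit the guest PA width",
                     segment);
    }
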

> * Compatibility issues in platforms that do not have an MCFG table at all
>   (perhaps we limit it to only q35 at present, in which MCFG is present).
>

For sure.

> Based on your discussions, I guess this proposal is still worth doing
> overall, but it seems many restrictions should be imposed to stay
> compatible with some complicated situations.
>

Correct.

> Please correct me if I got anything wrong or missed anything.
>

You are on the right path; this discussion is meant to help you understand
wider concerns and make you aware of different constraints we didn't think
about.

Good luck with the next version!

Thanks,
Marcel

> Thanks
> Zihan

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [Qemu-devel] [RFC 3/3] acpi-build: allocate mcfg for multiple host bridges
  2018-05-23 12:28                   ` Laszlo Ersek
  2018-05-23 17:23                     ` Marcel Apfelbaum
@ 2018-05-24  9:57                     ` Zihan Yang
  1 sibling, 0 replies; 46+ messages in thread
From: Zihan Yang @ 2018-05-24  9:57 UTC (permalink / raw)
  To: qemu-devel, lersek
  Cc: Marcel Apfelbaum, Igor Mammedov, alex.williamson, eauger, drjones, wei

> > The original purpose was just to support multiple segments in Intel
> > Q35 architecture for PCIe topology, which makes bus numbers a less scarce
> > resource. The patches are very primitive and many things are left for
> > firmware to finish (the initial plan was to implement it in SeaBIOS),
> > the AML part in QEMU is not finished either. I'm not familiar with
> > OVMF or edk2, so there is no plan to touch it yet, but it seems not
> > necessary since it already supports multi-segment in the end.
>
> That's incorrect. EDK2 stands for "EFI Development Kit II", and it is a
> collection of "universal" (= platform- and ISA-independent) modules
> (drivers and libraries), and platform- and/or ISA-dependent modules
> (drivers and libraries). The OVMF firmware is built from a subset of
> these modules; the final firmware image includes modules from both
> categories -- universal modules, and modules specific to the i440fx and
> Q35 QEMU boards. The first category generally lives under MdePkg/,
> MdeModulePkg/, UefiCpuPkg/, NetworkPkg/, PcAtChipsetPkg, etc; while the
> second category lives under OvmfPkg/.
>
> (The exact same applies to the ArmVirtQemu firmware, with the second
> category consisting of ArmVirtPkg/ and OvmfPkg/ modules.)
>
> When we discuss anything PCI-related in edk2, it usually affects both
> categories:
>
> (a) the universal/core modules, such as
>
>   - the PCI host bridge / root bridge driver at
>     "MdeModulePkg/Bus/Pci/PciHostBridgeDxe",
>
>   - the PCI bus driver at "MdeModulePkg/Bus/Pci/PciBusDxe",
>
> (b) and the platform-specific modules, such as
>
>   - "OvmfPkg/IncompatiblePciDeviceSupportDxe" which causes PciBusDxe to
>     allocate 64-bit MMIO BARs above 4 GB, regardless of option ROM
>     availability (as long as a CSM is not present), conserving 32-bit
>     MMIO aperture for 32-bit BARs,
>
>   - "OvmfPkg/PciHotPlugInitDxe", which implements support for QEMU's
>     resource reservation hints, so that we can avoid IO space exhaustion
>     with many PCIe root ports, and so that we can reserve MMIO aperture
>     for hot-plugging devices with large MMIO BARs,
>
>   - "OvmfPkg/Library/DxePciLibI440FxQ35", which is a low-level PCI
>     config space access library, usable in the DXE and later phases,
>     that plugs into several drivers, and uses 0xCF8/0xCFC on i440fx, and
>     ECAM on Q35,
>
>   - "OvmfPkg/Library/PciHostBridgeLib", which plugs into
>     "PciHostBridgeDxe" above, exposing the various resource apertures to
>     said host bridge / root bridge driver, and implementing support for
>     the PXB / PXBe devices,
>
>   - "OvmfPkg/PlatformPei", which is an early (PEI phase) module with a
>     grab-bag of platform support code; e.g. it informs
>     "DxePciLibI440FxQ35" above about the QEMU board being Q35 vs.
>     i440fx, it configures the ECAM (exbar) registers on Q35, it
>     determines where the 32-bit and 64-bit PCI MMIO apertures should be;
>
>   - "ArmVirtPkg/Library/BaseCachingPciExpressLib", which is the
>     aarch64/virt counterpart of "DxePciLibI440FxQ35" above,
>
>   - "ArmVirtPkg/Library/FdtPciHostBridgeLib", which is the aarch64/virt
>     counterpart of "PciHostBridgeLib", consuming the DTB exposed by
>     qemu-system-aarch64,
>
>   - "ArmVirtPkg/Library/FdtPciPcdProducerLib", which is an internal
>     library that turns parts of the DTB that is exposed by
>     qemu-system-aarch64 into various PCI-related, firmware-wide, scalar
>     variables (called "PCDs"), upon which both
>     "BaseCachingPciExpressLib" and "FdtPciHostBridgeLib" rely.
>
> The point is that any PCI feature in any edk2 platform firmware comes
> together from (a) core module support for the feature, and (b) platform
> integration between the core code and the QEMU board in question.
>
> If (a) is missing, that implies a very painful uphill battle, which is
> why I'd been loudly whining, initially, in this thread, until I realized
> that the core support was there in edk2, for PCIe segments.
>
> However, (b) is required as well -- i.e., platform integration under
> OvmfPkg/ and perhaps ArmVirtPkg/, between the QEMU boards and the core
> edk2 code --, and that definitely doesn't exist for the PCIe segments
> feature.
>
> If (a) exists and is flexible enough, then we at least have a chance at
> writing the platform support code (b) for it. So that's why I've stopped
> whining. Writing (b) is never easy -- in this case, a great many of the
> platform modules that I've listed above, under OvmfPkg/ pathnames, could
> be affected, or even be eligible for replacement -- but (b) is at least
> imaginable practice. Modules in category (a) are shipped *in* -- not
> "on" -- every single physical UEFI platform that you can buy today,
> which is one reason why it's hugely difficult to implement nontrivial
> changes for them.
>
> In brief: your statement is incorrect because category (b) is missing.
> And that requires dedicated QEMU support, similarly to how
> "OvmfPkg/PciHotPlugInitDxe" requires the vendor-specific resource
> reservation capability, and how "OvmfPkg/Library/PciHostBridgeLib"
> consumes the "etc/extra-pci-roots" fw_cfg file, and how most everything
> that ArmVirtQemu does for PCI(e) originates from QEMU's DTB.

Thanks for clarifying, I didn't realize the complexity in OVMF. I think at least
for now I'll only focus on SeaBIOS.

> > * NUMA modeling seems to be a stronger motivation than the limitation
> >   of 256 bus numbers, that each NUMA node holds its own PCI(e)
> >   sub-hierarchy
>
> I'd also like to get more information about this -- I thought pxb-pci(e)
> was already motivated by supporting NUMA locality. And, to my knowledge,
> pxb-pci(e) actually *solved* this problem. Am I wrong? Let's say you
> have 16 NUMA nodes (which seems pretty large to me); is it really
> insufficient to assign ~16 devices to each node?

It seems my previous statement about the motivation was incorrect;
the device number limitation is still the motivation, as Marcel points out
in a later comment. NUMA modeling just happens to fit into this situation.

Thanks
Zihan

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [Qemu-devel] [RFC 3/3] acpi-build: allocate mcfg for multiple host bridges
  2018-05-23 17:33                   ` Marcel Apfelbaum
@ 2018-05-24 10:00                     ` Zihan Yang
  0 siblings, 0 replies; 46+ messages in thread
From: Zihan Yang @ 2018-05-24 10:00 UTC (permalink / raw)
  To: qemu-devel, Marcel Apfelbaum
  Cc: Igor Mammedov, alex.williamson, eauger, drjones, wei

2018-05-24 1:33 GMT+08:00 Marcel Apfelbaum <marcel.apfelbaum@gmail.com>:
>
>> * IOMMUs cannot span domains either, so bringing new domains introduces
>> the need
>>   to add a VT-d DRHD or vIOMMU per PCIe domain
>
>
> Not really, you may have PCI domains not associated with a vIOMMU. As a first
> step,
> you should not deal with it. The IOMMU can't span multiple domains,
> yes.
>

OK, I'll leave the IOMMU part aside at present.

>> * 64-bit space is crowded and there are no standards within QEMU for
>>   placing per-domain 64-bit MMIO and MMCFG ranges
>
>
> Yes, but we do have some layout for the "over 4G" area and we can continue
> building on it.

That sounds good.

>> * NUMA modeling seems to be a stronger motivation than the limitation of
>>   256 bus numbers, that each NUMA node holds its own PCI(e) sub-hierarchy
>
>
> No, the 256 devices limitation is the biggest issue we are trying to solve.

Great, the initial purpose still holds.

> You are on the right path; this discussion is meant to help you understand
> wider concerns and make you aware of different constraints we didn't think
> about.
>
> Good luck with the next version!

It is indeed very helpful. I'll try to deal with (at least most of) the issues
in v2. Thanks for your valuable reviews!

Zihan

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [Qemu-devel] [RFC 3/3] acpi-build: allocate mcfg for multiple host bridges
  2018-05-23  7:32               ` Laszlo Ersek
  2018-05-23 11:11                 ` Zihan Yang
  2018-05-23 17:11                 ` Marcel Apfelbaum
@ 2018-05-28 11:02                 ` Laszlo Ersek
  2 siblings, 0 replies; 46+ messages in thread
From: Laszlo Ersek @ 2018-05-28 11:02 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Marcel Apfelbaum, Zihan Yang, qemu-devel, Igor Mammedov,
	Alex Williamson, Eric Auger, Drew Jones, Wei Huang

On 05/23/18 09:32, Laszlo Ersek wrote:
> On 05/23/18 01:40, Michael S. Tsirkin wrote:
>> On Wed, May 23, 2018 at 12:42:09AM +0200, Laszlo Ersek wrote:
>>> Hold on,
>>>
>>> On 05/22/18 21:51, Laszlo Ersek wrote:
>>>
>>>> It had taken years until the edk2 core gained a universal
>>>> PciHostBridgeDxe driver with a well-defined platform customization
>>>> interface, and that interface doesn't support multiple domains /
>>>> segments.
>>>
>>> after doing a bit more research: I was wrong about this. What I
>>> remembered was not the current state. Edk2 does seem to support multiple
>>> domains, with different segment numbers, ECAM base addresses, and bus
>>> number ranges. If we figure out a placement strategy or an easy to
>>> consume representation of these data for the firmware, it might be
>>> possible for OVMF to hook them into the edk2 core (although not in the
>>> earliest firmware phases, such as SEC and PEI).
>>>
>>> In retrospect, I'm honestly surprised that so much multi-segment support
>>> has been upstreamed to the edk2 core. Sorry about the FUD. (My general
>>> points remain valid, for the record... But perhaps they no longer matter
>>> for this discussion.)
>>>
>>> (I meant to send this message soon after
>>> <http://mid.mail-archive.com/fc603491-7c41-e862-a583-2bae6f165b5a@redhat.com>,
>>> but my internet connection had to die right then.)
>>>
>>> Thanks
>>> Laszlo
>>
>> Is there support for any hardware which we could emulate?
> 
> I don't see any actual hw support in the edk2 project, but I'll ask.

I got an answer in the negative.

Thanks
Laszlo

^ permalink raw reply	[flat|nested] 46+ messages in thread

end of thread, other threads:[~2018-05-28 11:02 UTC | newest]

Thread overview: 46+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-05-20  7:28 [Qemu-devel] [RFC 0/3] pci_expander_brdige: Put pxb host bridge into separate pci domain Zihan Yang
2018-05-20  7:28 ` [Qemu-devel] [RFC 1/3] pci_expander_bridge: reserve enough mcfg space for pxb host Zihan Yang
2018-05-21 11:03   ` Marcel Apfelbaum
2018-05-22  5:59     ` Zihan Yang
2018-05-22 18:47       ` Marcel Apfelbaum
2018-05-20  7:28 ` [Qemu-devel] [RFC 2/3] pci: Link pci_host_bridges with QTAILQ Zihan Yang
2018-05-21 11:05   ` Marcel Apfelbaum
2018-05-22  5:59     ` Zihan Yang
2018-05-22 18:39       ` Marcel Apfelbaum
2018-05-20  7:28 ` [Qemu-devel] [RFC 3/3] acpi-build: allocate mcfg for multiple host bridges Zihan Yang
2018-05-21 11:53   ` Marcel Apfelbaum
2018-05-22  6:03     ` Zihan Yang
2018-05-22 18:43       ` Marcel Apfelbaum
2018-05-22  9:52     ` Laszlo Ersek
2018-05-22 19:01       ` Marcel Apfelbaum
2018-05-22 19:51         ` Laszlo Ersek
2018-05-22 20:58           ` Michael S. Tsirkin
2018-05-22 21:36             ` Alex Williamson
2018-05-22 21:44               ` Michael S. Tsirkin
2018-05-22 21:47                 ` Alex Williamson
2018-05-22 22:00                   ` Laszlo Ersek
2018-05-22 23:38                   ` Michael S. Tsirkin
2018-05-23  4:28                     ` Alex Williamson
2018-05-23 14:25                       ` Michael S. Tsirkin
2018-05-23 14:57                         ` Alex Williamson
2018-05-23 15:01                           ` Michael S. Tsirkin
2018-05-23 16:50                         ` Marcel Apfelbaum
2018-05-22 21:17           ` Alex Williamson
2018-05-22 21:22             ` Michael S. Tsirkin
2018-05-22 21:58               ` Laszlo Ersek
2018-05-22 21:50             ` Laszlo Ersek
2018-05-23 17:00             ` Marcel Apfelbaum
2018-05-22 22:42           ` Laszlo Ersek
2018-05-22 23:40             ` Michael S. Tsirkin
2018-05-23  7:32               ` Laszlo Ersek
2018-05-23 11:11                 ` Zihan Yang
2018-05-23 12:28                   ` Laszlo Ersek
2018-05-23 17:23                     ` Marcel Apfelbaum
2018-05-24  9:57                     ` Zihan Yang
2018-05-23 17:33                   ` Marcel Apfelbaum
2018-05-24 10:00                     ` Zihan Yang
2018-05-23 17:11                 ` Marcel Apfelbaum
2018-05-23 17:25                   ` Laszlo Ersek
2018-05-28 11:02                 ` Laszlo Ersek
2018-05-21 15:23 ` [Qemu-devel] [RFC 0/3] pci_expander_brdige: Put pxb host bridge into separate pci domain Marcel Apfelbaum
2018-05-22  6:04   ` Zihan Yang
