All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH RFC] virtio-pci: new config layout: using memory BAR
@ 2013-05-28 16:03 Michael S. Tsirkin
  2013-05-28 17:15 ` Anthony Liguori
  2013-05-28 17:15 ` Anthony Liguori
  0 siblings, 2 replies; 109+ messages in thread
From: Michael S. Tsirkin @ 2013-05-28 16:03 UTC (permalink / raw)
  To: virtualization, Rusty Russell, kvm
  Cc: Peter Maydell, Anthony Liguori, Michael S. Tsirkin,
	Stefan Hajnoczi, Paolo Bonzini, KONRAD Frederic

This adds support for new config, and is designed to work with
the new layout code in Rusty's new layout branch.

At the moment all fields are in the same memory BAR (bar 2).
This will be used to test performance and compare
memory, io and hypercall latency.

Compiles but does not work yet.
Migration isn't handled yet.

It's not clear what do queue_enable/queue_disable
fields do, not yet implemented.

Gateway for config access with config cycles
not yet implemented.

Sending out for early review/flames.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 hw/virtio/virtio-pci.c     | 393 +++++++++++++++++++++++++++++++++++++++++++--
 hw/virtio/virtio-pci.h     |  55 +++++++
 hw/virtio/virtio.c         |  20 +++
 include/hw/virtio/virtio.h |   4 +
 4 files changed, 458 insertions(+), 14 deletions(-)

diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c
index 752991a..f4db224 100644
--- a/hw/virtio/virtio-pci.c
+++ b/hw/virtio/virtio-pci.c
@@ -259,6 +259,26 @@ static void virtio_pci_stop_ioeventfd(VirtIOPCIProxy *proxy)
     proxy->ioeventfd_started = false;
 }
 
+static void virtio_pci_set_status(VirtIOPCIProxy *proxy, uint8_t val)
+{
+    VirtIODevice *vdev = proxy->vdev;
+
+    if (!(val & VIRTIO_CONFIG_S_DRIVER_OK)) {
+        virtio_pci_stop_ioeventfd(proxy);
+    }
+
+    virtio_set_status(vdev, val & 0xFF);
+
+    if (val & VIRTIO_CONFIG_S_DRIVER_OK) {
+        virtio_pci_start_ioeventfd(proxy);
+    }
+
+    if (vdev->status == 0) {
+        virtio_reset(proxy->vdev);
+        msix_unuse_all_vectors(&proxy->pci_dev);
+    }
+}
+
 static void virtio_ioport_write(void *opaque, uint32_t addr, uint32_t val)
 {
     VirtIOPCIProxy *proxy = opaque;
@@ -293,20 +313,7 @@ static void virtio_ioport_write(void *opaque, uint32_t addr, uint32_t val)
         }
         break;
     case VIRTIO_PCI_STATUS:
-        if (!(val & VIRTIO_CONFIG_S_DRIVER_OK)) {
-            virtio_pci_stop_ioeventfd(proxy);
-        }
-
-        virtio_set_status(vdev, val & 0xFF);
-
-        if (val & VIRTIO_CONFIG_S_DRIVER_OK) {
-            virtio_pci_start_ioeventfd(proxy);
-        }
-
-        if (vdev->status == 0) {
-            virtio_reset(proxy->vdev);
-            msix_unuse_all_vectors(&proxy->pci_dev);
-        }
+        virtio_pci_set_status(proxy, val);
 
         /* Linux before 2.6.34 sets the device as OK without enabling
            the PCI device bus master bit. In this case we need to disable
@@ -455,6 +462,226 @@ static void virtio_pci_config_write(void *opaque, hwaddr addr,
     }
 }
 
+static uint64_t virtio_pci_config_common_read(void *opaque, hwaddr addr,
+                                              unsigned size)
+{
+    VirtIOPCIProxy *proxy = opaque;
+    VirtIODevice *vdev = proxy->vdev;
+
+    uint64_t low = 0xffffffffull;
+
+    switch (addr) {
+    case offsetof(struct virtio_pci_common_cfg, device_feature_select):
+        return proxy->device_feature_select;
+    case offsetof(struct virtio_pci_common_cfg, device_feature):
+        /* TODO: 64-bit features */
+	return proxy->device_feature_select ? 0 : proxy->host_features;
+    case offsetof(struct virtio_pci_common_cfg, guest_feature_select):
+        return proxy->guest_feature_select;
+    case offsetof(struct virtio_pci_common_cfg, guest_feature):
+        /* TODO: 64-bit features */
+	return proxy->guest_feature_select ? 0 : vdev->guest_features;
+    case offsetof(struct virtio_pci_common_cfg, msix_config):
+	return vdev->config_vector;
+    case offsetof(struct virtio_pci_common_cfg, num_queues):
+        /* TODO: more exact limit? */
+	return VIRTIO_PCI_QUEUE_MAX;
+    case offsetof(struct virtio_pci_common_cfg, device_status):
+        return vdev->status;
+
+	/* About a specific virtqueue. */
+    case offsetof(struct virtio_pci_common_cfg, queue_select):
+        return  vdev->queue_sel;
+    case offsetof(struct virtio_pci_common_cfg, queue_size):
+        return virtio_queue_get_num(vdev, vdev->queue_sel);
+    case offsetof(struct virtio_pci_common_cfg, queue_msix_vector):
+	return virtio_queue_vector(vdev, vdev->queue_sel);
+    case offsetof(struct virtio_pci_common_cfg, queue_enable):
+        /* TODO */
+	return 0;
+    case offsetof(struct virtio_pci_common_cfg, queue_notify_off):
+        return vdev->queue_sel;
+    case offsetof(struct virtio_pci_common_cfg, queue_desc):
+        return virtio_queue_get_desc_addr(vdev, vdev->queue_sel) & low;
+    case offsetof(struct virtio_pci_common_cfg, queue_desc) + 4:
+        return virtio_queue_get_desc_addr(vdev, vdev->queue_sel) >> 32;
+    case offsetof(struct virtio_pci_common_cfg, queue_avail):
+        return virtio_queue_get_avail_addr(vdev, vdev->queue_sel) & low;
+    case offsetof(struct virtio_pci_common_cfg, queue_avail) + 4:
+        return virtio_queue_get_avail_addr(vdev, vdev->queue_sel) >> 32;
+    case offsetof(struct virtio_pci_common_cfg, queue_used):
+        return virtio_queue_get_used_addr(vdev, vdev->queue_sel) & low;
+    case offsetof(struct virtio_pci_common_cfg, queue_used) + 4:
+        return virtio_queue_get_used_addr(vdev, vdev->queue_sel) >> 32;
+    default:
+        return 0;
+    }
+}
+
+static void virtio_pci_config_common_write(void *opaque, hwaddr addr,
+                                           uint64_t val, unsigned size)
+{
+    VirtIOPCIProxy *proxy = opaque;
+    VirtIODevice *vdev = proxy->vdev;
+
+    uint64_t low = 0xffffffffull;
+    uint64_t high = ~low;
+
+    switch (addr) {
+    case offsetof(struct virtio_pci_common_cfg, device_feature_select):
+        proxy->device_feature_select = val;
+        break;
+    case offsetof(struct virtio_pci_common_cfg, device_feature):
+        break;
+    case offsetof(struct virtio_pci_common_cfg, guest_feature_select):
+        proxy->guest_feature_select = val;
+        break;
+    case offsetof(struct virtio_pci_common_cfg, guest_feature):
+        /* TODO: 64-bit features */
+	if (!proxy->guest_feature_select) {
+            virtio_set_features(vdev, val);
+        }
+        break;
+    case offsetof(struct virtio_pci_common_cfg, msix_config):
+	vdev->config_vector = val;
+        break;
+    case offsetof(struct virtio_pci_common_cfg, num_queues):
+        break;
+    case offsetof(struct virtio_pci_common_cfg, device_status):
+        virtio_pci_set_status(proxy, val);
+        break;
+	/* About a specific virtqueue. */
+    case offsetof(struct virtio_pci_common_cfg, queue_select):
+        assert(val < VIRTIO_PCI_QUEUE_MAX);
+        vdev->queue_sel = val;
+        break;
+    case offsetof(struct virtio_pci_common_cfg, queue_size):
+        assert(val && val < 0x8ffff && !(val & (val - 1)));
+        virtio_queue_set_num(vdev, vdev->queue_sel, val);
+        break;
+    case offsetof(struct virtio_pci_common_cfg, queue_msix_vector):
+	virtio_queue_set_vector(vdev, vdev->queue_sel, val);
+        break;
+    case offsetof(struct virtio_pci_common_cfg, queue_enable):
+        /* TODO */
+	break;
+    case offsetof(struct virtio_pci_common_cfg, queue_notify_off):
+        break;
+    case offsetof(struct virtio_pci_common_cfg, queue_desc):
+        val &= low;
+        val |= virtio_queue_get_desc_addr(vdev, vdev->queue_sel) & high;
+        virtio_queue_set_desc_addr(vdev, vdev->queue_sel, val);
+        break;
+    case offsetof(struct virtio_pci_common_cfg, queue_desc) + 4:
+        val = val << 32;
+        val |= virtio_queue_get_desc_addr(vdev, vdev->queue_sel) & low;
+        virtio_queue_set_desc_addr(vdev, vdev->queue_sel, val);
+        break;
+    case offsetof(struct virtio_pci_common_cfg, queue_avail):
+        val &= low;
+        val |= virtio_queue_get_avail_addr(vdev, vdev->queue_sel) & high;
+        virtio_queue_set_avail_addr(vdev, vdev->queue_sel, val);
+        break;
+    case offsetof(struct virtio_pci_common_cfg, queue_avail) + 4:
+        val = val << 32;
+        val |= virtio_queue_get_avail_addr(vdev, vdev->queue_sel) & low;
+        virtio_queue_set_avail_addr(vdev, vdev->queue_sel, val);
+        break;
+    case offsetof(struct virtio_pci_common_cfg, queue_used):
+        val &= low;
+        val |= virtio_queue_get_used_addr(vdev, vdev->queue_sel) & high;
+        virtio_queue_set_used_addr(vdev, vdev->queue_sel, val);
+        break;
+    case offsetof(struct virtio_pci_common_cfg, queue_used) + 4:
+        val = val << 32;
+        val |= virtio_queue_get_used_addr(vdev, vdev->queue_sel) & low;
+        virtio_queue_set_used_addr(vdev, vdev->queue_sel, val);
+        break;
+    default:
+        break;
+    }
+}
+
+static uint64_t virtio_pci_config_isr_read(void *opaque, hwaddr addr,
+                                           unsigned size)
+{
+    VirtIOPCIProxy *proxy = opaque;
+    VirtIODevice *vdev = proxy->vdev;
+    uint8_t ret;
+
+    /* reading from the ISR also clears it. */
+    ret = vdev->isr;
+    vdev->isr = 0;
+    qemu_set_irq(proxy->pci_dev.irq[0], 0);
+    return ret;
+}
+
+static void virtio_pci_config_isr_write(void *opaque, hwaddr addr,
+                                           uint64_t val, unsigned size)
+{
+}
+
+static uint64_t virtio_pci_config_notify_read(void *opaque, hwaddr addr,
+                                              unsigned size)
+{
+    return 0;
+}
+
+static void virtio_pci_config_notify_write(void *opaque, hwaddr addr,
+                                           uint64_t val, unsigned size)
+{
+    VirtIOPCIProxy *proxy = opaque;
+    VirtIODevice *vdev = proxy->vdev;
+    if (val < VIRTIO_PCI_QUEUE_MAX) {
+        virtio_queue_notify(vdev, val);
+    }
+}
+
+static uint64_t virtio_pci_config_device_read(void *opaque, hwaddr addr,
+                                              unsigned size)
+{
+    VirtIOPCIProxy *proxy = opaque;
+    uint64_t val = 0;
+    switch (size) {
+    case 1:
+        val = virtio_config_readb(proxy->vdev, addr);
+        break;
+    case 2:
+        val = virtio_config_readw(proxy->vdev, addr);
+        break;
+    case 4:
+        /* Most devices don't have 64 bit config fields.
+         * Block is an exception: first 8 bytes include
+         * a 64 bit capacity field.
+         */
+        if (VIRTIO_HOST_IS_BIG_ENDIAN &&
+            proxy->vdev->device_id == VIRTIO_ID_BLOCK && addr < 8) {
+            /* Swap first two words */
+            addr ^= 0x4;
+        }
+        val = virtio_config_readl(proxy->vdev, addr);
+        break;
+    }
+    return val;
+}
+
+static void virtio_pci_config_device_write(void *opaque, hwaddr addr,
+                                           uint64_t val, unsigned size)
+{
+    VirtIOPCIProxy *proxy = opaque;
+    switch (size) {
+    case 1:
+        virtio_config_writeb(proxy->vdev, addr, val);
+        break;
+    case 2:
+        virtio_config_writew(proxy->vdev, addr, val);
+        break;
+    case 4:
+        virtio_config_writel(proxy->vdev, addr, val);
+        break;
+    }
+}
+
 static const MemoryRegionOps virtio_pci_config_ops = {
     .read = virtio_pci_config_read,
     .write = virtio_pci_config_write,
@@ -465,6 +692,46 @@ static const MemoryRegionOps virtio_pci_config_ops = {
     .endianness = DEVICE_LITTLE_ENDIAN,
 };
 
+static const MemoryRegionOps virtio_pci_config_common_ops = {
+    .read = virtio_pci_config_common_read,
+    .write = virtio_pci_config_common_write,
+    .impl = {
+        .min_access_size = 1,
+        .max_access_size = 4,
+    },
+    .endianness = DEVICE_LITTLE_ENDIAN,
+};
+
+static const MemoryRegionOps virtio_pci_config_isr_ops = {
+    .read = virtio_pci_config_isr_read,
+    .write = virtio_pci_config_isr_write,
+    .impl = {
+        .min_access_size = 1,
+        .max_access_size = 1,
+    },
+    .endianness = DEVICE_LITTLE_ENDIAN,
+};
+
+static const MemoryRegionOps virtio_pci_config_notify_ops = {
+    .read = virtio_pci_config_notify_read,
+    .write = virtio_pci_config_notify_write,
+    .impl = {
+        .min_access_size = 2,
+        .max_access_size = 2,
+    },
+    .endianness = DEVICE_LITTLE_ENDIAN,
+};
+
+static const MemoryRegionOps virtio_pci_config_device_ops = {
+    .read = virtio_pci_config_device_read,
+    .write = virtio_pci_config_device_write,
+    .impl = {
+        .min_access_size = 1,
+        .max_access_size = 4,
+    },
+    .endianness = DEVICE_LITTLE_ENDIAN,
+};
+
 static void virtio_write_config(PCIDevice *pci_dev, uint32_t address,
                                 uint32_t val, int len)
 {
@@ -949,11 +1216,49 @@ static const TypeInfo virtio_9p_pci_info = {
  * virtio-pci: This is the PCIDevice which has a virtio-pci-bus.
  */
 
+#define VIRTIO_PCI_CONFIG_BAR_NUM 2
+
+static void *virtio_pci_cap_add(VirtIOPCIProxy *proxy,
+                                const char *name,
+                                MemoryRegion *reg,
+                                const MemoryRegionOps *ops,
+                                unsigned offset,
+                                unsigned size,
+                                unsigned cap_type,
+                                unsigned cap_length)
+{
+    struct virtio_pci_cap *cap;
+    int c;
+    memory_region_init_io(reg, ops, proxy, name, size);
+    memory_region_add_subregion(&proxy->config_bar, offset, reg);
+
+    if (!cap_length) {
+        cap_length = sizeof *cap;
+    }
+    c = pci_add_capability(&proxy->pci_dev, PCI_CAP_ID_VNDR, 0, cap_length);
+    assert(c > 0);
+    cap = (void *)proxy->pci_dev.config + c;
+    cap->cap_length = cap_length;
+    cap->cfg_type = cap_type;
+    cap->bar = VIRTIO_PCI_CONFIG_BAR_NUM;
+    cap->offset = offset;
+    cap->length = size;
+
+    return cap;
+}
+
+static void virtio_pci_cap_del(VirtIOPCIProxy *proxy, MemoryRegion *reg)
+{
+    memory_region_del_subregion(&proxy->config_bar, reg);
+    memory_region_destroy(reg);
+}
+
 /* This is called by virtio-bus just after the device is plugged. */
 static void virtio_pci_device_plugged(DeviceState *d)
 {
     VirtIOPCIProxy *proxy = VIRTIO_PCI(d);
     VirtioBusState *bus = &proxy->bus;
+    struct virtio_pci_notify_cap *notify_cap;
     uint8_t *config;
     uint32_t size;
 
@@ -986,6 +1291,59 @@ static void virtio_pci_device_plugged(DeviceState *d)
     pci_register_bar(&proxy->pci_dev, 0, PCI_BASE_ADDRESS_SPACE_IO,
                      &proxy->bar);
 
+    /* New config layout */
+    /* Four capabilities: common, notify, isr, device cfg */
+#define VIRTIO_PCI_CONFIG_BAR_SIZE 0x1000
+#define VIRTIO_PCI_CONFIG_COMMON 0x0
+#define VIRTIO_PCI_CONFIG_NOTIFY 0x100
+#define VIRTIO_PCI_CONFIG_ISR 0x200
+#define VIRTIO_PCI_CONFIG_DEVICE 0x300
+
+    /* Make sure we left enough space for all vqs */
+    assert(VIRTIO_PCI_CONFIG_ISR - VIRTIO_PCI_CONFIG_NOTIFY >=
+           VIRTIO_PCI_QUEUE_MAX * 2);
+    assert(VIRTIO_PCI_CONFIG_BAR_SIZE - VIRTIO_PCI_CONFIG_DEVICE >=
+           proxy->vdev->config_len);
+
+    memory_region_init(&proxy->config_bar, "virtio-pci-config",
+                       VIRTIO_PCI_CONFIG_BAR_SIZE);
+
+    virtio_pci_cap_add(proxy, "virtio-pci-common", 
+                       &proxy->config_common,
+                       &virtio_pci_config_common_ops,
+                       VIRTIO_PCI_CONFIG_COMMON,
+                       sizeof(struct virtio_pci_common_cfg),
+                       VIRTIO_PCI_CAP_COMMON_CFG,
+                       0);
+    notify_cap = virtio_pci_cap_add(proxy, "virtio-pci-notify", 
+                                    &proxy->config_notify,
+                                    &virtio_pci_config_notify_ops,
+                                    VIRTIO_PCI_CONFIG_NOTIFY,
+                                    VIRTIO_PCI_QUEUE_MAX * 2,
+                                    VIRTIO_PCI_CAP_NOTIFY_CFG,
+                                    sizeof *notify_cap);
+    notify_cap->notify_off_multiplier = 0x2;
+    virtio_pci_cap_add(proxy, "virtio-pci-isr", 
+                       &proxy->config_isr,
+                       &virtio_pci_config_isr_ops,
+                       VIRTIO_PCI_CONFIG_ISR,
+                       0x1,
+                       VIRTIO_PCI_CAP_ISR_CFG,
+                       0);
+    if (proxy->vdev->config_len) {
+        virtio_pci_cap_add(proxy, "virtio-pci-device", 
+                           &proxy->config_device,
+                           &virtio_pci_config_device_ops,
+                           VIRTIO_PCI_CONFIG_DEVICE,
+                           proxy->vdev->config_len,
+                           VIRTIO_PCI_CAP_DEVICE_CFG,
+                           0);
+    }
+
+    pci_register_bar(&proxy->pci_dev, VIRTIO_PCI_CONFIG_BAR_NUM,
+                     PCI_BASE_ADDRESS_SPACE_MEMORY,
+                     &proxy->config_bar);
+
     if (!kvm_has_many_ioeventfds()) {
         proxy->flags &= ~VIRTIO_PCI_FLAG_USE_IOEVENTFD;
     }
@@ -1012,6 +1370,13 @@ static void virtio_pci_exit(PCIDevice *pci_dev)
     VirtIOPCIProxy *proxy = VIRTIO_PCI(pci_dev);
     virtio_pci_stop_ioeventfd(proxy);
     memory_region_destroy(&proxy->bar);
+    virtio_pci_cap_del(proxy, &proxy->config_common);
+    virtio_pci_cap_del(proxy, &proxy->config_notify);
+    virtio_pci_cap_del(proxy, &proxy->config_isr);
+    if (&proxy->vdev->config_len) {
+        virtio_pci_cap_del(proxy, &proxy->config_device);
+    }
+    memory_region_destroy(&proxy->config_bar);
     msix_uninit_exclusive_bar(pci_dev);
 }
 
diff --git a/hw/virtio/virtio-pci.h b/hw/virtio/virtio-pci.h
index 917bcc5..01005e7 100644
--- a/hw/virtio/virtio-pci.h
+++ b/hw/virtio/virtio-pci.h
@@ -84,6 +84,11 @@ struct VirtIOPCIProxy {
     PCIDevice pci_dev;
     VirtIODevice *vdev;
     MemoryRegion bar;
+    MemoryRegion config_bar;
+    MemoryRegion config_common;
+    MemoryRegion config_isr;
+    MemoryRegion config_notify;
+    MemoryRegion config_device;
     uint32_t flags;
     uint32_t class_code;
     uint32_t nvectors;
@@ -93,6 +98,8 @@ struct VirtIOPCIProxy {
     VirtIOIRQFD *vector_irqfd;
     int nvqs_with_notifiers;
     VirtioBusState bus;
+    unsigned device_feature_select;
+    unsigned guest_feature_select;
 };
 
 
@@ -203,4 +210,52 @@ struct VirtIORngPCI {
 /* Virtio ABI version, if we increment this, we break the guest driver. */
 #define VIRTIO_PCI_ABI_VERSION          0
 
+/* IDs for different capabilities.  Must all exist. */
+/* Common configuration */
+#define VIRTIO_PCI_CAP_COMMON_CFG	1
+/* Notifications */
+#define VIRTIO_PCI_CAP_NOTIFY_CFG	2
+/* ISR access */
+#define VIRTIO_PCI_CAP_ISR_CFG		3
+/* Device specific confiuration */
+#define VIRTIO_PCI_CAP_DEVICE_CFG	4
+
+/* This is the PCI capability header: */
+struct virtio_pci_cap {
+	uint8_t cap_vndr;	/* Generic PCI field: PCI_CAP_ID_VNDR */
+	uint8_t cap_next;	/* Generic PCI field: next ptr. */
+	uint8_t cap_length;	/* Generic PCI field: capability length. */
+	uint8_t cfg_type;	/* One of the VIRTIO_PCI_CAP_*_CFG. */
+	uint8_t bar;		/* Where to find it. */
+	uint32_t offset;	/* Offset within bar. */
+	uint32_t length;	/* Length. */
+};
+
+struct virtio_pci_notify_cap {
+	struct virtio_pci_cap cap;
+	uint32_t notify_off_multiplier;	/* Multiplier for queue_notify_off. */
+};
+
+/* Fields in VIRTIO_PCI_CAP_COMMON_CFG: */
+struct virtio_pci_common_cfg {
+	/* About the whole device. */
+	uint32_t device_feature_select;	/* read-write */
+	uint32_t device_feature;		/* read-only */
+	uint32_t guest_feature_select;	/* read-write */
+	uint32_t guest_feature;		/* read-only */
+	uint16_t msix_config;		/* read-write */
+	uint16_t num_queues;		/* read-only */
+	uint8_t device_status;		/* read-write */
+	uint8_t unused1;
+
+	/* About a specific virtqueue. */
+	uint16_t queue_select;		/* read-write */
+	uint16_t queue_size;		/* read-write, power of 2. */
+	uint16_t queue_msix_vector;	/* read-write */
+	uint16_t queue_enable;		/* read-write */
+	uint16_t queue_notify_off;	/* read-only */
+	uint64_t queue_desc;		/* read-write */
+	uint64_t queue_avail;		/* read-write */
+	uint64_t queue_used;		/* read-write */
+};
 #endif
diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
index 8176c14..e54d692 100644
--- a/hw/virtio/virtio.c
+++ b/hw/virtio/virtio.c
@@ -672,6 +672,11 @@ int virtio_queue_get_num(VirtIODevice *vdev, int n)
     return vdev->vq[n].vring.num;
 }
 
+void virtio_queue_set_num(VirtIODevice *vdev, int n, int num)
+{
+    vdev->vq[n].vring.num = num;
+}
+
 int virtio_queue_get_id(VirtQueue *vq)
 {
     VirtIODevice *vdev = vq->vdev;
@@ -987,6 +992,21 @@ hwaddr virtio_queue_get_used_addr(VirtIODevice *vdev, int n)
     return vdev->vq[n].vring.used;
 }
 
+void virtio_queue_set_desc_addr(VirtIODevice *vdev, int n, hwaddr addr)
+{
+    vdev->vq[n].vring.desc = addr;
+}
+
+void virtio_queue_set_avail_addr(VirtIODevice *vdev, int n, hwaddr addr)
+{
+    vdev->vq[n].vring.avail = addr;
+}
+
+void virtio_queue_set_used_addr(VirtIODevice *vdev, int n, hwaddr addr)
+{
+    vdev->vq[n].vring.used = addr;
+}
+
 hwaddr virtio_queue_get_ring_addr(VirtIODevice *vdev, int n)
 {
     return vdev->vq[n].vring.desc;
diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h
index a6c5c53..d056d58 100644
--- a/include/hw/virtio/virtio.h
+++ b/include/hw/virtio/virtio.h
@@ -199,6 +199,7 @@ void virtio_config_writel(VirtIODevice *vdev, uint32_t addr, uint32_t data);
 void virtio_queue_set_addr(VirtIODevice *vdev, int n, hwaddr addr);
 hwaddr virtio_queue_get_addr(VirtIODevice *vdev, int n);
 int virtio_queue_get_num(VirtIODevice *vdev, int n);
+void virtio_queue_set_num(VirtIODevice *vdev, int n, int num);
 void virtio_queue_notify(VirtIODevice *vdev, int n);
 uint16_t virtio_queue_vector(VirtIODevice *vdev, int n);
 void virtio_queue_set_vector(VirtIODevice *vdev, int n, uint16_t vector);
@@ -226,6 +227,9 @@ typedef struct VirtIORNGConf VirtIORNGConf;
 hwaddr virtio_queue_get_desc_addr(VirtIODevice *vdev, int n);
 hwaddr virtio_queue_get_avail_addr(VirtIODevice *vdev, int n);
 hwaddr virtio_queue_get_used_addr(VirtIODevice *vdev, int n);
+void virtio_queue_set_desc_addr(VirtIODevice *vdev, int n, hwaddr addr);
+void virtio_queue_set_avail_addr(VirtIODevice *vdev, int n, hwaddr addr);
+void virtio_queue_set_used_addr(VirtIODevice *vdev, int n, hwaddr addr);
 hwaddr virtio_queue_get_ring_addr(VirtIODevice *vdev, int n);
 hwaddr virtio_queue_get_desc_size(VirtIODevice *vdev, int n);
 hwaddr virtio_queue_get_avail_size(VirtIODevice *vdev, int n);
-- 
MST

^ permalink raw reply related	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-05-28 16:03 [PATCH RFC] virtio-pci: new config layout: using memory BAR Michael S. Tsirkin
  2013-05-28 17:15 ` Anthony Liguori
@ 2013-05-28 17:15 ` Anthony Liguori
  2013-05-28 17:32   ` Michael S. Tsirkin
  2013-05-29  4:31     ` Rusty Russell
  1 sibling, 2 replies; 109+ messages in thread
From: Anthony Liguori @ 2013-05-28 17:15 UTC (permalink / raw)
  To: Michael S. Tsirkin, virtualization, Rusty Russell, kvm
  Cc: KONRAD Frederic, Michael S. Tsirkin, Paolo Bonzini,
	Peter Maydell, Stefan Hajnoczi

"Michael S. Tsirkin" <mst@redhat.com> writes:

> This adds support for new config, and is designed to work with
> the new layout code in Rusty's new layout branch.
>
> At the moment all fields are in the same memory BAR (bar 2).
> This will be used to test performance and compare
> memory, io and hypercall latency.
>
> Compiles but does not work yet.
> Migration isn't handled yet.
>
> It's not clear what do queue_enable/queue_disable
> fields do, not yet implemented.
>
> Gateway for config access with config cycles
> not yet implemented.
>
> Sending out for early review/flames.
>
> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> ---
>  hw/virtio/virtio-pci.c     | 393 +++++++++++++++++++++++++++++++++++++++++++--
>  hw/virtio/virtio-pci.h     |  55 +++++++
>  hw/virtio/virtio.c         |  20 +++
>  include/hw/virtio/virtio.h |   4 +
>  4 files changed, 458 insertions(+), 14 deletions(-)
>
> diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c
> index 752991a..f4db224 100644
> --- a/hw/virtio/virtio-pci.c
> +++ b/hw/virtio/virtio-pci.c
> @@ -259,6 +259,26 @@ static void virtio_pci_stop_ioeventfd(VirtIOPCIProxy *proxy)
>      proxy->ioeventfd_started = false;
>  }
>  
> +static void virtio_pci_set_status(VirtIOPCIProxy *proxy, uint8_t val)
> +{
> +    VirtIODevice *vdev = proxy->vdev;
> +
> +    if (!(val & VIRTIO_CONFIG_S_DRIVER_OK)) {
> +        virtio_pci_stop_ioeventfd(proxy);
> +    }
> +
> +    virtio_set_status(vdev, val & 0xFF);
> +
> +    if (val & VIRTIO_CONFIG_S_DRIVER_OK) {
> +        virtio_pci_start_ioeventfd(proxy);
> +    }
> +
> +    if (vdev->status == 0) {
> +        virtio_reset(proxy->vdev);
> +        msix_unuse_all_vectors(&proxy->pci_dev);
> +    }
> +}
> +
>  static void virtio_ioport_write(void *opaque, uint32_t addr, uint32_t val)
>  {
>      VirtIOPCIProxy *proxy = opaque;
> @@ -293,20 +313,7 @@ static void virtio_ioport_write(void *opaque, uint32_t addr, uint32_t val)
>          }
>          break;
>      case VIRTIO_PCI_STATUS:
> -        if (!(val & VIRTIO_CONFIG_S_DRIVER_OK)) {
> -            virtio_pci_stop_ioeventfd(proxy);
> -        }
> -
> -        virtio_set_status(vdev, val & 0xFF);
> -
> -        if (val & VIRTIO_CONFIG_S_DRIVER_OK) {
> -            virtio_pci_start_ioeventfd(proxy);
> -        }
> -
> -        if (vdev->status == 0) {
> -            virtio_reset(proxy->vdev);
> -            msix_unuse_all_vectors(&proxy->pci_dev);
> -        }
> +        virtio_pci_set_status(proxy, val);
>  
>          /* Linux before 2.6.34 sets the device as OK without enabling
>             the PCI device bus master bit. In this case we need to disable
> @@ -455,6 +462,226 @@ static void virtio_pci_config_write(void *opaque, hwaddr addr,
>      }
>  }
>  
> +static uint64_t virtio_pci_config_common_read(void *opaque, hwaddr addr,
> +                                              unsigned size)
> +{
> +    VirtIOPCIProxy *proxy = opaque;
> +    VirtIODevice *vdev = proxy->vdev;
> +
> +    uint64_t low = 0xffffffffull;
> +
> +    switch (addr) {
> +    case offsetof(struct virtio_pci_common_cfg, device_feature_select):
> +        return proxy->device_feature_select;

Oh dear no...  Please use defines like the rest of QEMU.

>From a QEMU pov, take a look at:

https://github.com/aliguori/qemu/commit/587c35c1a3fe90f6af0f97927047ef4d3182a659

And:

https://github.com/aliguori/qemu/commit/01ba80a23cf2eb1e15056f82b44b94ec381565cb

Which lets virtio-pci be subclassable and then remaps the config space to
BAR2.

Haven't looked at the proposed new ring layout yet.

Regards,

Anthony Liguori

> +    case offsetof(struct virtio_pci_common_cfg, device_feature):
> +        /* TODO: 64-bit features */
> +	return proxy->device_feature_select ? 0 : proxy->host_features;
> +    case offsetof(struct virtio_pci_common_cfg, guest_feature_select):
> +        return proxy->guest_feature_select;
> +    case offsetof(struct virtio_pci_common_cfg, guest_feature):
> +        /* TODO: 64-bit features */
> +	return proxy->guest_feature_select ? 0 : vdev->guest_features;
> +    case offsetof(struct virtio_pci_common_cfg, msix_config):
> +	return vdev->config_vector;
> +    case offsetof(struct virtio_pci_common_cfg, num_queues):
> +        /* TODO: more exact limit? */
> +	return VIRTIO_PCI_QUEUE_MAX;
> +    case offsetof(struct virtio_pci_common_cfg, device_status):
> +        return vdev->status;
> +
> +	/* About a specific virtqueue. */
> +    case offsetof(struct virtio_pci_common_cfg, queue_select):
> +        return  vdev->queue_sel;
> +    case offsetof(struct virtio_pci_common_cfg, queue_size):
> +        return virtio_queue_get_num(vdev, vdev->queue_sel);
> +    case offsetof(struct virtio_pci_common_cfg, queue_msix_vector):
> +	return virtio_queue_vector(vdev, vdev->queue_sel);
> +    case offsetof(struct virtio_pci_common_cfg, queue_enable):
> +        /* TODO */
> +	return 0;
> +    case offsetof(struct virtio_pci_common_cfg, queue_notify_off):
> +        return vdev->queue_sel;
> +    case offsetof(struct virtio_pci_common_cfg, queue_desc):
> +        return virtio_queue_get_desc_addr(vdev, vdev->queue_sel) & low;
> +    case offsetof(struct virtio_pci_common_cfg, queue_desc) + 4:
> +        return virtio_queue_get_desc_addr(vdev, vdev->queue_sel) >> 32;
> +    case offsetof(struct virtio_pci_common_cfg, queue_avail):
> +        return virtio_queue_get_avail_addr(vdev, vdev->queue_sel) & low;
> +    case offsetof(struct virtio_pci_common_cfg, queue_avail) + 4:
> +        return virtio_queue_get_avail_addr(vdev, vdev->queue_sel) >> 32;
> +    case offsetof(struct virtio_pci_common_cfg, queue_used):
> +        return virtio_queue_get_used_addr(vdev, vdev->queue_sel) & low;
> +    case offsetof(struct virtio_pci_common_cfg, queue_used) + 4:
> +        return virtio_queue_get_used_addr(vdev, vdev->queue_sel) >> 32;
> +    default:
> +        return 0;
> +    }
> +}
> +
> +static void virtio_pci_config_common_write(void *opaque, hwaddr addr,
> +                                           uint64_t val, unsigned size)
> +{
> +    VirtIOPCIProxy *proxy = opaque;
> +    VirtIODevice *vdev = proxy->vdev;
> +
> +    uint64_t low = 0xffffffffull;
> +    uint64_t high = ~low;
> +
> +    switch (addr) {
> +    case offsetof(struct virtio_pci_common_cfg, device_feature_select):
> +        proxy->device_feature_select = val;
> +        break;
> +    case offsetof(struct virtio_pci_common_cfg, device_feature):
> +        break;
> +    case offsetof(struct virtio_pci_common_cfg, guest_feature_select):
> +        proxy->guest_feature_select = val;
> +        break;
> +    case offsetof(struct virtio_pci_common_cfg, guest_feature):
> +        /* TODO: 64-bit features */
> +	if (!proxy->guest_feature_select) {
> +            virtio_set_features(vdev, val);
> +        }
> +        break;
> +    case offsetof(struct virtio_pci_common_cfg, msix_config):
> +	vdev->config_vector = val;
> +        break;
> +    case offsetof(struct virtio_pci_common_cfg, num_queues):
> +        break;
> +    case offsetof(struct virtio_pci_common_cfg, device_status):
> +        virtio_pci_set_status(proxy, val);
> +        break;
> +	/* About a specific virtqueue. */
> +    case offsetof(struct virtio_pci_common_cfg, queue_select):
> +        assert(val < VIRTIO_PCI_QUEUE_MAX);
> +        vdev->queue_sel = val;
> +        break;
> +    case offsetof(struct virtio_pci_common_cfg, queue_size):
> +        assert(val && val < 0x8ffff && !(val & (val - 1)));
> +        virtio_queue_set_num(vdev, vdev->queue_sel, val);
> +        break;
> +    case offsetof(struct virtio_pci_common_cfg, queue_msix_vector):
> +	virtio_queue_set_vector(vdev, vdev->queue_sel, val);
> +        break;
> +    case offsetof(struct virtio_pci_common_cfg, queue_enable):
> +        /* TODO */
> +	break;
> +    case offsetof(struct virtio_pci_common_cfg, queue_notify_off):
> +        break;
> +    case offsetof(struct virtio_pci_common_cfg, queue_desc):
> +        val &= low;
> +        val |= virtio_queue_get_desc_addr(vdev, vdev->queue_sel) & high;
> +        virtio_queue_set_desc_addr(vdev, vdev->queue_sel, val);
> +        break;
> +    case offsetof(struct virtio_pci_common_cfg, queue_desc) + 4:
> +        val = val << 32;
> +        val |= virtio_queue_get_desc_addr(vdev, vdev->queue_sel) & low;
> +        virtio_queue_set_desc_addr(vdev, vdev->queue_sel, val);
> +        break;
> +    case offsetof(struct virtio_pci_common_cfg, queue_avail):
> +        val &= low;
> +        val |= virtio_queue_get_avail_addr(vdev, vdev->queue_sel) & high;
> +        virtio_queue_set_avail_addr(vdev, vdev->queue_sel, val);
> +        break;
> +    case offsetof(struct virtio_pci_common_cfg, queue_avail) + 4:
> +        val = val << 32;
> +        val |= virtio_queue_get_avail_addr(vdev, vdev->queue_sel) & low;
> +        virtio_queue_set_avail_addr(vdev, vdev->queue_sel, val);
> +        break;
> +    case offsetof(struct virtio_pci_common_cfg, queue_used):
> +        val &= low;
> +        val |= virtio_queue_get_used_addr(vdev, vdev->queue_sel) & high;
> +        virtio_queue_set_used_addr(vdev, vdev->queue_sel, val);
> +        break;
> +    case offsetof(struct virtio_pci_common_cfg, queue_used) + 4:
> +        val = val << 32;
> +        val |= virtio_queue_get_used_addr(vdev, vdev->queue_sel) & low;
> +        virtio_queue_set_used_addr(vdev, vdev->queue_sel, val);
> +        break;
> +    default:
> +        break;
> +    }
> +}
> +
> +static uint64_t virtio_pci_config_isr_read(void *opaque, hwaddr addr,
> +                                           unsigned size)
> +{
> +    VirtIOPCIProxy *proxy = opaque;
> +    VirtIODevice *vdev = proxy->vdev;
> +    uint8_t ret;
> +
> +    /* reading from the ISR also clears it. */
> +    ret = vdev->isr;
> +    vdev->isr = 0;
> +    qemu_set_irq(proxy->pci_dev.irq[0], 0);
> +    return ret;
> +}
> +
> +static void virtio_pci_config_isr_write(void *opaque, hwaddr addr,
> +                                           uint64_t val, unsigned size)
> +{
> +}
> +
> +static uint64_t virtio_pci_config_notify_read(void *opaque, hwaddr addr,
> +                                              unsigned size)
> +{
> +    return 0;
> +}
> +
> +static void virtio_pci_config_notify_write(void *opaque, hwaddr addr,
> +                                           uint64_t val, unsigned size)
> +{
> +    VirtIOPCIProxy *proxy = opaque;
> +    VirtIODevice *vdev = proxy->vdev;
> +    if (val < VIRTIO_PCI_QUEUE_MAX) {
> +        virtio_queue_notify(vdev, val);
> +    }
> +}
> +
> +static uint64_t virtio_pci_config_device_read(void *opaque, hwaddr addr,
> +                                              unsigned size)
> +{
> +    VirtIOPCIProxy *proxy = opaque;
> +    uint64_t val = 0;
> +    switch (size) {
> +    case 1:
> +        val = virtio_config_readb(proxy->vdev, addr);
> +        break;
> +    case 2:
> +        val = virtio_config_readw(proxy->vdev, addr);
> +        break;
> +    case 4:
> +        /* Most devices don't have 64 bit config fields.
> +         * Block is an exception: first 8 bytes include
> +         * a 64 bit capacity field.
> +         */
> +        if (VIRTIO_HOST_IS_BIG_ENDIAN &&
> +            proxy->vdev->device_id == VIRTIO_ID_BLOCK && addr < 8) {
> +            /* Swap first two words */
> +            addr ^= 0x4;
> +        }
> +        val = virtio_config_readl(proxy->vdev, addr);
> +        break;
> +    }
> +    return val;
> +}
> +
> +static void virtio_pci_config_device_write(void *opaque, hwaddr addr,
> +                                           uint64_t val, unsigned size)
> +{
> +    VirtIOPCIProxy *proxy = opaque;
> +    switch (size) {
> +    case 1:
> +        virtio_config_writeb(proxy->vdev, addr, val);
> +        break;
> +    case 2:
> +        virtio_config_writew(proxy->vdev, addr, val);
> +        break;
> +    case 4:
> +        virtio_config_writel(proxy->vdev, addr, val);
> +        break;
> +    }
> +}
> +
>  static const MemoryRegionOps virtio_pci_config_ops = {
>      .read = virtio_pci_config_read,
>      .write = virtio_pci_config_write,
> @@ -465,6 +692,46 @@ static const MemoryRegionOps virtio_pci_config_ops = {
>      .endianness = DEVICE_LITTLE_ENDIAN,
>  };
>  
> +static const MemoryRegionOps virtio_pci_config_common_ops = {
> +    .read = virtio_pci_config_common_read,
> +    .write = virtio_pci_config_common_write,
> +    .impl = {
> +        .min_access_size = 1,
> +        .max_access_size = 4,
> +    },
> +    .endianness = DEVICE_LITTLE_ENDIAN,
> +};
> +
> +static const MemoryRegionOps virtio_pci_config_isr_ops = {
> +    .read = virtio_pci_config_isr_read,
> +    .write = virtio_pci_config_isr_write,
> +    .impl = {
> +        .min_access_size = 1,
> +        .max_access_size = 1,
> +    },
> +    .endianness = DEVICE_LITTLE_ENDIAN,
> +};
> +
> +static const MemoryRegionOps virtio_pci_config_notify_ops = {
> +    .read = virtio_pci_config_notify_read,
> +    .write = virtio_pci_config_notify_write,
> +    .impl = {
> +        .min_access_size = 2,
> +        .max_access_size = 2,
> +    },
> +    .endianness = DEVICE_LITTLE_ENDIAN,
> +};
> +
> +static const MemoryRegionOps virtio_pci_config_device_ops = {
> +    .read = virtio_pci_config_device_read,
> +    .write = virtio_pci_config_device_write,
> +    .impl = {
> +        .min_access_size = 1,
> +        .max_access_size = 4,
> +    },
> +    .endianness = DEVICE_LITTLE_ENDIAN,
> +};
> +
>  static void virtio_write_config(PCIDevice *pci_dev, uint32_t address,
>                                  uint32_t val, int len)
>  {
> @@ -949,11 +1216,49 @@ static const TypeInfo virtio_9p_pci_info = {
>   * virtio-pci: This is the PCIDevice which has a virtio-pci-bus.
>   */
>  
> +#define VIRTIO_PCI_CONFIG_BAR_NUM 2
> +
> +static void *virtio_pci_cap_add(VirtIOPCIProxy *proxy,
> +                                const char *name,
> +                                MemoryRegion *reg,
> +                                const MemoryRegionOps *ops,
> +                                unsigned offset,
> +                                unsigned size,
> +                                unsigned cap_type,
> +                                unsigned cap_length)
> +{
> +    struct virtio_pci_cap *cap;
> +    int c;
> +    memory_region_init_io(reg, ops, proxy, name, size);
> +    memory_region_add_subregion(&proxy->config_bar, offset, reg);
> +
> +    if (!cap_length) {
> +        cap_length = sizeof *cap;
> +    }
> +    c = pci_add_capability(&proxy->pci_dev, PCI_CAP_ID_VNDR, 0, cap_length);
> +    assert(c > 0);
> +    cap = (void *)proxy->pci_dev.config + c;
> +    cap->cap_length = cap_length;
> +    cap->cfg_type = cap_type;
> +    cap->bar = VIRTIO_PCI_CONFIG_BAR_NUM;
> +    cap->offset = offset;
> +    cap->length = size;
> +
> +    return cap;
> +}
> +
> +static void virtio_pci_cap_del(VirtIOPCIProxy *proxy, MemoryRegion *reg)
> +{
> +    memory_region_del_subregion(&proxy->config_bar, reg);
> +    memory_region_destroy(reg);
> +}
> +
>  /* This is called by virtio-bus just after the device is plugged. */
>  static void virtio_pci_device_plugged(DeviceState *d)
>  {
>      VirtIOPCIProxy *proxy = VIRTIO_PCI(d);
>      VirtioBusState *bus = &proxy->bus;
> +    struct virtio_pci_notify_cap *notify_cap;
>      uint8_t *config;
>      uint32_t size;
>  
> @@ -986,6 +1291,59 @@ static void virtio_pci_device_plugged(DeviceState *d)
>      pci_register_bar(&proxy->pci_dev, 0, PCI_BASE_ADDRESS_SPACE_IO,
>                       &proxy->bar);
>  
> +    /* New config layout */
> +    /* Four capabilities: common, notify, isr, device cfg */
> +#define VIRTIO_PCI_CONFIG_BAR_SIZE 0x1000
> +#define VIRTIO_PCI_CONFIG_COMMON 0x0
> +#define VIRTIO_PCI_CONFIG_NOTIFY 0x100
> +#define VIRTIO_PCI_CONFIG_ISR 0x200
> +#define VIRTIO_PCI_CONFIG_DEVICE 0x300
> +
> +    /* Make sure we left enough space for all vqs */
> +    assert(VIRTIO_PCI_CONFIG_ISR - VIRTIO_PCI_CONFIG_NOTIFY >=
> +           VIRTIO_PCI_QUEUE_MAX * 2);
> +    assert(VIRTIO_PCI_CONFIG_BAR_SIZE - VIRTIO_PCI_CONFIG_DEVICE >=
> +           proxy->vdev->config_len);
> +
> +    memory_region_init(&proxy->config_bar, "virtio-pci-config",
> +                       VIRTIO_PCI_CONFIG_BAR_SIZE);
> +
> +    virtio_pci_cap_add(proxy, "virtio-pci-common", 
> +                       &proxy->config_common,
> +                       &virtio_pci_config_common_ops,
> +                       VIRTIO_PCI_CONFIG_COMMON,
> +                       sizeof(struct virtio_pci_common_cfg),
> +                       VIRTIO_PCI_CAP_COMMON_CFG,
> +                       0);
> +    notify_cap = virtio_pci_cap_add(proxy, "virtio-pci-notify", 
> +                                    &proxy->config_notify,
> +                                    &virtio_pci_config_notify_ops,
> +                                    VIRTIO_PCI_CONFIG_NOTIFY,
> +                                    VIRTIO_PCI_QUEUE_MAX * 2,
> +                                    VIRTIO_PCI_CAP_NOTIFY_CFG,
> +                                    sizeof *notify_cap);
> +    notify_cap->notify_off_multiplier = 0x2;
> +    virtio_pci_cap_add(proxy, "virtio-pci-isr", 
> +                       &proxy->config_isr,
> +                       &virtio_pci_config_isr_ops,
> +                       VIRTIO_PCI_CONFIG_ISR,
> +                       0x1,
> +                       VIRTIO_PCI_CAP_ISR_CFG,
> +                       0);
> +    if (proxy->vdev->config_len) {
> +        virtio_pci_cap_add(proxy, "virtio-pci-device", 
> +                           &proxy->config_device,
> +                           &virtio_pci_config_device_ops,
> +                           VIRTIO_PCI_CONFIG_DEVICE,
> +                           proxy->vdev->config_len,
> +                           VIRTIO_PCI_CAP_DEVICE_CFG,
> +                           0);
> +    }
> +
> +    pci_register_bar(&proxy->pci_dev, VIRTIO_PCI_CONFIG_BAR_NUM,
> +                     PCI_BASE_ADDRESS_SPACE_MEMORY,
> +                     &proxy->config_bar);
> +
>      if (!kvm_has_many_ioeventfds()) {
>          proxy->flags &= ~VIRTIO_PCI_FLAG_USE_IOEVENTFD;
>      }
> @@ -1012,6 +1370,13 @@ static void virtio_pci_exit(PCIDevice *pci_dev)
>      VirtIOPCIProxy *proxy = VIRTIO_PCI(pci_dev);
>      virtio_pci_stop_ioeventfd(proxy);
>      memory_region_destroy(&proxy->bar);
> +    virtio_pci_cap_del(proxy, &proxy->config_common);
> +    virtio_pci_cap_del(proxy, &proxy->config_notify);
> +    virtio_pci_cap_del(proxy, &proxy->config_isr);
> +    if (&proxy->vdev->config_len) {
> +        virtio_pci_cap_del(proxy, &proxy->config_device);
> +    }
> +    memory_region_destroy(&proxy->config_bar);
>      msix_uninit_exclusive_bar(pci_dev);
>  }
>  
> diff --git a/hw/virtio/virtio-pci.h b/hw/virtio/virtio-pci.h
> index 917bcc5..01005e7 100644
> --- a/hw/virtio/virtio-pci.h
> +++ b/hw/virtio/virtio-pci.h
> @@ -84,6 +84,11 @@ struct VirtIOPCIProxy {
>      PCIDevice pci_dev;
>      VirtIODevice *vdev;
>      MemoryRegion bar;
> +    MemoryRegion config_bar;
> +    MemoryRegion config_common;
> +    MemoryRegion config_isr;
> +    MemoryRegion config_notify;
> +    MemoryRegion config_device;
>      uint32_t flags;
>      uint32_t class_code;
>      uint32_t nvectors;
> @@ -93,6 +98,8 @@ struct VirtIOPCIProxy {
>      VirtIOIRQFD *vector_irqfd;
>      int nvqs_with_notifiers;
>      VirtioBusState bus;
> +    unsigned device_feature_select;
> +    unsigned guest_feature_select;
>  };
>  
>  
> @@ -203,4 +210,52 @@ struct VirtIORngPCI {
>  /* Virtio ABI version, if we increment this, we break the guest driver. */
>  #define VIRTIO_PCI_ABI_VERSION          0
>  
> +/* IDs for different capabilities.  Must all exist. */
> +/* Common configuration */
> +#define VIRTIO_PCI_CAP_COMMON_CFG	1
> +/* Notifications */
> +#define VIRTIO_PCI_CAP_NOTIFY_CFG	2
> +/* ISR access */
> +#define VIRTIO_PCI_CAP_ISR_CFG		3
> +/* Device specific confiuration */
> +#define VIRTIO_PCI_CAP_DEVICE_CFG	4
> +
> +/* This is the PCI capability header: */
> +struct virtio_pci_cap {
> +	uint8_t cap_vndr;	/* Generic PCI field: PCI_CAP_ID_VNDR */
> +	uint8_t cap_next;	/* Generic PCI field: next ptr. */
> +	uint8_t cap_length;	/* Generic PCI field: capability length. */
> +	uint8_t cfg_type;	/* One of the VIRTIO_PCI_CAP_*_CFG. */
> +	uint8_t bar;		/* Where to find it. */
> +	uint32_t offset;	/* Offset within bar. */
> +	uint32_t length;	/* Length. */
> +};
> +
> +struct virtio_pci_notify_cap {
> +	struct virtio_pci_cap cap;
> +	uint32_t notify_off_multiplier;	/* Multiplier for queue_notify_off. */
> +};
> +
> +/* Fields in VIRTIO_PCI_CAP_COMMON_CFG: */
> +struct virtio_pci_common_cfg {
> +	/* About the whole device. */
> +	uint32_t device_feature_select;	/* read-write */
> +	uint32_t device_feature;		/* read-only */
> +	uint32_t guest_feature_select;	/* read-write */
> +	uint32_t guest_feature;		/* read-only */
> +	uint16_t msix_config;		/* read-write */
> +	uint16_t num_queues;		/* read-only */
> +	uint8_t device_status;		/* read-write */
> +	uint8_t unused1;
> +
> +	/* About a specific virtqueue. */
> +	uint16_t queue_select;		/* read-write */
> +	uint16_t queue_size;		/* read-write, power of 2. */
> +	uint16_t queue_msix_vector;	/* read-write */
> +	uint16_t queue_enable;		/* read-write */
> +	uint16_t queue_notify_off;	/* read-only */
> +	uint64_t queue_desc;		/* read-write */
> +	uint64_t queue_avail;		/* read-write */
> +	uint64_t queue_used;		/* read-write */
> +};
>  #endif
> diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
> index 8176c14..e54d692 100644
> --- a/hw/virtio/virtio.c
> +++ b/hw/virtio/virtio.c
> @@ -672,6 +672,11 @@ int virtio_queue_get_num(VirtIODevice *vdev, int n)
>      return vdev->vq[n].vring.num;
>  }
>  
> +void virtio_queue_set_num(VirtIODevice *vdev, int n, int num)
> +{
> +    vdev->vq[n].vring.num = num;
> +}
> +
>  int virtio_queue_get_id(VirtQueue *vq)
>  {
>      VirtIODevice *vdev = vq->vdev;
> @@ -987,6 +992,21 @@ hwaddr virtio_queue_get_used_addr(VirtIODevice *vdev, int n)
>      return vdev->vq[n].vring.used;
>  }
>  
> +void virtio_queue_set_desc_addr(VirtIODevice *vdev, int n, hwaddr addr)
> +{
> +    vdev->vq[n].vring.desc = addr;
> +}
> +
> +void virtio_queue_set_avail_addr(VirtIODevice *vdev, int n, hwaddr addr)
> +{
> +    vdev->vq[n].vring.avail = addr;
> +}
> +
> +void virtio_queue_set_used_addr(VirtIODevice *vdev, int n, hwaddr addr)
> +{
> +    vdev->vq[n].vring.used = addr;
> +}
> +
>  hwaddr virtio_queue_get_ring_addr(VirtIODevice *vdev, int n)
>  {
>      return vdev->vq[n].vring.desc;
> diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h
> index a6c5c53..d056d58 100644
> --- a/include/hw/virtio/virtio.h
> +++ b/include/hw/virtio/virtio.h
> @@ -199,6 +199,7 @@ void virtio_config_writel(VirtIODevice *vdev, uint32_t addr, uint32_t data);
>  void virtio_queue_set_addr(VirtIODevice *vdev, int n, hwaddr addr);
>  hwaddr virtio_queue_get_addr(VirtIODevice *vdev, int n);
>  int virtio_queue_get_num(VirtIODevice *vdev, int n);
> +void virtio_queue_set_num(VirtIODevice *vdev, int n, int num);
>  void virtio_queue_notify(VirtIODevice *vdev, int n);
>  uint16_t virtio_queue_vector(VirtIODevice *vdev, int n);
>  void virtio_queue_set_vector(VirtIODevice *vdev, int n, uint16_t vector);
> @@ -226,6 +227,9 @@ typedef struct VirtIORNGConf VirtIORNGConf;
>  hwaddr virtio_queue_get_desc_addr(VirtIODevice *vdev, int n);
>  hwaddr virtio_queue_get_avail_addr(VirtIODevice *vdev, int n);
>  hwaddr virtio_queue_get_used_addr(VirtIODevice *vdev, int n);
> +void virtio_queue_set_desc_addr(VirtIODevice *vdev, int n, hwaddr addr);
> +void virtio_queue_set_avail_addr(VirtIODevice *vdev, int n, hwaddr addr);
> +void virtio_queue_set_used_addr(VirtIODevice *vdev, int n, hwaddr addr);
>  hwaddr virtio_queue_get_ring_addr(VirtIODevice *vdev, int n);
>  hwaddr virtio_queue_get_desc_size(VirtIODevice *vdev, int n);
>  hwaddr virtio_queue_get_avail_size(VirtIODevice *vdev, int n);
> -- 
> MST
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html


^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-05-28 16:03 [PATCH RFC] virtio-pci: new config layout: using memory BAR Michael S. Tsirkin
@ 2013-05-28 17:15 ` Anthony Liguori
  2013-05-28 17:15 ` Anthony Liguori
  1 sibling, 0 replies; 109+ messages in thread
From: Anthony Liguori @ 2013-05-28 17:15 UTC (permalink / raw)
  To: virtualization, Rusty Russell, kvm
  Cc: Peter Maydell, Paolo Bonzini, Michael S. Tsirkin,
	Stefan Hajnoczi, KONRAD Frederic

"Michael S. Tsirkin" <mst@redhat.com> writes:

> This adds support for new config, and is designed to work with
> the new layout code in Rusty's new layout branch.
>
> At the moment all fields are in the same memory BAR (bar 2).
> This will be used to test performance and compare
> memory, io and hypercall latency.
>
> Compiles but does not work yet.
> Migration isn't handled yet.
>
> It's not clear what do queue_enable/queue_disable
> fields do, not yet implemented.
>
> Gateway for config access with config cycles
> not yet implemented.
>
> Sending out for early review/flames.
>
> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> ---
>  hw/virtio/virtio-pci.c     | 393 +++++++++++++++++++++++++++++++++++++++++++--
>  hw/virtio/virtio-pci.h     |  55 +++++++
>  hw/virtio/virtio.c         |  20 +++
>  include/hw/virtio/virtio.h |   4 +
>  4 files changed, 458 insertions(+), 14 deletions(-)
>
> diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c
> index 752991a..f4db224 100644
> --- a/hw/virtio/virtio-pci.c
> +++ b/hw/virtio/virtio-pci.c
> @@ -259,6 +259,26 @@ static void virtio_pci_stop_ioeventfd(VirtIOPCIProxy *proxy)
>      proxy->ioeventfd_started = false;
>  }
>  
> +static void virtio_pci_set_status(VirtIOPCIProxy *proxy, uint8_t val)
> +{
> +    VirtIODevice *vdev = proxy->vdev;
> +
> +    if (!(val & VIRTIO_CONFIG_S_DRIVER_OK)) {
> +        virtio_pci_stop_ioeventfd(proxy);
> +    }
> +
> +    virtio_set_status(vdev, val & 0xFF);
> +
> +    if (val & VIRTIO_CONFIG_S_DRIVER_OK) {
> +        virtio_pci_start_ioeventfd(proxy);
> +    }
> +
> +    if (vdev->status == 0) {
> +        virtio_reset(proxy->vdev);
> +        msix_unuse_all_vectors(&proxy->pci_dev);
> +    }
> +}
> +
>  static void virtio_ioport_write(void *opaque, uint32_t addr, uint32_t val)
>  {
>      VirtIOPCIProxy *proxy = opaque;
> @@ -293,20 +313,7 @@ static void virtio_ioport_write(void *opaque, uint32_t addr, uint32_t val)
>          }
>          break;
>      case VIRTIO_PCI_STATUS:
> -        if (!(val & VIRTIO_CONFIG_S_DRIVER_OK)) {
> -            virtio_pci_stop_ioeventfd(proxy);
> -        }
> -
> -        virtio_set_status(vdev, val & 0xFF);
> -
> -        if (val & VIRTIO_CONFIG_S_DRIVER_OK) {
> -            virtio_pci_start_ioeventfd(proxy);
> -        }
> -
> -        if (vdev->status == 0) {
> -            virtio_reset(proxy->vdev);
> -            msix_unuse_all_vectors(&proxy->pci_dev);
> -        }
> +        virtio_pci_set_status(proxy, val);
>  
>          /* Linux before 2.6.34 sets the device as OK without enabling
>             the PCI device bus master bit. In this case we need to disable
> @@ -455,6 +462,226 @@ static void virtio_pci_config_write(void *opaque, hwaddr addr,
>      }
>  }
>  
> +static uint64_t virtio_pci_config_common_read(void *opaque, hwaddr addr,
> +                                              unsigned size)
> +{
> +    VirtIOPCIProxy *proxy = opaque;
> +    VirtIODevice *vdev = proxy->vdev;
> +
> +    uint64_t low = 0xffffffffull;
> +
> +    switch (addr) {
> +    case offsetof(struct virtio_pci_common_cfg, device_feature_select):
> +        return proxy->device_feature_select;

Oh dear no...  Please use defines like the rest of QEMU.

From a QEMU pov, take a look at:

https://github.com/aliguori/qemu/commit/587c35c1a3fe90f6af0f97927047ef4d3182a659

And:

https://github.com/aliguori/qemu/commit/01ba80a23cf2eb1e15056f82b44b94ec381565cb

Which lets virtio-pci be subclassable and then remaps the config space to
BAR2.

Haven't looked at the proposed new ring layout yet.

Regards,

Anthony Liguori

> +    case offsetof(struct virtio_pci_common_cfg, device_feature):
> +        /* TODO: 64-bit features */
> +	return proxy->device_feature_select ? 0 : proxy->host_features;
> +    case offsetof(struct virtio_pci_common_cfg, guest_feature_select):
> +        return proxy->guest_feature_select;
> +    case offsetof(struct virtio_pci_common_cfg, guest_feature):
> +        /* TODO: 64-bit features */
> +	return proxy->guest_feature_select ? 0 : vdev->guest_features;
> +    case offsetof(struct virtio_pci_common_cfg, msix_config):
> +	return vdev->config_vector;
> +    case offsetof(struct virtio_pci_common_cfg, num_queues):
> +        /* TODO: more exact limit? */
> +	return VIRTIO_PCI_QUEUE_MAX;
> +    case offsetof(struct virtio_pci_common_cfg, device_status):
> +        return vdev->status;
> +
> +	/* About a specific virtqueue. */
> +    case offsetof(struct virtio_pci_common_cfg, queue_select):
> +        return  vdev->queue_sel;
> +    case offsetof(struct virtio_pci_common_cfg, queue_size):
> +        return virtio_queue_get_num(vdev, vdev->queue_sel);
> +    case offsetof(struct virtio_pci_common_cfg, queue_msix_vector):
> +	return virtio_queue_vector(vdev, vdev->queue_sel);
> +    case offsetof(struct virtio_pci_common_cfg, queue_enable):
> +        /* TODO */
> +	return 0;
> +    case offsetof(struct virtio_pci_common_cfg, queue_notify_off):
> +        return vdev->queue_sel;
> +    case offsetof(struct virtio_pci_common_cfg, queue_desc):
> +        return virtio_queue_get_desc_addr(vdev, vdev->queue_sel) & low;
> +    case offsetof(struct virtio_pci_common_cfg, queue_desc) + 4:
> +        return virtio_queue_get_desc_addr(vdev, vdev->queue_sel) >> 32;
> +    case offsetof(struct virtio_pci_common_cfg, queue_avail):
> +        return virtio_queue_get_avail_addr(vdev, vdev->queue_sel) & low;
> +    case offsetof(struct virtio_pci_common_cfg, queue_avail) + 4:
> +        return virtio_queue_get_avail_addr(vdev, vdev->queue_sel) >> 32;
> +    case offsetof(struct virtio_pci_common_cfg, queue_used):
> +        return virtio_queue_get_used_addr(vdev, vdev->queue_sel) & low;
> +    case offsetof(struct virtio_pci_common_cfg, queue_used) + 4:
> +        return virtio_queue_get_used_addr(vdev, vdev->queue_sel) >> 32;
> +    default:
> +        return 0;
> +    }
> +}
> +
> +static void virtio_pci_config_common_write(void *opaque, hwaddr addr,
> +                                           uint64_t val, unsigned size)
> +{
> +    VirtIOPCIProxy *proxy = opaque;
> +    VirtIODevice *vdev = proxy->vdev;
> +
> +    uint64_t low = 0xffffffffull;
> +    uint64_t high = ~low;
> +
> +    switch (addr) {
> +    case offsetof(struct virtio_pci_common_cfg, device_feature_select):
> +        proxy->device_feature_select = val;
> +        break;
> +    case offsetof(struct virtio_pci_common_cfg, device_feature):
> +        break;
> +    case offsetof(struct virtio_pci_common_cfg, guest_feature_select):
> +        proxy->guest_feature_select = val;
> +        break;
> +    case offsetof(struct virtio_pci_common_cfg, guest_feature):
> +        /* TODO: 64-bit features */
> +	if (!proxy->guest_feature_select) {
> +            virtio_set_features(vdev, val);
> +        }
> +        break;
> +    case offsetof(struct virtio_pci_common_cfg, msix_config):
> +	vdev->config_vector = val;
> +        break;
> +    case offsetof(struct virtio_pci_common_cfg, num_queues):
> +        break;
> +    case offsetof(struct virtio_pci_common_cfg, device_status):
> +        virtio_pci_set_status(proxy, val);
> +        break;
> +	/* About a specific virtqueue. */
> +    case offsetof(struct virtio_pci_common_cfg, queue_select):
> +        assert(val < VIRTIO_PCI_QUEUE_MAX);
> +        vdev->queue_sel = val;
> +        break;
> +    case offsetof(struct virtio_pci_common_cfg, queue_size):
> +        assert(val && val < 0x8ffff && !(val & (val - 1)));
> +        virtio_queue_set_num(vdev, vdev->queue_sel, val);
> +        break;
> +    case offsetof(struct virtio_pci_common_cfg, queue_msix_vector):
> +	virtio_queue_set_vector(vdev, vdev->queue_sel, val);
> +        break;
> +    case offsetof(struct virtio_pci_common_cfg, queue_enable):
> +        /* TODO */
> +	break;
> +    case offsetof(struct virtio_pci_common_cfg, queue_notify_off):
> +        break;
> +    case offsetof(struct virtio_pci_common_cfg, queue_desc):
> +        val &= low;
> +        val |= virtio_queue_get_desc_addr(vdev, vdev->queue_sel) & high;
> +        virtio_queue_set_desc_addr(vdev, vdev->queue_sel, val);
> +        break;
> +    case offsetof(struct virtio_pci_common_cfg, queue_desc) + 4:
> +        val = val << 32;
> +        val |= virtio_queue_get_desc_addr(vdev, vdev->queue_sel) & low;
> +        virtio_queue_set_desc_addr(vdev, vdev->queue_sel, val);
> +        break;
> +    case offsetof(struct virtio_pci_common_cfg, queue_avail):
> +        val &= low;
> +        val |= virtio_queue_get_avail_addr(vdev, vdev->queue_sel) & high;
> +        virtio_queue_set_avail_addr(vdev, vdev->queue_sel, val);
> +        break;
> +    case offsetof(struct virtio_pci_common_cfg, queue_avail) + 4:
> +        val = val << 32;
> +        val |= virtio_queue_get_avail_addr(vdev, vdev->queue_sel) & low;
> +        virtio_queue_set_avail_addr(vdev, vdev->queue_sel, val);
> +        break;
> +    case offsetof(struct virtio_pci_common_cfg, queue_used):
> +        val &= low;
> +        val |= virtio_queue_get_used_addr(vdev, vdev->queue_sel) & high;
> +        virtio_queue_set_used_addr(vdev, vdev->queue_sel, val);
> +        break;
> +    case offsetof(struct virtio_pci_common_cfg, queue_used) + 4:
> +        val = val << 32;
> +        val |= virtio_queue_get_used_addr(vdev, vdev->queue_sel) & low;
> +        virtio_queue_set_used_addr(vdev, vdev->queue_sel, val);
> +        break;
> +    default:
> +        break;
> +    }
> +}
> +
> +static uint64_t virtio_pci_config_isr_read(void *opaque, hwaddr addr,
> +                                           unsigned size)
> +{
> +    VirtIOPCIProxy *proxy = opaque;
> +    VirtIODevice *vdev = proxy->vdev;
> +    uint8_t ret;
> +
> +    /* reading from the ISR also clears it. */
> +    ret = vdev->isr;
> +    vdev->isr = 0;
> +    qemu_set_irq(proxy->pci_dev.irq[0], 0);
> +    return ret;
> +}
> +
> +static void virtio_pci_config_isr_write(void *opaque, hwaddr addr,
> +                                           uint64_t val, unsigned size)
> +{
> +}
> +
> +static uint64_t virtio_pci_config_notify_read(void *opaque, hwaddr addr,
> +                                              unsigned size)
> +{
> +    return 0;
> +}
> +
> +static void virtio_pci_config_notify_write(void *opaque, hwaddr addr,
> +                                           uint64_t val, unsigned size)
> +{
> +    VirtIOPCIProxy *proxy = opaque;
> +    VirtIODevice *vdev = proxy->vdev;
> +    if (val < VIRTIO_PCI_QUEUE_MAX) {
> +        virtio_queue_notify(vdev, val);
> +    }
> +}
> +
> +static uint64_t virtio_pci_config_device_read(void *opaque, hwaddr addr,
> +                                              unsigned size)
> +{
> +    VirtIOPCIProxy *proxy = opaque;
> +    uint64_t val = 0;
> +    switch (size) {
> +    case 1:
> +        val = virtio_config_readb(proxy->vdev, addr);
> +        break;
> +    case 2:
> +        val = virtio_config_readw(proxy->vdev, addr);
> +        break;
> +    case 4:
> +        /* Most devices don't have 64 bit config fields.
> +         * Block is an exception: first 8 bytes include
> +         * a 64 bit capacity field.
> +         */
> +        if (VIRTIO_HOST_IS_BIG_ENDIAN &&
> +            proxy->vdev->device_id == VIRTIO_ID_BLOCK && addr < 8) {
> +            /* Swap first two words */
> +            addr ^= 0x4;
> +        }
> +        val = virtio_config_readl(proxy->vdev, addr);
> +        break;
> +    }
> +    return val;
> +}
> +
> +static void virtio_pci_config_device_write(void *opaque, hwaddr addr,
> +                                           uint64_t val, unsigned size)
> +{
> +    VirtIOPCIProxy *proxy = opaque;
> +    switch (size) {
> +    case 1:
> +        virtio_config_writeb(proxy->vdev, addr, val);
> +        break;
> +    case 2:
> +        virtio_config_writew(proxy->vdev, addr, val);
> +        break;
> +    case 4:
> +        virtio_config_writel(proxy->vdev, addr, val);
> +        break;
> +    }
> +}
> +
>  static const MemoryRegionOps virtio_pci_config_ops = {
>      .read = virtio_pci_config_read,
>      .write = virtio_pci_config_write,
> @@ -465,6 +692,46 @@ static const MemoryRegionOps virtio_pci_config_ops = {
>      .endianness = DEVICE_LITTLE_ENDIAN,
>  };
>  
> +static const MemoryRegionOps virtio_pci_config_common_ops = {
> +    .read = virtio_pci_config_common_read,
> +    .write = virtio_pci_config_common_write,
> +    .impl = {
> +        .min_access_size = 1,
> +        .max_access_size = 4,
> +    },
> +    .endianness = DEVICE_LITTLE_ENDIAN,
> +};
> +
> +static const MemoryRegionOps virtio_pci_config_isr_ops = {
> +    .read = virtio_pci_config_isr_read,
> +    .write = virtio_pci_config_isr_write,
> +    .impl = {
> +        .min_access_size = 1,
> +        .max_access_size = 1,
> +    },
> +    .endianness = DEVICE_LITTLE_ENDIAN,
> +};
> +
> +static const MemoryRegionOps virtio_pci_config_notify_ops = {
> +    .read = virtio_pci_config_notify_read,
> +    .write = virtio_pci_config_notify_write,
> +    .impl = {
> +        .min_access_size = 2,
> +        .max_access_size = 2,
> +    },
> +    .endianness = DEVICE_LITTLE_ENDIAN,
> +};
> +
> +static const MemoryRegionOps virtio_pci_config_device_ops = {
> +    .read = virtio_pci_config_device_read,
> +    .write = virtio_pci_config_device_write,
> +    .impl = {
> +        .min_access_size = 1,
> +        .max_access_size = 4,
> +    },
> +    .endianness = DEVICE_LITTLE_ENDIAN,
> +};
> +
>  static void virtio_write_config(PCIDevice *pci_dev, uint32_t address,
>                                  uint32_t val, int len)
>  {
> @@ -949,11 +1216,49 @@ static const TypeInfo virtio_9p_pci_info = {
>   * virtio-pci: This is the PCIDevice which has a virtio-pci-bus.
>   */
>  
> +#define VIRTIO_PCI_CONFIG_BAR_NUM 2
> +
> +static void *virtio_pci_cap_add(VirtIOPCIProxy *proxy,
> +                                const char *name,
> +                                MemoryRegion *reg,
> +                                const MemoryRegionOps *ops,
> +                                unsigned offset,
> +                                unsigned size,
> +                                unsigned cap_type,
> +                                unsigned cap_length)
> +{
> +    struct virtio_pci_cap *cap;
> +    int c;
> +    memory_region_init_io(reg, ops, proxy, name, size);
> +    memory_region_add_subregion(&proxy->config_bar, offset, reg);
> +
> +    if (!cap_length) {
> +        cap_length = sizeof *cap;
> +    }
> +    c = pci_add_capability(&proxy->pci_dev, PCI_CAP_ID_VNDR, 0, cap_length);
> +    assert(c > 0);
> +    cap = (void *)proxy->pci_dev.config + c;
> +    cap->cap_length = cap_length;
> +    cap->cfg_type = cap_type;
> +    cap->bar = VIRTIO_PCI_CONFIG_BAR_NUM;
> +    cap->offset = offset;
> +    cap->length = size;
> +
> +    return cap;
> +}
> +
> +static void virtio_pci_cap_del(VirtIOPCIProxy *proxy, MemoryRegion *reg)
> +{
> +    memory_region_del_subregion(&proxy->config_bar, reg);
> +    memory_region_destroy(reg);
> +}
> +
>  /* This is called by virtio-bus just after the device is plugged. */
>  static void virtio_pci_device_plugged(DeviceState *d)
>  {
>      VirtIOPCIProxy *proxy = VIRTIO_PCI(d);
>      VirtioBusState *bus = &proxy->bus;
> +    struct virtio_pci_notify_cap *notify_cap;
>      uint8_t *config;
>      uint32_t size;
>  
> @@ -986,6 +1291,59 @@ static void virtio_pci_device_plugged(DeviceState *d)
>      pci_register_bar(&proxy->pci_dev, 0, PCI_BASE_ADDRESS_SPACE_IO,
>                       &proxy->bar);
>  
> +    /* New config layout */
> +    /* Four capabilities: common, notify, isr, device cfg */
> +#define VIRTIO_PCI_CONFIG_BAR_SIZE 0x1000
> +#define VIRTIO_PCI_CONFIG_COMMON 0x0
> +#define VIRTIO_PCI_CONFIG_NOTIFY 0x100
> +#define VIRTIO_PCI_CONFIG_ISR 0x200
> +#define VIRTIO_PCI_CONFIG_DEVICE 0x300
> +
> +    /* Make sure we left enough space for all vqs */
> +    assert(VIRTIO_PCI_CONFIG_ISR - VIRTIO_PCI_CONFIG_NOTIFY >=
> +           VIRTIO_PCI_QUEUE_MAX * 2);
> +    assert(VIRTIO_PCI_CONFIG_BAR_SIZE - VIRTIO_PCI_CONFIG_DEVICE >=
> +           proxy->vdev->config_len);
> +
> +    memory_region_init(&proxy->config_bar, "virtio-pci-config",
> +                       VIRTIO_PCI_CONFIG_BAR_SIZE);
> +
> +    virtio_pci_cap_add(proxy, "virtio-pci-common", 
> +                       &proxy->config_common,
> +                       &virtio_pci_config_common_ops,
> +                       VIRTIO_PCI_CONFIG_COMMON,
> +                       sizeof(struct virtio_pci_common_cfg),
> +                       VIRTIO_PCI_CAP_COMMON_CFG,
> +                       0);
> +    notify_cap = virtio_pci_cap_add(proxy, "virtio-pci-notify", 
> +                                    &proxy->config_notify,
> +                                    &virtio_pci_config_notify_ops,
> +                                    VIRTIO_PCI_CONFIG_NOTIFY,
> +                                    VIRTIO_PCI_QUEUE_MAX * 2,
> +                                    VIRTIO_PCI_CAP_NOTIFY_CFG,
> +                                    sizeof *notify_cap);
> +    notify_cap->notify_off_multiplier = 0x2;
> +    virtio_pci_cap_add(proxy, "virtio-pci-isr", 
> +                       &proxy->config_isr,
> +                       &virtio_pci_config_isr_ops,
> +                       VIRTIO_PCI_CONFIG_ISR,
> +                       0x1,
> +                       VIRTIO_PCI_CAP_ISR_CFG,
> +                       0);
> +    if (proxy->vdev->config_len) {
> +        virtio_pci_cap_add(proxy, "virtio-pci-device", 
> +                           &proxy->config_device,
> +                           &virtio_pci_config_device_ops,
> +                           VIRTIO_PCI_CONFIG_DEVICE,
> +                           proxy->vdev->config_len,
> +                           VIRTIO_PCI_CAP_DEVICE_CFG,
> +                           0);
> +    }
> +
> +    pci_register_bar(&proxy->pci_dev, VIRTIO_PCI_CONFIG_BAR_NUM,
> +                     PCI_BASE_ADDRESS_SPACE_MEMORY,
> +                     &proxy->config_bar);
> +
>      if (!kvm_has_many_ioeventfds()) {
>          proxy->flags &= ~VIRTIO_PCI_FLAG_USE_IOEVENTFD;
>      }
> @@ -1012,6 +1370,13 @@ static void virtio_pci_exit(PCIDevice *pci_dev)
>      VirtIOPCIProxy *proxy = VIRTIO_PCI(pci_dev);
>      virtio_pci_stop_ioeventfd(proxy);
>      memory_region_destroy(&proxy->bar);
> +    virtio_pci_cap_del(proxy, &proxy->config_common);
> +    virtio_pci_cap_del(proxy, &proxy->config_notify);
> +    virtio_pci_cap_del(proxy, &proxy->config_isr);
> +    if (&proxy->vdev->config_len) {
> +        virtio_pci_cap_del(proxy, &proxy->config_device);
> +    }
> +    memory_region_destroy(&proxy->config_bar);
>      msix_uninit_exclusive_bar(pci_dev);
>  }
>  
> diff --git a/hw/virtio/virtio-pci.h b/hw/virtio/virtio-pci.h
> index 917bcc5..01005e7 100644
> --- a/hw/virtio/virtio-pci.h
> +++ b/hw/virtio/virtio-pci.h
> @@ -84,6 +84,11 @@ struct VirtIOPCIProxy {
>      PCIDevice pci_dev;
>      VirtIODevice *vdev;
>      MemoryRegion bar;
> +    MemoryRegion config_bar;
> +    MemoryRegion config_common;
> +    MemoryRegion config_isr;
> +    MemoryRegion config_notify;
> +    MemoryRegion config_device;
>      uint32_t flags;
>      uint32_t class_code;
>      uint32_t nvectors;
> @@ -93,6 +98,8 @@ struct VirtIOPCIProxy {
>      VirtIOIRQFD *vector_irqfd;
>      int nvqs_with_notifiers;
>      VirtioBusState bus;
> +    unsigned device_feature_select;
> +    unsigned guest_feature_select;
>  };
>  
>  
> @@ -203,4 +210,52 @@ struct VirtIORngPCI {
>  /* Virtio ABI version, if we increment this, we break the guest driver. */
>  #define VIRTIO_PCI_ABI_VERSION          0
>  
> +/* IDs for different capabilities.  Must all exist. */
> +/* Common configuration */
> +#define VIRTIO_PCI_CAP_COMMON_CFG	1
> +/* Notifications */
> +#define VIRTIO_PCI_CAP_NOTIFY_CFG	2
> +/* ISR access */
> +#define VIRTIO_PCI_CAP_ISR_CFG		3
> +/* Device specific confiuration */
> +#define VIRTIO_PCI_CAP_DEVICE_CFG	4
> +
> +/* This is the PCI capability header: */
> +struct virtio_pci_cap {
> +	uint8_t cap_vndr;	/* Generic PCI field: PCI_CAP_ID_VNDR */
> +	uint8_t cap_next;	/* Generic PCI field: next ptr. */
> +	uint8_t cap_length;	/* Generic PCI field: capability length. */
> +	uint8_t cfg_type;	/* One of the VIRTIO_PCI_CAP_*_CFG. */
> +	uint8_t bar;		/* Where to find it. */
> +	uint32_t offset;	/* Offset within bar. */
> +	uint32_t length;	/* Length. */
> +};
> +
> +struct virtio_pci_notify_cap {
> +	struct virtio_pci_cap cap;
> +	uint32_t notify_off_multiplier;	/* Multiplier for queue_notify_off. */
> +};
> +
> +/* Fields in VIRTIO_PCI_CAP_COMMON_CFG: */
> +struct virtio_pci_common_cfg {
> +	/* About the whole device. */
> +	uint32_t device_feature_select;	/* read-write */
> +	uint32_t device_feature;		/* read-only */
> +	uint32_t guest_feature_select;	/* read-write */
> +	uint32_t guest_feature;		/* read-only */
> +	uint16_t msix_config;		/* read-write */
> +	uint16_t num_queues;		/* read-only */
> +	uint8_t device_status;		/* read-write */
> +	uint8_t unused1;
> +
> +	/* About a specific virtqueue. */
> +	uint16_t queue_select;		/* read-write */
> +	uint16_t queue_size;		/* read-write, power of 2. */
> +	uint16_t queue_msix_vector;	/* read-write */
> +	uint16_t queue_enable;		/* read-write */
> +	uint16_t queue_notify_off;	/* read-only */
> +	uint64_t queue_desc;		/* read-write */
> +	uint64_t queue_avail;		/* read-write */
> +	uint64_t queue_used;		/* read-write */
> +};
>  #endif
> diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
> index 8176c14..e54d692 100644
> --- a/hw/virtio/virtio.c
> +++ b/hw/virtio/virtio.c
> @@ -672,6 +672,11 @@ int virtio_queue_get_num(VirtIODevice *vdev, int n)
>      return vdev->vq[n].vring.num;
>  }
>  
> +void virtio_queue_set_num(VirtIODevice *vdev, int n, int num)
> +{
> +    vdev->vq[n].vring.num = num;
> +}
> +
>  int virtio_queue_get_id(VirtQueue *vq)
>  {
>      VirtIODevice *vdev = vq->vdev;
> @@ -987,6 +992,21 @@ hwaddr virtio_queue_get_used_addr(VirtIODevice *vdev, int n)
>      return vdev->vq[n].vring.used;
>  }
>  
> +void virtio_queue_set_desc_addr(VirtIODevice *vdev, int n, hwaddr addr)
> +{
> +    vdev->vq[n].vring.desc = addr;
> +}
> +
> +void virtio_queue_set_avail_addr(VirtIODevice *vdev, int n, hwaddr addr)
> +{
> +    vdev->vq[n].vring.avail = addr;
> +}
> +
> +void virtio_queue_set_used_addr(VirtIODevice *vdev, int n, hwaddr addr)
> +{
> +    vdev->vq[n].vring.used = addr;
> +}
> +
>  hwaddr virtio_queue_get_ring_addr(VirtIODevice *vdev, int n)
>  {
>      return vdev->vq[n].vring.desc;
> diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h
> index a6c5c53..d056d58 100644
> --- a/include/hw/virtio/virtio.h
> +++ b/include/hw/virtio/virtio.h
> @@ -199,6 +199,7 @@ void virtio_config_writel(VirtIODevice *vdev, uint32_t addr, uint32_t data);
>  void virtio_queue_set_addr(VirtIODevice *vdev, int n, hwaddr addr);
>  hwaddr virtio_queue_get_addr(VirtIODevice *vdev, int n);
>  int virtio_queue_get_num(VirtIODevice *vdev, int n);
> +void virtio_queue_set_num(VirtIODevice *vdev, int n, int num);
>  void virtio_queue_notify(VirtIODevice *vdev, int n);
>  uint16_t virtio_queue_vector(VirtIODevice *vdev, int n);
>  void virtio_queue_set_vector(VirtIODevice *vdev, int n, uint16_t vector);
> @@ -226,6 +227,9 @@ typedef struct VirtIORNGConf VirtIORNGConf;
>  hwaddr virtio_queue_get_desc_addr(VirtIODevice *vdev, int n);
>  hwaddr virtio_queue_get_avail_addr(VirtIODevice *vdev, int n);
>  hwaddr virtio_queue_get_used_addr(VirtIODevice *vdev, int n);
> +void virtio_queue_set_desc_addr(VirtIODevice *vdev, int n, hwaddr addr);
> +void virtio_queue_set_avail_addr(VirtIODevice *vdev, int n, hwaddr addr);
> +void virtio_queue_set_used_addr(VirtIODevice *vdev, int n, hwaddr addr);
>  hwaddr virtio_queue_get_ring_addr(VirtIODevice *vdev, int n);
>  hwaddr virtio_queue_get_desc_size(VirtIODevice *vdev, int n);
>  hwaddr virtio_queue_get_avail_size(VirtIODevice *vdev, int n);
> -- 
> MST
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-05-28 17:15 ` Anthony Liguori
@ 2013-05-28 17:32   ` Michael S. Tsirkin
  2013-05-28 17:43     ` Paolo Bonzini
  2013-05-28 18:53     ` Anthony Liguori
  2013-05-29  4:31     ` Rusty Russell
  1 sibling, 2 replies; 109+ messages in thread
From: Michael S. Tsirkin @ 2013-05-28 17:32 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: Peter Maydell, kvm, virtualization, Stefan Hajnoczi,
	Paolo Bonzini, KONRAD Frederic

On Tue, May 28, 2013 at 12:15:16PM -0500, Anthony Liguori wrote:
> > @@ -455,6 +462,226 @@ static void virtio_pci_config_write(void *opaque, hwaddr addr,
> >      }
> >  }
> >  
> > +static uint64_t virtio_pci_config_common_read(void *opaque, hwaddr addr,
> > +                                              unsigned size)
> > +{
> > +    VirtIOPCIProxy *proxy = opaque;
> > +    VirtIODevice *vdev = proxy->vdev;
> > +
> > +    uint64_t low = 0xffffffffull;
> > +
> > +    switch (addr) {
> > +    case offsetof(struct virtio_pci_common_cfg, device_feature_select):
> > +        return proxy->device_feature_select;
> 
> Oh dear no...  Please use defines like the rest of QEMU.

Any good reason not to use offsetof?
I see about 138 examples in qemu.

Anyway, that's the way Rusty wrote it in the kernel header -
I started with defines.
If you convince Rusty to switch I can switch too,
but I prefer reusing same structures as linux even if
for now I've given up on reusing same headers.


> >From a QEMU pov, take a look at:
> 
> https://github.com/aliguori/qemu/commit/587c35c1a3fe90f6af0f97927047ef4d3182a659
> 
> And:
> 
> https://github.com/aliguori/qemu/commit/01ba80a23cf2eb1e15056f82b44b94ec381565cb
> 
> Which lets virtio-pci be subclassable and then remaps the config space to
> BAR2.


Interesting. Have the spec anywhere?
You are saying this is going to conflict because
of BAR2 usage, yes?

So let's only do this virtio-fb only for new layout, so we don't need
to maintain compatibility. In particular, we are working
on making memory BAR access fast for virtio devices
in a generic way. At the moment they are measureably slower
than PIO on x86.


> Haven't looked at the proposed new ring layout yet.
> 
> Regards,

No new ring layout. It's new config layout.


-- 
MST

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-05-28 17:32   ` Michael S. Tsirkin
@ 2013-05-28 17:43     ` Paolo Bonzini
  2013-05-29  2:02         ` Laszlo Ersek
                         ` (2 more replies)
  2013-05-28 18:53     ` Anthony Liguori
  1 sibling, 3 replies; 109+ messages in thread
From: Paolo Bonzini @ 2013-05-28 17:43 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Peter Maydell, Anthony Liguori, kvm, virtualization,
	Stefan Hajnoczi, Laszlo Ersek, KONRAD Frederic

Il 28/05/2013 19:32, Michael S. Tsirkin ha scritto:
>>> > > +
>>> > > +    switch (addr) {
>>> > > +    case offsetof(struct virtio_pci_common_cfg, device_feature_select):
>>> > > +        return proxy->device_feature_select;
>> > 
>> > Oh dear no...  Please use defines like the rest of QEMU.
> Any good reason not to use offsetof?

I'm not sure it's portable to use it in case labels.  IIRC, the
definition ((int)&(((T *)0)->field)) is not a valid C integer constant
expression.  Laszlo?

> I see about 138 examples in qemu.

Almost all of them are about fields in the host, not the guest, and none
of them are in case statements.

Paolo

> 
> Anyway, that's the way Rusty wrote it in the kernel header -
> I started with defines.
> If you convince Rusty to switch I can switch too,
> but I prefer reusing same structures as linux even if
> for now I've given up on reusing same headers.
> 
> 

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-05-28 17:32   ` Michael S. Tsirkin
  2013-05-28 17:43     ` Paolo Bonzini
@ 2013-05-28 18:53     ` Anthony Liguori
  2013-05-28 19:27       ` Michael S. Tsirkin
  1 sibling, 1 reply; 109+ messages in thread
From: Anthony Liguori @ 2013-05-28 18:53 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Peter Maydell, kvm, virtualization, Stefan Hajnoczi,
	Paolo Bonzini, KONRAD Frederic

"Michael S. Tsirkin" <mst@redhat.com> writes:

> On Tue, May 28, 2013 at 12:15:16PM -0500, Anthony Liguori wrote:
>> > @@ -455,6 +462,226 @@ static void virtio_pci_config_write(void *opaque, hwaddr addr,
>> >      }
>> >  }
>> >  
>> > +static uint64_t virtio_pci_config_common_read(void *opaque, hwaddr addr,
>> > +                                              unsigned size)
>> > +{
>> > +    VirtIOPCIProxy *proxy = opaque;
>> > +    VirtIODevice *vdev = proxy->vdev;
>> > +
>> > +    uint64_t low = 0xffffffffull;
>> > +
>> > +    switch (addr) {
>> > +    case offsetof(struct virtio_pci_common_cfg, device_feature_select):
>> > +        return proxy->device_feature_select;
>> 
>> Oh dear no...  Please use defines like the rest of QEMU.
>
> Any good reason not to use offsetof?
> I see about 138 examples in qemu.

There are exactly zero:

$ find . -name "*.c" -exec grep -l "case offset" {} \;
$

> Anyway, that's the way Rusty wrote it in the kernel header -
> I started with defines.
> If you convince Rusty to switch I can switch too,

We have 300+ devices in QEMU that use #defines.  We're not using this
kind of thing just because you want to copy code from the kernel.

>> https://github.com/aliguori/qemu/commit/587c35c1a3fe90f6af0f97927047ef4d3182a659
>> 
>> And:
>> 
>> https://github.com/aliguori/qemu/commit/01ba80a23cf2eb1e15056f82b44b94ec381565cb
>> 
>> Which lets virtio-pci be subclassable and then remaps the config space to
>> BAR2.
>
>
> Interesting. Have the spec anywhere?

Not yet, but working on that.

> You are saying this is going to conflict because
> of BAR2 usage, yes?

No, this whole thing is flexible.  I had to use BAR2 because BAR0 has to
be the vram mapping.  It also had to be an MMIO bar.

The new layout might make it easier to implement a device like this.  I
shared it mainly because I wanted to show the subclassing idea vs. just
tacking an option onto the existing virtio-pci code in QEMU.

Regards,

Anthony Liguori

> So let's only do this virtio-fb only for new layout, so we don't need
> to maintain compatibility. In particular, we are working
> on making memory BAR access fast for virtio devices
> in a generic way. At the moment they are measureably slower
> than PIO on x86.
>
>
>> Haven't looked at the proposed new ring layout yet.
>> 
>> Regards,
>
> No new ring layout. It's new config layout.
>
>
> -- 
> MST
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-05-28 18:53     ` Anthony Liguori
@ 2013-05-28 19:27       ` Michael S. Tsirkin
  0 siblings, 0 replies; 109+ messages in thread
From: Michael S. Tsirkin @ 2013-05-28 19:27 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: Peter Maydell, kvm, virtualization, Stefan Hajnoczi,
	Paolo Bonzini, KONRAD Frederic

On Tue, May 28, 2013 at 01:53:35PM -0500, Anthony Liguori wrote:
> "Michael S. Tsirkin" <mst@redhat.com> writes:
> 
> > On Tue, May 28, 2013 at 12:15:16PM -0500, Anthony Liguori wrote:
> >> > @@ -455,6 +462,226 @@ static void virtio_pci_config_write(void *opaque, hwaddr addr,
> >> >      }
> >> >  }
> >> >  
> >> > +static uint64_t virtio_pci_config_common_read(void *opaque, hwaddr addr,
> >> > +                                              unsigned size)
> >> > +{
> >> > +    VirtIOPCIProxy *proxy = opaque;
> >> > +    VirtIODevice *vdev = proxy->vdev;
> >> > +
> >> > +    uint64_t low = 0xffffffffull;
> >> > +
> >> > +    switch (addr) {
> >> > +    case offsetof(struct virtio_pci_common_cfg, device_feature_select):
> >> > +        return proxy->device_feature_select;
> >> 
> >> Oh dear no...  Please use defines like the rest of QEMU.
> >
> > Any good reason not to use offsetof?
> > I see about 138 examples in qemu.
> 
> There are exactly zero:
> 
> $ find . -name "*.c" -exec grep -l "case offset" {} \;
> $
> 
> > Anyway, that's the way Rusty wrote it in the kernel header -
> > I started with defines.
> > If you convince Rusty to switch I can switch too,
> 
> We have 300+ devices in QEMU that use #defines.  We're not using this
> kind of thing just because you want to copy code from the kernel.

Rusty, let's switch to defines?

> >> https://github.com/aliguori/qemu/commit/587c35c1a3fe90f6af0f97927047ef4d3182a659
> >> 
> >> And:
> >> 
> >> https://github.com/aliguori/qemu/commit/01ba80a23cf2eb1e15056f82b44b94ec381565cb
> >> 
> >> Which lets virtio-pci be subclassable and then remaps the config space to
> >> BAR2.
> >
> >
> > Interesting. Have the spec anywhere?
> 
> Not yet, but working on that.
> 
> > You are saying this is going to conflict because
> > of BAR2 usage, yes?
> 
> No, this whole thing is flexible.  I had to use BAR2 because BAR0 has to
> be the vram mapping.  It also had to be an MMIO bar.
> 
> The new layout might make it easier to implement a device like this.

Absolutely, you can put thing anywhere you like.

>  I
> shared it mainly because I wanted to show the subclassing idea vs. just
> tacking an option onto the existing virtio-pci code in QEMU.
> 
> Regards,
> 
> Anthony Liguori

I don't think a subclass will work for the new layout. This should be
completely transparent to users, just have more devices
work with more flexibility.

> > So let's only do this virtio-fb only for new layout, so we don't need
> > to maintain compatibility. In particular, we are working
> > on making memory BAR access fast for virtio devices
> > in a generic way. At the moment they are measureably slower
> > than PIO on x86.
> >
> >
> >> Haven't looked at the proposed new ring layout yet.
> >> 
> >> Regards,
> >
> > No new ring layout. It's new config layout.
> >
> >
> > -- 
> > MST
> > --
> > To unsubscribe from this list: send the line "unsubscribe kvm" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-05-28 17:43     ` Paolo Bonzini
@ 2013-05-29  2:02         ` Laszlo Ersek
  2013-05-29  4:33       ` Rusty Russell
  2013-05-29  4:33       ` Rusty Russell
  2 siblings, 0 replies; 109+ messages in thread
From: Laszlo Ersek @ 2013-05-29  2:02 UTC (permalink / raw)
  To: Paolo Bonzini
  Cc: Peter Maydell, Anthony Liguori, kvm, Michael S. Tsirkin,
	virtualization, Stefan Hajnoczi, KONRAD Frederic

On 05/28/13 19:43, Paolo Bonzini wrote:
> Il 28/05/2013 19:32, Michael S. Tsirkin ha scritto:
>>>>>> +
>>>>>> +    switch (addr) {
>>>>>> +    case offsetof(struct virtio_pci_common_cfg, device_feature_select):
>>>>>> +        return proxy->device_feature_select;
>>>>
>>>> Oh dear no...  Please use defines like the rest of QEMU.
>> Any good reason not to use offsetof?
> 
> I'm not sure it's portable to use it in case labels.  IIRC, the
> definition ((int)&(((T *)0)->field)) is not a valid C integer constant
> expression.  Laszlo?

As long as we use the offsetof() that comes with the C implementation
(ie. we don't hack it up ourselves), we should be safe; ISO C99
elegantly defines offsetof() in "7.17 Common definitions <stddef.h>" p3
as

  The macros are [...]

    offsetof(type, member-designator)

  which expands to an integer constant expression that has type
  *size_t*, the value of which is the offset in bytes, to the structure
  member (designated by /member-designator/), from the beginning of its
  structure (designated by /type/). The type and member designator shall
  be such that given

    static type t;

  then the expression &(t.member-designator) evaluates to an address
  constant. (If the specified member is a bit-field, the behavior is
  undefined.)

("Address constant" is described in 6.6p9 but quoting even that here
doesn't seem necesary.)

>From 6.8.4.2p3, "The expression of each case label shall be an integer
constant expression [...]".

So,
(a) if we use the standard offsetof(),
(b) and you can also write, for example,

  static struct virtio_pci_common_cfg t;
  static void *p = &t.device_feature_select;

then the construct seems valid.

(Consistently with the above mention of UB if the specified member is a
bit-field: the address-of operator's constraints also forbid a bit-field
object as operand.)

Laszlo

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
@ 2013-05-29  2:02         ` Laszlo Ersek
  0 siblings, 0 replies; 109+ messages in thread
From: Laszlo Ersek @ 2013-05-29  2:02 UTC (permalink / raw)
  To: Paolo Bonzini
  Cc: Peter Maydell, Anthony Liguori, kvm, Michael S. Tsirkin,
	virtualization, Stefan Hajnoczi, KONRAD Frederic

On 05/28/13 19:43, Paolo Bonzini wrote:
> Il 28/05/2013 19:32, Michael S. Tsirkin ha scritto:
>>>>>> +
>>>>>> +    switch (addr) {
>>>>>> +    case offsetof(struct virtio_pci_common_cfg, device_feature_select):
>>>>>> +        return proxy->device_feature_select;
>>>>
>>>> Oh dear no...  Please use defines like the rest of QEMU.
>> Any good reason not to use offsetof?
> 
> I'm not sure it's portable to use it in case labels.  IIRC, the
> definition ((int)&(((T *)0)->field)) is not a valid C integer constant
> expression.  Laszlo?

As long as we use the offsetof() that comes with the C implementation
(ie. we don't hack it up ourselves), we should be safe; ISO C99
elegantly defines offsetof() in "7.17 Common definitions <stddef.h>" p3
as

  The macros are [...]

    offsetof(type, member-designator)

  which expands to an integer constant expression that has type
  *size_t*, the value of which is the offset in bytes, to the structure
  member (designated by /member-designator/), from the beginning of its
  structure (designated by /type/). The type and member designator shall
  be such that given

    static type t;

  then the expression &(t.member-designator) evaluates to an address
  constant. (If the specified member is a bit-field, the behavior is
  undefined.)

("Address constant" is described in 6.6p9 but quoting even that here
doesn't seem necesary.)

From 6.8.4.2p3, "The expression of each case label shall be an integer
constant expression [...]".

So,
(a) if we use the standard offsetof(),
(b) and you can also write, for example,

  static struct virtio_pci_common_cfg t;
  static void *p = &t.device_feature_select;

then the construct seems valid.

(Consistently with the above mention of UB if the specified member is a
bit-field: the address-of operator's constraints also forbid a bit-field
object as operand.)

Laszlo

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-05-28 17:15 ` Anthony Liguori
@ 2013-05-29  4:31     ` Rusty Russell
  2013-05-29  4:31     ` Rusty Russell
  1 sibling, 0 replies; 109+ messages in thread
From: Rusty Russell @ 2013-05-29  4:31 UTC (permalink / raw)
  To: Anthony Liguori, Michael S. Tsirkin, virtualization, kvm
  Cc: Peter Maydell, Paolo Bonzini, Michael S. Tsirkin,
	Stefan Hajnoczi, KONRAD Frederic

Anthony Liguori <aliguori@us.ibm.com> writes:
> "Michael S. Tsirkin" <mst@redhat.com> writes:
>> +    case offsetof(struct virtio_pci_common_cfg, device_feature_select):
>> +        return proxy->device_feature_select;
>
> Oh dear no...  Please use defines like the rest of QEMU.

It is pretty ugly.

Yet the structure definitions are descriptive, capturing layout, size
and endianness in natural a format readable by any C programmer.

So AFAICT the question is, do we put the required

#define VIRTIO_PCI_CFG_FEATURE_SEL \
         (offsetof(struct virtio_pci_common_cfg, device_feature_select))

etc. in the kernel headers or qemu?

> Haven't looked at the proposed new ring layout yet.

No change, but there's an open question on whether we should nail it to
little endian (or define the endian by the transport).

Of course, I can't rule out that the 1.0 standard *may* decide to frob
the ring layout somehow, but I'd think it would require a compelling
reason.  I suggest that's 2.0 material...

Cheers,
Rusty.

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
@ 2013-05-29  4:31     ` Rusty Russell
  0 siblings, 0 replies; 109+ messages in thread
From: Rusty Russell @ 2013-05-29  4:31 UTC (permalink / raw)
  To: Anthony Liguori, virtualization, kvm
  Cc: Peter Maydell, Paolo Bonzini, Michael S. Tsirkin,
	Stefan Hajnoczi, KONRAD Frederic

Anthony Liguori <aliguori@us.ibm.com> writes:
> "Michael S. Tsirkin" <mst@redhat.com> writes:
>> +    case offsetof(struct virtio_pci_common_cfg, device_feature_select):
>> +        return proxy->device_feature_select;
>
> Oh dear no...  Please use defines like the rest of QEMU.

It is pretty ugly.

Yet the structure definitions are descriptive, capturing layout, size
and endianness in natural a format readable by any C programmer.

So AFAICT the question is, do we put the required

#define VIRTIO_PCI_CFG_FEATURE_SEL \
         (offsetof(struct virtio_pci_common_cfg, device_feature_select))

etc. in the kernel headers or qemu?

> Haven't looked at the proposed new ring layout yet.

No change, but there's an open question on whether we should nail it to
little endian (or define the endian by the transport).

Of course, I can't rule out that the 1.0 standard *may* decide to frob
the ring layout somehow, but I'd think it would require a compelling
reason.  I suggest that's 2.0 material...

Cheers,
Rusty.

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-05-28 17:43     ` Paolo Bonzini
  2013-05-29  2:02         ` Laszlo Ersek
  2013-05-29  4:33       ` Rusty Russell
@ 2013-05-29  4:33       ` Rusty Russell
  2013-05-29  7:27         ` Paolo Bonzini
  2 siblings, 1 reply; 109+ messages in thread
From: Rusty Russell @ 2013-05-29  4:33 UTC (permalink / raw)
  To: Paolo Bonzini, Michael S. Tsirkin
  Cc: Anthony Liguori, virtualization, kvm, KONRAD Frederic,
	Peter Maydell, Stefan Hajnoczi, Laszlo Ersek

Paolo Bonzini <pbonzini@redhat.com> writes:
> Il 28/05/2013 19:32, Michael S. Tsirkin ha scritto:
>>>> > > +
>>>> > > +    switch (addr) {
>>>> > > +    case offsetof(struct virtio_pci_common_cfg, device_feature_select):
>>>> > > +        return proxy->device_feature_select;
>>> > 
>>> > Oh dear no...  Please use defines like the rest of QEMU.
>> Any good reason not to use offsetof?
>
> I'm not sure it's portable to use it in case labels.  IIRC, the
> definition ((int)&(((T *)0)->field)) is not a valid C integer constant
> expression.  Laszlo?

It's defined to yield an integer constant expression in the ISO standard
(and I think ANSI too, though that's not at hand):

    7.19, para 3:

        ...offsetof(type, member-designator)
        which expands to an integer constant expression that has type
        size_t, ...

The real question is whether compilers qemu cares about meet the
standard (there's some evidence that older compilers fail this).  If
not, we'll have to define them as raw offsets...

Cheers,
Rusty.

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-05-28 17:43     ` Paolo Bonzini
  2013-05-29  2:02         ` Laszlo Ersek
@ 2013-05-29  4:33       ` Rusty Russell
  2013-05-29  4:33       ` Rusty Russell
  2 siblings, 0 replies; 109+ messages in thread
From: Rusty Russell @ 2013-05-29  4:33 UTC (permalink / raw)
  To: Paolo Bonzini, Michael S. Tsirkin
  Cc: Peter Maydell, Anthony Liguori, kvm, virtualization,
	Stefan Hajnoczi, Laszlo Ersek, KONRAD Frederic

Paolo Bonzini <pbonzini@redhat.com> writes:
> Il 28/05/2013 19:32, Michael S. Tsirkin ha scritto:
>>>> > > +
>>>> > > +    switch (addr) {
>>>> > > +    case offsetof(struct virtio_pci_common_cfg, device_feature_select):
>>>> > > +        return proxy->device_feature_select;
>>> > 
>>> > Oh dear no...  Please use defines like the rest of QEMU.
>> Any good reason not to use offsetof?
>
> I'm not sure it's portable to use it in case labels.  IIRC, the
> definition ((int)&(((T *)0)->field)) is not a valid C integer constant
> expression.  Laszlo?

It's defined to yield an integer constant expression in the ISO standard
(and I think ANSI too, though that's not at hand):

    7.19, para 3:

        ...offsetof(type, member-designator)
        which expands to an integer constant expression that has type
        size_t, ...

The real question is whether compilers qemu cares about meet the
standard (there's some evidence that older compilers fail this).  If
not, we'll have to define them as raw offsets...

Cheers,
Rusty.

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-05-29  4:33       ` Rusty Russell
@ 2013-05-29  7:27         ` Paolo Bonzini
  2013-05-29  8:05           ` Michael S. Tsirkin
  2013-05-29 10:07           ` Laszlo Ersek
  0 siblings, 2 replies; 109+ messages in thread
From: Paolo Bonzini @ 2013-05-29  7:27 UTC (permalink / raw)
  To: Rusty Russell
  Cc: Peter Maydell, Anthony Liguori, kvm, Michael S. Tsirkin,
	virtualization, Stefan Hajnoczi, Laszlo Ersek, KONRAD Frederic

Il 29/05/2013 06:33, Rusty Russell ha scritto:
> Paolo Bonzini <pbonzini@redhat.com> writes:
>> Il 28/05/2013 19:32, Michael S. Tsirkin ha scritto:
>>>>>>> +
>>>>>>> +    switch (addr) {
>>>>>>> +    case offsetof(struct virtio_pci_common_cfg, device_feature_select):
>>>>>>> +        return proxy->device_feature_select;
>>>>>
>>>>> Oh dear no...  Please use defines like the rest of QEMU.
>>> Any good reason not to use offsetof?
>>
>> I'm not sure it's portable to use it in case labels.  IIRC, the
>> definition ((int)&(((T *)0)->field)) is not a valid C integer constant
>> expression.  Laszlo?
> 
> It's defined to yield an integer constant expression in the ISO standard
> (and I think ANSI too, though that's not at hand):

It's not in C89.  The oldest compiler QEMU cares about is GCC 4.2.  I
don't know if it has a builtin offsetof, probably it does.  But I'm not
sure about other users of virtio headers.

Paolo

> 
>     7.19, para 3:
> 
>         ...offsetof(type, member-designator)
>         which expands to an integer constant expression that has type
>         size_t, ...
> 
> The real question is whether compilers qemu cares about meet the
> standard (there's some evidence that older compilers fail this).  If
> not, we'll have to define them as raw offsets...
> 
> Cheers,
> Rusty.
> 

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-05-29  7:27         ` Paolo Bonzini
@ 2013-05-29  8:05           ` Michael S. Tsirkin
  2013-05-29 10:07           ` Laszlo Ersek
  1 sibling, 0 replies; 109+ messages in thread
From: Michael S. Tsirkin @ 2013-05-29  8:05 UTC (permalink / raw)
  To: Paolo Bonzini
  Cc: Peter Maydell, Anthony Liguori, kvm, virtualization,
	Stefan Hajnoczi, Laszlo Ersek, KONRAD Frederic

On Wed, May 29, 2013 at 09:27:02AM +0200, Paolo Bonzini wrote:
> Il 29/05/2013 06:33, Rusty Russell ha scritto:
> > Paolo Bonzini <pbonzini@redhat.com> writes:
> >> Il 28/05/2013 19:32, Michael S. Tsirkin ha scritto:
> >>>>>>> +
> >>>>>>> +    switch (addr) {
> >>>>>>> +    case offsetof(struct virtio_pci_common_cfg, device_feature_select):
> >>>>>>> +        return proxy->device_feature_select;
> >>>>>
> >>>>> Oh dear no...  Please use defines like the rest of QEMU.
> >>> Any good reason not to use offsetof?
> >>
> >> I'm not sure it's portable to use it in case labels.  IIRC, the
> >> definition ((int)&(((T *)0)->field)) is not a valid C integer constant
> >> expression.  Laszlo?
> > 
> > It's defined to yield an integer constant expression in the ISO standard
> > (and I think ANSI too, though that's not at hand):
> 
> It's not in C89.  The oldest compiler QEMU cares about is GCC 4.2.  I
> don't know if it has a builtin offsetof, probably it does.

Yes, I think __builtin_offsetof was added in 4.0:
http://gcc.gnu.org/onlinedocs/gcc-4.0.0/gcc/Offsetof.html


> But I'm not sure about other users of virtio headers.
> 
> Paolo
> > 
> >     7.19, para 3:
> > 
> >         ...offsetof(type, member-designator)
> >         which expands to an integer constant expression that has type
> >         size_t, ...
> > 
> > The real question is whether compilers qemu cares about meet the
> > standard (there's some evidence that older compilers fail this).  If
> > not, we'll have to define them as raw offsets...

So I think the question is whether we care about GCC 3.  We used to when
mingw in debian stable was GCC 3, but not anymore ...

> > 
> > Cheers,
> > Rusty.
> > 

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-05-29  4:31     ` Rusty Russell
  (?)
@ 2013-05-29  8:24     ` Michael S. Tsirkin
  2013-05-29  8:52       ` Paolo Bonzini
  2013-05-29  9:00       ` Peter Maydell
  -1 siblings, 2 replies; 109+ messages in thread
From: Michael S. Tsirkin @ 2013-05-29  8:24 UTC (permalink / raw)
  To: Rusty Russell
  Cc: Peter Maydell, Anthony Liguori, kvm, virtualization,
	Stefan Hajnoczi, Paolo Bonzini, KONRAD Frederic

On Wed, May 29, 2013 at 02:01:18PM +0930, Rusty Russell wrote:
> Anthony Liguori <aliguori@us.ibm.com> writes:
> > "Michael S. Tsirkin" <mst@redhat.com> writes:
> >> +    case offsetof(struct virtio_pci_common_cfg, device_feature_select):
> >> +        return proxy->device_feature_select;
> >
> > Oh dear no...  Please use defines like the rest of QEMU.
> 
> It is pretty ugly.
> 
> Yet the structure definitions are descriptive, capturing layout, size
> and endianness in natural a format readable by any C programmer.
> 
> So AFAICT the question is, do we put the required
> 
> #define VIRTIO_PCI_CFG_FEATURE_SEL \
>          (offsetof(struct virtio_pci_common_cfg, device_feature_select))
> 
> etc. in the kernel headers or qemu?


I forgot that we need to validate size (different fields
have different sizes so memory core does not validate
for us).

And that is much cleaner if we use offsetof directly:
You can see that you are checking the correct
size matching the offset, at a glance.

--->

virtio: new layout: add offset validation

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>

---

diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c
index f4db224..fd09ea7 100644
--- a/hw/virtio/virtio-pci.c
+++ b/hw/virtio/virtio-pci.c
@@ -467,51 +467,70 @@ static uint64_t virtio_pci_config_common_read(void *opaque, hwaddr addr,
 {
     VirtIOPCIProxy *proxy = opaque;
     VirtIODevice *vdev = proxy->vdev;
+    struct virtio_pci_common_cfg cfg;
 
     uint64_t low = 0xffffffffull;
 
     switch (addr) {
     case offsetof(struct virtio_pci_common_cfg, device_feature_select):
+        assert(size == sizeof cfg.device_feature_select);
         return proxy->device_feature_select;
     case offsetof(struct virtio_pci_common_cfg, device_feature):
+        assert(size == sizeof cfg.device_feature);
         /* TODO: 64-bit features */
 	return proxy->device_feature_select ? 0 : proxy->host_features;
     case offsetof(struct virtio_pci_common_cfg, guest_feature_select):
+        assert(size == sizeof cfg.guest_feature_select);
         return proxy->guest_feature_select;
     case offsetof(struct virtio_pci_common_cfg, guest_feature):
+        assert(size == sizeof cfg.guest_feature);
         /* TODO: 64-bit features */
 	return proxy->guest_feature_select ? 0 : vdev->guest_features;
     case offsetof(struct virtio_pci_common_cfg, msix_config):
+        assert(size == sizeof cfg.msix_config);
 	return vdev->config_vector;
     case offsetof(struct virtio_pci_common_cfg, num_queues):
+        assert(size == sizeof cfg.num_queues);
         /* TODO: more exact limit? */
 	return VIRTIO_PCI_QUEUE_MAX;
     case offsetof(struct virtio_pci_common_cfg, device_status):
+        assert(size == sizeof cfg.device_status);
         return vdev->status;
 
 	/* About a specific virtqueue. */
     case offsetof(struct virtio_pci_common_cfg, queue_select):
+        assert(size == sizeof cfg.queue_select);
         return  vdev->queue_sel;
     case offsetof(struct virtio_pci_common_cfg, queue_size):
+        assert(size == sizeof cfg.queue_size);
         return virtio_queue_get_num(vdev, vdev->queue_sel);
     case offsetof(struct virtio_pci_common_cfg, queue_msix_vector):
+        assert(size == sizeof cfg.queue_msix_vector);
 	return virtio_queue_vector(vdev, vdev->queue_sel);
     case offsetof(struct virtio_pci_common_cfg, queue_enable):
+        assert(size == sizeof cfg.queue_enable);
         /* TODO */
 	return 0;
     case offsetof(struct virtio_pci_common_cfg, queue_notify_off):
+        assert(size == sizeof cfg.queue_notify_off);
         return vdev->queue_sel;
     case offsetof(struct virtio_pci_common_cfg, queue_desc):
+        assert(size == 4);
         return virtio_queue_get_desc_addr(vdev, vdev->queue_sel) & low;
     case offsetof(struct virtio_pci_common_cfg, queue_desc) + 4:
+        assert(size == 4);
         return virtio_queue_get_desc_addr(vdev, vdev->queue_sel) >> 32;
     case offsetof(struct virtio_pci_common_cfg, queue_avail):
+        assert(size == 4);
         return virtio_queue_get_avail_addr(vdev, vdev->queue_sel) & low;
     case offsetof(struct virtio_pci_common_cfg, queue_avail) + 4:
+        assert(size == 4);
         return virtio_queue_get_avail_addr(vdev, vdev->queue_sel) >> 32;
     case offsetof(struct virtio_pci_common_cfg, queue_used):
+        assert(size == 4);
         return virtio_queue_get_used_addr(vdev, vdev->queue_sel) & low;
     case offsetof(struct virtio_pci_common_cfg, queue_used) + 4:
+        assert(size == 4);
         return virtio_queue_get_used_addr(vdev, vdev->queue_sel) >> 32;
     default:
         return 0;
@@ -523,76 +542,95 @@ static void virtio_pci_config_common_write(void *opaque, hwaddr addr,
 {
     VirtIOPCIProxy *proxy = opaque;
     VirtIODevice *vdev = proxy->vdev;
+    struct virtio_pci_common_cfg cfg;
 
     uint64_t low = 0xffffffffull;
     uint64_t high = ~low;
 
     switch (addr) {
     case offsetof(struct virtio_pci_common_cfg, device_feature_select):
+        assert(size == sizeof cfg.device_feature_select);
         proxy->device_feature_select = val;
         break;
     case offsetof(struct virtio_pci_common_cfg, device_feature):
+        assert(size == sizeof cfg.device_feature);
         break;
     case offsetof(struct virtio_pci_common_cfg, guest_feature_select):
+        assert(size == sizeof cfg.guest_feature_select);
         proxy->guest_feature_select = val;
         break;
     case offsetof(struct virtio_pci_common_cfg, guest_feature):
+        assert(size == sizeof cfg.guest_feature);
         /* TODO: 64-bit features */
 	if (!proxy->guest_feature_select) {
             virtio_set_features(vdev, val);
         }
         break;
     case offsetof(struct virtio_pci_common_cfg, msix_config):
+        assert(size == sizeof cfg.msix_config);
 	vdev->config_vector = val;
         break;
     case offsetof(struct virtio_pci_common_cfg, num_queues):
+        assert(size == sizeof cfg.num_queues);
         break;
     case offsetof(struct virtio_pci_common_cfg, device_status):
+        assert(size == sizeof cfg.device_status);
         virtio_pci_set_status(proxy, val);
         break;
 	/* About a specific virtqueue. */
     case offsetof(struct virtio_pci_common_cfg, queue_select):
+        assert(size == sizeof cfg.queue_select);
         assert(val < VIRTIO_PCI_QUEUE_MAX);
         vdev->queue_sel = val;
         break;
     case offsetof(struct virtio_pci_common_cfg, queue_size):
+        assert(size == sizeof cfg.queue_size);
         assert(val && val < 0x8ffff && !(val & (val - 1)));
         virtio_queue_set_num(vdev, vdev->queue_sel, val);
         break;
     case offsetof(struct virtio_pci_common_cfg, queue_msix_vector):
+        assert(size == sizeof cfg.queue_msix_vector);
 	virtio_queue_set_vector(vdev, vdev->queue_sel, val);
         break;
     case offsetof(struct virtio_pci_common_cfg, queue_enable):
+        assert(size == sizeof cfg.queue_enable);
         /* TODO */
 	break;
     case offsetof(struct virtio_pci_common_cfg, queue_notify_off):
+        assert(size == sizeof cfg.queue_notify_off);
         break;
     case offsetof(struct virtio_pci_common_cfg, queue_desc):
+        assert(size == 4);
         val &= low;
         val |= virtio_queue_get_desc_addr(vdev, vdev->queue_sel) & high;
         virtio_queue_set_desc_addr(vdev, vdev->queue_sel, val);
         break;
     case offsetof(struct virtio_pci_common_cfg, queue_desc) + 4:
+        assert(size == 4);
         val = val << 32;
         val |= virtio_queue_get_desc_addr(vdev, vdev->queue_sel) & low;
         virtio_queue_set_desc_addr(vdev, vdev->queue_sel, val);
         break;
     case offsetof(struct virtio_pci_common_cfg, queue_avail):
+        assert(size == 4);
         val &= low;
         val |= virtio_queue_get_avail_addr(vdev, vdev->queue_sel) & high;
         virtio_queue_set_avail_addr(vdev, vdev->queue_sel, val);
         break;
     case offsetof(struct virtio_pci_common_cfg, queue_avail) + 4:
+        assert(size == 4);
         val = val << 32;
         val |= virtio_queue_get_avail_addr(vdev, vdev->queue_sel) & low;
         virtio_queue_set_avail_addr(vdev, vdev->queue_sel, val);
         break;
     case offsetof(struct virtio_pci_common_cfg, queue_used):
+        assert(size == 4);
         val &= low;
         val |= virtio_queue_get_used_addr(vdev, vdev->queue_sel) & high;
         virtio_queue_set_used_addr(vdev, vdev->queue_sel, val);
         break;
     case offsetof(struct virtio_pci_common_cfg, queue_used) + 4:
+        assert(size == 4);
         val = val << 32;
         val |= virtio_queue_get_used_addr(vdev, vdev->queue_sel) & low;
         virtio_queue_set_used_addr(vdev, vdev->queue_sel, val);

^ permalink raw reply related	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-05-29  8:24     ` Michael S. Tsirkin
@ 2013-05-29  8:52       ` Paolo Bonzini
  2013-05-29  9:00       ` Peter Maydell
  1 sibling, 0 replies; 109+ messages in thread
From: Paolo Bonzini @ 2013-05-29  8:52 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Peter Maydell, Anthony Liguori, kvm, virtualization,
	Stefan Hajnoczi, KONRAD Frederic

Il 29/05/2013 10:24, Michael S. Tsirkin ha scritto:
> On Wed, May 29, 2013 at 02:01:18PM +0930, Rusty Russell wrote:
>> Anthony Liguori <aliguori@us.ibm.com> writes:
>>> "Michael S. Tsirkin" <mst@redhat.com> writes:
>>>> +    case offsetof(struct virtio_pci_common_cfg, device_feature_select):
>>>> +        return proxy->device_feature_select;
>>>
>>> Oh dear no...  Please use defines like the rest of QEMU.
>>
>> It is pretty ugly.
>>
>> Yet the structure definitions are descriptive, capturing layout, size
>> and endianness in natural a format readable by any C programmer.
>>
>> So AFAICT the question is, do we put the required
>>
>> #define VIRTIO_PCI_CFG_FEATURE_SEL \
>>          (offsetof(struct virtio_pci_common_cfg, device_feature_select))
>>
>> etc. in the kernel headers or qemu?
> 
> 
> I forgot that we need to validate size (different fields
> have different sizes so memory core does not validate
> for us).
> 
> And that is much cleaner if we use offsetof directly:
> You can see that you are checking the correct
> size matching the offset, at a glance.

I wonder if we can use and possibly extend Peter Crosthwaite's "register
API" to support this better.

Paolo

> --->
> 
> virtio: new layout: add offset validation
> 
> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> 
> ---
> 
> diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c
> index f4db224..fd09ea7 100644
> --- a/hw/virtio/virtio-pci.c
> +++ b/hw/virtio/virtio-pci.c
> @@ -467,51 +467,70 @@ static uint64_t virtio_pci_config_common_read(void *opaque, hwaddr addr,
>  {
>      VirtIOPCIProxy *proxy = opaque;
>      VirtIODevice *vdev = proxy->vdev;
> +    struct virtio_pci_common_cfg cfg;
>  
>      uint64_t low = 0xffffffffull;
>  
>      switch (addr) {
>      case offsetof(struct virtio_pci_common_cfg, device_feature_select):
> +        assert(size == sizeof cfg.device_feature_select);
>          return proxy->device_feature_select;
>      case offsetof(struct virtio_pci_common_cfg, device_feature):
> +        assert(size == sizeof cfg.device_feature);
>          /* TODO: 64-bit features */
>  	return proxy->device_feature_select ? 0 : proxy->host_features;
>      case offsetof(struct virtio_pci_common_cfg, guest_feature_select):
> +        assert(size == sizeof cfg.guest_feature_select);
>          return proxy->guest_feature_select;
>      case offsetof(struct virtio_pci_common_cfg, guest_feature):
> +        assert(size == sizeof cfg.guest_feature);
>          /* TODO: 64-bit features */
>  	return proxy->guest_feature_select ? 0 : vdev->guest_features;
>      case offsetof(struct virtio_pci_common_cfg, msix_config):
> +        assert(size == sizeof cfg.msix_config);
>  	return vdev->config_vector;
>      case offsetof(struct virtio_pci_common_cfg, num_queues):
> +        assert(size == sizeof cfg.num_queues);
>          /* TODO: more exact limit? */
>  	return VIRTIO_PCI_QUEUE_MAX;
>      case offsetof(struct virtio_pci_common_cfg, device_status):
> +        assert(size == sizeof cfg.device_status);
>          return vdev->status;
>  
>  	/* About a specific virtqueue. */
>      case offsetof(struct virtio_pci_common_cfg, queue_select):
> +        assert(size == sizeof cfg.queue_select);
>          return  vdev->queue_sel;
>      case offsetof(struct virtio_pci_common_cfg, queue_size):
> +        assert(size == sizeof cfg.queue_size);
>          return virtio_queue_get_num(vdev, vdev->queue_sel);
>      case offsetof(struct virtio_pci_common_cfg, queue_msix_vector):
> +        assert(size == sizeof cfg.queue_msix_vector);
>  	return virtio_queue_vector(vdev, vdev->queue_sel);
>      case offsetof(struct virtio_pci_common_cfg, queue_enable):
> +        assert(size == sizeof cfg.queue_enable);
>          /* TODO */
>  	return 0;
>      case offsetof(struct virtio_pci_common_cfg, queue_notify_off):
> +        assert(size == sizeof cfg.queue_notify_off);
>          return vdev->queue_sel;
>      case offsetof(struct virtio_pci_common_cfg, queue_desc):
> +        assert(size == 4);
>          return virtio_queue_get_desc_addr(vdev, vdev->queue_sel) & low;
>      case offsetof(struct virtio_pci_common_cfg, queue_desc) + 4:
> +        assert(size == 4);
>          return virtio_queue_get_desc_addr(vdev, vdev->queue_sel) >> 32;
>      case offsetof(struct virtio_pci_common_cfg, queue_avail):
> +        assert(size == 4);
>          return virtio_queue_get_avail_addr(vdev, vdev->queue_sel) & low;
>      case offsetof(struct virtio_pci_common_cfg, queue_avail) + 4:
> +        assert(size == 4);
>          return virtio_queue_get_avail_addr(vdev, vdev->queue_sel) >> 32;
>      case offsetof(struct virtio_pci_common_cfg, queue_used):
> +        assert(size == 4);
>          return virtio_queue_get_used_addr(vdev, vdev->queue_sel) & low;
>      case offsetof(struct virtio_pci_common_cfg, queue_used) + 4:
> +        assert(size == 4);
>          return virtio_queue_get_used_addr(vdev, vdev->queue_sel) >> 32;
>      default:
>          return 0;
> @@ -523,76 +542,95 @@ static void virtio_pci_config_common_write(void *opaque, hwaddr addr,
>  {
>      VirtIOPCIProxy *proxy = opaque;
>      VirtIODevice *vdev = proxy->vdev;
> +    struct virtio_pci_common_cfg cfg;
>  
>      uint64_t low = 0xffffffffull;
>      uint64_t high = ~low;
>  
>      switch (addr) {
>      case offsetof(struct virtio_pci_common_cfg, device_feature_select):
> +        assert(size == sizeof cfg.device_feature_select);
>          proxy->device_feature_select = val;
>          break;
>      case offsetof(struct virtio_pci_common_cfg, device_feature):
> +        assert(size == sizeof cfg.device_feature);
>          break;
>      case offsetof(struct virtio_pci_common_cfg, guest_feature_select):
> +        assert(size == sizeof cfg.guest_feature_select);
>          proxy->guest_feature_select = val;
>          break;
>      case offsetof(struct virtio_pci_common_cfg, guest_feature):
> +        assert(size == sizeof cfg.guest_feature);
>          /* TODO: 64-bit features */
>  	if (!proxy->guest_feature_select) {
>              virtio_set_features(vdev, val);
>          }
>          break;
>      case offsetof(struct virtio_pci_common_cfg, msix_config):
> +        assert(size == sizeof cfg.msix_config);
>  	vdev->config_vector = val;
>          break;
>      case offsetof(struct virtio_pci_common_cfg, num_queues):
> +        assert(size == sizeof cfg.num_queues);
>          break;
>      case offsetof(struct virtio_pci_common_cfg, device_status):
> +        assert(size == sizeof cfg.device_status);
>          virtio_pci_set_status(proxy, val);
>          break;
>  	/* About a specific virtqueue. */
>      case offsetof(struct virtio_pci_common_cfg, queue_select):
> +        assert(size == sizeof cfg.queue_select);
>          assert(val < VIRTIO_PCI_QUEUE_MAX);
>          vdev->queue_sel = val;
>          break;
>      case offsetof(struct virtio_pci_common_cfg, queue_size):
> +        assert(size == sizeof cfg.queue_size);
>          assert(val && val < 0x8ffff && !(val & (val - 1)));
>          virtio_queue_set_num(vdev, vdev->queue_sel, val);
>          break;
>      case offsetof(struct virtio_pci_common_cfg, queue_msix_vector):
> +        assert(size == sizeof cfg.queue_msix_vector);
>  	virtio_queue_set_vector(vdev, vdev->queue_sel, val);
>          break;
>      case offsetof(struct virtio_pci_common_cfg, queue_enable):
> +        assert(size == sizeof cfg.queue_enable);
>          /* TODO */
>  	break;
>      case offsetof(struct virtio_pci_common_cfg, queue_notify_off):
> +        assert(size == sizeof cfg.queue_notify_off);
>          break;
>      case offsetof(struct virtio_pci_common_cfg, queue_desc):
> +        assert(size == 4);
>          val &= low;
>          val |= virtio_queue_get_desc_addr(vdev, vdev->queue_sel) & high;
>          virtio_queue_set_desc_addr(vdev, vdev->queue_sel, val);
>          break;
>      case offsetof(struct virtio_pci_common_cfg, queue_desc) + 4:
> +        assert(size == 4);
>          val = val << 32;
>          val |= virtio_queue_get_desc_addr(vdev, vdev->queue_sel) & low;
>          virtio_queue_set_desc_addr(vdev, vdev->queue_sel, val);
>          break;
>      case offsetof(struct virtio_pci_common_cfg, queue_avail):
> +        assert(size == 4);
>          val &= low;
>          val |= virtio_queue_get_avail_addr(vdev, vdev->queue_sel) & high;
>          virtio_queue_set_avail_addr(vdev, vdev->queue_sel, val);
>          break;
>      case offsetof(struct virtio_pci_common_cfg, queue_avail) + 4:
> +        assert(size == 4);
>          val = val << 32;
>          val |= virtio_queue_get_avail_addr(vdev, vdev->queue_sel) & low;
>          virtio_queue_set_avail_addr(vdev, vdev->queue_sel, val);
>          break;
>      case offsetof(struct virtio_pci_common_cfg, queue_used):
> +        assert(size == 4);
>          val &= low;
>          val |= virtio_queue_get_used_addr(vdev, vdev->queue_sel) & high;
>          virtio_queue_set_used_addr(vdev, vdev->queue_sel, val);
>          break;
>      case offsetof(struct virtio_pci_common_cfg, queue_used) + 4:
> +        assert(size == 4);
>          val = val << 32;
>          val |= virtio_queue_get_used_addr(vdev, vdev->queue_sel) & low;
>          virtio_queue_set_used_addr(vdev, vdev->queue_sel, val);
> 

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-05-29  8:24     ` Michael S. Tsirkin
  2013-05-29  8:52       ` Paolo Bonzini
@ 2013-05-29  9:00       ` Peter Maydell
  2013-05-29 10:08         ` Michael S. Tsirkin
  1 sibling, 1 reply; 109+ messages in thread
From: Peter Maydell @ 2013-05-29  9:00 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Anthony Liguori, kvm, virtualization, Stefan Hajnoczi,
	Paolo Bonzini, KONRAD Frederic

On 29 May 2013 09:24, Michael S. Tsirkin <mst@redhat.com> wrote:
> diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c
> index f4db224..fd09ea7 100644
> --- a/hw/virtio/virtio-pci.c
> +++ b/hw/virtio/virtio-pci.c
> @@ -467,51 +467,70 @@ static uint64_t virtio_pci_config_common_read(void *opaque, hwaddr addr,
>  {
>      VirtIOPCIProxy *proxy = opaque;
>      VirtIODevice *vdev = proxy->vdev;
> +    struct virtio_pci_common_cfg cfg;
>
>      uint64_t low = 0xffffffffull;
>
>      switch (addr) {
>      case offsetof(struct virtio_pci_common_cfg, device_feature_select):
> +        assert(size == sizeof cfg.device_feature_select);
>          return proxy->device_feature_select;

Asserting is definitely the wrong thing here, since the
guest can trigger it.

If you really want to use offsetof like this you're
going to need to decorate the structs with QEMU_PACKED.

thanks
-- PMM

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-05-29  7:27         ` Paolo Bonzini
  2013-05-29  8:05           ` Michael S. Tsirkin
@ 2013-05-29 10:07           ` Laszlo Ersek
  1 sibling, 0 replies; 109+ messages in thread
From: Laszlo Ersek @ 2013-05-29 10:07 UTC (permalink / raw)
  To: Paolo Bonzini
  Cc: Peter Maydell, Anthony Liguori, kvm, Michael S. Tsirkin,
	virtualization, Stefan Hajnoczi, KONRAD Frederic

On 05/29/13 09:27, Paolo Bonzini wrote:
> Il 29/05/2013 06:33, Rusty Russell ha scritto:
>> Paolo Bonzini <pbonzini@redhat.com> writes:
>>> Il 28/05/2013 19:32, Michael S. Tsirkin ha scritto:
>>>>>>>> +
>>>>>>>> +    switch (addr) {
>>>>>>>> +    case offsetof(struct virtio_pci_common_cfg, device_feature_select):
>>>>>>>> +        return proxy->device_feature_select;
>>>>>>
>>>>>> Oh dear no...  Please use defines like the rest of QEMU.
>>>> Any good reason not to use offsetof?
>>>
>>> I'm not sure it's portable to use it in case labels.  IIRC, the
>>> definition ((int)&(((T *)0)->field)) is not a valid C integer constant
>>> expression.  Laszlo?
>>
>> It's defined to yield an integer constant expression in the ISO standard
>> (and I think ANSI too, though that's not at hand):
> 
> It's not in C89.

It is, see 7.1.6 Common definitions <stddef.h>.


> The oldest compiler QEMU cares about is GCC 4.2.  I
> don't know if it has a builtin offsetof, probably it does.

Talking about nothing else but the ISO C standard(s), it doesn't matter
how the C implementation deals with offsetof() internally. "C
implementation" in this sense includes both compiler and standard library.

If the standard library header (<stddef.h> or something internal
included by it) expands the offsetof() macro to replacement text that
does pointer black magic, magic that would be otherwise nonportable or
undefined, it is alright as long as the accompanying compiler guarantees
that the replacement text will work.

That is, offsetof() gives the C implementor leeway to implement it in
wherever "distribution" between compiler and standard library; the main
thing is that the C program use the one offsetof() provided by the C
implementation.

Of course in the FLOSS world OS distros might mix and match gcc and
glibc statistically randomly; normally some documentation should enable
the user to put his/her system into ISO C or even SUSv[1234] mode.

(

An interesting example for, say, "non-unified implementation" is
"getconf" vs. "c99". I'll pick SUSv3 for this example.

http://pubs.opengroup.org/onlinepubs/000095399/utilities/getconf.html
http://pubs.opengroup.org/onlinepubs/000095399/utilities/c99.html

In a nutshell one can interrogate getconf for CFLAGS, LDFLAGS and LIBS
so that c99 builds a program in ILP32_OFF32 / ILP32_OFFBIG / LP64_OFF64
/ LPBIG_OFFBIG mode ("programming environment").

On my x86_64 RHEL-6.4 laptop, "getconf" is part of glibc
(glibc-common-2.12-1.107.el6.x86_64), while c99 is part of gcc
(gcc-4.4.7-3.el6.x86_64).

Supposing I'd like to build a 32-bit program with 64-bit off_t:

  getconf _POSIX_V6_ILP32_OFFBIG
  1

  getconf -v POSIX_V6_ILP32_OFFBIG POSIX_V6_ILP32_OFFBIG_CFLAGS
  -m32 -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64

  getconf -v POSIX_V6_ILP32_OFFBIG POSIX_V6_ILP32_OFFBIG_LDFLAGS
  -m32

  getconf -v POSIX_V6_ILP32_OFFBIG POSIX_V6_ILP32_OFFBIG_LIBS
  [empty string]

However if I try to actually build a program using these flags:

  #define _XOPEN_SOURCE 600
  int main(void) { return 0; }

  c99 \
      -m32 -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 \
      -o mytest \
      -m32 \
      mytest.c

I get

  /usr/bin/ld: crt1.o: No such file: No such file or directory
  collect2: ld returned 1 exit status

unless I install "glibc-devel.i686".

This problem is rooted in "getconf" (a glibc utility) being unaware of
RHEL packaging choices: if the system can't guarantee the final c99
invocation to succeed, then the very first "getconf" invocation should
write "-1\n" or "undefined\n" to stdout.

(I'm aware that RHEL-6 is not certified UNIX 03; this is just an example
for problems caused by various parts of a standard's
(quasi-)implementation coming from separate projects. In that sense,
caring about the offsetof() internals may have merit.)

)

Laszlo

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-05-29  9:00       ` Peter Maydell
@ 2013-05-29 10:08         ` Michael S. Tsirkin
  2013-05-29 10:53           ` Peter Maydell
  0 siblings, 1 reply; 109+ messages in thread
From: Michael S. Tsirkin @ 2013-05-29 10:08 UTC (permalink / raw)
  To: Peter Maydell
  Cc: Anthony Liguori, kvm, virtualization, Stefan Hajnoczi,
	Paolo Bonzini, KONRAD Frederic

On Wed, May 29, 2013 at 10:00:33AM +0100, Peter Maydell wrote:
> On 29 May 2013 09:24, Michael S. Tsirkin <mst@redhat.com> wrote:
> > diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c
> > index f4db224..fd09ea7 100644
> > --- a/hw/virtio/virtio-pci.c
> > +++ b/hw/virtio/virtio-pci.c
> > @@ -467,51 +467,70 @@ static uint64_t virtio_pci_config_common_read(void *opaque, hwaddr addr,
> >  {
> >      VirtIOPCIProxy *proxy = opaque;
> >      VirtIODevice *vdev = proxy->vdev;
> > +    struct virtio_pci_common_cfg cfg;
> >
> >      uint64_t low = 0xffffffffull;
> >
> >      switch (addr) {
> >      case offsetof(struct virtio_pci_common_cfg, device_feature_select):
> > +        assert(size == sizeof cfg.device_feature_select);
> >          return proxy->device_feature_select;
> 
> Asserting is definitely the wrong thing here, since the
> guest can trigger it.

So?

It's a driver bug. It can reset or crash guest with the same effect,
and it likely will if we let it continue.
assert makes sure we don't let it escalate into some
hard to debug security problem.


> If you really want to use offsetof like this you're
> going to need to decorate the structs with QEMU_PACKED.
> 
> thanks
> -- PMM

Nope.
These structs are carefully designed not to have any padding.

And if there was a bug and there was some padding, we still
can't fix it with PACKED because this structure
is used to interact with the guest code which does not
have the packed attribute.

-- 
MST

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-05-29 10:08         ` Michael S. Tsirkin
@ 2013-05-29 10:53           ` Peter Maydell
  2013-05-29 12:16             ` Michael S. Tsirkin
  0 siblings, 1 reply; 109+ messages in thread
From: Peter Maydell @ 2013-05-29 10:53 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Anthony Liguori, kvm, virtualization, Stefan Hajnoczi,
	Paolo Bonzini, KONRAD Frederic

On 29 May 2013 11:08, Michael S. Tsirkin <mst@redhat.com> wrote:
> On Wed, May 29, 2013 at 10:00:33AM +0100, Peter Maydell wrote:
>> Asserting is definitely the wrong thing here, since the
>> guest can trigger it.
>
> So?

So "guest should not be able to crash QEMU" is a standard
rule: assert() is for QEMU bugs, not guest bugs.
Virtio isn't any different to any other device model.

> It's a driver bug. It can reset or crash guest with the same effect,
> and it likely will if we let it continue.

Letting a misbehaving guest crash itself is fine. Killing
QEMU isn't.

> assert makes sure we don't let it escalate into some
> hard to debug security problem.

If you want to make guest bugs easier to spot and debug
this is what qemu_log_mask(LOG_GUEST_ERROR,...) is for.

>> If you really want to use offsetof like this you're
>> going to need to decorate the structs with QEMU_PACKED.

> Nope.
> These structs are carefully designed not to have any padding.

...on every compiler and OS combination that QEMU builds for?

> And if there was a bug and there was some padding, we still
> can't fix it with PACKED because this structure
> is used to interact with the guest code which does not
> have the packed attribute.

The guest code has to use a set of structure offsets and
sizes which is fixed by the virtio ABI -- how it implements
this is up to the guest (and if it misimplements it that is
a guest bug and not our problem). Note also that the ABI
is not defined by a set of C source struct definitions
(except in as far as they are accompanied by a set of
restrictions on alignment, padding, etc that completely
determine the numerical alignments and offsets).
How QEMU implements the set of offsets and sizes specified
by the ABI is definitely our problem, and is exactly what
this discussion is about. The simplest and safest way to
get the offsets right on all platforms is just to use a set
of #defines, which is why that's what almost all of our
device models do. Where we do define a struct matching
guest memory layout, we tag it with QEMU_PACKED as our
best means of ensuring consistency on all hosts.

thanks
-- PMM

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-05-29 10:53           ` Peter Maydell
@ 2013-05-29 12:16             ` Michael S. Tsirkin
  2013-05-29 12:28               ` Paolo Bonzini
  0 siblings, 1 reply; 109+ messages in thread
From: Michael S. Tsirkin @ 2013-05-29 12:16 UTC (permalink / raw)
  To: Peter Maydell
  Cc: Anthony Liguori, kvm, virtualization, Stefan Hajnoczi,
	Paolo Bonzini, KONRAD Frederic

On Wed, May 29, 2013 at 11:53:17AM +0100, Peter Maydell wrote:
> On 29 May 2013 11:08, Michael S. Tsirkin <mst@redhat.com> wrote:
> > On Wed, May 29, 2013 at 10:00:33AM +0100, Peter Maydell wrote:
> >> Asserting is definitely the wrong thing here, since the
> >> guest can trigger it.
> >
> > So?
> 
> So "guest should not be able to crash QEMU" is a standard
> rule: assert() is for QEMU bugs, not guest bugs.

It's quite likely a QEMU bug -
guests normally do fixed-size accesses so it's hard
for them to access a field with a wrong size.

It is guest triggerable but this does not
contradict the fact that it never happens in practice.

> Virtio isn't any different to any other device model.
> 
> > It's a driver bug. It can reset or crash guest with the same effect,
> > and it likely will if we let it continue.
> 
> Letting a misbehaving guest crash itself is fine. Killing
> QEMU isn't.

*Why* isn't it?
It has the advantage of making sure the misbehaving system
is stopped before it does any damage.
Why keep QEMU running even though we know
there's a memory corruption somewhere?

> > assert makes sure we don't let it escalate into some
> > hard to debug security problem.
> 
> If you want to make guest bugs easier to spot and debug
> this is what qemu_log_mask(LOG_GUEST_ERROR,...) is for.

We really want something that would also stop guest
and stop device model as well - we don't know where the bug is.

> >> If you really want to use offsetof like this you're
> >> going to need to decorate the structs with QEMU_PACKED.
> 
> > Nope.
> > These structs are carefully designed not to have any padding.
> 
> ...on every compiler and OS combination that QEMU builds for?

Yes. All the way back to EGCS and before.
GCC has been like this for many many years.

> > And if there was a bug and there was some padding, we still
> > can't fix it with PACKED because this structure
> > is used to interact with the guest code which does not
> > have the packed attribute.
> 
> The guest code has to use a set of structure offsets and
> sizes which is fixed by the virtio ABI -- how it implements
> this is up to the guest (and if it misimplements it that is
> a guest bug and not our problem). Note also that the ABI
> is not defined by a set of C source struct definitions
> (except in as far as they are accompanied by a set of
> restrictions on alignment, padding, etc that completely
> determine the numerical alignments and offsets).

In practical terms, we should all talk and agree on what's
best for driver *and* QEMU, not have QEMU just
ignore driver and do it's own thing.

In practical terms, virtio in QEMU should use
exactly same code to define layout as virtio in guest.
Preferably same header file, we'll get there too,
once we comple the discussion over the bikeshed color.

Deviating from driver in random ways is an endless source
of hard to debug issues, and it's a very practical
consideration because virtio spec is constantly
being extended (unlike hardware which is mostly fixed).


> How QEMU implements the set of offsets and sizes specified
> by the ABI is definitely our problem, and is exactly what
> this discussion is about. The simplest and safest way to
> get the offsets right on all platforms is just to use a set
> of #defines, which is why that's what almost all of our
> device models do.

Maybe you don't feel safe when you see offsetof.
I review a struct and see fields are aligned
properly at a glance and I feel safe.

But I don't feel safe when I see headers in guest and qemu have
different code.

> Where we do define a struct matching
> guest memory layout, we tag it with QEMU_PACKED as our
> best means of ensuring consistency on all hosts.
> 
> thanks
> -- PMM

Further, packed structures produce terrible
code in GCC, in all versions that QEMU cares about :).

-- 
MST

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-05-29 12:16             ` Michael S. Tsirkin
@ 2013-05-29 12:28               ` Paolo Bonzini
  2013-05-29 12:37                 ` Michael S. Tsirkin
  0 siblings, 1 reply; 109+ messages in thread
From: Paolo Bonzini @ 2013-05-29 12:28 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Peter Maydell, Anthony Liguori, kvm, virtualization,
	Stefan Hajnoczi, KONRAD Frederic

Il 29/05/2013 14:16, Michael S. Tsirkin ha scritto:
>>>> > >> If you really want to use offsetof like this you're
>>>> > >> going to need to decorate the structs with QEMU_PACKED.
>> > 
>>> > > Nope.
>>> > > These structs are carefully designed not to have any padding.
>> > 
>> > ...on every compiler and OS combination that QEMU builds for?
> Yes. All the way back to EGCS and before.
> GCC has been like this for many many years.

I would still prefer to have QEMU_PACKED (or __attribute((__packed__))
in the kernel).

>>> > > And if there was a bug and there was some padding, we still
>>> > > can't fix it with PACKED because this structure
>>> > > is used to interact with the guest code which does not
>>> > > have the packed attribute.
>> > 
>> > The guest code has to use a set of structure offsets and
>> > sizes which is fixed by the virtio ABI -- how it implements
>> > this is up to the guest (and if it misimplements it that is
>> > a guest bug and not our problem).

On the other hand, encouraging both userspace and guest to reuse a
single set of headers means that the bad offset becomes a spec bug more
than a guest bug.

> Deviating from driver in random ways is an endless source
> of hard to debug issues, and it's a very practical
> consideration because virtio spec is constantly
> being extended (unlike hardware which is mostly fixed).

Agreed---but the driver should use __attribute__((__packed__)) too.

Paolo

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-05-29 12:28               ` Paolo Bonzini
@ 2013-05-29 12:37                 ` Michael S. Tsirkin
  0 siblings, 0 replies; 109+ messages in thread
From: Michael S. Tsirkin @ 2013-05-29 12:37 UTC (permalink / raw)
  To: Paolo Bonzini
  Cc: Peter Maydell, Anthony Liguori, kvm, virtualization,
	Stefan Hajnoczi, KONRAD Frederic

On Wed, May 29, 2013 at 02:28:32PM +0200, Paolo Bonzini wrote:
> Il 29/05/2013 14:16, Michael S. Tsirkin ha scritto:
> >>>> > >> If you really want to use offsetof like this you're
> >>>> > >> going to need to decorate the structs with QEMU_PACKED.
> >> > 
> >>> > > Nope.
> >>> > > These structs are carefully designed not to have any padding.
> >> > 
> >> > ...on every compiler and OS combination that QEMU builds for?
> > Yes. All the way back to EGCS and before.
> > GCC has been like this for many many years.
> 
> I would still prefer to have QEMU_PACKED (or __attribute((__packed__))
> in the kernel).
> 
> >>> > > And if there was a bug and there was some padding, we still
> >>> > > can't fix it with PACKED because this structure
> >>> > > is used to interact with the guest code which does not
> >>> > > have the packed attribute.
> >> > 
> >> > The guest code has to use a set of structure offsets and
> >> > sizes which is fixed by the virtio ABI -- how it implements
> >> > this is up to the guest (and if it misimplements it that is
> >> > a guest bug and not our problem).
> 
> On the other hand, encouraging both userspace and guest to reuse a
> single set of headers means that the bad offset becomes a spec bug more
> than a guest bug.

Heh

> > Deviating from driver in random ways is an endless source
> > of hard to debug issues, and it's a very practical
> > consideration because virtio spec is constantly
> > being extended (unlike hardware which is mostly fixed).
> 
> Agreed---but the driver should use __attribute__((__packed__)) too.
> 
> Paolo

For these specific structures I don't mind, we never dereference them
anyway. If you do dereference a structure, using packed is generally
a mistake. In particular because GCC generates code that is
much slower.

You can also get nasty surprises e.g.

struct foo {
	char a;
	int b;
} __attribute__((packed));

struct foo foo;
int *a = &foo.a;

Last line compiles fine but isn't legal C.

So I think there are two cases with packed:
1. it does not change logical behaviour but results in bad code
   generated
2. it does change logical behaviour and leads to unsafe code

There aren't many good reasons to use packed: if you
must treat unaligned data, best to use constants.

-- 
MST

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-05-29  4:31     ` Rusty Russell
@ 2013-05-29 12:52       ` Anthony Liguori
  -1 siblings, 0 replies; 109+ messages in thread
From: Anthony Liguori @ 2013-05-29 12:52 UTC (permalink / raw)
  To: Rusty Russell, Michael S. Tsirkin, virtualization, kvm
  Cc: Peter Maydell, Paolo Bonzini, Michael S. Tsirkin,
	Stefan Hajnoczi, KONRAD Frederic

Rusty Russell <rusty@rustcorp.com.au> writes:

> Anthony Liguori <aliguori@us.ibm.com> writes:
>> "Michael S. Tsirkin" <mst@redhat.com> writes:
>>> +    case offsetof(struct virtio_pci_common_cfg, device_feature_select):
>>> +        return proxy->device_feature_select;
>>
>> Oh dear no...  Please use defines like the rest of QEMU.
>
> It is pretty ugly.

I think beauty is in the eye of the beholder here...

Pretty much every device we have has a switch statement like this.
Consistency wins when it comes to qualitative arguments like this.

> Yet the structure definitions are descriptive, capturing layout, size
> and endianness in natural a format readable by any C programmer.

>From an API design point of view, here are the problems I see:

1) C makes no guarantees about structure layout beyond the first
   member.  Yes, if it's naturally aligned or has a packed attribute,
   GCC does the right thing.  But this isn't kernel land anymore,
   portability matters and there are more compilers than GCC.

2) If we every introduce anything like latching, this doesn't work out
   so well anymore because it's hard to express in a single C structure
   the register layout at that point.  Perhaps a union could be used but
   padding may make it a bit challenging.

3) It suspect it's harder to review because a subtle change could more
   easily have broad impact.  If someone changed the type of a field
   from u32 to u16, it changes the offset of every other field.  That's
   not terribly obvious in the patch itself unless you understand how
   the structure is used elsewhere.

   This may not be a problem for virtio because we all understand that
   the structures are part of an ABI, but if we used this pattern more
   in QEMU, it would be a lot less obvious.

> So AFAICT the question is, do we put the required
>
> #define VIRTIO_PCI_CFG_FEATURE_SEL \
>          (offsetof(struct virtio_pci_common_cfg, device_feature_select))
>
> etc. in the kernel headers or qemu?

I'm pretty sure we would end up just having our own integer defines.  We
carry our own virtio headers today because we can't easily import the
kernel headers.

>> Haven't looked at the proposed new ring layout yet.
>
> No change, but there's an open question on whether we should nail it to
> little endian (or define the endian by the transport).
>
> Of course, I can't rule out that the 1.0 standard *may* decide to frob
> the ring layout somehow,

Well, given that virtio is widely deployed today, I would think the 1.0
standard should strictly reflect what's deployed today, no?

Any new config layout would be 2.0 material, right?

Re: the new config layout, I don't think we would want to use it for
anything but new devices.  Forcing a guest driver change is a really big
deal and I see no reason to do that unless there's a compelling reason
to.

So we're stuck with the 1.0 config layout for a very long time.

Regards,

Anthony Liguori

> but I'd think it would require a compelling
> reason.  I suggest that's 2.0 material...
>
> Cheers,
> Rusty.
>
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
@ 2013-05-29 12:52       ` Anthony Liguori
  0 siblings, 0 replies; 109+ messages in thread
From: Anthony Liguori @ 2013-05-29 12:52 UTC (permalink / raw)
  To: Rusty Russell, virtualization, kvm
  Cc: Peter Maydell, Paolo Bonzini, Michael S. Tsirkin,
	Stefan Hajnoczi, KONRAD Frederic

Rusty Russell <rusty@rustcorp.com.au> writes:

> Anthony Liguori <aliguori@us.ibm.com> writes:
>> "Michael S. Tsirkin" <mst@redhat.com> writes:
>>> +    case offsetof(struct virtio_pci_common_cfg, device_feature_select):
>>> +        return proxy->device_feature_select;
>>
>> Oh dear no...  Please use defines like the rest of QEMU.
>
> It is pretty ugly.

I think beauty is in the eye of the beholder here...

Pretty much every device we have has a switch statement like this.
Consistency wins when it comes to qualitative arguments like this.

> Yet the structure definitions are descriptive, capturing layout, size
> and endianness in natural a format readable by any C programmer.

From an API design point of view, here are the problems I see:

1) C makes no guarantees about structure layout beyond the first
   member.  Yes, if it's naturally aligned or has a packed attribute,
   GCC does the right thing.  But this isn't kernel land anymore,
   portability matters and there are more compilers than GCC.

2) If we every introduce anything like latching, this doesn't work out
   so well anymore because it's hard to express in a single C structure
   the register layout at that point.  Perhaps a union could be used but
   padding may make it a bit challenging.

3) It suspect it's harder to review because a subtle change could more
   easily have broad impact.  If someone changed the type of a field
   from u32 to u16, it changes the offset of every other field.  That's
   not terribly obvious in the patch itself unless you understand how
   the structure is used elsewhere.

   This may not be a problem for virtio because we all understand that
   the structures are part of an ABI, but if we used this pattern more
   in QEMU, it would be a lot less obvious.

> So AFAICT the question is, do we put the required
>
> #define VIRTIO_PCI_CFG_FEATURE_SEL \
>          (offsetof(struct virtio_pci_common_cfg, device_feature_select))
>
> etc. in the kernel headers or qemu?

I'm pretty sure we would end up just having our own integer defines.  We
carry our own virtio headers today because we can't easily import the
kernel headers.

>> Haven't looked at the proposed new ring layout yet.
>
> No change, but there's an open question on whether we should nail it to
> little endian (or define the endian by the transport).
>
> Of course, I can't rule out that the 1.0 standard *may* decide to frob
> the ring layout somehow,

Well, given that virtio is widely deployed today, I would think the 1.0
standard should strictly reflect what's deployed today, no?

Any new config layout would be 2.0 material, right?

Re: the new config layout, I don't think we would want to use it for
anything but new devices.  Forcing a guest driver change is a really big
deal and I see no reason to do that unless there's a compelling reason
to.

So we're stuck with the 1.0 config layout for a very long time.

Regards,

Anthony Liguori

> but I'd think it would require a compelling
> reason.  I suggest that's 2.0 material...
>
> Cheers,
> Rusty.
>
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-05-29 12:52       ` Anthony Liguori
  (?)
@ 2013-05-29 13:24       ` Michael S. Tsirkin
  2013-05-29 13:35         ` Peter Maydell
                           ` (4 more replies)
  -1 siblings, 5 replies; 109+ messages in thread
From: Michael S. Tsirkin @ 2013-05-29 13:24 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: Peter Maydell, kvm, virtualization, Stefan Hajnoczi,
	Paolo Bonzini, KONRAD Frederic

On Wed, May 29, 2013 at 07:52:37AM -0500, Anthony Liguori wrote:
> Rusty Russell <rusty@rustcorp.com.au> writes:
> 
> > Anthony Liguori <aliguori@us.ibm.com> writes:
> >> "Michael S. Tsirkin" <mst@redhat.com> writes:
> >>> +    case offsetof(struct virtio_pci_common_cfg, device_feature_select):
> >>> +        return proxy->device_feature_select;
> >>
> >> Oh dear no...  Please use defines like the rest of QEMU.
> >
> > It is pretty ugly.
> 
> I think beauty is in the eye of the beholder here...
> 
> Pretty much every device we have has a switch statement like this.
> Consistency wins when it comes to qualitative arguments like this.
> 
> > Yet the structure definitions are descriptive, capturing layout, size
> > and endianness in natural a format readable by any C programmer.
> 
> >From an API design point of view, here are the problems I see:
> 
> 1) C makes no guarantees about structure layout beyond the first
>    member.  Yes, if it's naturally aligned or has a packed attribute,
>    GCC does the right thing.  But this isn't kernel land anymore,
>    portability matters and there are more compilers than GCC.

You expect a compiler to pad this structure:

struct foo {
	uint8_t a;
	uint8_t b;
	uint16_t c;
	uint32_t d;
};

I'm guessing any compiler that decides to waste memory in this way
will quickly get dropped by users and then we won't worry
about building QEMU with it.

> 2) If we every introduce anything like latching, this doesn't work out
>    so well anymore because it's hard to express in a single C structure
>    the register layout at that point.  Perhaps a union could be used but
>    padding may make it a bit challenging.

Then linux won't use it either.

> 3) It suspect it's harder to review because a subtle change could more
>    easily have broad impact.  If someone changed the type of a field
>    from u32 to u16, it changes the offset of every other field.  That's
>    not terribly obvious in the patch itself unless you understand how
>    the structure is used elsewhere.
> 
>    This may not be a problem for virtio because we all understand that
>    the structures are part of an ABI, but if we used this pattern more
>    in QEMU, it would be a lot less obvious.

So let's not use it more in QEMU.

> > So AFAICT the question is, do we put the required
> >
> > #define VIRTIO_PCI_CFG_FEATURE_SEL \
> >          (offsetof(struct virtio_pci_common_cfg, device_feature_select))
> >
> > etc. in the kernel headers or qemu?
> 
> I'm pretty sure we would end up just having our own integer defines.  We
> carry our own virtio headers today because we can't easily import the
> kernel headers.

Yes we can easily import them.
And at least we copy headers verbatim.


> >> Haven't looked at the proposed new ring layout yet.
> >
> > No change, but there's an open question on whether we should nail it to
> > little endian (or define the endian by the transport).
> >
> > Of course, I can't rule out that the 1.0 standard *may* decide to frob
> > the ring layout somehow,
> 
> Well, given that virtio is widely deployed today, I would think the 1.0
> standard should strictly reflect what's deployed today, no?
> Any new config layout would be 2.0 material, right?

Not as it's currently planned. Devices can choose
to support a legacy layout in addition to the new one,
and if you look at the patch you will see that that
is exactly what it does.

> Re: the new config layout, I don't think we would want to use it for
> anything but new devices.  Forcing a guest driver change

There's no forcing.
If you look at the patches closely, you will see that
we still support the old layout on BAR0.


> is a really big
> deal and I see no reason to do that unless there's a compelling reason
> to.

There are many a compelling reasons, and they are well known
limitations of virtio PCI:

- PCI spec compliance (madates device operation with IO memory disabled).
- support 64 bit addressing
- add more than 32 feature bits.
- individually disable queues.
- sanely support cross-endian systems.
- support very small (<1 PAGE) for virtio rings.
- support a separate page for each vq kick.
- make each device place config at flexible offset.

Addressing any one of these would cause us to add a substantially new
way to operate virtio devices.

And since it's a guest change anyway, it seemed like a
good time to do the new layout and fix everything in one go.

And they are needed like yesterday.


> So we're stuck with the 1.0 config layout for a very long time.
> 
> Regards,
> 
> Anthony Liguori

Absolutely. This patch let us support both which will allow for
a gradual transition over the next 10 years or so.

> > reason.  I suggest that's 2.0 material...
> >
> > Cheers,
> > Rusty.
> >
> > --
> > To unsubscribe from this list: send the line "unsubscribe kvm" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-05-29 13:24       ` Michael S. Tsirkin
@ 2013-05-29 13:35         ` Peter Maydell
  2013-05-29 13:35         ` Peter Maydell
                           ` (3 subsequent siblings)
  4 siblings, 0 replies; 109+ messages in thread
From: Peter Maydell @ 2013-05-29 13:35 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Anthony Liguori, Rusty Russell, virtualization, kvm,
	KONRAD Frederic, Paolo Bonzini, Stefan Hajnoczi

On 29 May 2013 14:24, Michael S. Tsirkin <mst@redhat.com> wrote:
> On Wed, May 29, 2013 at 07:52:37AM -0500, Anthony Liguori wrote:
>> 1) C makes no guarantees about structure layout beyond the first
>>    member.  Yes, if it's naturally aligned or has a packed attribute,
>>    GCC does the right thing.  But this isn't kernel land anymore,
>>    portability matters and there are more compilers than GCC.
>
> You expect a compiler to pad this structure:
>
> struct foo {
>         uint8_t a;
>         uint8_t b;
>         uint16_t c;
>         uint32_t d;
> };
>
> I'm guessing any compiler that decides to waste memory in this way
> will quickly get dropped by users and then we won't worry
> about building QEMU with it.

Structure alignment is a platform ABI choice. Oddly enough people
choose operating systems on more criteria than the amount of
padding their ABI mandates in structures.

In any case it's really tedious to read a structure and wade
through working out whether it's going to have all its members
naturally aligned, especially once it's more than a handful
of members in size. And getting it wrong is "silent portability
problem".

thanks
-- PMM

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-05-29 13:24       ` Michael S. Tsirkin
  2013-05-29 13:35         ` Peter Maydell
@ 2013-05-29 13:35         ` Peter Maydell
  2013-05-29 13:41         ` Paolo Bonzini
                           ` (2 subsequent siblings)
  4 siblings, 0 replies; 109+ messages in thread
From: Peter Maydell @ 2013-05-29 13:35 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Anthony Liguori, kvm, virtualization, Stefan Hajnoczi,
	Paolo Bonzini, KONRAD Frederic

On 29 May 2013 14:24, Michael S. Tsirkin <mst@redhat.com> wrote:
> On Wed, May 29, 2013 at 07:52:37AM -0500, Anthony Liguori wrote:
>> 1) C makes no guarantees about structure layout beyond the first
>>    member.  Yes, if it's naturally aligned or has a packed attribute,
>>    GCC does the right thing.  But this isn't kernel land anymore,
>>    portability matters and there are more compilers than GCC.
>
> You expect a compiler to pad this structure:
>
> struct foo {
>         uint8_t a;
>         uint8_t b;
>         uint16_t c;
>         uint32_t d;
> };
>
> I'm guessing any compiler that decides to waste memory in this way
> will quickly get dropped by users and then we won't worry
> about building QEMU with it.

Structure alignment is a platform ABI choice. Oddly enough people
choose operating systems on more criteria than the amount of
padding their ABI mandates in structures.

In any case it's really tedious to read a structure and wade
through working out whether it's going to have all its members
naturally aligned, especially once it's more than a handful
of members in size. And getting it wrong is "silent portability
problem".

thanks
-- PMM

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-05-29 13:24       ` Michael S. Tsirkin
  2013-05-29 13:35         ` Peter Maydell
  2013-05-29 13:35         ` Peter Maydell
@ 2013-05-29 13:41         ` Paolo Bonzini
  2013-05-29 14:02           ` Michael S. Tsirkin
                             ` (3 more replies)
  2013-05-29 14:16         ` Anthony Liguori
  2013-05-29 14:16         ` Anthony Liguori
  4 siblings, 4 replies; 109+ messages in thread
From: Paolo Bonzini @ 2013-05-29 13:41 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Peter Maydell, Anthony Liguori, kvm, virtualization,
	Stefan Hajnoczi, KONRAD Frederic

Il 29/05/2013 15:24, Michael S. Tsirkin ha scritto:
> You expect a compiler to pad this structure:
> 
> struct foo {
> 	uint8_t a;
> 	uint8_t b;
> 	uint16_t c;
> 	uint32_t d;
> };
> 
> I'm guessing any compiler that decides to waste memory in this way
> will quickly get dropped by users and then we won't worry
> about building QEMU with it.

You know the virtio-pci config structures are padded, but not all of
them are.  For example, virtio_balloon_stat is not padded and indeed has
an __attribute__((__packed__)) in the spec.

For this reason I prefer to have the attribute everywhere.  So people
don't have to wonder why it's here and not there.

Paolo

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-05-29 13:41         ` Paolo Bonzini
@ 2013-05-29 14:02           ` Michael S. Tsirkin
  2013-05-29 14:18           ` Anthony Liguori
                             ` (2 subsequent siblings)
  3 siblings, 0 replies; 109+ messages in thread
From: Michael S. Tsirkin @ 2013-05-29 14:02 UTC (permalink / raw)
  To: Paolo Bonzini
  Cc: Peter Maydell, Anthony Liguori, kvm, virtualization,
	Stefan Hajnoczi, KONRAD Frederic

On Wed, May 29, 2013 at 03:41:09PM +0200, Paolo Bonzini wrote:
> Il 29/05/2013 15:24, Michael S. Tsirkin ha scritto:
> > You expect a compiler to pad this structure:
> > 
> > struct foo {
> > 	uint8_t a;
> > 	uint8_t b;
> > 	uint16_t c;
> > 	uint32_t d;
> > };
> > 
> > I'm guessing any compiler that decides to waste memory in this way
> > will quickly get dropped by users and then we won't worry
> > about building QEMU with it.
> 
> You know the virtio-pci config structures are padded, but not all of
> them are.  For example, virtio_balloon_stat is not padded and indeed has
> an __attribute__((__packed__)) in the spec.
> 
> For this reason I prefer to have the attribute everywhere.  So people
> don't have to wonder why it's here and not there.
> 
> Paolo

FWIW I think it was a mistake to lay the balloon structure out like that.
We should have padded it manually.
__attribute__((__packed__)) is really easy to misuse.
If you get into such a situation, just use offset enums ...

-- 
MST

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-05-29 13:24       ` Michael S. Tsirkin
                           ` (3 preceding siblings ...)
  2013-05-29 14:16         ` Anthony Liguori
@ 2013-05-29 14:16         ` Anthony Liguori
  2013-05-29 14:30           ` Michael S. Tsirkin
  4 siblings, 1 reply; 109+ messages in thread
From: Anthony Liguori @ 2013-05-29 14:16 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Rusty Russell, virtualization, kvm, KONRAD Frederic,
	Paolo Bonzini, Peter Maydell, Stefan Hajnoczi

"Michael S. Tsirkin" <mst@redhat.com> writes:

> On Wed, May 29, 2013 at 07:52:37AM -0500, Anthony Liguori wrote:
>> 1) C makes no guarantees about structure layout beyond the first
>>    member.  Yes, if it's naturally aligned or has a packed attribute,
>>    GCC does the right thing.  But this isn't kernel land anymore,
>>    portability matters and there are more compilers than GCC.
>
> You expect a compiler to pad this structure:
>
> struct foo {
> 	uint8_t a;
> 	uint8_t b;
> 	uint16_t c;
> 	uint32_t d;
> };
>
> I'm guessing any compiler that decides to waste memory in this way
> will quickly get dropped by users and then we won't worry
> about building QEMU with it.

There are other responses in the thread here and I don't really care to
bikeshed on this issue.

>> Well, given that virtio is widely deployed today, I would think the 1.0
>> standard should strictly reflect what's deployed today, no?
>> Any new config layout would be 2.0 material, right?
>
> Not as it's currently planned. Devices can choose
> to support a legacy layout in addition to the new one,
> and if you look at the patch you will see that that
> is exactly what it does.

Adding a new BAR most certainly requires bumping the revision ID or
changing the device ID, no?

Didn't we run into this problem with the virtio-win drivers with just
the BAR size changing? 

>> Re: the new config layout, I don't think we would want to use it for
>> anything but new devices.  Forcing a guest driver change
>
> There's no forcing.
> If you look at the patches closely, you will see that
> we still support the old layout on BAR0.
>
>
>> is a really big
>> deal and I see no reason to do that unless there's a compelling reason
>> to.
>
> There are many a compelling reasons, and they are well known
> limitations of virtio PCI:
>
> - PCI spec compliance (madates device operation with IO memory
> disabled).

PCI express spec.  We are fully compliant with the PCI spec.  And what's
the user visible advantage of pointing an emulated virtio device behind
a PCI-e bus verses a legacy PCI bus?

This is a very good example because if we have to disable BAR0, then
it's an ABI breaker plan and simple.

> - support 64 bit addressing

We currently support 44-bit addressing for the ring.  While I agree we
need to bump it, there's no immediate problem with 44-bit addressing.

> - add more than 32 feature bits.
> - individually disable queues.
> - sanely support cross-endian systems.
> - support very small (<1 PAGE) for virtio rings.
> - support a separate page for each vq kick.
> - make each device place config at flexible offset.

None of these things are holding us back today.

I'm not saying we shouldn't introduce a new device.  But adoption of
that device will be slow and realistically will be limited to new
devices only.

We'll be supporting both devices for a very, very long time.

Compatibility is the fundamental value that we provide.  We need to go
out of our way to make sure that existing guests work and work as well
as possible.

Sticking virtio devices behind a PCI-e bus just for the hell of it isn't
a compelling reason to break existing guests.

Regards,

Anthony Liguori


> Addressing any one of these would cause us to add a substantially new
> way to operate virtio devices.
>
> And since it's a guest change anyway, it seemed like a
> good time to do the new layout and fix everything in one go.
>
> And they are needed like yesterday.
>
>
>> So we're stuck with the 1.0 config layout for a very long time.
>> 
>> Regards,
>> 
>> Anthony Liguori
>
> Absolutely. This patch let us support both which will allow for
> a gradual transition over the next 10 years or so.
>
>> > reason.  I suggest that's 2.0 material...
>> >
>> > Cheers,
>> > Rusty.
>> >
>> > --
>> > To unsubscribe from this list: send the line "unsubscribe kvm" in
>> > the body of a message to majordomo@vger.kernel.org
>> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html


^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-05-29 13:24       ` Michael S. Tsirkin
                           ` (2 preceding siblings ...)
  2013-05-29 13:41         ` Paolo Bonzini
@ 2013-05-29 14:16         ` Anthony Liguori
  2013-05-29 14:16         ` Anthony Liguori
  4 siblings, 0 replies; 109+ messages in thread
From: Anthony Liguori @ 2013-05-29 14:16 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Peter Maydell, kvm, virtualization, Stefan Hajnoczi,
	Paolo Bonzini, KONRAD Frederic

"Michael S. Tsirkin" <mst@redhat.com> writes:

> On Wed, May 29, 2013 at 07:52:37AM -0500, Anthony Liguori wrote:
>> 1) C makes no guarantees about structure layout beyond the first
>>    member.  Yes, if it's naturally aligned or has a packed attribute,
>>    GCC does the right thing.  But this isn't kernel land anymore,
>>    portability matters and there are more compilers than GCC.
>
> You expect a compiler to pad this structure:
>
> struct foo {
> 	uint8_t a;
> 	uint8_t b;
> 	uint16_t c;
> 	uint32_t d;
> };
>
> I'm guessing any compiler that decides to waste memory in this way
> will quickly get dropped by users and then we won't worry
> about building QEMU with it.

There are other responses in the thread here and I don't really care to
bikeshed on this issue.

>> Well, given that virtio is widely deployed today, I would think the 1.0
>> standard should strictly reflect what's deployed today, no?
>> Any new config layout would be 2.0 material, right?
>
> Not as it's currently planned. Devices can choose
> to support a legacy layout in addition to the new one,
> and if you look at the patch you will see that that
> is exactly what it does.

Adding a new BAR most certainly requires bumping the revision ID or
changing the device ID, no?

Didn't we run into this problem with the virtio-win drivers with just
the BAR size changing? 

>> Re: the new config layout, I don't think we would want to use it for
>> anything but new devices.  Forcing a guest driver change
>
> There's no forcing.
> If you look at the patches closely, you will see that
> we still support the old layout on BAR0.
>
>
>> is a really big
>> deal and I see no reason to do that unless there's a compelling reason
>> to.
>
> There are many a compelling reasons, and they are well known
> limitations of virtio PCI:
>
> - PCI spec compliance (madates device operation with IO memory
> disabled).

PCI express spec.  We are fully compliant with the PCI spec.  And what's
the user visible advantage of pointing an emulated virtio device behind
a PCI-e bus verses a legacy PCI bus?

This is a very good example because if we have to disable BAR0, then
it's an ABI breaker plan and simple.

> - support 64 bit addressing

We currently support 44-bit addressing for the ring.  While I agree we
need to bump it, there's no immediate problem with 44-bit addressing.

> - add more than 32 feature bits.
> - individually disable queues.
> - sanely support cross-endian systems.
> - support very small (<1 PAGE) for virtio rings.
> - support a separate page for each vq kick.
> - make each device place config at flexible offset.

None of these things are holding us back today.

I'm not saying we shouldn't introduce a new device.  But adoption of
that device will be slow and realistically will be limited to new
devices only.

We'll be supporting both devices for a very, very long time.

Compatibility is the fundamental value that we provide.  We need to go
out of our way to make sure that existing guests work and work as well
as possible.

Sticking virtio devices behind a PCI-e bus just for the hell of it isn't
a compelling reason to break existing guests.

Regards,

Anthony Liguori


> Addressing any one of these would cause us to add a substantially new
> way to operate virtio devices.
>
> And since it's a guest change anyway, it seemed like a
> good time to do the new layout and fix everything in one go.
>
> And they are needed like yesterday.
>
>
>> So we're stuck with the 1.0 config layout for a very long time.
>> 
>> Regards,
>> 
>> Anthony Liguori
>
> Absolutely. This patch let us support both which will allow for
> a gradual transition over the next 10 years or so.
>
>> > reason.  I suggest that's 2.0 material...
>> >
>> > Cheers,
>> > Rusty.
>> >
>> > --
>> > To unsubscribe from this list: send the line "unsubscribe kvm" in
>> > the body of a message to majordomo@vger.kernel.org
>> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-05-29 13:41         ` Paolo Bonzini
  2013-05-29 14:02           ` Michael S. Tsirkin
  2013-05-29 14:18           ` Anthony Liguori
@ 2013-05-29 14:18           ` Anthony Liguori
  2013-05-30  7:43           ` Michael S. Tsirkin
  3 siblings, 0 replies; 109+ messages in thread
From: Anthony Liguori @ 2013-05-29 14:18 UTC (permalink / raw)
  To: Paolo Bonzini, Michael S. Tsirkin
  Cc: Rusty Russell, virtualization, kvm, KONRAD Frederic,
	Peter Maydell, Stefan Hajnoczi

Paolo Bonzini <pbonzini@redhat.com> writes:

> Il 29/05/2013 15:24, Michael S. Tsirkin ha scritto:
>> You expect a compiler to pad this structure:
>> 
>> struct foo {
>> 	uint8_t a;
>> 	uint8_t b;
>> 	uint16_t c;
>> 	uint32_t d;
>> };
>> 
>> I'm guessing any compiler that decides to waste memory in this way
>> will quickly get dropped by users and then we won't worry
>> about building QEMU with it.
>
> You know the virtio-pci config structures are padded, but not all of
> them are.  For example, virtio_balloon_stat is not padded and indeed has
> an __attribute__((__packed__)) in the spec.

Not that these structures are actually used for something.

We store the config in these structures so they are actually used for
something.

The proposed structures only serve as a way to express offsets.  You
would never actually have a variable of this type.

Regards,

Anthony Liguori

>
> For this reason I prefer to have the attribute everywhere.  So people
> don't have to wonder why it's here and not there.
>
> Paolo
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html


^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-05-29 13:41         ` Paolo Bonzini
  2013-05-29 14:02           ` Michael S. Tsirkin
@ 2013-05-29 14:18           ` Anthony Liguori
  2013-05-29 14:18           ` Anthony Liguori
  2013-05-30  7:43           ` Michael S. Tsirkin
  3 siblings, 0 replies; 109+ messages in thread
From: Anthony Liguori @ 2013-05-29 14:18 UTC (permalink / raw)
  To: Paolo Bonzini, Michael S. Tsirkin
  Cc: Peter Maydell, kvm, virtualization, Stefan Hajnoczi, KONRAD Frederic

Paolo Bonzini <pbonzini@redhat.com> writes:

> Il 29/05/2013 15:24, Michael S. Tsirkin ha scritto:
>> You expect a compiler to pad this structure:
>> 
>> struct foo {
>> 	uint8_t a;
>> 	uint8_t b;
>> 	uint16_t c;
>> 	uint32_t d;
>> };
>> 
>> I'm guessing any compiler that decides to waste memory in this way
>> will quickly get dropped by users and then we won't worry
>> about building QEMU with it.
>
> You know the virtio-pci config structures are padded, but not all of
> them are.  For example, virtio_balloon_stat is not padded and indeed has
> an __attribute__((__packed__)) in the spec.

Not that these structures are actually used for something.

We store the config in these structures so they are actually used for
something.

The proposed structures only serve as a way to express offsets.  You
would never actually have a variable of this type.

Regards,

Anthony Liguori

>
> For this reason I prefer to have the attribute everywhere.  So people
> don't have to wonder why it's here and not there.
>
> Paolo
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-05-29 14:16         ` Anthony Liguori
@ 2013-05-29 14:30           ` Michael S. Tsirkin
  2013-05-29 14:32             ` Paolo Bonzini
                               ` (2 more replies)
  0 siblings, 3 replies; 109+ messages in thread
From: Michael S. Tsirkin @ 2013-05-29 14:30 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: Peter Maydell, kvm, virtualization, Stefan Hajnoczi,
	Paolo Bonzini, KONRAD Frederic

On Wed, May 29, 2013 at 09:16:39AM -0500, Anthony Liguori wrote:
> "Michael S. Tsirkin" <mst@redhat.com> writes:
> 
> > On Wed, May 29, 2013 at 07:52:37AM -0500, Anthony Liguori wrote:
> >> 1) C makes no guarantees about structure layout beyond the first
> >>    member.  Yes, if it's naturally aligned or has a packed attribute,
> >>    GCC does the right thing.  But this isn't kernel land anymore,
> >>    portability matters and there are more compilers than GCC.
> >
> > You expect a compiler to pad this structure:
> >
> > struct foo {
> > 	uint8_t a;
> > 	uint8_t b;
> > 	uint16_t c;
> > 	uint32_t d;
> > };
> >
> > I'm guessing any compiler that decides to waste memory in this way
> > will quickly get dropped by users and then we won't worry
> > about building QEMU with it.
> 
> There are other responses in the thread here and I don't really care to
> bikeshed on this issue.

Great. Let's make the bikeshed blue then?

> >> Well, given that virtio is widely deployed today, I would think the 1.0
> >> standard should strictly reflect what's deployed today, no?
> >> Any new config layout would be 2.0 material, right?
> >
> > Not as it's currently planned. Devices can choose
> > to support a legacy layout in addition to the new one,
> > and if you look at the patch you will see that that
> > is exactly what it does.
> 
> Adding a new BAR most certainly requires bumping the revision ID or
> changing the device ID, no?

No, why would it?

If a device dropped BAR0, that would be a good reason
to bump revision ID.
We don't do this yet.

> Didn't we run into this problem with the virtio-win drivers with just
> the BAR size changing? 

Because they had a bug: they validated BAR0 size. AFAIK they don't care
what happens with other bars.

> >> Re: the new config layout, I don't think we would want to use it for
> >> anything but new devices.  Forcing a guest driver change
> >
> > There's no forcing.
> > If you look at the patches closely, you will see that
> > we still support the old layout on BAR0.
> >
> >
> >> is a really big
> >> deal and I see no reason to do that unless there's a compelling reason
> >> to.
> >
> > There are many a compelling reasons, and they are well known
> > limitations of virtio PCI:
> >
> > - PCI spec compliance (madates device operation with IO memory
> > disabled).
> 
> PCI express spec.  We are fully compliant with the PCI spec.  And what's
> the user visible advantage of pointing an emulated virtio device behind
> a PCI-e bus verses a legacy PCI bus?

Native hotplug support.

> This is a very good example because if we have to disable BAR0, then
> it's an ABI breaker plan and simple.

Not we. The BIOS can disable IO BAR: it can do this already
but the device won't be functional.

> > - support 64 bit addressing
> 
> We currently support 44-bit addressing for the ring.  While I agree we
> need to bump it, there's no immediate problem with 44-bit addressing.

I heard developers (though not users) complaining.

> > - add more than 32 feature bits.
> > - individually disable queues.
> > - sanely support cross-endian systems.
> > - support very small (<1 PAGE) for virtio rings.
> > - support a separate page for each vq kick.
> > - make each device place config at flexible offset.
> 
> None of these things are holding us back today.

All of them do, to bigger or lesser degree.


> I'm not saying we shouldn't introduce a new device.  But adoption of
> that device will be slow and realistically will be limited to new
> devices only.
> 
> We'll be supporting both devices for a very, very long time.

This is true for any new feature. What are you trying to say here?
We won't add new features to old config: for once, we have
run out of feature bits.

> Compatibility is the fundamental value that we provide.  We need to go
> out of our way to make sure that existing guests work and work as well
> as possible.

What are you trying to say?
There's nothing here that breaks compatibility.
Have you looked at the patch?

I'm wasting my time arguing on the mailing list, but
once I tear myself away from this occupation,
I intend to verify that I can run an old guest on
qemu with this patch without issues.

> Sticking virtio devices behind a PCI-e bus just for the hell of it isn't
> a compelling reason to break existing guests.
> 
> Regards,
> 
> Anthony Liguori

That's why my patch does not break existing guests.


> 
> > Addressing any one of these would cause us to add a substantially new
> > way to operate virtio devices.
> >
> > And since it's a guest change anyway, it seemed like a
> > good time to do the new layout and fix everything in one go.
> >
> > And they are needed like yesterday.
> >
> >
> >> So we're stuck with the 1.0 config layout for a very long time.
> >> 
> >> Regards,
> >> 
> >> Anthony Liguori
> >
> > Absolutely. This patch let us support both which will allow for
> > a gradual transition over the next 10 years or so.
> >
> >> > reason.  I suggest that's 2.0 material...
> >> >
> >> > Cheers,
> >> > Rusty.
> >> >
> >> > --
> >> > To unsubscribe from this list: send the line "unsubscribe kvm" in
> >> > the body of a message to majordomo@vger.kernel.org
> >> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > --
> > To unsubscribe from this list: send the line "unsubscribe kvm" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-05-29 14:30           ` Michael S. Tsirkin
@ 2013-05-29 14:32             ` Paolo Bonzini
  2013-05-29 14:52               ` Michael S. Tsirkin
  2013-05-29 14:55             ` Anthony Liguori
  2013-05-29 14:55             ` Anthony Liguori
  2 siblings, 1 reply; 109+ messages in thread
From: Paolo Bonzini @ 2013-05-29 14:32 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Peter Maydell, Anthony Liguori, kvm, virtualization,
	Stefan Hajnoczi, KONRAD Frederic

Il 29/05/2013 16:30, Michael S. Tsirkin ha scritto:
>> > There are other responses in the thread here and I don't really care to
>> > bikeshed on this issue.
> Great. Let's make the bikeshed blue then?

Yes, let's make it blue, but please promise to check into Peter's
register API so we can remove the case offsetof.

I checked that it works on RHEL5, which is 4.1 and probably the oldest
compiler we care about (any other 4.1 lacks the __sync builtins;
upstream added them in 4.2).

Paolo

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-05-29 14:32             ` Paolo Bonzini
@ 2013-05-29 14:52               ` Michael S. Tsirkin
  0 siblings, 0 replies; 109+ messages in thread
From: Michael S. Tsirkin @ 2013-05-29 14:52 UTC (permalink / raw)
  To: Paolo Bonzini
  Cc: Peter Maydell, Anthony Liguori, kvm, virtualization,
	Stefan Hajnoczi, KONRAD Frederic

On Wed, May 29, 2013 at 04:32:31PM +0200, Paolo Bonzini wrote:
> Il 29/05/2013 16:30, Michael S. Tsirkin ha scritto:
> >> > There are other responses in the thread here and I don't really care to
> >> > bikeshed on this issue.
> > Great. Let's make the bikeshed blue then?
> 
> Yes, let's make it blue, but please promise to check into Peter's
> register API so we can remove the case offsetof.

I think it's overkill here.  virtio was designed to be easy to implement
with a simple switch statement.

> I checked that it works on RHEL5, which is 4.1 and probably the oldest
> compiler we care about (any other 4.1 lacks the __sync builtins;
> upstream added them in 4.2).
> 
> Paolo

Are you sure? Documentation says 4.0 ...

-- 
MST

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-05-29 14:30           ` Michael S. Tsirkin
  2013-05-29 14:32             ` Paolo Bonzini
@ 2013-05-29 14:55             ` Anthony Liguori
  2013-05-29 16:12               ` Michael S. Tsirkin
  2013-05-29 18:16               ` Michael S. Tsirkin
  2013-05-29 14:55             ` Anthony Liguori
  2 siblings, 2 replies; 109+ messages in thread
From: Anthony Liguori @ 2013-05-29 14:55 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Rusty Russell, virtualization, kvm, KONRAD Frederic,
	Paolo Bonzini, Peter Maydell, Stefan Hajnoczi

"Michael S. Tsirkin" <mst@redhat.com> writes:

> On Wed, May 29, 2013 at 09:16:39AM -0500, Anthony Liguori wrote:
>> "Michael S. Tsirkin" <mst@redhat.com> writes:
>> > I'm guessing any compiler that decides to waste memory in this way
>> > will quickly get dropped by users and then we won't worry
>> > about building QEMU with it.
>> 
>> There are other responses in the thread here and I don't really care to
>> bikeshed on this issue.
>
> Great. Let's make the bikeshed blue then?

It's fun to argue about stuff like this and I certainly have an opinion,
but I honestly don't care all that much about the offsetof thing.
However...


>
>> >> Well, given that virtio is widely deployed today, I would think the 1.0
>> >> standard should strictly reflect what's deployed today, no?
>> >> Any new config layout would be 2.0 material, right?
>> >
>> > Not as it's currently planned. Devices can choose
>> > to support a legacy layout in addition to the new one,
>> > and if you look at the patch you will see that that
>> > is exactly what it does.
>> 
>> Adding a new BAR most certainly requires bumping the revision ID or
>> changing the device ID, no?
>
> No, why would it?

If we change the programming interface for a device in a way that is
incompatible, we are required to change the revision ID and/or device
ID.

> If a device dropped BAR0, that would be a good reason
> to bump revision ID.
> We don't do this yet.

But we have to drop BAR0 to put it behind a PCI express bus, right?

If that's the case, then device that's exposed on the PCI express bus
must use a different device ID and/or revision ID.

That means a new driver is needed in the guest.

>> Didn't we run into this problem with the virtio-win drivers with just
>> the BAR size changing? 
>
> Because they had a bug: they validated BAR0 size. AFAIK they don't care
> what happens with other bars.

I think there's a grey area with respect to the assumptions a device can
make about the programming interface.

But very concretely, we cannot expose virtio-pci-net via PCI express
with BAR0 disabled because that will result in existing virtio-pci Linux
drivers breaking.

> Not we. The BIOS can disable IO BAR: it can do this already
> but the device won't be functional.

But the only way to expose the device over PCI express is to disable the
IO BAR, right?

Regards,

Anthony Liguori


^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-05-29 14:30           ` Michael S. Tsirkin
  2013-05-29 14:32             ` Paolo Bonzini
  2013-05-29 14:55             ` Anthony Liguori
@ 2013-05-29 14:55             ` Anthony Liguori
  2 siblings, 0 replies; 109+ messages in thread
From: Anthony Liguori @ 2013-05-29 14:55 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Peter Maydell, kvm, virtualization, Stefan Hajnoczi,
	Paolo Bonzini, KONRAD Frederic

"Michael S. Tsirkin" <mst@redhat.com> writes:

> On Wed, May 29, 2013 at 09:16:39AM -0500, Anthony Liguori wrote:
>> "Michael S. Tsirkin" <mst@redhat.com> writes:
>> > I'm guessing any compiler that decides to waste memory in this way
>> > will quickly get dropped by users and then we won't worry
>> > about building QEMU with it.
>> 
>> There are other responses in the thread here and I don't really care to
>> bikeshed on this issue.
>
> Great. Let's make the bikeshed blue then?

It's fun to argue about stuff like this and I certainly have an opinion,
but I honestly don't care all that much about the offsetof thing.
However...


>
>> >> Well, given that virtio is widely deployed today, I would think the 1.0
>> >> standard should strictly reflect what's deployed today, no?
>> >> Any new config layout would be 2.0 material, right?
>> >
>> > Not as it's currently planned. Devices can choose
>> > to support a legacy layout in addition to the new one,
>> > and if you look at the patch you will see that that
>> > is exactly what it does.
>> 
>> Adding a new BAR most certainly requires bumping the revision ID or
>> changing the device ID, no?
>
> No, why would it?

If we change the programming interface for a device in a way that is
incompatible, we are required to change the revision ID and/or device
ID.

> If a device dropped BAR0, that would be a good reason
> to bump revision ID.
> We don't do this yet.

But we have to drop BAR0 to put it behind a PCI express bus, right?

If that's the case, then device that's exposed on the PCI express bus
must use a different device ID and/or revision ID.

That means a new driver is needed in the guest.

>> Didn't we run into this problem with the virtio-win drivers with just
>> the BAR size changing? 
>
> Because they had a bug: they validated BAR0 size. AFAIK they don't care
> what happens with other bars.

I think there's a grey area with respect to the assumptions a device can
make about the programming interface.

But very concretely, we cannot expose virtio-pci-net via PCI express
with BAR0 disabled because that will result in existing virtio-pci Linux
drivers breaking.

> Not we. The BIOS can disable IO BAR: it can do this already
> but the device won't be functional.

But the only way to expose the device over PCI express is to disable the
IO BAR, right?

Regards,

Anthony Liguori

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-05-29 14:55             ` Anthony Liguori
@ 2013-05-29 16:12               ` Michael S. Tsirkin
  2013-05-29 18:16               ` Michael S. Tsirkin
  1 sibling, 0 replies; 109+ messages in thread
From: Michael S. Tsirkin @ 2013-05-29 16:12 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: Peter Maydell, kvm, virtualization, Stefan Hajnoczi,
	Paolo Bonzini, KONRAD Frederic

On Wed, May 29, 2013 at 09:55:55AM -0500, Anthony Liguori wrote:
> "Michael S. Tsirkin" <mst@redhat.com> writes:
> 
> > On Wed, May 29, 2013 at 09:16:39AM -0500, Anthony Liguori wrote:
> >> "Michael S. Tsirkin" <mst@redhat.com> writes:
> >> > I'm guessing any compiler that decides to waste memory in this way
> >> > will quickly get dropped by users and then we won't worry
> >> > about building QEMU with it.
> >> 
> >> There are other responses in the thread here and I don't really care to
> >> bikeshed on this issue.
> >
> > Great. Let's make the bikeshed blue then?
> 
> It's fun to argue about stuff like this and I certainly have an opinion,
> but I honestly don't care all that much about the offsetof thing.
> However...
> 
> 
> >
> >> >> Well, given that virtio is widely deployed today, I would think the 1.0
> >> >> standard should strictly reflect what's deployed today, no?
> >> >> Any new config layout would be 2.0 material, right?
> >> >
> >> > Not as it's currently planned. Devices can choose
> >> > to support a legacy layout in addition to the new one,
> >> > and if you look at the patch you will see that that
> >> > is exactly what it does.
> >> 
> >> Adding a new BAR most certainly requires bumping the revision ID or
> >> changing the device ID, no?
> >
> > No, why would it?
> 
> If we change the programming interface for a device in a way that is
> incompatible, we are required to change the revision ID and/or device
> ID.

It's compatible.


> > If a device dropped BAR0, that would be a good reason
> > to bump revision ID.
> > We don't do this yet.
> 
> But we have to drop BAR0 to put it behind a PCI express bus, right?

No, no.

The PCIe spec states that failure to allocate an I/O BAR should still
allow the device to function.

That's *guest* allocating an I/O BAR.


> If that's the case, then device that's exposed on the PCI express bus
> must use a different device ID and/or revision ID.
> 
> That means a new driver is needed in the guest.
> 
> >> Didn't we run into this problem with the virtio-win drivers with just
> >> the BAR size changing? 
> >
> > Because they had a bug: they validated BAR0 size. AFAIK they don't care
> > what happens with other bars.
> 
> I think there's a grey area with respect to the assumptions a device can
> make about the programming interface.
> 
> But very concretely, we cannot expose virtio-pci-net via PCI express
> with BAR0 disabled because that will result in existing virtio-pci Linux
> drivers breaking.
> 
> > Not we. The BIOS can disable IO BAR: it can do this already
> > but the device won't be functional.
> 
> But the only way to expose the device over PCI express is to disable the
> IO BAR, right?
> 
> Regards,
> 
> Anthony Liguori

No :)

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-05-29 14:55             ` Anthony Liguori
  2013-05-29 16:12               ` Michael S. Tsirkin
@ 2013-05-29 18:16               ` Michael S. Tsirkin
  1 sibling, 0 replies; 109+ messages in thread
From: Michael S. Tsirkin @ 2013-05-29 18:16 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: Peter Maydell, kvm, virtualization, Stefan Hajnoczi,
	Paolo Bonzini, KONRAD Frederic

On Wed, May 29, 2013 at 09:55:55AM -0500, Anthony Liguori wrote:
> > Not we. The BIOS can disable IO BAR: it can do this already
> > but the device won't be functional.
> 
> But the only way to expose the device over PCI express is to disable the
> IO BAR, right?

I think this is the source of the misunderstanding:

	1.3.2.2.  PCI Express Endpoint Rules

	...

	PCI Express Endpoint must not depend on operating system
	allocation of I/O resources claimed through BAR(s).

The real meaning here is *not* that an Endpoint should not
have an I/O BAR. An Endpoint can have an I/O BAR,
and PCI Express spec supports I/O transactions.
The meaning is that an Endpoint should work
even if *the OS* decides to disable the I/O BAR.
Note: it's up to the guest. We support I/O
as a device with full compatibility for old guests,
but must be prepared to handle a guest which does not
enable I/O.

This likely means that as time goes on,
future OSes will start disabling I/O BARs,
relying on this not hurting functionality.

-- 
MST

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-05-29 12:52       ` Anthony Liguori
@ 2013-05-30  3:58         ` Rusty Russell
  -1 siblings, 0 replies; 109+ messages in thread
From: Rusty Russell @ 2013-05-30  3:58 UTC (permalink / raw)
  To: Anthony Liguori, Michael S. Tsirkin, virtualization, kvm
  Cc: Peter Maydell, Paolo Bonzini, Michael S. Tsirkin,
	Stefan Hajnoczi, KONRAD Frederic

Anthony Liguori <aliguori@us.ibm.com> writes:
> Rusty Russell <rusty@rustcorp.com.au> writes:
>
>> Anthony Liguori <aliguori@us.ibm.com> writes:
>>> "Michael S. Tsirkin" <mst@redhat.com> writes:
>>>> +    case offsetof(struct virtio_pci_common_cfg, device_feature_select):
>>>> +        return proxy->device_feature_select;
>>>
>>> Oh dear no...  Please use defines like the rest of QEMU.
>>
>> It is pretty ugly.
>
> I think beauty is in the eye of the beholder here...
>
> Pretty much every device we have has a switch statement like this.
> Consistency wins when it comes to qualitative arguments like this.

I was agreeing with you here, actually.

>> Yet the structure definitions are descriptive, capturing layout, size
>> and endianness in natural a format readable by any C programmer.
>
>>From an API design point of view, here are the problems I see:
>
> 1) C makes no guarantees about structure layout beyond the first
>    member.  Yes, if it's naturally aligned or has a packed attribute,
>    GCC does the right thing.  But this isn't kernel land anymore,
>    portability matters and there are more compilers than GCC.

[ I argued in detail here, then deleted.  Damn bikeshedding... ]

I think the best thing is to add the decoded integer versions as macros,
and have a heap of BUILD_BUG_ON() in the kernel source to make sure they
match.

> 3) It suspect it's harder to review because a subtle change could more
>    easily have broad impact.  If someone changed the type of a field
>    from u32 to u16, it changes the offset of every other field.  That's
>    not terribly obvious in the patch itself unless you understand how
>    the structure is used elsewhere.

MST's patch just did this, so point taken.  (MST: I'm going to combine
the cfg_type and bar bytes to fix this, patch coming).

>> No change, but there's an open question on whether we should nail it to
>> little endian (or define the endian by the transport).
>>
>> Of course, I can't rule out that the 1.0 standard *may* decide to frob
>> the ring layout somehow,
>
> Well, given that virtio is widely deployed today, I would think the 1.0
> standard should strictly reflect what's deployed today, no?

That will be up to the committee.  I think we want to fix some obvious
pain points, though qemu will not benefit from them in the next 5 years.

> Any new config layout would be 2.0 material, right?
>
> Re: the new config layout, I don't think we would want to use it for
> anything but new devices.

There are enough concrete reasons that I think we want it for existing
devices:

1) Negotiated ring size/alignment.  Coreboot wants smaller, others want
   larger.
2) Remove assertion that it has to be an I/O bar.  PowerPC wants this.
3) Notification location flexibility.  MST wanted this for performance.
4) More feature bits.

> Forcing a guest driver change is a really big
> deal and I see no reason to do that unless there's a compelling reason
> to.
>
> So we're stuck with the 1.0 config layout for a very long time.

We definitely must not force a guest change.  The explicit aim of the
standard is that "legacy" and 1.0 be backward compatible.  One
deliverable is a document detailing how this is done (effectively a
summary of changes between what we have and 1.0).

It's a delicate balancing act.  My plan is to accompany any changes in
the standard with a qemu implementation, so we can see how painful those
changes are.  And if there are performance implications, measure them.

Cheers,
Rusty.

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
@ 2013-05-30  3:58         ` Rusty Russell
  0 siblings, 0 replies; 109+ messages in thread
From: Rusty Russell @ 2013-05-30  3:58 UTC (permalink / raw)
  To: Anthony Liguori, virtualization, kvm
  Cc: Peter Maydell, Paolo Bonzini, Michael S. Tsirkin,
	Stefan Hajnoczi, KONRAD Frederic

Anthony Liguori <aliguori@us.ibm.com> writes:
> Rusty Russell <rusty@rustcorp.com.au> writes:
>
>> Anthony Liguori <aliguori@us.ibm.com> writes:
>>> "Michael S. Tsirkin" <mst@redhat.com> writes:
>>>> +    case offsetof(struct virtio_pci_common_cfg, device_feature_select):
>>>> +        return proxy->device_feature_select;
>>>
>>> Oh dear no...  Please use defines like the rest of QEMU.
>>
>> It is pretty ugly.
>
> I think beauty is in the eye of the beholder here...
>
> Pretty much every device we have has a switch statement like this.
> Consistency wins when it comes to qualitative arguments like this.

I was agreeing with you here, actually.

>> Yet the structure definitions are descriptive, capturing layout, size
>> and endianness in natural a format readable by any C programmer.
>
>>From an API design point of view, here are the problems I see:
>
> 1) C makes no guarantees about structure layout beyond the first
>    member.  Yes, if it's naturally aligned or has a packed attribute,
>    GCC does the right thing.  But this isn't kernel land anymore,
>    portability matters and there are more compilers than GCC.

[ I argued in detail here, then deleted.  Damn bikeshedding... ]

I think the best thing is to add the decoded integer versions as macros,
and have a heap of BUILD_BUG_ON() in the kernel source to make sure they
match.

> 3) It suspect it's harder to review because a subtle change could more
>    easily have broad impact.  If someone changed the type of a field
>    from u32 to u16, it changes the offset of every other field.  That's
>    not terribly obvious in the patch itself unless you understand how
>    the structure is used elsewhere.

MST's patch just did this, so point taken.  (MST: I'm going to combine
the cfg_type and bar bytes to fix this, patch coming).

>> No change, but there's an open question on whether we should nail it to
>> little endian (or define the endian by the transport).
>>
>> Of course, I can't rule out that the 1.0 standard *may* decide to frob
>> the ring layout somehow,
>
> Well, given that virtio is widely deployed today, I would think the 1.0
> standard should strictly reflect what's deployed today, no?

That will be up to the committee.  I think we want to fix some obvious
pain points, though qemu will not benefit from them in the next 5 years.

> Any new config layout would be 2.0 material, right?
>
> Re: the new config layout, I don't think we would want to use it for
> anything but new devices.

There are enough concrete reasons that I think we want it for existing
devices:

1) Negotiated ring size/alignment.  Coreboot wants smaller, others want
   larger.
2) Remove assertion that it has to be an I/O bar.  PowerPC wants this.
3) Notification location flexibility.  MST wanted this for performance.
4) More feature bits.

> Forcing a guest driver change is a really big
> deal and I see no reason to do that unless there's a compelling reason
> to.
>
> So we're stuck with the 1.0 config layout for a very long time.

We definitely must not force a guest change.  The explicit aim of the
standard is that "legacy" and 1.0 be backward compatible.  One
deliverable is a document detailing how this is done (effectively a
summary of changes between what we have and 1.0).

It's a delicate balancing act.  My plan is to accompany any changes in
the standard with a qemu implementation, so we can see how painful those
changes are.  And if there are performance implications, measure them.

Cheers,
Rusty.

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-05-30  3:58         ` Rusty Russell
  (?)
@ 2013-05-30  5:55         ` Michael S. Tsirkin
  -1 siblings, 0 replies; 109+ messages in thread
From: Michael S. Tsirkin @ 2013-05-30  5:55 UTC (permalink / raw)
  To: Rusty Russell
  Cc: Peter Maydell, Anthony Liguori, kvm, virtualization,
	Stefan Hajnoczi, Paolo Bonzini, KONRAD Frederic

On Thu, May 30, 2013 at 01:28:26PM +0930, Rusty Russell wrote:
> Anthony Liguori <aliguori@us.ibm.com> writes:
> > Rusty Russell <rusty@rustcorp.com.au> writes:
> >
> >> Anthony Liguori <aliguori@us.ibm.com> writes:
> >>> "Michael S. Tsirkin" <mst@redhat.com> writes:
> >>>> +    case offsetof(struct virtio_pci_common_cfg, device_feature_select):
> >>>> +        return proxy->device_feature_select;
> >>>
> >>> Oh dear no...  Please use defines like the rest of QEMU.
> >>
> >> It is pretty ugly.
> >
> > I think beauty is in the eye of the beholder here...
> >
> > Pretty much every device we have has a switch statement like this.
> > Consistency wins when it comes to qualitative arguments like this.
> 
> I was agreeing with you here, actually.
> 
> >> Yet the structure definitions are descriptive, capturing layout, size
> >> and endianness in natural a format readable by any C programmer.
> >
> >>From an API design point of view, here are the problems I see:
> >
> > 1) C makes no guarantees about structure layout beyond the first
> >    member.  Yes, if it's naturally aligned or has a packed attribute,
> >    GCC does the right thing.  But this isn't kernel land anymore,
> >    portability matters and there are more compilers than GCC.
> 
> [ I argued in detail here, then deleted.  Damn bikeshedding... ]
> 
> I think the best thing is to add the decoded integer versions as macros,
> and have a heap of BUILD_BUG_ON() in the kernel source to make sure they
> match.

On the qemu side, fine with me, all I want is ability to stay
close to kernel headers.
Let's have XXX_SIZE macros as well, so access size can be
easily validated?

On the kernel side, for 'struct virtio_pci_cap',
since we only ever do offsetof on this struct,
it might be easier to just use the numeric constants
directly.


-- 
MST

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-05-29 13:41         ` Paolo Bonzini
                             ` (2 preceding siblings ...)
  2013-05-29 14:18           ` Anthony Liguori
@ 2013-05-30  7:43           ` Michael S. Tsirkin
  3 siblings, 0 replies; 109+ messages in thread
From: Michael S. Tsirkin @ 2013-05-30  7:43 UTC (permalink / raw)
  To: Paolo Bonzini
  Cc: Peter Maydell, Anthony Liguori, kvm, virtualization,
	Stefan Hajnoczi, KONRAD Frederic

On Wed, May 29, 2013 at 03:41:09PM +0200, Paolo Bonzini wrote:
> Il 29/05/2013 15:24, Michael S. Tsirkin ha scritto:
> > You expect a compiler to pad this structure:
> > 
> > struct foo {
> > 	uint8_t a;
> > 	uint8_t b;
> > 	uint16_t c;
> > 	uint32_t d;
> > };
> > 
> > I'm guessing any compiler that decides to waste memory in this way
> > will quickly get dropped by users and then we won't worry
> > about building QEMU with it.
> 
> You know the virtio-pci config structures are padded, but not all of
> them are.  For example, virtio_balloon_stat is not padded and indeed has
> an __attribute__((__packed__)) in the spec.
> 
> For this reason I prefer to have the attribute everywhere.  So people
> don't have to wonder why it's here and not there.
> 
> Paolo

BTW we don't even do this consistently everywhere in QEMU.

It would have been better to have a rule to avoid packed
as much as possible, then we'd have found the misaligned field bug
in balloon before it's too late.

That would be a good rule to adopt I think:
any pack if something is misaligned, and document
the reason.

-- 
MST

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-05-30  3:58         ` Rusty Russell
  (?)
  (?)
@ 2013-05-30  7:55         ` Michael S. Tsirkin
  2013-06-03  0:17           ` Rusty Russell
  -1 siblings, 1 reply; 109+ messages in thread
From: Michael S. Tsirkin @ 2013-05-30  7:55 UTC (permalink / raw)
  To: Rusty Russell
  Cc: Peter Maydell, Anthony Liguori, kvm, virtualization,
	Stefan Hajnoczi, Paolo Bonzini, KONRAD Frederic

On Thu, May 30, 2013 at 01:28:26PM +0930, Rusty Russell wrote:
> >> Yet the structure definitions are descriptive, capturing layout, size
> >> and endianness in natural a format readable by any C programmer.
> >
> >>From an API design point of view, here are the problems I see:
> >
> > 1) C makes no guarantees about structure layout beyond the first
> >    member.  Yes, if it's naturally aligned or has a packed attribute,
> >    GCC does the right thing.  But this isn't kernel land anymore,
> >    portability matters and there are more compilers than GCC.
> 
> [ I argued in detail here, then deleted.  Damn bikeshedding... ]
> 
> I think the best thing is to add the decoded integer versions as macros,
> and have a heap of BUILD_BUG_ON() in the kernel source to make sure they
> match.

Hmm you want to have both in the header?

That would confuse users for sure :)

I guess we could put in a big comment explaning
why do we have both, and which version should
userspace use. I tried to write it:

/*
 * On all known C compilers, you can use the
 * offsetof macro to find the correct offset of each field -
 * if your compiler doesn't implement this correctly, use the
 * integer versions below, instead.
 * In that case please don't use the structures above, to avoid confusion.
 */


-- 
MST

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-05-30  3:58         ` Rusty Russell
@ 2013-05-30 13:53           ` Anthony Liguori
  -1 siblings, 0 replies; 109+ messages in thread
From: Anthony Liguori @ 2013-05-30 13:53 UTC (permalink / raw)
  To: Rusty Russell, Michael S. Tsirkin, virtualization, kvm
  Cc: Peter Maydell, Paolo Bonzini, Michael S. Tsirkin,
	Stefan Hajnoczi, KONRAD Frederic

Rusty Russell <rusty@rustcorp.com.au> writes:

> Anthony Liguori <aliguori@us.ibm.com> writes:
>> Forcing a guest driver change is a really big
>> deal and I see no reason to do that unless there's a compelling reason
>> to.
>>
>> So we're stuck with the 1.0 config layout for a very long time.
>
> We definitely must not force a guest change.  The explicit aim of the
> standard is that "legacy" and 1.0 be backward compatible.  One
> deliverable is a document detailing how this is done (effectively a
> summary of changes between what we have and 1.0).

If 2.0 is fully backwards compatible, great.  It seems like such a
difference that that would be impossible but I need to investigate
further.

Regards,

Anthony Liguori

>
> It's a delicate balancing act.  My plan is to accompany any changes in
> the standard with a qemu implementation, so we can see how painful those
> changes are.  And if there are performance implications, measure them.
>
> Cheers,
> Rusty.
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
@ 2013-05-30 13:53           ` Anthony Liguori
  0 siblings, 0 replies; 109+ messages in thread
From: Anthony Liguori @ 2013-05-30 13:53 UTC (permalink / raw)
  To: Rusty Russell, virtualization, kvm
  Cc: Peter Maydell, Paolo Bonzini, Michael S. Tsirkin,
	Stefan Hajnoczi, KONRAD Frederic

Rusty Russell <rusty@rustcorp.com.au> writes:

> Anthony Liguori <aliguori@us.ibm.com> writes:
>> Forcing a guest driver change is a really big
>> deal and I see no reason to do that unless there's a compelling reason
>> to.
>>
>> So we're stuck with the 1.0 config layout for a very long time.
>
> We definitely must not force a guest change.  The explicit aim of the
> standard is that "legacy" and 1.0 be backward compatible.  One
> deliverable is a document detailing how this is done (effectively a
> summary of changes between what we have and 1.0).

If 2.0 is fully backwards compatible, great.  It seems like such a
difference that that would be impossible but I need to investigate
further.

Regards,

Anthony Liguori

>
> It's a delicate balancing act.  My plan is to accompany any changes in
> the standard with a qemu implementation, so we can see how painful those
> changes are.  And if there are performance implications, measure them.
>
> Cheers,
> Rusty.
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-05-30 13:53           ` Anthony Liguori
  (?)
@ 2013-05-30 14:01           ` Michael S. Tsirkin
  2013-06-03  0:26             ` Rusty Russell
  -1 siblings, 1 reply; 109+ messages in thread
From: Michael S. Tsirkin @ 2013-05-30 14:01 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: Peter Maydell, kvm, virtualization, Stefan Hajnoczi,
	Paolo Bonzini, KONRAD Frederic

On Thu, May 30, 2013 at 08:53:45AM -0500, Anthony Liguori wrote:
> Rusty Russell <rusty@rustcorp.com.au> writes:
> 
> > Anthony Liguori <aliguori@us.ibm.com> writes:
> >> Forcing a guest driver change is a really big
> >> deal and I see no reason to do that unless there's a compelling reason
> >> to.
> >>
> >> So we're stuck with the 1.0 config layout for a very long time.
> >
> > We definitely must not force a guest change.  The explicit aim of the
> > standard is that "legacy" and 1.0 be backward compatible.  One
> > deliverable is a document detailing how this is done (effectively a
> > summary of changes between what we have and 1.0).
> 
> If 2.0 is fully backwards compatible, great.  It seems like such a
> difference that that would be impossible but I need to investigate
> further.
> 
> Regards,
> 
> Anthony Liguori

If you look at my patches you'll see how it works.
Basically old guests use BAR0 new ones don't, so
it's easy: BAR0 access means legacy guest.
Only started testing but things seem to work
fine with old guests so far.

I think we need a spec, not just driver code.

Rusty what's the plan? Want me to write it?

> >
> > It's a delicate balancing act.  My plan is to accompany any changes in
> > the standard with a qemu implementation, so we can see how painful those
> > changes are.  And if there are performance implications, measure them.
> >
> > Cheers,
> > Rusty.
> > --
> > To unsubscribe from this list: send the line "unsubscribe kvm" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-05-30  7:55         ` Michael S. Tsirkin
@ 2013-06-03  0:17           ` Rusty Russell
  0 siblings, 0 replies; 109+ messages in thread
From: Rusty Russell @ 2013-06-03  0:17 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Peter Maydell, Anthony Liguori, kvm, virtualization,
	Stefan Hajnoczi, Paolo Bonzini, KONRAD Frederic

"Michael S. Tsirkin" <mst@redhat.com> writes:
> On Thu, May 30, 2013 at 01:28:26PM +0930, Rusty Russell wrote:
>> >> Yet the structure definitions are descriptive, capturing layout, size
>> >> and endianness in natural a format readable by any C programmer.
>> >
>> >>From an API design point of view, here are the problems I see:
>> >
>> > 1) C makes no guarantees about structure layout beyond the first
>> >    member.  Yes, if it's naturally aligned or has a packed attribute,
>> >    GCC does the right thing.  But this isn't kernel land anymore,
>> >    portability matters and there are more compilers than GCC.
>> 
>> [ I argued in detail here, then deleted.  Damn bikeshedding... ]
>> 
>> I think the best thing is to add the decoded integer versions as macros,
>> and have a heap of BUILD_BUG_ON() in the kernel source to make sure they
>> match.
>
> Hmm you want to have both in the header?
>
> That would confuse users for sure :)
>
> I guess we could put in a big comment explaning
> why do we have both, and which version should
> userspace use. I tried to write it:
>
> /*
>  * On all known C compilers, you can use the
>  * offsetof macro to find the correct offset of each field -
>  * if your compiler doesn't implement this correctly, use the
>  * integer versions below, instead.
>  * In that case please don't use the structures above, to avoid confusion.
>  */

My version was a little less verbose :)

Subject: virtio_pci: macros for PCI layout offsets.

QEMU wants it, so why not?  Trust, but verify.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>

diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c
index 24bcd9b..6510009 100644
--- a/drivers/virtio/virtio_pci_modern.c
+++ b/drivers/virtio/virtio_pci_modern.c
@@ -453,6 +453,64 @@ static void __iomem *map_capability(struct pci_dev *dev, int off, size_t minlen,
 	return p;
 }
 
+/* This is part of the ABI.  Don't screw with it. */
+static inline void check_offsets(void)
+{
+	/* Note: disk space was harmed in compilation of this function. */
+	BUILD_BUG_ON(VIRTIO_PCI_CAP_VNDR !=
+		     offsetof(struct virtio_pci_cap, cap_vndr));
+	BUILD_BUG_ON(VIRTIO_PCI_CAP_NEXT !=
+		     offsetof(struct virtio_pci_cap, cap_next));
+	BUILD_BUG_ON(VIRTIO_PCI_CAP_LEN !=
+		     offsetof(struct virtio_pci_cap, cap_len));
+	BUILD_BUG_ON(VIRTIO_PCI_CAP_TYPE_AND_BAR !=
+		     offsetof(struct virtio_pci_cap, type_and_bar));
+	BUILD_BUG_ON(VIRTIO_PCI_CAP_OFFSET !=
+		     offsetof(struct virtio_pci_cap, offset));
+	BUILD_BUG_ON(VIRTIO_PCI_CAP_LENGTH !=
+		     offsetof(struct virtio_pci_cap, length));
+	BUILD_BUG_ON(VIRTIO_PCI_NOTIFY_CAP_MULT !=
+		     offsetof(struct virtio_pci_notify_cap,
+			      notify_off_multiplier));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_DFSELECT != 
+		     offsetof(struct virtio_pci_common_cfg,
+			      device_feature_select));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_DF != 
+		     offsetof(struct virtio_pci_common_cfg, device_feature));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_GFSELECT != 
+		     offsetof(struct virtio_pci_common_cfg,
+			      guest_feature_select));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_GF != 
+		     offsetof(struct virtio_pci_common_cfg, guest_feature));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_MSIX != 
+		     offsetof(struct virtio_pci_common_cfg, msix_config));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_NUMQ != 
+		     offsetof(struct virtio_pci_common_cfg, num_queues));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_STATUS != 
+		     offsetof(struct virtio_pci_common_cfg, device_status));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_SELECT != 
+		     offsetof(struct virtio_pci_common_cfg, queue_select));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_SIZE != 
+		     offsetof(struct virtio_pci_common_cfg, queue_size));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_MSIX != 
+		     offsetof(struct virtio_pci_common_cfg, queue_msix_vector));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_ENABLE != 
+		     offsetof(struct virtio_pci_common_cfg, queue_enable));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_NOFF != 
+		     offsetof(struct virtio_pci_common_cfg, queue_notify_off));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_DESCLO != 
+		     offsetof(struct virtio_pci_common_cfg, queue_desc_lo));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_DESCHI != 
+		     offsetof(struct virtio_pci_common_cfg, queue_desc_hi));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_AVAILLO != 
+		     offsetof(struct virtio_pci_common_cfg, queue_avail_lo));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_AVAILHI != 
+		     offsetof(struct virtio_pci_common_cfg, queue_avail_hi));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_USEDLO != 
+		     offsetof(struct virtio_pci_common_cfg, queue_used_lo));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_USEDHI != 
+		     offsetof(struct virtio_pci_common_cfg, queue_used_hi));
+}
 
 /* the PCI probing function */
 static int virtio_pci_probe(struct pci_dev *pci_dev,
@@ -461,6 +519,8 @@ static int virtio_pci_probe(struct pci_dev *pci_dev,
 	struct virtio_pci_device *vp_dev;
 	int err, common, isr, notify, device;
 
+	check_offsets();
+
 	/* We only own devices >= 0x1000 and <= 0x103f: leave the rest. */
 	if (pci_dev->device < 0x1000 || pci_dev->device > 0x103f)
 		return -ENODEV;
diff --git a/include/uapi/linux/virtio_pci.h b/include/uapi/linux/virtio_pci.h
index e1a25ad..9fb7ea4 100644
--- a/include/uapi/linux/virtio_pci.h
+++ b/include/uapi/linux/virtio_pci.h
@@ -171,4 +171,34 @@ struct virtio_pci_common_cfg {
 	__le32 queue_used_lo;		/* read-write */
 	__le32 queue_used_hi;		/* read-write */
 };
+
+/* Macro versions of offsets for the Old Timers! */
+#define VIRTIO_PCI_CAP_VNDR		0
+#define VIRTIO_PCI_CAP_NEXT		1
+#define VIRTIO_PCI_CAP_LEN		2
+#define VIRTIO_PCI_CAP_TYPE_AND_BAR	3
+#define VIRTIO_PCI_CAP_OFFSET		4
+#define VIRTIO_PCI_CAP_LENGTH		8
+
+#define VIRTIO_PCI_NOTIFY_CAP_MULT	12
+
+#define VIRTIO_PCI_COMMON_DFSELECT	0
+#define VIRTIO_PCI_COMMON_DF		4
+#define VIRTIO_PCI_COMMON_GFSELECT	8
+#define VIRTIO_PCI_COMMON_GF		12
+#define VIRTIO_PCI_COMMON_MSIX		16
+#define VIRTIO_PCI_COMMON_NUMQ		18
+#define VIRTIO_PCI_COMMON_STATUS	20
+#define VIRTIO_PCI_COMMON_Q_SELECT	22
+#define VIRTIO_PCI_COMMON_Q_SIZE	24
+#define VIRTIO_PCI_COMMON_Q_MSIX	26
+#define VIRTIO_PCI_COMMON_Q_ENABLE	28
+#define VIRTIO_PCI_COMMON_Q_NOFF	30
+#define VIRTIO_PCI_COMMON_Q_DESCLO	32
+#define VIRTIO_PCI_COMMON_Q_DESCHI	36
+#define VIRTIO_PCI_COMMON_Q_AVAILLO	40
+#define VIRTIO_PCI_COMMON_Q_AVAILHI	44
+#define VIRTIO_PCI_COMMON_Q_USEDLO	48
+#define VIRTIO_PCI_COMMON_Q_USEDHI	52
+
 #endif /* _UAPI_LINUX_VIRTIO_PCI_H */

^ permalink raw reply related	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-05-30 14:01           ` Michael S. Tsirkin
@ 2013-06-03  0:26             ` Rusty Russell
  2013-06-03 10:11               ` Michael S. Tsirkin
  0 siblings, 1 reply; 109+ messages in thread
From: Rusty Russell @ 2013-06-03  0:26 UTC (permalink / raw)
  To: Michael S. Tsirkin, Anthony Liguori
  Cc: Peter Maydell, kvm, virtualization, Stefan Hajnoczi,
	Paolo Bonzini, KONRAD Frederic

"Michael S. Tsirkin" <mst@redhat.com> writes:
> On Thu, May 30, 2013 at 08:53:45AM -0500, Anthony Liguori wrote:
>> Rusty Russell <rusty@rustcorp.com.au> writes:
>> 
>> > Anthony Liguori <aliguori@us.ibm.com> writes:
>> >> Forcing a guest driver change is a really big
>> >> deal and I see no reason to do that unless there's a compelling reason
>> >> to.
>> >>
>> >> So we're stuck with the 1.0 config layout for a very long time.
>> >
>> > We definitely must not force a guest change.  The explicit aim of the
>> > standard is that "legacy" and 1.0 be backward compatible.  One
>> > deliverable is a document detailing how this is done (effectively a
>> > summary of changes between what we have and 1.0).
>> 
>> If 2.0 is fully backwards compatible, great.  It seems like such a
>> difference that that would be impossible but I need to investigate
>> further.
>> 
>> Regards,
>> 
>> Anthony Liguori
>
> If you look at my patches you'll see how it works.
> Basically old guests use BAR0 new ones don't, so
> it's easy: BAR0 access means legacy guest.
> Only started testing but things seem to work
> fine with old guests so far.
>
> I think we need a spec, not just driver code.
>
> Rusty what's the plan? Want me to write it?

We need both, of course, but the spec work will happen in the OASIS WG.
A draft is good, but let's not commit anything to upstream QEMU until we
get the spec finalized.  And that is proposed to be late this year.

Since I'm going to have to reformat the spec and adapt it into OASIS
style anyway, perhaps you should prepare a description as a standalone
text document.  Easier to email and work with...

Now, the idea is that if you want to support 0.9 and 1.0 (or whatever we
call them; I used the term "legacy" for existing implementations in the
OASIS WG proposal), you add capabilities and don't point them into (the
start of?) BAR0.  Old drivers use BAR0 as now.

One trick to note: while drivers shouldn't use both old and new style on
the same device, you need to allow it for kexec, particularly reset via
BAR0.

Cheers,
Rusty.

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-06-03  0:26             ` Rusty Russell
@ 2013-06-03 10:11               ` Michael S. Tsirkin
  2013-06-04  5:31                 ` Rusty Russell
  0 siblings, 1 reply; 109+ messages in thread
From: Michael S. Tsirkin @ 2013-06-03 10:11 UTC (permalink / raw)
  To: Rusty Russell
  Cc: Peter Maydell, Anthony Liguori, kvm, virtualization,
	Stefan Hajnoczi, Paolo Bonzini, KONRAD Frederic

On Mon, Jun 03, 2013 at 09:56:15AM +0930, Rusty Russell wrote:
> "Michael S. Tsirkin" <mst@redhat.com> writes:
> > On Thu, May 30, 2013 at 08:53:45AM -0500, Anthony Liguori wrote:
> >> Rusty Russell <rusty@rustcorp.com.au> writes:
> >> 
> >> > Anthony Liguori <aliguori@us.ibm.com> writes:
> >> >> Forcing a guest driver change is a really big
> >> >> deal and I see no reason to do that unless there's a compelling reason
> >> >> to.
> >> >>
> >> >> So we're stuck with the 1.0 config layout for a very long time.
> >> >
> >> > We definitely must not force a guest change.  The explicit aim of the
> >> > standard is that "legacy" and 1.0 be backward compatible.  One
> >> > deliverable is a document detailing how this is done (effectively a
> >> > summary of changes between what we have and 1.0).
> >> 
> >> If 2.0 is fully backwards compatible, great.  It seems like such a
> >> difference that that would be impossible but I need to investigate
> >> further.
> >> 
> >> Regards,
> >> 
> >> Anthony Liguori
> >
> > If you look at my patches you'll see how it works.
> > Basically old guests use BAR0 new ones don't, so
> > it's easy: BAR0 access means legacy guest.
> > Only started testing but things seem to work
> > fine with old guests so far.
> >
> > I think we need a spec, not just driver code.
> >
> > Rusty what's the plan? Want me to write it?
> 
> We need both, of course, but the spec work will happen in the OASIS WG.
> A draft is good, but let's not commit anything to upstream QEMU until we
> get the spec finalized.  And that is proposed to be late this year.

Well that would be quite sad really.

This means we can't make virtio a spec compliant pci express device,
and we can't add any more feature bits, so no
flexible buffer optimizations for virtio net.

There are probably more projects that will be blocked.

So how about we keep extending legacy layout for a bit longer:
- add a way to access device with MMIO
- use feature bit 31 to signal 64 bit features
  (and shift device config accordingly)

No endian-ness rework, no per queue enable etc.


Then when we start discussions we will have a working
express and working 64 bit feature support,
and it will be that much easier to make it pretty it.


> Since I'm going to have to reformat the spec and adapt it into OASIS
> style anyway, perhaps you should prepare a description as a standalone
> text document.  Easier to email and work with...
> 
> Now, the idea is that if you want to support 0.9 and 1.0 (or whatever we
> call them; I used the term "legacy" for existing implementations in the
> OASIS WG proposal), you add capabilities and don't point them into (the
> start of?) BAR0.  Old drivers use BAR0 as now.
> 
> One trick to note: while drivers shouldn't use both old and new style on
> the same device, you need to allow it for kexec, particularly reset via
> BAR0.
> 
> Cheers,
> Rusty.

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-06-03 10:11               ` Michael S. Tsirkin
@ 2013-06-04  5:31                 ` Rusty Russell
  2013-06-04  6:42                   ` Michael S. Tsirkin
  2013-06-06  8:02                   ` Michael S. Tsirkin
  0 siblings, 2 replies; 109+ messages in thread
From: Rusty Russell @ 2013-06-04  5:31 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Peter Maydell, Anthony Liguori, kvm, virtualization,
	Stefan Hajnoczi, Paolo Bonzini, KONRAD Frederic

"Michael S. Tsirkin" <mst@redhat.com> writes:
> On Mon, Jun 03, 2013 at 09:56:15AM +0930, Rusty Russell wrote:
>> "Michael S. Tsirkin" <mst@redhat.com> writes:
>> > On Thu, May 30, 2013 at 08:53:45AM -0500, Anthony Liguori wrote:
>> >> Rusty Russell <rusty@rustcorp.com.au> writes:
>> >> 
>> >> > Anthony Liguori <aliguori@us.ibm.com> writes:
>> >> >> Forcing a guest driver change is a really big
>> >> >> deal and I see no reason to do that unless there's a compelling reason
>> >> >> to.
>> >> >>
>> >> >> So we're stuck with the 1.0 config layout for a very long time.
>> >> >
>> >> > We definitely must not force a guest change.  The explicit aim of the
>> >> > standard is that "legacy" and 1.0 be backward compatible.  One
>> >> > deliverable is a document detailing how this is done (effectively a
>> >> > summary of changes between what we have and 1.0).
>> >> 
>> >> If 2.0 is fully backwards compatible, great.  It seems like such a
>> >> difference that that would be impossible but I need to investigate
>> >> further.
>> >> 
>> >> Regards,
>> >> 
>> >> Anthony Liguori
>> >
>> > If you look at my patches you'll see how it works.
>> > Basically old guests use BAR0 new ones don't, so
>> > it's easy: BAR0 access means legacy guest.
>> > Only started testing but things seem to work
>> > fine with old guests so far.
>> >
>> > I think we need a spec, not just driver code.
>> >
>> > Rusty what's the plan? Want me to write it?
>> 
>> We need both, of course, but the spec work will happen in the OASIS WG.
>> A draft is good, but let's not commit anything to upstream QEMU until we
>> get the spec finalized.  And that is proposed to be late this year.
>
> Well that would be quite sad really.
> 
> This means we can't make virtio a spec compliant pci express device,
> and we can't add any more feature bits, so no
> flexible buffer optimizations for virtio net.
>
> There are probably more projects that will be blocked.
>
> So how about we keep extending legacy layout for a bit longer:
> - add a way to access device with MMIO
> - use feature bit 31 to signal 64 bit features
>   (and shift device config accordingly)

By my count, net still has 7 feature bits left, so I don't think the
feature bits are likely to be a limitation in the next 6 months?

MMIO is a bigger problem.  Linux guests are happy with it: does it break
the Windows drivers?

Thanks,
Rusty.

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-06-04  5:31                 ` Rusty Russell
@ 2013-06-04  6:42                   ` Michael S. Tsirkin
  2013-06-05  7:19                     ` Rusty Russell
  2013-06-05 12:59                     ` Anthony Liguori
  2013-06-06  8:02                   ` Michael S. Tsirkin
  1 sibling, 2 replies; 109+ messages in thread
From: Michael S. Tsirkin @ 2013-06-04  6:42 UTC (permalink / raw)
  To: Rusty Russell
  Cc: Peter Maydell, Anthony Liguori, kvm, virtualization,
	Stefan Hajnoczi, Paolo Bonzini, KONRAD Frederic

On Tue, Jun 04, 2013 at 03:01:50PM +0930, Rusty Russell wrote:
> "Michael S. Tsirkin" <mst@redhat.com> writes:
> > On Mon, Jun 03, 2013 at 09:56:15AM +0930, Rusty Russell wrote:
> >> "Michael S. Tsirkin" <mst@redhat.com> writes:
> >> > On Thu, May 30, 2013 at 08:53:45AM -0500, Anthony Liguori wrote:
> >> >> Rusty Russell <rusty@rustcorp.com.au> writes:
> >> >> 
> >> >> > Anthony Liguori <aliguori@us.ibm.com> writes:
> >> >> >> Forcing a guest driver change is a really big
> >> >> >> deal and I see no reason to do that unless there's a compelling reason
> >> >> >> to.
> >> >> >>
> >> >> >> So we're stuck with the 1.0 config layout for a very long time.
> >> >> >
> >> >> > We definitely must not force a guest change.  The explicit aim of the
> >> >> > standard is that "legacy" and 1.0 be backward compatible.  One
> >> >> > deliverable is a document detailing how this is done (effectively a
> >> >> > summary of changes between what we have and 1.0).
> >> >> 
> >> >> If 2.0 is fully backwards compatible, great.  It seems like such a
> >> >> difference that that would be impossible but I need to investigate
> >> >> further.
> >> >> 
> >> >> Regards,
> >> >> 
> >> >> Anthony Liguori
> >> >
> >> > If you look at my patches you'll see how it works.
> >> > Basically old guests use BAR0 new ones don't, so
> >> > it's easy: BAR0 access means legacy guest.
> >> > Only started testing but things seem to work
> >> > fine with old guests so far.
> >> >
> >> > I think we need a spec, not just driver code.
> >> >
> >> > Rusty what's the plan? Want me to write it?
> >> 
> >> We need both, of course, but the spec work will happen in the OASIS WG.
> >> A draft is good, but let's not commit anything to upstream QEMU until we
> >> get the spec finalized.  And that is proposed to be late this year.
> >
> > Well that would be quite sad really.
> > 
> > This means we can't make virtio a spec compliant pci express device,
> > and we can't add any more feature bits, so no
> > flexible buffer optimizations for virtio net.
> >
> > There are probably more projects that will be blocked.
> >
> > So how about we keep extending legacy layout for a bit longer:
> > - add a way to access device with MMIO
> > - use feature bit 31 to signal 64 bit features
> >   (and shift device config accordingly)
> 
> By my count, net still has 7 feature bits left, so I don't think the
> feature bits are likely to be a limitation in the next 6 months?

Yes but you wanted a generic transport feature bit
for flexible SG layout.
Are you happy with VIRTIO_NET_F_ANY_HEADER_SG then?

> MMIO is a bigger problem.  Linux guests are happy with it: does it break
> the Windows drivers?
> 
> Thanks,
> Rusty.

You mean make BAR0 an MMIO BAR?
Yes, it would break current windows guests.
Further, as long as we use same address to notify all queues,
we would also need to decode the instruction on x86 and that's
measureably slower than PIO.
We could go back to discussing hypercall use for notifications,
but that has its own set of issues...

-- 
MST

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-06-04  6:42                   ` Michael S. Tsirkin
@ 2013-06-05  7:19                     ` Rusty Russell
  2013-06-05 10:22                       ` Michael S. Tsirkin
  2013-06-05 12:59                     ` Anthony Liguori
  1 sibling, 1 reply; 109+ messages in thread
From: Rusty Russell @ 2013-06-05  7:19 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Peter Maydell, Anthony Liguori, kvm, virtualization,
	Stefan Hajnoczi, Paolo Bonzini, KONRAD Frederic

"Michael S. Tsirkin" <mst@redhat.com> writes:
>> By my count, net still has 7 feature bits left, so I don't think the
>> feature bits are likely to be a limitation in the next 6 months?
>
> Yes but you wanted a generic transport feature bit
> for flexible SG layout.
> Are you happy with VIRTIO_NET_F_ANY_HEADER_SG then?

If we need it within 6 months, I'd rather do that: this feature would
then be assumed for 1.0, and reserved.  We may do that for other
features, too (the committee will have to review).

>> MMIO is a bigger problem.  Linux guests are happy with it: does it break
>> the Windows drivers?
>> 
>> Thanks,
>> Rusty.
>
> You mean make BAR0 an MMIO BAR?
> Yes, it would break current windows guests.
> Further, as long as we use same address to notify all queues,
> we would also need to decode the instruction on x86 and that's
> measureably slower than PIO.
> We could go back to discussing hypercall use for notifications,
> but that has its own set of issues...

We might have something ready to deploy in 3 months, but realistically,
I'd rather have it ready and tested outside the main git tree(s) and
push it once the standard is committed.

Cheers,
Rusty.

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-06-05  7:19                     ` Rusty Russell
@ 2013-06-05 10:22                       ` Michael S. Tsirkin
  0 siblings, 0 replies; 109+ messages in thread
From: Michael S. Tsirkin @ 2013-06-05 10:22 UTC (permalink / raw)
  To: Rusty Russell
  Cc: Peter Maydell, Anthony Liguori, kvm, virtualization,
	Stefan Hajnoczi, Paolo Bonzini, KONRAD Frederic

On Wed, Jun 05, 2013 at 04:49:22PM +0930, Rusty Russell wrote:
> "Michael S. Tsirkin" <mst@redhat.com> writes:
> >> By my count, net still has 7 feature bits left, so I don't think the
> >> feature bits are likely to be a limitation in the next 6 months?
> >
> > Yes but you wanted a generic transport feature bit
> > for flexible SG layout.
> > Are you happy with VIRTIO_NET_F_ANY_HEADER_SG then?
> 
> If we need it within 6 months, I'd rather do that: this feature would
> then be assumed for 1.0, and reserved.  We may do that for other
> features, too (the committee will have to review).
> 
> >> MMIO is a bigger problem.  Linux guests are happy with it: does it break
> >> the Windows drivers?
> >> 
> >> Thanks,
> >> Rusty.
> >
> > You mean make BAR0 an MMIO BAR?
> > Yes, it would break current windows guests.
> > Further, as long as we use same address to notify all queues,
> > we would also need to decode the instruction on x86 and that's
> > measureably slower than PIO.
> > We could go back to discussing hypercall use for notifications,
> > but that has its own set of issues...
> 
> We might have something ready to deploy in 3 months,
> but realistically,
> I'd rather have it ready and tested outside the main git tree(s) and
> push it once the standard is committed.
> 
> Cheers,
> Rusty.


We have working code already. We don't need 3 months out of tree,
this gets us no progress. To make it ready to deploy
we need it upstream and people testing it.

I think the issue is the layout change, you don't want
the new config layout before we get standartization rolling, right?
Okay how about this small patch to the linux guest
(completely untested, just to give you the idea):


diff --git a/drivers/virtio/virtio_pci.c b/drivers/virtio/virtio_pci.c
index a7ce730..0e34862 100644
--- a/drivers/virtio/virtio_pci.c
+++ b/drivers/virtio/virtio_pci.c
@@ -675,6 +675,33 @@ static void virtio_pci_release_dev(struct device *_d)
 	 */
 }
 
+/* Map a BAR. But carefully: make sure we don't overlap the MSI-X table */
+static void __iomem * virtio_pci_iomap(struct pci_dev *pci_dev, int bar)
+{
+	int msix_cap = pci_find_capability(pci_dev, PCI_CAP_ID_MSIX);
+	if (msix_cap) {
+		u32 offset;
+		u8 bir;
+		pci_read_config_dword(pci_dev, msix_cap + PCI_MSIX_TABLE,
+				      &offset);
+		bir = (u8)(offset & PCI_MSIX_TABLE_BIR);
+		offset &= PCI_MSIX_TABLE_OFFSET;
+		/* Spec says table offset is in a 4K page all by itself */
+		if (bir == bar && offset < 4096)
+			return NULL;
+
+		pci_read_config_dword(pci_dev, msix_cap + PCI_MSIX_PBA,
+				      &offset);
+		bir = (u8)(offset & PCI_MSIX_PBA_BIR);
+		offset &= PCI_MSIX_PBA_OFFSET;
+		/* Spec says table offset is in a 4K page all by itself. */
+		if (bir == bar && offset < 4096)
+			return NULL;
+	}
+	/* 4K is enough for all devices at the moment. */
+	return pci_iomap(pci_dev, 0, 4096);
+}
+
 /* the PCI probing function */
 static int virtio_pci_probe(struct pci_dev *pci_dev,
 			    const struct pci_device_id *id)
@@ -716,7 +743,10 @@ static int virtio_pci_probe(struct pci_dev *pci_dev,
 	if (err)
 		goto out_enable_device;
 
-	vp_dev->ioaddr = pci_iomap(pci_dev, 0, 0);
+	vp_dev->ioaddr = virtio_pci_iomap(pci_dev, 0);
+	/* Failed to map BAR0? Try with BAR1. */
+	if (vp_dev->ioaddr == NULL) {
+		vp_dev->ioaddr = virtio_pci_iomap(pci_dev, 1);
 	if (vp_dev->ioaddr == NULL) {
 		err = -ENOMEM;
 		goto out_req_regions;


In other words: put a copy of IO config at start of MMIO BAR1,
making sure we don't overlap the MSIX table that's there.

This does not break windows guests, and makes us compliant to the
PCI express spec.

Is this small enough to start going immediately, without waiting
3+ months?

I really think it's a good idea to put something like this
in the field: we might discover more issues around MMIO
and we'll address them in the new config layout.
This way we don't get two changes: new layout and switch to MMIO
all at once.

If you like this, I can make the appropriate spec and qemu changes in a
couple of days.

-- 
MST

^ permalink raw reply related	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-06-04  6:42                   ` Michael S. Tsirkin
  2013-06-05  7:19                     ` Rusty Russell
@ 2013-06-05 12:59                     ` Anthony Liguori
  2013-06-05 14:09                       ` Michael S. Tsirkin
  1 sibling, 1 reply; 109+ messages in thread
From: Anthony Liguori @ 2013-06-05 12:59 UTC (permalink / raw)
  To: Michael S. Tsirkin, Rusty Russell
  Cc: Peter Maydell, kvm, virtualization, Stefan Hajnoczi,
	Paolo Bonzini, KONRAD Frederic

"Michael S. Tsirkin" <mst@redhat.com> writes:

> On Tue, Jun 04, 2013 at 03:01:50PM +0930, Rusty Russell wrote:
> You mean make BAR0 an MMIO BAR?
> Yes, it would break current windows guests.
> Further, as long as we use same address to notify all queues,
> we would also need to decode the instruction on x86 and that's
> measureably slower than PIO.
> We could go back to discussing hypercall use for notifications,
> but that has its own set of issues...

So... does "violating the PCI-e" spec really matter?  Is it preventing
any guest from working properly?

I don't think we should rush an ABI breakage if the only benefit is
claiming spec compliance.

Regards,

Anthony Liguori

>
> -- 
> MST
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-06-05 12:59                     ` Anthony Liguori
@ 2013-06-05 14:09                       ` Michael S. Tsirkin
  2013-06-05 15:08                         ` Anthony Liguori
  0 siblings, 1 reply; 109+ messages in thread
From: Michael S. Tsirkin @ 2013-06-05 14:09 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: Peter Maydell, kvm, virtualization, Stefan Hajnoczi,
	Paolo Bonzini, KONRAD Frederic

On Wed, Jun 05, 2013 at 07:59:33AM -0500, Anthony Liguori wrote:
> "Michael S. Tsirkin" <mst@redhat.com> writes:
> 
> > On Tue, Jun 04, 2013 at 03:01:50PM +0930, Rusty Russell wrote:
> > You mean make BAR0 an MMIO BAR?
> > Yes, it would break current windows guests.
> > Further, as long as we use same address to notify all queues,
> > we would also need to decode the instruction on x86 and that's
> > measureably slower than PIO.
> > We could go back to discussing hypercall use for notifications,
> > but that has its own set of issues...
> 
> So... does "violating the PCI-e" spec really matter?  Is it preventing
> any guest from working properly?

Yes, absolutely, this wording in spec is not there without reason.

Existing guests allocate io space for PCI express ports in
multiples on 4K.

Since each express device is behind such a port, this means
at most 15 such devices can use IO ports in a system.

That's why to make a pci express virtio device,
we must allow MMIO and/or some other communication
mechanism as the spec requires.

That's on x86.

Besides x86, there are achitectures where IO is unavailable or very slow.

> I don't think we should rush an ABI breakage if the only benefit is
> claiming spec compliance.
> 
> Regards,
> 
> Anthony Liguori

Why do you bring this up? No one advocates any ABI breakage,
I only suggest extensions.


> >
> > -- 
> > MST
> > --
> > To unsubscribe from this list: send the line "unsubscribe kvm" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-06-05 14:09                       ` Michael S. Tsirkin
@ 2013-06-05 15:08                         ` Anthony Liguori
  2013-06-05 15:19                           ` Michael S. Tsirkin
  0 siblings, 1 reply; 109+ messages in thread
From: Anthony Liguori @ 2013-06-05 15:08 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Peter Maydell, kvm, virtualization, Stefan Hajnoczi,
	Paolo Bonzini, KONRAD Frederic

"Michael S. Tsirkin" <mst@redhat.com> writes:

> On Wed, Jun 05, 2013 at 07:59:33AM -0500, Anthony Liguori wrote:
>> "Michael S. Tsirkin" <mst@redhat.com> writes:
>> 
>> > On Tue, Jun 04, 2013 at 03:01:50PM +0930, Rusty Russell wrote:
>> > You mean make BAR0 an MMIO BAR?
>> > Yes, it would break current windows guests.
>> > Further, as long as we use same address to notify all queues,
>> > we would also need to decode the instruction on x86 and that's
>> > measureably slower than PIO.
>> > We could go back to discussing hypercall use for notifications,
>> > but that has its own set of issues...
>> 
>> So... does "violating the PCI-e" spec really matter?  Is it preventing
>> any guest from working properly?
>
> Yes, absolutely, this wording in spec is not there without reason.
>
> Existing guests allocate io space for PCI express ports in
> multiples on 4K.
>
> Since each express device is behind such a port, this means
> at most 15 such devices can use IO ports in a system.
>
> That's why to make a pci express virtio device,
> we must allow MMIO and/or some other communication
> mechanism as the spec requires.

This is precisely why this is an ABI breaker.

If you disable IO bars in the BIOS, than the interface that the OS sees
will *not have an IO bar*.

This *breaks existing guests*.

Any time the programming interfaces changes on a PCI device, the
revision ID and/or device ID must change.  The spec is very clear about
this.

We cannot disable the IO BAR without changing revision ID/device ID.

> That's on x86.
>
> Besides x86, there are achitectures where IO is unavailable or very slow.
>
>> I don't think we should rush an ABI breakage if the only benefit is
>> claiming spec compliance.
>> 
>> Regards,
>> 
>> Anthony Liguori
>
> Why do you bring this up? No one advocates any ABI breakage,
> I only suggest extensions.

It's an ABI breakage.  You're claiming that the guests you tested
handle the breakage reasonably but it is unquestionably an ABI breakage.

Regards,

Anthony Liguori

>
>
>> >
>> > -- 
>> > MST
>> > --
>> > To unsubscribe from this list: send the line "unsubscribe kvm" in
>> > the body of a message to majordomo@vger.kernel.org
>> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-06-05 15:08                         ` Anthony Liguori
@ 2013-06-05 15:19                           ` Michael S. Tsirkin
  2013-06-05 15:46                             ` Anthony Liguori
  2013-06-05 15:46                             ` Anthony Liguori
  0 siblings, 2 replies; 109+ messages in thread
From: Michael S. Tsirkin @ 2013-06-05 15:19 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: Peter Maydell, kvm, virtualization, Stefan Hajnoczi,
	Paolo Bonzini, KONRAD Frederic

On Wed, Jun 05, 2013 at 10:08:37AM -0500, Anthony Liguori wrote:
> "Michael S. Tsirkin" <mst@redhat.com> writes:
> 
> > On Wed, Jun 05, 2013 at 07:59:33AM -0500, Anthony Liguori wrote:
> >> "Michael S. Tsirkin" <mst@redhat.com> writes:
> >> 
> >> > On Tue, Jun 04, 2013 at 03:01:50PM +0930, Rusty Russell wrote:
> >> > You mean make BAR0 an MMIO BAR?
> >> > Yes, it would break current windows guests.
> >> > Further, as long as we use same address to notify all queues,
> >> > we would also need to decode the instruction on x86 and that's
> >> > measureably slower than PIO.
> >> > We could go back to discussing hypercall use for notifications,
> >> > but that has its own set of issues...
> >> 
> >> So... does "violating the PCI-e" spec really matter?  Is it preventing
> >> any guest from working properly?
> >
> > Yes, absolutely, this wording in spec is not there without reason.
> >
> > Existing guests allocate io space for PCI express ports in
> > multiples on 4K.
> >
> > Since each express device is behind such a port, this means
> > at most 15 such devices can use IO ports in a system.
> >
> > That's why to make a pci express virtio device,
> > we must allow MMIO and/or some other communication
> > mechanism as the spec requires.
> 
> This is precisely why this is an ABI breaker.
> 
> If you disable IO bars in the BIOS, than the interface that the OS sees
> will *not have an IO bar*.
> 
> This *breaks existing guests*.
> Any time the programming interfaces changes on a PCI device, the
> revision ID and/or device ID must change.  The spec is very clear about
> this.
> 
> We cannot disable the IO BAR without changing revision ID/device ID.
> 

But it's a bios/PC issue. It's not a device issue.

Anyway, let's put express aside.

It's easy to create non-working setups with pci, today:

- create 16 pci bridges
- put one virtio device behind each

boom

Try it.

I want to fix that.


> > That's on x86.
> >
> > Besides x86, there are achitectures where IO is unavailable or very slow.
> >
> >> I don't think we should rush an ABI breakage if the only benefit is
> >> claiming spec compliance.
> >> 
> >> Regards,
> >> 
> >> Anthony Liguori
> >
> > Why do you bring this up? No one advocates any ABI breakage,
> > I only suggest extensions.
> 
> It's an ABI breakage.  You're claiming that the guests you tested
> handle the breakage reasonably but it is unquestionably an ABI breakage.
> 
> Regards,
> 
> Anthony Liguori

Adding BAR is not an ABI breakage, do we agree on that?

Disabling IO would be but I am not proposing disabling IO.

Guests might disable IO.

I propose a way to make virtio still work if they do.

This is *fixing* things. Not breaking.

> >
> >
> >> >
> >> > -- 
> >> > MST
> >> > --
> >> > To unsubscribe from this list: send the line "unsubscribe kvm" in
> >> > the body of a message to majordomo@vger.kernel.org
> >> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > --
> > To unsubscribe from this list: send the line "unsubscribe kvm" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-06-05 15:19                           ` Michael S. Tsirkin
  2013-06-05 15:46                             ` Anthony Liguori
@ 2013-06-05 15:46                             ` Anthony Liguori
  2013-06-05 16:20                               ` Michael S. Tsirkin
  1 sibling, 1 reply; 109+ messages in thread
From: Anthony Liguori @ 2013-06-05 15:46 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Rusty Russell, virtualization, kvm, KONRAD Frederic,
	Paolo Bonzini, Peter Maydell, Stefan Hajnoczi, gleb

"Michael S. Tsirkin" <mst@redhat.com> writes:

> On Wed, Jun 05, 2013 at 10:08:37AM -0500, Anthony Liguori wrote:
>> "Michael S. Tsirkin" <mst@redhat.com> writes:
>> 
>> > On Wed, Jun 05, 2013 at 07:59:33AM -0500, Anthony Liguori wrote:
>> >> "Michael S. Tsirkin" <mst@redhat.com> writes:
>> >> 
>> >> > On Tue, Jun 04, 2013 at 03:01:50PM +0930, Rusty Russell wrote:
>> >> > You mean make BAR0 an MMIO BAR?
>> >> > Yes, it would break current windows guests.
>> >> > Further, as long as we use same address to notify all queues,
>> >> > we would also need to decode the instruction on x86 and that's
>> >> > measureably slower than PIO.
>> >> > We could go back to discussing hypercall use for notifications,
>> >> > but that has its own set of issues...
>> >> 
>> >> So... does "violating the PCI-e" spec really matter?  Is it preventing
>> >> any guest from working properly?
>> >
>> > Yes, absolutely, this wording in spec is not there without reason.
>> >
>> > Existing guests allocate io space for PCI express ports in
>> > multiples on 4K.
>> >
>> > Since each express device is behind such a port, this means
>> > at most 15 such devices can use IO ports in a system.
>> >
>> > That's why to make a pci express virtio device,
>> > we must allow MMIO and/or some other communication
>> > mechanism as the spec requires.
>> 
>> This is precisely why this is an ABI breaker.
>> 
>> If you disable IO bars in the BIOS, than the interface that the OS sees
>> will *not have an IO bar*.
>> 
>> This *breaks existing guests*.
>> Any time the programming interfaces changes on a PCI device, the
>> revision ID and/or device ID must change.  The spec is very clear about
>> this.
>> 
>> We cannot disable the IO BAR without changing revision ID/device ID.
>> 
>
> But it's a bios/PC issue. It's not a device issue.
>
> Anyway, let's put express aside.
>
> It's easy to create non-working setups with pci, today:
>
> - create 16 pci bridges
> - put one virtio device behind each
>
> boom
>
> Try it.
>
> I want to fix that.
>
>
>> > That's on x86.
>> >
>> > Besides x86, there are achitectures where IO is unavailable or very slow.
>> >
>> >> I don't think we should rush an ABI breakage if the only benefit is
>> >> claiming spec compliance.
>> >> 
>> >> Regards,
>> >> 
>> >> Anthony Liguori
>> >
>> > Why do you bring this up? No one advocates any ABI breakage,
>> > I only suggest extensions.
>> 
>> It's an ABI breakage.  You're claiming that the guests you tested
>> handle the breakage reasonably but it is unquestionably an ABI breakage.
>> 
>> Regards,
>> 
>> Anthony Liguori
>
> Adding BAR is not an ABI breakage, do we agree on that?
>
> Disabling IO would be but I am not proposing disabling IO.
>
> Guests might disable IO.

Look, it's very simple.

If the failure in the guest is that BAR0 mapping fails because the
device is enabled but the BAR is disabled, then you've broken the ABI.

And what's worse is that this isn't for an obscure scenario (like having
15 PCI bridges) but for something that would become the standard
scenario (using a PCI-e bus).

We need to either bump the revision ID or the device ID if we do this.

Regards,

Anthony Liguori


^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-06-05 15:19                           ` Michael S. Tsirkin
@ 2013-06-05 15:46                             ` Anthony Liguori
  2013-06-05 15:46                             ` Anthony Liguori
  1 sibling, 0 replies; 109+ messages in thread
From: Anthony Liguori @ 2013-06-05 15:46 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Peter Maydell, kvm, virtualization, Stefan Hajnoczi,
	Paolo Bonzini, KONRAD Frederic

"Michael S. Tsirkin" <mst@redhat.com> writes:

> On Wed, Jun 05, 2013 at 10:08:37AM -0500, Anthony Liguori wrote:
>> "Michael S. Tsirkin" <mst@redhat.com> writes:
>> 
>> > On Wed, Jun 05, 2013 at 07:59:33AM -0500, Anthony Liguori wrote:
>> >> "Michael S. Tsirkin" <mst@redhat.com> writes:
>> >> 
>> >> > On Tue, Jun 04, 2013 at 03:01:50PM +0930, Rusty Russell wrote:
>> >> > You mean make BAR0 an MMIO BAR?
>> >> > Yes, it would break current windows guests.
>> >> > Further, as long as we use same address to notify all queues,
>> >> > we would also need to decode the instruction on x86 and that's
>> >> > measureably slower than PIO.
>> >> > We could go back to discussing hypercall use for notifications,
>> >> > but that has its own set of issues...
>> >> 
>> >> So... does "violating the PCI-e" spec really matter?  Is it preventing
>> >> any guest from working properly?
>> >
>> > Yes, absolutely, this wording in spec is not there without reason.
>> >
>> > Existing guests allocate io space for PCI express ports in
>> > multiples on 4K.
>> >
>> > Since each express device is behind such a port, this means
>> > at most 15 such devices can use IO ports in a system.
>> >
>> > That's why to make a pci express virtio device,
>> > we must allow MMIO and/or some other communication
>> > mechanism as the spec requires.
>> 
>> This is precisely why this is an ABI breaker.
>> 
>> If you disable IO bars in the BIOS, than the interface that the OS sees
>> will *not have an IO bar*.
>> 
>> This *breaks existing guests*.
>> Any time the programming interfaces changes on a PCI device, the
>> revision ID and/or device ID must change.  The spec is very clear about
>> this.
>> 
>> We cannot disable the IO BAR without changing revision ID/device ID.
>> 
>
> But it's a bios/PC issue. It's not a device issue.
>
> Anyway, let's put express aside.
>
> It's easy to create non-working setups with pci, today:
>
> - create 16 pci bridges
> - put one virtio device behind each
>
> boom
>
> Try it.
>
> I want to fix that.
>
>
>> > That's on x86.
>> >
>> > Besides x86, there are achitectures where IO is unavailable or very slow.
>> >
>> >> I don't think we should rush an ABI breakage if the only benefit is
>> >> claiming spec compliance.
>> >> 
>> >> Regards,
>> >> 
>> >> Anthony Liguori
>> >
>> > Why do you bring this up? No one advocates any ABI breakage,
>> > I only suggest extensions.
>> 
>> It's an ABI breakage.  You're claiming that the guests you tested
>> handle the breakage reasonably but it is unquestionably an ABI breakage.
>> 
>> Regards,
>> 
>> Anthony Liguori
>
> Adding BAR is not an ABI breakage, do we agree on that?
>
> Disabling IO would be but I am not proposing disabling IO.
>
> Guests might disable IO.

Look, it's very simple.

If the failure in the guest is that BAR0 mapping fails because the
device is enabled but the BAR is disabled, then you've broken the ABI.

And what's worse is that this isn't for an obscure scenario (like having
15 PCI bridges) but for something that would become the standard
scenario (using a PCI-e bus).

We need to either bump the revision ID or the device ID if we do this.

Regards,

Anthony Liguori

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-06-05 15:46                             ` Anthony Liguori
@ 2013-06-05 16:20                               ` Michael S. Tsirkin
  2013-06-05 18:57                                 ` Anthony Liguori
  2013-06-05 21:10                                 ` H. Peter Anvin
  0 siblings, 2 replies; 109+ messages in thread
From: Michael S. Tsirkin @ 2013-06-05 16:20 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: Peter Maydell, kvm, virtualization, Stefan Hajnoczi,
	Paolo Bonzini, KONRAD Frederic

On Wed, Jun 05, 2013 at 10:46:15AM -0500, Anthony Liguori wrote:
> "Michael S. Tsirkin" <mst@redhat.com> writes:
> 
> > On Wed, Jun 05, 2013 at 10:08:37AM -0500, Anthony Liguori wrote:
> >> "Michael S. Tsirkin" <mst@redhat.com> writes:
> >> 
> >> > On Wed, Jun 05, 2013 at 07:59:33AM -0500, Anthony Liguori wrote:
> >> >> "Michael S. Tsirkin" <mst@redhat.com> writes:
> >> >> 
> >> >> > On Tue, Jun 04, 2013 at 03:01:50PM +0930, Rusty Russell wrote:
> >> >> > You mean make BAR0 an MMIO BAR?
> >> >> > Yes, it would break current windows guests.
> >> >> > Further, as long as we use same address to notify all queues,
> >> >> > we would also need to decode the instruction on x86 and that's
> >> >> > measureably slower than PIO.
> >> >> > We could go back to discussing hypercall use for notifications,
> >> >> > but that has its own set of issues...
> >> >> 
> >> >> So... does "violating the PCI-e" spec really matter?  Is it preventing
> >> >> any guest from working properly?
> >> >
> >> > Yes, absolutely, this wording in spec is not there without reason.
> >> >
> >> > Existing guests allocate io space for PCI express ports in
> >> > multiples on 4K.
> >> >
> >> > Since each express device is behind such a port, this means
> >> > at most 15 such devices can use IO ports in a system.
> >> >
> >> > That's why to make a pci express virtio device,
> >> > we must allow MMIO and/or some other communication
> >> > mechanism as the spec requires.
> >> 
> >> This is precisely why this is an ABI breaker.
> >> 
> >> If you disable IO bars in the BIOS, than the interface that the OS sees
> >> will *not have an IO bar*.
> >> 
> >> This *breaks existing guests*.
> >> Any time the programming interfaces changes on a PCI device, the
> >> revision ID and/or device ID must change.  The spec is very clear about
> >> this.
> >> 
> >> We cannot disable the IO BAR without changing revision ID/device ID.
> >> 
> >
> > But it's a bios/PC issue. It's not a device issue.
> >
> > Anyway, let's put express aside.
> >
> > It's easy to create non-working setups with pci, today:
> >
> > - create 16 pci bridges
> > - put one virtio device behind each
> >
> > boom
> >
> > Try it.
> >
> > I want to fix that.
> >
> >
> >> > That's on x86.
> >> >
> >> > Besides x86, there are achitectures where IO is unavailable or very slow.
> >> >
> >> >> I don't think we should rush an ABI breakage if the only benefit is
> >> >> claiming spec compliance.
> >> >> 
> >> >> Regards,
> >> >> 
> >> >> Anthony Liguori
> >> >
> >> > Why do you bring this up? No one advocates any ABI breakage,
> >> > I only suggest extensions.
> >> 
> >> It's an ABI breakage.  You're claiming that the guests you tested
> >> handle the breakage reasonably but it is unquestionably an ABI breakage.
> >> 
> >> Regards,
> >> 
> >> Anthony Liguori
> >
> > Adding BAR is not an ABI breakage, do we agree on that?
> >
> > Disabling IO would be but I am not proposing disabling IO.
> >
> > Guests might disable IO.
> 
> Look, it's very simple.
> 
> If the failure in the guest is that BAR0 mapping fails because the
> device is enabled but the BAR is disabled,

There's no such thing as device is enabled and neither
BAR is disabled in PCI spec.

Spec says IO and memory can be enabled/disabled, separately.
PCI Express spec says devices should work without IO.

So modern guests will assume it's ok to work without IO,
and it will become more common in the future.

> then you've broken the ABI.


No.  It means that the ABI is broken.

Guests can disable IO *today* and when they do things don't work.

> 
> And what's worse is that this isn't for an obscure scenario (like having
> 15 PCI bridges) but for something that would become the standard
> scenario (using a PCI-e bus).
> 
> We need to either bump the revision ID or the device ID if we do this.
> 
> Regards,
> 
> Anthony Liguori

We only need to do it if we do a change that breaks guests.

Please find a guest that is broken by the patches. You won't find any.


-- 
MST

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-06-05 16:20                               ` Michael S. Tsirkin
@ 2013-06-05 18:57                                 ` Anthony Liguori
  2013-06-05 19:43                                   ` Michael S. Tsirkin
                                                     ` (2 more replies)
  2013-06-05 21:10                                 ` H. Peter Anvin
  1 sibling, 3 replies; 109+ messages in thread
From: Anthony Liguori @ 2013-06-05 18:57 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Peter Maydell, kvm, virtualization, Stefan Hajnoczi,
	Paolo Bonzini, KONRAD Frederic

"Michael S. Tsirkin" <mst@redhat.com> writes:

> On Wed, Jun 05, 2013 at 10:46:15AM -0500, Anthony Liguori wrote:
>> Look, it's very simple.
> We only need to do it if we do a change that breaks guests.
>
> Please find a guest that is broken by the patches. You won't find any.

I think the problem in this whole discussion is that we're talking past
each other.

Here is my understanding:

1) PCI-e says that you must be able to disable IO bars and still have a
functioning device.

2) It says (1) because you must size IO bars to 4096 which means that
practically speaking, once you enable a dozen or so PIO bars, you run
out of PIO space (16 * 4k == 64k and not all that space can be used).

virtio-pci uses a IO bars exclusively today.  Existing guest drivers
assume that there is an IO bar that contains the virtio-pci registers.

So let's consider the following scenarios:

QEMU of today:

1) qemu -drive file=ubuntu-13.04.img,if=virtio

This works today.  Does adding an MMIO bar at BAR1 break this?
Certainly not if the device is behind a PCI bus...

But are we going to put devices behind a PCI-e bus by default?  Are we
going to ask the user to choose whether devices are put behind a legacy
bus or the express bus?

What happens if we put the device behind a PCI-e bus by default?  Well,
it can still work.  That is, until we do something like this:

2) qemu -drive file=ubuntu-13.04.img,if=virtio -device virtio-rng
        -device virtio-balloon..

Such that we have more than a dozen or so devices.  This works
perfectly fine today.  It works fine because we've designed virtio to
make sure it works fine.  Quoting the spec:

"Configuration space is generally used for rarely-changing or
 initialization-time parameters. But it is a limited resource, so it
 might be better to use a virtqueue to update configuration information
 (the network device does this for filtering, otherwise the table in the
 config space could potentially be very large)."

In fact, we can have 100s of PCI devices today without running out of IO
space because we're so careful about this.

So if we switch to using PCI-e by default *and* we keep virtio-pci
without modifying the device IDs, then very frequently we are going to
break existing guests because the drivers they already have no longer
work.

A few virtio-serial channels, a few block devices, a couple of network
adapters, the balloon and RNG driver, and we hit the IO space limit
pretty damn quickly so this is not a contrived scenario at all.  I would
expect that we frequently run into this if we don't address this problem.

So we have a few options:

1) Punt all of this complexity to libvirt et al and watch people make
   the wrong decisions about when to use PCI-e.  This will become yet
   another example of KVM being too hard to configure.

2) Enable PCI-e by default and just force people to upgrade their
   drivers.

3) Don't use PCI-e by default but still add BAR1 to virtio-pci

4) Do virtio-pcie, make it PCI-e friendly (drop the IO BAR completely), give
   it a new device/vendor ID.   Continue to use virtio-pci for existing
   devices potentially adding virtio-{net,blk,...}-pcie variants for
   people that care to use them.

I think 1 == 2 == 3 and I view 2 as an ABI breaker.  libvirt does like
policy so they're going to make a simple decision and always use the
same bus by default.  I suspect if we made PCI the default, they might
just always set the PCI-e flag just because.

There are hundreds of thousands if not millions of guests with existing
virtio-pci drivers.  Forcing them to upgrade better have an extremely
good justification.

I think 4 is the best path forward.  It's better for users (guests
continue to work as they always have).  There's less confusion about
enabling PCI-e support--you must ask for the virtio-pcie variant and you
must have a virtio-pcie driver.  It's easy to explain.

It also maps to what regular hardware does.  I highly doubt that there
are any real PCI cards that made the shift from PCI to PCI-e without
bumping at least a revision ID.

It also means we don't need to play games about sometimes enabling IO
bars and sometimes not.

Regards,

Anthony Liguori

>
>
> -- 
> MST
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-06-05 18:57                                 ` Anthony Liguori
@ 2013-06-05 19:43                                   ` Michael S. Tsirkin
  2013-06-05 19:52                                     ` Michael S. Tsirkin
  2013-06-05 20:42                                     ` Anthony Liguori
  2013-06-05 19:54                                   ` Michael S. Tsirkin
  2013-06-06  3:42                                   ` Rusty Russell
  2 siblings, 2 replies; 109+ messages in thread
From: Michael S. Tsirkin @ 2013-06-05 19:43 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: Peter Maydell, kvm, virtualization, Stefan Hajnoczi,
	Paolo Bonzini, KONRAD Frederic

On Wed, Jun 05, 2013 at 01:57:16PM -0500, Anthony Liguori wrote:
> "Michael S. Tsirkin" <mst@redhat.com> writes:
> 
> > On Wed, Jun 05, 2013 at 10:46:15AM -0500, Anthony Liguori wrote:
> >> Look, it's very simple.
> > We only need to do it if we do a change that breaks guests.
> >
> > Please find a guest that is broken by the patches. You won't find any.
> 
> I think the problem in this whole discussion is that we're talking past
> each other.
> 
> Here is my understanding:
> 
> 1) PCI-e says that you must be able to disable IO bars and still have a
> functioning device.
> 
> 2) It says (1) because you must size IO bars to 4096 which means that
> practically speaking, once you enable a dozen or so PIO bars, you run
> out of PIO space (16 * 4k == 64k and not all that space can be used).


Let me add 3 other issues which I mentioned and you seem to miss:

3) architectures which don't have fast access to IO ports, exist
   virtio does not work there ATM

4) setups with many PCI bridges exist and have the same issue
   as PCI express. virtio does not work there ATM

5) On x86, even with nested page tables, firmware only decodes
   the page address on an invalid PTE, not the data. You need to
   emulate the guest to get at the data. Without
   nested page tables, we have to do page table walk and emulate
   to get both address and data. Since this is how MMIO
   is implemented in kvm on x86, MMIO is much slower than PIO
   (with nested page tables by a factor of >2, did not test without).

> virtio-pci uses a IO bars exclusively today.  Existing guest drivers
> assume that there is an IO bar that contains the virtio-pci registers.
> So let's consider the following scenarios:
> 
> QEMU of today:
> 
> 1) qemu -drive file=ubuntu-13.04.img,if=virtio
> 
> This works today.  Does adding an MMIO bar at BAR1 break this?
> Certainly not if the device is behind a PCI bus...
> 
> But are we going to put devices behind a PCI-e bus by default?  Are we
> going to ask the user to choose whether devices are put behind a legacy
> bus or the express bus?
> 
> What happens if we put the device behind a PCI-e bus by default?  Well,
> it can still work.  That is, until we do something like this:
> 
> 2) qemu -drive file=ubuntu-13.04.img,if=virtio -device virtio-rng
>         -device virtio-balloon..
> 
> Such that we have more than a dozen or so devices.  This works
> perfectly fine today.  It works fine because we've designed virtio to
> make sure it works fine.  Quoting the spec:
> 
> "Configuration space is generally used for rarely-changing or
>  initialization-time parameters. But it is a limited resource, so it
>  might be better to use a virtqueue to update configuration information
>  (the network device does this for filtering, otherwise the table in the
>  config space could potentially be very large)."
> 
> In fact, we can have 100s of PCI devices today without running out of IO
> space because we're so careful about this.
> 
> So if we switch to using PCI-e by default *and* we keep virtio-pci
> without modifying the device IDs, then very frequently we are going to
> break existing guests because the drivers they already have no longer
> work.
> 
> A few virtio-serial channels, a few block devices, a couple of network
> adapters, the balloon and RNG driver, and we hit the IO space limit
> pretty damn quickly so this is not a contrived scenario at all.  I would
> expect that we frequently run into this if we don't address this problem.
> 
> So we have a few options:
> 1) Punt all of this complexity to libvirt et al and watch people make
>    the wrong decisions about when to use PCI-e.  This will become yet
>    another example of KVM being too hard to configure.
> 
> 2) Enable PCI-e by default and just force people to upgrade their
>    drivers.
> 
> 3) Don't use PCI-e by default but still add BAR1 to virtio-pci
> 
> 4) Do virtio-pcie, make it PCI-e friendly (drop the IO BAR completely),

We can't do this - it will hurt performance.

>    give
>    it a new device/vendor ID.   Continue to use virtio-pci for existing
>    devices potentially adding virtio-{net,blk,...}-pcie variants for
>    people that care to use them.
> 
> I think 1 == 2 == 3 and I view 2 as an ABI breaker.

Why do you think 2 == 3? 2 changes default behaviour. 3 does not.

> libvirt does like
> policy so they're going to make a simple decision and always use the
> same bus by default.  I suspect if we made PCI the default, they might
> just always set the PCI-e flag just because.

This sounds very strange. But let's assume you are right for
the sake of the argument ...

> There are hundreds of thousands if not millions of guests with existing
> virtio-pci drivers.  Forcing them to upgrade better have an extremely
> good justification.
> 
> I think 4 is the best path forward.  It's better for users (guests
> continue to work as they always have).  There's less confusion about
> enabling PCI-e support--you must ask for the virtio-pcie variant and you
> must have a virtio-pcie driver.  It's easy to explain.

I don't think how this changes the situation. libvirt still need
to set policy and decide which device to use.

> It also maps to what regular hardware does.  I highly doubt that there
> are any real PCI cards that made the shift from PCI to PCI-e without
> bumping at least a revision ID.

Only because the chance it's 100% compatible on the software level is 0.
It always has some hardware specific quirks.
No such excuse here.

> It also means we don't need to play games about sometimes enabling IO
> bars and sometimes not.

This last paragraph is wrong, it ignores the issues 3) to 5) 
I added above.

If you do take them into account:
	- there are reasons to add MMIO BAR to PCI,
	  even without PCI express
	- we won't be able to drop IO BAR from virtio

> Regards,
> 
> Anthony Liguori
> 
> >
> >
> > -- 
> > MST
> > --
> > To unsubscribe from this list: send the line "unsubscribe kvm" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-06-05 19:43                                   ` Michael S. Tsirkin
@ 2013-06-05 19:52                                     ` Michael S. Tsirkin
  2013-06-05 20:45                                       ` Anthony Liguori
  2013-06-05 20:42                                     ` Anthony Liguori
  1 sibling, 1 reply; 109+ messages in thread
From: Michael S. Tsirkin @ 2013-06-05 19:52 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: Peter Maydell, kvm, virtualization, Stefan Hajnoczi,
	Paolo Bonzini, KONRAD Frederic

On Wed, Jun 05, 2013 at 10:43:17PM +0300, Michael S. Tsirkin wrote:
> On Wed, Jun 05, 2013 at 01:57:16PM -0500, Anthony Liguori wrote:
> > "Michael S. Tsirkin" <mst@redhat.com> writes:
> > 
> > > On Wed, Jun 05, 2013 at 10:46:15AM -0500, Anthony Liguori wrote:
> > >> Look, it's very simple.
> > > We only need to do it if we do a change that breaks guests.
> > >
> > > Please find a guest that is broken by the patches. You won't find any.
> > 
> > I think the problem in this whole discussion is that we're talking past
> > each other.
> > 
> > Here is my understanding:
> > 
> > 1) PCI-e says that you must be able to disable IO bars and still have a
> > functioning device.
> > 
> > 2) It says (1) because you must size IO bars to 4096 which means that
> > practically speaking, once you enable a dozen or so PIO bars, you run
> > out of PIO space (16 * 4k == 64k and not all that space can be used).
> 
> 
> Let me add 3 other issues which I mentioned and you seem to miss:
> 
> 3) architectures which don't have fast access to IO ports, exist
>    virtio does not work there ATM
> 
> 4) setups with many PCI bridges exist and have the same issue
>    as PCI express. virtio does not work there ATM
> 
> 5) On x86, even with nested page tables, firmware only decodes
>    the page address on an invalid PTE, not the data. You need to
>    emulate the guest to get at the data. Without
>    nested page tables, we have to do page table walk and emulate
>    to get both address and data. Since this is how MMIO
>    is implemented in kvm on x86, MMIO is much slower than PIO
>    (with nested page tables by a factor of >2, did not test without).

Oh I forgot:

6) access to MMIO BARs is painful in the BIOS environment
   so BIOS would typically need to enable IO for the boot device.



> > virtio-pci uses a IO bars exclusively today.  Existing guest drivers
> > assume that there is an IO bar that contains the virtio-pci registers.
> > So let's consider the following scenarios:
> > 
> > QEMU of today:
> > 
> > 1) qemu -drive file=ubuntu-13.04.img,if=virtio
> > 
> > This works today.  Does adding an MMIO bar at BAR1 break this?
> > Certainly not if the device is behind a PCI bus...
> > 
> > But are we going to put devices behind a PCI-e bus by default?  Are we
> > going to ask the user to choose whether devices are put behind a legacy
> > bus or the express bus?
> > 
> > What happens if we put the device behind a PCI-e bus by default?  Well,
> > it can still work.  That is, until we do something like this:
> > 
> > 2) qemu -drive file=ubuntu-13.04.img,if=virtio -device virtio-rng
> >         -device virtio-balloon..
> > 
> > Such that we have more than a dozen or so devices.  This works
> > perfectly fine today.  It works fine because we've designed virtio to
> > make sure it works fine.  Quoting the spec:
> > 
> > "Configuration space is generally used for rarely-changing or
> >  initialization-time parameters. But it is a limited resource, so it
> >  might be better to use a virtqueue to update configuration information
> >  (the network device does this for filtering, otherwise the table in the
> >  config space could potentially be very large)."
> > 
> > In fact, we can have 100s of PCI devices today without running out of IO
> > space because we're so careful about this.
> > 
> > So if we switch to using PCI-e by default *and* we keep virtio-pci
> > without modifying the device IDs, then very frequently we are going to
> > break existing guests because the drivers they already have no longer
> > work.
> > 
> > A few virtio-serial channels, a few block devices, a couple of network
> > adapters, the balloon and RNG driver, and we hit the IO space limit
> > pretty damn quickly so this is not a contrived scenario at all.  I would
> > expect that we frequently run into this if we don't address this problem.
> > 
> > So we have a few options:
> > 1) Punt all of this complexity to libvirt et al and watch people make
> >    the wrong decisions about when to use PCI-e.  This will become yet
> >    another example of KVM being too hard to configure.
> > 
> > 2) Enable PCI-e by default and just force people to upgrade their
> >    drivers.
> > 
> > 3) Don't use PCI-e by default but still add BAR1 to virtio-pci
> > 
> > 4) Do virtio-pcie, make it PCI-e friendly (drop the IO BAR completely),
> 
> We can't do this - it will hurt performance.
> 
> >    give
> >    it a new device/vendor ID.   Continue to use virtio-pci for existing
> >    devices potentially adding virtio-{net,blk,...}-pcie variants for
> >    people that care to use them.
> > 
> > I think 1 == 2 == 3 and I view 2 as an ABI breaker.
> 
> Why do you think 2 == 3? 2 changes default behaviour. 3 does not.
> 
> > libvirt does like
> > policy so they're going to make a simple decision and always use the
> > same bus by default.  I suspect if we made PCI the default, they might
> > just always set the PCI-e flag just because.
> 
> This sounds very strange. But let's assume you are right for
> the sake of the argument ...
> 
> > There are hundreds of thousands if not millions of guests with existing
> > virtio-pci drivers.  Forcing them to upgrade better have an extremely
> > good justification.
> > 
> > I think 4 is the best path forward.  It's better for users (guests
> > continue to work as they always have).  There's less confusion about
> > enabling PCI-e support--you must ask for the virtio-pcie variant and you
> > must have a virtio-pcie driver.  It's easy to explain.
> 
> I don't think how this changes the situation. libvirt still need
> to set policy and decide which device to use.
> 
> > It also maps to what regular hardware does.  I highly doubt that there
> > are any real PCI cards that made the shift from PCI to PCI-e without
> > bumping at least a revision ID.
> 
> Only because the chance it's 100% compatible on the software level is 0.
> It always has some hardware specific quirks.
> No such excuse here.
> 
> > It also means we don't need to play games about sometimes enabling IO
> > bars and sometimes not.
> 
> This last paragraph is wrong, it ignores the issues 3) to 5) 
> I added above.
> 
> If you do take them into account:
> 	- there are reasons to add MMIO BAR to PCI,
> 	  even without PCI express
> 	- we won't be able to drop IO BAR from virtio
> 
> > Regards,
> > 
> > Anthony Liguori
> > 
> > >
> > >
> > > -- 
> > > MST
> > > --
> > > To unsubscribe from this list: send the line "unsubscribe kvm" in
> > > the body of a message to majordomo@vger.kernel.org
> > > More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-06-05 18:57                                 ` Anthony Liguori
  2013-06-05 19:43                                   ` Michael S. Tsirkin
@ 2013-06-05 19:54                                   ` Michael S. Tsirkin
  2013-06-06  3:42                                   ` Rusty Russell
  2 siblings, 0 replies; 109+ messages in thread
From: Michael S. Tsirkin @ 2013-06-05 19:54 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: Peter Maydell, kvm, virtualization, Stefan Hajnoczi,
	Paolo Bonzini, KONRAD Frederic

On Wed, Jun 05, 2013 at 01:57:16PM -0500, Anthony Liguori wrote:
> "Michael S. Tsirkin" <mst@redhat.com> writes:
> 
> > On Wed, Jun 05, 2013 at 10:46:15AM -0500, Anthony Liguori wrote:
> >> Look, it's very simple.
> > We only need to do it if we do a change that breaks guests.
> >
> > Please find a guest that is broken by the patches. You won't find any.
> 
> I think the problem in this whole discussion is that we're talking past
> each other.
> 
> Here is my understanding:
> 
> 1) PCI-e says that you must be able to disable IO bars and still have a
> functioning device.
> 
> 2) It says (1) because you must size IO bars to 4096 which means that
> practically speaking, once you enable a dozen or so PIO bars, you run
> out of PIO space (16 * 4k == 64k and not all that space can be used).
> 
> virtio-pci uses a IO bars exclusively today.  Existing guest drivers
> assume that there is an IO bar that contains the virtio-pci registers.
> 
> So let's consider the following scenarios:
> 
> QEMU of today:
> 
> 1) qemu -drive file=ubuntu-13.04.img,if=virtio
> 
> This works today.  Does adding an MMIO bar at BAR1 break this?
> Certainly not if the device is behind a PCI bus...
> 
> But are we going to put devices behind a PCI-e bus by default?  Are we
> going to ask the user to choose whether devices are put behind a legacy
> bus or the express bus?
> 
> What happens if we put the device behind a PCI-e bus by default?  Well,
> it can still work.  That is, until we do something like this:
> 
> 2) qemu -drive file=ubuntu-13.04.img,if=virtio -device virtio-rng
>         -device virtio-balloon..
> 
> Such that we have more than a dozen or so devices.  This works
> perfectly fine today.  It works fine because we've designed virtio to
> make sure it works fine.  Quoting the spec:
> 
> "Configuration space is generally used for rarely-changing or
>  initialization-time parameters. But it is a limited resource, so it
>  might be better to use a virtqueue to update configuration information
>  (the network device does this for filtering, otherwise the table in the
>  config space could potentially be very large)."
> 
> In fact, we can have 100s of PCI devices today without running out of IO
> space because we're so careful about this.
> 
> So if we switch to using PCI-e by default *and* we keep virtio-pci
> without modifying the device IDs, then very frequently we are going to
> break existing guests because the drivers they already have no longer
> work.
> 
> A few virtio-serial channels, a few block devices, a couple of network
> adapters, the balloon and RNG driver, and we hit the IO space limit
> pretty damn quickly so this is not a contrived scenario at all.  I would
> expect that we frequently run into this if we don't address this problem.
> 
> So we have a few options:
> 
> 1) Punt all of this complexity to libvirt et al and watch people make
>    the wrong decisions about when to use PCI-e.  This will become yet
>    another example of KVM being too hard to configure.
> 
> 2) Enable PCI-e by default and just force people to upgrade their
>    drivers.
> 
> 3) Don't use PCI-e by default but still add BAR1 to virtio-pci
> 
> 4) Do virtio-pcie, make it PCI-e friendly (drop the IO BAR completely), give
>    it a new device/vendor ID.   Continue to use virtio-pci for existing
>    devices potentially adding virtio-{net,blk,...}-pcie variants for
>    people that care to use them.

For the record, with respect to PCI-e discussion, I have no
problem with the idea of changing the device ID or
revision id and asking guests to upgrade if they
want to use a pcie device.

That's not exactly 4 however.


I see no reason to couple PCI-e with MMIO discussion,
that's just one of the reasons to support MMIO.


> I think 1 == 2 == 3 and I view 2 as an ABI breaker.  libvirt does like
> policy so they're going to make a simple decision and always use the
> same bus by default.  I suspect if we made PCI the default, they might
> just always set the PCI-e flag just because.
> 
> There are hundreds of thousands if not millions of guests with existing
> virtio-pci drivers.  Forcing them to upgrade better have an extremely
> good justification.
> 
> I think 4 is the best path forward.  It's better for users (guests
> continue to work as they always have).  There's less confusion about
> enabling PCI-e support--you must ask for the virtio-pcie variant and you
> must have a virtio-pcie driver.  It's easy to explain.
> 
> It also maps to what regular hardware does.  I highly doubt that there
> are any real PCI cards that made the shift from PCI to PCI-e without
> bumping at least a revision ID.
> 
> It also means we don't need to play games about sometimes enabling IO
> bars and sometimes not.
> 
> Regards,
> 
> Anthony Liguori
> 
> >
> >
> > -- 
> > MST
> > --
> > To unsubscribe from this list: send the line "unsubscribe kvm" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-06-05 19:43                                   ` Michael S. Tsirkin
  2013-06-05 19:52                                     ` Michael S. Tsirkin
@ 2013-06-05 20:42                                     ` Anthony Liguori
  2013-06-05 21:14                                       ` Michael S. Tsirkin
  1 sibling, 1 reply; 109+ messages in thread
From: Anthony Liguori @ 2013-06-05 20:42 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Peter Maydell, kvm, virtualization, Stefan Hajnoczi,
	Paolo Bonzini, KONRAD Frederic

"Michael S. Tsirkin" <mst@redhat.com> writes:

> On Wed, Jun 05, 2013 at 01:57:16PM -0500, Anthony Liguori wrote:
>> "Michael S. Tsirkin" <mst@redhat.com> writes:
>> 
>> > On Wed, Jun 05, 2013 at 10:46:15AM -0500, Anthony Liguori wrote:
>> >> Look, it's very simple.
>> > We only need to do it if we do a change that breaks guests.
>> >
>> > Please find a guest that is broken by the patches. You won't find any.
>> 
>> I think the problem in this whole discussion is that we're talking past
>> each other.
>> 
>> Here is my understanding:
>> 
>> 1) PCI-e says that you must be able to disable IO bars and still have a
>> functioning device.
>> 
>> 2) It says (1) because you must size IO bars to 4096 which means that
>> practically speaking, once you enable a dozen or so PIO bars, you run
>> out of PIO space (16 * 4k == 64k and not all that space can be used).
>
>
> Let me add 3 other issues which I mentioned and you seem to miss:
>
> 3) architectures which don't have fast access to IO ports, exist
>    virtio does not work there ATM

Which architectures have PCI but no IO ports?

> 4) setups with many PCI bridges exist and have the same issue
>    as PCI express. virtio does not work there ATM

This is not virtio specific.  This is true for all devices that use IO.

> 5) On x86, even with nested page tables, firmware only decodes
>    the page address on an invalid PTE, not the data. You need to
>    emulate the guest to get at the data. Without
>    nested page tables, we have to do page table walk and emulate
>    to get both address and data. Since this is how MMIO
>    is implemented in kvm on x86, MMIO is much slower than PIO
>    (with nested page tables by a factor of >2, did not test without).

Am well aware of this, this is why we use PIO.

I fully agree with you that when we do MMIO, we should switch the
notification mechanism to avoid encoding anything meaningful as data.

>> virtio-pci uses a IO bars exclusively today.  Existing guest drivers
>> assume that there is an IO bar that contains the virtio-pci registers.
>> So let's consider the following scenarios:
>> 
>> QEMU of today:
>> 
>> 1) qemu -drive file=ubuntu-13.04.img,if=virtio
>> 
>> This works today.  Does adding an MMIO bar at BAR1 break this?
>> Certainly not if the device is behind a PCI bus...
>> 
>> But are we going to put devices behind a PCI-e bus by default?  Are we
>> going to ask the user to choose whether devices are put behind a legacy
>> bus or the express bus?
>> 
>> What happens if we put the device behind a PCI-e bus by default?  Well,
>> it can still work.  That is, until we do something like this:
>> 
>> 2) qemu -drive file=ubuntu-13.04.img,if=virtio -device virtio-rng
>>         -device virtio-balloon..
>> 
>> Such that we have more than a dozen or so devices.  This works
>> perfectly fine today.  It works fine because we've designed virtio to
>> make sure it works fine.  Quoting the spec:
>> 
>> "Configuration space is generally used for rarely-changing or
>>  initialization-time parameters. But it is a limited resource, so it
>>  might be better to use a virtqueue to update configuration information
>>  (the network device does this for filtering, otherwise the table in the
>>  config space could potentially be very large)."
>> 
>> In fact, we can have 100s of PCI devices today without running out of IO
>> space because we're so careful about this.
>> 
>> So if we switch to using PCI-e by default *and* we keep virtio-pci
>> without modifying the device IDs, then very frequently we are going to
>> break existing guests because the drivers they already have no longer
>> work.
>> 
>> A few virtio-serial channels, a few block devices, a couple of network
>> adapters, the balloon and RNG driver, and we hit the IO space limit
>> pretty damn quickly so this is not a contrived scenario at all.  I would
>> expect that we frequently run into this if we don't address this problem.
>> 
>> So we have a few options:
>> 1) Punt all of this complexity to libvirt et al and watch people make
>>    the wrong decisions about when to use PCI-e.  This will become yet
>>    another example of KVM being too hard to configure.
>> 
>> 2) Enable PCI-e by default and just force people to upgrade their
>>    drivers.
>> 
>> 3) Don't use PCI-e by default but still add BAR1 to virtio-pci
>> 
>> 4) Do virtio-pcie, make it PCI-e friendly (drop the IO BAR completely),
>
> We can't do this - it will hurt performance.

Can you explain?  I thought the whole trick with separating out the
virtqueue notification register was to regain the performance?

>>    give
>>    it a new device/vendor ID.   Continue to use virtio-pci for existing
>>    devices potentially adding virtio-{net,blk,...}-pcie variants for
>>    people that care to use them.
>> 
>> I think 1 == 2 == 3 and I view 2 as an ABI breaker.
>
> Why do you think 2 == 3? 2 changes default behaviour. 3 does not.

It doesn't change the default behavior but then we're pushing the
decision of when to use pci-e to the user.  They have to understand that
there can be subtle breakages because the virtio-pci driver may not work
if they are using an old guest.

>> libvirt does like
>> policy so they're going to make a simple decision and always use the
>> same bus by default.  I suspect if we made PCI the default, they might
>> just always set the PCI-e flag just because.
>
> This sounds very strange. But let's assume you are right for
> the sake of the argument ...
>
>> There are hundreds of thousands if not millions of guests with existing
>> virtio-pci drivers.  Forcing them to upgrade better have an extremely
>> good justification.
>> 
>> I think 4 is the best path forward.  It's better for users (guests
>> continue to work as they always have).  There's less confusion about
>> enabling PCI-e support--you must ask for the virtio-pcie variant and you
>> must have a virtio-pcie driver.  It's easy to explain.
>
> I don't think how this changes the situation. libvirt still need
> to set policy and decide which device to use.

But virtio-pcie never exhausts the IO configuration space.  That's the
difference.

And virtio-pcie is a separate driver so presumably libvirt will make
that visible in the XML.  In fact, it should.

>> It also maps to what regular hardware does.  I highly doubt that there
>> are any real PCI cards that made the shift from PCI to PCI-e without
>> bumping at least a revision ID.
>
> Only because the chance it's 100% compatible on the software level is 0.
> It always has some hardware specific quirks.
> No such excuse here.
>
>> It also means we don't need to play games about sometimes enabling IO
>> bars and sometimes not.
>
> This last paragraph is wrong, it ignores the issues 3) to 5) 
> I added above.
>
> If you do take them into account:
> 	- there are reasons to add MMIO BAR to PCI,
> 	  even without PCI express

So far, the only reason you've provided is "it doesn't work on some
architectures."  Which architectures?

> 	- we won't be able to drop IO BAR from virtio

An IO BAR is useless if it means we can't have more than 12 devices.

>
>> Regards,
>> 
>> Anthony Liguori
>> 
>> >
>> >
>> > -- 
>> > MST
>> > --
>> > To unsubscribe from this list: send the line "unsubscribe kvm" in
>> > the body of a message to majordomo@vger.kernel.org
>> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-06-05 19:52                                     ` Michael S. Tsirkin
@ 2013-06-05 20:45                                       ` Anthony Liguori
  2013-06-05 21:15                                         ` H. Peter Anvin
  2013-06-05 21:15                                         ` Michael S. Tsirkin
  0 siblings, 2 replies; 109+ messages in thread
From: Anthony Liguori @ 2013-06-05 20:45 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Peter Maydell, kvm, virtualization, Stefan Hajnoczi,
	Paolo Bonzini, KONRAD Frederic

"Michael S. Tsirkin" <mst@redhat.com> writes:

> On Wed, Jun 05, 2013 at 10:43:17PM +0300, Michael S. Tsirkin wrote:
>> On Wed, Jun 05, 2013 at 01:57:16PM -0500, Anthony Liguori wrote:
>> > "Michael S. Tsirkin" <mst@redhat.com> writes:
>> > 
>> > > On Wed, Jun 05, 2013 at 10:46:15AM -0500, Anthony Liguori wrote:
>> > >> Look, it's very simple.
>> > > We only need to do it if we do a change that breaks guests.
>> > >
>> > > Please find a guest that is broken by the patches. You won't find any.
>> > 
>> > I think the problem in this whole discussion is that we're talking past
>> > each other.
>> > 
>> > Here is my understanding:
>> > 
>> > 1) PCI-e says that you must be able to disable IO bars and still have a
>> > functioning device.
>> > 
>> > 2) It says (1) because you must size IO bars to 4096 which means that
>> > practically speaking, once you enable a dozen or so PIO bars, you run
>> > out of PIO space (16 * 4k == 64k and not all that space can be used).
>> 
>> 
>> Let me add 3 other issues which I mentioned and you seem to miss:
>> 
>> 3) architectures which don't have fast access to IO ports, exist
>>    virtio does not work there ATM
>> 
>> 4) setups with many PCI bridges exist and have the same issue
>>    as PCI express. virtio does not work there ATM
>> 
>> 5) On x86, even with nested page tables, firmware only decodes
>>    the page address on an invalid PTE, not the data. You need to
>>    emulate the guest to get at the data. Without
>>    nested page tables, we have to do page table walk and emulate
>>    to get both address and data. Since this is how MMIO
>>    is implemented in kvm on x86, MMIO is much slower than PIO
>>    (with nested page tables by a factor of >2, did not test without).
>
> Oh I forgot:
>
> 6) access to MMIO BARs is painful in the BIOS environment
>    so BIOS would typically need to enable IO for the boot device.

But if you want to boot from the 16th device, the BIOS needs to solve
this problem anyway.

Regards,

Anthony Liguori

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-06-05 16:20                               ` Michael S. Tsirkin
  2013-06-05 18:57                                 ` Anthony Liguori
@ 2013-06-05 21:10                                 ` H. Peter Anvin
  2013-06-05 21:17                                   ` Michael S. Tsirkin
  2013-06-05 21:50                                   ` Anthony Liguori
  1 sibling, 2 replies; 109+ messages in thread
From: H. Peter Anvin @ 2013-06-05 21:10 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Peter Maydell, Anthony Liguori, kvm, virtualization,
	Stefan Hajnoczi, Paolo Bonzini, KONRAD Frederic

On 06/05/2013 09:20 AM, Michael S. Tsirkin wrote:
> 
> Spec says IO and memory can be enabled/disabled, separately.
> PCI Express spec says devices should work without IO.
> 

For "native endpoints".  Currently virtio would be a "legacy endpoint"
which is quite correct -- it is compatible with a legacy interface.

	-hpa

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-06-05 20:42                                     ` Anthony Liguori
@ 2013-06-05 21:14                                       ` Michael S. Tsirkin
  2013-06-05 21:53                                         ` Anthony Liguori
  2013-06-05 21:53                                         ` Anthony Liguori
  0 siblings, 2 replies; 109+ messages in thread
From: Michael S. Tsirkin @ 2013-06-05 21:14 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: Peter Maydell, kvm, virtualization, Stefan Hajnoczi,
	Paolo Bonzini, KONRAD Frederic

On Wed, Jun 05, 2013 at 03:42:57PM -0500, Anthony Liguori wrote:
> "Michael S. Tsirkin" <mst@redhat.com> writes:
> 
> > On Wed, Jun 05, 2013 at 01:57:16PM -0500, Anthony Liguori wrote:
> >> "Michael S. Tsirkin" <mst@redhat.com> writes:
> >> 
> >> > On Wed, Jun 05, 2013 at 10:46:15AM -0500, Anthony Liguori wrote:
> >> >> Look, it's very simple.
> >> > We only need to do it if we do a change that breaks guests.
> >> >
> >> > Please find a guest that is broken by the patches. You won't find any.
> >> 
> >> I think the problem in this whole discussion is that we're talking past
> >> each other.
> >> 
> >> Here is my understanding:
> >> 
> >> 1) PCI-e says that you must be able to disable IO bars and still have a
> >> functioning device.
> >> 
> >> 2) It says (1) because you must size IO bars to 4096 which means that
> >> practically speaking, once you enable a dozen or so PIO bars, you run
> >> out of PIO space (16 * 4k == 64k and not all that space can be used).
> >
> >
> > Let me add 3 other issues which I mentioned and you seem to miss:
> >
> > 3) architectures which don't have fast access to IO ports, exist
> >    virtio does not work there ATM
> 
> Which architectures have PCI but no IO ports?
> 
> > 4) setups with many PCI bridges exist and have the same issue
> >    as PCI express. virtio does not work there ATM
> 
> This is not virtio specific.  This is true for all devices that use IO.

Absolutely. And you will find that modern devices make use of IO
ports optional.

> > 5) On x86, even with nested page tables, firmware only decodes
> >    the page address on an invalid PTE, not the data. You need to
> >    emulate the guest to get at the data. Without
> >    nested page tables, we have to do page table walk and emulate
> >    to get both address and data. Since this is how MMIO
> >    is implemented in kvm on x86, MMIO is much slower than PIO
> >    (with nested page tables by a factor of >2, did not test without).
> 
> Am well aware of this, this is why we use PIO.
> 
> I fully agree with you that when we do MMIO, we should switch the
> notification mechanism to avoid encoding anything meaningful as data.
> 
> >> virtio-pci uses a IO bars exclusively today.  Existing guest drivers
> >> assume that there is an IO bar that contains the virtio-pci registers.
> >> So let's consider the following scenarios:
> >> 
> >> QEMU of today:
> >> 
> >> 1) qemu -drive file=ubuntu-13.04.img,if=virtio
> >> 
> >> This works today.  Does adding an MMIO bar at BAR1 break this?
> >> Certainly not if the device is behind a PCI bus...
> >> 
> >> But are we going to put devices behind a PCI-e bus by default?  Are we
> >> going to ask the user to choose whether devices are put behind a legacy
> >> bus or the express bus?
> >> 
> >> What happens if we put the device behind a PCI-e bus by default?  Well,
> >> it can still work.  That is, until we do something like this:
> >> 
> >> 2) qemu -drive file=ubuntu-13.04.img,if=virtio -device virtio-rng
> >>         -device virtio-balloon..
> >> 
> >> Such that we have more than a dozen or so devices.  This works
> >> perfectly fine today.  It works fine because we've designed virtio to
> >> make sure it works fine.  Quoting the spec:
> >> 
> >> "Configuration space is generally used for rarely-changing or
> >>  initialization-time parameters. But it is a limited resource, so it
> >>  might be better to use a virtqueue to update configuration information
> >>  (the network device does this for filtering, otherwise the table in the
> >>  config space could potentially be very large)."
> >> 
> >> In fact, we can have 100s of PCI devices today without running out of IO
> >> space because we're so careful about this.
> >> 
> >> So if we switch to using PCI-e by default *and* we keep virtio-pci
> >> without modifying the device IDs, then very frequently we are going to
> >> break existing guests because the drivers they already have no longer
> >> work.
> >> 
> >> A few virtio-serial channels, a few block devices, a couple of network
> >> adapters, the balloon and RNG driver, and we hit the IO space limit
> >> pretty damn quickly so this is not a contrived scenario at all.  I would
> >> expect that we frequently run into this if we don't address this problem.
> >> 
> >> So we have a few options:
> >> 1) Punt all of this complexity to libvirt et al and watch people make
> >>    the wrong decisions about when to use PCI-e.  This will become yet
> >>    another example of KVM being too hard to configure.
> >> 
> >> 2) Enable PCI-e by default and just force people to upgrade their
> >>    drivers.
> >> 
> >> 3) Don't use PCI-e by default but still add BAR1 to virtio-pci
> >> 
> >> 4) Do virtio-pcie, make it PCI-e friendly (drop the IO BAR completely),
> >
> > We can't do this - it will hurt performance.
> 
> Can you explain?  I thought the whole trick with separating out the
> virtqueue notification register was to regain the performance?

Yes but this trick only works well with NPT (it's still a bit
slower than PIO but not so drastically).
Without NPT you still need a page walk so it will be slow.

> >>    give
> >>    it a new device/vendor ID.   Continue to use virtio-pci for existing
> >>    devices potentially adding virtio-{net,blk,...}-pcie variants for
> >>    people that care to use them.
> >> 
> >> I think 1 == 2 == 3 and I view 2 as an ABI breaker.
> >
> > Why do you think 2 == 3? 2 changes default behaviour. 3 does not.
> 
> It doesn't change the default behavior but then we're pushing the
> decision of when to use pci-e to the user.  They have to understand that
> there can be subtle breakages because the virtio-pci driver may not work
> if they are using an old guest.

pci-e is a separate issue. E.g. we can make pci-e a new device id.

> >> libvirt does like
> >> policy so they're going to make a simple decision and always use the
> >> same bus by default.  I suspect if we made PCI the default, they might
> >> just always set the PCI-e flag just because.
> >
> > This sounds very strange. But let's assume you are right for
> > the sake of the argument ...
> >
> >> There are hundreds of thousands if not millions of guests with existing
> >> virtio-pci drivers.  Forcing them to upgrade better have an extremely
> >> good justification.
> >> 
> >> I think 4 is the best path forward.  It's better for users (guests
> >> continue to work as they always have).  There's less confusion about
> >> enabling PCI-e support--you must ask for the virtio-pcie variant and you
> >> must have a virtio-pcie driver.  It's easy to explain.
> >
> > I don't think how this changes the situation. libvirt still need
> > to set policy and decide which device to use.
> 
> But virtio-pcie never exhausts the IO configuration space.  That's the
> difference.
> 
> And virtio-pcie is a separate driver so presumably libvirt will make
> that visible in the XML.  In fact, it should.

It should already do that explicit even if device name
is the same. In fact, it does: bus name is pcie.0 versus
pci.0

> >> It also maps to what regular hardware does.  I highly doubt that there
> >> are any real PCI cards that made the shift from PCI to PCI-e without
> >> bumping at least a revision ID.
> >
> > Only because the chance it's 100% compatible on the software level is 0.
> > It always has some hardware specific quirks.
> > No such excuse here.
> >
> >> It also means we don't need to play games about sometimes enabling IO
> >> bars and sometimes not.
> >
> > This last paragraph is wrong, it ignores the issues 3) to 5) 
> > I added above.
> >
> > If you do take them into account:
> > 	- there are reasons to add MMIO BAR to PCI,
> > 	  even without PCI express
> 
> So far, the only reason you've provided is "it doesn't work on some
> architectures."  Which architectures?

PowerPC wants this.

> > 	- we won't be able to drop IO BAR from virtio
> 
> An IO BAR is useless if it means we can't have more than 12 devices.


It's not useless. A smart BIOS can enable devices one by one as
it tries to boot from them.

> >
> >> Regards,
> >> 
> >> Anthony Liguori
> >> 
> >> >
> >> >
> >> > -- 
> >> > MST
> >> > --
> >> > To unsubscribe from this list: send the line "unsubscribe kvm" in
> >> > the body of a message to majordomo@vger.kernel.org
> >> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > --
> > To unsubscribe from this list: send the line "unsubscribe kvm" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-06-05 20:45                                       ` Anthony Liguori
@ 2013-06-05 21:15                                         ` H. Peter Anvin
  2013-06-05 21:15                                         ` Michael S. Tsirkin
  1 sibling, 0 replies; 109+ messages in thread
From: H. Peter Anvin @ 2013-06-05 21:15 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: Peter Maydell, kvm, Michael S. Tsirkin, virtualization,
	Stefan Hajnoczi, Paolo Bonzini, KONRAD Frederic

On 06/05/2013 01:45 PM, Anthony Liguori wrote:
> 
> But if you want to boot from the 16th device, the BIOS needs to solve
> this problem anyway.
> 

No.  Just have the BIOS plumb the path to the device it wants to boot
from.  Problem solved.

	-hpa

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-06-05 20:45                                       ` Anthony Liguori
  2013-06-05 21:15                                         ` H. Peter Anvin
@ 2013-06-05 21:15                                         ` Michael S. Tsirkin
  1 sibling, 0 replies; 109+ messages in thread
From: Michael S. Tsirkin @ 2013-06-05 21:15 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: Peter Maydell, kvm, virtualization, Stefan Hajnoczi,
	Paolo Bonzini, KONRAD Frederic

On Wed, Jun 05, 2013 at 03:45:09PM -0500, Anthony Liguori wrote:
> "Michael S. Tsirkin" <mst@redhat.com> writes:
> 
> > On Wed, Jun 05, 2013 at 10:43:17PM +0300, Michael S. Tsirkin wrote:
> >> On Wed, Jun 05, 2013 at 01:57:16PM -0500, Anthony Liguori wrote:
> >> > "Michael S. Tsirkin" <mst@redhat.com> writes:
> >> > 
> >> > > On Wed, Jun 05, 2013 at 10:46:15AM -0500, Anthony Liguori wrote:
> >> > >> Look, it's very simple.
> >> > > We only need to do it if we do a change that breaks guests.
> >> > >
> >> > > Please find a guest that is broken by the patches. You won't find any.
> >> > 
> >> > I think the problem in this whole discussion is that we're talking past
> >> > each other.
> >> > 
> >> > Here is my understanding:
> >> > 
> >> > 1) PCI-e says that you must be able to disable IO bars and still have a
> >> > functioning device.
> >> > 
> >> > 2) It says (1) because you must size IO bars to 4096 which means that
> >> > practically speaking, once you enable a dozen or so PIO bars, you run
> >> > out of PIO space (16 * 4k == 64k and not all that space can be used).
> >> 
> >> 
> >> Let me add 3 other issues which I mentioned and you seem to miss:
> >> 
> >> 3) architectures which don't have fast access to IO ports, exist
> >>    virtio does not work there ATM
> >> 
> >> 4) setups with many PCI bridges exist and have the same issue
> >>    as PCI express. virtio does not work there ATM
> >> 
> >> 5) On x86, even with nested page tables, firmware only decodes
> >>    the page address on an invalid PTE, not the data. You need to
> >>    emulate the guest to get at the data. Without
> >>    nested page tables, we have to do page table walk and emulate
> >>    to get both address and data. Since this is how MMIO
> >>    is implemented in kvm on x86, MMIO is much slower than PIO
> >>    (with nested page tables by a factor of >2, did not test without).
> >
> > Oh I forgot:
> >
> > 6) access to MMIO BARs is painful in the BIOS environment
> >    so BIOS would typically need to enable IO for the boot device.
> 
> But if you want to boot from the 16th device, the BIOS needs to solve
> this problem anyway.
> 
> Regards,
> 
> Anthony Liguori

But it's solvable: just enable devices one by one to boot from them.

-- 
MST

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-06-05 21:10                                 ` H. Peter Anvin
@ 2013-06-05 21:17                                   ` Michael S. Tsirkin
  2013-06-05 21:50                                   ` Anthony Liguori
  1 sibling, 0 replies; 109+ messages in thread
From: Michael S. Tsirkin @ 2013-06-05 21:17 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Peter Maydell, Anthony Liguori, kvm, virtualization,
	Stefan Hajnoczi, Paolo Bonzini, KONRAD Frederic

On Wed, Jun 05, 2013 at 02:10:03PM -0700, H. Peter Anvin wrote:
> On 06/05/2013 09:20 AM, Michael S. Tsirkin wrote:
> > 
> > Spec says IO and memory can be enabled/disabled, separately.
> > PCI Express spec says devices should work without IO.
> > 
> 
> For "native endpoints".  Currently virtio would be a "legacy endpoint"
> which is quite correct -- it is compatible with a legacy interface.
> 
> 	-hpa

Yes. But it's not the spec wording that we care for most.
It's supporting setups that have trouble using IO.

-- 
MST

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-06-05 21:10                                 ` H. Peter Anvin
  2013-06-05 21:17                                   ` Michael S. Tsirkin
@ 2013-06-05 21:50                                   ` Anthony Liguori
  2013-06-05 21:55                                     ` H. Peter Anvin
  1 sibling, 1 reply; 109+ messages in thread
From: Anthony Liguori @ 2013-06-05 21:50 UTC (permalink / raw)
  To: H. Peter Anvin, Michael S. Tsirkin
  Cc: Peter Maydell, kvm, virtualization, Stefan Hajnoczi,
	Paolo Bonzini, KONRAD Frederic

"H. Peter Anvin" <hpa@zytor.com> writes:

> On 06/05/2013 09:20 AM, Michael S. Tsirkin wrote:
>> 
>> Spec says IO and memory can be enabled/disabled, separately.
>> PCI Express spec says devices should work without IO.
>> 
>
> For "native endpoints".  Currently virtio would be a "legacy endpoint"
> which is quite correct -- it is compatible with a legacy interface.

Do legacy endpoints also use 4k for BARs?

If not, can't we use a new device id for native endpoints and call it a
day?  Legacy endpoints would continue using the existing BAR layout.

Regards,

Anthony Liguori

>
> 	-hpa
>
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-06-05 21:14                                       ` Michael S. Tsirkin
  2013-06-05 21:53                                         ` Anthony Liguori
@ 2013-06-05 21:53                                         ` Anthony Liguori
  2013-06-05 22:19                                           ` Benjamin Herrenschmidt
  1 sibling, 1 reply; 109+ messages in thread
From: Anthony Liguori @ 2013-06-05 21:53 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Rusty Russell, virtualization, kvm, KONRAD Frederic,
	Paolo Bonzini, Peter Maydell, Stefan Hajnoczi, gleb,
	Benjamin Herrenschmidt

"Michael S. Tsirkin" <mst@redhat.com> writes:

> On Wed, Jun 05, 2013 at 03:42:57PM -0500, Anthony Liguori wrote:
>> "Michael S. Tsirkin" <mst@redhat.com> writes:
>> 
>> Can you explain?  I thought the whole trick with separating out the
>> virtqueue notification register was to regain the performance?
>
> Yes but this trick only works well with NPT (it's still a bit
> slower than PIO but not so drastically).
> Without NPT you still need a page walk so it will be slow.

Do you mean NPT/EPT?

If your concern is shadow paging, then I think you're concerned about
hardware that is so slow to start with that it's not worth considering.

>> >> It also maps to what regular hardware does.  I highly doubt that there
>> >> are any real PCI cards that made the shift from PCI to PCI-e without
>> >> bumping at least a revision ID.
>> >
>> > Only because the chance it's 100% compatible on the software level is 0.
>> > It always has some hardware specific quirks.
>> > No such excuse here.
>> >
>> >> It also means we don't need to play games about sometimes enabling IO
>> >> bars and sometimes not.
>> >
>> > This last paragraph is wrong, it ignores the issues 3) to 5) 
>> > I added above.
>> >
>> > If you do take them into account:
>> > 	- there are reasons to add MMIO BAR to PCI,
>> > 	  even without PCI express
>> 
>> So far, the only reason you've provided is "it doesn't work on some
>> architectures."  Which architectures?
>
> PowerPC wants this.

Existing PowerPC remaps PIO to MMAP so it works fine today.

Future platforms may not do this but future platforms can use a
different device.  They certainly won't be able to use the existing
drivers anyway.

Ben, am I wrong here?

>> > 	- we won't be able to drop IO BAR from virtio
>> 
>> An IO BAR is useless if it means we can't have more than 12 devices.
>
>
> It's not useless. A smart BIOS can enable devices one by one as
> it tries to boot from them.

A smart BIOS can also use MMIO to program virtio.

Regards,

Anthony Liguori


^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-06-05 21:14                                       ` Michael S. Tsirkin
@ 2013-06-05 21:53                                         ` Anthony Liguori
  2013-06-05 21:53                                         ` Anthony Liguori
  1 sibling, 0 replies; 109+ messages in thread
From: Anthony Liguori @ 2013-06-05 21:53 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Peter Maydell, kvm, Benjamin Herrenschmidt, virtualization,
	Stefan Hajnoczi, Paolo Bonzini, KONRAD Frederic

"Michael S. Tsirkin" <mst@redhat.com> writes:

> On Wed, Jun 05, 2013 at 03:42:57PM -0500, Anthony Liguori wrote:
>> "Michael S. Tsirkin" <mst@redhat.com> writes:
>> 
>> Can you explain?  I thought the whole trick with separating out the
>> virtqueue notification register was to regain the performance?
>
> Yes but this trick only works well with NPT (it's still a bit
> slower than PIO but not so drastically).
> Without NPT you still need a page walk so it will be slow.

Do you mean NPT/EPT?

If your concern is shadow paging, then I think you're concerned about
hardware that is so slow to start with that it's not worth considering.

>> >> It also maps to what regular hardware does.  I highly doubt that there
>> >> are any real PCI cards that made the shift from PCI to PCI-e without
>> >> bumping at least a revision ID.
>> >
>> > Only because the chance it's 100% compatible on the software level is 0.
>> > It always has some hardware specific quirks.
>> > No such excuse here.
>> >
>> >> It also means we don't need to play games about sometimes enabling IO
>> >> bars and sometimes not.
>> >
>> > This last paragraph is wrong, it ignores the issues 3) to 5) 
>> > I added above.
>> >
>> > If you do take them into account:
>> > 	- there are reasons to add MMIO BAR to PCI,
>> > 	  even without PCI express
>> 
>> So far, the only reason you've provided is "it doesn't work on some
>> architectures."  Which architectures?
>
> PowerPC wants this.

Existing PowerPC remaps PIO to MMAP so it works fine today.

Future platforms may not do this but future platforms can use a
different device.  They certainly won't be able to use the existing
drivers anyway.

Ben, am I wrong here?

>> > 	- we won't be able to drop IO BAR from virtio
>> 
>> An IO BAR is useless if it means we can't have more than 12 devices.
>
>
> It's not useless. A smart BIOS can enable devices one by one as
> it tries to boot from them.

A smart BIOS can also use MMIO to program virtio.

Regards,

Anthony Liguori

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-06-05 21:50                                   ` Anthony Liguori
@ 2013-06-05 21:55                                     ` H. Peter Anvin
  2013-06-05 22:08                                       ` Anthony Liguori
  0 siblings, 1 reply; 109+ messages in thread
From: H. Peter Anvin @ 2013-06-05 21:55 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: Peter Maydell, kvm, Michael S. Tsirkin, virtualization,
	Stefan Hajnoczi, Paolo Bonzini, KONRAD Frederic

On 06/05/2013 02:50 PM, Anthony Liguori wrote:
> "H. Peter Anvin" <hpa@zytor.com> writes:
> 
>> On 06/05/2013 09:20 AM, Michael S. Tsirkin wrote:
>>>
>>> Spec says IO and memory can be enabled/disabled, separately.
>>> PCI Express spec says devices should work without IO.
>>>
>>
>> For "native endpoints".  Currently virtio would be a "legacy endpoint"
>> which is quite correct -- it is compatible with a legacy interface.
> 
> Do legacy endpoints also use 4k for BARs?

There are no 4K BARs.  In fact, I/O BARs are restricted by spec (there
is no technical enforcement, however) to 256 bytes.

The 4K come from the upstream bridge windows, which are only 4K granular
(historic stuff from when bridges were assumed rare.)  However, there
can be multiple devices, functions, and BARs inside that window.

The issue with PCIe is that each PCIe port is a bridge, so in reality
there is only one real device per bus number.

> If not, can't we use a new device id for native endpoints and call it a
> day?  Legacy endpoints would continue using the existing BAR layout.

Definitely an option.  However, we want to be able to boot from native
devices, too, so having an I/O BAR (which would not be used by the OS
driver) should still at the very least be an option.

	-hpa

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-06-05 21:55                                     ` H. Peter Anvin
@ 2013-06-05 22:08                                       ` Anthony Liguori
  2013-06-05 23:07                                         ` H. Peter Anvin
  0 siblings, 1 reply; 109+ messages in thread
From: Anthony Liguori @ 2013-06-05 22:08 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Peter Maydell, kvm, Michael S. Tsirkin, virtualization,
	Stefan Hajnoczi, Paolo Bonzini, KONRAD Frederic

"H. Peter Anvin" <hpa@zytor.com> writes:

> On 06/05/2013 02:50 PM, Anthony Liguori wrote:
>> "H. Peter Anvin" <hpa@zytor.com> writes:
>> 
>>> On 06/05/2013 09:20 AM, Michael S. Tsirkin wrote:
>>>>
>>>> Spec says IO and memory can be enabled/disabled, separately.
>>>> PCI Express spec says devices should work without IO.
>>>>
>>>
>>> For "native endpoints".  Currently virtio would be a "legacy endpoint"
>>> which is quite correct -- it is compatible with a legacy interface.
>> 
>> Do legacy endpoints also use 4k for BARs?
>
> There are no 4K BARs.  In fact, I/O BARs are restricted by spec (there
> is no technical enforcement, however) to 256 bytes.
>
> The 4K come from the upstream bridge windows, which are only 4K granular
> (historic stuff from when bridges were assumed rare.)  However, there
> can be multiple devices, functions, and BARs inside that window.

Got it.

>
> The issue with PCIe is that each PCIe port is a bridge, so in reality
> there is only one real device per bus number.
>
>> If not, can't we use a new device id for native endpoints and call it a
>> day?  Legacy endpoints would continue using the existing BAR layout.
>
> Definitely an option.  However, we want to be able to boot from native
> devices, too, so having an I/O BAR (which would not be used by the OS
> driver) should still at the very least be an option.

What makes it so difficult to work with an MMIO bar for PCI-e?

With legacy PCI, tracking allocation of MMIO vs. PIO is pretty straight
forward.  Is there something special about PCI-e here?

Regards,

Anthony Liguori

>
> 	-hpa
>
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-06-05 21:53                                         ` Anthony Liguori
@ 2013-06-05 22:19                                           ` Benjamin Herrenschmidt
  2013-06-05 22:53                                             ` Anthony Liguori
  0 siblings, 1 reply; 109+ messages in thread
From: Benjamin Herrenschmidt @ 2013-06-05 22:19 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: Peter Maydell, kvm, Michael S. Tsirkin, virtualization,
	Stefan Hajnoczi, Paolo Bonzini, KONRAD Frederic

On Wed, 2013-06-05 at 16:53 -0500, Anthony Liguori wrote:

> Existing PowerPC remaps PIO to MMAP so it works fine today.
> 
> Future platforms may not do this but future platforms can use a
> different device.  They certainly won't be able to use the existing
> drivers anyway.
> 
> Ben, am I wrong here?

Well, we do remap PIO, though it's ugly and annoying so we'd rather
avoid it alltogether :-)

Our latest HW PHBs don't support PIO at all, so while we can still
support it in the virtual ones in the guest for ever, it's becoming
pretty clear that PIO is on the way out ...

> >> > 	- we won't be able to drop IO BAR from virtio
> >> 
> >> An IO BAR is useless if it means we can't have more than 12 devices.
> >
> >
> > It's not useless. A smart BIOS can enable devices one by one as
> > it tries to boot from them.
> 
> A smart BIOS can also use MMIO to program virtio.

Indeed :-)

I see no reason why not providing both access path though. Have the PIO
BAR there for compatibility/legacy/BIOS/x86 purposes and *also* have the
MMIO window which I'd be happy to favor on power.

We could even put somewhere in there a feature bit set by qemu to
indicate whether it thinks PIO or MMIO is faster on a given platform if
you really think that's worth it (I don't).

Cheers,
Ben.

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-06-05 22:19                                           ` Benjamin Herrenschmidt
@ 2013-06-05 22:53                                             ` Anthony Liguori
  2013-06-05 23:27                                               ` Benjamin Herrenschmidt
  0 siblings, 1 reply; 109+ messages in thread
From: Anthony Liguori @ 2013-06-05 22:53 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: Peter Maydell, kvm, Michael S. Tsirkin, virtualization,
	Stefan Hajnoczi, Paolo Bonzini, KONRAD Frederic

Benjamin Herrenschmidt <benh@kernel.crashing.org> writes:

> On Wed, 2013-06-05 at 16:53 -0500, Anthony Liguori wrote:
>
>> A smart BIOS can also use MMIO to program virtio.
>
> Indeed :-)
>
> I see no reason why not providing both access path though. Have the PIO
> BAR there for compatibility/legacy/BIOS/x86 purposes and *also* have the
> MMIO window which I'd be happy to favor on power.
>
> We could even put somewhere in there a feature bit set by qemu to
> indicate whether it thinks PIO or MMIO is faster on a given platform if
> you really think that's worth it (I don't).

That's okay, but what I'm most concerned about is compatibility.

A virtio PCI device that's a "native endpoint" needs to have a different
device ID than one that is a "legacy endpoint".  The current drivers
have no hope of working (well) with virtio PCI devices exposed as native
endpoints.

I don't care if the native PCI endpoint also has a PIO bar.  But it
seems silly (and confusing) to me to make that layout be the "legacy"
layout verses a straight mirror of the new layout if we're already
changing the device ID.

In addition, it doesn't seem at all necessary to have an MMIO bar to the
legacy device.  If the reason you want MMIO is to avoid using PIO, then
you break existing drivers because they assume PIO.  If you are breaking
existing drivers then you should change the device ID.

If strictly speaking it's just that MMIO is a bit faster, I'm not sure
that complexity is worth it without seeing performance numbers first.

Regards,

Anthony Liguori

>
> Cheers,
> Ben.
>
>
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-06-05 22:08                                       ` Anthony Liguori
@ 2013-06-05 23:07                                         ` H. Peter Anvin
  2013-06-06  0:41                                           ` Anthony Liguori
  0 siblings, 1 reply; 109+ messages in thread
From: H. Peter Anvin @ 2013-06-05 23:07 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: Peter Maydell, kvm, Michael S. Tsirkin, virtualization,
	Stefan Hajnoczi, Paolo Bonzini, KONRAD Frederic

On 06/05/2013 03:08 PM, Anthony Liguori wrote:
>>
>> Definitely an option.  However, we want to be able to boot from native
>> devices, too, so having an I/O BAR (which would not be used by the OS
>> driver) should still at the very least be an option.
> 
> What makes it so difficult to work with an MMIO bar for PCI-e?
> 
> With legacy PCI, tracking allocation of MMIO vs. PIO is pretty straight
> forward.  Is there something special about PCI-e here?
> 

It's not tracking allocation.  It is that accessing memory above 1 MiB
is incredibly painful in the BIOS environment, which basically means
MMIO is inaccessible.

	-hpa

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-06-05 22:53                                             ` Anthony Liguori
@ 2013-06-05 23:27                                               ` Benjamin Herrenschmidt
  0 siblings, 0 replies; 109+ messages in thread
From: Benjamin Herrenschmidt @ 2013-06-05 23:27 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: Peter Maydell, kvm, Michael S. Tsirkin, virtualization,
	Stefan Hajnoczi, Paolo Bonzini, KONRAD Frederic

On Wed, 2013-06-05 at 17:53 -0500, Anthony Liguori wrote:
> If strictly speaking it's just that MMIO is a bit faster, I'm not sure
> that complexity is worth it without seeing performance numbers first.

You mean PIO. I agree with all your points here. The only thing I
saw as an option would be to add a PIO BAR that is an exact mirror
of the MMIO BAR if and only if:

 - PIO is indeed significantly faster on platforms we care about

and/or

 - There is significant simplification in things like BIOSes in
   using PIO over MMIO

I personally don't care and would chose to use MMIO always including in
firmware on ppc.

Cheers,
Ben.

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-06-05 23:07                                         ` H. Peter Anvin
@ 2013-06-06  0:41                                           ` Anthony Liguori
  2013-06-06  6:34                                             ` Gleb Natapov
  0 siblings, 1 reply; 109+ messages in thread
From: Anthony Liguori @ 2013-06-06  0:41 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Peter Maydell, kvm, Michael S. Tsirkin, virtualization,
	Stefan Hajnoczi, Paolo Bonzini, KONRAD Frederic

"H. Peter Anvin" <hpa@zytor.com> writes:

> On 06/05/2013 03:08 PM, Anthony Liguori wrote:
>>>
>>> Definitely an option.  However, we want to be able to boot from native
>>> devices, too, so having an I/O BAR (which would not be used by the OS
>>> driver) should still at the very least be an option.
>> 
>> What makes it so difficult to work with an MMIO bar for PCI-e?
>> 
>> With legacy PCI, tracking allocation of MMIO vs. PIO is pretty straight
>> forward.  Is there something special about PCI-e here?
>> 
>
> It's not tracking allocation.  It is that accessing memory above 1 MiB
> is incredibly painful in the BIOS environment, which basically means
> MMIO is inaccessible.

Oh, you mean in real mode.

SeaBIOS runs the virtio code in 32-bit mode with a flat memory layout.
There are loads of ASSERT32FLAT()s in the code to make sure of this.

Regards,

Anthony Liguori

>
> 	-hpa
>
>
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-06-05 18:57                                 ` Anthony Liguori
  2013-06-05 19:43                                   ` Michael S. Tsirkin
  2013-06-05 19:54                                   ` Michael S. Tsirkin
@ 2013-06-06  3:42                                   ` Rusty Russell
  2013-06-06 14:59                                     ` Anthony Liguori
  2 siblings, 1 reply; 109+ messages in thread
From: Rusty Russell @ 2013-06-06  3:42 UTC (permalink / raw)
  To: Anthony Liguori, Michael S. Tsirkin
  Cc: Peter Maydell, kvm, virtualization, Stefan Hajnoczi,
	Paolo Bonzini, KONRAD Frederic

Anthony Liguori <aliguori@us.ibm.com> writes:
> 4) Do virtio-pcie, make it PCI-e friendly (drop the IO BAR completely), give
>    it a new device/vendor ID.   Continue to use virtio-pci for existing
>    devices potentially adding virtio-{net,blk,...}-pcie variants for
>    people that care to use them.

Now you have a different compatibility problem; how do you know the
guest supports the new virtio-pcie net?

If you put a virtio-pci card behind a PCI-e bridge today, it's not
compliant, but AFAICT it will Just Work.  (Modulo the 16-dev limit).

I've been assuming we'd avoid a "flag day" change; that devices would
look like existing virtio-pci with capabilities indicating the new
config layout.

> I think 4 is the best path forward.  It's better for users (guests
> continue to work as they always have).  There's less confusion about
> enabling PCI-e support--you must ask for the virtio-pcie variant and you
> must have a virtio-pcie driver.  It's easy to explain.

Removing both forward and backward compatibility is easy to explain, but
I think it'll be harder to deploy.  This is your area though, so perhaps
I'm wrong.

> It also maps to what regular hardware does.  I highly doubt that there
> are any real PCI cards that made the shift from PCI to PCI-e without
> bumping at least a revision ID.

Noone expected the new cards to Just Work with old OSes: a new machine
meant a new OS and new drivers.  Hardware vendors like that.

Since virtualization often involves legacy, our priorities might be
different.

Cheers,
Rusty.

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-06-06  0:41                                           ` Anthony Liguori
@ 2013-06-06  6:34                                             ` Gleb Natapov
  2013-06-06 13:53                                               ` H. Peter Anvin
                                                                 ` (3 more replies)
  0 siblings, 4 replies; 109+ messages in thread
From: Gleb Natapov @ 2013-06-06  6:34 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: Peter Maydell, kvm, Michael S. Tsirkin, virtualization,
	Stefan Hajnoczi, H. Peter Anvin, Paolo Bonzini, KONRAD Frederic

On Wed, Jun 05, 2013 at 07:41:17PM -0500, Anthony Liguori wrote:
> "H. Peter Anvin" <hpa@zytor.com> writes:
> 
> > On 06/05/2013 03:08 PM, Anthony Liguori wrote:
> >>>
> >>> Definitely an option.  However, we want to be able to boot from native
> >>> devices, too, so having an I/O BAR (which would not be used by the OS
> >>> driver) should still at the very least be an option.
> >> 
> >> What makes it so difficult to work with an MMIO bar for PCI-e?
> >> 
> >> With legacy PCI, tracking allocation of MMIO vs. PIO is pretty straight
> >> forward.  Is there something special about PCI-e here?
> >> 
> >
> > It's not tracking allocation.  It is that accessing memory above 1 MiB
> > is incredibly painful in the BIOS environment, which basically means
> > MMIO is inaccessible.
> 
> Oh, you mean in real mode.
> 
> SeaBIOS runs the virtio code in 32-bit mode with a flat memory layout.
> There are loads of ASSERT32FLAT()s in the code to make sure of this.
> 
Well, not exactly. Initialization is done in 32bit, but disk
reads/writes are done in 16bit mode since it should work from int13
interrupt handler. The only way I know to access MMIO bars from 16 bit
is to use SMM which we do not have in KVM.

--
			Gleb.

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-06-04  5:31                 ` Rusty Russell
  2013-06-04  6:42                   ` Michael S. Tsirkin
@ 2013-06-06  8:02                   ` Michael S. Tsirkin
  1 sibling, 0 replies; 109+ messages in thread
From: Michael S. Tsirkin @ 2013-06-06  8:02 UTC (permalink / raw)
  To: Rusty Russell
  Cc: Peter Maydell, Anthony Liguori, kvm, virtualization,
	Stefan Hajnoczi, Paolo Bonzini, KONRAD Frederic

On Tue, Jun 04, 2013 at 03:01:50PM +0930, Rusty Russell wrote:
> "Michael S. Tsirkin" <mst@redhat.com> writes:
> > On Mon, Jun 03, 2013 at 09:56:15AM +0930, Rusty Russell wrote:
> >> "Michael S. Tsirkin" <mst@redhat.com> writes:
> >> > On Thu, May 30, 2013 at 08:53:45AM -0500, Anthony Liguori wrote:
> >> >> Rusty Russell <rusty@rustcorp.com.au> writes:
> >> >> 
> >> >> > Anthony Liguori <aliguori@us.ibm.com> writes:
> >> >> >> Forcing a guest driver change is a really big
> >> >> >> deal and I see no reason to do that unless there's a compelling reason
> >> >> >> to.
> >> >> >>
> >> >> >> So we're stuck with the 1.0 config layout for a very long time.
> >> >> >
> >> >> > We definitely must not force a guest change.  The explicit aim of the
> >> >> > standard is that "legacy" and 1.0 be backward compatible.  One
> >> >> > deliverable is a document detailing how this is done (effectively a
> >> >> > summary of changes between what we have and 1.0).
> >> >> 
> >> >> If 2.0 is fully backwards compatible, great.  It seems like such a
> >> >> difference that that would be impossible but I need to investigate
> >> >> further.
> >> >> 
> >> >> Regards,
> >> >> 
> >> >> Anthony Liguori
> >> >
> >> > If you look at my patches you'll see how it works.
> >> > Basically old guests use BAR0 new ones don't, so
> >> > it's easy: BAR0 access means legacy guest.
> >> > Only started testing but things seem to work
> >> > fine with old guests so far.
> >> >
> >> > I think we need a spec, not just driver code.
> >> >
> >> > Rusty what's the plan? Want me to write it?
> >> 
> >> We need both, of course, but the spec work will happen in the OASIS WG.
> >> A draft is good, but let's not commit anything to upstream QEMU until we
> >> get the spec finalized.  And that is proposed to be late this year.
> >
> > Well that would be quite sad really.
> > 
> > This means we can't make virtio a spec compliant pci express device,
> > and we can't add any more feature bits, so no
> > flexible buffer optimizations for virtio net.
> >
> > There are probably more projects that will be blocked.
> >
> > So how about we keep extending legacy layout for a bit longer:
> > - add a way to access device with MMIO
> > - use feature bit 31 to signal 64 bit features
> >   (and shift device config accordingly)
> 
> By my count, net still has 7 feature bits left, so I don't think the
> feature bits are likely to be a limitation in the next 6 months?

Actually I count 5 net specific ones:

3, 4 and then 25, 26, 27

Unless you count 31 even though it's a generic transport bit?

That's still 6 ...

> MMIO is a bigger problem.  Linux guests are happy with it: does it break
> the Windows drivers?
> 
> Thanks,
> Rusty.

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-06-06  6:34                                             ` Gleb Natapov
@ 2013-06-06 13:53                                               ` H. Peter Anvin
  2013-06-06 15:02                                               ` Anthony Liguori
                                                                 ` (2 subsequent siblings)
  3 siblings, 0 replies; 109+ messages in thread
From: H. Peter Anvin @ 2013-06-06 13:53 UTC (permalink / raw)
  To: Gleb Natapov
  Cc: Peter Maydell, Anthony Liguori, kvm, Michael S. Tsirkin,
	virtualization, Stefan Hajnoczi, Paolo Bonzini, KONRAD Frederic

On 06/05/2013 11:34 PM, Gleb Natapov wrote:
>>
>> SeaBIOS runs the virtio code in 32-bit mode with a flat memory layout.
>> There are loads of ASSERT32FLAT()s in the code to make sure of this.
>>
> Well, not exactly. Initialization is done in 32bit, but disk
> reads/writes are done in 16bit mode since it should work from int13
> interrupt handler. The only way I know to access MMIO bars from 16 bit
> is to use SMM which we do not have in KVM.
> 

In some ways it is even worse for PXE, since PXE is defined to work from
16-bit protected mode... but the mechanism to request a segment mapping
for the high memory area doesn't actually work.

	-hpa

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-06-06  3:42                                   ` Rusty Russell
@ 2013-06-06 14:59                                     ` Anthony Liguori
  2013-06-07  1:58                                       ` Rusty Russell
  2013-06-07  8:25                                       ` Peter Maydell
  0 siblings, 2 replies; 109+ messages in thread
From: Anthony Liguori @ 2013-06-06 14:59 UTC (permalink / raw)
  To: Rusty Russell, Michael S. Tsirkin
  Cc: Peter Maydell, kvm, virtualization, Stefan Hajnoczi,
	Paolo Bonzini, KONRAD Frederic


Hi Rusty,

Rusty Russell <rusty@rustcorp.com.au> writes:

> Anthony Liguori <aliguori@us.ibm.com> writes:
>> 4) Do virtio-pcie, make it PCI-e friendly (drop the IO BAR completely), give
>>    it a new device/vendor ID.   Continue to use virtio-pci for existing
>>    devices potentially adding virtio-{net,blk,...}-pcie variants for
>>    people that care to use them.
>
> Now you have a different compatibility problem; how do you know the
> guest supports the new virtio-pcie net?

We don't care.

We would still use virtio-pci for existing devices.  Only new devices
would use virtio-pcie.

> If you put a virtio-pci card behind a PCI-e bridge today, it's not
> compliant, but AFAICT it will Just Work.  (Modulo the 16-dev limit).

I believe you can put it in legacy mode and then there isn't the 16-dev
limit.  I believe the only advantage of putting it in native mode is
that then you can do native hotplug (as opposed to ACPI hotplug).

So sticking with virtio-pci seems reasonable to me.

> I've been assuming we'd avoid a "flag day" change; that devices would
> look like existing virtio-pci with capabilities indicating the new
> config layout.

I don't think that's feasible.  Maybe 5 or 10 years from now, we switch
the default adapter to virtio-pcie.

>> I think 4 is the best path forward.  It's better for users (guests
>> continue to work as they always have).  There's less confusion about
>> enabling PCI-e support--you must ask for the virtio-pcie variant and you
>> must have a virtio-pcie driver.  It's easy to explain.
>
> Removing both forward and backward compatibility is easy to explain, but
> I think it'll be harder to deploy.  This is your area though, so perhaps
> I'm wrong.

My concern is that it's not real backwards compatibility.

>> It also maps to what regular hardware does.  I highly doubt that there
>> are any real PCI cards that made the shift from PCI to PCI-e without
>> bumping at least a revision ID.
>
> Noone expected the new cards to Just Work with old OSes: a new machine
> meant a new OS and new drivers.  Hardware vendors like that.

Yup.

> Since virtualization often involves legacy, our priorities might be
> different.

So realistically, I think if we introduce virtio-pcie with a different
vendor ID, it will be adopted fairly quickly.  The drivers will show up
in distros quickly and get backported.

New devices can be limited to supporting virtio-pcie and we'll certainly
provide a way to use old devices with virtio-pcie too.  But for
practical reasons, I think we have to continue using virtio-pci by
default.

Regards,

Anthony Liguori

>
> Cheers,
> Rusty.
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-06-06  6:34                                             ` Gleb Natapov
  2013-06-06 13:53                                               ` H. Peter Anvin
@ 2013-06-06 15:02                                               ` Anthony Liguori
  2013-06-07 11:30                                                 ` Gleb Natapov
  2013-06-11  7:10                                                 ` Michael S. Tsirkin
  2013-06-06 15:02                                               ` Anthony Liguori
  2013-06-06 15:06                                               ` Gerd Hoffmann
  3 siblings, 2 replies; 109+ messages in thread
From: Anthony Liguori @ 2013-06-06 15:02 UTC (permalink / raw)
  To: Gleb Natapov
  Cc: H. Peter Anvin, Peter Maydell, kvm, Michael S. Tsirkin,
	virtualization, Stefan Hajnoczi, Paolo Bonzini, KONRAD Frederic

Gleb Natapov <gleb@redhat.com> writes:

> On Wed, Jun 05, 2013 at 07:41:17PM -0500, Anthony Liguori wrote:
>> "H. Peter Anvin" <hpa@zytor.com> writes:
>> 
>> > On 06/05/2013 03:08 PM, Anthony Liguori wrote:
>> >>>
>> >>> Definitely an option.  However, we want to be able to boot from native
>> >>> devices, too, so having an I/O BAR (which would not be used by the OS
>> >>> driver) should still at the very least be an option.
>> >> 
>> >> What makes it so difficult to work with an MMIO bar for PCI-e?
>> >> 
>> >> With legacy PCI, tracking allocation of MMIO vs. PIO is pretty straight
>> >> forward.  Is there something special about PCI-e here?
>> >> 
>> >
>> > It's not tracking allocation.  It is that accessing memory above 1 MiB
>> > is incredibly painful in the BIOS environment, which basically means
>> > MMIO is inaccessible.
>> 
>> Oh, you mean in real mode.
>> 
>> SeaBIOS runs the virtio code in 32-bit mode with a flat memory layout.
>> There are loads of ASSERT32FLAT()s in the code to make sure of this.
>> 
> Well, not exactly. Initialization is done in 32bit, but disk
> reads/writes are done in 16bit mode since it should work from int13
> interrupt handler. The only way I know to access MMIO bars from 16 bit
> is to use SMM which we do not have in KVM.

Ah, if it's just the dataplane operations then there's another solution.

We can introduce a virtqueue flag that asks the backend to poll for new
requests.  Then SeaBIOS can add the request to the queue and not worry
about kicking or reading the ISR.

SeaBIOS is polling for completion anyway.

Regards,

Anthony Liguori

>
> --
> 			Gleb.
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html


^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-06-06  6:34                                             ` Gleb Natapov
  2013-06-06 13:53                                               ` H. Peter Anvin
  2013-06-06 15:02                                               ` Anthony Liguori
@ 2013-06-06 15:02                                               ` Anthony Liguori
  2013-06-06 15:06                                               ` Gerd Hoffmann
  3 siblings, 0 replies; 109+ messages in thread
From: Anthony Liguori @ 2013-06-06 15:02 UTC (permalink / raw)
  To: Gleb Natapov
  Cc: Peter Maydell, kvm, Michael S. Tsirkin, virtualization,
	Stefan Hajnoczi, H. Peter Anvin, Paolo Bonzini, KONRAD Frederic

Gleb Natapov <gleb@redhat.com> writes:

> On Wed, Jun 05, 2013 at 07:41:17PM -0500, Anthony Liguori wrote:
>> "H. Peter Anvin" <hpa@zytor.com> writes:
>> 
>> > On 06/05/2013 03:08 PM, Anthony Liguori wrote:
>> >>>
>> >>> Definitely an option.  However, we want to be able to boot from native
>> >>> devices, too, so having an I/O BAR (which would not be used by the OS
>> >>> driver) should still at the very least be an option.
>> >> 
>> >> What makes it so difficult to work with an MMIO bar for PCI-e?
>> >> 
>> >> With legacy PCI, tracking allocation of MMIO vs. PIO is pretty straight
>> >> forward.  Is there something special about PCI-e here?
>> >> 
>> >
>> > It's not tracking allocation.  It is that accessing memory above 1 MiB
>> > is incredibly painful in the BIOS environment, which basically means
>> > MMIO is inaccessible.
>> 
>> Oh, you mean in real mode.
>> 
>> SeaBIOS runs the virtio code in 32-bit mode with a flat memory layout.
>> There are loads of ASSERT32FLAT()s in the code to make sure of this.
>> 
> Well, not exactly. Initialization is done in 32bit, but disk
> reads/writes are done in 16bit mode since it should work from int13
> interrupt handler. The only way I know to access MMIO bars from 16 bit
> is to use SMM which we do not have in KVM.

Ah, if it's just the dataplane operations then there's another solution.

We can introduce a virtqueue flag that asks the backend to poll for new
requests.  Then SeaBIOS can add the request to the queue and not worry
about kicking or reading the ISR.

SeaBIOS is polling for completion anyway.

Regards,

Anthony Liguori

>
> --
> 			Gleb.
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-06-06  6:34                                             ` Gleb Natapov
                                                                 ` (2 preceding siblings ...)
  2013-06-06 15:02                                               ` Anthony Liguori
@ 2013-06-06 15:06                                               ` Gerd Hoffmann
  2013-06-06 15:10                                                 ` Gleb Natapov
  3 siblings, 1 reply; 109+ messages in thread
From: Gerd Hoffmann @ 2013-06-06 15:06 UTC (permalink / raw)
  To: Gleb Natapov
  Cc: Peter Maydell, Anthony Liguori, kvm, Michael S. Tsirkin,
	virtualization, Stefan Hajnoczi, H. Peter Anvin, Paolo Bonzini,
	KONRAD Frederic

On 06/06/13 08:34, Gleb Natapov wrote:
> On Wed, Jun 05, 2013 at 07:41:17PM -0500, Anthony Liguori wrote:
>>
>> Oh, you mean in real mode.
>>
>> SeaBIOS runs the virtio code in 32-bit mode with a flat memory layout.
>> There are loads of ASSERT32FLAT()s in the code to make sure of this.
>>
> Well, not exactly. Initialization is done in 32bit, but disk
> reads/writes are done in 16bit mode since it should work from int13
> interrupt handler.

Exactly.  It's "only" the initialization code which has ASSERt32FLAT()
all over the place.  Which actually is the majority of the code in most
cases as all the hardware detection and initialization code is there.
But kicking I/O requests must work from 16bit mode too.

> The only way I know to access MMIO bars from 16 bit
> is to use SMM which we do not have in KVM.

For seabios itself this isn't a big issue, see pci_{readl,writel} in
src/pci.c.  When called in 16bit mode it goes into 32bit mode
temporarily, just for accessing the mmio register.  ahci driver uses it,
xhci driver (wip atm) will use that too, and virtio-{blk,scsi} drivers
in seabios can do the same.

But as hpa mentioned it will be more tricky for option roms (aka
virtio-net).

cheers,
  Gerd

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-06-06 15:06                                               ` Gerd Hoffmann
@ 2013-06-06 15:10                                                 ` Gleb Natapov
  2013-06-06 15:19                                                   ` H. Peter Anvin
                                                                     ` (2 more replies)
  0 siblings, 3 replies; 109+ messages in thread
From: Gleb Natapov @ 2013-06-06 15:10 UTC (permalink / raw)
  To: Gerd Hoffmann
  Cc: Peter Maydell, Anthony Liguori, kvm, Michael S. Tsirkin,
	virtualization, Stefan Hajnoczi, H. Peter Anvin, Paolo Bonzini,
	KONRAD Frederic

On Thu, Jun 06, 2013 at 05:06:32PM +0200, Gerd Hoffmann wrote:
> On 06/06/13 08:34, Gleb Natapov wrote:
> > On Wed, Jun 05, 2013 at 07:41:17PM -0500, Anthony Liguori wrote:
> >>
> >> Oh, you mean in real mode.
> >>
> >> SeaBIOS runs the virtio code in 32-bit mode with a flat memory layout.
> >> There are loads of ASSERT32FLAT()s in the code to make sure of this.
> >>
> > Well, not exactly. Initialization is done in 32bit, but disk
> > reads/writes are done in 16bit mode since it should work from int13
> > interrupt handler.
> 
> Exactly.  It's "only" the initialization code which has ASSERt32FLAT()
> all over the place.  Which actually is the majority of the code in most
> cases as all the hardware detection and initialization code is there.
> But kicking I/O requests must work from 16bit mode too.
> 
> > The only way I know to access MMIO bars from 16 bit
> > is to use SMM which we do not have in KVM.
> 
> For seabios itself this isn't a big issue, see pci_{readl,writel} in
> src/pci.c.  When called in 16bit mode it goes into 32bit mode
> temporarily, just for accessing the mmio register.  ahci driver uses it,
> xhci driver (wip atm) will use that too, and virtio-{blk,scsi} drivers
> in seabios can do the same.
> 
Isn't this approach broken? How can SeaBIOS be sure it restores real
mode registers to exactly same state they were before entering 32bit
mode?

> But as hpa mentioned it will be more tricky for option roms (aka
> virtio-net).
> 
> cheers,
>   Gerd
> 

--
			Gleb.

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-06-06 15:10                                                 ` Gleb Natapov
@ 2013-06-06 15:19                                                   ` H. Peter Anvin
  2013-06-06 15:22                                                   ` Gerd Hoffmann
  2013-07-08  4:25                                                   ` Kevin O'Connor
  2 siblings, 0 replies; 109+ messages in thread
From: H. Peter Anvin @ 2013-06-06 15:19 UTC (permalink / raw)
  To: Gleb Natapov
  Cc: Peter Maydell, Anthony Liguori, kvm, Michael S. Tsirkin,
	virtualization, Stefan Hajnoczi, Paolo Bonzini, KONRAD Frederic

On 06/06/2013 08:10 AM, Gleb Natapov wrote:
> On Thu, Jun 06, 2013 at 05:06:32PM +0200, Gerd Hoffmann wrote:
>>
> Isn't this approach broken? How can SeaBIOS be sure it restores real
> mode registers to exactly same state they were before entering 32bit
> mode?
> 

It can't... so yes, it is broken.

	-hpa

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-06-06 15:10                                                 ` Gleb Natapov
  2013-06-06 15:19                                                   ` H. Peter Anvin
@ 2013-06-06 15:22                                                   ` Gerd Hoffmann
  2013-07-08  4:25                                                   ` Kevin O'Connor
  2 siblings, 0 replies; 109+ messages in thread
From: Gerd Hoffmann @ 2013-06-06 15:22 UTC (permalink / raw)
  To: Gleb Natapov
  Cc: Peter Maydell, Anthony Liguori, kvm, Michael S. Tsirkin,
	virtualization, Stefan Hajnoczi, H. Peter Anvin, Paolo Bonzini,
	KONRAD Frederic

  Hi,

>> For seabios itself this isn't a big issue, see pci_{readl,writel} in
>> src/pci.c.  When called in 16bit mode it goes into 32bit mode
>> temporarily, just for accessing the mmio register.  ahci driver uses it,
>> xhci driver (wip atm) will use that too, and virtio-{blk,scsi} drivers
>> in seabios can do the same.
>>
> Isn't this approach broken? How can SeaBIOS be sure it restores real
> mode registers to exactly same state they were before entering 32bit
> mode?

Don't know the details of that magic.  Kevin had some concerns on the
stability of this, so maybe there is a theoretic hole.  So far I havn't
seen any issues in practice, but also didn't stress it too much.
Basically only used that with all kinds of boot loaders, could very well
be it breaks if you try to use that with more esoteric stuff such as dos
extenders, then hit unhandled corner cases ...

cheers,
  Gerd

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-06-06 14:59                                     ` Anthony Liguori
@ 2013-06-07  1:58                                       ` Rusty Russell
  2013-06-07  8:25                                       ` Peter Maydell
  1 sibling, 0 replies; 109+ messages in thread
From: Rusty Russell @ 2013-06-07  1:58 UTC (permalink / raw)
  To: Anthony Liguori, Michael S. Tsirkin
  Cc: Peter Maydell, kvm, virtualization, Stefan Hajnoczi,
	Paolo Bonzini, KONRAD Frederic

Anthony Liguori <aliguori@us.ibm.com> writes:
> Hi Rusty,
>
> Rusty Russell <rusty@rustcorp.com.au> writes:
>
>> Anthony Liguori <aliguori@us.ibm.com> writes:
>>> 4) Do virtio-pcie, make it PCI-e friendly (drop the IO BAR completely), give
>>>    it a new device/vendor ID.   Continue to use virtio-pci for existing
>>>    devices potentially adding virtio-{net,blk,...}-pcie variants for
>>>    people that care to use them.
>>
>> Now you have a different compatibility problem; how do you know the
>> guest supports the new virtio-pcie net?
>
> We don't care.
>
> We would still use virtio-pci for existing devices.  Only new devices
> would use virtio-pcie.

My concern is that new features make the virtio-pcie driver so desirable
that libvirt is pressured to use it ASAP.  Have we just punted the
problem upstream?

(Of course, feature bits were supposed to avoid such a transition issue,
but mistakes accumulate).

Cheers,
Rusty.

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-06-06 14:59                                     ` Anthony Liguori
  2013-06-07  1:58                                       ` Rusty Russell
@ 2013-06-07  8:25                                       ` Peter Maydell
  1 sibling, 0 replies; 109+ messages in thread
From: Peter Maydell @ 2013-06-07  8:25 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: kvm, Michael S. Tsirkin, virtualization, Stefan Hajnoczi,
	Paolo Bonzini, KONRAD Frederic

On 6 June 2013 15:59, Anthony Liguori <aliguori@us.ibm.com> wrote:
> We would still use virtio-pci for existing devices.  Only new devices
> would use virtio-pcie.

Surely you'd want to support both for any new device, because
(a) transport is orthogonal to backend functionality
(b) not all existing machine models have pci-e ?

Or am I misunderstanding?

thanks
-- PMM

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-06-06 15:02                                               ` Anthony Liguori
@ 2013-06-07 11:30                                                 ` Gleb Natapov
  2013-06-11  7:10                                                 ` Michael S. Tsirkin
  1 sibling, 0 replies; 109+ messages in thread
From: Gleb Natapov @ 2013-06-07 11:30 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: Peter Maydell, kvm, Michael S. Tsirkin, virtualization,
	Stefan Hajnoczi, H. Peter Anvin, Paolo Bonzini, KONRAD Frederic

On Thu, Jun 06, 2013 at 10:02:14AM -0500, Anthony Liguori wrote:
> Gleb Natapov <gleb@redhat.com> writes:
> 
> > On Wed, Jun 05, 2013 at 07:41:17PM -0500, Anthony Liguori wrote:
> >> "H. Peter Anvin" <hpa@zytor.com> writes:
> >> 
> >> > On 06/05/2013 03:08 PM, Anthony Liguori wrote:
> >> >>>
> >> >>> Definitely an option.  However, we want to be able to boot from native
> >> >>> devices, too, so having an I/O BAR (which would not be used by the OS
> >> >>> driver) should still at the very least be an option.
> >> >> 
> >> >> What makes it so difficult to work with an MMIO bar for PCI-e?
> >> >> 
> >> >> With legacy PCI, tracking allocation of MMIO vs. PIO is pretty straight
> >> >> forward.  Is there something special about PCI-e here?
> >> >> 
> >> >
> >> > It's not tracking allocation.  It is that accessing memory above 1 MiB
> >> > is incredibly painful in the BIOS environment, which basically means
> >> > MMIO is inaccessible.
> >> 
> >> Oh, you mean in real mode.
> >> 
> >> SeaBIOS runs the virtio code in 32-bit mode with a flat memory layout.
> >> There are loads of ASSERT32FLAT()s in the code to make sure of this.
> >> 
> > Well, not exactly. Initialization is done in 32bit, but disk
> > reads/writes are done in 16bit mode since it should work from int13
> > interrupt handler. The only way I know to access MMIO bars from 16 bit
> > is to use SMM which we do not have in KVM.
> 
> Ah, if it's just the dataplane operations then there's another solution.
> 
> We can introduce a virtqueue flag that asks the backend to poll for new
> requests.  Then SeaBIOS can add the request to the queue and not worry
> about kicking or reading the ISR.
> 
And when backend stops polling? When guest's driver reconfigure it? But
what if this will never happen?

> SeaBIOS is polling for completion anyway.
> 
> Regards,
> 
> Anthony Liguori
> 
> >
> > --
> > 			Gleb.
> > --
> > To unsubscribe from this list: send the line "unsubscribe kvm" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html

--
			Gleb.

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-06-06 15:02                                               ` Anthony Liguori
  2013-06-07 11:30                                                 ` Gleb Natapov
@ 2013-06-11  7:10                                                 ` Michael S. Tsirkin
  2013-06-11  7:53                                                   ` Gleb Natapov
  1 sibling, 1 reply; 109+ messages in thread
From: Michael S. Tsirkin @ 2013-06-11  7:10 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: Peter Maydell, kvm, virtualization, Stefan Hajnoczi,
	H. Peter Anvin, Paolo Bonzini, KONRAD Frederic

On Thu, Jun 06, 2013 at 10:02:14AM -0500, Anthony Liguori wrote:
> Gleb Natapov <gleb@redhat.com> writes:
> 
> > On Wed, Jun 05, 2013 at 07:41:17PM -0500, Anthony Liguori wrote:
> >> "H. Peter Anvin" <hpa@zytor.com> writes:
> >> 
> >> > On 06/05/2013 03:08 PM, Anthony Liguori wrote:
> >> >>>
> >> >>> Definitely an option.  However, we want to be able to boot from native
> >> >>> devices, too, so having an I/O BAR (which would not be used by the OS
> >> >>> driver) should still at the very least be an option.
> >> >> 
> >> >> What makes it so difficult to work with an MMIO bar for PCI-e?
> >> >> 
> >> >> With legacy PCI, tracking allocation of MMIO vs. PIO is pretty straight
> >> >> forward.  Is there something special about PCI-e here?
> >> >> 
> >> >
> >> > It's not tracking allocation.  It is that accessing memory above 1 MiB
> >> > is incredibly painful in the BIOS environment, which basically means
> >> > MMIO is inaccessible.
> >> 
> >> Oh, you mean in real mode.
> >> 
> >> SeaBIOS runs the virtio code in 32-bit mode with a flat memory layout.
> >> There are loads of ASSERT32FLAT()s in the code to make sure of this.
> >> 
> > Well, not exactly. Initialization is done in 32bit, but disk
> > reads/writes are done in 16bit mode since it should work from int13
> > interrupt handler. The only way I know to access MMIO bars from 16 bit
> > is to use SMM which we do not have in KVM.
> 
> Ah, if it's just the dataplane operations then there's another solution.
> 
> We can introduce a virtqueue flag that asks the backend to poll for new
> requests.  Then SeaBIOS can add the request to the queue and not worry
> about kicking or reading the ISR.

This will pin a host CPU.
If we do something timer based it will likely
both increase host CPU utilization and slow device down.

If we didn't care about performance at all we could
do config cycles for signalling, which is much
more elegant than polling in host, but I don't think
that's the case.

> 
> SeaBIOS is polling for completion anyway.

I think that's different because a disk will normally respond
quickly. So it polls a bit, but then it stops as
there are no outstanding requests.

-- 
MST

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-06-11  7:10                                                 ` Michael S. Tsirkin
@ 2013-06-11  7:53                                                   ` Gleb Natapov
  2013-06-11  8:02                                                     ` Michael S. Tsirkin
  0 siblings, 1 reply; 109+ messages in thread
From: Gleb Natapov @ 2013-06-11  7:53 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Peter Maydell, Anthony Liguori, kvm, virtualization,
	Stefan Hajnoczi, H. Peter Anvin, Paolo Bonzini, KONRAD Frederic

On Tue, Jun 11, 2013 at 10:10:47AM +0300, Michael S. Tsirkin wrote:
> On Thu, Jun 06, 2013 at 10:02:14AM -0500, Anthony Liguori wrote:
> > Gleb Natapov <gleb@redhat.com> writes:
> > 
> > > On Wed, Jun 05, 2013 at 07:41:17PM -0500, Anthony Liguori wrote:
> > >> "H. Peter Anvin" <hpa@zytor.com> writes:
> > >> 
> > >> > On 06/05/2013 03:08 PM, Anthony Liguori wrote:
> > >> >>>
> > >> >>> Definitely an option.  However, we want to be able to boot from native
> > >> >>> devices, too, so having an I/O BAR (which would not be used by the OS
> > >> >>> driver) should still at the very least be an option.
> > >> >> 
> > >> >> What makes it so difficult to work with an MMIO bar for PCI-e?
> > >> >> 
> > >> >> With legacy PCI, tracking allocation of MMIO vs. PIO is pretty straight
> > >> >> forward.  Is there something special about PCI-e here?
> > >> >> 
> > >> >
> > >> > It's not tracking allocation.  It is that accessing memory above 1 MiB
> > >> > is incredibly painful in the BIOS environment, which basically means
> > >> > MMIO is inaccessible.
> > >> 
> > >> Oh, you mean in real mode.
> > >> 
> > >> SeaBIOS runs the virtio code in 32-bit mode with a flat memory layout.
> > >> There are loads of ASSERT32FLAT()s in the code to make sure of this.
> > >> 
> > > Well, not exactly. Initialization is done in 32bit, but disk
> > > reads/writes are done in 16bit mode since it should work from int13
> > > interrupt handler. The only way I know to access MMIO bars from 16 bit
> > > is to use SMM which we do not have in KVM.
> > 
> > Ah, if it's just the dataplane operations then there's another solution.
> > 
> > We can introduce a virtqueue flag that asks the backend to poll for new
> > requests.  Then SeaBIOS can add the request to the queue and not worry
> > about kicking or reading the ISR.
> 
> This will pin a host CPU.
> If we do something timer based it will likely
> both increase host CPU utilization and slow device down.
> 
> If we didn't care about performance at all we could
> do config cycles for signalling, which is much
> more elegant than polling in host, but I don't think
> that's the case.
> 
I wouldn't call BIOS int13 interface performance critical.

> > 
> > SeaBIOS is polling for completion anyway.
> 
> I think that's different because a disk will normally respond
> quickly. So it polls a bit, but then it stops as
> there are no outstanding requests.
> 
> -- 
> MST

--
			Gleb.

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-06-11  7:53                                                   ` Gleb Natapov
@ 2013-06-11  8:02                                                     ` Michael S. Tsirkin
  2013-06-11  8:03                                                       ` Gleb Natapov
  2013-06-11  8:04                                                       ` Michael S. Tsirkin
  0 siblings, 2 replies; 109+ messages in thread
From: Michael S. Tsirkin @ 2013-06-11  8:02 UTC (permalink / raw)
  To: Gleb Natapov
  Cc: Peter Maydell, Anthony Liguori, kvm, virtualization,
	Stefan Hajnoczi, H. Peter Anvin, Paolo Bonzini, KONRAD Frederic

On Tue, Jun 11, 2013 at 10:53:48AM +0300, Gleb Natapov wrote:
> On Tue, Jun 11, 2013 at 10:10:47AM +0300, Michael S. Tsirkin wrote:
> > On Thu, Jun 06, 2013 at 10:02:14AM -0500, Anthony Liguori wrote:
> > > Gleb Natapov <gleb@redhat.com> writes:
> > > 
> > > > On Wed, Jun 05, 2013 at 07:41:17PM -0500, Anthony Liguori wrote:
> > > >> "H. Peter Anvin" <hpa@zytor.com> writes:
> > > >> 
> > > >> > On 06/05/2013 03:08 PM, Anthony Liguori wrote:
> > > >> >>>
> > > >> >>> Definitely an option.  However, we want to be able to boot from native
> > > >> >>> devices, too, so having an I/O BAR (which would not be used by the OS
> > > >> >>> driver) should still at the very least be an option.
> > > >> >> 
> > > >> >> What makes it so difficult to work with an MMIO bar for PCI-e?
> > > >> >> 
> > > >> >> With legacy PCI, tracking allocation of MMIO vs. PIO is pretty straight
> > > >> >> forward.  Is there something special about PCI-e here?
> > > >> >> 
> > > >> >
> > > >> > It's not tracking allocation.  It is that accessing memory above 1 MiB
> > > >> > is incredibly painful in the BIOS environment, which basically means
> > > >> > MMIO is inaccessible.
> > > >> 
> > > >> Oh, you mean in real mode.
> > > >> 
> > > >> SeaBIOS runs the virtio code in 32-bit mode with a flat memory layout.
> > > >> There are loads of ASSERT32FLAT()s in the code to make sure of this.
> > > >> 
> > > > Well, not exactly. Initialization is done in 32bit, but disk
> > > > reads/writes are done in 16bit mode since it should work from int13
> > > > interrupt handler. The only way I know to access MMIO bars from 16 bit
> > > > is to use SMM which we do not have in KVM.
> > > 
> > > Ah, if it's just the dataplane operations then there's another solution.
> > > 
> > > We can introduce a virtqueue flag that asks the backend to poll for new
> > > requests.  Then SeaBIOS can add the request to the queue and not worry
> > > about kicking or reading the ISR.
> > 
> > This will pin a host CPU.
> > If we do something timer based it will likely
> > both increase host CPU utilization and slow device down.
> > 
> > If we didn't care about performance at all we could
> > do config cycles for signalling, which is much
> > more elegant than polling in host, but I don't think
> > that's the case.
> > 
> I wouldn't call BIOS int13 interface performance critical.

So the plan always was to
- add an MMIO BAR
- add a register for pci-config based access to devices

hpa felt performance does matter there but didn't clarify why ...

> > > 
> > > SeaBIOS is polling for completion anyway.
> > 
> > I think that's different because a disk will normally respond
> > quickly. So it polls a bit, but then it stops as
> > there are no outstanding requests.
> > 
> > -- 
> > MST
> 
> --
> 			Gleb.

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-06-11  8:02                                                     ` Michael S. Tsirkin
@ 2013-06-11  8:03                                                       ` Gleb Natapov
  2013-06-11  8:19                                                         ` Michael S. Tsirkin
  2013-06-11  8:04                                                       ` Michael S. Tsirkin
  1 sibling, 1 reply; 109+ messages in thread
From: Gleb Natapov @ 2013-06-11  8:03 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Peter Maydell, Anthony Liguori, kvm, virtualization,
	Stefan Hajnoczi, H. Peter Anvin, Paolo Bonzini, KONRAD Frederic

On Tue, Jun 11, 2013 at 11:02:26AM +0300, Michael S. Tsirkin wrote:
> On Tue, Jun 11, 2013 at 10:53:48AM +0300, Gleb Natapov wrote:
> > On Tue, Jun 11, 2013 at 10:10:47AM +0300, Michael S. Tsirkin wrote:
> > > On Thu, Jun 06, 2013 at 10:02:14AM -0500, Anthony Liguori wrote:
> > > > Gleb Natapov <gleb@redhat.com> writes:
> > > > 
> > > > > On Wed, Jun 05, 2013 at 07:41:17PM -0500, Anthony Liguori wrote:
> > > > >> "H. Peter Anvin" <hpa@zytor.com> writes:
> > > > >> 
> > > > >> > On 06/05/2013 03:08 PM, Anthony Liguori wrote:
> > > > >> >>>
> > > > >> >>> Definitely an option.  However, we want to be able to boot from native
> > > > >> >>> devices, too, so having an I/O BAR (which would not be used by the OS
> > > > >> >>> driver) should still at the very least be an option.
> > > > >> >> 
> > > > >> >> What makes it so difficult to work with an MMIO bar for PCI-e?
> > > > >> >> 
> > > > >> >> With legacy PCI, tracking allocation of MMIO vs. PIO is pretty straight
> > > > >> >> forward.  Is there something special about PCI-e here?
> > > > >> >> 
> > > > >> >
> > > > >> > It's not tracking allocation.  It is that accessing memory above 1 MiB
> > > > >> > is incredibly painful in the BIOS environment, which basically means
> > > > >> > MMIO is inaccessible.
> > > > >> 
> > > > >> Oh, you mean in real mode.
> > > > >> 
> > > > >> SeaBIOS runs the virtio code in 32-bit mode with a flat memory layout.
> > > > >> There are loads of ASSERT32FLAT()s in the code to make sure of this.
> > > > >> 
> > > > > Well, not exactly. Initialization is done in 32bit, but disk
> > > > > reads/writes are done in 16bit mode since it should work from int13
> > > > > interrupt handler. The only way I know to access MMIO bars from 16 bit
> > > > > is to use SMM which we do not have in KVM.
> > > > 
> > > > Ah, if it's just the dataplane operations then there's another solution.
> > > > 
> > > > We can introduce a virtqueue flag that asks the backend to poll for new
> > > > requests.  Then SeaBIOS can add the request to the queue and not worry
> > > > about kicking or reading the ISR.
> > > 
> > > This will pin a host CPU.
> > > If we do something timer based it will likely
> > > both increase host CPU utilization and slow device down.
> > > 
> > > If we didn't care about performance at all we could
> > > do config cycles for signalling, which is much
> > > more elegant than polling in host, but I don't think
> > > that's the case.
> > > 
> > I wouldn't call BIOS int13 interface performance critical.
> 
> So the plan always was to
> - add an MMIO BAR
> - add a register for pci-config based access to devices
> 
> hpa felt performance does matter there but didn't clarify why ...
> 
You do not what to make it too slow obviously, this is interface that is
used to load OS during boot.

> > > > 
> > > > SeaBIOS is polling for completion anyway.
> > > 
> > > I think that's different because a disk will normally respond
> > > quickly. So it polls a bit, but then it stops as
> > > there are no outstanding requests.
> > > 
> > > -- 
> > > MST
> > 
> > --
> > 			Gleb.

--
			Gleb.

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-06-11  8:02                                                     ` Michael S. Tsirkin
  2013-06-11  8:03                                                       ` Gleb Natapov
@ 2013-06-11  8:04                                                       ` Michael S. Tsirkin
  1 sibling, 0 replies; 109+ messages in thread
From: Michael S. Tsirkin @ 2013-06-11  8:04 UTC (permalink / raw)
  To: Gleb Natapov
  Cc: Peter Maydell, Anthony Liguori, kvm, virtualization,
	Stefan Hajnoczi, H. Peter Anvin, Paolo Bonzini, KONRAD Frederic

On Tue, Jun 11, 2013 at 11:02:26AM +0300, Michael S. Tsirkin wrote:
> On Tue, Jun 11, 2013 at 10:53:48AM +0300, Gleb Natapov wrote:
> > On Tue, Jun 11, 2013 at 10:10:47AM +0300, Michael S. Tsirkin wrote:
> > > On Thu, Jun 06, 2013 at 10:02:14AM -0500, Anthony Liguori wrote:
> > > > Gleb Natapov <gleb@redhat.com> writes:
> > > > 
> > > > > On Wed, Jun 05, 2013 at 07:41:17PM -0500, Anthony Liguori wrote:
> > > > >> "H. Peter Anvin" <hpa@zytor.com> writes:
> > > > >> 
> > > > >> > On 06/05/2013 03:08 PM, Anthony Liguori wrote:
> > > > >> >>>
> > > > >> >>> Definitely an option.  However, we want to be able to boot from native
> > > > >> >>> devices, too, so having an I/O BAR (which would not be used by the OS
> > > > >> >>> driver) should still at the very least be an option.
> > > > >> >> 
> > > > >> >> What makes it so difficult to work with an MMIO bar for PCI-e?
> > > > >> >> 
> > > > >> >> With legacy PCI, tracking allocation of MMIO vs. PIO is pretty straight
> > > > >> >> forward.  Is there something special about PCI-e here?
> > > > >> >> 
> > > > >> >
> > > > >> > It's not tracking allocation.  It is that accessing memory above 1 MiB
> > > > >> > is incredibly painful in the BIOS environment, which basically means
> > > > >> > MMIO is inaccessible.
> > > > >> 
> > > > >> Oh, you mean in real mode.
> > > > >> 
> > > > >> SeaBIOS runs the virtio code in 32-bit mode with a flat memory layout.
> > > > >> There are loads of ASSERT32FLAT()s in the code to make sure of this.
> > > > >> 
> > > > > Well, not exactly. Initialization is done in 32bit, but disk
> > > > > reads/writes are done in 16bit mode since it should work from int13
> > > > > interrupt handler. The only way I know to access MMIO bars from 16 bit
> > > > > is to use SMM which we do not have in KVM.
> > > > 
> > > > Ah, if it's just the dataplane operations then there's another solution.
> > > > 
> > > > We can introduce a virtqueue flag that asks the backend to poll for new
> > > > requests.  Then SeaBIOS can add the request to the queue and not worry
> > > > about kicking or reading the ISR.
> > > 
> > > This will pin a host CPU.
> > > If we do something timer based it will likely
> > > both increase host CPU utilization and slow device down.
> > > 
> > > If we didn't care about performance at all we could
> > > do config cycles for signalling, which is much
> > > more elegant than polling in host, but I don't think
> > > that's the case.
> > > 
> > I wouldn't call BIOS int13 interface performance critical.
> 
> So the plan always was to
> - add an MMIO BAR
> - add a register for pci-config based access to devices
> 
> hpa felt performance does matter there but didn't clarify why ...

Also

<gleb> mst, well the question is if it is safe to call int13 in the
middle of pci bus enumeration/configuration
<gleb> mst, and int13 predates PCI, so how knows

> > > > 
> > > > SeaBIOS is polling for completion anyway.
> > > 
> > > I think that's different because a disk will normally respond
> > > quickly. So it polls a bit, but then it stops as
> > > there are no outstanding requests.
> > > 
> > > -- 
> > > MST
> > 
> > --
> > 			Gleb.

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-06-11  8:03                                                       ` Gleb Natapov
@ 2013-06-11  8:19                                                         ` Michael S. Tsirkin
  2013-06-11  8:22                                                           ` Gleb Natapov
  0 siblings, 1 reply; 109+ messages in thread
From: Michael S. Tsirkin @ 2013-06-11  8:19 UTC (permalink / raw)
  To: Gleb Natapov
  Cc: Peter Maydell, Anthony Liguori, kvm, virtualization,
	Stefan Hajnoczi, H. Peter Anvin, Paolo Bonzini, KONRAD Frederic

On Tue, Jun 11, 2013 at 11:03:50AM +0300, Gleb Natapov wrote:
> On Tue, Jun 11, 2013 at 11:02:26AM +0300, Michael S. Tsirkin wrote:
> > On Tue, Jun 11, 2013 at 10:53:48AM +0300, Gleb Natapov wrote:
> > > On Tue, Jun 11, 2013 at 10:10:47AM +0300, Michael S. Tsirkin wrote:
> > > > On Thu, Jun 06, 2013 at 10:02:14AM -0500, Anthony Liguori wrote:
> > > > > Gleb Natapov <gleb@redhat.com> writes:
> > > > > 
> > > > > > On Wed, Jun 05, 2013 at 07:41:17PM -0500, Anthony Liguori wrote:
> > > > > >> "H. Peter Anvin" <hpa@zytor.com> writes:
> > > > > >> 
> > > > > >> > On 06/05/2013 03:08 PM, Anthony Liguori wrote:
> > > > > >> >>>
> > > > > >> >>> Definitely an option.  However, we want to be able to boot from native
> > > > > >> >>> devices, too, so having an I/O BAR (which would not be used by the OS
> > > > > >> >>> driver) should still at the very least be an option.
> > > > > >> >> 
> > > > > >> >> What makes it so difficult to work with an MMIO bar for PCI-e?
> > > > > >> >> 
> > > > > >> >> With legacy PCI, tracking allocation of MMIO vs. PIO is pretty straight
> > > > > >> >> forward.  Is there something special about PCI-e here?
> > > > > >> >> 
> > > > > >> >
> > > > > >> > It's not tracking allocation.  It is that accessing memory above 1 MiB
> > > > > >> > is incredibly painful in the BIOS environment, which basically means
> > > > > >> > MMIO is inaccessible.
> > > > > >> 
> > > > > >> Oh, you mean in real mode.
> > > > > >> 
> > > > > >> SeaBIOS runs the virtio code in 32-bit mode with a flat memory layout.
> > > > > >> There are loads of ASSERT32FLAT()s in the code to make sure of this.
> > > > > >> 
> > > > > > Well, not exactly. Initialization is done in 32bit, but disk
> > > > > > reads/writes are done in 16bit mode since it should work from int13
> > > > > > interrupt handler. The only way I know to access MMIO bars from 16 bit
> > > > > > is to use SMM which we do not have in KVM.
> > > > > 
> > > > > Ah, if it's just the dataplane operations then there's another solution.
> > > > > 
> > > > > We can introduce a virtqueue flag that asks the backend to poll for new
> > > > > requests.  Then SeaBIOS can add the request to the queue and not worry
> > > > > about kicking or reading the ISR.
> > > > 
> > > > This will pin a host CPU.
> > > > If we do something timer based it will likely
> > > > both increase host CPU utilization and slow device down.
> > > > 
> > > > If we didn't care about performance at all we could
> > > > do config cycles for signalling, which is much
> > > > more elegant than polling in host, but I don't think
> > > > that's the case.
> > > > 
> > > I wouldn't call BIOS int13 interface performance critical.
> > 
> > So the plan always was to
> > - add an MMIO BAR
> > - add a register for pci-config based access to devices
> > 
> > hpa felt performance does matter there but didn't clarify why ...
> > 
> You do not what to make it too slow obviously, this is interface that is
> used to load OS during boot.

And possibly installation?

> > > > > 
> > > > > SeaBIOS is polling for completion anyway.
> > > > 
> > > > I think that's different because a disk will normally respond
> > > > quickly. So it polls a bit, but then it stops as
> > > > there are no outstanding requests.
> > > > 
> > > > -- 
> > > > MST
> > > 
> > > --
> > > 			Gleb.
> 
> --
> 			Gleb.

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-06-11  8:19                                                         ` Michael S. Tsirkin
@ 2013-06-11  8:22                                                           ` Gleb Natapov
  2013-06-11  8:30                                                             ` Michael S. Tsirkin
  0 siblings, 1 reply; 109+ messages in thread
From: Gleb Natapov @ 2013-06-11  8:22 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Peter Maydell, Anthony Liguori, kvm, virtualization,
	Stefan Hajnoczi, H. Peter Anvin, Paolo Bonzini, KONRAD Frederic

On Tue, Jun 11, 2013 at 11:19:46AM +0300, Michael S. Tsirkin wrote:
> On Tue, Jun 11, 2013 at 11:03:50AM +0300, Gleb Natapov wrote:
> > On Tue, Jun 11, 2013 at 11:02:26AM +0300, Michael S. Tsirkin wrote:
> > > On Tue, Jun 11, 2013 at 10:53:48AM +0300, Gleb Natapov wrote:
> > > > On Tue, Jun 11, 2013 at 10:10:47AM +0300, Michael S. Tsirkin wrote:
> > > > > On Thu, Jun 06, 2013 at 10:02:14AM -0500, Anthony Liguori wrote:
> > > > > > Gleb Natapov <gleb@redhat.com> writes:
> > > > > > 
> > > > > > > On Wed, Jun 05, 2013 at 07:41:17PM -0500, Anthony Liguori wrote:
> > > > > > >> "H. Peter Anvin" <hpa@zytor.com> writes:
> > > > > > >> 
> > > > > > >> > On 06/05/2013 03:08 PM, Anthony Liguori wrote:
> > > > > > >> >>>
> > > > > > >> >>> Definitely an option.  However, we want to be able to boot from native
> > > > > > >> >>> devices, too, so having an I/O BAR (which would not be used by the OS
> > > > > > >> >>> driver) should still at the very least be an option.
> > > > > > >> >> 
> > > > > > >> >> What makes it so difficult to work with an MMIO bar for PCI-e?
> > > > > > >> >> 
> > > > > > >> >> With legacy PCI, tracking allocation of MMIO vs. PIO is pretty straight
> > > > > > >> >> forward.  Is there something special about PCI-e here?
> > > > > > >> >> 
> > > > > > >> >
> > > > > > >> > It's not tracking allocation.  It is that accessing memory above 1 MiB
> > > > > > >> > is incredibly painful in the BIOS environment, which basically means
> > > > > > >> > MMIO is inaccessible.
> > > > > > >> 
> > > > > > >> Oh, you mean in real mode.
> > > > > > >> 
> > > > > > >> SeaBIOS runs the virtio code in 32-bit mode with a flat memory layout.
> > > > > > >> There are loads of ASSERT32FLAT()s in the code to make sure of this.
> > > > > > >> 
> > > > > > > Well, not exactly. Initialization is done in 32bit, but disk
> > > > > > > reads/writes are done in 16bit mode since it should work from int13
> > > > > > > interrupt handler. The only way I know to access MMIO bars from 16 bit
> > > > > > > is to use SMM which we do not have in KVM.
> > > > > > 
> > > > > > Ah, if it's just the dataplane operations then there's another solution.
> > > > > > 
> > > > > > We can introduce a virtqueue flag that asks the backend to poll for new
> > > > > > requests.  Then SeaBIOS can add the request to the queue and not worry
> > > > > > about kicking or reading the ISR.
> > > > > 
> > > > > This will pin a host CPU.
> > > > > If we do something timer based it will likely
> > > > > both increase host CPU utilization and slow device down.
> > > > > 
> > > > > If we didn't care about performance at all we could
> > > > > do config cycles for signalling, which is much
> > > > > more elegant than polling in host, but I don't think
> > > > > that's the case.
> > > > > 
> > > > I wouldn't call BIOS int13 interface performance critical.
> > > 
> > > So the plan always was to
> > > - add an MMIO BAR
> > > - add a register for pci-config based access to devices
> > > 
> > > hpa felt performance does matter there but didn't clarify why ...
> > > 
> > You do not what to make it too slow obviously, this is interface that is
> > used to load OS during boot.
> 
> And possibly installation?
> 
Only the stage that reads files from CDROM. IIRC actual installation
runs with native drivers. This is why Windows asks you to provide floppy
with a driver at very early stage of installation.

--
			Gleb.

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-06-11  8:22                                                           ` Gleb Natapov
@ 2013-06-11  8:30                                                             ` Michael S. Tsirkin
  2013-06-11  8:32                                                               ` Gleb Natapov
  0 siblings, 1 reply; 109+ messages in thread
From: Michael S. Tsirkin @ 2013-06-11  8:30 UTC (permalink / raw)
  To: Gleb Natapov
  Cc: Peter Maydell, Anthony Liguori, kvm, virtualization,
	Stefan Hajnoczi, H. Peter Anvin, Paolo Bonzini, KONRAD Frederic

On Tue, Jun 11, 2013 at 11:22:37AM +0300, Gleb Natapov wrote:
> On Tue, Jun 11, 2013 at 11:19:46AM +0300, Michael S. Tsirkin wrote:
> > On Tue, Jun 11, 2013 at 11:03:50AM +0300, Gleb Natapov wrote:
> > > On Tue, Jun 11, 2013 at 11:02:26AM +0300, Michael S. Tsirkin wrote:
> > > > On Tue, Jun 11, 2013 at 10:53:48AM +0300, Gleb Natapov wrote:
> > > > > On Tue, Jun 11, 2013 at 10:10:47AM +0300, Michael S. Tsirkin wrote:
> > > > > > On Thu, Jun 06, 2013 at 10:02:14AM -0500, Anthony Liguori wrote:
> > > > > > > Gleb Natapov <gleb@redhat.com> writes:
> > > > > > > 
> > > > > > > > On Wed, Jun 05, 2013 at 07:41:17PM -0500, Anthony Liguori wrote:
> > > > > > > >> "H. Peter Anvin" <hpa@zytor.com> writes:
> > > > > > > >> 
> > > > > > > >> > On 06/05/2013 03:08 PM, Anthony Liguori wrote:
> > > > > > > >> >>>
> > > > > > > >> >>> Definitely an option.  However, we want to be able to boot from native
> > > > > > > >> >>> devices, too, so having an I/O BAR (which would not be used by the OS
> > > > > > > >> >>> driver) should still at the very least be an option.
> > > > > > > >> >> 
> > > > > > > >> >> What makes it so difficult to work with an MMIO bar for PCI-e?
> > > > > > > >> >> 
> > > > > > > >> >> With legacy PCI, tracking allocation of MMIO vs. PIO is pretty straight
> > > > > > > >> >> forward.  Is there something special about PCI-e here?
> > > > > > > >> >> 
> > > > > > > >> >
> > > > > > > >> > It's not tracking allocation.  It is that accessing memory above 1 MiB
> > > > > > > >> > is incredibly painful in the BIOS environment, which basically means
> > > > > > > >> > MMIO is inaccessible.
> > > > > > > >> 
> > > > > > > >> Oh, you mean in real mode.
> > > > > > > >> 
> > > > > > > >> SeaBIOS runs the virtio code in 32-bit mode with a flat memory layout.
> > > > > > > >> There are loads of ASSERT32FLAT()s in the code to make sure of this.
> > > > > > > >> 
> > > > > > > > Well, not exactly. Initialization is done in 32bit, but disk
> > > > > > > > reads/writes are done in 16bit mode since it should work from int13
> > > > > > > > interrupt handler. The only way I know to access MMIO bars from 16 bit
> > > > > > > > is to use SMM which we do not have in KVM.
> > > > > > > 
> > > > > > > Ah, if it's just the dataplane operations then there's another solution.
> > > > > > > 
> > > > > > > We can introduce a virtqueue flag that asks the backend to poll for new
> > > > > > > requests.  Then SeaBIOS can add the request to the queue and not worry
> > > > > > > about kicking or reading the ISR.
> > > > > > 
> > > > > > This will pin a host CPU.
> > > > > > If we do something timer based it will likely
> > > > > > both increase host CPU utilization and slow device down.
> > > > > > 
> > > > > > If we didn't care about performance at all we could
> > > > > > do config cycles for signalling, which is much
> > > > > > more elegant than polling in host, but I don't think
> > > > > > that's the case.
> > > > > > 
> > > > > I wouldn't call BIOS int13 interface performance critical.
> > > > 
> > > > So the plan always was to
> > > > - add an MMIO BAR
> > > > - add a register for pci-config based access to devices
> > > > 
> > > > hpa felt performance does matter there but didn't clarify why ...
> > > > 
> > > You do not what to make it too slow obviously, this is interface that is
> > > used to load OS during boot.
> > 
> > And possibly installation?
> > 
> Only the stage that reads files from CDROM. IIRC actual installation
> runs with native drivers. This is why Windows asks you to provide floppy
> with a driver at very early stage of installation.

Have any numbers to tell us how much time is spent there?
E.g. if it's slowed down by a factor of 2, is it a problem?
How about a factor of 10?

> --
> 			Gleb.

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-06-11  8:30                                                             ` Michael S. Tsirkin
@ 2013-06-11  8:32                                                               ` Gleb Natapov
  0 siblings, 0 replies; 109+ messages in thread
From: Gleb Natapov @ 2013-06-11  8:32 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Peter Maydell, Anthony Liguori, kvm, virtualization,
	Stefan Hajnoczi, H. Peter Anvin, Paolo Bonzini, KONRAD Frederic

On Tue, Jun 11, 2013 at 11:30:11AM +0300, Michael S. Tsirkin wrote:
> On Tue, Jun 11, 2013 at 11:22:37AM +0300, Gleb Natapov wrote:
> > On Tue, Jun 11, 2013 at 11:19:46AM +0300, Michael S. Tsirkin wrote:
> > > On Tue, Jun 11, 2013 at 11:03:50AM +0300, Gleb Natapov wrote:
> > > > On Tue, Jun 11, 2013 at 11:02:26AM +0300, Michael S. Tsirkin wrote:
> > > > > On Tue, Jun 11, 2013 at 10:53:48AM +0300, Gleb Natapov wrote:
> > > > > > On Tue, Jun 11, 2013 at 10:10:47AM +0300, Michael S. Tsirkin wrote:
> > > > > > > On Thu, Jun 06, 2013 at 10:02:14AM -0500, Anthony Liguori wrote:
> > > > > > > > Gleb Natapov <gleb@redhat.com> writes:
> > > > > > > > 
> > > > > > > > > On Wed, Jun 05, 2013 at 07:41:17PM -0500, Anthony Liguori wrote:
> > > > > > > > >> "H. Peter Anvin" <hpa@zytor.com> writes:
> > > > > > > > >> 
> > > > > > > > >> > On 06/05/2013 03:08 PM, Anthony Liguori wrote:
> > > > > > > > >> >>>
> > > > > > > > >> >>> Definitely an option.  However, we want to be able to boot from native
> > > > > > > > >> >>> devices, too, so having an I/O BAR (which would not be used by the OS
> > > > > > > > >> >>> driver) should still at the very least be an option.
> > > > > > > > >> >> 
> > > > > > > > >> >> What makes it so difficult to work with an MMIO bar for PCI-e?
> > > > > > > > >> >> 
> > > > > > > > >> >> With legacy PCI, tracking allocation of MMIO vs. PIO is pretty straight
> > > > > > > > >> >> forward.  Is there something special about PCI-e here?
> > > > > > > > >> >> 
> > > > > > > > >> >
> > > > > > > > >> > It's not tracking allocation.  It is that accessing memory above 1 MiB
> > > > > > > > >> > is incredibly painful in the BIOS environment, which basically means
> > > > > > > > >> > MMIO is inaccessible.
> > > > > > > > >> 
> > > > > > > > >> Oh, you mean in real mode.
> > > > > > > > >> 
> > > > > > > > >> SeaBIOS runs the virtio code in 32-bit mode with a flat memory layout.
> > > > > > > > >> There are loads of ASSERT32FLAT()s in the code to make sure of this.
> > > > > > > > >> 
> > > > > > > > > Well, not exactly. Initialization is done in 32bit, but disk
> > > > > > > > > reads/writes are done in 16bit mode since it should work from int13
> > > > > > > > > interrupt handler. The only way I know to access MMIO bars from 16 bit
> > > > > > > > > is to use SMM which we do not have in KVM.
> > > > > > > > 
> > > > > > > > Ah, if it's just the dataplane operations then there's another solution.
> > > > > > > > 
> > > > > > > > We can introduce a virtqueue flag that asks the backend to poll for new
> > > > > > > > requests.  Then SeaBIOS can add the request to the queue and not worry
> > > > > > > > about kicking or reading the ISR.
> > > > > > > 
> > > > > > > This will pin a host CPU.
> > > > > > > If we do something timer based it will likely
> > > > > > > both increase host CPU utilization and slow device down.
> > > > > > > 
> > > > > > > If we didn't care about performance at all we could
> > > > > > > do config cycles for signalling, which is much
> > > > > > > more elegant than polling in host, but I don't think
> > > > > > > that's the case.
> > > > > > > 
> > > > > > I wouldn't call BIOS int13 interface performance critical.
> > > > > 
> > > > > So the plan always was to
> > > > > - add an MMIO BAR
> > > > > - add a register for pci-config based access to devices
> > > > > 
> > > > > hpa felt performance does matter there but didn't clarify why ...
> > > > > 
> > > > You do not what to make it too slow obviously, this is interface that is
> > > > used to load OS during boot.
> > > 
> > > And possibly installation?
> > > 
> > Only the stage that reads files from CDROM. IIRC actual installation
> > runs with native drivers. This is why Windows asks you to provide floppy
> > with a driver at very early stage of installation.
> 
> Have any numbers to tell us how much time is spent there?
> E.g. if it's slowed down by a factor of 2, is it a problem?
> How about a factor of 10?
> 
No I do not.

--
			Gleb.

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
  2013-06-06 15:10                                                 ` Gleb Natapov
  2013-06-06 15:19                                                   ` H. Peter Anvin
  2013-06-06 15:22                                                   ` Gerd Hoffmann
@ 2013-07-08  4:25                                                   ` Kevin O'Connor
  2 siblings, 0 replies; 109+ messages in thread
From: Kevin O'Connor @ 2013-07-08  4:25 UTC (permalink / raw)
  To: Gleb Natapov
  Cc: Peter Maydell, Anthony Liguori, kvm, Michael S. Tsirkin,
	virtualization, Stefan Hajnoczi, H. Peter Anvin, Paolo Bonzini,
	KONRAD Frederic

I only recently saw this email.

On Thu, Jun 06, 2013 at 06:10:12PM +0300, Gleb Natapov wrote:
> On Thu, Jun 06, 2013 at 05:06:32PM +0200, Gerd Hoffmann wrote:
> > For seabios itself this isn't a big issue, see pci_{readl,writel} in
> > src/pci.c.  When called in 16bit mode it goes into 32bit mode
> > temporarily, just for accessing the mmio register.  ahci driver uses it,
> > xhci driver (wip atm) will use that too, and virtio-{blk,scsi} drivers
> > in seabios can do the same.
> > 
> Isn't this approach broken? How can SeaBIOS be sure it restores real
> mode registers to exactly same state they were before entering 32bit
> mode?

You are correct - SeaBIOS can't fully restore the "hidden" segment
registers.  So, in a way it is broken.

In practice, it seems to work on modern bootloaders (eg, ntldr, grub,
lilo).  It definitely doesn't work with EMM386 (old DOS stuff), but
does seem to work okay with FreeDOS as long as one doesn't run EMM386.

The AHCI code uses this 32bit/16bit trampoline because it would not be
possible to support AHCI otherwise.  I haven't seen any complaints of
failures with the AHCI code - probably because people using AHCI are
using modern guests.

I explored this a bit some time back and the only way I could think of
to reliably restore the 16bit registers would be via SMM.
Unfortunately, using SMM introduces a whole host of complexity and
problems.

-Kevin

^ permalink raw reply	[flat|nested] 109+ messages in thread

end of thread, other threads:[~2013-07-08  4:25 UTC | newest]

Thread overview: 109+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2013-05-28 16:03 [PATCH RFC] virtio-pci: new config layout: using memory BAR Michael S. Tsirkin
2013-05-28 17:15 ` Anthony Liguori
2013-05-28 17:15 ` Anthony Liguori
2013-05-28 17:32   ` Michael S. Tsirkin
2013-05-28 17:43     ` Paolo Bonzini
2013-05-29  2:02       ` Laszlo Ersek
2013-05-29  2:02         ` Laszlo Ersek
2013-05-29  4:33       ` Rusty Russell
2013-05-29  4:33       ` Rusty Russell
2013-05-29  7:27         ` Paolo Bonzini
2013-05-29  8:05           ` Michael S. Tsirkin
2013-05-29 10:07           ` Laszlo Ersek
2013-05-28 18:53     ` Anthony Liguori
2013-05-28 19:27       ` Michael S. Tsirkin
2013-05-29  4:31   ` Rusty Russell
2013-05-29  4:31     ` Rusty Russell
2013-05-29  8:24     ` Michael S. Tsirkin
2013-05-29  8:52       ` Paolo Bonzini
2013-05-29  9:00       ` Peter Maydell
2013-05-29 10:08         ` Michael S. Tsirkin
2013-05-29 10:53           ` Peter Maydell
2013-05-29 12:16             ` Michael S. Tsirkin
2013-05-29 12:28               ` Paolo Bonzini
2013-05-29 12:37                 ` Michael S. Tsirkin
2013-05-29 12:52     ` Anthony Liguori
2013-05-29 12:52       ` Anthony Liguori
2013-05-29 13:24       ` Michael S. Tsirkin
2013-05-29 13:35         ` Peter Maydell
2013-05-29 13:35         ` Peter Maydell
2013-05-29 13:41         ` Paolo Bonzini
2013-05-29 14:02           ` Michael S. Tsirkin
2013-05-29 14:18           ` Anthony Liguori
2013-05-29 14:18           ` Anthony Liguori
2013-05-30  7:43           ` Michael S. Tsirkin
2013-05-29 14:16         ` Anthony Liguori
2013-05-29 14:16         ` Anthony Liguori
2013-05-29 14:30           ` Michael S. Tsirkin
2013-05-29 14:32             ` Paolo Bonzini
2013-05-29 14:52               ` Michael S. Tsirkin
2013-05-29 14:55             ` Anthony Liguori
2013-05-29 16:12               ` Michael S. Tsirkin
2013-05-29 18:16               ` Michael S. Tsirkin
2013-05-29 14:55             ` Anthony Liguori
2013-05-30  3:58       ` Rusty Russell
2013-05-30  3:58         ` Rusty Russell
2013-05-30  5:55         ` Michael S. Tsirkin
2013-05-30  7:55         ` Michael S. Tsirkin
2013-06-03  0:17           ` Rusty Russell
2013-05-30 13:53         ` Anthony Liguori
2013-05-30 13:53           ` Anthony Liguori
2013-05-30 14:01           ` Michael S. Tsirkin
2013-06-03  0:26             ` Rusty Russell
2013-06-03 10:11               ` Michael S. Tsirkin
2013-06-04  5:31                 ` Rusty Russell
2013-06-04  6:42                   ` Michael S. Tsirkin
2013-06-05  7:19                     ` Rusty Russell
2013-06-05 10:22                       ` Michael S. Tsirkin
2013-06-05 12:59                     ` Anthony Liguori
2013-06-05 14:09                       ` Michael S. Tsirkin
2013-06-05 15:08                         ` Anthony Liguori
2013-06-05 15:19                           ` Michael S. Tsirkin
2013-06-05 15:46                             ` Anthony Liguori
2013-06-05 15:46                             ` Anthony Liguori
2013-06-05 16:20                               ` Michael S. Tsirkin
2013-06-05 18:57                                 ` Anthony Liguori
2013-06-05 19:43                                   ` Michael S. Tsirkin
2013-06-05 19:52                                     ` Michael S. Tsirkin
2013-06-05 20:45                                       ` Anthony Liguori
2013-06-05 21:15                                         ` H. Peter Anvin
2013-06-05 21:15                                         ` Michael S. Tsirkin
2013-06-05 20:42                                     ` Anthony Liguori
2013-06-05 21:14                                       ` Michael S. Tsirkin
2013-06-05 21:53                                         ` Anthony Liguori
2013-06-05 21:53                                         ` Anthony Liguori
2013-06-05 22:19                                           ` Benjamin Herrenschmidt
2013-06-05 22:53                                             ` Anthony Liguori
2013-06-05 23:27                                               ` Benjamin Herrenschmidt
2013-06-05 19:54                                   ` Michael S. Tsirkin
2013-06-06  3:42                                   ` Rusty Russell
2013-06-06 14:59                                     ` Anthony Liguori
2013-06-07  1:58                                       ` Rusty Russell
2013-06-07  8:25                                       ` Peter Maydell
2013-06-05 21:10                                 ` H. Peter Anvin
2013-06-05 21:17                                   ` Michael S. Tsirkin
2013-06-05 21:50                                   ` Anthony Liguori
2013-06-05 21:55                                     ` H. Peter Anvin
2013-06-05 22:08                                       ` Anthony Liguori
2013-06-05 23:07                                         ` H. Peter Anvin
2013-06-06  0:41                                           ` Anthony Liguori
2013-06-06  6:34                                             ` Gleb Natapov
2013-06-06 13:53                                               ` H. Peter Anvin
2013-06-06 15:02                                               ` Anthony Liguori
2013-06-07 11:30                                                 ` Gleb Natapov
2013-06-11  7:10                                                 ` Michael S. Tsirkin
2013-06-11  7:53                                                   ` Gleb Natapov
2013-06-11  8:02                                                     ` Michael S. Tsirkin
2013-06-11  8:03                                                       ` Gleb Natapov
2013-06-11  8:19                                                         ` Michael S. Tsirkin
2013-06-11  8:22                                                           ` Gleb Natapov
2013-06-11  8:30                                                             ` Michael S. Tsirkin
2013-06-11  8:32                                                               ` Gleb Natapov
2013-06-11  8:04                                                       ` Michael S. Tsirkin
2013-06-06 15:02                                               ` Anthony Liguori
2013-06-06 15:06                                               ` Gerd Hoffmann
2013-06-06 15:10                                                 ` Gleb Natapov
2013-06-06 15:19                                                   ` H. Peter Anvin
2013-06-06 15:22                                                   ` Gerd Hoffmann
2013-07-08  4:25                                                   ` Kevin O'Connor
2013-06-06  8:02                   ` Michael S. Tsirkin

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.