linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCHv3 RFC] virtio-pci: flexible configuration layout
@ 2011-11-22 18:36 Michael S. Tsirkin
  2011-11-23  2:32 ` Rusty Russell
  0 siblings, 1 reply; 20+ messages in thread
From: Michael S. Tsirkin @ 2011-11-22 18:36 UTC (permalink / raw)
  To: Rusty Russell
  Cc: Sasha Levin, lkml - Kernel Mailing List, Alexey Kardashevskiy,
	Amit Shah, Christian Borntraeger, Krishna Kumar, Pawel Moll,
	Wang Sheng-Hui, virtualization, kvm

Here's an updated vesion.
I'm alternating between updating the spec and the driver,
spec update to follow.

Compiled only.  Posting here for early feedback, and to allow Sasha to
proceed with his "kvm tool" work.

Changes from v2:
	address comments by Rusty
	bugfixes by Sasha
Changes from v1:
	Updated to match v3 of the spec, see:

todo:
	split core changes out

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/virtio/Kconfig      |   22 +++++
 drivers/virtio/virtio_pci.c |  203 ++++++++++++++++++++++++++++++++++++++++---
 include/asm-generic/io.h    |    4 +
 include/asm-generic/iomap.h |   11 +++
 include/linux/pci_regs.h    |    4 +
 include/linux/virtio_pci.h  |   41 +++++++++
 lib/iomap.c                 |   41 ++++++++-
 7 files changed, 307 insertions(+), 19 deletions(-)

diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig
index 816ed08..465245e 100644
--- a/drivers/virtio/Kconfig
+++ b/drivers/virtio/Kconfig
@@ -25,6 +25,28 @@ config VIRTIO_PCI
 
 	  If unsure, say M.
 
+config VIRTIO_PCI_LEGACY
+	bool "Compatibility with Legacy PIO"
+	default y
+	depends on VIRTIO_PCI
+	---help---
+	  Look out into your driveway.  Do you have a flying car?  If
+	  so, you can happily disable this option and virtio will not
+	  break.  Otherwise, leave it set.  Unless you're testing what
+	  life will be like in The Future.
+
+	  In other words:
+
+	  Support compatibility with legacy PIO mapping in hypervisors.
+	  As of Nov 2011, this is required by all hypervisors without
+	  exception, so for now, disabling this option is only useful for
+	  testing.  In future hypervisors, it will be possible to disable
+	  this option and get a slightly smaller kernel.
+	  You know that your hypervisor is new enough if you disable this
+	  option and the device initialization passes.
+
+	  If unsure, say Y.
+
 config VIRTIO_BALLOON
 	tristate "Virtio balloon driver (EXPERIMENTAL)"
 	select VIRTIO
diff --git a/drivers/virtio/virtio_pci.c b/drivers/virtio/virtio_pci.c
index 3d1bf41..681347b 100644
--- a/drivers/virtio/virtio_pci.c
+++ b/drivers/virtio/virtio_pci.c
@@ -37,8 +37,14 @@ struct virtio_pci_device
 	struct virtio_device vdev;
 	struct pci_dev *pci_dev;
 
-	/* the IO mapping for the PCI config space */
+	/* the IO address for the common PCI config space */
 	void __iomem *ioaddr;
+	/* the IO address for device specific config */
+	void __iomem *ioaddr_device;
+	/* the IO address to use for notifications operations */
+	void __iomem *ioaddr_notify;
+	/* the IO address to use for reading ISR */
+	void __iomem *ioaddr_isr;
 
 	/* a list of queues so we can dispatch IRQs */
 	spinlock_t lock;
@@ -57,8 +63,175 @@ struct virtio_pci_device
 	unsigned msix_used_vectors;
 	/* Whether we have vector per vq */
 	bool per_vq_vectors;
+
+	/* Various IO mappings: used for resource tracking only. */
+
+#ifdef CONFIG_VIRTIO_PCI_LEGACY
+	/* Legacy BAR0: typically PIO. */
+	void __iomem *legacy_map;
+#endif
+
+	/* Mappings specified by device capabilities: typically in MMIO */
+	void __iomem *isr_map;
+	void __iomem *notify_map;
+	void __iomem *common_map;
+	void __iomem *device_map;
 };
 
+#ifdef CONFIG_VIRTIO_PCI_LEGACY
+static void __iomem *virtio_pci_set_legacy_map(struct virtio_pci_device *vp_dev)
+{
+	return vp_dev->legacy_map = pci_iomap(vp_dev->pci_dev, 0, 256);
+}
+
+static void __iomem *virtio_pci_legacy_map(struct virtio_pci_device *vp_dev)
+{
+	return vp_dev->legacy_map;
+}
+#else
+static void __iomem *virtio_pci_set_legacy_map(struct virtio_pci_device *vp_dev)
+{
+	return NULL;
+}
+
+static void __iomem *virtio_pci_legacy_map(struct virtio_pci_device *vp_dev)
+{
+	return NULL;
+}
+#endif
+
+/*
+ * With PIO, device-specific config moves as MSI-X is enabled/disabled.
+ * Use this accessor to keep pointer to that config in sync.
+ */
+static void virtio_pci_set_msix_enabled(struct virtio_pci_device *vp_dev, int enabled)
+{
+	void __iomem* m;
+	vp_dev->msix_enabled = enabled;
+	m = virtio_pci_legacy_map(vp_dev);
+	if (m)
+		vp_dev->ioaddr_device = m + VIRTIO_PCI_CONFIG(vp_dev);
+	else
+		vp_dev->ioaddr_device = vp_dev->device_map;
+}
+
+static void __iomem *virtio_pci_map_cfg(struct virtio_pci_device *vp_dev, u8 cap_id,
+					u32 align)
+{
+        u32 size;
+        u32 offset;
+        u8 bir;
+        u8 cap_len;
+	int pos;
+	struct pci_dev *dev = vp_dev->pci_dev;
+	void __iomem *p;
+
+	for (pos = pci_find_capability(dev, PCI_CAP_ID_VNDR);
+	     pos > 0;
+	     pos = pci_find_next_capability(dev, pos, PCI_CAP_ID_VNDR)) {
+		u8 id;
+		pci_read_config_byte(dev, pos + PCI_VNDR_CAP_LEN, &cap_len);
+		if (cap_len < VIRTIO_PCI_CAP_ID + 1)
+			continue;
+		pci_read_config_byte(dev, pos + VIRTIO_PCI_CAP_ID, &id);
+		id >>= VIRTIO_PCI_CAP_ID_SHIFT;
+		id &= VIRTIO_PCI_CAP_ID_MASK;
+		if (id == cap_id)
+			break;
+	}
+
+	if (pos <= 0)
+		return NULL;
+
+	if (cap_len < VIRTIO_PCI_CAP_CFG_BIR + 1)
+		goto err;
+        pci_read_config_byte(dev, pos + VIRTIO_PCI_CAP_CFG_BIR, &bir);
+	if (cap_len < VIRTIO_PCI_CAP_CFG_OFF + 4)
+		goto err;
+        pci_read_config_dword(dev, pos + VIRTIO_PCI_CAP_CFG_OFF, &offset);
+	if (cap_len < VIRTIO_PCI_CAP_CFG_SIZE + 4)
+		goto err;
+        pci_read_config_dword(dev, pos + VIRTIO_PCI_CAP_CFG_SIZE, &size);
+        bir >>= VIRTIO_PCI_CAP_CFG_BIR_SHIFT;
+        bir &= VIRTIO_PCI_CAP_CFG_BIR_MASK;
+        size >>= VIRTIO_PCI_CAP_CFG_SIZE_SHIFT;
+        size &= VIRTIO_PCI_CAP_CFG_SIZE_MASK;
+        offset >>= VIRTIO_PCI_CAP_CFG_OFF_SHIFT;
+        offset &= VIRTIO_PCI_CAP_CFG_OFF_MASK;
+	/* Align offset appropriately */
+	offset &= ~(align - 1);
+
+	/* It's possible for a device to supply a huge config space,
+	 * but we'll never need to map more than a page ATM. */
+	p = pci_iomap_range(dev, bir, offset, size, PAGE_SIZE);
+	if (!p)
+		dev_err(&vp_dev->vdev.dev, "Unable to map virtio pci memory");
+	return p;
+err:
+	dev_err(&vp_dev->vdev.dev, "Unable to parse virtio pci capability");
+	return NULL;
+}
+
+static void virtio_pci_iounmap(struct virtio_pci_device *vp_dev)
+{
+	if (virtio_pci_legacy_map(vp_dev))
+		pci_iounmap(vp_dev->pci_dev, virtio_pci_legacy_map(vp_dev));
+	if (vp_dev->isr_map)
+		pci_iounmap(vp_dev->pci_dev, vp_dev->isr_map);
+	if (vp_dev->notify_map)
+		pci_iounmap(vp_dev->pci_dev, vp_dev->notify_map);
+	if (vp_dev->common_map)
+		pci_iounmap(vp_dev->pci_dev, vp_dev->common_map);
+	if (vp_dev->device_map)
+		pci_iounmap(vp_dev->pci_dev, vp_dev->device_map);
+}
+
+static int virtio_pci_iomap(struct virtio_pci_device *vp_dev)
+{
+	vp_dev->isr_map = virtio_pci_map_cfg(vp_dev,
+					     VIRTIO_PCI_CAP_ISR_CFG,
+					     sizeof(u8));
+	vp_dev->notify_map = virtio_pci_map_cfg(vp_dev,
+						VIRTIO_PCI_CAP_NOTIFY_CFG,
+						sizeof(u16));
+	vp_dev->common_map = virtio_pci_map_cfg(vp_dev,
+						VIRTIO_PCI_CAP_COMMON_CFG,
+						sizeof(u32));
+	vp_dev->device_map = virtio_pci_map_cfg(vp_dev,
+						VIRTIO_PCI_CAP_DEVICE_CFG,
+						sizeof(u8));
+
+	if (vp_dev->notify_map && vp_dev->isr_map &&
+	    vp_dev->common_map && vp_dev->device_map) {
+		vp_dev->ioaddr = vp_dev->common_map;
+		vp_dev->ioaddr_isr = vp_dev->isr_map;
+		vp_dev->ioaddr_notify = vp_dev->notify_map;
+		vp_dev->ioaddr_device = vp_dev->device_map;
+	} else {
+		void __iomem* m;
+		/*
+		 * If not all capabilities present, map legacy PIO.
+		 * Legacy access is at BAR 0. We never need to map
+		 * more than 256 bytes there, since legacy config space
+		 * used PIO which has this size limit.
+		 * */
+		m = virtio_pci_set_legacy_map(vp_dev);
+		if (!m) {
+			dev_err(&vp_dev->vdev.dev, "Unable to map legacy PIO");
+			goto err;
+		}
+		vp_dev->ioaddr = m;
+		vp_dev->ioaddr_isr = m + VIRTIO_PCI_ISR;
+		vp_dev->ioaddr_notify = m + VIRTIO_PCI_QUEUE_NOTIFY;
+		vp_dev->ioaddr_device = m + VIRTIO_PCI_CONFIG(vp_dev);
+	}
+
+	return 0;
+err:
+	virtio_pci_iounmap(vp_dev);
+	return -EINVAL;
+}
+
 /* Constants for MSI-X */
 /* Use first vector for configuration changes, second and the rest for
  * virtqueues Thus, we need at least 2 vectors for MSI. */
@@ -130,8 +303,7 @@ static void vp_get(struct virtio_device *vdev, unsigned offset,
 		   void *buf, unsigned len)
 {
 	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
-	void __iomem *ioaddr = vp_dev->ioaddr +
-				VIRTIO_PCI_CONFIG(vp_dev) + offset;
+	void __iomem *ioaddr = vp_dev->ioaddr_device + offset;
 	u8 *ptr = buf;
 	int i;
 
@@ -145,8 +317,7 @@ static void vp_set(struct virtio_device *vdev, unsigned offset,
 		   const void *buf, unsigned len)
 {
 	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
-	void __iomem *ioaddr = vp_dev->ioaddr +
-				VIRTIO_PCI_CONFIG(vp_dev) + offset;
+	void __iomem *ioaddr = vp_dev->ioaddr_device + offset;
 	const u8 *ptr = buf;
 	int i;
 
@@ -184,7 +355,7 @@ static void vp_notify(struct virtqueue *vq)
 
 	/* we write the queue's selector into the notification register to
 	 * signal the other end */
-	iowrite16(info->queue_index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_NOTIFY);
+	iowrite16(info->queue_index, vp_dev->ioaddr_notify);
 }
 
 /* Handle a configuration change: Tell driver if it wants to know. */
@@ -231,7 +402,8 @@ static irqreturn_t vp_interrupt(int irq, void *opaque)
 
 	/* reading the ISR has the effect of also clearing it so it's very
 	 * important to save off the value. */
-	isr = ioread8(vp_dev->ioaddr + VIRTIO_PCI_ISR);
+	isr = ioread8(vp_dev->ioaddr_notify +
+		      VIRTIO_PCI_ISR - VIRTIO_PCI_QUEUE_NOTIFY);
 
 	/* It's definitely not us if the ISR was not high */
 	if (!isr)
@@ -265,7 +437,7 @@ static void vp_free_vectors(struct virtio_device *vdev)
 		ioread16(vp_dev->ioaddr + VIRTIO_MSI_CONFIG_VECTOR);
 
 		pci_disable_msix(vp_dev->pci_dev);
-		vp_dev->msix_enabled = 0;
+                virtio_pci_set_msix_enabled(vp_dev, 0);
 		vp_dev->msix_vectors = 0;
 	}
 
@@ -303,7 +475,7 @@ static int vp_request_msix_vectors(struct virtio_device *vdev, int nvectors,
 	if (err)
 		goto error;
 	vp_dev->msix_vectors = nvectors;
-	vp_dev->msix_enabled = 1;
+        virtio_pci_set_msix_enabled(vp_dev, 1);
 
 	/* Set the vector used for configuration */
 	v = vp_dev->msix_used_vectors;
@@ -451,7 +623,10 @@ static void vp_del_vq(struct virtqueue *vq)
 		iowrite16(VIRTIO_MSI_NO_VECTOR,
 			  vp_dev->ioaddr + VIRTIO_MSI_QUEUE_VECTOR);
 		/* Flush the write out to device */
-		ioread8(vp_dev->ioaddr + VIRTIO_PCI_ISR);
+		ioread8(vp_dev->ioaddr + VIRTIO_MSI_QUEUE_VECTOR);
+		/* And clear ISR: TODO: really needed? */
+		ioread8(vp_dev->ioaddr_notify +
+		      VIRTIO_PCI_ISR - VIRTIO_PCI_QUEUE_NOTIFY);
 	}
 
 	vring_del_virtqueue(vq);
@@ -642,8 +817,8 @@ static int __devinit virtio_pci_probe(struct pci_dev *pci_dev,
 	if (err)
 		goto out_enable_device;
 
-	vp_dev->ioaddr = pci_iomap(pci_dev, 0, 0);
-	if (vp_dev->ioaddr == NULL)
+	err = virtio_pci_iomap(vp_dev);
+	if (err)
 		goto out_req_regions;
 
 	pci_set_drvdata(pci_dev, vp_dev);
@@ -665,7 +840,7 @@ static int __devinit virtio_pci_probe(struct pci_dev *pci_dev,
 
 out_set_drvdata:
 	pci_set_drvdata(pci_dev, NULL);
-	pci_iounmap(pci_dev, vp_dev->ioaddr);
+	virtio_pci_iounmap(vp_dev);
 out_req_regions:
 	pci_release_regions(pci_dev);
 out_enable_device:
@@ -683,7 +858,7 @@ static void __devexit virtio_pci_remove(struct pci_dev *pci_dev)
 
 	vp_del_vqs(&vp_dev->vdev);
 	pci_set_drvdata(pci_dev, NULL);
-	pci_iounmap(pci_dev, vp_dev->ioaddr);
+	virtio_pci_iounmap(vp_dev);
 	pci_release_regions(pci_dev);
 	pci_disable_device(pci_dev);
 	kfree(vp_dev);
diff --git a/include/asm-generic/io.h b/include/asm-generic/io.h
index 9120887..3cf1787 100644
--- a/include/asm-generic/io.h
+++ b/include/asm-generic/io.h
@@ -286,6 +286,10 @@ static inline void writesb(const void __iomem *addr, const void *buf, int len)
 /* Create a virtual mapping cookie for a PCI BAR (memory or IO) */
 struct pci_dev;
 extern void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long max);
+extern void __iomem *pci_iomap_range(struct pci_dev *dev, int bar,
+                                     unsigned offset,
+                                     unsigned long minlen,
+                                     unsigned long maxlen);
 static inline void pci_iounmap(struct pci_dev *dev, void __iomem *p)
 {
 }
diff --git a/include/asm-generic/iomap.h b/include/asm-generic/iomap.h
index 98dcd76..6f192d4 100644
--- a/include/asm-generic/iomap.h
+++ b/include/asm-generic/iomap.h
@@ -70,8 +70,19 @@ extern void ioport_unmap(void __iomem *);
 /* Create a virtual mapping cookie for a PCI BAR (memory or IO) */
 struct pci_dev;
 extern void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long max);
+extern void __iomem *pci_iomap_range(struct pci_dev *dev, int bar,
+                                     unsigned offset,
+                                     unsigned long minlen,
+                                     unsigned long maxlen);
 extern void pci_iounmap(struct pci_dev *dev, void __iomem *);
 #else
+static inline void __iomem *pci_iomap_range(struct pci_dev *dev, int bar,
+                                            unsigned offset,
+                                            unsigned long minlen,
+                                            unsigned long maxlen)
+{
+	return NULL;
+}
 struct pci_dev;
 static inline void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long max)
 {
diff --git a/include/linux/pci_regs.h b/include/linux/pci_regs.h
index b5d9657..3e7905c 100644
--- a/include/linux/pci_regs.h
+++ b/include/linux/pci_regs.h
@@ -375,6 +375,10 @@
 #define  PCI_X_STATUS_266MHZ	0x40000000	/* 266 MHz capable */
 #define  PCI_X_STATUS_533MHZ	0x80000000	/* 533 MHz capable */
 
+/* Vendor specific capability */
+#define PCI_VNDR_CAP_LEN	2	/* Capability length (8 bits), including
+					   bytes: ID, NEXT and LEN itself. */
+
 /* PCI Bridge Subsystem ID registers */
 
 #define PCI_SSVID_VENDOR_ID     4	/* PCI-Bridge subsystem vendor id register */
diff --git a/include/linux/virtio_pci.h b/include/linux/virtio_pci.h
index ea66f3f..d6568e7 100644
--- a/include/linux/virtio_pci.h
+++ b/include/linux/virtio_pci.h
@@ -92,4 +92,45 @@
 /* The alignment to use between consumer and producer parts of vring.
  * x86 pagesize again. */
 #define VIRTIO_PCI_VRING_ALIGN		4096
+
+/*
+ * Layout for Virtio PCI vendor specific capability (little-endian):
+ * 5 bit virtio capability id.
+ * 3 bit BAR index register, specifying which BAR to use.
+ * 4 byte cfg offset within the BAR.
+ * 4 byte cfg size.
+ */
+
+/* A single virtio device has multiple vendor specific capabilities, we use the
+ * 5 bit ID field to distinguish between these. */
+#define VIRTIO_PCI_CAP_ID		3
+#define VIRTIO_PCI_CAP_ID_MASK		0xff
+#define VIRTIO_PCI_CAP_ID_SHIFT		0
+
+/* IDs for different capabilities. If a specific configuration
+ * is missing, legacy PIO path is used. */
+/* Common configuration */
+#define VIRTIO_PCI_CAP_COMMON_CFG	1
+/* Notifications */
+#define VIRTIO_PCI_CAP_NOTIFY_CFG	2
+/* ISR access */
+#define VIRTIO_PCI_CAP_ISR_CFG		3
+/* Device specific confiuration */
+#define VIRTIO_PCI_CAP_DEVICE_CFG	4
+
+/* Index of the BAR including this configuration */
+#define VIRTIO_PCI_CAP_CFG_BIR		4
+#define VIRTIO_PCI_CAP_CFG_BIR_MASK	(0x7)
+#define VIRTIO_PCI_CAP_CFG_BIR_SHIFT	0
+
+/* Size of the configuration space */
+#define VIRTIO_PCI_CAP_CFG_SIZE		4
+#define VIRTIO_PCI_CAP_CFG_SIZE_MASK	0xffffff
+#define VIRTIO_PCI_CAP_CFG_SIZE_SHIFT	8
+
+/* Offset within the BAR */
+#define VIRTIO_PCI_CAP_CFG_OFF		8
+#define VIRTIO_PCI_CAP_CFG_OFF_MASK	0xffffffff
+#define VIRTIO_PCI_CAP_CFG_OFF_SHIFT	0
+
 #endif
diff --git a/lib/iomap.c b/lib/iomap.c
index 5dbcb4b..93ae915 100644
--- a/lib/iomap.c
+++ b/lib/iomap.c
@@ -243,26 +243,37 @@ EXPORT_SYMBOL(ioport_unmap);
 
 #ifdef CONFIG_PCI
 /**
- * pci_iomap - create a virtual mapping cookie for a PCI BAR
+ * pci_iomap_range - create a virtual mapping cookie for a PCI BAR
  * @dev: PCI device that owns the BAR
  * @bar: BAR number
- * @maxlen: length of the memory to map
+ * @offset: map memory at the given offset in BAR
+ * @minlen: min length of the memory to map
+ * @maxlen: max length of the memory to map
  *
  * Using this function you will get a __iomem address to your device BAR.
  * You can access it using ioread*() and iowrite*(). These functions hide
  * the details if this is a MMIO or PIO address space and will just do what
  * you expect from them in the correct way.
  *
+ * @minlen specifies the minimum length to map. We check that BAR is
+ * large enough.
  * @maxlen specifies the maximum length to map. If you want to get access to
- * the complete BAR without checking for its length first, pass %0 here.
+ * the complete BAR from offset to the end, pass %0 here.
  * */
-void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long maxlen)
+void __iomem *pci_iomap_range(struct pci_dev *dev, int bar,
+                              unsigned offset,
+                              unsigned long minlen,
+                              unsigned long maxlen)
 {
 	resource_size_t start = pci_resource_start(dev, bar);
 	resource_size_t len = pci_resource_len(dev, bar);
 	unsigned long flags = pci_resource_flags(dev, bar);
 
-	if (!len || !start)
+	if (len <= offset || !start)
+		return NULL;
+        len -= offset;
+	start += offset;
+        if (len < minlen)
 		return NULL;
 	if (maxlen && len > maxlen)
 		len = maxlen;
@@ -277,10 +288,30 @@ void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long maxlen)
 	return NULL;
 }
 
+/**
+ * pci_iomap - create a virtual mapping cookie for a PCI BAR
+ * @dev: PCI device that owns the BAR
+ * @bar: BAR number
+ * @maxlen: length of the memory to map
+ *
+ * Using this function you will get a __iomem address to your device BAR.
+ * You can access it using ioread*() and iowrite*(). These functions hide
+ * the details if this is a MMIO or PIO address space and will just do what
+ * you expect from them in the correct way.
+ *
+ * @maxlen specifies the maximum length to map. If you want to get access to
+ * the complete BAR without checking for its length first, pass %0 here.
+ * */
+void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long maxlen)
+{
+    return pci_iomap_range(dev, bar, 0, 0, maxlen);
+}
+
 void pci_iounmap(struct pci_dev *dev, void __iomem * addr)
 {
 	IO_COND(addr, /* nothing */, iounmap(addr));
 }
 EXPORT_SYMBOL(pci_iomap);
+EXPORT_SYMBOL(pci_iomap_range);
 EXPORT_SYMBOL(pci_iounmap);
 #endif /* CONFIG_PCI */
-- 
1.7.5.53.gc233e

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* Re: [PATCHv3 RFC] virtio-pci: flexible configuration layout
  2011-11-22 18:36 [PATCHv3 RFC] virtio-pci: flexible configuration layout Michael S. Tsirkin
@ 2011-11-23  2:32 ` Rusty Russell
  2011-11-23  8:46   ` Michael S. Tsirkin
                     ` (2 more replies)
  0 siblings, 3 replies; 20+ messages in thread
From: Rusty Russell @ 2011-11-23  2:32 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Sasha Levin, lkml - Kernel Mailing List, Alexey Kardashevskiy,
	Amit Shah, Christian Borntraeger, Krishna Kumar, Pawel Moll,
	Wang Sheng-Hui, virtualization, kvm

On Tue, 22 Nov 2011 20:36:22 +0200, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> Here's an updated vesion.
> I'm alternating between updating the spec and the driver,
> spec update to follow.

Don't touch the spec yet, we have a long way to go :(

I want the ability for driver to set the ring size, and the device to
set the alignment.  That's a bigger change than you have here.  I
imagine it almost rips the driver into two completely different drivers.

This is the kind of thing I had in mind, for the header.  Want me to
code up the rest?

(I've avoided adding the constants for the new layout: a struct is more
compact and more descriptive).

Cheers,
Rusty.

diff --git a/include/linux/virtio_pci.h b/include/linux/virtio_pci.h
--- a/include/linux/virtio_pci.h
+++ b/include/linux/virtio_pci.h
@@ -42,56 +42,74 @@
 #include <linux/virtio_config.h>
 
 /* A 32-bit r/o bitmask of the features supported by the host */
-#define VIRTIO_PCI_HOST_FEATURES	0
+#define VIRTIO_PCI_LEGACY_HOST_FEATURES		0
 
 /* A 32-bit r/w bitmask of features activated by the guest */
-#define VIRTIO_PCI_GUEST_FEATURES	4
+#define VIRTIO_PCI_LEGACY_GUEST_FEATURES	4
 
 /* A 32-bit r/w PFN for the currently selected queue */
-#define VIRTIO_PCI_QUEUE_PFN		8
+#define VIRTIO_PCI_LEGACY_QUEUE_PFN		8
 
 /* A 16-bit r/o queue size for the currently selected queue */
-#define VIRTIO_PCI_QUEUE_NUM		12
+#define VIRTIO_PCI_LEGACY_QUEUE_NUM		12
 
 /* A 16-bit r/w queue selector */
-#define VIRTIO_PCI_QUEUE_SEL		14
+#define VIRTIO_PCI_LEGACY_QUEUE_SEL		14
 
 /* A 16-bit r/w queue notifier */
-#define VIRTIO_PCI_QUEUE_NOTIFY		16
+#define VIRTIO_PCI_LEGACY_QUEUE_NOTIFY		16
 
 /* An 8-bit device status register.  */
-#define VIRTIO_PCI_STATUS		18
+#define VIRTIO_PCI_LEGACY_STATUS		18
 
 /* An 8-bit r/o interrupt status register.  Reading the value will return the
  * current contents of the ISR and will also clear it.  This is effectively
  * a read-and-acknowledge. */
-#define VIRTIO_PCI_ISR			19
-
-/* The bit of the ISR which indicates a device configuration change. */
-#define VIRTIO_PCI_ISR_CONFIG		0x2
+#define VIRTIO_PCI_LEGACY_ISR			19
 
 /* MSI-X registers: only enabled if MSI-X is enabled. */
 /* A 16-bit vector for configuration changes. */
-#define VIRTIO_MSI_CONFIG_VECTOR        20
+#define VIRTIO_MSI_LEGACY_CONFIG_VECTOR        20
 /* A 16-bit vector for selected queue notifications. */
-#define VIRTIO_MSI_QUEUE_VECTOR         22
-/* Vector value used to disable MSI for queue */
-#define VIRTIO_MSI_NO_VECTOR            0xffff
+#define VIRTIO_MSI_LEGACY_QUEUE_VECTOR         22
 
 /* The remaining space is defined by each driver as the per-driver
  * configuration space */
-#define VIRTIO_PCI_CONFIG(dev)		((dev)->msix_enabled ? 24 : 20)
+#define VIRTIO_PCI_LEGACY_CONFIG(dev)		((dev)->msix_enabled ? 24 : 20)
+
+/* How many bits to shift physical queue address written to QUEUE_PFN.
+ * 12 is historical, and due to x86 page size. */
+#define VIRTIO_PCI_LEGACY_QUEUE_ADDR_SHIFT	12
+
+/* The alignment to use between consumer and producer parts of vring.
+ * x86 pagesize again. */
+#define VIRTIO_PCI_LEGACY_VRING_ALIGN		4096
+
+#ifndef __KERNEL__
+/* Don't break compile of old userspace code.  These will go away. */
+#define VIRTIO_PCI_HOST_FEATURES VIRTIO_PCI_LEGACY_HOST_FEATURES
+#define VIRTIO_PCI_GUEST_FEATURES VIRTIO_PCI_LEGACY_GUEST_FEATURES
+#define VIRTIO_PCI_LEGACY_QUEUE_PFN VIRTIO_PCI_QUEUE_PFN
+#define VIRTIO_PCI_LEGACY_QUEUE_NUM VIRTIO_PCI_QUEUE_NUM
+#define VIRTIO_PCI_LEGACY_QUEUE_SEL VIRTIO_PCI_QUEUE_SEL
+#define VIRTIO_PCI_LEGACY_QUEUE_NOTIFY VIRTIO_PCI_QUEUE_NOTIFY
+#define VIRTIO_PCI_LEGACY_STATUS VIRTIO_PCI_STATUS
+#define VIRTIO_PCI_LEGACY_ISR VIRTIO_PCI_ISR
+#define VIRTIO_MSI_LEGACY_CONFIG_VECTOR VIRTIO_MSI_CONFIG_VECTOR
+#define VIRTIO_MSI_LEGACY_QUEUE_VECTOR VIRTIO_MSI_QUEUE_VECTOR
+#define VIRTIO_PCI_LEGACY_CONFIG(dev) VIRTIO_PCI_CONFIG(dev)
+#define VIRTIO_PCI_LEGACY_QUEUE_ADDR_SHIFT VIRTIO_PCI_QUEUE_ADDR_SHIFT
+#define VIRTIO_PCI_LEGACY_VRING_ALIGN VIRTIO_PCI_VRING_ALIGN
+#endif /* ...!KERNEL */
 
 /* Virtio ABI version, this must match exactly */
 #define VIRTIO_PCI_ABI_VERSION		0
 
-/* How many bits to shift physical queue address written to QUEUE_PFN.
- * 12 is historical, and due to x86 page size. */
-#define VIRTIO_PCI_QUEUE_ADDR_SHIFT	12
+/* Vector value used to disable MSI for queue */
+#define VIRTIO_MSI_NO_VECTOR            0xffff
 
-/* The alignment to use between consumer and producer parts of vring.
- * x86 pagesize again. */
-#define VIRTIO_PCI_VRING_ALIGN		4096
+/* The bit of the ISR which indicates a device configuration change. */
+#define VIRTIO_PCI_ISR_CONFIG		0x2
 
 /*
  * Layout for Virtio PCI vendor specific capability (little-endian):
@@ -133,4 +151,20 @@
 #define VIRTIO_PCI_CAP_CFG_OFF_MASK	0xffffffff
 #define VIRTIO_PCI_CAP_CFG_OFF_SHIFT	0
 
+/* Fields in VIRTIO_PCI_CAP_COMMON_CFG: */
+struct virtio_pci_common_cfg {
+	/* About the whole device. */
+	__u64 device_features;	/* read-only */
+	__u64 guest_features;	/* read-write */
+	__u64 queue_address;	/* read-write */
+	__u16 msix_config;	/* read-write */
+	__u8 device_status;	/* read-write */
+	__u8 unused;
+
+	/* About a specific virtqueue. */
+	__u16 queue_select;	/* read-write */
+	__u16 queue_align;	/* read-write, power of 2. */
+	__u16 queue_size;	/* read-write, power of 2. */
+	__u16 queue_msix_vector;/* read-write */
+};
 #endif




^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCHv3 RFC] virtio-pci: flexible configuration layout
  2011-11-23  2:32 ` Rusty Russell
@ 2011-11-23  8:46   ` Michael S. Tsirkin
  2011-11-23 15:34     ` Michael S. Tsirkin
  2011-11-24  0:36     ` Rusty Russell
  2011-11-23  8:49   ` Michael S. Tsirkin
  2011-11-23  9:44   ` Sasha Levin
  2 siblings, 2 replies; 20+ messages in thread
From: Michael S. Tsirkin @ 2011-11-23  8:46 UTC (permalink / raw)
  To: Rusty Russell
  Cc: Sasha Levin, lkml - Kernel Mailing List, Alexey Kardashevskiy,
	Amit Shah, Christian Borntraeger, Krishna Kumar, Pawel Moll,
	Wang Sheng-Hui, virtualization, kvm

On Wed, Nov 23, 2011 at 01:02:22PM +1030, Rusty Russell wrote:
> On Tue, 22 Nov 2011 20:36:22 +0200, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > Here's an updated vesion.
> > I'm alternating between updating the spec and the driver,
> > spec update to follow.
> 
> Don't touch the spec yet, we have a long way to go :(
> 
> I want the ability for driver to set the ring size, and the device to
> set the alignment.

Did you mean driver to be able to set the alignment? This
is what BIOS guys want - after BIOS completes, guest driver gets handed
control and sets its own alignment to save memory.

> That's a bigger change than you have here.

Why can't we just add the new registers at the end?
With the new capability, we have as much space as we like for that.

> I imagine it almost rips the driver into two completely different drivers.

If you insist on moving all the rest of registers around, certainly. But
why do this?

> This is the kind of thing I had in mind, for the header.  Want me to
> code up the rest?
> 
> (I've avoided adding the constants for the new layout: a struct is more
> compact and more descriptive).
> 
> Cheers,
> Rusty.

Renaming constants in exported headers will break userspace builds.
Do we care? Why not?

> diff --git a/include/linux/virtio_pci.h b/include/linux/virtio_pci.h
> --- a/include/linux/virtio_pci.h
> +++ b/include/linux/virtio_pci.h
> @@ -42,56 +42,74 @@
>  #include <linux/virtio_config.h>
>  
>  /* A 32-bit r/o bitmask of the features supported by the host */
> -#define VIRTIO_PCI_HOST_FEATURES	0
> +#define VIRTIO_PCI_LEGACY_HOST_FEATURES		0
>  
>  /* A 32-bit r/w bitmask of features activated by the guest */
> -#define VIRTIO_PCI_GUEST_FEATURES	4
> +#define VIRTIO_PCI_LEGACY_GUEST_FEATURES	4
>  
>  /* A 32-bit r/w PFN for the currently selected queue */
> -#define VIRTIO_PCI_QUEUE_PFN		8
> +#define VIRTIO_PCI_LEGACY_QUEUE_PFN		8
>  
>  /* A 16-bit r/o queue size for the currently selected queue */
> -#define VIRTIO_PCI_QUEUE_NUM		12
> +#define VIRTIO_PCI_LEGACY_QUEUE_NUM		12
>  
>  /* A 16-bit r/w queue selector */
> -#define VIRTIO_PCI_QUEUE_SEL		14
> +#define VIRTIO_PCI_LEGACY_QUEUE_SEL		14
>  
>  /* A 16-bit r/w queue notifier */
> -#define VIRTIO_PCI_QUEUE_NOTIFY		16
> +#define VIRTIO_PCI_LEGACY_QUEUE_NOTIFY		16
>  
>  /* An 8-bit device status register.  */
> -#define VIRTIO_PCI_STATUS		18
> +#define VIRTIO_PCI_LEGACY_STATUS		18
>  
>  /* An 8-bit r/o interrupt status register.  Reading the value will return the
>   * current contents of the ISR and will also clear it.  This is effectively
>   * a read-and-acknowledge. */
> -#define VIRTIO_PCI_ISR			19
> -
> -/* The bit of the ISR which indicates a device configuration change. */
> -#define VIRTIO_PCI_ISR_CONFIG		0x2
> +#define VIRTIO_PCI_LEGACY_ISR			19
>  
>  /* MSI-X registers: only enabled if MSI-X is enabled. */
>  /* A 16-bit vector for configuration changes. */
> -#define VIRTIO_MSI_CONFIG_VECTOR        20
> +#define VIRTIO_MSI_LEGACY_CONFIG_VECTOR        20
>  /* A 16-bit vector for selected queue notifications. */
> -#define VIRTIO_MSI_QUEUE_VECTOR         22
> -/* Vector value used to disable MSI for queue */
> -#define VIRTIO_MSI_NO_VECTOR            0xffff
> +#define VIRTIO_MSI_LEGACY_QUEUE_VECTOR         22
>  
>  /* The remaining space is defined by each driver as the per-driver
>   * configuration space */
> -#define VIRTIO_PCI_CONFIG(dev)		((dev)->msix_enabled ? 24 : 20)
> +#define VIRTIO_PCI_LEGACY_CONFIG(dev)		((dev)->msix_enabled ? 24 : 20)
> +
> +/* How many bits to shift physical queue address written to QUEUE_PFN.
> + * 12 is historical, and due to x86 page size. */
> +#define VIRTIO_PCI_LEGACY_QUEUE_ADDR_SHIFT	12
> +
> +/* The alignment to use between consumer and producer parts of vring.
> + * x86 pagesize again. */
> +#define VIRTIO_PCI_LEGACY_VRING_ALIGN		4096
> +
> +#ifndef __KERNEL__
> +/* Don't break compile of old userspace code.  These will go away. */
> +#define VIRTIO_PCI_HOST_FEATURES VIRTIO_PCI_LEGACY_HOST_FEATURES
> +#define VIRTIO_PCI_GUEST_FEATURES VIRTIO_PCI_LEGACY_GUEST_FEATURES
> +#define VIRTIO_PCI_LEGACY_QUEUE_PFN VIRTIO_PCI_QUEUE_PFN
> +#define VIRTIO_PCI_LEGACY_QUEUE_NUM VIRTIO_PCI_QUEUE_NUM
> +#define VIRTIO_PCI_LEGACY_QUEUE_SEL VIRTIO_PCI_QUEUE_SEL
> +#define VIRTIO_PCI_LEGACY_QUEUE_NOTIFY VIRTIO_PCI_QUEUE_NOTIFY
> +#define VIRTIO_PCI_LEGACY_STATUS VIRTIO_PCI_STATUS
> +#define VIRTIO_PCI_LEGACY_ISR VIRTIO_PCI_ISR
> +#define VIRTIO_MSI_LEGACY_CONFIG_VECTOR VIRTIO_MSI_CONFIG_VECTOR
> +#define VIRTIO_MSI_LEGACY_QUEUE_VECTOR VIRTIO_MSI_QUEUE_VECTOR
> +#define VIRTIO_PCI_LEGACY_CONFIG(dev) VIRTIO_PCI_CONFIG(dev)
> +#define VIRTIO_PCI_LEGACY_QUEUE_ADDR_SHIFT VIRTIO_PCI_QUEUE_ADDR_SHIFT
> +#define VIRTIO_PCI_LEGACY_VRING_ALIGN VIRTIO_PCI_VRING_ALIGN
> +#endif /* ...!KERNEL */
>  
>  /* Virtio ABI version, this must match exactly */
>  #define VIRTIO_PCI_ABI_VERSION		0
>  
> -/* How many bits to shift physical queue address written to QUEUE_PFN.
> - * 12 is historical, and due to x86 page size. */
> -#define VIRTIO_PCI_QUEUE_ADDR_SHIFT	12
> +/* Vector value used to disable MSI for queue */
> +#define VIRTIO_MSI_NO_VECTOR            0xffff
>  
> -/* The alignment to use between consumer and producer parts of vring.
> - * x86 pagesize again. */
> -#define VIRTIO_PCI_VRING_ALIGN		4096
> +/* The bit of the ISR which indicates a device configuration change. */
> +#define VIRTIO_PCI_ISR_CONFIG		0x2
>  
>  /*
>   * Layout for Virtio PCI vendor specific capability (little-endian):
> @@ -133,4 +151,20 @@
>  #define VIRTIO_PCI_CAP_CFG_OFF_MASK	0xffffffff
>  #define VIRTIO_PCI_CAP_CFG_OFF_SHIFT	0
>  
> +/* Fields in VIRTIO_PCI_CAP_COMMON_CFG: */
> +struct virtio_pci_common_cfg {
> +	/* About the whole device. */
> +	__u64 device_features;	/* read-only */
> +	__u64 guest_features;	/* read-write */
> +	__u64 queue_address;	/* read-write */
> +	__u16 msix_config;	/* read-write */
> +	__u8 device_status;	/* read-write */
> +	__u8 unused;
> +
> +	/* About a specific virtqueue. */
> +	__u16 queue_select;	/* read-write */
> +	__u16 queue_align;	/* read-write, power of 2. */
> +	__u16 queue_size;	/* read-write, power of 2. */
> +	__u16 queue_msix_vector;/* read-write */
> +};

Slightly confusing as the registers are in fact little endian ...

>  #endif
> 
> 

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCHv3 RFC] virtio-pci: flexible configuration layout
  2011-11-23  2:32 ` Rusty Russell
  2011-11-23  8:46   ` Michael S. Tsirkin
@ 2011-11-23  8:49   ` Michael S. Tsirkin
  2011-11-23  9:38     ` Sasha Levin
  2011-11-23  9:44   ` Sasha Levin
  2 siblings, 1 reply; 20+ messages in thread
From: Michael S. Tsirkin @ 2011-11-23  8:49 UTC (permalink / raw)
  To: Rusty Russell
  Cc: Sasha Levin, lkml - Kernel Mailing List, Alexey Kardashevskiy,
	Amit Shah, Christian Borntraeger, Krishna Kumar, Pawel Moll,
	Wang Sheng-Hui, virtualization, kvm

On Wed, Nov 23, 2011 at 01:02:22PM +1030, Rusty Russell wrote:
> +/* Fields in VIRTIO_PCI_CAP_COMMON_CFG: */
> +struct virtio_pci_common_cfg {
> +	/* About the whole device. */
> +	__u64 device_features;	/* read-only */
> +	__u64 guest_features;	/* read-write */

We currently require atomic accesses to common fields.
Some architectures might not have such for 64 bit,
so these need to be split I think ...

> +	__u64 queue_address;	/* read-write */
> +	__u16 msix_config;	/* read-write */
> +	__u8 device_status;	/* read-write */
> +	__u8 unused;
> +
> +	/* About a specific virtqueue. */
> +	__u16 queue_select;	/* read-write */
> +	__u16 queue_align;	/* read-write, power of 2. */
> +	__u16 queue_size;	/* read-write, power of 2. */
> +	__u16 queue_msix_vector;/* read-write */
> +};
>  #endif
> 
> 

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCHv3 RFC] virtio-pci: flexible configuration layout
  2011-11-23  8:49   ` Michael S. Tsirkin
@ 2011-11-23  9:38     ` Sasha Levin
  2011-11-24  1:07       ` Rusty Russell
  0 siblings, 1 reply; 20+ messages in thread
From: Sasha Levin @ 2011-11-23  9:38 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Rusty Russell, lkml - Kernel Mailing List, Alexey Kardashevskiy,
	Amit Shah, Christian Borntraeger, Krishna Kumar, Pawel Moll,
	Wang Sheng-Hui, virtualization, kvm

On Wed, 2011-11-23 at 10:49 +0200, Michael S. Tsirkin wrote:
> On Wed, Nov 23, 2011 at 01:02:22PM +1030, Rusty Russell wrote:
> > +/* Fields in VIRTIO_PCI_CAP_COMMON_CFG: */
> > +struct virtio_pci_common_cfg {
> > +	/* About the whole device. */
> > +	__u64 device_features;	/* read-only */
> > +	__u64 guest_features;	/* read-write */
> 
> We currently require atomic accesses to common fields.
> Some architectures might not have such for 64 bit,
> so these need to be split I think ...

We can consider stealing the feature implementation from virtio-mmio:
Use the first 32 bits as a selector and the last as the features
themselves.

It's more complex to work with, but it provides 2**32 32 feature bits
(which should be enough for a while) and it solves the atomic access
issue.

> > +	__u64 queue_address;	/* read-write */
> > +	__u16 msix_config;	/* read-write */
> > +	__u8 device_status;	/* read-write */
> > +	__u8 unused;
> > +
> > +	/* About a specific virtqueue. */
> > +	__u16 queue_select;	/* read-write */
> > +	__u16 queue_align;	/* read-write, power of 2. */
> > +	__u16 queue_size;	/* read-write, power of 2. */
> > +	__u16 queue_msix_vector;/* read-write */
> > +};
> >  #endif
> > 
> > 

-- 

Sasha.


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCHv3 RFC] virtio-pci: flexible configuration layout
  2011-11-23  2:32 ` Rusty Russell
  2011-11-23  8:46   ` Michael S. Tsirkin
  2011-11-23  8:49   ` Michael S. Tsirkin
@ 2011-11-23  9:44   ` Sasha Levin
  2 siblings, 0 replies; 20+ messages in thread
From: Sasha Levin @ 2011-11-23  9:44 UTC (permalink / raw)
  To: Rusty Russell
  Cc: Michael S. Tsirkin, lkml - Kernel Mailing List,
	Alexey Kardashevskiy, Amit Shah, Christian Borntraeger,
	Krishna Kumar, Pawel Moll, Wang Sheng-Hui, virtualization, kvm

On Wed, 2011-11-23 at 13:02 +1030, Rusty Russell wrote:
> On Tue, 22 Nov 2011 20:36:22 +0200, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > Here's an updated vesion.
> > I'm alternating between updating the spec and the driver,
> > spec update to follow.
> 
> Don't touch the spec yet, we have a long way to go :(
> 
> I want the ability for driver to set the ring size, and the device to
> set the alignment.  That's a bigger change than you have here.  I
> imagine it almost rips the driver into two completely different drivers.
> 
> This is the kind of thing I had in mind, for the header.  Want me to
> code up the rest?
> 
> (I've avoided adding the constants for the new layout: a struct is more
> compact and more descriptive).
> 
> Cheers,
> Rusty.
> 
> diff --git a/include/linux/virtio_pci.h b/include/linux/virtio_pci.h
> --- a/include/linux/virtio_pci.h
> +++ b/include/linux/virtio_pci.h
> @@ -42,56 +42,74 @@
>  #include <linux/virtio_config.h>
>  
>  /* A 32-bit r/o bitmask of the features supported by the host */
> -#define VIRTIO_PCI_HOST_FEATURES	0
> +#define VIRTIO_PCI_LEGACY_HOST_FEATURES		0
>  
>  /* A 32-bit r/w bitmask of features activated by the guest */
> -#define VIRTIO_PCI_GUEST_FEATURES	4
> +#define VIRTIO_PCI_LEGACY_GUEST_FEATURES	4
>  
>  /* A 32-bit r/w PFN for the currently selected queue */
> -#define VIRTIO_PCI_QUEUE_PFN		8
> +#define VIRTIO_PCI_LEGACY_QUEUE_PFN		8
>  
>  /* A 16-bit r/o queue size for the currently selected queue */
> -#define VIRTIO_PCI_QUEUE_NUM		12
> +#define VIRTIO_PCI_LEGACY_QUEUE_NUM		12
>  
>  /* A 16-bit r/w queue selector */
> -#define VIRTIO_PCI_QUEUE_SEL		14
> +#define VIRTIO_PCI_LEGACY_QUEUE_SEL		14
>  
>  /* A 16-bit r/w queue notifier */
> -#define VIRTIO_PCI_QUEUE_NOTIFY		16
> +#define VIRTIO_PCI_LEGACY_QUEUE_NOTIFY		16
>  
>  /* An 8-bit device status register.  */
> -#define VIRTIO_PCI_STATUS		18
> +#define VIRTIO_PCI_LEGACY_STATUS		18
>  
>  /* An 8-bit r/o interrupt status register.  Reading the value will return the
>   * current contents of the ISR and will also clear it.  This is effectively
>   * a read-and-acknowledge. */
> -#define VIRTIO_PCI_ISR			19
> -
> -/* The bit of the ISR which indicates a device configuration change. */
> -#define VIRTIO_PCI_ISR_CONFIG		0x2
> +#define VIRTIO_PCI_LEGACY_ISR			19
>  
>  /* MSI-X registers: only enabled if MSI-X is enabled. */
>  /* A 16-bit vector for configuration changes. */
> -#define VIRTIO_MSI_CONFIG_VECTOR        20
> +#define VIRTIO_MSI_LEGACY_CONFIG_VECTOR        20
>  /* A 16-bit vector for selected queue notifications. */
> -#define VIRTIO_MSI_QUEUE_VECTOR         22
> -/* Vector value used to disable MSI for queue */
> -#define VIRTIO_MSI_NO_VECTOR            0xffff
> +#define VIRTIO_MSI_LEGACY_QUEUE_VECTOR         22
>  
>  /* The remaining space is defined by each driver as the per-driver
>   * configuration space */
> -#define VIRTIO_PCI_CONFIG(dev)		((dev)->msix_enabled ? 24 : 20)
> +#define VIRTIO_PCI_LEGACY_CONFIG(dev)		((dev)->msix_enabled ? 24 : 20)
> +
> +/* How many bits to shift physical queue address written to QUEUE_PFN.
> + * 12 is historical, and due to x86 page size. */
> +#define VIRTIO_PCI_LEGACY_QUEUE_ADDR_SHIFT	12
> +
> +/* The alignment to use between consumer and producer parts of vring.
> + * x86 pagesize again. */
> +#define VIRTIO_PCI_LEGACY_VRING_ALIGN		4096
> +
> +#ifndef __KERNEL__
> +/* Don't break compile of old userspace code.  These will go away. */

Maybe wrap it in:
#ifndef VIRTIO_PCI_NO_LEGACY

#warning Please stop using old defines

[...]

> +#define VIRTIO_PCI_HOST_FEATURES VIRTIO_PCI_LEGACY_HOST_FEATURES
> +#define VIRTIO_PCI_GUEST_FEATURES VIRTIO_PCI_LEGACY_GUEST_FEATURES
> +#define VIRTIO_PCI_LEGACY_QUEUE_PFN VIRTIO_PCI_QUEUE_PFN
> +#define VIRTIO_PCI_LEGACY_QUEUE_NUM VIRTIO_PCI_QUEUE_NUM
> +#define VIRTIO_PCI_LEGACY_QUEUE_SEL VIRTIO_PCI_QUEUE_SEL
> +#define VIRTIO_PCI_LEGACY_QUEUE_NOTIFY VIRTIO_PCI_QUEUE_NOTIFY
> +#define VIRTIO_PCI_LEGACY_STATUS VIRTIO_PCI_STATUS
> +#define VIRTIO_PCI_LEGACY_ISR VIRTIO_PCI_ISR
> +#define VIRTIO_MSI_LEGACY_CONFIG_VECTOR VIRTIO_MSI_CONFIG_VECTOR
> +#define VIRTIO_MSI_LEGACY_QUEUE_VECTOR VIRTIO_MSI_QUEUE_VECTOR
> +#define VIRTIO_PCI_LEGACY_CONFIG(dev) VIRTIO_PCI_CONFIG(dev)
> +#define VIRTIO_PCI_LEGACY_QUEUE_ADDR_SHIFT VIRTIO_PCI_QUEUE_ADDR_SHIFT
> +#define VIRTIO_PCI_LEGACY_VRING_ALIGN VIRTIO_PCI_VRING_ALIGN

#endif /* VIRTIO_PCI_NO_LEGACY */

> +#endif /* ...!KERNEL */
>  
>  /* Virtio ABI version, this must match exactly */
>  #define VIRTIO_PCI_ABI_VERSION		0
>  
> -/* How many bits to shift physical queue address written to QUEUE_PFN.
> - * 12 is historical, and due to x86 page size. */
> -#define VIRTIO_PCI_QUEUE_ADDR_SHIFT	12
> +/* Vector value used to disable MSI for queue */
> +#define VIRTIO_MSI_NO_VECTOR            0xffff
>  
> -/* The alignment to use between consumer and producer parts of vring.
> - * x86 pagesize again. */
> -#define VIRTIO_PCI_VRING_ALIGN		4096
> +/* The bit of the ISR which indicates a device configuration change. */
> +#define VIRTIO_PCI_ISR_CONFIG		0x2
>  
>  /*
>   * Layout for Virtio PCI vendor specific capability (little-endian):
> @@ -133,4 +151,20 @@
>  #define VIRTIO_PCI_CAP_CFG_OFF_MASK	0xffffffff
>  #define VIRTIO_PCI_CAP_CFG_OFF_SHIFT	0
>  
> +/* Fields in VIRTIO_PCI_CAP_COMMON_CFG: */
> +struct virtio_pci_common_cfg {
> +	/* About the whole device. */
> +	__u64 device_features;	/* read-only */
> +	__u64 guest_features;	/* read-write */
> +	__u64 queue_address;	/* read-write */

queue address should be in the queue specific fields below

> +	__u16 msix_config;	/* read-write */
> +	__u8 device_status;	/* read-write */
> +	__u8 unused;
> +
> +	/* About a specific virtqueue. */
> +	__u16 queue_select;	/* read-write */
> +	__u16 queue_align;	/* read-write, power of 2. */
> +	__u16 queue_size;	/* read-write, power of 2. */
> +	__u16 queue_msix_vector;/* read-write */

Maybe we should reserve a bunch of bytes here for future extensions of
virtqueues. Otherwise, non virtqueue options may be added here which
will cause fragmentation of virtqueue specific features.

> +};
>  #endif
> 
> 
> 

-- 

Sasha.


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCHv3 RFC] virtio-pci: flexible configuration layout
  2011-11-23  8:46   ` Michael S. Tsirkin
@ 2011-11-23 15:34     ` Michael S. Tsirkin
  2011-11-24  0:36     ` Rusty Russell
  1 sibling, 0 replies; 20+ messages in thread
From: Michael S. Tsirkin @ 2011-11-23 15:34 UTC (permalink / raw)
  To: Rusty Russell
  Cc: Sasha Levin, lkml - Kernel Mailing List, Alexey Kardashevskiy,
	Amit Shah, Christian Borntraeger, Krishna Kumar, Pawel Moll,
	Wang Sheng-Hui, virtualization, kvm

On Wed, Nov 23, 2011 at 10:46:40AM +0200, Michael S. Tsirkin wrote:
> On Wed, Nov 23, 2011 at 01:02:22PM +1030, Rusty Russell wrote:
> > On Tue, 22 Nov 2011 20:36:22 +0200, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > > Here's an updated vesion.
> > > I'm alternating between updating the spec and the driver,
> > > spec update to follow.
> > 
> > Don't touch the spec yet, we have a long way to go :(
> > 
> > I want the ability for driver to set the ring size, and the device to
> > set the alignment.
> 
> Did you mean driver to be able to set the alignment? This
> is what BIOS guys want - after BIOS completes, guest driver gets handed
> control and sets its own alignment to save memory.
> 
> > That's a bigger change than you have here.
> 
> Why can't we just add the new registers at the end?
> With the new capability, we have as much space as we like for that.

Didn't have the time to finish the patch today, but just to clarify,
we can apply something like the below on top of my patch,
and then config_len can be checked to see whether the new fields
like alignment are available.

We probably can make the alignment smaller and save some memory -
the default value could set the default alignment.

Ring size, naturally, can just be made writeable - BIOS can put a
small value there, but linux guest would just use the defaults
so no need for any code changes at all.

As a bonus, the capability length is verified to be large enough.
The change's pretty small, isn't it?

diff --git a/drivers/virtio/virtio_pci.c b/drivers/virtio/virtio_pci.c
index 681347b..39433d3 100644
--- a/drivers/virtio/virtio_pci.c
+++ b/drivers/virtio/virtio_pci.c
@@ -37,8 +37,9 @@ struct virtio_pci_device
 	struct virtio_device vdev;
 	struct pci_dev *pci_dev;
 
-	/* the IO address for the common PCI config space */
+	/* the IO address and length for the common PCI config space */
 	void __iomem *ioaddr;
+	unsigned config_len;
 	/* the IO address for device specific config */
 	void __iomem *ioaddr_device;
 	/* the IO address to use for notifications operations */
@@ -101,22 +102,24 @@ static void __iomem *virtio_pci_legacy_map(struct virtio_pci_device *vp_dev)
 #endif
 
 /*
- * With PIO, device-specific config moves as MSI-X is enabled/disabled.
- * Use this accessor to keep pointer to that config in sync.
+ * With PIO, device-specific config moves as MSI-X is enabled/disabled,
+ * configuration region length changes as well.  Use this accessor to keep
+ * pointer to that config and common config length, in sync.
  */
 static void virtio_pci_set_msix_enabled(struct virtio_pci_device *vp_dev, int enabled)
 {
 	void __iomem* m;
 	vp_dev->msix_enabled = enabled;
 	m = virtio_pci_legacy_map(vp_dev);
-	if (m)
+	if (m) {
 		vp_dev->ioaddr_device = m + VIRTIO_PCI_CONFIG(vp_dev);
-	else
+		vp_dev->config_len = VIRTIO_PCI_CONFIG(vp_dev);
+	} else
 		vp_dev->ioaddr_device = vp_dev->device_map;
 }
 
 static void __iomem *virtio_pci_map_cfg(struct virtio_pci_device *vp_dev, u8 cap_id,
-					u32 align)
+					u32 align, unsigned *lenp)
 {
         u32 size;
         u32 offset;
@@ -160,12 +163,16 @@ static void __iomem *virtio_pci_map_cfg(struct virtio_pci_device *vp_dev, u8 cap
         offset &= VIRTIO_PCI_CAP_CFG_OFF_MASK;
 	/* Align offset appropriately */
 	offset &= ~(align - 1);
+	if (lenp && size < *lenp)
+		goto err;
 
 	/* It's possible for a device to supply a huge config space,
 	 * but we'll never need to map more than a page ATM. */
 	p = pci_iomap_range(dev, bir, offset, size, PAGE_SIZE);
 	if (!p)
 		dev_err(&vp_dev->vdev.dev, "Unable to map virtio pci memory");
+	if (lenp)
+		*lenp = min(size, PAGE_SIZE);
 	return p;
 err:
 	dev_err(&vp_dev->vdev.dev, "Unable to parse virtio pci capability");
@@ -188,22 +195,24 @@ static void virtio_pci_iounmap(struct virtio_pci_device *vp_dev)
 
 static int virtio_pci_iomap(struct virtio_pci_device *vp_dev)
 {
+	unsigned common_len = VIRTIO_PCI_COMMON_CFG_MINSIZE;
 	vp_dev->isr_map = virtio_pci_map_cfg(vp_dev,
 					     VIRTIO_PCI_CAP_ISR_CFG,
-					     sizeof(u8));
+					     sizeof(u8), NULL);
 	vp_dev->notify_map = virtio_pci_map_cfg(vp_dev,
 						VIRTIO_PCI_CAP_NOTIFY_CFG,
-						sizeof(u16));
+						sizeof(u16), NULL);
 	vp_dev->common_map = virtio_pci_map_cfg(vp_dev,
 						VIRTIO_PCI_CAP_COMMON_CFG,
-						sizeof(u32));
+						sizeof(u32), &common_len);
 	vp_dev->device_map = virtio_pci_map_cfg(vp_dev,
 						VIRTIO_PCI_CAP_DEVICE_CFG,
-						sizeof(u8));
+						sizeof(u8), NULL);
 
 	if (vp_dev->notify_map && vp_dev->isr_map &&
 	    vp_dev->common_map && vp_dev->device_map) {
 		vp_dev->ioaddr = vp_dev->common_map;
+		vp_dev->config_len = common_len;
 		vp_dev->ioaddr_isr = vp_dev->isr_map;
 		vp_dev->ioaddr_notify = vp_dev->notify_map;
 		vp_dev->ioaddr_device = vp_dev->device_map;
@@ -221,6 +230,7 @@ static int virtio_pci_iomap(struct virtio_pci_device *vp_dev)
 			goto err;
 		}
 		vp_dev->ioaddr = m;
+		vp_dev->config_len = VIRTIO_PCI_CONFIG(vp_dev);
 		vp_dev->ioaddr_isr = m + VIRTIO_PCI_ISR;
 		vp_dev->ioaddr_notify = m + VIRTIO_PCI_QUEUE_NOTIFY;
 		vp_dev->ioaddr_device = m + VIRTIO_PCI_CONFIG(vp_dev);
diff --git a/include/linux/virtio_pci.h b/include/linux/virtio_pci.h
index d6568e7..8133851 100644
--- a/include/linux/virtio_pci.h
+++ b/include/linux/virtio_pci.h
@@ -133,4 +133,6 @@
 #define VIRTIO_PCI_CAP_CFG_OFF_MASK	0xffffffff
 #define VIRTIO_PCI_CAP_CFG_OFF_SHIFT	0
 
+#define VIRTIO_PCI_COMMON_CFG_MINSIZE	24
+
 #endif

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* Re: [PATCHv3 RFC] virtio-pci: flexible configuration layout
  2011-11-23  8:46   ` Michael S. Tsirkin
  2011-11-23 15:34     ` Michael S. Tsirkin
@ 2011-11-24  0:36     ` Rusty Russell
  2011-11-24  6:24       ` Michael S. Tsirkin
  2011-11-24  7:11       ` Michael S. Tsirkin
  1 sibling, 2 replies; 20+ messages in thread
From: Rusty Russell @ 2011-11-24  0:36 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Sasha Levin, lkml - Kernel Mailing List, Alexey Kardashevskiy,
	Amit Shah, Christian Borntraeger, Krishna Kumar, Pawel Moll,
	Wang Sheng-Hui, virtualization, kvm

On Wed, 23 Nov 2011 10:46:41 +0200, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> On Wed, Nov 23, 2011 at 01:02:22PM +1030, Rusty Russell wrote:
> > On Tue, 22 Nov 2011 20:36:22 +0200, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > > Here's an updated vesion.
> > > I'm alternating between updating the spec and the driver,
> > > spec update to follow.
> > 
> > Don't touch the spec yet, we have a long way to go :(
> > 
> > I want the ability for driver to set the ring size, and the device to
> > set the alignment.
> 
> Did you mean driver to be able to set the alignment? This
> is what BIOS guys want - after BIOS completes, guest driver gets handed
> control and sets its own alignment to save memory.

Yep, sorry.

But we really do want the guest to set the ring size.  Because it has to
be guest-physical-contiguous, the host currently sets a very small ring,
because the guest is useless if it can't allocate.

Either way, it's now the driver's responsibility to write those fields.

> > That's a bigger change than you have here.
> 
> Why can't we just add the new registers at the end?
> With the new capability, we have as much space as we like for that.

We could, for sure.

> > I imagine it almost rips the driver into two completely different drivers.
> 
> If you insist on moving all the rest of registers around, certainly. But
> why do this?

Because I suspect we'll be different enough anyway, once we change the
way we allocate the ring, and write the alignment.  It'll be *clearer*
to have two completely separate paths than to fill with if() statements.
And a rewrite won't hurt the driver.

But to be honest I don't really care about the Linux driver: we're
steeped in this stuff and we'll get it right.  But I'm *terrified* of
making the spec more complex; implementations will get it wrong.  I
*really* want to banish the legacy stuff to an appendix where noone will
ever know it's there :)

> Renaming constants in exported headers will break userspace builds.
> Do we care? Why not?

As the patch shows, I decided not to do that.  It's a nice heads-up, but
breaking older versions of the code is just mean.  Hence this:

> > +#ifndef __KERNEL__
> > +/* Don't break compile of old userspace code.  These will go away. */
> > +#define VIRTIO_PCI_HOST_FEATURES VIRTIO_PCI_LEGACY_HOST_FEATURES
> > +#define VIRTIO_PCI_GUEST_FEATURES VIRTIO_PCI_LEGACY_GUEST_FEATURES
> > +#define VIRTIO_PCI_LEGACY_QUEUE_PFN VIRTIO_PCI_QUEUE_PFN
> > +#define VIRTIO_PCI_LEGACY_QUEUE_NUM VIRTIO_PCI_QUEUE_NUM
> > +#define VIRTIO_PCI_LEGACY_QUEUE_SEL VIRTIO_PCI_QUEUE_SEL
> > +#define VIRTIO_PCI_LEGACY_QUEUE_NOTIFY VIRTIO_PCI_QUEUE_NOTIFY
> > +#define VIRTIO_PCI_LEGACY_STATUS VIRTIO_PCI_STATUS
> > +#define VIRTIO_PCI_LEGACY_ISR VIRTIO_PCI_ISR
> > +#define VIRTIO_MSI_LEGACY_CONFIG_VECTOR VIRTIO_MSI_CONFIG_VECTOR
> > +#define VIRTIO_MSI_LEGACY_QUEUE_VECTOR VIRTIO_MSI_QUEUE_VECTOR
> > +#define VIRTIO_PCI_LEGACY_CONFIG(dev) VIRTIO_PCI_CONFIG(dev)
> > +#define VIRTIO_PCI_LEGACY_QUEUE_ADDR_SHIFT VIRTIO_PCI_QUEUE_ADDR_SHIFT
> > +#define VIRTIO_PCI_LEGACY_VRING_ALIGN VIRTIO_PCI_VRING_ALIGN
> > +#endif /* ...!KERNEL */

...
> > +/* Fields in VIRTIO_PCI_CAP_COMMON_CFG: */
> > +struct virtio_pci_common_cfg {
> > +	/* About the whole device. */
> > +	__u64 device_features;	/* read-only */
> > +	__u64 guest_features;	/* read-write */
> > +	__u64 queue_address;	/* read-write */
> > +	__u16 msix_config;	/* read-write */
> > +	__u8 device_status;	/* read-write */
> > +	__u8 unused;
> > +
> > +	/* About a specific virtqueue. */
> > +	__u16 queue_select;	/* read-write */
> > +	__u16 queue_align;	/* read-write, power of 2. */
> > +	__u16 queue_size;	/* read-write, power of 2. */
> > +	__u16 queue_msix_vector;/* read-write */
> > +};
> 
> Slightly confusing as the registers are in fact little endian ...

Good point, should mark them appropriately with __le16.  That makes it
even clearer.

Thanks,
Rusty.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCHv3 RFC] virtio-pci: flexible configuration layout
  2011-11-23  9:38     ` Sasha Levin
@ 2011-11-24  1:07       ` Rusty Russell
  0 siblings, 0 replies; 20+ messages in thread
From: Rusty Russell @ 2011-11-24  1:07 UTC (permalink / raw)
  To: Sasha Levin, Michael S. Tsirkin
  Cc: lkml - Kernel Mailing List, Alexey Kardashevskiy, Amit Shah,
	Christian Borntraeger, Krishna Kumar, Pawel Moll, Wang Sheng-Hui,
	virtualization, kvm

On Wed, 23 Nov 2011 11:38:40 +0200, Sasha Levin <levinsasha928@gmail.com> wrote:
> On Wed, 2011-11-23 at 10:49 +0200, Michael S. Tsirkin wrote:
> > On Wed, Nov 23, 2011 at 01:02:22PM +1030, Rusty Russell wrote:
> > > +/* Fields in VIRTIO_PCI_CAP_COMMON_CFG: */
> > > +struct virtio_pci_common_cfg {
> > > +	/* About the whole device. */
> > > +	__u64 device_features;	/* read-only */
> > > +	__u64 guest_features;	/* read-write */
> > 
> > We currently require atomic accesses to common fields.
> > Some architectures might not have such for 64 bit,
> > so these need to be split I think ...
> 
> We can consider stealing the feature implementation from virtio-mmio:
> Use the first 32 bits as a selector and the last as the features
> themselves.
> 
> It's more complex to work with, but it provides 2**32 32 feature bits
> (which should be enough for a while) and it solves the atomic access
> issue.

That works.  I don't expect we'll need more than 64 features given that
virtio_net hasn't seen a new one in over a year, but it's gone from 5 to
18 in 4 years, so another 32 bits at that rate only gets us another decade.

Unfortunately, it doesn't solve the queue_address problem.

We currently multiply the (32-bit) address by 4096 (the alignment).  If
drivers are going to reduce alignment, that makes it trickier, hence the
change here to a 64.  Simplicity.

We could cheat and assert that a 32-bit write there implies 0 in the
upper bits, and hope that all 64 bit platforms can do a 64-bit atomic
write.  Or insist the value not be intepreted until the guest writes its
(first) feature bit.

Perhaps we really need a "I'm done configuring!" trigger, now we can't
use the feature bit field for that.

Thoughts welcome...
Rusty.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCHv3 RFC] virtio-pci: flexible configuration layout
  2011-11-24  0:36     ` Rusty Russell
@ 2011-11-24  6:24       ` Michael S. Tsirkin
  2011-11-24  7:11       ` Michael S. Tsirkin
  1 sibling, 0 replies; 20+ messages in thread
From: Michael S. Tsirkin @ 2011-11-24  6:24 UTC (permalink / raw)
  To: Rusty Russell
  Cc: Sasha Levin, lkml - Kernel Mailing List, Alexey Kardashevskiy,
	Amit Shah, Christian Borntraeger, Krishna Kumar, Pawel Moll,
	Wang Sheng-Hui, virtualization, kvm

On Thu, Nov 24, 2011 at 11:06:44AM +1030, Rusty Russell wrote:
> > > +/* Fields in VIRTIO_PCI_CAP_COMMON_CFG: */
> > > +struct virtio_pci_common_cfg {
> > > +	/* About the whole device. */
> > > +	__u64 device_features;	/* read-only */
> > > +	__u64 guest_features;	/* read-write */
> > > +	__u64 queue_address;	/* read-write */
> > > +	__u16 msix_config;	/* read-write */
> > > +	__u8 device_status;	/* read-write */
> > > +	__u8 unused;
> > > +
> > > +	/* About a specific virtqueue. */
> > > +	__u16 queue_select;	/* read-write */
> > > +	__u16 queue_align;	/* read-write, power of 2. */
> > > +	__u16 queue_size;	/* read-write, power of 2. */
> > > +	__u16 queue_msix_vector;/* read-write */
> > > +};
> > 
> > Slightly confusing as the registers are in fact little endian ...
> 
> Good point, should mark them appropriately with __le16.  That makes it
> even clearer.
> 
> Thanks,
> Rusty.

Do we still require atomic access to fields in common cfg?
If yes it's a problem as some systems don't have 64 bit
addresses. If no, implementations might get harder.

-- 
MST

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCHv3 RFC] virtio-pci: flexible configuration layout
  2011-11-24  0:36     ` Rusty Russell
  2011-11-24  6:24       ` Michael S. Tsirkin
@ 2011-11-24  7:11       ` Michael S. Tsirkin
  2011-11-28  0:55         ` Rusty Russell
  1 sibling, 1 reply; 20+ messages in thread
From: Michael S. Tsirkin @ 2011-11-24  7:11 UTC (permalink / raw)
  To: Rusty Russell
  Cc: Sasha Levin, lkml - Kernel Mailing List, Alexey Kardashevskiy,
	Amit Shah, Christian Borntraeger, Krishna Kumar, Pawel Moll,
	Wang Sheng-Hui, virtualization, kvm

On Thu, Nov 24, 2011 at 11:06:44AM +1030, Rusty Russell wrote:
> Because I suspect we'll be different enough anyway, once we change the
> way we allocate the ring, and write the alignment.

Well, it'd be easy to just add pa_high.

> It'll be *clearer* to have two completely separate paths than to fill
> with if() statements.

Well, look at my patches. See how each new feature basically adds *one*
if statement.

Yes, we could declare all existing features to be required,
this is what you are advocating. But in a couple of years
more if statements have accumulated and we still don't have
a flying car. Meanwhile the driver has a huge legacy section
which is a huge pain to maintain.

> And a rewrite won't hurt the driver.

Wow. Is everyone going for the rewrite these days?
After a lot of work we will be back at the point where we left,
with a minor tweak to enable a huge number of slow devices.
Meanwhile we have no time to work on real problems,
such as ring fragmentation that Avi pointed out,
support for very large requests.


And yes it will hurt, it will hurt bisectability and testability.
What you propose is a huge patch that just changes all of it.
If there's a breakage, bisect just points at a rewrite.
What I would like to see is incremental patches. So I would like to
1. Split driver config from common config
2. Split isr/notifications out too
3. Copy config in MMIO
4. Add a new config
5. Add length checks
6. Add 64 bit features
7. Add alignment programming

At each point we can bisect.  It worked well for event index, qemu had a
flag to turn it off, each time someone came in and went 'maybe it's
event index' we just tested without.

> But to be honest I don't really care about the Linux driver: we're
> steeped in this stuff and we'll get it right.

Maybe. What about windows drivers?

> But I'm *terrified* of making the spec more complex;

All you do is move stuff around. Why do you think it simplifies the spec
so much?

> implementations will get it wrong.

Just to clarify: you assume that if virtio pci spec is changed, then
there will be many more new implementations than existing ones, so it is
worth it to make life (temporarily, for several short years) harder for
all the existing ones?

Look at the qemu side for example. The way I proposed - with optional
capabilities, each one with a separate fallback - the change can be
implemented very gradually.  Each step will be bisectable, and testable.

> I *really* want to banish the legacy stuff to an appendix where noone will
> ever know it's there :)

This does not make life easy for implementations as they
need to implement it anyway. What makes life easy is
making new features optional, so you pick just what you like.

For example, we might want to backport some new feature into
rhel6.X at some point. I would like the diff to be small,
and easily understandable in its impact.


-- 
MST

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCHv3 RFC] virtio-pci: flexible configuration layout
  2011-11-24  7:11       ` Michael S. Tsirkin
@ 2011-11-28  0:55         ` Rusty Russell
  2011-11-28  8:41           ` Michael S. Tsirkin
  2011-11-28  9:15           ` Sasha Levin
  0 siblings, 2 replies; 20+ messages in thread
From: Rusty Russell @ 2011-11-28  0:55 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Sasha Levin, lkml - Kernel Mailing List, Alexey Kardashevskiy,
	Amit Shah, Christian Borntraeger, Krishna Kumar, Pawel Moll,
	Wang Sheng-Hui, virtualization, kvm

On Thu, 24 Nov 2011 09:11:22 +0200, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> On Thu, Nov 24, 2011 at 11:06:44AM +1030, Rusty Russell wrote:
> > It'll be *clearer* to have two completely separate paths than to fill
> > with if() statements.
> 
> Well, look at my patches. See how each new feature basically adds *one*
> if statement.

Which means every new feature doubles the number of paths to test.  Both
in qemu and the kernel.  And I know we're not very good at testing.

And you've still got alignment setting and guest-set ringsize to go,
which are more complex.

> Yes, we could declare all existing features to be required,
> this is what you are advocating. But in a couple of years
> more if statements have accumulated and we still don't have
> a flying car. Meanwhile the driver has a huge legacy section
> which is a huge pain to maintain.

We've only implemented one layout change in 4 years (MSI-X) with one
more definite (64-bit features).  The rate of requests is also slowing:
they mainly came from early re-implementations.

And by drawing a line under legacy mode, we don't have to test those new
options with legacy mode.

> > And a rewrite won't hurt the driver.
> 
> Wow. Is everyone going for the rewrite these days?

Ok, I'll confess that was an ill-advised comment.  The virtio-pci driver
is actually quite nice.

> And yes it will hurt, it will hurt bisectability and testability.
> What you propose is a huge patch that just changes all of it.
> If there's a breakage, bisect just points at a rewrite.

I don't have a problem with bisectability; it's great in the transition.

But you're suggesting we maintain all those different variations
forever, and that burden be on other implementations too.

> What I would like to see is incremental patches. So I would like to
> 1. Split driver config from common config
> 2. Split isr/notifications out too
> 3. Copy config in MMIO
> 4. Add a new config
> 5. Add length checks
> 6. Add 64 bit features
> 7. Add alignment programming
> 
> At each point we can bisect.  It worked well for event index, qemu had a
> flag to turn it off, each time someone came in and went 'maybe it's
> event index' we just tested without.

Ok, I would like to see this too.  But I think the intermediate stages
should be disallowed in the final patch.

> > But to be honest I don't really care about the Linux driver: we're
> > steeped in this stuff and we'll get it right.
> 
> Maybe. What about windows drivers?

Yes.  My point was: I think one big switch is easier than lots of little
ones.

> > But I'm *terrified* of making the spec more complex;
> 
> All you do is move stuff around. Why do you think it simplifies the spec
> so much?

No, but it reduces the yuk factor.  Which has been important to adoption.

And that's *not* all I do: reducing the number of options definitely
simplifies the spec.  For example, the spec should currently say
(looking at your implementation):

  Notifying the device
  ====================
  If you find a valid VIRTIO_PCI_CAP_NOTIFY_CFG capability, and you can
  map 2 bytes within it, those two bytes should be used to notify the
  device of new descriptors in its virtqueues, by writing the index of the
  virtqueue to that mapping.

  If the capability is missing or malformed or you cannot map it, the
  virtqueue index should be written to the VIRTIO_PCI_QUEUE_NOTIFY offset
  of the legacy bar.

Vs:

  Notifying the device
  ====================
  The index of the virtqueue containing new descriptors should be written
  to the location specified by the VIRTIO_PCI_CAP_NOTIFY_CFG capability.
  (Unless the device is in legacy mode, see Appendix Y: Legacy Mode).

> > I *really* want to banish the legacy stuff to an appendix where noone will
> > ever know it's there :)
> 
> This does not make life easy for implementations as they
> need to implement it anyway. What makes life easy is
> making new features optional, so you pick just what you like.

Today, that's true.  Tomorrow, it's not.

I'd like to see kvmtools remove support for legacy mode altogether, but
they probably have existing users.

And there are boutique implementations who don't care about
compatibility with older versions, but they implement virtio because
it's well specified, and clear.

> For example, we might want to backport some new feature into
> rhel6.X at some point. I would like the diff to be small,
> and easily understandable in its impact.

Sure.  And I'm determined to reduce long-term complexity.

Look, I try to be more inclusive and polite than Linus, but at some
point more verbiage is wasted.  We will have single Legacy Mode switch.
Accept it, or fork the standard.

If you want to reuse the same structure, we're going to need to figure
out how to specify the virtqueue address without a fixed alignment, and
how to specify the alignment itself.

Thanks,
Rusty.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCHv3 RFC] virtio-pci: flexible configuration layout
  2011-11-28  0:55         ` Rusty Russell
@ 2011-11-28  8:41           ` Michael S. Tsirkin
  2011-11-29 23:28             ` Rusty Russell
  2011-11-28  9:15           ` Sasha Levin
  1 sibling, 1 reply; 20+ messages in thread
From: Michael S. Tsirkin @ 2011-11-28  8:41 UTC (permalink / raw)
  To: Rusty Russell
  Cc: Sasha Levin, lkml - Kernel Mailing List, Alexey Kardashevskiy,
	Amit Shah, Christian Borntraeger, Krishna Kumar, Pawel Moll,
	Wang Sheng-Hui, virtualization, kvm

On Mon, Nov 28, 2011 at 11:25:43AM +1030, Rusty Russell wrote:
> > > But I'm *terrified* of making the spec more complex;
> > 
> > All you do is move stuff around. Why do you think it simplifies the spec
> > so much?
> 
> No, but it reduces the yuk factor.  Which has been important to adoption.

Sorry if I'm dense. Could you please clarify: do you think we can live
with the slightly higher yuk factor assuming the spec moves the
legacy mode into an appendix as you explain below and driver has a
single 'legacy' switch?

> And that's *not* all I do: reducing the number of options definitely
> simplifies the spec.  For example, the spec should currently say
> (looking at your implementation):
> 
>   Notifying the device
>   ====================
>   If you find a valid VIRTIO_PCI_CAP_NOTIFY_CFG capability, and you can
>   map 2 bytes within it, those two bytes should be used to notify the
>   device of new descriptors in its virtqueues, by writing the index of the
>   virtqueue to that mapping.
> 
>   If the capability is missing or malformed or you cannot map it, the
>   virtqueue index should be written to the VIRTIO_PCI_QUEUE_NOTIFY offset
>   of the legacy bar.
> 
> Vs:
> 
>   Notifying the device
>   ====================
>   The index of the virtqueue containing new descriptors should be written
>   to the location specified by the VIRTIO_PCI_CAP_NOTIFY_CFG capability.
>   (Unless the device is in legacy mode, see Appendix Y: Legacy Mode).

Yes, I agree, this is better.

...

> Look, I try to be more inclusive and polite than Linus, but at some
> point more verbiage is wasted.
> We will have single Legacy Mode switch.

Sorry, I'm adding more verbiage :( 
When you say a single Legacy Mode switch, you mean that the driver will
assume either legacy layout or the new one, correct?

> Accept it, or fork the standard.
>
> If you want to reuse the same structure, we're going to need to figure
> out how to specify the virtqueue address without a fixed alignment, and
> how to specify the alignment itself.

I think I see a way to do that in a relatively painless way.
Do you prefer seeing driver patches or spec? Or are you not interested
in reusing the same structure at all?

-- 
MST

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCHv3 RFC] virtio-pci: flexible configuration layout
  2011-11-28  0:55         ` Rusty Russell
  2011-11-28  8:41           ` Michael S. Tsirkin
@ 2011-11-28  9:15           ` Sasha Levin
  2011-11-29 23:40             ` Rusty Russell
  1 sibling, 1 reply; 20+ messages in thread
From: Sasha Levin @ 2011-11-28  9:15 UTC (permalink / raw)
  To: Rusty Russell
  Cc: Michael S. Tsirkin, lkml - Kernel Mailing List,
	Alexey Kardashevskiy, Amit Shah, Christian Borntraeger,
	Krishna Kumar, Pawel Moll, Wang Sheng-Hui, virtualization, kvm

On Mon, 2011-11-28 at 11:25 +1030, Rusty Russell wrote:
> I'd like to see kvmtools remove support for legacy mode altogether,
> but they probably have existing users.

While we can't simply remove it right away, instead of mixing our
implementation for both legacy and new spec in the same code we can
split the virtio-pci implementation into two:

	- virtio/virtio-pci-legacy.c
	- virtio/virtio-pci.c

At that point we can #ifdef the entire virtio-pci-legacy.c for now and
remove it at the same time legacy virtio-pci is removed from the kernel.

I think this is something very similar to what you want done in the
kernel code, so an added plus is that the usermode code will be
mirroring the kernel code - which is something we try to have in the KVM
tool :)

-- 

Sasha.


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCHv3 RFC] virtio-pci: flexible configuration layout
  2011-11-28  8:41           ` Michael S. Tsirkin
@ 2011-11-29 23:28             ` Rusty Russell
  2011-11-30  7:18               ` Michael S. Tsirkin
  0 siblings, 1 reply; 20+ messages in thread
From: Rusty Russell @ 2011-11-29 23:28 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Sasha Levin, lkml - Kernel Mailing List, Alexey Kardashevskiy,
	Amit Shah, Christian Borntraeger, Krishna Kumar, Pawel Moll,
	Wang Sheng-Hui, virtualization, kvm

On Mon, 28 Nov 2011 10:41:51 +0200, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> On Mon, Nov 28, 2011 at 11:25:43AM +1030, Rusty Russell wrote:
> > > > But I'm *terrified* of making the spec more complex;
> > > 
> > > All you do is move stuff around. Why do you think it simplifies the spec
> > > so much?
> > 
> > No, but it reduces the yuk factor.  Which has been important to adoption.
> 
> Sorry if I'm dense. Could you please clarify: do you think we can live
> with the slightly higher yuk factor assuming the spec moves the
> legacy mode into an appendix as you explain below and driver has a
> single 'legacy' switch?

Yep, it's all a trade-off.  A clean slate is good, but if we can make
our lives in transition less painful, I'm all for it.

> I think I see a way to do that in a relatively painless way.
> Do you prefer seeing driver patches or spec? Or are you not interested
> in reusing the same structure at all?

I think we should look at code at this point; my gut says we're going to
be not-quite-similar-enough-to-be-useful.  At which point, a clean-slate
approach is more appealing.  But the code will show, one way or another.

Thanks,
Rusty.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCHv3 RFC] virtio-pci: flexible configuration layout
  2011-11-28  9:15           ` Sasha Levin
@ 2011-11-29 23:40             ` Rusty Russell
  2011-11-30  8:14               ` Michael S. Tsirkin
  2011-11-30 13:12               ` Sasha Levin
  0 siblings, 2 replies; 20+ messages in thread
From: Rusty Russell @ 2011-11-29 23:40 UTC (permalink / raw)
  To: Sasha Levin
  Cc: Michael S. Tsirkin, lkml - Kernel Mailing List,
	Alexey Kardashevskiy, Amit Shah, Christian Borntraeger,
	Krishna Kumar, Pawel Moll, Wang Sheng-Hui, virtualization, kvm

On Mon, 28 Nov 2011 11:15:31 +0200, Sasha Levin <levinsasha928@gmail.com> wrote:
> On Mon, 2011-11-28 at 11:25 +1030, Rusty Russell wrote:
> > I'd like to see kvmtools remove support for legacy mode altogether,
> > but they probably have existing users.
> 
> While we can't simply remove it right away, instead of mixing our
> implementation for both legacy and new spec in the same code we can
> split the virtio-pci implementation into two:
> 
> 	- virtio/virtio-pci-legacy.c
> 	- virtio/virtio-pci.c
> 
> At that point we can #ifdef the entire virtio-pci-legacy.c for now and
> remove it at the same time legacy virtio-pci is removed from the kernel.

Hmm, that might be neat, but we can't tell the driver core to try
virtio-pci before virtio-pci-legacy, so we need detection code in both
modules (and add a "force" flag to virtio-pci-legacy to tell it to
accept the device even if it's not a legacy-only one).

Then it should work...
Cheers,
Rusty.


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCHv3 RFC] virtio-pci: flexible configuration layout
  2011-11-29 23:28             ` Rusty Russell
@ 2011-11-30  7:18               ` Michael S. Tsirkin
  0 siblings, 0 replies; 20+ messages in thread
From: Michael S. Tsirkin @ 2011-11-30  7:18 UTC (permalink / raw)
  To: Rusty Russell
  Cc: Sasha Levin, lkml - Kernel Mailing List, Alexey Kardashevskiy,
	Amit Shah, Christian Borntraeger, Krishna Kumar, Pawel Moll,
	Wang Sheng-Hui, virtualization, kvm

On Wed, Nov 30, 2011 at 09:58:45AM +1030, Rusty Russell wrote:
> > I think I see a way to do that in a relatively painless way.
> > Do you prefer seeing driver patches or spec? Or are you not interested
> > in reusing the same structure at all?
> 
> I think we should look at code at this point; my gut says we're going to
> be not-quite-similar-enough-to-be-useful.  At which point, a clean-slate
> approach is more appealing.  But the code will show, one way or another.
> 
> Thanks,
> Rusty.

Makes sense, absolutely. So I'll hack on it and post and we can
judge the result.

One small comment that I'm afraid was lost in the noise
is that we should not add any 64 bit fields in the common area,
because there's no generic iowrite64/ioread64.

-- 
MST

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCHv3 RFC] virtio-pci: flexible configuration layout
  2011-11-29 23:40             ` Rusty Russell
@ 2011-11-30  8:14               ` Michael S. Tsirkin
  2011-11-30 13:12               ` Sasha Levin
  1 sibling, 0 replies; 20+ messages in thread
From: Michael S. Tsirkin @ 2011-11-30  8:14 UTC (permalink / raw)
  To: Rusty Russell
  Cc: Sasha Levin, lkml - Kernel Mailing List, Alexey Kardashevskiy,
	Amit Shah, Christian Borntraeger, Krishna Kumar, Pawel Moll,
	Wang Sheng-Hui, virtualization, kvm

On Wed, Nov 30, 2011 at 10:10:22AM +1030, Rusty Russell wrote:
> On Mon, 28 Nov 2011 11:15:31 +0200, Sasha Levin <levinsasha928@gmail.com> wrote:
> > On Mon, 2011-11-28 at 11:25 +1030, Rusty Russell wrote:
> > > I'd like to see kvmtools remove support for legacy mode altogether,
> > > but they probably have existing users.
> > 
> > While we can't simply remove it right away, instead of mixing our
> > implementation for both legacy and new spec in the same code we can
> > split the virtio-pci implementation into two:
> > 
> > 	- virtio/virtio-pci-legacy.c
> > 	- virtio/virtio-pci.c
> > 
> > At that point we can #ifdef the entire virtio-pci-legacy.c for now and
> > remove it at the same time legacy virtio-pci is removed from the kernel.
> 
> Hmm, that might be neat, but we can't tell the driver core to try
> virtio-pci before virtio-pci-legacy, so we need detection code in both
> modules (and add a "force" flag to virtio-pci-legacy to tell it to
> accept the device even if it's not a legacy-only one).

This flag might need to be per device ideally, which is tricky ...

> 
> Then it should work...
> Cheers,
> Rusty.

One also wonders whether and how this will work on other OS-es.

-- 
MST

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCHv3 RFC] virtio-pci: flexible configuration layout
  2011-11-29 23:40             ` Rusty Russell
  2011-11-30  8:14               ` Michael S. Tsirkin
@ 2011-11-30 13:12               ` Sasha Levin
  2011-12-01  2:42                 ` Rusty Russell
  1 sibling, 1 reply; 20+ messages in thread
From: Sasha Levin @ 2011-11-30 13:12 UTC (permalink / raw)
  To: Rusty Russell
  Cc: Michael S. Tsirkin, lkml - Kernel Mailing List,
	Alexey Kardashevskiy, Amit Shah, Christian Borntraeger,
	Krishna Kumar, Pawel Moll, Wang Sheng-Hui, virtualization, kvm

On Wed, 2011-11-30 at 10:10 +1030, Rusty Russell wrote:
> On Mon, 28 Nov 2011 11:15:31 +0200, Sasha Levin <levinsasha928@gmail.com> wrote:
> > On Mon, 2011-11-28 at 11:25 +1030, Rusty Russell wrote:
> > > I'd like to see kvmtools remove support for legacy mode altogether,
> > > but they probably have existing users.
> > 
> > While we can't simply remove it right away, instead of mixing our
> > implementation for both legacy and new spec in the same code we can
> > split the virtio-pci implementation into two:
> > 
> > 	- virtio/virtio-pci-legacy.c
> > 	- virtio/virtio-pci.c
> > 
> > At that point we can #ifdef the entire virtio-pci-legacy.c for now and
> > remove it at the same time legacy virtio-pci is removed from the kernel.
> 
> Hmm, that might be neat, but we can't tell the driver core to try
> virtio-pci before virtio-pci-legacy, so we need detection code in both
> modules (and add a "force" flag to virtio-pci-legacy to tell it to
> accept the device even if it's not a legacy-only one).

I was thinking more in the direction of fallback code in virtio-pci.c to
virtio-pci-legacy.c.

Something like:
#ifdef VIRTIO_PCI_LEGACY
[Create BAR0 and map it to virtio-pci-legacy.c]
#endif

So BAR0 isn't defined as long as legacy code is there, which makes
falling back to legacy pretty simple.

-- 

Sasha.


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCHv3 RFC] virtio-pci: flexible configuration layout
  2011-11-30 13:12               ` Sasha Levin
@ 2011-12-01  2:42                 ` Rusty Russell
  0 siblings, 0 replies; 20+ messages in thread
From: Rusty Russell @ 2011-12-01  2:42 UTC (permalink / raw)
  To: Sasha Levin
  Cc: Michael S. Tsirkin, lkml - Kernel Mailing List,
	Alexey Kardashevskiy, Amit Shah, Christian Borntraeger,
	Krishna Kumar, Pawel Moll, Wang Sheng-Hui, virtualization, kvm

On Wed, 30 Nov 2011 15:12:43 +0200, Sasha Levin <levinsasha928@gmail.com> wrote:
> On Wed, 2011-11-30 at 10:10 +1030, Rusty Russell wrote:
> > On Mon, 28 Nov 2011 11:15:31 +0200, Sasha Levin <levinsasha928@gmail.com> wrote:
> > > On Mon, 2011-11-28 at 11:25 +1030, Rusty Russell wrote:
> > > > I'd like to see kvmtools remove support for legacy mode altogether,
> > > > but they probably have existing users.
> > > 
> > > While we can't simply remove it right away, instead of mixing our
> > > implementation for both legacy and new spec in the same code we can
> > > split the virtio-pci implementation into two:
> > > 
> > > 	- virtio/virtio-pci-legacy.c
> > > 	- virtio/virtio-pci.c
> > > 
> > > At that point we can #ifdef the entire virtio-pci-legacy.c for now and
> > > remove it at the same time legacy virtio-pci is removed from the kernel.
> > 
> > Hmm, that might be neat, but we can't tell the driver core to try
> > virtio-pci before virtio-pci-legacy, so we need detection code in both
> > modules (and add a "force" flag to virtio-pci-legacy to tell it to
> > accept the device even if it's not a legacy-only one).
> 
> I was thinking more in the direction of fallback code in virtio-pci.c to
> virtio-pci-legacy.c.
> 
> Something like:
> #ifdef VIRTIO_PCI_LEGACY
> [Create BAR0 and map it to virtio-pci-legacy.c]
> #endif
> 
> So BAR0 isn't defined as long as legacy code is there, which makes
> falling back to legacy pretty simple.

But it's nicer to see the driver actually labelled "virtio-pci-legacy",
and such a module.

I'll code something up, see what it looks like.

Cheers,
Rusty.

^ permalink raw reply	[flat|nested] 20+ messages in thread

end of thread, other threads:[~2011-12-01  3:17 UTC | newest]

Thread overview: 20+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-11-22 18:36 [PATCHv3 RFC] virtio-pci: flexible configuration layout Michael S. Tsirkin
2011-11-23  2:32 ` Rusty Russell
2011-11-23  8:46   ` Michael S. Tsirkin
2011-11-23 15:34     ` Michael S. Tsirkin
2011-11-24  0:36     ` Rusty Russell
2011-11-24  6:24       ` Michael S. Tsirkin
2011-11-24  7:11       ` Michael S. Tsirkin
2011-11-28  0:55         ` Rusty Russell
2011-11-28  8:41           ` Michael S. Tsirkin
2011-11-29 23:28             ` Rusty Russell
2011-11-30  7:18               ` Michael S. Tsirkin
2011-11-28  9:15           ` Sasha Levin
2011-11-29 23:40             ` Rusty Russell
2011-11-30  8:14               ` Michael S. Tsirkin
2011-11-30 13:12               ` Sasha Levin
2011-12-01  2:42                 ` Rusty Russell
2011-11-23  8:49   ` Michael S. Tsirkin
2011-11-23  9:38     ` Sasha Levin
2011-11-24  1:07       ` Rusty Russell
2011-11-23  9:44   ` Sasha Levin

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).