[PATCHv3 RFC] virtio-pci: flexible configuration layout

* [PATCHv3 RFC] virtio-pci: flexible configuration layout
@ 2011-11-22 18:36 Michael S. Tsirkin
  2011-11-23  2:32 ` Rusty Russell
  0 siblings, 1 reply; 20+ messages in thread
From: Michael S. Tsirkin @ 2011-11-22 18:36 UTC (permalink / raw)
  To: Rusty Russell
  Cc: Sasha Levin, lkml - Kernel Mailing List, Alexey Kardashevskiy,
	Amit Shah, Christian Borntraeger, Krishna Kumar, Pawel Moll,
	Wang Sheng-Hui, virtualization, kvm

Here's an updated vesion.
I'm alternating between updating the spec and the driver,
spec update to follow.

Compiled only.  Posting here for early feedback, and to allow Sasha to
proceed with his "kvm tool" work.

Changes from v2:
	address comments by Rusty
	bugfixes by Sasha
Changes from v1:
	Updated to match v3 of the spec, see:

todo:
	split core changes out

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/virtio/Kconfig      |   22 +++++
 drivers/virtio/virtio_pci.c |  203 ++++++++++++++++++++++++++++++++++++++++---
 include/asm-generic/io.h    |    4 +
 include/asm-generic/iomap.h |   11 +++
 include/linux/pci_regs.h    |    4 +
 include/linux/virtio_pci.h  |   41 +++++++++
 lib/iomap.c                 |   41 ++++++++-
 7 files changed, 307 insertions(+), 19 deletions(-)

diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig
index 816ed08..465245e 100644
--- a/drivers/virtio/Kconfig
+++ b/drivers/virtio/Kconfig
@@ -25,6 +25,28 @@ config VIRTIO_PCI
 
 	  If unsure, say M.
 
+config VIRTIO_PCI_LEGACY
+	bool "Compatibility with Legacy PIO"
+	default y
+	depends on VIRTIO_PCI
+	---help---
+	  Look out into your driveway.  Do you have a flying car?  If
+	  so, you can happily disable this option and virtio will not
+	  break.  Otherwise, leave it set.  Unless you're testing what
+	  life will be like in The Future.
+
+	  In other words:
+
+	  Support compatibility with legacy PIO mapping in hypervisors.
+	  As of Nov 2011, this is required by all hypervisors without
+	  exception, so for now, disabling this option is only useful for
+	  testing.  In future hypervisors, it will be possible to disable
+	  this option and get a slightly smaller kernel.
+	  You know that your hypervisor is new enough if you disable this
+	  option and the device initialization passes.
+
+	  If unsure, say Y.
+
 config VIRTIO_BALLOON
 	tristate "Virtio balloon driver (EXPERIMENTAL)"
 	select VIRTIO
diff --git a/drivers/virtio/virtio_pci.c b/drivers/virtio/virtio_pci.c
index 3d1bf41..681347b 100644
--- a/drivers/virtio/virtio_pci.c
+++ b/drivers/virtio/virtio_pci.c
@@ -37,8 +37,14 @@ struct virtio_pci_device
 	struct virtio_device vdev;
 	struct pci_dev *pci_dev;
 
-	/* the IO mapping for the PCI config space */
+	/* the IO address for the common PCI config space */
 	void __iomem *ioaddr;
+	/* the IO address for device specific config */
+	void __iomem *ioaddr_device;
+	/* the IO address to use for notifications operations */
+	void __iomem *ioaddr_notify;
+	/* the IO address to use for reading ISR */
+	void __iomem *ioaddr_isr;
 
 	/* a list of queues so we can dispatch IRQs */
 	spinlock_t lock;
@@ -57,8 +63,175 @@ struct virtio_pci_device
 	unsigned msix_used_vectors;
 	/* Whether we have vector per vq */
 	bool per_vq_vectors;
+
+	/* Various IO mappings: used for resource tracking only. */
+
+#ifdef CONFIG_VIRTIO_PCI_LEGACY
+	/* Legacy BAR0: typically PIO. */
+	void __iomem *legacy_map;
+#endif
+
+	/* Mappings specified by device capabilities: typically in MMIO */
+	void __iomem *isr_map;
+	void __iomem *notify_map;
+	void __iomem *common_map;
+	void __iomem *device_map;
 };
 
+#ifdef CONFIG_VIRTIO_PCI_LEGACY
+static void __iomem *virtio_pci_set_legacy_map(struct virtio_pci_device *vp_dev)
+{
+	return vp_dev->legacy_map = pci_iomap(vp_dev->pci_dev, 0, 256);
+}
+
+static void __iomem *virtio_pci_legacy_map(struct virtio_pci_device *vp_dev)
+{
+	return vp_dev->legacy_map;
+}
+#else
+static void __iomem *virtio_pci_set_legacy_map(struct virtio_pci_device *vp_dev)
+{
+	return NULL;
+}
+
+static void __iomem *virtio_pci_legacy_map(struct virtio_pci_device *vp_dev)
+{
+	return NULL;
+}
+#endif
+
+/*
+ * With PIO, device-specific config moves as MSI-X is enabled/disabled.
+ * Use this accessor to keep pointer to that config in sync.
+ */
+static void virtio_pci_set_msix_enabled(struct virtio_pci_device *vp_dev, int enabled)
+{
+	void __iomem* m;
+	vp_dev->msix_enabled = enabled;
+	m = virtio_pci_legacy_map(vp_dev);
+	if (m)
+		vp_dev->ioaddr_device = m + VIRTIO_PCI_CONFIG(vp_dev);
+	else
+		vp_dev->ioaddr_device = vp_dev->device_map;
+}
+
+static void __iomem *virtio_pci_map_cfg(struct virtio_pci_device *vp_dev, u8 cap_id,
+					u32 align)
+{
+        u32 size;
+        u32 offset;
+        u8 bir;
+        u8 cap_len;
+	int pos;
+	struct pci_dev *dev = vp_dev->pci_dev;
+	void __iomem *p;
+
+	for (pos = pci_find_capability(dev, PCI_CAP_ID_VNDR);
+	     pos > 0;
+	     pos = pci_find_next_capability(dev, pos, PCI_CAP_ID_VNDR)) {
+		u8 id;
+		pci_read_config_byte(dev, pos + PCI_VNDR_CAP_LEN, &cap_len);
+		if (cap_len < VIRTIO_PCI_CAP_ID + 1)
+			continue;
+		pci_read_config_byte(dev, pos + VIRTIO_PCI_CAP_ID, &id);
+		id >>= VIRTIO_PCI_CAP_ID_SHIFT;
+		id &= VIRTIO_PCI_CAP_ID_MASK;
+		if (id == cap_id)
+			break;
+	}
+
+	if (pos <= 0)
+		return NULL;
+
+	if (cap_len < VIRTIO_PCI_CAP_CFG_BIR + 1)
+		goto err;
+        pci_read_config_byte(dev, pos + VIRTIO_PCI_CAP_CFG_BIR, &bir);
+	if (cap_len < VIRTIO_PCI_CAP_CFG_OFF + 4)
+		goto err;
+        pci_read_config_dword(dev, pos + VIRTIO_PCI_CAP_CFG_OFF, &offset);
+	if (cap_len < VIRTIO_PCI_CAP_CFG_SIZE + 4)
+		goto err;
+        pci_read_config_dword(dev, pos + VIRTIO_PCI_CAP_CFG_SIZE, &size);
+        bir >>= VIRTIO_PCI_CAP_CFG_BIR_SHIFT;
+        bir &= VIRTIO_PCI_CAP_CFG_BIR_MASK;
+        size >>= VIRTIO_PCI_CAP_CFG_SIZE_SHIFT;
+        size &= VIRTIO_PCI_CAP_CFG_SIZE_MASK;
+        offset >>= VIRTIO_PCI_CAP_CFG_OFF_SHIFT;
+        offset &= VIRTIO_PCI_CAP_CFG_OFF_MASK;
+	/* Align offset appropriately */
+	offset &= ~(align - 1);
+
+	/* It's possible for a device to supply a huge config space,
+	 * but we'll never need to map more than a page ATM. */
+	p = pci_iomap_range(dev, bir, offset, size, PAGE_SIZE);
+	if (!p)
+		dev_err(&vp_dev->vdev.dev, "Unable to map virtio pci memory");
+	return p;
+err:
+	dev_err(&vp_dev->vdev.dev, "Unable to parse virtio pci capability");
+	return NULL;
+}
+
+static void virtio_pci_iounmap(struct virtio_pci_device *vp_dev)
+{
+	if (virtio_pci_legacy_map(vp_dev))
+		pci_iounmap(vp_dev->pci_dev, virtio_pci_legacy_map(vp_dev));
+	if (vp_dev->isr_map)
+		pci_iounmap(vp_dev->pci_dev, vp_dev->isr_map);
+	if (vp_dev->notify_map)
+		pci_iounmap(vp_dev->pci_dev, vp_dev->notify_map);
+	if (vp_dev->common_map)
+		pci_iounmap(vp_dev->pci_dev, vp_dev->common_map);
+	if (vp_dev->device_map)
+		pci_iounmap(vp_dev->pci_dev, vp_dev->device_map);
+}
+
+static int virtio_pci_iomap(struct virtio_pci_device *vp_dev)
+{
+	vp_dev->isr_map = virtio_pci_map_cfg(vp_dev,
+					     VIRTIO_PCI_CAP_ISR_CFG,
+					     sizeof(u8));
+	vp_dev->notify_map = virtio_pci_map_cfg(vp_dev,
+						VIRTIO_PCI_CAP_NOTIFY_CFG,
+						sizeof(u16));
+	vp_dev->common_map = virtio_pci_map_cfg(vp_dev,
+						VIRTIO_PCI_CAP_COMMON_CFG,
+						sizeof(u32));
+	vp_dev->device_map = virtio_pci_map_cfg(vp_dev,
+						VIRTIO_PCI_CAP_DEVICE_CFG,
+						sizeof(u8));
+
+	if (vp_dev->notify_map && vp_dev->isr_map &&
+	    vp_dev->common_map && vp_dev->device_map) {
+		vp_dev->ioaddr = vp_dev->common_map;
+		vp_dev->ioaddr_isr = vp_dev->isr_map;
+		vp_dev->ioaddr_notify = vp_dev->notify_map;
+		vp_dev->ioaddr_device = vp_dev->device_map;
+	} else {
+		void __iomem* m;
+		/*
+		 * If not all capabilities present, map legacy PIO.
+		 * Legacy access is at BAR 0. We never need to map
+		 * more than 256 bytes there, since legacy config space
+		 * used PIO which has this size limit.
+		 * */
+		m = virtio_pci_set_legacy_map(vp_dev);
+		if (!m) {
+			dev_err(&vp_dev->vdev.dev, "Unable to map legacy PIO");
+			goto err;
+		}
+		vp_dev->ioaddr = m;
+		vp_dev->ioaddr_isr = m + VIRTIO_PCI_ISR;
+		vp_dev->ioaddr_notify = m + VIRTIO_PCI_QUEUE_NOTIFY;
+		vp_dev->ioaddr_device = m + VIRTIO_PCI_CONFIG(vp_dev);
+	}
+
+	return 0;
+err:
+	virtio_pci_iounmap(vp_dev);
+	return -EINVAL;
+}
+
 /* Constants for MSI-X */
 /* Use first vector for configuration changes, second and the rest for
  * virtqueues Thus, we need at least 2 vectors for MSI. */
@@ -130,8 +303,7 @@ static void vp_get(struct virtio_device *vdev, unsigned offset,
 		   void *buf, unsigned len)
 {
 	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
-	void __iomem *ioaddr = vp_dev->ioaddr +
-				VIRTIO_PCI_CONFIG(vp_dev) + offset;
+	void __iomem *ioaddr = vp_dev->ioaddr_device + offset;
 	u8 *ptr = buf;
 	int i;
 
@@ -145,8 +317,7 @@ static void vp_set(struct virtio_device *vdev, unsigned offset,
 		   const void *buf, unsigned len)
 {
 	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
-	void __iomem *ioaddr = vp_dev->ioaddr +
-				VIRTIO_PCI_CONFIG(vp_dev) + offset;
+	void __iomem *ioaddr = vp_dev->ioaddr_device + offset;
 	const u8 *ptr = buf;
 	int i;
 
@@ -184,7 +355,7 @@ static void vp_notify(struct virtqueue *vq)
 
 	/* we write the queue's selector into the notification register to
 	 * signal the other end */
-	iowrite16(info->queue_index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_NOTIFY);
+	iowrite16(info->queue_index, vp_dev->ioaddr_notify);
 }
 
 /* Handle a configuration change: Tell driver if it wants to know. */
@@ -231,7 +402,8 @@ static irqreturn_t vp_interrupt(int irq, void *opaque)
 
 	/* reading the ISR has the effect of also clearing it so it's very
 	 * important to save off the value. */
-	isr = ioread8(vp_dev->ioaddr + VIRTIO_PCI_ISR);
+	isr = ioread8(vp_dev->ioaddr_notify +
+		      VIRTIO_PCI_ISR - VIRTIO_PCI_QUEUE_NOTIFY);
 
 	/* It's definitely not us if the ISR was not high */
 	if (!isr)
@@ -265,7 +437,7 @@ static void vp_free_vectors(struct virtio_device *vdev)
 		ioread16(vp_dev->ioaddr + VIRTIO_MSI_CONFIG_VECTOR);
 
 		pci_disable_msix(vp_dev->pci_dev);
-		vp_dev->msix_enabled = 0;
+                virtio_pci_set_msix_enabled(vp_dev, 0);
 		vp_dev->msix_vectors = 0;
 	}
 
@@ -303,7 +475,7 @@ static int vp_request_msix_vectors(struct virtio_device *vdev, int nvectors,
 	if (err)
 		goto error;
 	vp_dev->msix_vectors = nvectors;
-	vp_dev->msix_enabled = 1;
+        virtio_pci_set_msix_enabled(vp_dev, 1);
 
 	/* Set the vector used for configuration */
 	v = vp_dev->msix_used_vectors;
@@ -451,7 +623,10 @@ static void vp_del_vq(struct virtqueue *vq)
 		iowrite16(VIRTIO_MSI_NO_VECTOR,
 			  vp_dev->ioaddr + VIRTIO_MSI_QUEUE_VECTOR);
 		/* Flush the write out to device */
-		ioread8(vp_dev->ioaddr + VIRTIO_PCI_ISR);
+		ioread8(vp_dev->ioaddr + VIRTIO_MSI_QUEUE_VECTOR);
+		/* And clear ISR: TODO: really needed? */
+		ioread8(vp_dev->ioaddr_notify +
+		      VIRTIO_PCI_ISR - VIRTIO_PCI_QUEUE_NOTIFY);
 	}
 
 	vring_del_virtqueue(vq);
@@ -642,8 +817,8 @@ static int __devinit virtio_pci_probe(struct pci_dev *pci_dev,
 	if (err)
 		goto out_enable_device;
 
-	vp_dev->ioaddr = pci_iomap(pci_dev, 0, 0);
-	if (vp_dev->ioaddr == NULL)
+	err = virtio_pci_iomap(vp_dev);
+	if (err)
 		goto out_req_regions;
 
 	pci_set_drvdata(pci_dev, vp_dev);
@@ -665,7 +840,7 @@ static int __devinit virtio_pci_probe(struct pci_dev *pci_dev,
 
 out_set_drvdata:
 	pci_set_drvdata(pci_dev, NULL);
-	pci_iounmap(pci_dev, vp_dev->ioaddr);
+	virtio_pci_iounmap(vp_dev);
 out_req_regions:
 	pci_release_regions(pci_dev);
 out_enable_device:
@@ -683,7 +858,7 @@ static void __devexit virtio_pci_remove(struct pci_dev *pci_dev)
 
 	vp_del_vqs(&vp_dev->vdev);
 	pci_set_drvdata(pci_dev, NULL);
-	pci_iounmap(pci_dev, vp_dev->ioaddr);
+	virtio_pci_iounmap(vp_dev);
 	pci_release_regions(pci_dev);
 	pci_disable_device(pci_dev);
 	kfree(vp_dev);
diff --git a/include/asm-generic/io.h b/include/asm-generic/io.h
index 9120887..3cf1787 100644
--- a/include/asm-generic/io.h
+++ b/include/asm-generic/io.h
@@ -286,6 +286,10 @@ static inline void writesb(const void __iomem *addr, const void *buf, int len)
 /* Create a virtual mapping cookie for a PCI BAR (memory or IO) */
 struct pci_dev;
 extern void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long max);
+extern void __iomem *pci_iomap_range(struct pci_dev *dev, int bar,
+                                     unsigned offset,
+                                     unsigned long minlen,
+                                     unsigned long maxlen);
 static inline void pci_iounmap(struct pci_dev *dev, void __iomem *p)
 {
 }
diff --git a/include/asm-generic/iomap.h b/include/asm-generic/iomap.h
index 98dcd76..6f192d4 100644
--- a/include/asm-generic/iomap.h
+++ b/include/asm-generic/iomap.h
@@ -70,8 +70,19 @@ extern void ioport_unmap(void __iomem *);
 /* Create a virtual mapping cookie for a PCI BAR (memory or IO) */
 struct pci_dev;
 extern void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long max);
+extern void __iomem *pci_iomap_range(struct pci_dev *dev, int bar,
+                                     unsigned offset,
+                                     unsigned long minlen,
+                                     unsigned long maxlen);
 extern void pci_iounmap(struct pci_dev *dev, void __iomem *);
 #else
+static inline void __iomem *pci_iomap_range(struct pci_dev *dev, int bar,
+                                            unsigned offset,
+                                            unsigned long minlen,
+                                            unsigned long maxlen)
+{
+	return NULL;
+}
 struct pci_dev;
 static inline void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long max)
 {
diff --git a/include/linux/pci_regs.h b/include/linux/pci_regs.h
index b5d9657..3e7905c 100644
--- a/include/linux/pci_regs.h
+++ b/include/linux/pci_regs.h
@@ -375,6 +375,10 @@
 #define  PCI_X_STATUS_266MHZ	0x40000000	/* 266 MHz capable */
 #define  PCI_X_STATUS_533MHZ	0x80000000	/* 533 MHz capable */
 
+/* Vendor specific capability */
+#define PCI_VNDR_CAP_LEN	2	/* Capability length (8 bits), including
+					   bytes: ID, NEXT and LEN itself. */
+
 /* PCI Bridge Subsystem ID registers */
 
 #define PCI_SSVID_VENDOR_ID     4	/* PCI-Bridge subsystem vendor id register */
diff --git a/include/linux/virtio_pci.h b/include/linux/virtio_pci.h
index ea66f3f..d6568e7 100644
--- a/include/linux/virtio_pci.h
+++ b/include/linux/virtio_pci.h
@@ -92,4 +92,45 @@
 /* The alignment to use between consumer and producer parts of vring.
  * x86 pagesize again. */
 #define VIRTIO_PCI_VRING_ALIGN		4096
+
+/*
+ * Layout for Virtio PCI vendor specific capability (little-endian):
+ * 5 bit virtio capability id.
+ * 3 bit BAR index register, specifying which BAR to use.
+ * 4 byte cfg offset within the BAR.
+ * 4 byte cfg size.
+ */
+
+/* A single virtio device has multiple vendor specific capabilities, we use the
+ * 5 bit ID field to distinguish between these. */
+#define VIRTIO_PCI_CAP_ID		3
+#define VIRTIO_PCI_CAP_ID_MASK		0xff
+#define VIRTIO_PCI_CAP_ID_SHIFT		0
+
+/* IDs for different capabilities. If a specific configuration
+ * is missing, legacy PIO path is used. */
+/* Common configuration */
+#define VIRTIO_PCI_CAP_COMMON_CFG	1
+/* Notifications */
+#define VIRTIO_PCI_CAP_NOTIFY_CFG	2
+/* ISR access */
+#define VIRTIO_PCI_CAP_ISR_CFG		3
+/* Device specific confiuration */
+#define VIRTIO_PCI_CAP_DEVICE_CFG	4
+
+/* Index of the BAR including this configuration */
+#define VIRTIO_PCI_CAP_CFG_BIR		4
+#define VIRTIO_PCI_CAP_CFG_BIR_MASK	(0x7)
+#define VIRTIO_PCI_CAP_CFG_BIR_SHIFT	0
+
+/* Size of the configuration space */
+#define VIRTIO_PCI_CAP_CFG_SIZE		4
+#define VIRTIO_PCI_CAP_CFG_SIZE_MASK	0xffffff
+#define VIRTIO_PCI_CAP_CFG_SIZE_SHIFT	8
+
+/* Offset within the BAR */
+#define VIRTIO_PCI_CAP_CFG_OFF		8
+#define VIRTIO_PCI_CAP_CFG_OFF_MASK	0xffffffff
+#define VIRTIO_PCI_CAP_CFG_OFF_SHIFT	0
+
 #endif
diff --git a/lib/iomap.c b/lib/iomap.c
index 5dbcb4b..93ae915 100644
--- a/lib/iomap.c
+++ b/lib/iomap.c
@@ -243,26 +243,37 @@ EXPORT_SYMBOL(ioport_unmap);
 
 #ifdef CONFIG_PCI
 /**
- * pci_iomap - create a virtual mapping cookie for a PCI BAR
+ * pci_iomap_range - create a virtual mapping cookie for a PCI BAR
  * @dev: PCI device that owns the BAR
  * @bar: BAR number
- * @maxlen: length of the memory to map
+ * @offset: map memory at the given offset in BAR
+ * @minlen: min length of the memory to map
+ * @maxlen: max length of the memory to map
  *
  * Using this function you will get a __iomem address to your device BAR.
  * You can access it using ioread*() and iowrite*(). These functions hide
  * the details if this is a MMIO or PIO address space and will just do what
  * you expect from them in the correct way.
  *
+ * @minlen specifies the minimum length to map. We check that BAR is
+ * large enough.
  * @maxlen specifies the maximum length to map. If you want to get access to
- * the complete BAR without checking for its length first, pass %0 here.
+ * the complete BAR from offset to the end, pass %0 here.
  * */
-void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long maxlen)
+void __iomem *pci_iomap_range(struct pci_dev *dev, int bar,
+                              unsigned offset,
+                              unsigned long minlen,
+                              unsigned long maxlen)
 {
 	resource_size_t start = pci_resource_start(dev, bar);
 	resource_size_t len = pci_resource_len(dev, bar);
 	unsigned long flags = pci_resource_flags(dev, bar);
 
-	if (!len || !start)
+	if (len <= offset || !start)
+		return NULL;
+        len -= offset;
+	start += offset;
+        if (len < minlen)
 		return NULL;
 	if (maxlen && len > maxlen)
 		len = maxlen;
@@ -277,10 +288,30 @@ void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long maxlen)
 	return NULL;
 }
 
+/**
+ * pci_iomap - create a virtual mapping cookie for a PCI BAR
+ * @dev: PCI device that owns the BAR
+ * @bar: BAR number
+ * @maxlen: length of the memory to map
+ *
+ * Using this function you will get a __iomem address to your device BAR.
+ * You can access it using ioread*() and iowrite*(). These functions hide
+ * the details if this is a MMIO or PIO address space and will just do what
+ * you expect from them in the correct way.
+ *
+ * @maxlen specifies the maximum length to map. If you want to get access to
+ * the complete BAR without checking for its length first, pass %0 here.
+ * */
+void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long maxlen)
+{
+    return pci_iomap_range(dev, bar, 0, 0, maxlen);
+}
+
 void pci_iounmap(struct pci_dev *dev, void __iomem * addr)
 {
 	IO_COND(addr, /* nothing */, iounmap(addr));
 }
 EXPORT_SYMBOL(pci_iomap);
+EXPORT_SYMBOL(pci_iomap_range);
 EXPORT_SYMBOL(pci_iounmap);
 #endif /* CONFIG_PCI */
-- 
1.7.5.53.gc233e

^ permalink raw reply related	[flat|nested] 20+ messages in thread