All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/11] RFC: PCI using capabilities
@ 2011-12-08 10:22 Rusty Russell
  2011-12-08 10:30 ` [RFC 1/11] virtio: use u32, not bitmap for struct virtio_device's features Rusty Russell
                   ` (13 more replies)
  0 siblings, 14 replies; 106+ messages in thread
From: Rusty Russell @ 2011-12-08 10:22 UTC (permalink / raw)
  To: virtualization; +Cc: Avi Kivity, Sasha Levin, kvm, Michael S. Tsirkin

Here's the patch series I ended up with.  I haven't coded up the QEMU
side yet, so no idea if the new driver works.

Questions:
(1) Do we win from separating ISR, NOTIFY and COMMON?
(2) I used a "u8 bar"; should I use a bir and pack it instead?  BIR
    seems a little obscure (no one else in the kernel source seems to
    refer to it).

Cheers,
Rusty.

^ permalink raw reply	[flat|nested] 106+ messages in thread

* [RFC 1/11] virtio: use u32, not bitmap for struct virtio_device's features
  2011-12-08 10:22 [PATCH 0/11] RFC: PCI using capabilities Rusty Russell
@ 2011-12-08 10:30 ` Rusty Russell
  2011-12-08 10:31 ` [RFC 2/11] virtio: add support for 64 bit features Rusty Russell
                   ` (12 subsequent siblings)
  13 siblings, 0 replies; 106+ messages in thread
From: Rusty Russell @ 2011-12-08 10:30 UTC (permalink / raw)
  To: virtualization; +Cc: Michael S. Tsirkin, Avi Kivity, kvm, Sasha Levin

It seemed like a good idea, but it's actually a pain when we get more
than 32 feature bits.  Just change it to a u32 for now.
---
 drivers/char/virtio_console.c  |    2 +-
 drivers/lguest/lguest_device.c |    2 +-
 drivers/s390/kvm/kvm_virtio.c  |    2 +-
 drivers/virtio/virtio.c        |   10 +++++-----
 drivers/virtio/virtio_mmio.c   |    8 ++------
 drivers/virtio/virtio_pci.c    |    3 +--
 drivers/virtio/virtio_ring.c   |    2 +-
 include/linux/virtio.h         |    3 +--
 include/linux/virtio_config.h  |    2 +-
 tools/virtio/linux/virtio.h    |   18 ++----------------
 tools/virtio/virtio_test.c     |    5 ++---
 11 files changed, 18 insertions(+), 39 deletions(-)

diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c
--- a/drivers/char/virtio_console.c
+++ b/drivers/char/virtio_console.c
@@ -331,7 +331,7 @@ static inline bool use_multiport(struct 
 	 */
 	if (!portdev->vdev)
 		return 0;
-	return portdev->vdev->features[0] & (1 << VIRTIO_CONSOLE_F_MULTIPORT);
+	return portdev->vdev->features & (1 << VIRTIO_CONSOLE_F_MULTIPORT);
 }
 
 static void free_buf(struct port_buffer *buf)
diff --git a/drivers/lguest/lguest_device.c b/drivers/lguest/lguest_device.c
--- a/drivers/lguest/lguest_device.c
+++ b/drivers/lguest/lguest_device.c
@@ -144,7 +144,7 @@ static void lg_finalize_features(struct 
 	memset(out_features, 0, desc->feature_len);
 	bits = min_t(unsigned, desc->feature_len, sizeof(vdev->features)) * 8;
 	for (i = 0; i < bits; i++) {
-		if (test_bit(i, vdev->features))
+		if (vdev->features & (1 << i))
 			out_features[i / 8] |= (1 << (i % 8));
 	}
 
diff --git a/drivers/s390/kvm/kvm_virtio.c b/drivers/s390/kvm/kvm_virtio.c
--- a/drivers/s390/kvm/kvm_virtio.c
+++ b/drivers/s390/kvm/kvm_virtio.c
@@ -105,7 +105,7 @@ static void kvm_finalize_features(struct
 	memset(out_features, 0, desc->feature_len);
 	bits = min_t(unsigned, desc->feature_len, sizeof(vdev->features)) * 8;
 	for (i = 0; i < bits; i++) {
-		if (test_bit(i, vdev->features))
+		if (vdev->features & (1 << i))
 			out_features[i / 8] |= (1 << (i % 8));
 	}
 }
diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c
--- a/drivers/virtio/virtio.c
+++ b/drivers/virtio/virtio.c
@@ -41,9 +41,9 @@ static ssize_t features_show(struct devi
 
 	/* We actually represent this as a bitstring, as it could be
 	 * arbitrary length in future. */
-	for (i = 0; i < ARRAY_SIZE(dev->features)*BITS_PER_LONG; i++)
+	for (i = 0; i < sizeof(dev->features)*8; i++)
 		len += sprintf(buf+len, "%c",
-			       test_bit(i, dev->features) ? '1' : '0');
+			       dev->features & (1ULL << i) ? '1' : '0');
 	len += sprintf(buf+len, "\n");
 	return len;
 }
@@ -122,18 +122,18 @@ static int virtio_dev_probe(struct devic
 	device_features = dev->config->get_features(dev);
 
 	/* Features supported by both device and driver into dev->features. */
-	memset(dev->features, 0, sizeof(dev->features));
+	dev->features = 0;
 	for (i = 0; i < drv->feature_table_size; i++) {
 		unsigned int f = drv->feature_table[i];
 		BUG_ON(f >= 32);
 		if (device_features & (1 << f))
-			set_bit(f, dev->features);
+			dev->features |= (1 << f);
 	}
 
 	/* Transport features always preserved to pass to finalize_features. */
 	for (i = VIRTIO_TRANSPORT_F_START; i < VIRTIO_TRANSPORT_F_END; i++)
 		if (device_features & (1 << i))
-			set_bit(i, dev->features);
+			dev->features |= (1 << i);
 
 	dev->config->finalize_features(dev);
 
diff --git a/drivers/virtio/virtio_mmio.c b/drivers/virtio/virtio_mmio.c
--- a/drivers/virtio/virtio_mmio.c
+++ b/drivers/virtio/virtio_mmio.c
@@ -112,16 +112,12 @@ static u32 vm_get_features(struct virtio
 static void vm_finalize_features(struct virtio_device *vdev)
 {
 	struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev);
-	int i;
 
 	/* Give virtio_ring a chance to accept features. */
 	vring_transport_features(vdev);
 
-	for (i = 0; i < ARRAY_SIZE(vdev->features); i++) {
-		writel(i, vm_dev->base + VIRTIO_MMIO_GUEST_FEATURES_SEL);
-		writel(vdev->features[i],
-				vm_dev->base + VIRTIO_MMIO_GUEST_FEATURES);
-	}
+	writel(0, vm_dev->base + VIRTIO_MMIO_GUEST_FEATURES_SEL);
+	writel(vdev->features, vm_dev->base + VIRTIO_MMIO_GUEST_FEATURES);
 }
 
 static void vm_get(struct virtio_device *vdev, unsigned offset,
diff --git a/drivers/virtio/virtio_pci.c b/drivers/virtio/virtio_pci.c
--- a/drivers/virtio/virtio_pci.c
+++ b/drivers/virtio/virtio_pci.c
@@ -121,8 +121,7 @@ static void vp_finalize_features(struct 
 	vring_transport_features(vdev);
 
 	/* We only support 32 feature bits. */
-	BUILD_BUG_ON(ARRAY_SIZE(vdev->features) != 1);
-	iowrite32(vdev->features[0], vp_dev->ioaddr+VIRTIO_PCI_GUEST_FEATURES);
+	iowrite32(vdev->features, vp_dev->ioaddr+VIRTIO_PCI_GUEST_FEATURES);
 }
 
 /* virtio config->get() implementation */
diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -685,7 +685,7 @@ void vring_transport_features(struct vir
 			break;
 		default:
 			/* We don't understand this bit. */
-			clear_bit(i, vdev->features);
+			vdev->features &= ~(1 << i);
 		}
 	}
 }
diff --git a/include/linux/virtio.h b/include/linux/virtio.h
--- a/include/linux/virtio.h
+++ b/include/linux/virtio.h
@@ -66,8 +66,7 @@ struct virtio_device {
 	struct virtio_device_id id;
 	struct virtio_config_ops *config;
 	struct list_head vqs;
-	/* Note that this is a Linux set_bit-style bitmap. */
-	unsigned long features[1];
+	u32 features;
 	void *priv;
 };
 
diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
--- a/include/linux/virtio_config.h
+++ b/include/linux/virtio_config.h
@@ -142,7 +142,7 @@ static inline bool virtio_has_feature(co
 	if (fbit < VIRTIO_TRANSPORT_F_START)
 		virtio_check_driver_offered_feature(vdev, fbit);
 
-	return test_bit(fbit, vdev->features);
+	return vdev->features & (1 << fbit);
 }
 
 /**
diff --git a/tools/virtio/linux/virtio.h b/tools/virtio/linux/virtio.h
--- a/tools/virtio/linux/virtio.h
+++ b/tools/virtio/linux/virtio.h
@@ -131,29 +131,15 @@ static inline void kfree(void *p)
 #define BITS_PER_BYTE		8
 #define BITS_PER_LONG (sizeof(long) * BITS_PER_BYTE)
 #define BIT_MASK(nr)		(1UL << ((nr) % BITS_PER_LONG))
-/* TODO: Not atomic as it should be:
- * we don't use this for anything important. */
-static inline void clear_bit(int nr, volatile unsigned long *addr)
-{
-	unsigned long mask = BIT_MASK(nr);
-	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
-
-	*p &= ~mask;
-}
-
-static inline int test_bit(int nr, const volatile unsigned long *addr)
-{
-        return 1UL & (addr[BIT_WORD(nr)] >> (nr & (BITS_PER_LONG-1)));
-}
 
 /* The only feature we care to support */
 #define virtio_has_feature(dev, feature) \
-	test_bit((feature), (dev)->features)
+	((dev)->features & (1 << (feature)))
 /* end of stubs */
 
 struct virtio_device {
 	void *dev;
-	unsigned long features[1];
+	u32 features;
 };
 
 struct virtqueue {
diff --git a/tools/virtio/virtio_test.c b/tools/virtio/virtio_test.c
--- a/tools/virtio/virtio_test.c
+++ b/tools/virtio/virtio_test.c
@@ -55,7 +55,7 @@ void vhost_vq_setup(struct vdev_info *de
 {
 	struct vhost_vring_state state = { .index = info->idx };
 	struct vhost_vring_file file = { .index = info->idx };
-	unsigned long long features = dev->vdev.features[0];
+	unsigned long long features = dev->vdev.features;
 	struct vhost_vring_addr addr = {
 		.index = info->idx,
 		.desc_user_addr = (uint64_t)(unsigned long)info->vring.desc,
@@ -106,8 +106,7 @@ static void vdev_info_init(struct vdev_i
 {
 	int r;
 	memset(dev, 0, sizeof *dev);
-	dev->vdev.features[0] = features;
-	dev->vdev.features[1] = features >> 32;
+	dev->vdev.features = features;
 	dev->buf_size = 1024;
 	dev->buf = malloc(dev->buf_size);
 	assert(dev->buf);

^ permalink raw reply	[flat|nested] 106+ messages in thread

* [RFC 2/11] virtio: add support for 64 bit features.
  2011-12-08 10:22 [PATCH 0/11] RFC: PCI using capabilities Rusty Russell
  2011-12-08 10:30 ` [RFC 1/11] virtio: use u32, not bitmap for struct virtio_device's features Rusty Russell
@ 2011-12-08 10:31 ` Rusty Russell
  2011-12-08 10:32 ` [PATCH 0/11] RFC: PCI using capabilitities Sasha Levin
                   ` (11 subsequent siblings)
  13 siblings, 0 replies; 106+ messages in thread
From: Rusty Russell @ 2011-12-08 10:31 UTC (permalink / raw)
  To: virtualization; +Cc: Sasha Levin, Michael S. Tsirkin

Change the u32 to a u64, and make sure to use 1ULL everywhere!
---
 drivers/char/virtio_console.c  |    2 +-
 drivers/lguest/lguest_device.c |   10 +++++-----
 drivers/s390/kvm/kvm_virtio.c  |   10 +++++-----
 drivers/virtio/virtio.c        |   12 ++++++------
 drivers/virtio/virtio_mmio.c   |   14 +++++++++-----
 drivers/virtio/virtio_pci.c    |    5 ++---
 drivers/virtio/virtio_ring.c   |    2 +-
 include/linux/virtio.h         |    2 +-
 include/linux/virtio_config.h  |    8 ++++----
 tools/virtio/linux/virtio.h    |    4 ++--
 10 files changed, 36 insertions(+), 33 deletions(-)

diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c
--- a/drivers/char/virtio_console.c
+++ b/drivers/char/virtio_console.c
@@ -331,7 +331,7 @@ static inline bool use_multiport(struct 
 	 */
 	if (!portdev->vdev)
 		return 0;
-	return portdev->vdev->features & (1 << VIRTIO_CONSOLE_F_MULTIPORT);
+	return portdev->vdev->features & (1ULL << VIRTIO_CONSOLE_F_MULTIPORT);
 }
 
 static void free_buf(struct port_buffer *buf)
diff --git a/drivers/lguest/lguest_device.c b/drivers/lguest/lguest_device.c
--- a/drivers/lguest/lguest_device.c
+++ b/drivers/lguest/lguest_device.c
@@ -94,17 +94,17 @@ static unsigned desc_size(const struct l
 }
 
 /* This gets the device's feature bits. */
-static u32 lg_get_features(struct virtio_device *vdev)
+static u64 lg_get_features(struct virtio_device *vdev)
 {
 	unsigned int i;
-	u32 features = 0;
+	u64 features = 0;
 	struct lguest_device_desc *desc = to_lgdev(vdev)->desc;
 	u8 *in_features = lg_features(desc);
 
 	/* We do this the slow but generic way. */
-	for (i = 0; i < min(desc->feature_len * 8, 32); i++)
+	for (i = 0; i < min(desc->feature_len * 8, 64); i++)
 		if (in_features[i / 8] & (1 << (i % 8)))
-			features |= (1 << i);
+			features |= (1ULL << i);
 
 	return features;
 }
@@ -144,7 +144,7 @@ static void lg_finalize_features(struct 
 	memset(out_features, 0, desc->feature_len);
 	bits = min_t(unsigned, desc->feature_len, sizeof(vdev->features)) * 8;
 	for (i = 0; i < bits; i++) {
-		if (vdev->features & (1 << i))
+		if (vdev->features & (1ULL << i))
 			out_features[i / 8] |= (1 << (i % 8));
 	}
 
diff --git a/drivers/s390/kvm/kvm_virtio.c b/drivers/s390/kvm/kvm_virtio.c
--- a/drivers/s390/kvm/kvm_virtio.c
+++ b/drivers/s390/kvm/kvm_virtio.c
@@ -79,16 +79,16 @@ static unsigned desc_size(const struct k
 }
 
 /* This gets the device's feature bits. */
-static u32 kvm_get_features(struct virtio_device *vdev)
+static u64 kvm_get_features(struct virtio_device *vdev)
 {
 	unsigned int i;
-	u32 features = 0;
+	u64 features = 0;
 	struct kvm_device_desc *desc = to_kvmdev(vdev)->desc;
 	u8 *in_features = kvm_vq_features(desc);
 
-	for (i = 0; i < min(desc->feature_len * 8, 32); i++)
+	for (i = 0; i < min(desc->feature_len * 8, 64); i++)
 		if (in_features[i / 8] & (1 << (i % 8)))
-			features |= (1 << i);
+			features |= (1ULL << i);
 	return features;
 }
 
@@ -105,7 +105,7 @@ static void kvm_finalize_features(struct
 	memset(out_features, 0, desc->feature_len);
 	bits = min_t(unsigned, desc->feature_len, sizeof(vdev->features)) * 8;
 	for (i = 0; i < bits; i++) {
-		if (vdev->features & (1 << i))
+		if (vdev->features & (1ULL << i))
 			out_features[i / 8] |= (1 << (i % 8));
 	}
 }
diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c
--- a/drivers/virtio/virtio.c
+++ b/drivers/virtio/virtio.c
@@ -113,7 +113,7 @@ static int virtio_dev_probe(struct devic
 	struct virtio_device *dev = container_of(_d,struct virtio_device,dev);
 	struct virtio_driver *drv = container_of(dev->dev.driver,
 						 struct virtio_driver, driver);
-	u32 device_features;
+	u64 device_features;
 
 	/* We have a driver! */
 	add_status(dev, VIRTIO_CONFIG_S_DRIVER);
@@ -125,15 +125,15 @@ static int virtio_dev_probe(struct devic
 	dev->features = 0;
 	for (i = 0; i < drv->feature_table_size; i++) {
 		unsigned int f = drv->feature_table[i];
-		BUG_ON(f >= 32);
-		if (device_features & (1 << f))
-			dev->features |= (1 << f);
+		BUG_ON(f >= 64);
+		if (device_features & (1ULL << f))
+			dev->features |= (1ULL << f);
 	}
 
 	/* Transport features always preserved to pass to finalize_features. */
 	for (i = VIRTIO_TRANSPORT_F_START; i < VIRTIO_TRANSPORT_F_END; i++)
-		if (device_features & (1 << i))
-			dev->features |= (1 << i);
+		if (device_features & (1ULL << i))
+			dev->features |= (1ULL << i);
 
 	dev->config->finalize_features(dev);
 
diff --git a/drivers/virtio/virtio_mmio.c b/drivers/virtio/virtio_mmio.c
--- a/drivers/virtio/virtio_mmio.c
+++ b/drivers/virtio/virtio_mmio.c
@@ -99,14 +99,16 @@ struct virtio_mmio_vq_info {
 
 /* Configuration interface */
 
-static u32 vm_get_features(struct virtio_device *vdev)
+static u64 vm_get_features(struct virtio_device *vdev)
 {
 	struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev);
+	u64 features;
 
-	/* TODO: Features > 32 bits */
 	writel(0, vm_dev->base + VIRTIO_MMIO_HOST_FEATURES_SEL);
-
-	return readl(vm_dev->base + VIRTIO_MMIO_HOST_FEATURES);
+	features = readl(vm_dev->base + VIRTIO_MMIO_HOST_FEATURES);
+	writel(1, vm_dev->base + VIRTIO_MMIO_HOST_FEATURES_SEL);
+	features |= ((u64)readl(vm_dev->base + VIRTIO_MMIO_HOST_FEATURES) << 32);
+	return features;
 }
 
 static void vm_finalize_features(struct virtio_device *vdev)
@@ -117,7 +119,9 @@ static void vm_finalize_features(struct 
 	vring_transport_features(vdev);
 
 	writel(0, vm_dev->base + VIRTIO_MMIO_GUEST_FEATURES_SEL);
-	writel(vdev->features, vm_dev->base + VIRTIO_MMIO_GUEST_FEATURES);
+	writel((u32)vdev->features, vm_dev->base + VIRTIO_MMIO_GUEST_FEATURES);
+	writel(1, vm_dev->base + VIRTIO_MMIO_GUEST_FEATURES_SEL);
+	writel(vdev->features >> 32, vm_dev->base + VIRTIO_MMIO_GUEST_FEATURES);
 }
 
 static void vm_get(struct virtio_device *vdev, unsigned offset,
diff --git a/drivers/virtio/virtio_pci.c b/drivers/virtio/virtio_pci.c
--- a/drivers/virtio/virtio_pci.c
+++ b/drivers/virtio/virtio_pci.c
@@ -103,12 +103,11 @@ static struct virtio_pci_device *to_vp_d
 }
 
 /* virtio config->get_features() implementation */
-static u32 vp_get_features(struct virtio_device *vdev)
+static u64 vp_get_features(struct virtio_device *vdev)
 {
 	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
 
-	/* When someone needs more than 32 feature bits, we'll need to
-	 * steal a bit to indicate that the rest are somewhere else. */
+	/* We only support 32 feature bits. */
 	return ioread32(vp_dev->ioaddr + VIRTIO_PCI_HOST_FEATURES);
 }
 
diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -685,7 +685,7 @@ void vring_transport_features(struct vir
 			break;
 		default:
 			/* We don't understand this bit. */
-			vdev->features &= ~(1 << i);
+			vdev->features &= ~(1ULL << i);
 		}
 	}
 }
diff --git a/include/linux/virtio.h b/include/linux/virtio.h
--- a/include/linux/virtio.h
+++ b/include/linux/virtio.h
@@ -66,7 +66,7 @@ struct virtio_device {
 	struct virtio_device_id id;
 	struct virtio_config_ops *config;
 	struct list_head vqs;
-	u32 features;
+	u64 features;
 	void *priv;
 };
 
diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
--- a/include/linux/virtio_config.h
+++ b/include/linux/virtio_config.h
@@ -117,7 +117,7 @@ struct virtio_config_ops {
 			vq_callback_t *callbacks[],
 			const char *names[]);
 	void (*del_vqs)(struct virtio_device *);
-	u32 (*get_features)(struct virtio_device *vdev);
+	u64 (*get_features)(struct virtio_device *vdev);
 	void (*finalize_features)(struct virtio_device *vdev);
 };
 
@@ -135,14 +135,14 @@ static inline bool virtio_has_feature(co
 {
 	/* Did you forget to fix assumptions on max features? */
 	if (__builtin_constant_p(fbit))
-		BUILD_BUG_ON(fbit >= 32);
+		BUILD_BUG_ON(fbit >= 64);
 	else
-		BUG_ON(fbit >= 32);
+		BUG_ON(fbit >= 64);
 
 	if (fbit < VIRTIO_TRANSPORT_F_START)
 		virtio_check_driver_offered_feature(vdev, fbit);
 
-	return vdev->features & (1 << fbit);
+	return vdev->features & (1ULL << fbit);
 }
 
 /**
diff --git a/tools/virtio/linux/virtio.h b/tools/virtio/linux/virtio.h
--- a/tools/virtio/linux/virtio.h
+++ b/tools/virtio/linux/virtio.h
@@ -134,12 +134,12 @@ static inline void kfree(void *p)
 
 /* The only feature we care to support */
 #define virtio_has_feature(dev, feature) \
-	((dev)->features & (1 << (feature)))
+	((dev)->features & (1ULL << (feature)))
 /* end of stubs */
 
 struct virtio_device {
 	void *dev;
-	u32 features;
+	u64 features;
 };
 
 struct virtqueue {

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 0/11] RFC: PCI using capabilities
  2011-12-08 10:22 [PATCH 0/11] RFC: PCI using capabilities Rusty Russell
  2011-12-08 10:30 ` [RFC 1/11] virtio: use u32, not bitmap for struct virtio_device's features Rusty Russell
  2011-12-08 10:31 ` [RFC 2/11] virtio: add support for 64 bit features Rusty Russell
@ 2011-12-08 10:32 ` Sasha Levin
  2011-12-08 10:32 ` [RFC 3/11] pci: add pci_iomap_range Rusty Russell
                   ` (10 subsequent siblings)
  13 siblings, 0 replies; 106+ messages in thread
From: Sasha Levin @ 2011-12-08 10:32 UTC (permalink / raw)
  To: Rusty Russell; +Cc: Michael S. Tsirkin, Avi Kivity, kvm, virtualization

Rusty, I can't find the actual patches, could you verify that they were
indeed sent?

On Thu, 2011-12-08 at 20:52 +1030, Rusty Russell wrote:
> Here's the patch series I ended up with.  I haven't coded up the QEMU
> side yet, so no idea if the new driver works.
> 
> Questions:
> (1) Do we win from separating ISR, NOTIFY and COMMON?

By separating ISR, NOTIFY and COMMON we can place ISR and NOTIFY in PIO
and COMMON in MMIO. This gives us the benefit of having the small data
path use fast PIO, while big config path can use MMIO.

> (2) I used a "u8 bar"; should I use a bir and pack it instead?  BIR
>     seems a little obscure (no one else in the kernel source seems to
>     refer to it).

BIR is a concept from the PCI spec, but it was only used for MSI-X. I
don't expect to see it all around the kernel source.

-- 

Sasha.

^ permalink raw reply	[flat|nested] 106+ messages in thread

* [RFC 3/11] pci: add pci_iomap_range
  2011-12-08 10:22 [PATCH 0/11] RFC: PCI using capabilities Rusty Russell
                   ` (2 preceding siblings ...)
  2011-12-08 10:32 ` [PATCH 0/11] RFC: PCI using capabilities Sasha Levin
@ 2011-12-08 10:32 ` Rusty Russell
  2011-12-15  8:30   ` Michael S. Tsirkin
  2011-12-08 10:34 ` [RFC 4/11] virtio-pci: define layout for virtio vendor-specific capabilities Rusty Russell
                   ` (9 subsequent siblings)
  13 siblings, 1 reply; 106+ messages in thread
From: Rusty Russell @ 2011-12-08 10:32 UTC (permalink / raw)
  To: virtualization; +Cc: Sasha Levin, Michael S. Tsirkin

From: Michael S Tsirkin <mst@redhat.com>

Virtio drivers should map the part of the range they need, not necessarily
all of it.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 include/asm-generic/io.h    |    4 ++++
 include/asm-generic/iomap.h |   11 +++++++++++
 lib/iomap.c                 |   41 ++++++++++++++++++++++++++++++++++++-----
 3 files changed, 51 insertions(+), 5 deletions(-)

diff --git a/include/asm-generic/io.h b/include/asm-generic/io.h
index 9120887..3cf1787 100644
--- a/include/asm-generic/io.h
+++ b/include/asm-generic/io.h
@@ -286,6 +286,10 @@ static inline void writesb(const void __iomem *addr, const void *buf, int len)
 /* Create a virtual mapping cookie for a PCI BAR (memory or IO) */
 struct pci_dev;
 extern void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long max);
+extern void __iomem *pci_iomap_range(struct pci_dev *dev, int bar,
+				     unsigned offset,
+				     unsigned long minlen,
+				     unsigned long maxlen);
 static inline void pci_iounmap(struct pci_dev *dev, void __iomem *p)
 {
 }
diff --git a/include/asm-generic/iomap.h b/include/asm-generic/iomap.h
index 98dcd76..6f192d4 100644
--- a/include/asm-generic/iomap.h
+++ b/include/asm-generic/iomap.h
@@ -70,8 +70,19 @@ extern void ioport_unmap(void __iomem *);
 /* Create a virtual mapping cookie for a PCI BAR (memory or IO) */
 struct pci_dev;
 extern void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long max);
+extern void __iomem *pci_iomap_range(struct pci_dev *dev, int bar,
+				     unsigned offset,
+				     unsigned long minlen,
+				     unsigned long maxlen);
 extern void pci_iounmap(struct pci_dev *dev, void __iomem *);
 #else
+static inline void __iomem *pci_iomap_range(struct pci_dev *dev, int bar,
+					    unsigned offset,
+					    unsigned long minlen,
+					    unsigned long maxlen)
+{
+	return NULL;
+}
 struct pci_dev;
 static inline void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long max)
 {
diff --git a/lib/iomap.c b/lib/iomap.c
index 5dbcb4b..93ae915 100644
--- a/lib/iomap.c
+++ b/lib/iomap.c
@@ -243,26 +243,37 @@ EXPORT_SYMBOL(ioport_unmap);
 
 #ifdef CONFIG_PCI
 /**
- * pci_iomap - create a virtual mapping cookie for a PCI BAR
+ * pci_iomap_range - create a virtual mapping cookie for a PCI BAR
  * @dev: PCI device that owns the BAR
  * @bar: BAR number
- * @maxlen: length of the memory to map
+ * @offset: map memory at the given offset in BAR
+ * @minlen: min length of the memory to map
+ * @maxlen: max length of the memory to map
  *
  * Using this function you will get a __iomem address to your device BAR.
  * You can access it using ioread*() and iowrite*(). These functions hide
  * the details if this is a MMIO or PIO address space and will just do what
  * you expect from them in the correct way.
  *
+ * @minlen specifies the minimum length to map. We check that BAR is
+ * large enough.
  * @maxlen specifies the maximum length to map. If you want to get access to
- * the complete BAR without checking for its length first, pass %0 here.
+ * the complete BAR from offset to the end, pass %0 here.
  * */
-void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long maxlen)
+void __iomem *pci_iomap_range(struct pci_dev *dev, int bar,
+			      unsigned offset,
+			      unsigned long minlen,
+			      unsigned long maxlen)
 {
 	resource_size_t start = pci_resource_start(dev, bar);
 	resource_size_t len = pci_resource_len(dev, bar);
 	unsigned long flags = pci_resource_flags(dev, bar);
 
-	if (!len || !start)
+	if (len <= offset || !start)
+		return NULL;
+	len -= offset;
+	start += offset;
+	if (len < minlen)
 		return NULL;
 	if (maxlen && len > maxlen)
 		len = maxlen;
@@ -277,10 +288,30 @@ void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long maxlen)
 	return NULL;
 }
 
+/**
+ * pci_iomap - create a virtual mapping cookie for a PCI BAR
+ * @dev: PCI device that owns the BAR
+ * @bar: BAR number
+ * @maxlen: length of the memory to map
+ *
+ * Using this function you will get a __iomem address to your device BAR.
+ * You can access it using ioread*() and iowrite*(). These functions hide
+ * the details if this is a MMIO or PIO address space and will just do what
+ * you expect from them in the correct way.
+ *
+ * @maxlen specifies the maximum length to map. If you want to get access to
+ * the complete BAR without checking for its length first, pass %0 here.
+ * */
+void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long maxlen)
+{
+	return pci_iomap_range(dev, bar, 0, 0, maxlen);
+}
+
 void pci_iounmap(struct pci_dev *dev, void __iomem * addr)
 {
 	IO_COND(addr, /* nothing */, iounmap(addr));
 }
 EXPORT_SYMBOL(pci_iomap);
+EXPORT_SYMBOL(pci_iomap_range);
 EXPORT_SYMBOL(pci_iounmap);
 #endif /* CONFIG_PCI */

^ permalink raw reply related	[flat|nested] 106+ messages in thread

* [RFC 4/11] virtio-pci: define layout for virtio vendor-specific capabilities.
  2011-12-08 10:22 [PATCH 0/11] RFC: PCI using capabilities Rusty Russell
                   ` (3 preceding siblings ...)
  2011-12-08 10:32 ` [RFC 3/11] pci: add pci_iomap_range Rusty Russell
@ 2011-12-08 10:34 ` Rusty Russell
  2011-12-10 21:14   ` Sasha Levin
  2011-12-08 10:35 ` [RFC 6/11] virtio_pci: move old defines to legacy, introduce new structure Rusty Russell
                   ` (8 subsequent siblings)
  13 siblings, 1 reply; 106+ messages in thread
From: Rusty Russell @ 2011-12-08 10:34 UTC (permalink / raw)
  To: virtualization; +Cc: Sasha Levin, Michael S. Tsirkin

Based on patch by Michael S. Tsirkin <mst@redhat.com>, but I found it
hard to follow so changed to use structures which are more
self-documenting.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 include/linux/virtio_pci.h |   41 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)

diff --git a/include/linux/virtio_pci.h b/include/linux/virtio_pci.h
--- a/include/linux/virtio_pci.h
+++ b/include/linux/virtio_pci.h
@@ -92,4 +92,45 @@
 /* The alignment to use between consumer and producer parts of vring.
  * x86 pagesize again. */
 #define VIRTIO_PCI_VRING_ALIGN		4096
+
+/* IDs for different capabilities.  Must all exist. */
+/* FIXME: Do we win from separating ISR, NOTIFY and COMMON? */
+/* Common configuration */
+#define VIRTIO_PCI_CAP_COMMON_CFG	1
+/* Notifications */
+#define VIRTIO_PCI_CAP_NOTIFY_CFG	2
+/* ISR access */
+#define VIRTIO_PCI_CAP_ISR_CFG		3
+/* Device specific configuration */
+#define VIRTIO_PCI_CAP_DEVICE_CFG	4
+
+/* This is the PCI capability header: */
+struct virtio_pci_cap {
+	u8 cap_vndr;	/* Generic PCI field: PCI_CAP_ID_VNDR */
+	u8 cap_next;	/* Generic PCI field: next ptr. */
+	u8 cfg_type;	/* One of the VIRTIO_PCI_CAP_*_CFG. */
+/* FIXME: Should we use a bir, instead of raw bar number? */
+	u8 bar;		/* Where to find it. */
+	__le32 offset;	/* Offset within bar. */
+	__le32 length;	/* Length. */
+};
+
+/* Fields in VIRTIO_PCI_CAP_COMMON_CFG: */
+struct virtio_pci_common_cfg {
+	/* About the whole device. */
+	__le32 device_feature_select;	/* read-write */
+	__le32 device_feature;		/* read-only */
+	__le32 guest_feature_select;	/* read-write */
+	__le32 guest_feature;		/* read-only */
+	__le16 msix_config;		/* read-write */
+	__u8 device_status;		/* read-write */
+	__u8 unused;
+
+	/* About a specific virtqueue. */
+	__le16 queue_select;	/* read-write */
+	__le16 queue_align;	/* read-write, power of 2. */
+	__le16 queue_size;	/* read-write, power of 2. */
+	__le16 queue_msix_vector;/* read-write */
+	__le64 queue_address;	/* read-write: 0xFFFFFFFFFFFFFFFF == DNE. */
+};
 #endif

^ permalink raw reply	[flat|nested] 106+ messages in thread

* [RFC 6/11] virtio_pci: move old defines to legacy, introduce new structure.
  2011-12-08 10:22 [PATCH 0/11] RFC: PCI using capabilities Rusty Russell
                   ` (4 preceding siblings ...)
  2011-12-08 10:34 ` [RFC 4/11] virtio-pci: define layout for virtio vendor-specific capabilities Rusty Russell
@ 2011-12-08 10:35 ` Rusty Russell
  2011-12-08 10:38 ` [RFC 6/11] virtio_pci: don't use the legacy driver if we find the new PCI capabilities Rusty Russell
                   ` (7 subsequent siblings)
  13 siblings, 0 replies; 106+ messages in thread
From: Rusty Russell @ 2011-12-08 10:35 UTC (permalink / raw)
  To: virtualization; +Cc: Sasha Levin, Michael S. Tsirkin

We don't *remove* the old ones, unless VIRTIO_PCI_NO_LEGACY is defined,
but they get a friendly #warning about the change.

Note that the config option is not prompted; we always enable it for now.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 drivers/virtio/Kconfig             |   12 +++++++
 drivers/virtio/Makefile            |    2 -
 drivers/virtio/virtio_pci_legacy.c |   16 ++++-----
 include/linux/virtio_pci.h         |   63 ++++++++++++++++++++++++-------------
 4 files changed, 62 insertions(+), 31 deletions(-)

diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig
--- a/drivers/virtio/Kconfig
+++ b/drivers/virtio/Kconfig
@@ -25,6 +25,18 @@ config VIRTIO_PCI
 
 	  If unsure, say M.
 
+config VIRTIO_PCI_LEGACY
+	bool
+	default y
+	depends on VIRTIO_PCI
+	---help---
+	  The old BAR0 virtio pci layout was deprecated early 2012.
+
+	  So look out into your driveway.  Do you have a flying car?  If
+	  so, you can happily disable this option and virtio will not
+	  break.  Otherwise, leave it set.  Unless you're testing what
+	  life will be like in The Future.
+
 config VIRTIO_BALLOON
 	tristate "Virtio balloon driver (EXPERIMENTAL)"
 	select VIRTIO
diff --git a/drivers/virtio/Makefile b/drivers/virtio/Makefile
--- a/drivers/virtio/Makefile
+++ b/drivers/virtio/Makefile
@@ -1,5 +1,5 @@
 obj-$(CONFIG_VIRTIO) += virtio.o
 obj-$(CONFIG_VIRTIO_RING) += virtio_ring.o
 obj-$(CONFIG_VIRTIO_MMIO) += virtio_mmio.o
-obj-$(CONFIG_VIRTIO_PCI) += virtio_pci.o
+obj-$(CONFIG_VIRTIO_PCI_LEGACY) += virtio_pci_legacy.o
 obj-$(CONFIG_VIRTIO_BALLOON) += virtio_balloon.o
diff --git a/drivers/virtio/virtio_pci.c b/drivers/virtio/virtio_pci_legacy.c
rename from drivers/virtio/virtio_pci.c
rename to drivers/virtio/virtio_pci_legacy.c
--- a/drivers/virtio/virtio_pci.c
+++ b/drivers/virtio/virtio_pci_legacy.c
@@ -1,5 +1,5 @@
 /*
- * Virtio PCI driver
+ * Virtio PCI driver (legacy mode)
  *
  * This module allows virtio devices to be used over a virtual PCI device.
  * This can be used with QEMU based VMMs like KVM or Xen.
@@ -27,7 +27,7 @@
 #include <linux/spinlock.h>
 
 MODULE_AUTHOR("Anthony Liguori <aliguori@us.ibm.com>");
-MODULE_DESCRIPTION("virtio-pci");
+MODULE_DESCRIPTION("virtio-pci-legacy");
 MODULE_LICENSE("GPL");
 MODULE_VERSION("1");
 
@@ -629,7 +629,7 @@ static int __devinit virtio_pci_probe(st
 		return -ENODEV;
 
 	if (pci_dev->revision != VIRTIO_PCI_ABI_VERSION) {
-		printk(KERN_ERR "virtio_pci: expected ABI version %d, got %d\n",
+		printk(KERN_ERR "virtio_pci_legacy: expected ABI version %d, got %d\n",
 		       VIRTIO_PCI_ABI_VERSION, pci_dev->revision);
 		return -ENODEV;
 	}
@@ -654,7 +654,7 @@ static int __devinit virtio_pci_probe(st
 	if (err)
 		goto out;
 
-	err = pci_request_regions(pci_dev, "virtio-pci");
+	err = pci_request_regions(pci_dev, "virtio-pci-legacy");
 	if (err)
 		goto out_enable_device;
 
@@ -721,8 +721,8 @@ static int virtio_pci_resume(struct pci_
 }
 #endif
 
-static struct pci_driver virtio_pci_driver = {
-	.name		= "virtio-pci",
+static struct pci_driver virtio_pci_driver_legacy = {
+	.name		= "virtio-pci-legacy",
 	.id_table	= virtio_pci_id_table,
 	.probe		= virtio_pci_probe,
 	.remove		= __devexit_p(virtio_pci_remove),
@@ -734,14 +734,14 @@ static struct pci_driver virtio_pci_driv
 
 static int __init virtio_pci_init(void)
 {
-	return pci_register_driver(&virtio_pci_driver);
+	return pci_register_driver(&virtio_pci_driver_legacy);
 }
 
 module_init(virtio_pci_init);
 
 static void __exit virtio_pci_exit(void)
 {
-	pci_unregister_driver(&virtio_pci_driver);
+	pci_unregister_driver(&virtio_pci_driver_legacy);
 }
 
 module_exit(virtio_pci_exit);
diff --git a/include/linux/virtio_pci.h b/include/linux/virtio_pci.h
--- a/include/linux/virtio_pci.h
+++ b/include/linux/virtio_pci.h
@@ -42,56 +42,75 @@
 #include <linux/virtio_config.h>
 
 /* A 32-bit r/o bitmask of the features supported by the host */
-#define VIRTIO_PCI_HOST_FEATURES	0
+#define VIRTIO_PCI_LEGACY_HOST_FEATURES		0
 
 /* A 32-bit r/w bitmask of features activated by the guest */
-#define VIRTIO_PCI_GUEST_FEATURES	4
+#define VIRTIO_PCI_LEGACY_GUEST_FEATURES	4
 
 /* A 32-bit r/w PFN for the currently selected queue */
-#define VIRTIO_PCI_QUEUE_PFN		8
+#define VIRTIO_PCI_LEGACY_QUEUE_PFN		8
 
 /* A 16-bit r/o queue size for the currently selected queue */
-#define VIRTIO_PCI_QUEUE_NUM		12
+#define VIRTIO_PCI_LEGACY_QUEUE_NUM		12
 
 /* A 16-bit r/w queue selector */
-#define VIRTIO_PCI_QUEUE_SEL		14
+#define VIRTIO_PCI_LEGACY_QUEUE_SEL		14
 
 /* A 16-bit r/w queue notifier */
-#define VIRTIO_PCI_QUEUE_NOTIFY		16
+#define VIRTIO_PCI_LEGACY_QUEUE_NOTIFY		16
 
 /* An 8-bit device status register.  */
-#define VIRTIO_PCI_STATUS		18
+#define VIRTIO_PCI_LEGACY_STATUS		18
 
 /* An 8-bit r/o interrupt status register.  Reading the value will return the
  * current contents of the ISR and will also clear it.  This is effectively
  * a read-and-acknowledge. */
-#define VIRTIO_PCI_ISR			19
-
-/* The bit of the ISR which indicates a device configuration change. */
-#define VIRTIO_PCI_ISR_CONFIG		0x2
+#define VIRTIO_PCI_LEGACY_ISR			19
 
 /* MSI-X registers: only enabled if MSI-X is enabled. */
 /* A 16-bit vector for configuration changes. */
-#define VIRTIO_MSI_CONFIG_VECTOR        20
+#define VIRTIO_MSI_LEGACY_CONFIG_VECTOR        20
 /* A 16-bit vector for selected queue notifications. */
-#define VIRTIO_MSI_QUEUE_VECTOR         22
-/* Vector value used to disable MSI for queue */
-#define VIRTIO_MSI_NO_VECTOR            0xffff
+#define VIRTIO_MSI_LEGACY_QUEUE_VECTOR         22
 
 /* The remaining space is defined by each driver as the per-driver
  * configuration space */
-#define VIRTIO_PCI_CONFIG(dev)		((dev)->msix_enabled ? 24 : 20)
+#define VIRTIO_PCI_LEGACY_CONFIG(dev)		((dev)->msix_enabled ? 24 : 20)
+
+/* How many bits to shift physical queue address written to QUEUE_PFN.
+ * 12 is historical, and due to x86 page size. */
+#define VIRTIO_PCI_LEGACY_QUEUE_ADDR_SHIFT	12
+
+/* The alignment to use between consumer and producer parts of vring.
+ * x86 pagesize again. */
+#define VIRTIO_PCI_LEGACY_VRING_ALIGN		4096
+
+#ifndef VIRTIO_PCI_NO_LEGACY
+/* Don't break compile of old userspace code.  These will go away. */
+#warning "Please support virtio_pci non-legacy mode!"
+#define VIRTIO_PCI_HOST_FEATURES VIRTIO_PCI_LEGACY_HOST_FEATURES
+#define VIRTIO_PCI_GUEST_FEATURES VIRTIO_PCI_LEGACY_GUEST_FEATURES
+#define VIRTIO_PCI_QUEUE_PFN VIRTIO_PCI_LEGACY_QUEUE_PFN
+#define VIRTIO_PCI_QUEUE_NUM VIRTIO_PCI_LEGACY_QUEUE_NUM
+#define VIRTIO_PCI_QUEUE_SEL VIRTIO_PCI_LEGACY_QUEUE_SEL
+#define VIRTIO_PCI_QUEUE_NOTIFY VIRTIO_PCI_LEGACY_QUEUE_NOTIFY
+#define VIRTIO_PCI_STATUS VIRTIO_PCI_LEGACY_STATUS
+#define VIRTIO_PCI_ISR VIRTIO_PCI_LEGACY_ISR
+#define VIRTIO_MSI_CONFIG_VECTOR VIRTIO_MSI_LEGACY_CONFIG_VECTOR
+#define VIRTIO_MSI_QUEUE_VECTOR VIRTIO_MSI_LEGACY_QUEUE_VECTOR
+#define VIRTIO_PCI_CONFIG(dev) VIRTIO_PCI_LEGACY_CONFIG(dev)
+#define VIRTIO_PCI_QUEUE_ADDR_SHIFT VIRTIO_PCI_LEGACY_QUEUE_ADDR_SHIFT
+#define VIRTIO_PCI_VRING_ALIGN VIRTIO_PCI_LEGACY_VRING_ALIGN
+#endif /* ...!KERNEL */
 
 /* Virtio ABI version, this must match exactly */
 #define VIRTIO_PCI_ABI_VERSION		0
 
-/* How many bits to shift physical queue address written to QUEUE_PFN.
- * 12 is historical, and due to x86 page size. */
-#define VIRTIO_PCI_QUEUE_ADDR_SHIFT	12
+/* Vector value used to disable MSI for queue */
+#define VIRTIO_MSI_NO_VECTOR            0xffff
 
-/* The alignment to use between consumer and producer parts of vring.
- * x86 pagesize again. */
-#define VIRTIO_PCI_VRING_ALIGN		4096
+/* The bit of the ISR which indicates a device configuration change. */
+#define VIRTIO_PCI_ISR_CONFIG		0x2
 
 /* IDs for different capabilities.  Must all exist. */
 /* FIXME: Do we win from separating ISR, NOTIFY and COMMON? */

^ permalink raw reply	[flat|nested] 106+ messages in thread

* [RFC 6/11] virtio_pci: don't use the legacy driver if we find the new PCI capabilities.
  2011-12-08 10:22 [PATCH 0/11] RFC: PCI using capabilitities Rusty Russell
                   ` (5 preceding siblings ...)
  2011-12-08 10:35 ` [RFC 6/11] virtio_pci: move old defines to legacy, introduce new structure Rusty Russell
@ 2011-12-08 10:38 ` Rusty Russell
  2011-12-10 21:18   ` Sasha Levin
  2011-12-08 10:39 ` [RFC 7/11] virtio_pci: new, capability-aware driver Rusty Russell
                   ` (6 subsequent siblings)
  13 siblings, 1 reply; 106+ messages in thread
From: Rusty Russell @ 2011-12-08 10:38 UTC (permalink / raw)
  To: virtualization; +Cc: Sasha Levin, Michael S. Tsirkin


With module option to override.  I assume I can call
pci_find_capability() before pci_request_regions?
---
 drivers/virtio/virtio_pci_legacy.c |   20 +++++++++++++++++++-
 include/linux/virtio_pci.h         |   19 +++++++++++++++++++
 2 files changed, 38 insertions(+), 1 deletion(-)

diff --git a/drivers/virtio/virtio_pci_legacy.c b/drivers/virtio/virtio_pci_legacy.c
--- a/drivers/virtio/virtio_pci_legacy.c
+++ b/drivers/virtio/virtio_pci_legacy.c
@@ -26,6 +26,10 @@
 #include <linux/highmem.h>
 #include <linux/spinlock.h>
 
+static bool force_nonlegacy;
+module_param(force_nonlegacy, bool, 0644);
+MODULE_PARM_DESC(force_nonlegacy, "Take over non-legacy virtio devices too");
+
 MODULE_AUTHOR("Anthony Liguori <aliguori@us.ibm.com>");
 MODULE_DESCRIPTION("virtio-pci-legacy");
 MODULE_LICENSE("GPL");
@@ -622,7 +626,7 @@ static int __devinit virtio_pci_probe(st
 				      const struct pci_device_id *id)
 {
 	struct virtio_pci_device *vp_dev;
-	int err;
+	int err, cap;
 
 	/* We only own devices >= 0x1000 and <= 0x103f: leave the rest. */
 	if (pci_dev->device < 0x1000 || pci_dev->device > 0x103f)
@@ -654,6 +658,21 @@ static int __devinit virtio_pci_probe(st
 	if (err)
 		goto out;
 
+	/* We leave modern virtio-pci for the modern driver. */
+	cap = virtio_pci_find_capability(pci_dev, VIRTIO_PCI_CAP_COMMON_CFG);
+	if (cap) {
+		if (force_nonlegacy)
+			dev_info(&pci_dev->dev,
+				 "virtio_pci_legacy: forcing legacy mode!\n");
+		else {
+			dev_info(&pci_dev->dev,
+				 "virtio_pci_legacy: leaving to"
+				 " non-legacy driver\n");
+			err = -ENODEV;
+			goto out_enable_device;
+		}
+	}
+
 	err = pci_request_regions(pci_dev, "virtio-pci-legacy");
 	if (err)
 		goto out_enable_device;
diff --git a/include/linux/virtio_pci.h b/include/linux/virtio_pci.h
--- a/include/linux/virtio_pci.h
+++ b/include/linux/virtio_pci.h
@@ -152,4 +152,23 @@ struct virtio_pci_common_cfg {
 	__le16 queue_msix_vector;/* read-write */
 	__le64 queue_address;	/* read-write: 0xFFFFFFFFFFFFFFFF == DNE. */
 };
+
+#ifdef __KERNEL__
+/* Returns offset of the capability, or 0. */
+static inline int virtio_pci_find_capability(struct pci_dev *dev, u8 cfg_type)
+{
+	int pos;
+
+	for (pos = pci_find_capability(dev, PCI_CAP_ID_VNDR);
+	     pos > 0;
+	     pos = pci_find_next_capability(dev, pos, PCI_CAP_ID_VNDR)) {
+		u8 type;
+		pci_read_config_byte(dev, pos + offsetof(struct virtio_pci_cap,
+							 cfg_type), &type);
+		if (type == cfg_type)
+			return pos;
+	}
+	return 0;
+}
+#endif /* __KERNEL__ */
 #endif

^ permalink raw reply	[flat|nested] 106+ messages in thread

* [RFC 7/11] virtio_pci: new, capability-aware driver.
  2011-12-08 10:22 [PATCH 0/11] RFC: PCI using capabilitities Rusty Russell
                   ` (6 preceding siblings ...)
  2011-12-08 10:38 ` [RFC 6/11] virtio_pci: don't use the legacy driver if we find the new PCI capabilities Rusty Russell
@ 2011-12-08 10:39 ` Rusty Russell
  2011-12-11  9:42   ` Michael S. Tsirkin
  2011-12-08 10:40 ` [RFC 8/11] virtio_pci: share structure between legacy and modern Rusty Russell
                   ` (5 subsequent siblings)
  13 siblings, 1 reply; 106+ messages in thread
From: Rusty Russell @ 2011-12-08 10:39 UTC (permalink / raw)
  To: virtualization; +Cc: Sasha Levin, Michael S. Tsirkin

Differences:
1) Uses 4 pci capabilities to demark common, irq, notify and dev-specific areas.
2) Guest sets queue size, using host-provided maximum.
3) Guest sets queue alignment, rather than ABI-defined 4096.
4) More than 32 feature bits (a lot more!).
---
 drivers/virtio/Makefile     |    1
 drivers/virtio/virtio_pci.c |  868 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 869 insertions(+)

diff --git a/drivers/virtio/Makefile b/drivers/virtio/Makefile
--- a/drivers/virtio/Makefile
+++ b/drivers/virtio/Makefile
@@ -1,5 +1,6 @@
 obj-$(CONFIG_VIRTIO) += virtio.o
 obj-$(CONFIG_VIRTIO_RING) += virtio_ring.o
 obj-$(CONFIG_VIRTIO_MMIO) += virtio_mmio.o
+obj-$(CONFIG_VIRTIO_PCI) += virtio_pci.o
 obj-$(CONFIG_VIRTIO_PCI_LEGACY) += virtio_pci_legacy.o
 obj-$(CONFIG_VIRTIO_BALLOON) += virtio_balloon.o
diff --git a/drivers/virtio/virtio_pci.c b/drivers/virtio/virtio_pci.c
new file mode 100644
--- /dev/null
+++ b/drivers/virtio/virtio_pci.c
@@ -0,0 +1,869 @@
+/*
+ * Virtio PCI driver
+ *
+ * This module allows virtio devices to be used over a virtual PCI
+ * device.  Copyright 2011, Rusty Russell IBM Corporation, but based
+ * on the older virtio_pci_legacy.c, which was Copyright IBM
+ * Corp. 2007.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+#define VIRTIO_PCI_NO_LEGACY
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/pci.h>
+#include <linux/slab.h>
+#include <linux/interrupt.h>
+#include <linux/virtio.h>
+#include <linux/virtio_config.h>
+#include <linux/virtio_ring.h>
+#include <linux/virtio_pci.h>
+#include <linux/highmem.h>
+#include <linux/spinlock.h>
+
+MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>");
+MODULE_DESCRIPTION("virtio-pci");
+MODULE_LICENSE("GPL");
+MODULE_VERSION("2");
+
+/* Use cacheline size as a good guess at a nice alignment. */
+#define VIRTIO_PCI_ALIGN	SMP_CACHE_BYTES
+
+/* Our device structure */
+struct virtio_pci_device
+{
+	struct virtio_device vdev;
+	struct pci_dev *pci_dev;
+
+	/* The IO mapping for the PCI config space */
+	struct virtio_pci_common_cfg __iomem *common;
+	/* Where to read and clear interrupt */ 
+	u8 __iomem *isr;
+	/* Write the virtqueue index here to notify device of activity. */
+	__le16 __iomem *notify;
+	/* Device-specific data. */
+	void __iomem *device;
+
+	/* a list of queues so we can dispatch IRQs */
+	spinlock_t lock;
+	struct list_head virtqueues;
+
+	/* MSI-X support */
+	int msix_enabled;
+	int intx_enabled;
+	struct msix_entry *msix_entries;
+	/* Name strings for interrupts. This size should be enough,
+	 * and I'm too lazy to allocate each name separately. */
+	char (*msix_names)[256];
+	/* Number of available vectors */
+	unsigned msix_vectors;
+	/* Vectors allocated, excluding per-vq vectors if any */
+	unsigned msix_used_vectors;
+	/* Whether we have vector per vq */
+	bool per_vq_vectors;
+};
+
+/* Constants for MSI-X */
+/* Use first vector for configuration changes, second and the rest for
+ * virtqueues Thus, we need at least 2 vectors for MSI. */
+enum {
+	VP_MSIX_CONFIG_VECTOR = 0,
+	VP_MSIX_VQ_VECTOR = 1,
+};
+
+struct virtio_pci_vq_info
+{
+	/* the actual virtqueue */
+	struct virtqueue *vq;
+
+	/* the number of entries in the queue */
+	int num;
+
+	/* the index of the queue */
+	int queue_index;
+
+	/* the virtual address of the ring queue */
+	void *queue;
+
+	/* the list node for the virtqueues list */
+	struct list_head node;
+
+	/* MSI-X vector (or none) */
+	unsigned msix_vector;
+};
+
+/* Qumranet donated their vendor ID for devices 0x1000 thru 0x10FF. */
+static struct pci_device_id virtio_pci_id_table[] = {
+	{ 0x1af4, PCI_ANY_ID, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0 },
+	{ 0 },
+};
+
+MODULE_DEVICE_TABLE(pci, virtio_pci_id_table);
+
+/* Convert a generic virtio device to our structure */
+static struct virtio_pci_device *to_vp_device(struct virtio_device *vdev)
+{
+	return container_of(vdev, struct virtio_pci_device, vdev);
+}
+
+/* There is no iowrite64.  We use two 32-bit ops. */
+static void iowrite64(u64 val, __le64 __iomem *addr)
+{
+	iowrite32((u32)val, (__le32 __iomem *)addr);
+	iowrite32(val >> 32, (__le32 __iomem *)addr + 1);
+}
+
+/* There is no ioread64.  We use two 32-bit ops. */
+static u64 ioread64(__le64 __iomem *addr)
+{
+	return ioread32(addr) | ((u64)ioread32((__le32 __iomem *)addr + 1) << 32);
+}
+
+static u64 vp_get_features(struct virtio_device *vdev)
+{
+	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
+	u64 features;
+
+	iowrite32(0, &vp_dev->common->device_feature_select);
+	features = ioread32(&vp_dev->common->device_feature);
+	iowrite32(1, &vp_dev->common->device_feature_select);
+	features |= ((u64)ioread32(&vp_dev->common->device_feature) << 32);
+	return features;
+}
+
+static void vp_finalize_features(struct virtio_device *vdev)
+{
+	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
+
+	/* Give virtio_ring a chance to accept features. */
+	vring_transport_features(vdev);
+
+	iowrite32(0, &vp_dev->common->guest_feature_select);
+	iowrite32((u32)vdev->features, &vp_dev->common->guest_feature);
+	iowrite32(1, &vp_dev->common->guest_feature_select);
+	iowrite32(vdev->features >> 32, &vp_dev->common->guest_feature);
+}
+
+/* virtio config->get() implementation */
+static void vp_get(struct virtio_device *vdev, unsigned offset,
+		   void *buf, unsigned len)
+{
+	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
+	void __iomem *ioaddr = vp_dev->device + offset;
+	u8 *ptr = buf;
+	int i;
+
+	for (i = 0; i < len; i++)
+		ptr[i] = ioread8(ioaddr + i);
+}
+
+/* the config->set() implementation.  it's symmetric to the config->get()
+ * implementation */
+static void vp_set(struct virtio_device *vdev, unsigned offset,
+		   const void *buf, unsigned len)
+{
+	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
+	void __iomem *ioaddr = vp_dev->device + offset;
+	const u8 *ptr = buf;
+	int i;
+
+	for (i = 0; i < len; i++)
+		iowrite8(ptr[i], ioaddr + i);
+}
+
+/* config->{get,set}_status() implementations */
+static u8 vp_get_status(struct virtio_device *vdev)
+{
+	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
+	return ioread8(&vp_dev->common->device_status);
+}
+
+static void vp_set_status(struct virtio_device *vdev, u8 status)
+{
+	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
+	/* We should never be setting status to 0. */
+	BUG_ON(status == 0);
+	iowrite8(status, &vp_dev->common->device_status);
+}
+
+/* wait for pending irq handlers */
+static void vp_synchronize_vectors(struct virtio_device *vdev)
+{
+	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
+	int i;
+
+	if (vp_dev->intx_enabled)
+		synchronize_irq(vp_dev->pci_dev->irq);
+
+	for (i = 0; i < vp_dev->msix_vectors; ++i)
+		synchronize_irq(vp_dev->msix_entries[i].vector);
+}
+
+static void vp_reset(struct virtio_device *vdev)
+{
+	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
+	/* 0 status means a reset. */
+	iowrite8(0, &vp_dev->common->device_status);
+	/* Flush out the status write, and flush in device writes,
+	 * including MSi-X interrupts, if any. */
+	ioread8(&vp_dev->common->device_status);
+	/* Flush pending VQ/configuration callbacks. */
+	vp_synchronize_vectors(vdev);
+}
+
+/* the notify function used when creating a virt queue */
+static void vp_notify(struct virtqueue *vq)
+{
+	struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev);
+	struct virtio_pci_vq_info *info = vq->priv;
+
+	/* we write the queue's selector into the notification register to
+	 * signal the other end */
+	iowrite16(info->queue_index, vp_dev->notify);
+}
+
+/* Handle a configuration change: Tell driver if it wants to know. */
+static irqreturn_t vp_config_changed(int irq, void *opaque)
+{
+	struct virtio_pci_device *vp_dev = opaque;
+	struct virtio_driver *drv;
+	drv = container_of(vp_dev->vdev.dev.driver,
+			   struct virtio_driver, driver);
+
+	if (drv->config_changed)
+		drv->config_changed(&vp_dev->vdev);
+	return IRQ_HANDLED;
+}
+
+/* Notify all virtqueues on an interrupt. */
+static irqreturn_t vp_vring_interrupt(int irq, void *opaque)
+{
+	struct virtio_pci_device *vp_dev = opaque;
+	struct virtio_pci_vq_info *info;
+	irqreturn_t ret = IRQ_NONE;
+	unsigned long flags;
+
+	spin_lock_irqsave(&vp_dev->lock, flags);
+	list_for_each_entry(info, &vp_dev->virtqueues, node) {
+		if (vring_interrupt(irq, info->vq) == IRQ_HANDLED)
+			ret = IRQ_HANDLED;
+	}
+	spin_unlock_irqrestore(&vp_dev->lock, flags);
+
+	return ret;
+}
+
+/* A small wrapper to also acknowledge the interrupt when it's handled.
+ * I really need an EIO hook for the vring so I can ack the interrupt once we
+ * know that we'll be handling the IRQ but before we invoke the callback since
+ * the callback may notify the host which results in the host attempting to
+ * raise an interrupt that we would then mask once we acknowledged the
+ * interrupt. */
+static irqreturn_t vp_interrupt(int irq, void *opaque)
+{
+	struct virtio_pci_device *vp_dev = opaque;
+	u8 isr;
+
+	/* reading the ISR has the effect of also clearing it so it's very
+	 * important to save off the value. */
+	isr = ioread8(vp_dev->isr);
+
+	/* It's definitely not us if the ISR was not high */
+	if (!isr)
+		return IRQ_NONE;
+
+	/* Configuration change?  Tell driver if it wants to know. */
+	if (isr & VIRTIO_PCI_ISR_CONFIG)
+		vp_config_changed(irq, opaque);
+
+	return vp_vring_interrupt(irq, opaque);
+}
+
+static void vp_free_vectors(struct virtio_device *vdev)
+{
+	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
+	int i;
+
+	if (vp_dev->intx_enabled) {
+		free_irq(vp_dev->pci_dev->irq, vp_dev);
+		vp_dev->intx_enabled = 0;
+	}
+
+	for (i = 0; i < vp_dev->msix_used_vectors; ++i)
+		free_irq(vp_dev->msix_entries[i].vector, vp_dev);
+
+	if (vp_dev->msix_enabled) {
+		/* Disable the vector used for configuration */
+		iowrite16(VIRTIO_MSI_NO_VECTOR, &vp_dev->common->msix_config);
+		/* Flush the write out to device */
+		ioread16(&vp_dev->common->msix_config);
+
+		pci_disable_msix(vp_dev->pci_dev);
+		vp_dev->msix_enabled = 0;
+		vp_dev->msix_vectors = 0;
+	}
+
+	vp_dev->msix_used_vectors = 0;
+	kfree(vp_dev->msix_names);
+	vp_dev->msix_names = NULL;
+	kfree(vp_dev->msix_entries);
+	vp_dev->msix_entries = NULL;
+}
+
+static int vp_request_msix_vectors(struct virtio_device *vdev, int nvectors,
+				   bool per_vq_vectors)
+{
+	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
+	const char *name = dev_name(&vp_dev->vdev.dev);
+	unsigned i, v;
+	int err = -ENOMEM;
+
+	vp_dev->msix_entries = kmalloc(nvectors * sizeof *vp_dev->msix_entries,
+				       GFP_KERNEL);
+	if (!vp_dev->msix_entries)
+		goto error;
+	vp_dev->msix_names = kmalloc(nvectors * sizeof *vp_dev->msix_names,
+				     GFP_KERNEL);
+	if (!vp_dev->msix_names)
+		goto error;
+
+	for (i = 0; i < nvectors; ++i)
+		vp_dev->msix_entries[i].entry = i;
+
+	/* pci_enable_msix returns positive if we can't get this many. */
+	err = pci_enable_msix(vp_dev->pci_dev, vp_dev->msix_entries, nvectors);
+	if (err > 0)
+		err = -ENOSPC;
+	if (err)
+		goto error;
+	vp_dev->msix_vectors = nvectors;
+	vp_dev->msix_enabled = 1;
+
+	/* Set the vector used for configuration */
+	v = vp_dev->msix_used_vectors;
+	snprintf(vp_dev->msix_names[v], sizeof *vp_dev->msix_names,
+		 "%s-config", name);
+	err = request_irq(vp_dev->msix_entries[v].vector,
+			  vp_config_changed, 0, vp_dev->msix_names[v],
+			  vp_dev);
+	if (err)
+		goto error;
+	++vp_dev->msix_used_vectors;
+
+	iowrite16(v, &vp_dev->common->msix_config);
+	/* Verify we had enough resources to assign the vector */
+	v = ioread16(&vp_dev->common->msix_config);
+	if (v == VIRTIO_MSI_NO_VECTOR) {
+		err = -EBUSY;
+		goto error;
+	}
+
+	if (!per_vq_vectors) {
+		/* Shared vector for all VQs */
+		v = vp_dev->msix_used_vectors;
+		snprintf(vp_dev->msix_names[v], sizeof *vp_dev->msix_names,
+			 "%s-virtqueues", name);
+		err = request_irq(vp_dev->msix_entries[v].vector,
+				  vp_vring_interrupt, 0, vp_dev->msix_names[v],
+				  vp_dev);
+		if (err)
+			goto error;
+		++vp_dev->msix_used_vectors;
+	}
+	return 0;
+error:
+	vp_free_vectors(vdev);
+	return err;
+}
+
+static int vp_request_intx(struct virtio_device *vdev)
+{
+	int err;
+	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
+
+	err = request_irq(vp_dev->pci_dev->irq, vp_interrupt,
+			  IRQF_SHARED, dev_name(&vdev->dev), vp_dev);
+	if (!err)
+		vp_dev->intx_enabled = 1;
+	return err;
+}
+
+static void *alloc_virtqueue_pages(u16 *num)
+{
+	void *pages;
+
+	/* 1024 entries uses about 32k */
+	if (*num > 1024)
+		*num = 1024;
+
+	for (; *num; *num /= 2) {
+		size_t size = PAGE_ALIGN(vring_size(*num, VIRTIO_PCI_ALIGN));
+		pages = alloc_pages_exact(size,
+					  GFP_KERNEL|__GFP_ZERO|__GFP_NOWARN);
+		if (pages)
+			return pages;
+	}
+	return NULL;
+}
+
+static struct virtqueue *setup_vq(struct virtio_device *vdev, unsigned index,
+				  void (*callback)(struct virtqueue *vq),
+				  const char *name,
+				  u16 msix_vec)
+{
+	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
+	struct virtio_pci_vq_info *info;
+	struct virtqueue *vq;
+	u16 num;
+	int err;
+
+	/* Select the queue we're interested in */
+	iowrite16(index, &vp_dev->common->queue_select);
+
+	switch (ioread64(&vp_dev->common->queue_address)) {
+	case 0xFFFFFFFFFFFFFFFFULL:
+		return ERR_PTR(-ENOENT);
+	case 0:
+		/* Uninitialized.  Excellent. */
+		break;
+	default:
+		/* We've already set this up? */
+		return ERR_PTR(-EBUSY);
+	}
+
+	/* Maximum size must be a non-zero power of 2. */
+	num = ioread16(&vp_dev->common->queue_size);
+	if (num == 0 || (num & (num - 1))) {
+		dev_warn(&vp_dev->pci_dev->dev, "bad queue size %u", num);
+		return ERR_PTR(-EINVAL);
+	}
+
+	/* allocate and fill out our structure that represents an active
+	 * queue */
+	info = kmalloc(sizeof(struct virtio_pci_vq_info), GFP_KERNEL);
+	if (!info)
+		return ERR_PTR(-ENOMEM);
+
+	info->queue_index = index;
+	info->msix_vector = msix_vec;
+
+	info->queue = alloc_virtqueue_pages(&num);
+	if (info->queue == NULL) {
+		err = -ENOMEM;
+		goto out_info;
+	}
+	info->num = num;
+
+	/* create the vring */
+	vq = vring_new_virtqueue(info->num, VIRTIO_PCI_ALIGN,
+				 vdev, info->queue, vp_notify, callback, name);
+	if (!vq) {
+		err = -ENOMEM;
+		goto out_alloc_pages;
+	}
+
+	vq->priv = info;
+	info->vq = vq;
+
+	if (msix_vec != VIRTIO_MSI_NO_VECTOR) {
+		iowrite16(msix_vec, &vp_dev->common->queue_msix_vector);
+		msix_vec = ioread16(&vp_dev->common->queue_msix_vector);
+		if (msix_vec == VIRTIO_MSI_NO_VECTOR) {
+			err = -EBUSY;
+			goto out_new_virtqueue;
+		}
+	}
+
+	if (callback) {
+		unsigned long flags;
+		spin_lock_irqsave(&vp_dev->lock, flags);
+		list_add(&info->node, &vp_dev->virtqueues);
+		spin_unlock_irqrestore(&vp_dev->lock, flags);
+	} else {
+		INIT_LIST_HEAD(&info->node);
+	}
+
+	/* Size and align first: the queue_address write activates the queue. */
+	iowrite16(num, &vp_dev->common->queue_size);
+	iowrite16(VIRTIO_PCI_ALIGN, &vp_dev->common->queue_align);
+	iowrite64(virt_to_phys(info->queue), &vp_dev->common->queue_address);
+
+	return vq;
+
+out_new_virtqueue:
+	vring_del_virtqueue(vq);
+out_alloc_pages:
+	free_pages_exact(info->queue,
+			 PAGE_ALIGN(vring_size(num, VIRTIO_PCI_ALIGN)));
+out_info:
+	kfree(info);
+	return ERR_PTR(err);
+}
+
+static void vp_del_vq(struct virtqueue *vq)
+{
+	struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev);
+	struct virtio_pci_vq_info *info = vq->priv;
+	unsigned long flags, size;
+
+	spin_lock_irqsave(&vp_dev->lock, flags);
+	list_del(&info->node);
+	spin_unlock_irqrestore(&vp_dev->lock, flags);
+
+	/* Select and deactivate the queue */
+	iowrite16(info->queue_index, &vp_dev->common->queue_select);
+
+	if (vp_dev->msix_enabled) {
+		iowrite16(VIRTIO_MSI_NO_VECTOR,
+			  &vp_dev->common->queue_msix_vector);
+		/* Flush the write out to device */
+		ioread16(&vp_dev->common->queue_msix_vector);
+	}
+
+	vring_del_virtqueue(vq);
+
+	/* This is for our own benefit, not the device's! */
+	iowrite64(0, &vp_dev->common->queue_address);
+	iowrite16(0, &vp_dev->common->queue_size);
+	iowrite16(0, &vp_dev->common->queue_align);
+
+	size = PAGE_ALIGN(vring_size(info->num, VIRTIO_PCI_ALIGN));
+	free_pages_exact(info->queue, size);
+	kfree(info);
+}
+
+/* the config->del_vqs() implementation */
+static void vp_del_vqs(struct virtio_device *vdev)
+{
+	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
+	struct virtqueue *vq, *n;
+	struct virtio_pci_vq_info *info;
+
+	list_for_each_entry_safe(vq, n, &vdev->vqs, list) {
+		info = vq->priv;
+		if (vp_dev->per_vq_vectors &&
+			info->msix_vector != VIRTIO_MSI_NO_VECTOR)
+			free_irq(vp_dev->msix_entries[info->msix_vector].vector,
+				 vq);
+		vp_del_vq(vq);
+	}
+	vp_dev->per_vq_vectors = false;
+
+	vp_free_vectors(vdev);
+}
+
+static int vp_try_to_find_vqs(struct virtio_device *vdev, unsigned nvqs,
+			      struct virtqueue *vqs[],
+			      vq_callback_t *callbacks[],
+			      const char *names[],
+			      bool use_msix,
+			      bool per_vq_vectors)
+{
+	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
+	u16 msix_vec;
+	int i, err, nvectors, allocated_vectors;
+
+	if (!use_msix) {
+		/* Old style: one normal interrupt for change and all vqs. */
+		err = vp_request_intx(vdev);
+		if (err)
+			goto error_request;
+	} else {
+		if (per_vq_vectors) {
+			/* Best option: one for change interrupt, one per vq. */
+			nvectors = 1;
+			for (i = 0; i < nvqs; ++i)
+				if (callbacks[i])
+					++nvectors;
+		} else {
+			/* Second best: one for change, shared for all vqs. */
+			nvectors = 2;
+		}
+
+		err = vp_request_msix_vectors(vdev, nvectors, per_vq_vectors);
+		if (err)
+			goto error_request;
+	}
+
+	vp_dev->per_vq_vectors = per_vq_vectors;
+	allocated_vectors = vp_dev->msix_used_vectors;
+	for (i = 0; i < nvqs; ++i) {
+		if (!callbacks[i] || !vp_dev->msix_enabled)
+			msix_vec = VIRTIO_MSI_NO_VECTOR;
+		else if (vp_dev->per_vq_vectors)
+			msix_vec = allocated_vectors++;
+		else
+			msix_vec = VP_MSIX_VQ_VECTOR;
+		vqs[i] = setup_vq(vdev, i, callbacks[i], names[i], msix_vec);
+		if (IS_ERR(vqs[i])) {
+			err = PTR_ERR(vqs[i]);
+			goto error_find;
+		}
+
+		if (!vp_dev->per_vq_vectors || msix_vec == VIRTIO_MSI_NO_VECTOR)
+			continue;
+
+		/* allocate per-vq irq if available and necessary */
+		snprintf(vp_dev->msix_names[msix_vec],
+			 sizeof *vp_dev->msix_names,
+			 "%s-%s",
+			 dev_name(&vp_dev->vdev.dev), names[i]);
+		err = request_irq(vp_dev->msix_entries[msix_vec].vector,
+				  vring_interrupt, 0,
+				  vp_dev->msix_names[msix_vec],
+				  vqs[i]);
+		if (err) {
+			vp_del_vq(vqs[i]);
+			goto error_find;
+		}
+	}
+	return 0;
+
+error_find:
+	vp_del_vqs(vdev);
+
+error_request:
+	return err;
+}
+
+/* the config->find_vqs() implementation */
+static int vp_find_vqs(struct virtio_device *vdev, unsigned nvqs,
+		       struct virtqueue *vqs[],
+		       vq_callback_t *callbacks[],
+		       const char *names[])
+{
+	int err;
+
+	/* Try MSI-X with one vector per queue. */
+	err = vp_try_to_find_vqs(vdev, nvqs, vqs, callbacks, names, true, true);
+	if (!err)
+		return 0;
+	/* Fallback: MSI-X with one vector for config, one shared for queues. */
+	err = vp_try_to_find_vqs(vdev, nvqs, vqs, callbacks, names,
+				 true, false);
+	if (!err)
+		return 0;
+	/* Finally fall back to regular interrupts. */
+	return vp_try_to_find_vqs(vdev, nvqs, vqs, callbacks, names,
+				  false, false);
+}
+
+static struct virtio_config_ops virtio_pci_config_ops = {
+	.get		= vp_get,
+	.set		= vp_set,
+	.get_status	= vp_get_status,
+	.set_status	= vp_set_status,
+	.reset		= vp_reset,
+	.find_vqs	= vp_find_vqs,
+	.del_vqs	= vp_del_vqs,
+	.get_features	= vp_get_features,
+	.finalize_features = vp_finalize_features,
+};
+
+static void virtio_pci_release_dev(struct device *_d)
+{
+	/*
+	 * No need for a release method as we allocate/free
+	 * all devices together with the pci devices.
+	 * Provide an empty one to avoid getting a warning from core.
+	 */
+}
+
+static void __iomem *map_capability(struct pci_dev *dev, int off, size_t expect)
+{
+	u8 bar;
+	u32 offset, length;
+	void __iomem *p;
+
+	pci_read_config_byte(dev, off + offsetof(struct virtio_pci_cap, bar),
+			     &bar);
+	pci_read_config_dword(dev, off + offsetof(struct virtio_pci_cap, offset),
+			     &offset);
+	pci_read_config_dword(dev, off + offsetof(struct virtio_pci_cap, length),
+			     &length);
+
+	if (length < expect) {
+		/* FIXME: I assume we want to report errors as PCI device? */
+		dev_err(&dev->dev,
+			"virtio_pci: small capability len %u (%zu expected)\n",
+			length, expect);
+		return NULL;
+	}
+
+	p = pci_iomap_range(dev, bar, offset, length, PAGE_SIZE);
+	if (!p)
+		/* FIXME: I assume we want to report errors as PCI device? */
+		dev_err(&dev->dev,
+			"virtio_pci: unable to map virtio %u@%u on bar %i\n",
+			length, offset, bar);
+	return p;
+}
+
+/* PCI probe: map the capability regions and register the virtio device. */
+static int __devinit virtio_pci_probe(struct pci_dev *pci_dev,
+				      const struct pci_device_id *id)
+{
+	struct virtio_pci_device *vp_dev;
+	int err, common, isr, notify, device;
+
+	/* We only own devices >= 0x1000 and <= 0x103f: leave the rest. */
+	if (pci_dev->device < 0x1000 || pci_dev->device > 0x103f)
+		return -ENODEV;
+
+	if (pci_dev->revision != VIRTIO_PCI_ABI_VERSION) {
+		printk(KERN_ERR "virtio_pci: expected ABI version %d, got %d\n",
+		       VIRTIO_PCI_ABI_VERSION, pci_dev->revision);
+		return -ENODEV;
+	}
+
+	/* allocate our structure and fill it out */
+	vp_dev = kzalloc(sizeof(struct virtio_pci_device), GFP_KERNEL);
+	if (vp_dev == NULL)
+		return -ENOMEM;
+
+	vp_dev->vdev.dev.parent = &pci_dev->dev;
+	vp_dev->vdev.dev.release = virtio_pci_release_dev;
+	vp_dev->vdev.config = &virtio_pci_config_ops;
+	vp_dev->pci_dev = pci_dev;
+	INIT_LIST_HEAD(&vp_dev->virtqueues);
+	spin_lock_init(&vp_dev->lock);
+
+	/* Disable MSI/MSIX to bring device to a known good state. */
+	pci_msi_off(pci_dev);
+
+	err = pci_enable_device(pci_dev);
+	if (err)
+		goto out;
+
+	/* check for a legacy bar0 device. */
+	common = virtio_pci_find_capability(pci_dev, VIRTIO_PCI_CAP_COMMON_CFG);
+	if (!common) {
+		dev_info(&pci_dev->dev,
+			 "virtio_pci: leaving for legacy driver\n");
+		err = -ENODEV;
+		goto out_enable_device;
+	}
+	isr = virtio_pci_find_capability(pci_dev, VIRTIO_PCI_CAP_ISR_CFG);
+	notify = virtio_pci_find_capability(pci_dev, VIRTIO_PCI_CAP_NOTIFY_CFG);
+	device = virtio_pci_find_capability(pci_dev, VIRTIO_PCI_CAP_DEVICE_CFG);
+	if (!isr || !notify || !device) {
+		dev_err(&pci_dev->dev,
+			"virtio_pci: missing capabilities %i/%i/%i/%i\n",
+			common, isr, notify, device);
+		err = -EINVAL;
+		goto out_enable_device;
+	}
+
+	err = pci_request_regions(pci_dev, "virtio-pci");
+	if (err)
+		goto out_enable_device;
+
+	err = -ENOMEM;	/* for any map_capability() failure below */
+	vp_dev->common = map_capability(pci_dev, common,
+					sizeof(struct virtio_pci_common_cfg));
+	if (!vp_dev->common)
+		goto out_req_regions;
+	vp_dev->isr = map_capability(pci_dev, isr, sizeof(u8));
+	if (!vp_dev->isr)
+		goto out_map_common;
+	vp_dev->notify = map_capability(pci_dev, notify, sizeof(u16));
+	if (!vp_dev->notify)
+		goto out_map_isr;
+	vp_dev->device = map_capability(pci_dev, device, 0);
+	if (!vp_dev->device)
+		goto out_map_notify;
+
+	pci_set_drvdata(pci_dev, vp_dev);
+	pci_set_master(pci_dev);
+
+	/* we use the subsystem vendor/device id as the virtio vendor/device
+	 * id.  this allows us to use the same PCI vendor/device id for all
+	 * virtio devices and to identify the particular virtio driver by
+	 * the subsystem ids */
+	vp_dev->vdev.id.vendor = pci_dev->subsystem_vendor;
+	vp_dev->vdev.id.device = pci_dev->subsystem_device;
+
+	/* finally register the virtio device */
+	err = register_virtio_device(&vp_dev->vdev);
+	if (err)
+		goto out_set_drvdata;
+
+	return 0;
+
+out_set_drvdata:
+	pci_set_drvdata(pci_dev, NULL);
+	pci_iounmap(pci_dev, vp_dev->device);
+out_map_notify:
+	pci_iounmap(pci_dev, vp_dev->notify);
+out_map_isr:
+	pci_iounmap(pci_dev, vp_dev->isr);
+out_map_common:
+	pci_iounmap(pci_dev, vp_dev->common);
+out_req_regions:
+	pci_release_regions(pci_dev);
+out_enable_device:
+	pci_disable_device(pci_dev);
+out:
+	kfree(vp_dev);
+	return err;
+}
+
+static void __devexit virtio_pci_remove(struct pci_dev *pci_dev)
+{
+	struct virtio_pci_device *vp_dev = pci_get_drvdata(pci_dev);
+
+	unregister_virtio_device(&vp_dev->vdev);
+
+	vp_del_vqs(&vp_dev->vdev);
+	pci_set_drvdata(pci_dev, NULL);
+	pci_iounmap(pci_dev, vp_dev->device);
+	pci_iounmap(pci_dev, vp_dev->notify);
+	pci_iounmap(pci_dev, vp_dev->isr);
+	pci_iounmap(pci_dev, vp_dev->common);
+	pci_release_regions(pci_dev);
+	pci_disable_device(pci_dev);
+	kfree(vp_dev);
+}
+
+#ifdef CONFIG_PM
+static int virtio_pci_suspend(struct pci_dev *pci_dev, pm_message_t state)
+{
+	pci_save_state(pci_dev);
+	pci_set_power_state(pci_dev, PCI_D3hot);
+	return 0;
+}
+
+static int virtio_pci_resume(struct pci_dev *pci_dev)
+{
+	pci_restore_state(pci_dev);
+	pci_set_power_state(pci_dev, PCI_D0);
+	return 0;
+}
+#endif
+
+static struct pci_driver virtio_pci_driver = {
+	.name		= "virtio-pci",
+	.id_table	= virtio_pci_id_table,
+	.probe		= virtio_pci_probe,
+	.remove		= __devexit_p(virtio_pci_remove),
+#ifdef CONFIG_PM
+	.suspend	= virtio_pci_suspend,
+	.resume		= virtio_pci_resume,
+#endif
+};
+
+static int __init virtio_pci_init(void)
+{
+	return pci_register_driver(&virtio_pci_driver);
+}
+
+module_init(virtio_pci_init);
+
+static void __exit virtio_pci_exit(void)
+{
+	pci_unregister_driver(&virtio_pci_driver);
+}
+
+module_exit(virtio_pci_exit);

^ permalink raw reply	[flat|nested] 106+ messages in thread

* [RFC 8/11] virtio_pci: share structure between legacy and modern.
  2011-12-08 10:22 [PATCH 0/11] RFC: PCI using capabilitities Rusty Russell
                   ` (7 preceding siblings ...)
  2011-12-08 10:39 ` [RFC 7/11] virtio_pci: new, capability-aware driver Rusty Russell
@ 2011-12-08 10:40 ` Rusty Russell
  2011-12-08 10:41 ` [RFC 9/11] virtio_pci: share interrupt/notify handlers " Rusty Russell
                   ` (4 subsequent siblings)
  13 siblings, 0 replies; 106+ messages in thread
From: Rusty Russell @ 2011-12-08 10:40 UTC (permalink / raw)
  To: virtualization; +Cc: Sasha Levin, Michael S. Tsirkin

They're almost identical: we add a "legacy" ioregion (what was
"ioaddr" in the legacy driver), and move the shared structure out to
virtio_pci-common.h.
---
 drivers/virtio/virtio_pci-common.h |   72 ++++++++++++++++++++++
 drivers/virtio/virtio_pci.c        |   64 -------------------
 drivers/virtio/virtio_pci_legacy.c |  120 +++++++++----------------------------
 3 files changed, 105 insertions(+), 151 deletions(-)

diff --git a/drivers/virtio/virtio_pci-common.h b/drivers/virtio/virtio_pci-common.h
new file mode 100644
--- /dev/null
+++ b/drivers/virtio/virtio_pci-common.h
@@ -0,0 +1,72 @@
+#include <linux/pci.h>
+#include <linux/virtio_pci.h>
+
+/* Our device structure: shared by virtio_pci and virtio_pci_legacy. */
+struct virtio_pci_device
+{
+	struct virtio_device vdev;
+	struct pci_dev *pci_dev;
+
+	/* The IO mapping for the PCI config space (non-legacy mode) */
+	struct virtio_pci_common_cfg __iomem *common;
+	/* Device-specific data (non-legacy mode). */
+	void __iomem *device;
+
+	/* In legacy mode, these two point to within ->legacy. */
+	/* Where to read and clear interrupt */ 
+	u8 __iomem *isr;
+	/* Write the virtqueue index here to notify device of activity. */
+	__le16 __iomem *notify;
+
+#ifdef CONFIG_VIRTIO_PCI_LEGACY
+	/* Instead of common, notify and device, legacy uses this: */
+	void __iomem *legacy;
+#endif
+
+	/* a list of queues so we can dispatch IRQs */
+	spinlock_t lock;
+	struct list_head virtqueues;
+
+	/* MSI-X support */
+	int msix_enabled;
+	int intx_enabled;
+	struct msix_entry *msix_entries;
+	/* Name strings for interrupts. This size should be enough,
+	 * and I'm too lazy to allocate each name separately. */
+	char (*msix_names)[256];
+	/* Number of available vectors */
+	unsigned msix_vectors;
+	/* Vectors allocated, excluding per-vq vectors if any */
+	unsigned msix_used_vectors;
+	/* Whether we have vector per vq */
+	bool per_vq_vectors;
+};
+
+/* Constants for MSI-X */
+/* Use first vector for configuration changes, second and the rest for
+ * virtqueues Thus, we need at least 2 vectors for MSI. */
+enum {
+	VP_MSIX_CONFIG_VECTOR = 0,
+	VP_MSIX_VQ_VECTOR = 1,
+};
+
+struct virtio_pci_vq_info
+{
+	/* the actual virtqueue */
+	struct virtqueue *vq;
+
+	/* the number of entries in the queue */
+	int num;
+
+	/* the index of the queue */
+	int queue_index;
+
+	/* the virtual address of the ring queue */
+	void *queue;
+
+	/* the list node for the virtqueues list */
+	struct list_head node;
+
+	/* MSI-X vector (or none) */
+	unsigned msix_vector;
+};
diff --git a/drivers/virtio/virtio_pci.c b/drivers/virtio/virtio_pci.c
--- a/drivers/virtio/virtio_pci.c
+++ b/drivers/virtio/virtio_pci.c
@@ -22,6 +22,7 @@
 #include <linux/virtio_pci.h>
 #include <linux/highmem.h>
 #include <linux/spinlock.h>
+#include "virtio_pci-common.h"
 
 MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>");
 MODULE_DESCRIPTION("virtio-pci");
@@ -31,69 +32,6 @@ MODULE_VERSION("2");
 /* Use cacheline size as a good guess at a nice alignment. */
 #define VIRTIO_PCI_ALIGN	SMP_CACHE_BYTES
 
-/* Our device structure */
-struct virtio_pci_device
-{
-	struct virtio_device vdev;
-	struct pci_dev *pci_dev;
-
-	/* The IO mapping for the PCI config space */
-	struct virtio_pci_common_cfg __iomem *common;
-	/* Where to read and clear interrupt */ 
-	u8 __iomem *isr;
-	/* Write the virtqueue index here to notify device of activity. */
-	__le16 __iomem *notify;
-	/* Device-specific data. */
-	void __iomem *device;
-
-	/* a list of queues so we can dispatch IRQs */
-	spinlock_t lock;
-	struct list_head virtqueues;
-
-	/* MSI-X support */
-	int msix_enabled;
-	int intx_enabled;
-	struct msix_entry *msix_entries;
-	/* Name strings for interrupts. This size should be enough,
-	 * and I'm too lazy to allocate each name separately. */
-	char (*msix_names)[256];
-	/* Number of available vectors */
-	unsigned msix_vectors;
-	/* Vectors allocated, excluding per-vq vectors if any */
-	unsigned msix_used_vectors;
-	/* Whether we have vector per vq */
-	bool per_vq_vectors;
-};
-
-/* Constants for MSI-X */
-/* Use first vector for configuration changes, second and the rest for
- * virtqueues Thus, we need at least 2 vectors for MSI. */
-enum {
-	VP_MSIX_CONFIG_VECTOR = 0,
-	VP_MSIX_VQ_VECTOR = 1,
-};
-
-struct virtio_pci_vq_info
-{
-	/* the actual virtqueue */
-	struct virtqueue *vq;
-
-	/* the number of entries in the queue */
-	int num;
-
-	/* the index of the queue */
-	int queue_index;
-
-	/* the virtual address of the ring queue */
-	void *queue;
-
-	/* the list node for the virtqueues list */
-	struct list_head node;
-
-	/* MSI-X vector (or none) */
-	unsigned msix_vector;
-};
-
 /* Qumranet donated their vendor ID for devices 0x1000 thru 0x10FF. */
 static struct pci_device_id virtio_pci_id_table[] = {
 	{ 0x1af4, PCI_ANY_ID, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0 },
diff --git a/drivers/virtio/virtio_pci_legacy.c b/drivers/virtio/virtio_pci_legacy.c
--- a/drivers/virtio/virtio_pci_legacy.c
+++ b/drivers/virtio/virtio_pci_legacy.c
@@ -25,6 +25,7 @@
 #include <linux/virtio_pci.h>
 #include <linux/highmem.h>
 #include <linux/spinlock.h>
+#include "virtio_pci-common.h"
 
 static bool force_nonlegacy;
 module_param(force_nonlegacy, bool, 0644);
@@ -35,63 +36,6 @@ MODULE_DESCRIPTION("virtio-pci-legacy");
 MODULE_LICENSE("GPL");
 MODULE_VERSION("1");
 
-/* Our device structure */
-struct virtio_pci_device
-{
-	struct virtio_device vdev;
-	struct pci_dev *pci_dev;
-
-	/* the IO mapping for the PCI config space */
-	void __iomem *ioaddr;
-
-	/* a list of queues so we can dispatch IRQs */
-	spinlock_t lock;
-	struct list_head virtqueues;
-
-	/* MSI-X support */
-	int msix_enabled;
-	int intx_enabled;
-	struct msix_entry *msix_entries;
-	/* Name strings for interrupts. This size should be enough,
-	 * and I'm too lazy to allocate each name separately. */
-	char (*msix_names)[256];
-	/* Number of available vectors */
-	unsigned msix_vectors;
-	/* Vectors allocated, excluding per-vq vectors if any */
-	unsigned msix_used_vectors;
-	/* Whether we have vector per vq */
-	bool per_vq_vectors;
-};
-
-/* Constants for MSI-X */
-/* Use first vector for configuration changes, second and the rest for
- * virtqueues Thus, we need at least 2 vectors for MSI. */
-enum {
-	VP_MSIX_CONFIG_VECTOR = 0,
-	VP_MSIX_VQ_VECTOR = 1,
-};
-
-struct virtio_pci_vq_info
-{
-	/* the actual virtqueue */
-	struct virtqueue *vq;
-
-	/* the number of entries in the queue */
-	int num;
-
-	/* the index of the queue */
-	int queue_index;
-
-	/* the virtual address of the ring queue */
-	void *queue;
-
-	/* the list node for the virtqueues list */
-	struct list_head node;
-
-	/* MSI-X vector (or none) */
-	unsigned msix_vector;
-};
-
 /* Qumranet donated their vendor ID for devices 0x1000 thru 0x10FF. */
 static struct pci_device_id virtio_pci_id_table[] = {
 	{ 0x1af4, PCI_ANY_ID, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0 },
@@ -112,7 +56,7 @@ static u64 vp_get_features(struct virtio
 	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
 
 	/* We only support 32 feature bits. */
-	return ioread32(vp_dev->ioaddr + VIRTIO_PCI_HOST_FEATURES);
+	return ioread32(vp_dev->legacy + VIRTIO_PCI_HOST_FEATURES);
 }
 
 /* virtio config->finalize_features() implementation */
@@ -124,7 +68,7 @@ static void vp_finalize_features(struct 
 	vring_transport_features(vdev);
 
 	/* We only support 32 feature bits. */
-	iowrite32(vdev->features, vp_dev->ioaddr+VIRTIO_PCI_GUEST_FEATURES);
+	iowrite32(vdev->features, vp_dev->legacy+VIRTIO_PCI_GUEST_FEATURES);
 }
 
 /* virtio config->get() implementation */
@@ -132,13 +76,13 @@ static void vp_get(struct virtio_device 
 		   void *buf, unsigned len)
 {
 	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
-	void __iomem *ioaddr = vp_dev->ioaddr +
+	void __iomem *legacy = vp_dev->legacy +
 				VIRTIO_PCI_CONFIG(vp_dev) + offset;
 	u8 *ptr = buf;
 	int i;
 
 	for (i = 0; i < len; i++)
-		ptr[i] = ioread8(ioaddr + i);
+		ptr[i] = ioread8(legacy + i);
 }
 
 /* the config->set() implementation.  it's symmetric to the config->get()
@@ -147,20 +91,20 @@ static void vp_set(struct virtio_device 
 		   const void *buf, unsigned len)
 {
 	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
-	void __iomem *ioaddr = vp_dev->ioaddr +
+	void __iomem *legacy = vp_dev->legacy +
 				VIRTIO_PCI_CONFIG(vp_dev) + offset;
 	const u8 *ptr = buf;
 	int i;
 
 	for (i = 0; i < len; i++)
-		iowrite8(ptr[i], ioaddr + i);
+		iowrite8(ptr[i], legacy + i);
 }
 
 /* config->{get,set}_status() implementations */
 static u8 vp_get_status(struct virtio_device *vdev)
 {
 	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
-	return ioread8(vp_dev->ioaddr + VIRTIO_PCI_STATUS);
+	return ioread8(vp_dev->legacy + VIRTIO_PCI_STATUS);
 }
 
 static void vp_set_status(struct virtio_device *vdev, u8 status)
@@ -168,7 +112,7 @@ static void vp_set_status(struct virtio_
 	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
 	/* We should never be setting status to 0. */
 	BUG_ON(status == 0);
-	iowrite8(status, vp_dev->ioaddr + VIRTIO_PCI_STATUS);
+	iowrite8(status, vp_dev->legacy + VIRTIO_PCI_STATUS);
 }
 
 /* wait for pending irq handlers */
@@ -188,10 +132,10 @@ static void vp_reset(struct virtio_devic
 {
 	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
 	/* 0 status means a reset. */
-	iowrite8(0, vp_dev->ioaddr + VIRTIO_PCI_STATUS);
+	iowrite8(0, vp_dev->legacy + VIRTIO_PCI_STATUS);
 	/* Flush out the status write, and flush in device writes,
 	 * including MSi-X interrupts, if any. */
-	ioread8(vp_dev->ioaddr + VIRTIO_PCI_STATUS);
+	ioread8(vp_dev->legacy + VIRTIO_PCI_STATUS);
 	/* Flush pending VQ/configuration callbacks. */
 	vp_synchronize_vectors(vdev);
 }
@@ -204,7 +148,7 @@ static void vp_notify(struct virtqueue *
 
 	/* we write the queue's selector into the notification register to
 	 * signal the other end */
-	iowrite16(info->queue_index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_NOTIFY);
+	iowrite16(info->queue_index, vp_dev->legacy + VIRTIO_PCI_QUEUE_NOTIFY);
 }
 
 /* Handle a configuration change: Tell driver if it wants to know. */
@@ -251,7 +195,7 @@ static irqreturn_t vp_interrupt(int irq,
 
 	/* reading the ISR has the effect of also clearing it so it's very
 	 * important to save off the value. */
-	isr = ioread8(vp_dev->ioaddr + VIRTIO_PCI_ISR);
+	isr = ioread8(vp_dev->legacy + VIRTIO_PCI_ISR);
 
 	/* It's definitely not us if the ISR was not high */
 	if (!isr)
@@ -280,9 +224,9 @@ static void vp_free_vectors(struct virti
 	if (vp_dev->msix_enabled) {
 		/* Disable the vector used for configuration */
 		iowrite16(VIRTIO_MSI_NO_VECTOR,
-			  vp_dev->ioaddr + VIRTIO_MSI_CONFIG_VECTOR);
+			  vp_dev->legacy + VIRTIO_MSI_CONFIG_VECTOR);
 		/* Flush the write out to device */
-		ioread16(vp_dev->ioaddr + VIRTIO_MSI_CONFIG_VECTOR);
+		ioread16(vp_dev->legacy + VIRTIO_MSI_CONFIG_VECTOR);
 
 		pci_disable_msix(vp_dev->pci_dev);
 		vp_dev->msix_enabled = 0;
@@ -336,9 +280,9 @@ static int vp_request_msix_vectors(struc
 		goto error;
 	++vp_dev->msix_used_vectors;
 
-	iowrite16(v, vp_dev->ioaddr + VIRTIO_MSI_CONFIG_VECTOR);
+	iowrite16(v, vp_dev->legacy + VIRTIO_MSI_CONFIG_VECTOR);
 	/* Verify we had enough resources to assign the vector */
-	v = ioread16(vp_dev->ioaddr + VIRTIO_MSI_CONFIG_VECTOR);
+	v = ioread16(vp_dev->legacy + VIRTIO_MSI_CONFIG_VECTOR);
 	if (v == VIRTIO_MSI_NO_VECTOR) {
 		err = -EBUSY;
 		goto error;
@@ -387,11 +331,11 @@ static struct virtqueue *setup_vq(struct
 	int err;
 
 	/* Select the queue we're interested in */
-	iowrite16(index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_SEL);
+	iowrite16(index, vp_dev->legacy + VIRTIO_PCI_QUEUE_SEL);
 
 	/* Check if queue is either not available or already active. */
-	num = ioread16(vp_dev->ioaddr + VIRTIO_PCI_QUEUE_NUM);
-	if (!num || ioread32(vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN))
+	num = ioread16(vp_dev->legacy + VIRTIO_PCI_QUEUE_NUM);
+	if (!num || ioread32(vp_dev->legacy + VIRTIO_PCI_QUEUE_PFN))
 		return ERR_PTR(-ENOENT);
 
 	/* allocate and fill out our structure the represents an active
@@ -413,7 +357,7 @@ static struct virtqueue *setup_vq(struct
 
 	/* activate the queue */
 	iowrite32(virt_to_phys(info->queue) >> VIRTIO_PCI_QUEUE_ADDR_SHIFT,
-		  vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN);
+		  vp_dev->legacy + VIRTIO_PCI_QUEUE_PFN);
 
 	/* create the vring */
 	vq = vring_new_virtqueue(info->num, VIRTIO_PCI_VRING_ALIGN,
@@ -427,8 +371,8 @@ static struct virtqueue *setup_vq(struct
 	info->vq = vq;
 
 	if (msix_vec != VIRTIO_MSI_NO_VECTOR) {
-		iowrite16(msix_vec, vp_dev->ioaddr + VIRTIO_MSI_QUEUE_VECTOR);
-		msix_vec = ioread16(vp_dev->ioaddr + VIRTIO_MSI_QUEUE_VECTOR);
+		iowrite16(msix_vec, vp_dev->legacy + VIRTIO_MSI_QUEUE_VECTOR);
+		msix_vec = ioread16(vp_dev->legacy + VIRTIO_MSI_QUEUE_VECTOR);
 		if (msix_vec == VIRTIO_MSI_NO_VECTOR) {
 			err = -EBUSY;
 			goto out_assign;
@@ -448,7 +392,7 @@ static struct virtqueue *setup_vq(struct
 out_assign:
 	vring_del_virtqueue(vq);
 out_activate_queue:
-	iowrite32(0, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN);
+	iowrite32(0, vp_dev->legacy + VIRTIO_PCI_QUEUE_PFN);
 	free_pages_exact(info->queue, size);
 out_info:
 	kfree(info);
@@ -465,19 +409,19 @@ static void vp_del_vq(struct virtqueue *
 	list_del(&info->node);
 	spin_unlock_irqrestore(&vp_dev->lock, flags);
 
-	iowrite16(info->queue_index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_SEL);
+	iowrite16(info->queue_index, vp_dev->legacy + VIRTIO_PCI_QUEUE_SEL);
 
 	if (vp_dev->msix_enabled) {
 		iowrite16(VIRTIO_MSI_NO_VECTOR,
-			  vp_dev->ioaddr + VIRTIO_MSI_QUEUE_VECTOR);
+			  vp_dev->legacy + VIRTIO_MSI_QUEUE_VECTOR);
 		/* Flush the write out to device */
-		ioread8(vp_dev->ioaddr + VIRTIO_PCI_ISR);
+		ioread8(vp_dev->legacy + VIRTIO_PCI_ISR);
 	}
 
 	vring_del_virtqueue(vq);
 
 	/* Select and deactivate the queue */
-	iowrite32(0, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN);
+	iowrite32(0, vp_dev->legacy + VIRTIO_PCI_QUEUE_PFN);
 
 	size = PAGE_ALIGN(vring_size(info->num, VIRTIO_PCI_VRING_ALIGN));
 	free_pages_exact(info->queue, size);
@@ -676,8 +620,8 @@ static int __devinit virtio_pci_probe(st
 	if (err)
 		goto out_enable_device;
 
-	vp_dev->ioaddr = pci_iomap(pci_dev, 0, 0);
-	if (vp_dev->ioaddr == NULL)
+	vp_dev->legacy = pci_iomap(pci_dev, 0, 0);
+	if (vp_dev->legacy == NULL)
 		goto out_req_regions;
 
 	pci_set_drvdata(pci_dev, vp_dev);
@@ -699,7 +643,7 @@ static int __devinit virtio_pci_probe(st
 
 out_set_drvdata:
 	pci_set_drvdata(pci_dev, NULL);
-	pci_iounmap(pci_dev, vp_dev->ioaddr);
+	pci_iounmap(pci_dev, vp_dev->legacy);
 out_req_regions:
 	pci_release_regions(pci_dev);
 out_enable_device:
@@ -717,7 +661,7 @@ static void __devexit virtio_pci_remove(
 
 	vp_del_vqs(&vp_dev->vdev);
 	pci_set_drvdata(pci_dev, NULL);
-	pci_iounmap(pci_dev, vp_dev->ioaddr);
+	pci_iounmap(pci_dev, vp_dev->legacy);
 	pci_release_regions(pci_dev);
 	pci_disable_device(pci_dev);
 	kfree(vp_dev);

^ permalink raw reply	[flat|nested] 106+ messages in thread

* [RFC 9/11] virtio_pci: share interrupt/notify handlers between legacy and modern.
  2011-12-08 10:22 [PATCH 0/11] RFC: PCI using capabilities Rusty Russell
                   ` (8 preceding siblings ...)
  2011-12-08 10:40 ` [RFC 8/11] virtio_pci: share structure between legacy and modern Rusty Russell
@ 2011-12-08 10:41 ` Rusty Russell
  2011-12-08 10:42 ` [RFC 10/11] virtio_pci: share virtqueue setup/teardown between modern and legacy driver Rusty Russell
                   ` (3 subsequent siblings)
  13 siblings, 0 replies; 106+ messages in thread
From: Rusty Russell @ 2011-12-08 10:41 UTC (permalink / raw)
  To: virtualization; +Cc: Sasha Levin, Michael S. Tsirkin

If we make the legacy driver set up the ->notify and ->isr pointers in
the struct virtio_pci_device structure, we can use them in common code
(the positions have changed, but the semantics haven't).
---
 drivers/virtio/Makefile            |    4 -
 drivers/virtio/virtio_pci-common.c |   81 ++++++++++++++++++++++++++++++++
 drivers/virtio/virtio_pci-common.h |   34 +++++++++++++
 drivers/virtio/virtio_pci.c        |   87 ++---------------------------------
 drivers/virtio/virtio_pci_legacy.c |   91 ++++---------------------------------
 5 files changed, 135 insertions(+), 162 deletions(-)

diff --git a/drivers/virtio/Makefile b/drivers/virtio/Makefile
--- a/drivers/virtio/Makefile
+++ b/drivers/virtio/Makefile
@@ -1,6 +1,6 @@
 obj-$(CONFIG_VIRTIO) += virtio.o
 obj-$(CONFIG_VIRTIO_RING) += virtio_ring.o
 obj-$(CONFIG_VIRTIO_MMIO) += virtio_mmio.o
-obj-$(CONFIG_VIRTIO_PCI) += virtio_pci.o
-obj-$(CONFIG_VIRTIO_PCI_LEGACY) += virtio_pci_legacy.o
+obj-$(CONFIG_VIRTIO_PCI) += virtio_pci.o virtio_pci-common.o
+obj-$(CONFIG_VIRTIO_PCI_LEGACY) += virtio_pci_legacy.o virtio_pci-common.o
 obj-$(CONFIG_VIRTIO_BALLOON) += virtio_balloon.o
diff --git a/drivers/virtio/virtio_pci-common.c b/drivers/virtio/virtio_pci-common.c
new file mode 100644
--- /dev/null
+++ b/drivers/virtio/virtio_pci-common.c
@@ -0,0 +1,81 @@
+/*
+ * Virtio PCI driver - common code for legacy and non-legacy.
+ *
+ * Copyright 2011, Rusty Russell IBM Corporation, but based on the
+ * older virtio_pci_legacy.c, which was Copyright IBM Corp. 2007.  But
+ * most of the interrupt setup code was written by Michael S. Tsirkin.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+#define VIRTIO_PCI_NO_LEGACY
+#include "virtio_pci-common.h"
+#include <linux/virtio_ring.h>
+
+/* the notify function used when creating a virt queue */
+void virtio_pci_notify(struct virtqueue *vq)
+{
+	struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev);
+	struct virtio_pci_vq_info *info = vq->priv;
+
+	/* we write the queue's selector into the notification register to
+	 * signal the other end */
+	iowrite16(info->queue_index, vp_dev->notify);
+}
+
+/* Handle a configuration change: Tell driver if it wants to know. */
+irqreturn_t virtio_pci_config_changed(int irq, void *opaque)
+{
+	struct virtio_pci_device *vp_dev = opaque;
+	struct virtio_driver *drv;
+	drv = container_of(vp_dev->vdev.dev.driver,
+			   struct virtio_driver, driver);
+
+	if (drv->config_changed)
+		drv->config_changed(&vp_dev->vdev);
+	return IRQ_HANDLED;
+}
+
+/* Notify all virtqueues on an interrupt. */
+irqreturn_t virtio_pci_vring_interrupt(int irq, void *opaque)
+{
+	struct virtio_pci_device *vp_dev = opaque;
+	struct virtio_pci_vq_info *info;
+	irqreturn_t ret = IRQ_NONE;
+	unsigned long flags;
+
+	spin_lock_irqsave(&vp_dev->lock, flags);
+	list_for_each_entry(info, &vp_dev->virtqueues, node) {
+		if (vring_interrupt(irq, info->vq) == IRQ_HANDLED)
+			ret = IRQ_HANDLED;
+	}
+	spin_unlock_irqrestore(&vp_dev->lock, flags);
+
+	return ret;
+}
+
+/* A small wrapper to also acknowledge the interrupt when it's handled.
+ * I really need an EIO hook for the vring so I can ack the interrupt once we
+ * know that we'll be handling the IRQ but before we invoke the callback since
+ * the callback may notify the host which results in the host attempting to
+ * raise an interrupt that we would then mask once we acknowledged the
+ * interrupt. */
+irqreturn_t virtio_pci_interrupt(int irq, void *opaque)
+{
+	struct virtio_pci_device *vp_dev = opaque;
+	u8 isr;
+
+	/* reading the ISR has the effect of also clearing it so it's very
+	 * important to save off the value. */
+	isr = ioread8(vp_dev->isr);
+
+	/* It's definitely not us if the ISR was not high */
+	if (!isr)
+		return IRQ_NONE;
+
+	/* Configuration change?  Tell driver if it wants to know. */
+	if (isr & VIRTIO_PCI_ISR_CONFIG)
+		virtio_pci_config_changed(irq, opaque);
+
+	return virtio_pci_vring_interrupt(irq, opaque);
+}
diff --git a/drivers/virtio/virtio_pci-common.h b/drivers/virtio/virtio_pci-common.h
--- a/drivers/virtio/virtio_pci-common.h
+++ b/drivers/virtio/virtio_pci-common.h
@@ -42,6 +42,12 @@ struct virtio_pci_device
 	bool per_vq_vectors;
 };
 
+/* Convert a generic virtio device to our structure */
+static inline struct virtio_pci_device *to_vp_device(struct virtio_device *vdev)
+{
+	return container_of(vdev, struct virtio_pci_device, vdev);
+}
+
 /* Constants for MSI-X */
 /* Use first vector for configuration changes, second and the rest for
  * virtqueues Thus, we need at least 2 vectors for MSI. */
@@ -70,3 +76,31 @@ struct virtio_pci_vq_info
 	/* MSI-X vector (or none) */
 	unsigned msix_vector;
 };
+
+/* the notify function used when creating a virt queue */
+void virtio_pci_notify(struct virtqueue *vq);
+/* Handle a configuration change: Tell driver if it wants to know. */
+irqreturn_t virtio_pci_config_changed(int irq, void *opaque);
+/* Notify all virtqueues on an interrupt. */
+irqreturn_t virtio_pci_vring_interrupt(int irq, void *opaque);
+/* Acknowledge, check for config or vq interrupt. */
+irqreturn_t virtio_pci_interrupt(int irq, void *opaque);
+
+/* Core of a config->find_vqs() implementation */
+int virtio_pci_find_vqs(struct virtio_pci_device *vp_dev,
+			__le16 __iomem *msix_config,
+			struct virtqueue *(setup_vq)(struct virtio_pci_device *,
+						     unsigned,
+						     void (*)(struct virtqueue*),
+						     const char *,
+						     u16 msix_vec),
+			void (*del_vq)(struct virtqueue *vq),
+			unsigned nvqs,
+			struct virtqueue *vqs[],
+			vq_callback_t *callbacks[],
+			const char *names[]);
+
+/* the core of a config->del_vqs() implementation */
+void virtio_pci_del_vqs(struct virtio_pci_device *vp_dev,
+			__le16 __iomem *msix_config,
+			void (*del_vq)(struct virtqueue *vq));
diff --git a/drivers/virtio/virtio_pci.c b/drivers/virtio/virtio_pci.c
--- a/drivers/virtio/virtio_pci.c
+++ b/drivers/virtio/virtio_pci.c
@@ -40,12 +40,6 @@ static struct pci_device_id virtio_pci_i
 
 MODULE_DEVICE_TABLE(pci, virtio_pci_id_table);
 
-/* Convert a generic virtio device to our structure */
-static struct virtio_pci_device *to_vp_device(struct virtio_device *vdev)
-{
-	return container_of(vdev, struct virtio_pci_device, vdev);
-}
-
 /* There is no iowrite64.  We use two 32-bit ops. */
 static void iowrite64(u64 val, const __le64 *addr)
 {
@@ -151,74 +145,6 @@ static void vp_reset(struct virtio_devic
 	vp_synchronize_vectors(vdev);
 }
 
-/* the notify function used when creating a virt queue */
-static void vp_notify(struct virtqueue *vq)
-{
-	struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev);
-	struct virtio_pci_vq_info *info = vq->priv;
-
-	/* we write the queue's selector into the notification register to
-	 * signal the other end */
-	iowrite16(info->queue_index, vp_dev->notify);
-}
-
-/* Handle a configuration change: Tell driver if it wants to know. */
-static irqreturn_t vp_config_changed(int irq, void *opaque)
-{
-	struct virtio_pci_device *vp_dev = opaque;
-	struct virtio_driver *drv;
-	drv = container_of(vp_dev->vdev.dev.driver,
-			   struct virtio_driver, driver);
-
-	if (drv->config_changed)
-		drv->config_changed(&vp_dev->vdev);
-	return IRQ_HANDLED;
-}
-
-/* Notify all virtqueues on an interrupt. */
-static irqreturn_t vp_vring_interrupt(int irq, void *opaque)
-{
-	struct virtio_pci_device *vp_dev = opaque;
-	struct virtio_pci_vq_info *info;
-	irqreturn_t ret = IRQ_NONE;
-	unsigned long flags;
-
-	spin_lock_irqsave(&vp_dev->lock, flags);
-	list_for_each_entry(info, &vp_dev->virtqueues, node) {
-		if (vring_interrupt(irq, info->vq) == IRQ_HANDLED)
-			ret = IRQ_HANDLED;
-	}
-	spin_unlock_irqrestore(&vp_dev->lock, flags);
-
-	return ret;
-}
-
-/* A small wrapper to also acknowledge the interrupt when it's handled.
- * I really need an EIO hook for the vring so I can ack the interrupt once we
- * know that we'll be handling the IRQ but before we invoke the callback since
- * the callback may notify the host which results in the host attempting to
- * raise an interrupt that we would then mask once we acknowledged the
- * interrupt. */
-static irqreturn_t vp_interrupt(int irq, void *opaque)
-{
-	struct virtio_pci_device *vp_dev = opaque;
-	u8 isr;
-
-	/* reading the ISR has the effect of also clearing it so it's very
-	 * important to save off the value. */
-	isr = ioread8(vp_dev->isr);
-
-	/* It's definitely not us if the ISR was not high */
-	if (!isr)
-		return IRQ_NONE;
-
-	/* Configuration change?  Tell driver if it wants to know. */
-	if (isr & VIRTIO_PCI_ISR_CONFIG)
-		vp_config_changed(irq, opaque);
-
-	return vp_vring_interrupt(irq, opaque);
-}
-
 static void vp_free_vectors(struct virtio_device *vdev)
 {
 	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
@@ -284,7 +210,7 @@ static int vp_request_msix_vectors(struc
 	snprintf(vp_dev->msix_names[v], sizeof *vp_dev->msix_names,
 		 "%s-config", name);
 	err = request_irq(vp_dev->msix_entries[v].vector,
-			  vp_config_changed, 0, vp_dev->msix_names[v],
+			  virtio_pci_config_changed, 0, vp_dev->msix_names[v],
 			  vp_dev);
 	if (err)
 		goto error;
@@ -304,8 +230,8 @@ static int vp_request_msix_vectors(struc
 		snprintf(vp_dev->msix_names[v], sizeof *vp_dev->msix_names,
 			 "%s-virtqueues", name);
 		err = request_irq(vp_dev->msix_entries[v].vector,
-				  vp_vring_interrupt, 0, vp_dev->msix_names[v],
-				  vp_dev);
+				  virtio_pci_vring_interrupt, 0,
+				  vp_dev->msix_names[v], vp_dev);
 		if (err)
 			goto error;
 		++vp_dev->msix_used_vectors;
@@ -321,7 +247,7 @@ static int vp_request_intx(struct virtio
 	int err;
 	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
 
-	err = request_irq(vp_dev->pci_dev->irq, vp_interrupt,
+	err = request_irq(vp_dev->pci_dev->irq, virtio_pci_interrupt,
 			  IRQF_SHARED, dev_name(&vdev->dev), vp_dev);
 	if (!err)
 		vp_dev->intx_enabled = 1;
@@ -396,7 +322,8 @@ static struct virtqueue *setup_vq(struct
 
 	/* create the vring */
 	vq = vring_new_virtqueue(info->num, VIRTIO_PCI_ALIGN,
-				 vdev, info->queue, vp_notify, callback, name);
+				 vdev, info->queue, virtio_pci_notify,
+				 callback, name);
 	if (!vq) {
 		err = -ENOMEM;
 		goto out_alloc_pages;
diff --git a/drivers/virtio/virtio_pci_legacy.c b/drivers/virtio/virtio_pci_legacy.c
--- a/drivers/virtio/virtio_pci_legacy.c
+++ b/drivers/virtio/virtio_pci_legacy.c
@@ -44,12 +44,6 @@ static struct pci_device_id virtio_pci_i
 
 MODULE_DEVICE_TABLE(pci, virtio_pci_id_table);
 
-/* Convert a generic virtio device to our structure */
-static struct virtio_pci_device *to_vp_device(struct virtio_device *vdev)
-{
-	return container_of(vdev, struct virtio_pci_device, vdev);
-}
-
 /* virtio config->get_features() implementation */
 static u64 vp_get_features(struct virtio_device *vdev)
 {
@@ -140,74 +134,6 @@ static void vp_reset(struct virtio_devic
 	vp_synchronize_vectors(vdev);
 }
 
-/* the notify function used when creating a virt queue */
-static void vp_notify(struct virtqueue *vq)
-{
-	struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev);
-	struct virtio_pci_vq_info *info = vq->priv;
-
-	/* we write the queue's selector into the notification register to
-	 * signal the other end */
-	iowrite16(info->queue_index, vp_dev->legacy + VIRTIO_PCI_QUEUE_NOTIFY);
-}
-
-/* Handle a configuration change: Tell driver if it wants to know. */
-static irqreturn_t vp_config_changed(int irq, void *opaque)
-{
-	struct virtio_pci_device *vp_dev = opaque;
-	struct virtio_driver *drv;
-	drv = container_of(vp_dev->vdev.dev.driver,
-			   struct virtio_driver, driver);
-
-	if (drv && drv->config_changed)
-		drv->config_changed(&vp_dev->vdev);
-	return IRQ_HANDLED;
-}
-
-/* Notify all virtqueues on an interrupt. */
-static irqreturn_t vp_vring_interrupt(int irq, void *opaque)
-{
-	struct virtio_pci_device *vp_dev = opaque;
-	struct virtio_pci_vq_info *info;
-	irqreturn_t ret = IRQ_NONE;
-	unsigned long flags;
-
-	spin_lock_irqsave(&vp_dev->lock, flags);
-	list_for_each_entry(info, &vp_dev->virtqueues, node) {
-		if (vring_interrupt(irq, info->vq) == IRQ_HANDLED)
-			ret = IRQ_HANDLED;
-	}
-	spin_unlock_irqrestore(&vp_dev->lock, flags);
-
-	return ret;
-}
-
-/* A small wrapper to also acknowledge the interrupt when it's handled.
- * I really need an EIO hook for the vring so I can ack the interrupt once we
- * know that we'll be handling the IRQ but before we invoke the callback since
- * the callback may notify the host which results in the host attempting to
- * raise an interrupt that we would then mask once we acknowledged the
- * interrupt. */
-static irqreturn_t vp_interrupt(int irq, void *opaque)
-{
-	struct virtio_pci_device *vp_dev = opaque;
-	u8 isr;
-
-	/* reading the ISR has the effect of also clearing it so it's very
-	 * important to save off the value. */
-	isr = ioread8(vp_dev->legacy + VIRTIO_PCI_ISR);
-
-	/* It's definitely not us if the ISR was not high */
-	if (!isr)
-		return IRQ_NONE;
-
-	/* Configuration change?  Tell driver if it wants to know. */
-	if (isr & VIRTIO_PCI_ISR_CONFIG)
-		vp_config_changed(irq, opaque);
-
-	return vp_vring_interrupt(irq, opaque);
-}
-
 static void vp_free_vectors(struct virtio_device *vdev)
 {
 	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
@@ -274,7 +200,7 @@ static int vp_request_msix_vectors(struc
 	snprintf(vp_dev->msix_names[v], sizeof *vp_dev->msix_names,
 		 "%s-config", name);
 	err = request_irq(vp_dev->msix_entries[v].vector,
-			  vp_config_changed, 0, vp_dev->msix_names[v],
+			  virtio_pci_config_changed, 0, vp_dev->msix_names[v],
 			  vp_dev);
 	if (err)
 		goto error;
@@ -294,8 +220,8 @@ static int vp_request_msix_vectors(struc
 		snprintf(vp_dev->msix_names[v], sizeof *vp_dev->msix_names,
 			 "%s-virtqueues", name);
 		err = request_irq(vp_dev->msix_entries[v].vector,
-				  vp_vring_interrupt, 0, vp_dev->msix_names[v],
-				  vp_dev);
+				  virtio_pci_vring_interrupt, 0,
+				  vp_dev->msix_names[v], vp_dev);
 		if (err)
 			goto error;
 		++vp_dev->msix_used_vectors;
@@ -311,7 +237,7 @@ static int vp_request_intx(struct virtio
 	int err;
 	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
 
-	err = request_irq(vp_dev->pci_dev->irq, vp_interrupt,
+	err = request_irq(vp_dev->pci_dev->irq, virtio_pci_interrupt,
 			  IRQF_SHARED, dev_name(&vdev->dev), vp_dev);
 	if (!err)
 		vp_dev->intx_enabled = 1;
@@ -361,7 +287,8 @@ static struct virtqueue *setup_vq(struct
 
 	/* create the vring */
 	vq = vring_new_virtqueue(info->num, VIRTIO_PCI_VRING_ALIGN,
-				 vdev, info->queue, vp_notify, callback, name);
+				 vdev, info->queue, virtio_pci_notify,
+				 callback, name);
 	if (!vq) {
 		err = -ENOMEM;
 		goto out_activate_queue;
@@ -624,6 +551,10 @@ static int __devinit virtio_pci_probe(st
 	if (vp_dev->legacy == NULL)
 		goto out_req_regions;
 
+	/* Setting this lets us share interrupt handlers with virtio_pci */
+	vp_dev->isr = vp_dev->legacy + VIRTIO_PCI_LEGACY_ISR;
+	vp_dev->notify = vp_dev->legacy + VIRTIO_PCI_LEGACY_QUEUE_NOTIFY;
+
 	pci_set_drvdata(pci_dev, vp_dev);
 	pci_set_master(pci_dev);

^ permalink raw reply	[flat|nested] 106+ messages in thread

* [RFC 10/11] virtio_pci: share virtqueue setup/teardown between modern and legacy driver.
  2011-12-08 10:22 [PATCH 0/11] RFC: PCI using capabilities Rusty Russell
                   ` (9 preceding siblings ...)
  2011-12-08 10:41 ` [RFC 9/11] virtio_pci: share interrupt/notify handlers " Rusty Russell
@ 2011-12-08 10:42 ` Rusty Russell
  2011-12-08 10:44 ` [RFC 11/11] virtio_pci: simplify common helpers Rusty Russell
                   ` (2 subsequent siblings)
  13 siblings, 0 replies; 106+ messages in thread
From: Rusty Russell @ 2011-12-08 10:42 UTC (permalink / raw)
  To: virtualization; +Cc: Sasha Levin, Michael S. Tsirkin

There's a great deal of work in setting up and disabling interrupts,
particularly with MSI-X, which is generic.  So we move most of the
work out to helpers which take the location of the msix_config
register, and setup_vq and del_vq functions.
---
 drivers/virtio/virtio_pci-common.c |  249 +++++++++++++++++++++++++++++++++++++
 drivers/virtio/virtio_pci-common.h |   19 ++
 drivers/virtio/virtio_pci.c        |  224 +--------------------------------
 drivers/virtio/virtio_pci_legacy.c |  229 ++--------------------------------
 4 files changed, 291 insertions(+), 430 deletions(-)

diff --git a/drivers/virtio/virtio_pci-common.c b/drivers/virtio/virtio_pci-common.c
--- a/drivers/virtio/virtio_pci-common.c
+++ b/drivers/virtio/virtio_pci-common.c
@@ -11,6 +11,7 @@
 #define VIRTIO_PCI_NO_LEGACY
 #include "virtio_pci-common.h"
 #include <linux/virtio_ring.h>
+#include <linux/interrupt.h>
 
 /* the notify function used when creating a virt queue */
 void virtio_pci_notify(struct virtqueue *vq)
@@ -79,3 +80,251 @@ irqreturn_t virtio_pci_interrupt(int irq
 
 	return virtio_pci_vring_interrupt(irq, opaque);
 }
+
+static void vp_free_vectors(struct virtio_device *vdev,
+			    __le16 __iomem *msix_config)
+{
+	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
+	int i;
+
+	if (vp_dev->intx_enabled) {
+		free_irq(vp_dev->pci_dev->irq, vp_dev);
+		vp_dev->intx_enabled = 0;
+	}
+
+	for (i = 0; i < vp_dev->msix_used_vectors; ++i)
+		free_irq(vp_dev->msix_entries[i].vector, vp_dev);
+
+	if (vp_dev->msix_enabled) {
+		/* Disable the vector used for configuration */
+		iowrite16(VIRTIO_MSI_NO_VECTOR, msix_config);
+		/* Flush the write out to device */
+		ioread16(msix_config);
+
+		pci_disable_msix(vp_dev->pci_dev);
+		vp_dev->msix_enabled = 0;
+		vp_dev->msix_vectors = 0;
+	}
+
+	vp_dev->msix_used_vectors = 0;
+	kfree(vp_dev->msix_names);
+	vp_dev->msix_names = NULL;
+	kfree(vp_dev->msix_entries);
+	vp_dev->msix_entries = NULL;
+}
+
+static int vp_request_msix_vectors(struct virtio_device *vdev,
+				   __le16 __iomem *msix_config,
+				   int nvectors, bool per_vq_vectors)
+{
+	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
+	const char *name = dev_name(&vp_dev->vdev.dev);
+	unsigned i, v;
+	int err = -ENOMEM;
+
+	vp_dev->msix_entries = kmalloc(nvectors * sizeof *vp_dev->msix_entries,
+				       GFP_KERNEL);
+	if (!vp_dev->msix_entries)
+		goto error;
+	vp_dev->msix_names = kmalloc(nvectors * sizeof *vp_dev->msix_names,
+				     GFP_KERNEL);
+	if (!vp_dev->msix_names)
+		goto error;
+
+	for (i = 0; i < nvectors; ++i)
+		vp_dev->msix_entries[i].entry = i;
+
+	/* pci_enable_msix returns positive if we can't get this many. */
+	err = pci_enable_msix(vp_dev->pci_dev, vp_dev->msix_entries, nvectors);
+	if (err > 0)
+		err = -ENOSPC;
+	if (err)
+		goto error;
+	vp_dev->msix_vectors = nvectors;
+	vp_dev->msix_enabled = 1;
+
+	/* Set the vector used for configuration */
+	v = vp_dev->msix_used_vectors;
+	snprintf(vp_dev->msix_names[v], sizeof *vp_dev->msix_names,
+		 "%s-config", name);
+	err = request_irq(vp_dev->msix_entries[v].vector,
+			  virtio_pci_config_changed, 0, vp_dev->msix_names[v],
+			  vp_dev);
+	if (err)
+		goto error;
+	++vp_dev->msix_used_vectors;
+
+	iowrite16(v, msix_config);
+	/* Verify we had enough resources to assign the vector */
+	v = ioread16(msix_config);
+	if (v == VIRTIO_MSI_NO_VECTOR) {
+		err = -EBUSY;
+		goto error;
+	}
+
+	if (!per_vq_vectors) {
+		/* Shared vector for all VQs */
+		v = vp_dev->msix_used_vectors;
+		snprintf(vp_dev->msix_names[v], sizeof *vp_dev->msix_names,
+			 "%s-virtqueues", name);
+		err = request_irq(vp_dev->msix_entries[v].vector,
+				  virtio_pci_vring_interrupt, 0,
+				  vp_dev->msix_names[v], vp_dev);
+		if (err)
+			goto error;
+		++vp_dev->msix_used_vectors;
+	}
+	return 0;
+error:
+	vp_free_vectors(vdev, msix_config);
+	return err;
+}
+
+static int vp_request_intx(struct virtio_device *vdev)
+{
+	int err;
+	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
+
+	err = request_irq(vp_dev->pci_dev->irq, virtio_pci_interrupt,
+			  IRQF_SHARED, dev_name(&vdev->dev), vp_dev);
+	if (!err)
+		vp_dev->intx_enabled = 1;
+	return err;
+}
+
+static int vp_try_to_find_vqs(struct virtio_device *vdev,
+			      __le16 __iomem *msix_config,
+			      struct virtqueue *(setup_vq)(struct virtio_device*,
+							   unsigned,
+							   void (*)(struct
+								    virtqueue *),
+							   const char *,
+							   u16 msix_vec),
+			      void (*del_vq)(struct virtqueue *vq),
+			      unsigned nvqs,
+			      struct virtqueue *vqs[],
+			      vq_callback_t *callbacks[],
+			      const char *names[],
+			      bool use_msix,
+			      bool per_vq_vectors)
+{
+	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
+	u16 msix_vec;
+	int i, err, nvectors, allocated_vectors;
+
+	if (!use_msix) {
+		/* Old style: one normal interrupt for change and all vqs. */
+		err = vp_request_intx(vdev);
+		if (err)
+			goto error_request;
+	} else {
+		if (per_vq_vectors) {
+			/* Best option: one for change interrupt, one per vq. */
+			nvectors = 1;
+			for (i = 0; i < nvqs; ++i)
+				if (callbacks[i])
+					++nvectors;
+		} else {
+			/* Second best: one for change, shared for all vqs. */
+			nvectors = 2;
+		}
+
+		err = vp_request_msix_vectors(vdev, msix_config,
+					      nvectors, per_vq_vectors);
+		if (err)
+			goto error_request;
+	}
+
+	vp_dev->per_vq_vectors = per_vq_vectors;
+	allocated_vectors = vp_dev->msix_used_vectors;
+	for (i = 0; i < nvqs; ++i) {
+		if (!callbacks[i] || !vp_dev->msix_enabled)
+			msix_vec = VIRTIO_MSI_NO_VECTOR;
+		else if (vp_dev->per_vq_vectors)
+			msix_vec = allocated_vectors++;
+		else
+			msix_vec = VP_MSIX_VQ_VECTOR;
+		vqs[i] = setup_vq(vdev, i, callbacks[i], names[i], msix_vec);
+		if (IS_ERR(vqs[i])) {
+			err = PTR_ERR(vqs[i]);
+			goto error_find;
+		}
+
+		if (!vp_dev->per_vq_vectors || msix_vec == VIRTIO_MSI_NO_VECTOR)
+			continue;
+
+		/* allocate per-vq irq if available and necessary */
+		snprintf(vp_dev->msix_names[msix_vec],
+			 sizeof *vp_dev->msix_names,
+			 "%s-%s",
+			 dev_name(&vp_dev->vdev.dev), names[i]);
+		err = request_irq(vp_dev->msix_entries[msix_vec].vector,
+				  vring_interrupt, 0,
+				  vp_dev->msix_names[msix_vec],
+				  vqs[i]);
+		if (err) {
+			del_vq(vqs[i]);
+			goto error_find;
+		}
+	}
+	return 0;
+
+error_find:
+	virtio_pci_del_vqs(vdev, msix_config, del_vq);
+
+error_request:
+	return err;
+}
+
+/* the config->find_vqs() implementation */
+int virtio_pci_find_vqs(struct virtio_device *vdev,
+			__le16 __iomem *msix_config,
+			struct virtqueue *(setup_vq)(struct virtio_device *,
+						     unsigned,
+						     void (*)(struct virtqueue*),
+						     const char *,
+						     u16 msix_vec),
+			void (*del_vq)(struct virtqueue *vq),
+			unsigned nvqs,
+			struct virtqueue *vqs[],
+			vq_callback_t *callbacks[],
+			const char *names[])
+{
+	int err;
+
+	/* Try MSI-X with one vector per queue. */
+	err = vp_try_to_find_vqs(vdev, msix_config, setup_vq, del_vq,
+				 nvqs, vqs, callbacks, names, true, true);
+	if (!err)
+		return 0;
+	/* Fallback: MSI-X with one vector for config, one shared for queues. */
+	err = vp_try_to_find_vqs(vdev, msix_config, setup_vq, del_vq,
+				 nvqs, vqs, callbacks, names, true, false);
+	if (!err)
+		return 0;
+	/* Finally fall back to regular interrupts. */
+	return vp_try_to_find_vqs(vdev, msix_config, setup_vq, del_vq,
+				  nvqs, vqs, callbacks, names, false, false);
+}
+
+/* the core of a config->del_vqs() implementation */
+void virtio_pci_del_vqs(struct virtio_device *vdev,
+			__le16 __iomem *msix_config,
+			void (*del_vq)(struct virtqueue *vq))
+{
+	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
+	struct virtqueue *vq, *n;
+	struct virtio_pci_vq_info *info;
+
+	list_for_each_entry_safe(vq, n, &vdev->vqs, list) {
+		info = vq->priv;
+		if (vp_dev->per_vq_vectors &&
+			info->msix_vector != VIRTIO_MSI_NO_VECTOR)
+			free_irq(vp_dev->msix_entries[info->msix_vector].vector,
+				 vq);
+		del_vq(vq);
+	}
+	vp_dev->per_vq_vectors = false;
+
+	vp_free_vectors(vdev, msix_config);
+}
diff --git a/drivers/virtio/virtio_pci-common.h b/drivers/virtio/virtio_pci-common.h
--- a/drivers/virtio/virtio_pci-common.h
+++ b/drivers/virtio/virtio_pci-common.h
@@ -87,6 +87,25 @@ irqreturn_t virtio_pci_vring_interrupt(i
 irqreturn_t virtio_pci_interrupt(int irq, void *opaque);
 
 /* Core of a config->find_vqs() implementation */
+int virtio_pci_find_vqs(struct virtio_device *vdev,
+			__le16 __iomem *msix_config,
+			struct virtqueue *(setup_vq)(struct virtio_device *,
+						     unsigned,
+						     void (*)(struct virtqueue*),
+						     const char *,
+						     u16 msix_vec),
+			void (*del_vq)(struct virtqueue *vq),
+			unsigned nvqs,
+			struct virtqueue *vqs[],
+			vq_callback_t *callbacks[],
+			const char *names[]);
+
+/* the core of a config->del_vqs() implementation */
+void virtio_pci_del_vqs(struct virtio_device *vdev,
+			__le16 __iomem *msix_config,
+			void (*del_vq)(struct virtqueue *vq));
+
+/* Core of a config->find_vqs() implementation */
 int virtio_pci_find_vqs(struct virtio_pci_device *vp_dev,
 			__le16 __iomem *msix_config,
 			struct virtqueue *(setup_vq)(struct virtio_pci_device *,
diff --git a/drivers/virtio/virtio_pci.c b/drivers/virtio/virtio_pci.c
--- a/drivers/virtio/virtio_pci.c
+++ b/drivers/virtio/virtio_pci.c
@@ -145,115 +145,6 @@ static void vp_reset(struct virtio_devic
 	vp_synchronize_vectors(vdev);
 }
 
-static void vp_free_vectors(struct virtio_device *vdev)
-{
-	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
-	int i;
-
-	if (vp_dev->intx_enabled) {
-		free_irq(vp_dev->pci_dev->irq, vp_dev);
-		vp_dev->intx_enabled = 0;
-	}
-
-	for (i = 0; i < vp_dev->msix_used_vectors; ++i)
-		free_irq(vp_dev->msix_entries[i].vector, vp_dev);
-
-	if (vp_dev->msix_enabled) {
-		/* Disable the vector used for configuration */
-		iowrite16(VIRTIO_MSI_NO_VECTOR, &vp_dev->common->msix_config);
-		/* Flush the write out to device */
-		ioread16(&vp_dev->common->msix_config);
-
-		pci_disable_msix(vp_dev->pci_dev);
-		vp_dev->msix_enabled = 0;
-		vp_dev->msix_vectors = 0;
-	}
-
-	vp_dev->msix_used_vectors = 0;
-	kfree(vp_dev->msix_names);
-	vp_dev->msix_names = NULL;
-	kfree(vp_dev->msix_entries);
-	vp_dev->msix_entries = NULL;
-}
-
-static int vp_request_msix_vectors(struct virtio_device *vdev, int nvectors,
-				   bool per_vq_vectors)
-{
-	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
-	const char *name = dev_name(&vp_dev->vdev.dev);
-	unsigned i, v;
-	int err = -ENOMEM;
-
-	vp_dev->msix_entries = kmalloc(nvectors * sizeof *vp_dev->msix_entries,
-				       GFP_KERNEL);
-	if (!vp_dev->msix_entries)
-		goto error;
-	vp_dev->msix_names = kmalloc(nvectors * sizeof *vp_dev->msix_names,
-				     GFP_KERNEL);
-	if (!vp_dev->msix_names)
-		goto error;
-
-	for (i = 0; i < nvectors; ++i)
-		vp_dev->msix_entries[i].entry = i;
-
-	/* pci_enable_msix returns positive if we can't get this many. */
-	err = pci_enable_msix(vp_dev->pci_dev, vp_dev->msix_entries, nvectors);
-	if (err > 0)
-		err = -ENOSPC;
-	if (err)
-		goto error;
-	vp_dev->msix_vectors = nvectors;
-	vp_dev->msix_enabled = 1;
-
-	/* Set the vector used for configuration */
-	v = vp_dev->msix_used_vectors;
-	snprintf(vp_dev->msix_names[v], sizeof *vp_dev->msix_names,
-		 "%s-config", name);
-	err = request_irq(vp_dev->msix_entries[v].vector,
-			  virtio_pci_config_changed, 0, vp_dev->msix_names[v],
-			  vp_dev);
-	if (err)
-		goto error;
-	++vp_dev->msix_used_vectors;
-
-	iowrite16(v, &vp_dev->common->msix_config);
-	/* Verify we had enough resources to assign the vector */
-	v = ioread16(&vp_dev->common->msix_config);
-	if (v == VIRTIO_MSI_NO_VECTOR) {
-		err = -EBUSY;
-		goto error;
-	}
-
-	if (!per_vq_vectors) {
-		/* Shared vector for all VQs */
-		v = vp_dev->msix_used_vectors;
-		snprintf(vp_dev->msix_names[v], sizeof *vp_dev->msix_names,
-			 "%s-virtqueues", name);
-		err = request_irq(vp_dev->msix_entries[v].vector,
-				  virtio_pci_vring_interrupt, 0,
-				  vp_dev->msix_names[v], vp_dev);
-		if (err)
-			goto error;
-		++vp_dev->msix_used_vectors;
-	}
-	return 0;
-error:
-	vp_free_vectors(vdev);
-	return err;
-}
-
-static int vp_request_intx(struct virtio_device *vdev)
-{
-	int err;
-	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
-
-	err = request_irq(vp_dev->pci_dev->irq, virtio_pci_interrupt,
-			  IRQF_SHARED, dev_name(&vdev->dev), vp_dev);
-	if (!err)
-		vp_dev->intx_enabled = 1;
-	return err;
-}
-
 static void *alloc_virtqueue_pages(u16 *num)
 {
 	void *pages;
@@ -403,116 +294,19 @@ static void vp_del_vq(struct virtqueue *
 static void vp_del_vqs(struct virtio_device *vdev)
 {
 	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
-	struct virtqueue *vq, *n;
-	struct virtio_pci_vq_info *info;
-
-	list_for_each_entry_safe(vq, n, &vdev->vqs, list) {
-		info = vq->priv;
-		if (vp_dev->per_vq_vectors &&
-			info->msix_vector != VIRTIO_MSI_NO_VECTOR)
-			free_irq(vp_dev->msix_entries[info->msix_vector].vector,
-				 vq);
-		vp_del_vq(vq);
-	}
-	vp_dev->per_vq_vectors = false;
-
-	vp_free_vectors(vdev);
+	virtio_pci_del_vqs(vdev, &vp_dev->common->msix_config, vp_del_vq);
 }
 
-static int vp_try_to_find_vqs(struct virtio_device *vdev, unsigned nvqs,
-			      struct virtqueue *vqs[],
-			      vq_callback_t *callbacks[],
-			      const char *names[],
-			      bool use_msix,
-			      bool per_vq_vectors)
+static int vp_find_vqs(struct virtio_device *vdev,
+			unsigned nvqs,
+			struct virtqueue *vqs[],
+			vq_callback_t *callbacks[],
+			const char *names[])
 {
 	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
-	u16 msix_vec;
-	int i, err, nvectors, allocated_vectors;
-
-	if (!use_msix) {
-		/* Old style: one normal interrupt for change and all vqs. */
-		err = vp_request_intx(vdev);
-		if (err)
-			goto error_request;
-	} else {
-		if (per_vq_vectors) {
-			/* Best option: one for change interrupt, one per vq. */
-			nvectors = 1;
-			for (i = 0; i < nvqs; ++i)
-				if (callbacks[i])
-					++nvectors;
-		} else {
-			/* Second best: one for change, shared for all vqs. */
-			nvectors = 2;
-		}
-
-		err = vp_request_msix_vectors(vdev, nvectors, per_vq_vectors);
-		if (err)
-			goto error_request;
-	}
-
-	vp_dev->per_vq_vectors = per_vq_vectors;
-	allocated_vectors = vp_dev->msix_used_vectors;
-	for (i = 0; i < nvqs; ++i) {
-		if (!callbacks[i] || !vp_dev->msix_enabled)
-			msix_vec = VIRTIO_MSI_NO_VECTOR;
-		else if (vp_dev->per_vq_vectors)
-			msix_vec = allocated_vectors++;
-		else
-			msix_vec = VP_MSIX_VQ_VECTOR;
-		vqs[i] = setup_vq(vdev, i, callbacks[i], names[i], msix_vec);
-		if (IS_ERR(vqs[i])) {
-			err = PTR_ERR(vqs[i]);
-			goto error_find;
-		}
-
-		if (!vp_dev->per_vq_vectors || msix_vec == VIRTIO_MSI_NO_VECTOR)
-			continue;
-
-		/* allocate per-vq irq if available and necessary */
-		snprintf(vp_dev->msix_names[msix_vec],
-			 sizeof *vp_dev->msix_names,
-			 "%s-%s",
-			 dev_name(&vp_dev->vdev.dev), names[i]);
-		err = request_irq(vp_dev->msix_entries[msix_vec].vector,
-				  vring_interrupt, 0,
-				  vp_dev->msix_names[msix_vec],
-				  vqs[i]);
-		if (err) {
-			vp_del_vq(vqs[i]);
-			goto error_find;
-		}
-	}
-	return 0;
-
-error_find:
-	vp_del_vqs(vdev);
-
-error_request:
-	return err;
-}
-
-/* the config->find_vqs() implementation */
-static int vp_find_vqs(struct virtio_device *vdev, unsigned nvqs,
-		       struct virtqueue *vqs[],
-		       vq_callback_t *callbacks[],
-		       const char *names[])
-{
-	int err;
-
-	/* Try MSI-X with one vector per queue. */
-	err = vp_try_to_find_vqs(vdev, nvqs, vqs, callbacks, names, true, true);
-	if (!err)
-		return 0;
-	/* Fallback: MSI-X with one vector for config, one shared for queues. */
-	err = vp_try_to_find_vqs(vdev, nvqs, vqs, callbacks, names,
-				 true, false);
-	if (!err)
-		return 0;
-	/* Finally fall back to regular interrupts. */
-	return vp_try_to_find_vqs(vdev, nvqs, vqs, callbacks, names,
-				  false, false);
+	return virtio_pci_find_vqs(vdev, &vp_dev->common->msix_config,
+				   setup_vq, vp_del_vq,
+				   nvqs, vqs, callbacks, names);
 }
 
 static struct virtio_config_ops virtio_pci_config_ops = {
diff --git a/drivers/virtio/virtio_pci_legacy.c b/drivers/virtio/virtio_pci_legacy.c
--- a/drivers/virtio/virtio_pci_legacy.c
+++ b/drivers/virtio/virtio_pci_legacy.c
@@ -134,120 +134,11 @@ static void vp_reset(struct virtio_devic
 	vp_synchronize_vectors(vdev);
 }
 
-static void vp_free_vectors(struct virtio_device *vdev)
-{
-	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
-	int i;
-
-	if (vp_dev->intx_enabled) {
-		free_irq(vp_dev->pci_dev->irq, vp_dev);
-		vp_dev->intx_enabled = 0;
-	}
-
-	for (i = 0; i < vp_dev->msix_used_vectors; ++i)
-		free_irq(vp_dev->msix_entries[i].vector, vp_dev);
-
-	if (vp_dev->msix_enabled) {
-		/* Disable the vector used for configuration */
-		iowrite16(VIRTIO_MSI_NO_VECTOR,
-			  vp_dev->legacy + VIRTIO_MSI_CONFIG_VECTOR);
-		/* Flush the write out to device */
-		ioread16(vp_dev->legacy + VIRTIO_MSI_CONFIG_VECTOR);
-
-		pci_disable_msix(vp_dev->pci_dev);
-		vp_dev->msix_enabled = 0;
-		vp_dev->msix_vectors = 0;
-	}
-
-	vp_dev->msix_used_vectors = 0;
-	kfree(vp_dev->msix_names);
-	vp_dev->msix_names = NULL;
-	kfree(vp_dev->msix_entries);
-	vp_dev->msix_entries = NULL;
-}
-
-static int vp_request_msix_vectors(struct virtio_device *vdev, int nvectors,
-				   bool per_vq_vectors)
-{
-	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
-	const char *name = dev_name(&vp_dev->vdev.dev);
-	unsigned i, v;
-	int err = -ENOMEM;
-
-	vp_dev->msix_entries = kmalloc(nvectors * sizeof *vp_dev->msix_entries,
-				       GFP_KERNEL);
-	if (!vp_dev->msix_entries)
-		goto error;
-	vp_dev->msix_names = kmalloc(nvectors * sizeof *vp_dev->msix_names,
-				     GFP_KERNEL);
-	if (!vp_dev->msix_names)
-		goto error;
-
-	for (i = 0; i < nvectors; ++i)
-		vp_dev->msix_entries[i].entry = i;
-
-	/* pci_enable_msix returns positive if we can't get this many. */
-	err = pci_enable_msix(vp_dev->pci_dev, vp_dev->msix_entries, nvectors);
-	if (err > 0)
-		err = -ENOSPC;
-	if (err)
-		goto error;
-	vp_dev->msix_vectors = nvectors;
-	vp_dev->msix_enabled = 1;
-
-	/* Set the vector used for configuration */
-	v = vp_dev->msix_used_vectors;
-	snprintf(vp_dev->msix_names[v], sizeof *vp_dev->msix_names,
-		 "%s-config", name);
-	err = request_irq(vp_dev->msix_entries[v].vector,
-			  virtio_pci_config_changed, 0, vp_dev->msix_names[v],
-			  vp_dev);
-	if (err)
-		goto error;
-	++vp_dev->msix_used_vectors;
-
-	iowrite16(v, vp_dev->legacy + VIRTIO_MSI_CONFIG_VECTOR);
-	/* Verify we had enough resources to assign the vector */
-	v = ioread16(vp_dev->legacy + VIRTIO_MSI_CONFIG_VECTOR);
-	if (v == VIRTIO_MSI_NO_VECTOR) {
-		err = -EBUSY;
-		goto error;
-	}
-
-	if (!per_vq_vectors) {
-		/* Shared vector for all VQs */
-		v = vp_dev->msix_used_vectors;
-		snprintf(vp_dev->msix_names[v], sizeof *vp_dev->msix_names,
-			 "%s-virtqueues", name);
-		err = request_irq(vp_dev->msix_entries[v].vector,
-				  virtio_pci_vring_interrupt, 0,
-				  vp_dev->msix_names[v], vp_dev);
-		if (err)
-			goto error;
-		++vp_dev->msix_used_vectors;
-	}
-	return 0;
-error:
-	vp_free_vectors(vdev);
-	return err;
-}
-
-static int vp_request_intx(struct virtio_device *vdev)
-{
-	int err;
-	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
-
-	err = request_irq(vp_dev->pci_dev->irq, virtio_pci_interrupt,
-			  IRQF_SHARED, dev_name(&vdev->dev), vp_dev);
-	if (!err)
-		vp_dev->intx_enabled = 1;
-	return err;
-}
-
-static struct virtqueue *setup_vq(struct virtio_device *vdev, unsigned index,
-				  void (*callback)(struct virtqueue *vq),
-				  const char *name,
-				  u16 msix_vec)
+static struct virtqueue *setup_legacy_vq(struct virtio_device *vdev,
+					 unsigned index,
+					 void (*callback)(struct virtqueue *vq),
+					 const char *name,
+					 u16 msix_vec)
 {
 	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
 	struct virtio_pci_vq_info *info;
@@ -326,7 +217,7 @@ out_info:
 	return ERR_PTR(err);
 }
 
-static void vp_del_vq(struct virtqueue *vq)
+static void del_legacy_vq(struct virtqueue *vq)
 {
 	struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev);
 	struct virtio_pci_vq_info *info = vq->priv;
@@ -359,94 +250,10 @@ static void vp_del_vq(struct virtqueue *
 static void vp_del_vqs(struct virtio_device *vdev)
 {
 	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
-	struct virtqueue *vq, *n;
-	struct virtio_pci_vq_info *info;
 
-	list_for_each_entry_safe(vq, n, &vdev->vqs, list) {
-		info = vq->priv;
-		if (vp_dev->per_vq_vectors &&
-			info->msix_vector != VIRTIO_MSI_NO_VECTOR)
-			free_irq(vp_dev->msix_entries[info->msix_vector].vector,
-				 vq);
-		vp_del_vq(vq);
-	}
-	vp_dev->per_vq_vectors = false;
-
-	vp_free_vectors(vdev);
-}
-
-static int vp_try_to_find_vqs(struct virtio_device *vdev, unsigned nvqs,
-			      struct virtqueue *vqs[],
-			      vq_callback_t *callbacks[],
-			      const char *names[],
-			      bool use_msix,
-			      bool per_vq_vectors)
-{
-	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
-	u16 msix_vec;
-	int i, err, nvectors, allocated_vectors;
-
-	if (!use_msix) {
-		/* Old style: one normal interrupt for change and all vqs. */
-		err = vp_request_intx(vdev);
-		if (err)
-			goto error_request;
-	} else {
-		if (per_vq_vectors) {
-			/* Best option: one for change interrupt, one per vq. */
-			nvectors = 1;
-			for (i = 0; i < nvqs; ++i)
-				if (callbacks[i])
-					++nvectors;
-		} else {
-			/* Second best: one for change, shared for all vqs. */
-			nvectors = 2;
-		}
-
-		err = vp_request_msix_vectors(vdev, nvectors, per_vq_vectors);
-		if (err)
-			goto error_request;
-	}
-
-	vp_dev->per_vq_vectors = per_vq_vectors;
-	allocated_vectors = vp_dev->msix_used_vectors;
-	for (i = 0; i < nvqs; ++i) {
-		if (!callbacks[i] || !vp_dev->msix_enabled)
-			msix_vec = VIRTIO_MSI_NO_VECTOR;
-		else if (vp_dev->per_vq_vectors)
-			msix_vec = allocated_vectors++;
-		else
-			msix_vec = VP_MSIX_VQ_VECTOR;
-		vqs[i] = setup_vq(vdev, i, callbacks[i], names[i], msix_vec);
-		if (IS_ERR(vqs[i])) {
-			err = PTR_ERR(vqs[i]);
-			goto error_find;
-		}
-
-		if (!vp_dev->per_vq_vectors || msix_vec == VIRTIO_MSI_NO_VECTOR)
-			continue;
-
-		/* allocate per-vq irq if available and necessary */
-		snprintf(vp_dev->msix_names[msix_vec],
-			 sizeof *vp_dev->msix_names,
-			 "%s-%s",
-			 dev_name(&vp_dev->vdev.dev), names[i]);
-		err = request_irq(vp_dev->msix_entries[msix_vec].vector,
-				  vring_interrupt, 0,
-				  vp_dev->msix_names[msix_vec],
-				  vqs[i]);
-		if (err) {
-			vp_del_vq(vqs[i]);
-			goto error_find;
-		}
-	}
-	return 0;
-
-error_find:
-	vp_del_vqs(vdev);
-
-error_request:
-	return err;
+	return virtio_pci_del_vqs(vdev, vp_dev->legacy +
+				  VIRTIO_MSI_LEGACY_CONFIG_VECTOR,
+				  del_legacy_vq);
 }
 
 /* the config->find_vqs() implementation */
@@ -455,20 +262,12 @@ static int vp_find_vqs(struct virtio_dev
 		       vq_callback_t *callbacks[],
 		       const char *names[])
 {
-	int err;
+	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
 
-	/* Try MSI-X with one vector per queue. */
-	err = vp_try_to_find_vqs(vdev, nvqs, vqs, callbacks, names, true, true);
-	if (!err)
-		return 0;
-	/* Fallback: MSI-X with one vector for config, one shared for queues. */
-	err = vp_try_to_find_vqs(vdev, nvqs, vqs, callbacks, names,
-				 true, false);
-	if (!err)
-		return 0;
-	/* Finally fall back to regular interrupts. */
-	return vp_try_to_find_vqs(vdev, nvqs, vqs, callbacks, names,
-				  false, false);
+	return virtio_pci_find_vqs(vdev, vp_dev->legacy +
+				   VIRTIO_MSI_LEGACY_CONFIG_VECTOR,
+				   setup_legacy_vq, del_legacy_vq,
+				   nvqs, vqs, callbacks, names);
 }
 
 static struct virtio_config_ops virtio_pci_config_ops = {

^ permalink raw reply	[flat|nested] 106+ messages in thread

* [RFC 11/11] virtio_pci: simplify common helpers.
  2011-12-08 10:22 [PATCH 0/11] RFC: PCI using capabilities Rusty Russell
                   ` (10 preceding siblings ...)
  2011-12-08 10:42 ` [RFC 10/11] virtio_pci: share virtqueue setup/teardown between modern and legacy driver Rusty Russell
@ 2011-12-08 10:44 ` Rusty Russell
  2011-12-08 15:37 ` [PATCH 0/11] RFC: PCI using capabilities Sasha Levin
  2011-12-08 15:37 ` Sasha Levin
  13 siblings, 0 replies; 106+ messages in thread
From: Rusty Russell @ 2011-12-08 10:44 UTC (permalink / raw)
  To: virtualization; +Cc: Sasha Levin, Michael S. Tsirkin

Our helpers can take a virtio_pci_device, rather than converting from
a virtio_device all the time.  They couldn't do this when they were
called from the common virtio code, but now that we wrap them anyway,
this simplifies things.
---
 drivers/virtio/virtio_pci-common.c |   54 ++++++++++++++++---------------------
 drivers/virtio/virtio_pci-common.h |    6 ++--
 drivers/virtio/virtio_pci.c        |   10 +++---
 drivers/virtio/virtio_pci_legacy.c |    9 ++----
 4 files changed, 36 insertions(+), 43 deletions(-)

diff --git a/drivers/virtio/virtio_pci-common.c b/drivers/virtio/virtio_pci-common.c
--- a/drivers/virtio/virtio_pci-common.c
+++ b/drivers/virtio/virtio_pci-common.c
@@ -81,10 +81,9 @@ irqreturn_t virtio_pci_interrupt(int irq
 	return virtio_pci_vring_interrupt(irq, opaque);
 }
 
-static void vp_free_vectors(struct virtio_device *vdev,
+static void vp_free_vectors(struct virtio_pci_device *vp_dev,
 			    __le16 __iomem *msix_config)
 {
-	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
 	int i;
 
 	if (vp_dev->intx_enabled) {
@@ -113,11 +112,10 @@ static void vp_free_vectors(struct virti
 	vp_dev->msix_entries = NULL;
 }
 
-static int vp_request_msix_vectors(struct virtio_device *vdev,
+static int vp_request_msix_vectors(struct virtio_pci_device *vp_dev,
 				   __le16 __iomem *msix_config,
 				   int nvectors, bool per_vq_vectors)
 {
-	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
 	const char *name = dev_name(&vp_dev->vdev.dev);
 	unsigned i, v;
 	int err = -ENOMEM;
@@ -176,30 +174,28 @@ static int vp_request_msix_vectors(struc
 	}
 	return 0;
 error:
-	vp_free_vectors(vdev, msix_config);
+	vp_free_vectors(vp_dev, msix_config);
 	return err;
 }
 
-static int vp_request_intx(struct virtio_device *vdev)
+static int vp_request_intx(struct virtio_pci_device *vp_dev)
 {
 	int err;
-	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
-
 	err = request_irq(vp_dev->pci_dev->irq, virtio_pci_interrupt,
-			  IRQF_SHARED, dev_name(&vdev->dev), vp_dev);
+			  IRQF_SHARED, dev_name(&vp_dev->vdev.dev), vp_dev);
 	if (!err)
 		vp_dev->intx_enabled = 1;
 	return err;
 }
 
-static int vp_try_to_find_vqs(struct virtio_device *vdev,
+static int vp_try_to_find_vqs(struct virtio_pci_device *vp_dev,
 			      __le16 __iomem *msix_config,
-			      struct virtqueue *(setup_vq)(struct virtio_device*,
-							   unsigned,
-							   void (*)(struct
-								    virtqueue *),
-							   const char *,
-							   u16 msix_vec),
+			      struct virtqueue *(setup_vq)
+			      (struct virtio_pci_device *,
+			       unsigned,
+			       void (*)(struct virtqueue *),
+			       const char *,
+			       u16 msix_vec),
 			      void (*del_vq)(struct virtqueue *vq),
 			      unsigned nvqs,
 			      struct virtqueue *vqs[],
@@ -208,13 +204,12 @@ static int vp_try_to_find_vqs(struct vir
 			      bool use_msix,
 			      bool per_vq_vectors)
 {
-	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
 	u16 msix_vec;
 	int i, err, nvectors, allocated_vectors;
 
 	if (!use_msix) {
 		/* Old style: one normal interrupt for change and all vqs. */
-		err = vp_request_intx(vdev);
+		err = vp_request_intx(vp_dev);
 		if (err)
 			goto error_request;
 	} else {
@@ -229,7 +224,7 @@ static int vp_try_to_find_vqs(struct vir
 			nvectors = 2;
 		}
 
-		err = vp_request_msix_vectors(vdev, msix_config,
+		err = vp_request_msix_vectors(vp_dev, msix_config,
 					      nvectors, per_vq_vectors);
 		if (err)
 			goto error_request;
@@ -244,7 +239,7 @@ static int vp_try_to_find_vqs(struct vir
 			msix_vec = allocated_vectors++;
 		else
 			msix_vec = VP_MSIX_VQ_VECTOR;
-		vqs[i] = setup_vq(vdev, i, callbacks[i], names[i], msix_vec);
+		vqs[i] = setup_vq(vp_dev, i, callbacks[i], names[i], msix_vec);
 		if (IS_ERR(vqs[i])) {
 			err = PTR_ERR(vqs[i]);
 			goto error_find;
@@ -270,16 +265,16 @@ static int vp_try_to_find_vqs(struct vir
 	return 0;
 
 error_find:
-	virtio_pci_del_vqs(vdev, msix_config, del_vq);
+	virtio_pci_del_vqs(vp_dev, msix_config, del_vq);
 
 error_request:
 	return err;
 }
 
 /* the config->find_vqs() implementation */
-int virtio_pci_find_vqs(struct virtio_device *vdev,
+int virtio_pci_find_vqs(struct virtio_pci_device *vp_dev,
 			__le16 __iomem *msix_config,
-			struct virtqueue *(setup_vq)(struct virtio_device *,
+			struct virtqueue *(setup_vq)(struct virtio_pci_device *,
 						     unsigned,
 						     void (*)(struct virtqueue*),
 						     const char *,
@@ -293,30 +288,29 @@ int virtio_pci_find_vqs(struct virtio_de
 	int err;
 
 	/* Try MSI-X with one vector per queue. */
-	err = vp_try_to_find_vqs(vdev, msix_config, setup_vq, del_vq,
+	err = vp_try_to_find_vqs(vp_dev, msix_config, setup_vq, del_vq,
 				 nvqs, vqs, callbacks, names, true, true);
 	if (!err)
 		return 0;
 	/* Fallback: MSI-X with one vector for config, one shared for queues. */
-	err = vp_try_to_find_vqs(vdev, msix_config, setup_vq, del_vq,
+	err = vp_try_to_find_vqs(vp_dev, msix_config, setup_vq, del_vq,
 				 nvqs, vqs, callbacks, names, true, false);
 	if (!err)
 		return 0;
 	/* Finally fall back to regular interrupts. */
-	return vp_try_to_find_vqs(vdev, msix_config, setup_vq, del_vq,
+	return vp_try_to_find_vqs(vp_dev, msix_config, setup_vq, del_vq,
 				  nvqs, vqs, callbacks, names, false, false);
 }
 
 /* the core of a config->del_vqs() implementation */
-void virtio_pci_del_vqs(struct virtio_device *vdev,
+void virtio_pci_del_vqs(struct virtio_pci_device *vp_dev,
 			__le16 __iomem *msix_config,
 			void (*del_vq)(struct virtqueue *vq))
 {
-	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
 	struct virtqueue *vq, *n;
 	struct virtio_pci_vq_info *info;
 
-	list_for_each_entry_safe(vq, n, &vdev->vqs, list) {
+	list_for_each_entry_safe(vq, n, &vp_dev->vdev.vqs, list) {
 		info = vq->priv;
 		if (vp_dev->per_vq_vectors &&
 			info->msix_vector != VIRTIO_MSI_NO_VECTOR)
@@ -326,5 +320,5 @@ void virtio_pci_del_vqs(struct virtio_de
 	}
 	vp_dev->per_vq_vectors = false;
 
-	vp_free_vectors(vdev, msix_config);
+	vp_free_vectors(vp_dev, msix_config);
 }
diff --git a/drivers/virtio/virtio_pci-common.h b/drivers/virtio/virtio_pci-common.h
--- a/drivers/virtio/virtio_pci-common.h
+++ b/drivers/virtio/virtio_pci-common.h
@@ -87,9 +87,9 @@ irqreturn_t virtio_pci_vring_interrupt(i
 irqreturn_t virtio_pci_interrupt(int irq, void *opaque);
 
 /* Core of a config->find_vqs() implementation */
-int virtio_pci_find_vqs(struct virtio_device *vdev,
+int virtio_pci_find_vqs(struct virtio_pci_device *vp_dev,
 			__le16 __iomem *msix_config,
-			struct virtqueue *(setup_vq)(struct virtio_device *,
+			struct virtqueue *(setup_vq)(struct virtio_pci_device *,
 						     unsigned,
 						     void (*)(struct virtqueue*),
 						     const char *,
@@ -101,7 +101,7 @@ int virtio_pci_find_vqs(struct virtio_de
 			const char *names[]);
 
 /* the core of a config->del_vqs() implementation */
-void virtio_pci_del_vqs(struct virtio_device *vdev,
+void virtio_pci_del_vqs(struct virtio_pci_device *vp_dev,
 			__le16 __iomem *msix_config,
 			void (*del_vq)(struct virtqueue *vq));
 
diff --git a/drivers/virtio/virtio_pci.c b/drivers/virtio/virtio_pci.c
--- a/drivers/virtio/virtio_pci.c
+++ b/drivers/virtio/virtio_pci.c
@@ -163,12 +163,12 @@ static void *alloc_virtqueue_pages(u16 *
 	return NULL;
 }
 
-static struct virtqueue *setup_vq(struct virtio_device *vdev, unsigned index,
+static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev,
+				  unsigned index,
 				  void (*callback)(struct virtqueue *vq),
 				  const char *name,
 				  u16 msix_vec)
 {
-	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
 	struct virtio_pci_vq_info *info;
 	struct virtqueue *vq;
 	u16 num;
@@ -213,7 +213,7 @@ static struct virtqueue *setup_vq(struct
 
 	/* create the vring */
 	vq = vring_new_virtqueue(info->num, VIRTIO_PCI_ALIGN,
-				 vdev, info->queue, virtio_pci_notify,
+				 &vp_dev->vdev, info->queue, virtio_pci_notify,
 				 callback, name);
 	if (!vq) {
 		err = -ENOMEM;
@@ -294,7 +294,7 @@ static void vp_del_vq(struct virtqueue *
 static void vp_del_vqs(struct virtio_device *vdev)
 {
 	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
-	virtio_pci_del_vqs(vdev, &vp_dev->common->msix_config, vp_del_vq);
+	virtio_pci_del_vqs(vp_dev, &vp_dev->common->msix_config, vp_del_vq);
 }
 
 static int vp_find_vqs(struct virtio_device *vdev,
@@ -304,7 +304,7 @@ static int vp_find_vqs(struct virtio_dev
 			const char *names[])
 {
 	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
-	return virtio_pci_find_vqs(vdev, &vp_dev->common->msix_config,
+	return virtio_pci_find_vqs(vp_dev, &vp_dev->common->msix_config,
 				   setup_vq, vp_del_vq,
 				   nvqs, vqs, callbacks, names);
 }
diff --git a/drivers/virtio/virtio_pci_legacy.c b/drivers/virtio/virtio_pci_legacy.c
--- a/drivers/virtio/virtio_pci_legacy.c
+++ b/drivers/virtio/virtio_pci_legacy.c
@@ -134,13 +134,12 @@ static void vp_reset(struct virtio_devic
 	vp_synchronize_vectors(vdev);
 }
 
-static struct virtqueue *setup_legacy_vq(struct virtio_device *vdev,
+static struct virtqueue *setup_legacy_vq(struct virtio_pci_device *vp_dev,
 					 unsigned index,
 					 void (*callback)(struct virtqueue *vq),
 					 const char *name,
 					 u16 msix_vec)
 {
-	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
 	struct virtio_pci_vq_info *info;
 	struct virtqueue *vq;
 	unsigned long flags, size;
@@ -178,7 +177,7 @@ static struct virtqueue *setup_legacy_vq
 
 	/* create the vring */
 	vq = vring_new_virtqueue(info->num, VIRTIO_PCI_VRING_ALIGN,
-				 vdev, info->queue, virtio_pci_notify,
+				 &vp_dev->vdev, info->queue, virtio_pci_notify,
 				 callback, name);
 	if (!vq) {
 		err = -ENOMEM;
@@ -251,7 +250,7 @@ static void vp_del_vqs(struct virtio_dev
 {
 	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
 
-	return virtio_pci_del_vqs(vdev, vp_dev->legacy +
+	return virtio_pci_del_vqs(vp_dev, vp_dev->legacy +
 				  VIRTIO_MSI_LEGACY_CONFIG_VECTOR,
 				  del_legacy_vq);
 }
@@ -264,7 +263,7 @@ static int vp_find_vqs(struct virtio_dev
 {
 	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
 
-	return virtio_pci_find_vqs(vdev, vp_dev->legacy +
+	return virtio_pci_find_vqs(vp_dev, vp_dev->legacy +
 				   VIRTIO_MSI_LEGACY_CONFIG_VECTOR,
 				   setup_legacy_vq, del_legacy_vq,
 				   nvqs, vqs, callbacks, names);

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 0/11] RFC: PCI using capabilitities
  2011-12-08 10:22 [PATCH 0/11] RFC: PCI using capabilitities Rusty Russell
                   ` (12 preceding siblings ...)
  2011-12-08 15:37 ` [PATCH 0/11] RFC: PCI using capabilitities Sasha Levin
@ 2011-12-08 15:37 ` Sasha Levin
  2011-12-09  6:17   ` Rusty Russell
                     ` (2 more replies)
  13 siblings, 3 replies; 106+ messages in thread
From: Sasha Levin @ 2011-12-08 15:37 UTC (permalink / raw)
  To: Rusty Russell; +Cc: virtualization, Michael S. Tsirkin, Avi Kivity, kvm

On Thu, 2011-12-08 at 20:52 +1030, Rusty Russell wrote:
> Here's the patch series I ended up with.  I haven't coded up the QEMU
> side yet, so no idea if the new driver works.
> 
> Questions:
> (1) Do we win from separating ISR, NOTIFY and COMMON?
> (2) I used a "u8 bar"; should I use a bir and pack it instead?  BIR
>     seems a little obscure (no one else in the kernel source seems to
>     refer to it).

I started implementing it for KVM tools, when I noticed a strange thing:
my vq creating was failing because the driver was reading a value other
than 0 from the address field of a new vq, and failing.

I've added simple prints in the usermode code, and saw the following
ordering:

1. queue select vq 0
2. queue read address (returns 0 - new vq)
3. queue write address (good address of vq)
4. queue read address (returns !=0, fails)
4. queue select vq 1

From that I understood that the ordering is wrong, the driver was trying
to read address before selecting the correct vq.

At that point, I've added simple prints to the driver. Initially it
looked as follows:

	iowrite16(index, &vp_dev->common->queue_select);

	switch (ioread64(&vp_dev->common->queue_address)) {
		[...]
	};

So I added prints before the iowrite16() and after the ioread64(), and
saw that while the driver prints were ordered, the device ones weren't:

	[    1.264052] before iowrite index=1
	kvmtool: net returning pfn (vq=0): 310706176
	kvmtool: queue selected: 1
	[    1.264890] after ioread index=1

Suspecting that something was wrong with ordering, I've added a print
between the iowrite and the ioread, and it finally started working well.

Which leads me to the question: Are MMIO vs MMIO reads/writes not
ordered?

-- 

Sasha.


^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 0/11] RFC: PCI using capabilitities
  2011-12-08 10:22 [PATCH 0/11] RFC: PCI using capabilitities Rusty Russell
                   ` (11 preceding siblings ...)
  2011-12-08 10:44 ` [RFC 11/11] virtio_pci: simplify common helpers Rusty Russell
@ 2011-12-08 15:37 ` Sasha Levin
  2011-12-08 15:37 ` Sasha Levin
  13 siblings, 0 replies; 106+ messages in thread
From: Sasha Levin @ 2011-12-08 15:37 UTC (permalink / raw)
  To: Rusty Russell; +Cc: Michael S. Tsirkin, Avi Kivity, kvm, virtualization

On Thu, 2011-12-08 at 20:52 +1030, Rusty Russell wrote:
> Here's the patch series I ended up with.  I haven't coded up the QEMU
> side yet, so no idea if the new driver works.
> 
> Questions:
> (1) Do we win from separating ISR, NOTIFY and COMMON?
> (2) I used a "u8 bar"; should I use a bir and pack it instead?  BIR
>     seems a little obscure (no one else in the kernel source seems to
>     refer to it).

I started implementing it for KVM tools, when I noticed a strange thing:
my vq creating was failing because the driver was reading a value other
than 0 from the address field of a new vq, and failing.

I've added simple prints in the usermode code, and saw the following
ordering:

1. queue select vq 0
2. queue read address (returns 0 - new vq)
3. queue write address (good address of vq)
4. queue read address (returns !=0, fails)
4. queue select vq 1

From that I understood that the ordering is wrong, the driver was trying
to read address before selecting the correct vq.

At that point, I've added simple prints to the driver. Initially it
looked as follows:

	iowrite16(index, &vp_dev->common->queue_select);

	switch (ioread64(&vp_dev->common->queue_address)) {
		[...]
	};

So I added prints before the iowrite16() and after the ioread64(), and
saw that while the driver prints were ordered, the device ones weren't:

	[    1.264052] before iowrite index=1
	kvmtool: net returning pfn (vq=0): 310706176
	kvmtool: queue selected: 1
	[    1.264890] after ioread index=1

Suspecting that something was wrong with ordering, I've added a print
between the iowrite and the ioread, and it finally started working well.

Which leads me to the question: Are MMIO vs MMIO reads/writes not
ordered?

-- 

Sasha.

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 0/11] RFC: PCI using capabilitities
  2011-12-08 15:37 ` Sasha Levin
@ 2011-12-09  6:17   ` Rusty Russell
  2011-12-10 21:32     ` Sasha Levin
  2011-12-11  9:05   ` Avi Kivity
  2011-12-11 12:47   ` Michael S. Tsirkin
  2 siblings, 1 reply; 106+ messages in thread
From: Rusty Russell @ 2011-12-09  6:17 UTC (permalink / raw)
  To: Sasha Levin; +Cc: Michael S. Tsirkin, Avi Kivity, kvm, virtualization

On Thu, 08 Dec 2011 17:37:37 +0200, Sasha Levin <levinsasha928@gmail.com> wrote:
> Which leads me to the question: Are MMIO vs MMIO reads/writes not
> ordered?

That seems really odd, especially being repeatable.

BTW, that's an address, not a pfn now.

Cheers,
Rusty.

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC 4/11] virtio-pci: define layout for virtio vendor-specific capabilities.
  2011-12-08 10:34 ` [RFC 4/11] virtio-pci: define layout for virtio vendor-specific capabilities Rusty Russell
@ 2011-12-10 21:14   ` Sasha Levin
  0 siblings, 0 replies; 106+ messages in thread
From: Sasha Levin @ 2011-12-10 21:14 UTC (permalink / raw)
  To: Rusty Russell; +Cc: Michael S. Tsirkin, virtualization

On Thu, 2011-12-08 at 21:04 +1030, Rusty Russell wrote:
> Based on patch by Michael S. Tsirkin <mst@redhat.com>, but I found it
> hard to follow so changed to use structures which are more
> self-documenting.
> 
> Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
> ---
>  include/linux/virtio_pci.h |   41 +++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 41 insertions(+)
> 
> diff --git a/include/linux/virtio_pci.h b/include/linux/virtio_pci.h
> --- a/include/linux/virtio_pci.h
> +++ b/include/linux/virtio_pci.h
> @@ -92,4 +92,45 @@
>  /* The alignment to use between consumer and producer parts of vring.
>   * x86 pagesize again. */
>  #define VIRTIO_PCI_VRING_ALIGN		4096
> +
> +/* IDs for different capabilities.  Must all exist. */
> +/* FIXME: Do we win from separating ISR, NOTIFY and COMMON? */
> +/* Common configuration */
> +#define VIRTIO_PCI_CAP_COMMON_CFG	1
> +/* Notifications */
> +#define VIRTIO_PCI_CAP_NOTIFY_CFG	2
> +/* ISR access */
> +#define VIRTIO_PCI_CAP_ISR_CFG		3
> +/* Device specific configuration */
> +#define VIRTIO_PCI_CAP_DEVICE_CFG	4
> +
> +/* This is the PCI capability header: */
> +struct virtio_pci_cap {
> +	u8 cap_vndr;	/* Generic PCI field: PCI_CAP_ID_VNDR */
> +	u8 cap_next;	/* Generic PCI field: next ptr. */

There should be a cap_len field here, which is mandatory for
PCI_CAP_ID_VNDR capabilities.

> +	u8 cfg_type;	/* One of the VIRTIO_PCI_CAP_*_CFG. */
> +/* FIXME: Should we use a bir, instead of raw bar number? */
> +	u8 bar;		/* Where to find it. */
> +	__le32 offset;	/* Offset within bar. */
> +	__le32 length;	/* Length. */
> +};
> +
> +/* Fields in VIRTIO_PCI_CAP_COMMON_CFG: */
> +struct virtio_pci_common_cfg {
> +	/* About the whole device. */
> +	__le32 device_feature_select;	/* read-write */
> +	__le32 device_feature;		/* read-only */
> +	__le32 guest_feature_select;	/* read-write */
> +	__le32 guest_feature;		/* read-only */
> +	__le16 msix_config;		/* read-write */
> +	__u8 device_status;		/* read-write */
> +	__u8 unused;
> +
> +	/* About a specific virtqueue. */
> +	__le16 queue_select;	/* read-write */
> +	__le16 queue_align;	/* read-write, power of 2. */
> +	__le16 queue_size;	/* read-write, power of 2. */
> +	__le16 queue_msix_vector;/* read-write */
> +	__le64 queue_address;	/* read-write: 0xFFFFFFFFFFFFFFFF == DNE. */

This is now a 64bit address, but we can't do an atomic iowrite64(). We
should make it clear to the device when it should initialize the vq.

-- 

Sasha.

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC 6/11] virtio_pci: don't use the legacy driver if we find the new PCI capabilities.
  2011-12-08 10:38 ` [RFC 6/11] virtio_pci: don't use the legacy driver if we find the new PCI capabilities Rusty Russell
@ 2011-12-10 21:18   ` Sasha Levin
  2011-12-11  5:15     ` Rusty Russell
  2011-12-11  9:37     ` Michael S. Tsirkin
  0 siblings, 2 replies; 106+ messages in thread
From: Sasha Levin @ 2011-12-10 21:18 UTC (permalink / raw)
  To: Rusty Russell; +Cc: Michael S. Tsirkin, virtualization

On Thu, 2011-12-08 at 21:08 +1030, Rusty Russell wrote:
> With module option to override.  I assume I can call
> pci_find_capability() before pci_request_regions? 

I don't think that's safe.

-- 

Sasha.

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 0/11] RFC: PCI using capabilitities
  2011-12-09  6:17   ` Rusty Russell
@ 2011-12-10 21:32     ` Sasha Levin
  0 siblings, 0 replies; 106+ messages in thread
From: Sasha Levin @ 2011-12-10 21:32 UTC (permalink / raw)
  To: Rusty Russell; +Cc: Michael S. Tsirkin, Avi Kivity, kvm, virtualization

On Fri, 2011-12-09 at 16:47 +1030, Rusty Russell wrote:
> On Thu, 08 Dec 2011 17:37:37 +0200, Sasha Levin <levinsasha928@gmail.com> wrote:
> > Which leads me to the question: Are MMIO vs MMIO reads/writes not
> > ordered?
> 
> That seems really odd, especially being repeatable.

Happens every single time. Can't be a coincidence.

I even went into paranoia mode and made sure that both IO requests come
from the same vcpu.

Another weird thing I've noticed is that mb() doesn't fix it, while if I
replace the mb() with a printk() it works well.

> BTW, that's an address, not a pfn now.

Fixed :)

-- 

Sasha.

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC 6/11] virtio_pci: don't use the legacy driver if we find the new PCI capabilities.
  2011-12-10 21:18   ` Sasha Levin
@ 2011-12-11  5:15     ` Rusty Russell
  2011-12-11  9:37     ` Michael S. Tsirkin
  1 sibling, 0 replies; 106+ messages in thread
From: Rusty Russell @ 2011-12-11  5:15 UTC (permalink / raw)
  To: Sasha Levin; +Cc: Michael S. Tsirkin, virtualization

On Sat, 10 Dec 2011 23:18:38 +0200, Sasha Levin <levinsasha928@gmail.com> wrote:
> On Thu, 2011-12-08 at 21:08 +1030, Rusty Russell wrote:
> > With module option to override.  I assume I can call
> > pci_find_capability() before pci_request_regions? 
> 
> I don't think that's safe.

Actually, a number of drivers do it first thing in probe()
(eg. drivers/char/agp/intel-agp.c).

So I think it's easiest right at the start.  I've changed that.

Thanks,
Rusty.

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 0/11] RFC: PCI using capabilitities
  2011-12-08 15:37 ` Sasha Levin
  2011-12-09  6:17   ` Rusty Russell
@ 2011-12-11  9:05   ` Avi Kivity
  2011-12-11 10:03     ` Sasha Levin
  2011-12-11 12:47   ` Michael S. Tsirkin
  2 siblings, 1 reply; 106+ messages in thread
From: Avi Kivity @ 2011-12-11  9:05 UTC (permalink / raw)
  To: Sasha Levin; +Cc: Michael S. Tsirkin, kvm, virtualization

On 12/08/2011 05:37 PM, Sasha Levin wrote:
> On Thu, 2011-12-08 at 20:52 +1030, Rusty Russell wrote:
> > Here's the patch series I ended up with.  I haven't coded up the QEMU
> > side yet, so no idea if the new driver works.
> > 
> > Questions:
> > (1) Do we win from separating ISR, NOTIFY and COMMON?
> > (2) I used a "u8 bar"; should I use a bir and pack it instead?  BIR
> >     seems a little obscure (no one else in the kernel source seems to
> >     refer to it).
>
> I started implementing it for KVM tools, when I noticed a strange thing:
> my vq creating was failing because the driver was reading a value other
> than 0 from the address field of a new vq, and failing.
>
> I've added simple prints in the usermode code, and saw the following
> ordering:
>
> 1. queue select vq 0
> 2. queue read address (returns 0 - new vq)
> 3. queue write address (good address of vq)
> 4. queue read address (returns !=0, fails)
> 4. queue select vq 1
>
> From that I understood that the ordering is wrong, the driver was trying
> to read address before selecting the correct vq.
>
> At that point, I've added simple prints to the driver. Initially it
> looked as follows:
>
> 	iowrite16(index, &vp_dev->common->queue_select);
>
> 	switch (ioread64(&vp_dev->common->queue_address)) {
> 		[...]
> 	};
>
> So I added prints before the iowrite16() and after the ioread64(), and
> saw that while the driver prints were ordered, the device ones weren't:
>
> 	[    1.264052] before iowrite index=1
> 	kvmtool: net returning pfn (vq=0): 310706176
> 	kvmtool: queue selected: 1
> 	[    1.264890] after ioread index=1
>
> Suspecting that something was wrong with ordering, I've added a print
> between the iowrite and the ioread, and it finally started working well.
>
> Which leads me to the question: Are MMIO vs MMIO reads/writes not
> ordered?

mmios are strictly ordered.

Perhaps your printfs are reordered by buffering?  Are they from
different threads?  Are you using coalesced mmio (which is still
strictly ordered, if used correctly)?

-- 
error compiling committee.c: too many arguments to function

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC 6/11] virtio_pci: don't use the legacy driver if we find the new PCI capabilities.
  2011-12-10 21:18   ` Sasha Levin
  2011-12-11  5:15     ` Rusty Russell
@ 2011-12-11  9:37     ` Michael S. Tsirkin
  1 sibling, 0 replies; 106+ messages in thread
From: Michael S. Tsirkin @ 2011-12-11  9:37 UTC (permalink / raw)
  To: Sasha Levin; +Cc: virtualization

On Sat, Dec 10, 2011 at 11:18:38PM +0200, Sasha Levin wrote:
> On Thu, 2011-12-08 at 21:08 +1030, Rusty Russell wrote:
> > With module option to override.  I assume I can call
> > pci_find_capability() before pci_request_regions? 
> 
> I don't think that's safe.

Why wouldn't it be? pci_find_capability is only
using config cycles, it never needs any regions.

> -- 
> 
> Sasha.

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC 7/11] virtio_pci: new, capability-aware driver.
  2011-12-08 10:39 ` [RFC 7/11] virtio_pci: new, capability-aware driver Rusty Russell
@ 2011-12-11  9:42   ` Michael S. Tsirkin
  2011-12-11 22:45     ` Rusty Russell
  0 siblings, 1 reply; 106+ messages in thread
From: Michael S. Tsirkin @ 2011-12-11  9:42 UTC (permalink / raw)
  To: Rusty Russell; +Cc: Sasha Levin, virtualization

On Thu, Dec 08, 2011 at 09:09:33PM +1030, Rusty Russell wrote:
> +/* There is no iowrite64.  We use two 32-bit ops. */
> +static void iowrite64(u64 val, const __le64 *addr)
> +{
> +	iowrite32((u32)val, (__le32 *)addr);
> +	iowrite32(val >> 32, (__le32 *)addr + 1);
> +}
> +

Let's put addr_lo/addr_hi in the structure then,
to make the fact this field is not atomic explicit?

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 0/11] RFC: PCI using capabilitities
  2011-12-11  9:05   ` Avi Kivity
@ 2011-12-11 10:03     ` Sasha Levin
  2011-12-11 12:30       ` Michael S. Tsirkin
  0 siblings, 1 reply; 106+ messages in thread
From: Sasha Levin @ 2011-12-11 10:03 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Michael S. Tsirkin, kvm, virtualization

On Sun, 2011-12-11 at 11:05 +0200, Avi Kivity wrote:
> mmios are strictly ordered.
> 
> Perhaps your printfs are reordered by buffering?  Are they from
> different threads?  Are you using coalesced mmio (which is still
> strictly ordered, if used correctly)? 

I print the queue_selector and queue_address in the printfs, even if
printfs were reordered they would be printing the data right, unlike
they do now. It's the data in the printfs that matters, not their order.

Same vcpu thread with both accesses.

Not using coalesced mmio.

-- 

Sasha.

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 0/11] RFC: PCI using capabilitities
  2011-12-11 10:03     ` Sasha Levin
@ 2011-12-11 12:30       ` Michael S. Tsirkin
  2011-12-11 12:48         ` Sasha Levin
  2011-12-11 12:48         ` Sasha Levin
  0 siblings, 2 replies; 106+ messages in thread
From: Michael S. Tsirkin @ 2011-12-11 12:30 UTC (permalink / raw)
  To: Sasha Levin; +Cc: Avi Kivity, kvm, virtualization

On Sun, Dec 11, 2011 at 12:03:52PM +0200, Sasha Levin wrote:
> On Sun, 2011-12-11 at 11:05 +0200, Avi Kivity wrote:
> > mmios are strictly ordered.
> > 
> > Perhaps your printfs are reordered by buffering?  Are they from
> > different threads?  Are you using coalesced mmio (which is still
> > strictly ordered, if used correctly)? 
> 
> I print the queue_selector and queue_address in the printfs, even if
> printfs were reordered they would be printing the data right, unlike
> they do now. It's the data in the printfs that matters, not their order.
> 
> Same vcpu thread with both accesses.
> 
> Not using coalesced mmio.

Not sure why this would matter, but is the BAR a prefetchable one?
Rusty's patch uses pci_iomap which maps a prefetchable BAR
as cacheable.


> -- 
> 
> Sasha.

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 0/11] RFC: PCI using capabilitities
  2011-12-08 15:37 ` Sasha Levin
  2011-12-09  6:17   ` Rusty Russell
  2011-12-11  9:05   ` Avi Kivity
@ 2011-12-11 12:47   ` Michael S. Tsirkin
  2011-12-11 12:53     ` Sasha Levin
  2011-12-11 12:53     ` Sasha Levin
  2 siblings, 2 replies; 106+ messages in thread
From: Michael S. Tsirkin @ 2011-12-11 12:47 UTC (permalink / raw)
  To: Sasha Levin; +Cc: Avi Kivity, kvm, virtualization

On Thu, Dec 08, 2011 at 05:37:37PM +0200, Sasha Levin wrote:
> On Thu, 2011-12-08 at 20:52 +1030, Rusty Russell wrote:
> > Here's the patch series I ended up with.  I haven't coded up the QEMU
> > side yet, so no idea if the new driver works.
> > 
> > Questions:
> > (1) Do we win from separating ISR, NOTIFY and COMMON?
> > (2) I used a "u8 bar"; should I use a bir and pack it instead?  BIR
> >     seems a little obscure (no one else in the kernel source seems to
> >     refer to it).
> 
> I started implementing it for KVM tools, when I noticed a strange thing:
> my vq creating was failing because the driver was reading a value other
> than 0 from the address field of a new vq, and failing.
> 
> I've added simple prints in the usermode code, and saw the following
> ordering:
> 
> 1. queue select vq 0
> 2. queue read address (returns 0 - new vq)
> 3. queue write address (good address of vq)
> 4. queue read address (returns !=0, fails)
> 4. queue select vq 1
> 
> From that I understood that the ordering is wrong, the driver was trying
> to read address before selecting the correct vq.
> 
> At that point, I've added simple prints to the driver. Initially it
> looked as follows:
> 
> 	iowrite16(index, &vp_dev->common->queue_select);
> 
> 	switch (ioread64(&vp_dev->common->queue_address)) {
> 		[...]
> 	};
> 
> So I added prints before the iowrite16() and after the ioread64(), and
> saw that while the driver prints were ordered, the device ones weren't:
> 
> 	[    1.264052] before iowrite index=1
> 	kvmtool: net returning pfn (vq=0): 310706176
> 	kvmtool: queue selected: 1
> 	[    1.264890] after ioread index=1
> 
> Suspecting that something was wrong with ordering, I've added a print
> between the iowrite and the ioread, and it finally started working well.
> 
> Which leads me to the question: Are MMIO vs MMIO reads/writes not
> ordered?

First, I'd like to answer your questions from the PCI side.
Look for PCI rules in the PCI spec.
You will notice that a write is required to be able to
pass a read request. It might also pass read completion.
A read request will not pass a write request.
There's more or less no ordering between different types of transactions
(memory versus io/configuration).

That's wrt to the question you asked.

But this is not your setup: you have a single vcpu so
you will not initiate a write (select vq) until you get
a read completion.

So what you are really describing is this setup: guest reads a value,
gets the response, then writes out another one, and kvm tool reports the
write before the read.


> -- 
> 
> Sasha.

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 0/11] RFC: PCI using capabilitities
  2011-12-11 12:30       ` Michael S. Tsirkin
  2011-12-11 12:48         ` Sasha Levin
@ 2011-12-11 12:48         ` Sasha Levin
  1 sibling, 0 replies; 106+ messages in thread
From: Sasha Levin @ 2011-12-11 12:48 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Avi Kivity, Rusty Russell, virtualization, kvm

On Sun, 2011-12-11 at 14:30 +0200, Michael S. Tsirkin wrote:
> On Sun, Dec 11, 2011 at 12:03:52PM +0200, Sasha Levin wrote:
> > On Sun, 2011-12-11 at 11:05 +0200, Avi Kivity wrote:
> > > mmios are strictly ordered.
> > > 
> > > Perhaps your printfs are reordered by buffering?  Are they from
> > > different threads?  Are you using coalesced mmio (which is still
> > > strictly ordered, if used correctly)? 
> > 
> > I print the queue_selector and queue_address in the printfs, even if
> > printfs were reordered they would be printing the data right, unlike
> > they do now. It's the data in the printfs that matters, not their order.
> > 
> > Same vcpu thread with both accesses.
> > 
> > Not using coalesced mmio.
> 
> Not sure why this would matter, but is the BAR a prefetcheable one?
> Rusty's patch uses pci_iomap which maps a prefetcheable BAR
> as cacheable.

Wasn't defined as prefetchable, but I'm seeing same thing with or
without it.

-- 

Sasha.


^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 0/11] RFC: PCI using capabilitities
  2011-12-11 12:30       ` Michael S. Tsirkin
@ 2011-12-11 12:48         ` Sasha Levin
  2011-12-11 12:48         ` Sasha Levin
  1 sibling, 0 replies; 106+ messages in thread
From: Sasha Levin @ 2011-12-11 12:48 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Avi Kivity, kvm, virtualization

On Sun, 2011-12-11 at 14:30 +0200, Michael S. Tsirkin wrote:
> On Sun, Dec 11, 2011 at 12:03:52PM +0200, Sasha Levin wrote:
> > On Sun, 2011-12-11 at 11:05 +0200, Avi Kivity wrote:
> > > mmios are strictly ordered.
> > > 
> > > Perhaps your printfs are reordered by buffering?  Are they from
> > > different threads?  Are you using coalesced mmio (which is still
> > > strictly ordered, if used correctly)? 
> > 
> > I print the queue_selector and queue_address in the printfs, even if
> > printfs were reordered they would be printing the data right, unlike
> > they do now. It's the data in the printfs that matters, not their order.
> > 
> > Same vcpu thread with both accesses.
> > 
> > Not using coalesced mmio.
> 
> Not sure why this would matter, but is the BAR a prefetcheable one?
> Rusty's patch uses pci_iomap which maps a prefetcheable BAR
> as cacheable.

Wasn't defined as prefetchable, but I'm seeing same thing with or
without it.

-- 

Sasha.

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 0/11] RFC: PCI using capabilitities
  2011-12-11 12:47   ` Michael S. Tsirkin
@ 2011-12-11 12:53     ` Sasha Levin
  2011-12-11 12:53     ` Sasha Levin
  1 sibling, 0 replies; 106+ messages in thread
From: Sasha Levin @ 2011-12-11 12:53 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Rusty Russell, virtualization, Avi Kivity, kvm

On Sun, 2011-12-11 at 14:47 +0200, Michael S. Tsirkin wrote:
> First, I'd like to answer your questions from the PCI side.
> Look for PCI rules in the PCI spec.
> You will notices that a write is required to be able to
> pass a read request. It might also pass read completion.
> A read request will not pass a write request.
> There's more or less no ordering between different types of transactions
> (memory versus io/configuration).
> 
> That's wrt to the question you asked.
> 
> But this is not your setup: you have a single vcpu so
> you will not initiate a write (select vq) until you get
> a read completion.
> 
> So what you are really describing is this setup: guest reads a value,
> gets the response, then writes out another one, and kvm tool reports the
> write before the read. 

No, it's exactly the opposite. Guest writes a value first and then reads
one (writes queue_select and reads queue_address) and kvm tool reporting
the read before the write.

I must add here that the kvm tool doesn't do anything fancy with simple
IO/MMIO. There are no thread games or anything similar there. The vcpu
thread is doing all the IO/MMIO work.

-- 

Sasha.


^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 0/11] RFC: PCI using capabilitities
  2011-12-11 12:47   ` Michael S. Tsirkin
  2011-12-11 12:53     ` Sasha Levin
@ 2011-12-11 12:53     ` Sasha Levin
  1 sibling, 0 replies; 106+ messages in thread
From: Sasha Levin @ 2011-12-11 12:53 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Avi Kivity, kvm, virtualization

On Sun, 2011-12-11 at 14:47 +0200, Michael S. Tsirkin wrote:
> First, I'd like to answer your questions from the PCI side.
> Look for PCI rules in the PCI spec.
> You will notices that a write is required to be able to
> pass a read request. It might also pass read completion.
> A read request will not pass a write request.
> There's more or less no ordering between different types of transactions
> (memory versus io/configuration).
> 
> That's wrt to the question you asked.
> 
> But this is not your setup: you have a single vcpu so
> you will not initiate a write (select vq) until you get
> a read completion.
> 
> So what you are really describing is this setup: guest reads a value,
> gets the response, then writes out another one, and kvm tool reports the
> write before the read. 

No, it's exactly the opposite. Guest writes a value first and then reads
one (writes queue_select and reads queue_address) and kvm tool reporting
the read before the write.

I must add here that the kvm tool doesn't do anything fancy with simple
IO/MMIO. There are no thread games or anything similar there. The vcpu
thread is doing all the IO/MMIO work.

-- 

Sasha.

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC 7/11] virtio_pci: new, capability-aware driver.
  2011-12-11  9:42   ` Michael S. Tsirkin
@ 2011-12-11 22:45     ` Rusty Russell
  2011-12-12 11:49       ` Michael S. Tsirkin
  2011-12-12 18:25       ` Michael S. Tsirkin
  0 siblings, 2 replies; 106+ messages in thread
From: Rusty Russell @ 2011-12-11 22:45 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Sasha Levin, virtualization

On Sun, 11 Dec 2011 11:42:56 +0200, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> On Thu, Dec 08, 2011 at 09:09:33PM +1030, Rusty Russell wrote:
> > +/* There is no iowrite64.  We use two 32-bit ops. */
> > +static void iowrite64(u64 val, const __le64 *addr)
> > +{
> > +	iowrite32((u32)val, (__le32 *)addr);
> > +	iowrite32(val >> 32, (__le32 *)addr + 1);
> > +}
> > +
> 
> Let's put addr_lo/addr_hi in the structure then,
> to make the fact this field is not atomic explicit?

Good point, assuming I haven't missed something.

Are 64-bit accesses actually unknown in PCI-land?  Or is this a limited
availability thing?

Thanks,
Rusty.

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC 7/11] virtio_pci: new, capability-aware driver.
  2011-12-11 22:45     ` Rusty Russell
@ 2011-12-12 11:49       ` Michael S. Tsirkin
  2011-12-12 18:10         ` Don Dutile
  2013-05-28  7:56         ` Michael S. Tsirkin
  2011-12-12 18:25       ` Michael S. Tsirkin
  1 sibling, 2 replies; 106+ messages in thread
From: Michael S. Tsirkin @ 2011-12-12 11:49 UTC (permalink / raw)
  To: Rusty Russell; +Cc: Sasha Levin, virtualization

On Mon, Dec 12, 2011 at 09:15:03AM +1030, Rusty Russell wrote:
> On Sun, 11 Dec 2011 11:42:56 +0200, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > On Thu, Dec 08, 2011 at 09:09:33PM +1030, Rusty Russell wrote:
> > > +/* There is no iowrite64.  We use two 32-bit ops. */
> > > +static void iowrite64(u64 val, const __le64 *addr)
> > > +{
> > > +	iowrite32((u32)val, (__le32 *)addr);
> > > +	iowrite32(val >> 32, (__le32 *)addr + 1);
> > > +}
> > > +
> > 
> > Let's put addr_lo/addr_hi in the structure then,
> > to make the fact this field is not atomic explicit?
> 
> Good point, assuming I haven't missed something.
> 
> Are 64-bit accesses actually unknown in PCI-land?  Or is this a limited
> availability thing?
> 
> Thanks,
> Rusty.

I think PCI can optionally support atomic 64 bit accesses, but not all
architectures can generate them.

-- 
MST

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC 7/11] virtio_pci: new, capability-aware driver.
  2011-12-12 11:49       ` Michael S. Tsirkin
@ 2011-12-12 18:10         ` Don Dutile
  2011-12-16  1:58           ` Rusty Russell
  2013-05-28  7:56         ` Michael S. Tsirkin
  1 sibling, 1 reply; 106+ messages in thread
From: Don Dutile @ 2011-12-12 18:10 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Sasha Levin, virtualization

On 12/12/2011 06:49 AM, Michael S. Tsirkin wrote:
> On Mon, Dec 12, 2011 at 09:15:03AM +1030, Rusty Russell wrote:
>> On Sun, 11 Dec 2011 11:42:56 +0200, "Michael S. Tsirkin"<mst@redhat.com>  wrote:
>>> On Thu, Dec 08, 2011 at 09:09:33PM +1030, Rusty Russell wrote:
>>>> +/* There is no iowrite64.  We use two 32-bit ops. */
>>>> +static void iowrite64(u64 val, const __le64 *addr)
>>>> +{
>>>> +	iowrite32((u32)val, (__le32 *)addr);
>>>> +	iowrite32(val>>  32, (__le32 *)addr + 1);
>>>> +}
>>>> +
>>>
>>> Let's put addr_lo/addr_hi in the structure then,
>>> to make the fact this field is not atomic explicit?
>>
>> Good point, assuming I haven't missed something.
>>
>> Are 64-bit accesses actually unknown in PCI-land?  Or is this a limited
>> availability thing?
>>
>> Thanks,
>> Rusty.
>
> I think PCI can optionally support atomic 64 bit accesses, but not all
> architectures can generate them.
>
yes. PCI(e) supports atomic 64-bit ops; it's dependent on CPU & chipset interface
to PCI that determines ability to generate a 64-bit length xaction.

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC 7/11] virtio_pci: new, capability-aware driver.
  2011-12-11 22:45     ` Rusty Russell
  2011-12-12 11:49       ` Michael S. Tsirkin
@ 2011-12-12 18:25       ` Michael S. Tsirkin
  2011-12-13  2:21         ` Rusty Russell
  1 sibling, 1 reply; 106+ messages in thread
From: Michael S. Tsirkin @ 2011-12-12 18:25 UTC (permalink / raw)
  To: Rusty Russell; +Cc: Sasha Levin, virtualization

On Mon, Dec 12, 2011 at 09:15:03AM +1030, Rusty Russell wrote:
> On Sun, 11 Dec 2011 11:42:56 +0200, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > On Thu, Dec 08, 2011 at 09:09:33PM +1030, Rusty Russell wrote:
> > > +/* There is no iowrite64.  We use two 32-bit ops. */
> > > +static void iowrite64(u64 val, const __le64 *addr)
> > > +{
> > > +	iowrite32((u32)val, (__le32 *)addr);
> > > +	iowrite32(val >> 32, (__le32 *)addr + 1);
> > > +}
> > > +
> > 
> > Let's put addr_lo/addr_hi in the structure then,
> > to make the fact this field is not atomic explicit?
> 
> Good point, assuming I haven't missed something.
> 
> Are 64-bit accesses actually unknown in PCI-land?  Or is this a limited
> availability thing?
> 
> Thanks,
> Rusty.

By the way, a generic question on virtio-pci: we now have:

/* virtio config->get() implementation */
static void vp_get(struct virtio_device *vdev, unsigned offset,
                   void *buf, unsigned len)
{
        struct virtio_pci_device *vp_dev = to_vp_device(vdev);
        void __iomem *ioaddr = vp_dev->ioaddr +
                                VIRTIO_PCI_CONFIG(vp_dev) + offset;
        u8 *ptr = buf;
        int i;

        for (i = 0; i < len; i++)
                ptr[i] = ioread8(ioaddr + i);
}

This means that if configuration is read while
it is changed, we might get an inconsistent state,
with parts of a 64 bit field coming from old
and parts from new value.

Isn't this a problem?

-- 
MST

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC 7/11] virtio_pci: new, capability-aware driver.
  2011-12-12 18:25       ` Michael S. Tsirkin
@ 2011-12-13  2:21         ` Rusty Russell
  2011-12-15  8:27           ` Michael S. Tsirkin
  0 siblings, 1 reply; 106+ messages in thread
From: Rusty Russell @ 2011-12-13  2:21 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Sasha Levin, virtualization

On Mon, 12 Dec 2011 20:25:34 +0200, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> By the way, a generic question on virtio-pci: we now have:
> 
> /* virtio config->get() implementation */
> static void vp_get(struct virtio_device *vdev, unsigned offset,
>                    void *buf, unsigned len)
> {
>         struct virtio_pci_device *vp_dev = to_vp_device(vdev);
>         void __iomem *ioaddr = vp_dev->ioaddr +
>                                 VIRTIO_PCI_CONFIG(vp_dev) + offset;
>         u8 *ptr = buf;
>         int i;
> 
>         for (i = 0; i < len; i++)
>                 ptr[i] = ioread8(ioaddr + i);
> }
> 
> This means that if configuration is read while
> it is changed, we might get an inconsistent state,
> with parts of a 64 bit field coming from old
> and parts from new value.
> 
> Isn't this a problem?

I don't think so; it's the caller's problem if they need to do locking.
Is there a caller which needs this?

Or am I missing something?
Rusty.

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC 7/11] virtio_pci: new, capability-aware driver.
  2011-12-13  2:21         ` Rusty Russell
@ 2011-12-15  8:27           ` Michael S. Tsirkin
  2011-12-16  1:50             ` Rusty Russell
  0 siblings, 1 reply; 106+ messages in thread
From: Michael S. Tsirkin @ 2011-12-15  8:27 UTC (permalink / raw)
  To: Rusty Russell; +Cc: Sasha Levin, virtualization

On Tue, Dec 13, 2011 at 12:51:20PM +1030, Rusty Russell wrote:
> On Mon, 12 Dec 2011 20:25:34 +0200, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > By the way, a generic question on virtio-pci: we now have:
> > 
> > /* virtio config->get() implementation */
> > static void vp_get(struct virtio_device *vdev, unsigned offset,
> >                    void *buf, unsigned len)
> > {
> >         struct virtio_pci_device *vp_dev = to_vp_device(vdev);
> >         void __iomem *ioaddr = vp_dev->ioaddr +
> >                                 VIRTIO_PCI_CONFIG(vp_dev) + offset;
> >         u8 *ptr = buf;
> >         int i;
> > 
> >         for (i = 0; i < len; i++)
> >                 ptr[i] = ioread8(ioaddr + i);
> > }
> > 
> > This means that if configuration is read while
> > it is changed, we might get an inconsistent state,
> > with parts of a 64 bit field coming from old
> > and parts from new value.
> > 
> > Isn't this a problem?
> 
> I don't think so; it's the caller's problem if they need to do locking.
> Is there a caller which needs this?
> 
> Or am I missing something?
> Rusty.


I mean like this in block:



        /* Host must always specify the capacity. */
        vdev->config->get(vdev, offsetof(struct virtio_blk_config,
capacity),
                          &capacity, sizeof(capacity));

        /* If capacity is too big, truncate with warning. */
        if ((sector_t)capacity != capacity) {
                dev_warn(&vdev->dev, "Capacity %llu too large:
truncating\n",
                         (unsigned long long)capacity);
                capacity = (sector_t)-1;
        }


Now let's assume capacity field is changed from 0x8000 to 0x10000
on host. Is it possible that we read two upper bytes
before the change so we see 0x0000....
and 2 lower bytes after the change
so we see 0x....0000 and resulting capacity appears
to be 0?

If no why not?

And what kind of locking can help?

-- 
MST

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC 3/11] pci: add pci_iomap_range
  2011-12-08 10:32 ` [RFC 3/11] pci: add pci_iomap_range Rusty Russell
@ 2011-12-15  8:30   ` Michael S. Tsirkin
  2011-12-16  1:56     ` Rusty Russell
  0 siblings, 1 reply; 106+ messages in thread
From: Michael S. Tsirkin @ 2011-12-15  8:30 UTC (permalink / raw)
  To: Rusty Russell; +Cc: Sasha Levin, virtualization

On Thu, Dec 08, 2011 at 09:02:46PM +1030, Rusty Russell wrote:
> From: Michael S Tsirkin <mst@redhat.com>
> 
> Virtio drivers should map the part of the range they need, not necessarily
> all of it.
> 
> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>


I think that we should add a forcenocache flag.
This will let devices put the cap structure in
the prefetcheable BAR. That has an advantage that
it can be located anywhere in the 2^64 space,
while non-prefetcheable BARs are limited to lower 4G
for devices behind a PCI-to-PCI bridge.

> ---
>  include/asm-generic/io.h    |    4 ++++
>  include/asm-generic/iomap.h |   11 +++++++++++
>  lib/iomap.c                 |   41 ++++++++++++++++++++++++++++++++++++-----
>  3 files changed, 51 insertions(+), 5 deletions(-)
> 
> diff --git a/include/asm-generic/io.h b/include/asm-generic/io.h
> index 9120887..3cf1787 100644
> --- a/include/asm-generic/io.h
> +++ b/include/asm-generic/io.h
> @@ -286,6 +286,10 @@ static inline void writesb(const void __iomem *addr, const void *buf, int len)
>  /* Create a virtual mapping cookie for a PCI BAR (memory or IO) */
>  struct pci_dev;
>  extern void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long max);
> +extern void __iomem *pci_iomap_range(struct pci_dev *dev, int bar,
> +				     unsigned offset,
> +				     unsigned long minlen,
> +				     unsigned long maxlen);
>  static inline void pci_iounmap(struct pci_dev *dev, void __iomem *p)
>  {
>  }
> diff --git a/include/asm-generic/iomap.h b/include/asm-generic/iomap.h
> index 98dcd76..6f192d4 100644
> --- a/include/asm-generic/iomap.h
> +++ b/include/asm-generic/iomap.h
> @@ -70,8 +70,19 @@ extern void ioport_unmap(void __iomem *);
>  /* Create a virtual mapping cookie for a PCI BAR (memory or IO) */
>  struct pci_dev;
>  extern void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long max);
> +extern void __iomem *pci_iomap_range(struct pci_dev *dev, int bar,
> +				     unsigned offset,
> +				     unsigned long minlen,
> +				     unsigned long maxlen);
>  extern void pci_iounmap(struct pci_dev *dev, void __iomem *);
>  #else
> +static inline void __iomem *pci_iomap_range(struct pci_dev *dev, int bar,
> +					    unsigned offset,
> +					    unsigned long minlen,
> +					    unsigned long maxlen)
> +{
> +	return NULL;
> +}
>  struct pci_dev;
>  static inline void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long max)
>  {
> diff --git a/lib/iomap.c b/lib/iomap.c
> index 5dbcb4b..93ae915 100644
> --- a/lib/iomap.c
> +++ b/lib/iomap.c
> @@ -243,26 +243,37 @@ EXPORT_SYMBOL(ioport_unmap);
>  
>  #ifdef CONFIG_PCI
>  /**
> - * pci_iomap - create a virtual mapping cookie for a PCI BAR
> + * pci_iomap_range - create a virtual mapping cookie for a PCI BAR
>   * @dev: PCI device that owns the BAR
>   * @bar: BAR number
> - * @maxlen: length of the memory to map
> + * @offset: map memory at the given offset in BAR
> + * @minlen: min length of the memory to map
> + * @maxlen: max length of the memory to map
>   *
>   * Using this function you will get a __iomem address to your device BAR.
>   * You can access it using ioread*() and iowrite*(). These functions hide
>   * the details if this is a MMIO or PIO address space and will just do what
>   * you expect from them in the correct way.
>   *
> + * @minlen specifies the minimum length to map. We check that BAR is
> + * large enough.
>   * @maxlen specifies the maximum length to map. If you want to get access to
> - * the complete BAR without checking for its length first, pass %0 here.
> + * the complete BAR from offset to the end, pass %0 here.
>   * */
> -void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long maxlen)
> +void __iomem *pci_iomap_range(struct pci_dev *dev, int bar,
> +			      unsigned offset,
> +			      unsigned long minlen,
> +			      unsigned long maxlen)
>  {
>  	resource_size_t start = pci_resource_start(dev, bar);
>  	resource_size_t len = pci_resource_len(dev, bar);
>  	unsigned long flags = pci_resource_flags(dev, bar);
>  
> -	if (!len || !start)
> +	if (len <= offset || !start)
> +		return NULL;
> +	len -= offset;
> +	start += offset;
> +	if (len < minlen)
>  		return NULL;
>  	if (maxlen && len > maxlen)
>  		len = maxlen;
> @@ -277,10 +288,30 @@ void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long maxlen)
>  	return NULL;
>  }
>  
> +/**
> + * pci_iomap - create a virtual mapping cookie for a PCI BAR
> + * @dev: PCI device that owns the BAR
> + * @bar: BAR number
> + * @maxlen: length of the memory to map
> + *
> + * Using this function you will get a __iomem address to your device BAR.
> + * You can access it using ioread*() and iowrite*(). These functions hide
> + * the details if this is a MMIO or PIO address space and will just do what
> + * you expect from them in the correct way.
> + *
> + * @maxlen specifies the maximum length to map. If you want to get access to
> + * the complete BAR without checking for its length first, pass %0 here.
> + * */
> +void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long maxlen)
> +{
> +	return pci_iomap_range(dev, bar, 0, 0, maxlen);
> +}
> +
>  void pci_iounmap(struct pci_dev *dev, void __iomem * addr)
>  {
>  	IO_COND(addr, /* nothing */, iounmap(addr));
>  }
>  EXPORT_SYMBOL(pci_iomap);
> +EXPORT_SYMBOL(pci_iomap_range);
>  EXPORT_SYMBOL(pci_iounmap);
>  #endif /* CONFIG_PCI */

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC 7/11] virtio_pci: new, capability-aware driver.
  2011-12-15  8:27           ` Michael S. Tsirkin
@ 2011-12-16  1:50             ` Rusty Russell
  2011-12-18 10:18               ` Michael S. Tsirkin
  0 siblings, 1 reply; 106+ messages in thread
From: Rusty Russell @ 2011-12-16  1:50 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Sasha Levin, virtualization

On Thu, 15 Dec 2011 10:27:50 +0200, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> On Tue, Dec 13, 2011 at 12:51:20PM +1030, Rusty Russell wrote:
> I mean like this in block:
> 
> 
> 
>         /* Host must always specify the capacity. */
>         vdev->config->get(vdev, offsetof(struct virtio_blk_config,
> capacity),
>                           &capacity, sizeof(capacity));
> 
>         /* If capacity is too big, truncate with warning. */
>         if ((sector_t)capacity != capacity) {
>                 dev_warn(&vdev->dev, "Capacity %llu too large:
> truncating\n",
>                          (unsigned long long)capacity);
>                 capacity = (sector_t)-1;
>         }
> 
> 
> Now let's assume capacity field is changed from 0x8000 to 0x10000
> on host. Is it possible that we read two upper bytes
> before the change so we see 0x0000....
> and 2 lower bytes after the change
> so we see 0x....0000 and resulting capacity appears
> to be 0?
> 
> If no why not?

Same issue in reverse with the guest setting the MAC address in
virtio_net, if the host were reading it.  And virtio_balloon?  We have
ignored it, so far.

Perhaps a new feature VIRTIO_F_UNSTABLE?  Which (unlike other features)
appears and vanishes around config writes by either side?  Kind of a
hack though...

Cheers,
Rusty.

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC 3/11] pci: add pci_iomap_range
  2011-12-15  8:30   ` Michael S. Tsirkin
@ 2011-12-16  1:56     ` Rusty Russell
  2011-12-26 14:05         ` Michael S Tsirkin
  0 siblings, 1 reply; 106+ messages in thread
From: Rusty Russell @ 2011-12-16  1:56 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Sasha Levin, virtualization

On Thu, 15 Dec 2011 10:30:49 +0200, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> On Thu, Dec 08, 2011 at 09:02:46PM +1030, Rusty Russell wrote:
> > From: Michael S Tsirkin <mst@redhat.com>
> > 
> > Virtio drivers should map the part of the range they need, not necessarily
> > all of it.
> > 
> > Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> > Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
> 
> 
> I think that we should add a forcenocache flag.
> This will let devices put the cap structure in
> the prefetcheable BAR. That has an advantage that
> it can be located anywhere in the 2^64 space,
> while non-prefetcheable BARs are limited to lower 4G
> for devices behind a PCI-to-PCI bridge.

OK, want to respin that patch (or patch on top and I'll fold?)

Thanks,
Rusty.

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC 7/11] virtio_pci: new, capability-aware driver.
  2011-12-12 18:10         ` Don Dutile
@ 2011-12-16  1:58           ` Rusty Russell
  0 siblings, 0 replies; 106+ messages in thread
From: Rusty Russell @ 2011-12-16  1:58 UTC (permalink / raw)
  To: Don Dutile, Michael S. Tsirkin; +Cc: Sasha Levin, virtualization

On Mon, 12 Dec 2011 13:10:08 -0500, Don Dutile <ddutile@redhat.com> wrote:
> On 12/12/2011 06:49 AM, Michael S. Tsirkin wrote:
> > On Mon, Dec 12, 2011 at 09:15:03AM +1030, Rusty Russell wrote:
> >> On Sun, 11 Dec 2011 11:42:56 +0200, "Michael S. Tsirkin"<mst@redhat.com>  wrote:
> >>> On Thu, Dec 08, 2011 at 09:09:33PM +1030, Rusty Russell wrote:
> >>>> +/* There is no iowrite64.  We use two 32-bit ops. */
> >>>> +static void iowrite64(u64 val, const __le64 *addr)
> >>>> +{
> >>>> +	iowrite32((u32)val, (__le32 *)addr);
> >>>> +	iowrite32(val>>  32, (__le32 *)addr + 1);
> >>>> +}
> >>>> +
> >>>
> >>> Let's put addr_lo/addr_hi in the structure then,
> >>> to make the fact this field is not atomic explicit?
> >>
> >> Good point, assuming I haven't missed something.
> >>
> >> Are 64-bit accesses actually unknown in PCI-land?  Or is this a limited
> >> availability thing?
> >>
> >> Thanks,
> >> Rusty.
> >
> > I think PCI can optionally support atomic 64 bit accesses, but not all
> > architectures can generate them.
> >
> yes. PCI(e) support atomic 64-bit ops; it's dependent on CPU & chipset interface
> to PCI that determines ability to generate a 64-bit length xaction.

Does this mean it's possible to detect inside Linux?  I'd like to use it
if we can, but if everyone is really going to do two 32 bit writes, then
we should just define it that way.

Thanks,
Rusty.

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC 7/11] virtio_pci: new, capability-aware driver.
  2011-12-16  1:50             ` Rusty Russell
@ 2011-12-18 10:18               ` Michael S. Tsirkin
  2011-12-19  6:06                 ` Rusty Russell
  0 siblings, 1 reply; 106+ messages in thread
From: Michael S. Tsirkin @ 2011-12-18 10:18 UTC (permalink / raw)
  To: Rusty Russell; +Cc: Sasha Levin, virtualization

On Fri, Dec 16, 2011 at 12:20:08PM +1030, Rusty Russell wrote:
> On Thu, 15 Dec 2011 10:27:50 +0200, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > On Tue, Dec 13, 2011 at 12:51:20PM +1030, Rusty Russell wrote:
> > I mean like this in block:
> > 
> > 
> > 
> >         /* Host must always specify the capacity. */
> >         vdev->config->get(vdev, offsetof(struct virtio_blk_config,
> > capacity),
> >                           &capacity, sizeof(capacity));
> > 
> >         /* If capacity is too big, truncate with warning. */
> >         if ((sector_t)capacity != capacity) {
> >                 dev_warn(&vdev->dev, "Capacity %llu too large:
> > truncating\n",
> >                          (unsigned long long)capacity);
> >                 capacity = (sector_t)-1;
> >         }
> > 
> > 
> > Now let's assume capacity field is changed from 0x8000 to 0x10000
> > on host. Is it possible that we read two upper bytes
> > before the change so we see 0x0000....
> > and 2 lower bytes after the change
> > so we see 0x....0000 and resulting capacity appears
> > to be 0?
> > 
> > If no why not?
> 
> Same issue in reverse with the guest setting the MAC address in
> virtio_net, if the host were reading it.  And virtio_balloon?  We have
> ignored it, so far.
> 
> Perhaps a new feature VIRTIO_F_UNSTABLE?  Which (unlike other features)
> appears and vanishes around config writes by either side?  Kind of a
> hack though...
> 
> Cheers,
> Rusty.

Not sure how this can work in such a setup: when would guest
check this bit to avoid races?
A separate registers also seems nicer than a flag.

Some other possible design choices:
- a flag to signal config accesses in progress by guest
  host would need to buffer changes and apply them in one go
  when flag is cleared
- a register to make host get/set config in guest memory
- use a control vq for all devices

The last two involve defining a structure specifying
offset and length of device configuration affected.

The last option is preferable if other transports
besides pci might have the same issue: we solve
it once and for all this way.

-- 
MST

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC 7/11] virtio_pci: new, capability-aware driver.
  2011-12-18 10:18               ` Michael S. Tsirkin
@ 2011-12-19  6:06                 ` Rusty Russell
  2011-12-19  9:13                   ` Michael S. Tsirkin
  0 siblings, 1 reply; 106+ messages in thread
From: Rusty Russell @ 2011-12-19  6:06 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Christian Borntraeger, Sasha Levin, Pawel Moll, virtualization

On Sun, 18 Dec 2011 12:18:32 +0200, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> On Fri, Dec 16, 2011 at 12:20:08PM +1030, Rusty Russell wrote:
> > Perhaps a new feature VIRTIO_F_UNSTABLE?  Which (unlike other features)
> > appears and vanishes around config writes by either side?  Kind of a
> > hack though...
> 
> Not sure how this can work in such a setup: when would guest
> check this bit to avoid races?
> A separate registers also seems nicer than a flag.
> 
> Some other possible design choices:
> - a flag to signal config accesses in progress by guest
>   host would need to buffer changes and apply them in one go
>   when flag is cleared
> - a register to make host get/set config in guest memory
> - use a control vq for all devices

- seqlock-style generation count register(s)?  Has the advantage of
  being a noop if things never change.

- continue to ignore it ;)

And yes, it's a more general problem than virtio_pci...

Cheers,
Rusty.

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC 7/11] virtio_pci: new, capability-aware driver.
  2011-12-19  6:06                 ` Rusty Russell
@ 2011-12-19  9:13                   ` Michael S. Tsirkin
  2011-12-19 11:08                     ` Rusty Russell
  0 siblings, 1 reply; 106+ messages in thread
From: Michael S. Tsirkin @ 2011-12-19  9:13 UTC (permalink / raw)
  To: Rusty Russell
  Cc: Christian Borntraeger, Sasha Levin, Pawel Moll, virtualization

On Mon, Dec 19, 2011 at 04:36:38PM +1030, Rusty Russell wrote:
> On Sun, 18 Dec 2011 12:18:32 +0200, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > On Fri, Dec 16, 2011 at 12:20:08PM +1030, Rusty Russell wrote:
> > > Perhaps a new feature VIRTIO_F_UNSTABLE?  Which (unlike other features)
> > > appears and vanishes around config writes by either side?  Kind of a
> > > hack though...
> > 
> > Not sure how this can work in such a setup: when would guest
> > check this bit to avoid races?
> > A separate registers also seems nicer than a flag.
> > 
> > Some other possible design choices:
> > - a flag to signal config accesses in progress by guest
> >   host would need to buffer changes and apply them in one go
> >   when flag is cleared
> > - a register to make host get/set config in guest memory
> > - use a control vq for all devices
> 
> - seqlock-style generation count register(s)?
>   Has the advantage of
>   being a noop if things never change.

The counter can be in guest memory, right? So we don't pay extra exits.

> - continue to ignore it ;)

Since you decided on a config layout redesign it seems a good time to
fix architectural problems ...

> And yes, it's a more general problem than virtio_pci...
> 
> Cheers,
> Rusty.


-- 
MST

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC 7/11] virtio_pci: new, capability-aware driver.
  2011-12-19  9:13                   ` Michael S. Tsirkin
@ 2011-12-19 11:08                     ` Rusty Russell
  2011-12-20 11:37                       ` Michael S. Tsirkin
  0 siblings, 1 reply; 106+ messages in thread
From: Rusty Russell @ 2011-12-19 11:08 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Christian Borntraeger, Sasha Levin, Pawel Moll, virtualization

On Mon, 19 Dec 2011 11:13:26 +0200, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> On Mon, Dec 19, 2011 at 04:36:38PM +1030, Rusty Russell wrote:
> > On Sun, 18 Dec 2011 12:18:32 +0200, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > > On Fri, Dec 16, 2011 at 12:20:08PM +1030, Rusty Russell wrote:
> > > > Perhaps a new feature VIRTIO_F_UNSTABLE?  Which (unlike other features)
> > > > appears and vanishes around config writes by either side?  Kind of a
> > > > hack though...
> > > 
> > > Not sure how this can work in such a setup: when would guest
> > > check this bit to avoid races?
> > > A separate registers also seems nicer than a flag.
> > > 
> > > Some other possible design choices:
> > > - a flag to signal config accesses in progress by guest
> > >   host would need to buffer changes and apply them in one go
> > >   when flag is cleared
> > > - a register to make host get/set config in guest memory
> > > - use a control vq for all devices
> > 
> > - seqlock-style generation count register(s)?
> >   Has the advantage of
> >   being a noop if things never change.

Actually, the host doesn't need anything, since it can always lock out
the guest while it updates the area.

It's the guest which can't do atomic updates.

> The counter can be in guest memory, right? So we don't pay extra
> exits.

Could be, but I'm not delighted about the design.  What does the host do
if the guest screws things up?  How long do you wait for them to
complete the seqlock?  Or does it save the old version for use in the
duration?

And we don't have any atomic guest write problems that I know of.  We
can solve it in future (by specifying a config queue).

> > - continue to ignore it ;)
> 
> Since you decided on a config layout redesign it seems a good time to
> fix architectural problems ...

Yes, indeed.

Cheers,
Rusty.

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC 7/11] virtio_pci: new, capability-aware driver.
  2011-12-19 11:08                     ` Rusty Russell
@ 2011-12-20 11:37                       ` Michael S. Tsirkin
  2011-12-21  0:33                         ` Rusty Russell
  0 siblings, 1 reply; 106+ messages in thread
From: Michael S. Tsirkin @ 2011-12-20 11:37 UTC (permalink / raw)
  To: Rusty Russell
  Cc: Christian Borntraeger, Sasha Levin, Pawel Moll, virtualization

On Mon, Dec 19, 2011 at 09:38:42PM +1030, Rusty Russell wrote:
> On Mon, 19 Dec 2011 11:13:26 +0200, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > On Mon, Dec 19, 2011 at 04:36:38PM +1030, Rusty Russell wrote:
> > > On Sun, 18 Dec 2011 12:18:32 +0200, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > > > On Fri, Dec 16, 2011 at 12:20:08PM +1030, Rusty Russell wrote:
> > > > > Perhaps a new feature VIRTIO_F_UNSTABLE?  Which (unlike other features)
> > > > > appears and vanishes around config writes by either side?  Kind of a
> > > > > hack though...
> > > > 
> > > > Not sure how this can work in such a setup: when would guest
> > > > check this bit to avoid races?
> > > > A separate registers also seems nicer than a flag.
> > > > 
> > > > Some other possible design choices:
> > > > - a flag to signal config accesses in progress by guest
> > > >   host would need to buffer changes and apply them in one go
> > > >   when flag is cleared
> > > > - a register to make host get/set config in guest memory
> > > > - use a control vq for all devices
> > > 
> > > - seqlock-style generation count register(s)?
> > >   Has the advantage of
> > >   being a noop if things never change.
> 
> Actually, the host doesn't need anything, since it can always lock out
> the guest while it updates the area.
> It's the guest which can't do atomic updates.

There are 2 cases
- updates by guest accesses by host
- accesses by guest updates by host

Both are problematic because the guest accesses are split.
Consider the example I gave at the beginning was with capacity read
by guest. Host can not solve it without guest changes, right?

> > The counter can be in guest memory, right? So we don't pay extra
> > exits.
> 
> Could be, but I'm not delighted about the design.

OK, so this is an argument for always using a control vq, right?

> What does the host do
> if the guest screws things up?  How long do you wait for them to
> complete the seqlock?  Or does it save the old version for use in the
> duration?

Yes, it will have to only apply the change when seqlock is dropped.

> And we don't have any atomic guest write problems that I know of.  We
> can solve it in future (by specifying a config queue).

Don't have == not reported as observed in the field?
It seems clear from code that we do have a race, correct?

> > > - continue to ignore it ;)
> > 
> > Since you decided on a config layout redesign it seems a good time to
> > fix architectural problems ...
> 
> Yes, indeed.
> 
> Cheers,
> Rusty.

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC 7/11] virtio_pci: new, capability-aware driver.
  2011-12-20 11:37                       ` Michael S. Tsirkin
@ 2011-12-21  0:33                         ` Rusty Russell
  2011-12-21  9:19                           ` Michael S. Tsirkin
  2012-01-10 17:03                           ` Michael S. Tsirkin
  0 siblings, 2 replies; 106+ messages in thread
From: Rusty Russell @ 2011-12-21  0:33 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Christian Borntraeger, Sasha Levin, Pawel Moll, virtualization

On Tue, 20 Dec 2011 13:37:18 +0200, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> On Mon, Dec 19, 2011 at 09:38:42PM +1030, Rusty Russell wrote:
> > On Mon, 19 Dec 2011 11:13:26 +0200, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > 
> > Actually, the host doesn't need anything, since it can always lock out
> > the guest while it updates the area.
> > It's the guest which can't do atomic updates.
> 
> There are 2 cases
> - updates by guest accesses by host
> - accesses by guest updates by host
> 
> Both are problematic because the guest accesses are split.
> Consider the example I gave at the beginning was with capacity read
> by guest. Host can not solve it without guest changes, right?

Indeed, my brain fart.  Let's pretend I didn't say that, and you didn't
have to explain it to me in baby words :)

> > > The counter can be in guest memory, right? So we don't pay extra
> > > exits.
> > 
> > Could be, but I'm not delighted about the design.
> 
> OK, so this is an argument for always using a control vq, right?

Yes.  The idea that we can alter fields in the device-specific config
area is flawed.  There may be cases where it doesn't matter, but as an
idea it was holed to begin with.

We can reduce probability by doing a double read to check, but there are
still cases where it will fail.

> > What does the host do
> > if the guest screws things up?  How long do you wait for them to
> > complete the seqlock?  Or does it save the old version for use in the
> > duration?
> 
> Yes, it will have to only apply the change when seqlock is dropped.

If the seqlock is in normal memory, how does it get notified?  It would
have to poll.  That's annoying, since you don't know when to give up and
declare the device terminally broken.

> Don't have == not reported as observed in the field?
> It seems clear from code that we do have a race, correct?

Yes, and yes.

Cheers,
Rusty.

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC 7/11] virtio_pci: new, capability-aware driver.
  2011-12-21  0:33                         ` Rusty Russell
@ 2011-12-21  9:19                           ` Michael S. Tsirkin
  2012-01-10 17:03                           ` Michael S. Tsirkin
  1 sibling, 0 replies; 106+ messages in thread
From: Michael S. Tsirkin @ 2011-12-21  9:19 UTC (permalink / raw)
  To: Rusty Russell
  Cc: Christian Borntraeger, Sasha Levin, Pawel Moll, virtualization

On Wed, Dec 21, 2011 at 11:03:25AM +1030, Rusty Russell wrote:
> > > What does the host do
> > > if the guest screws things up?  How long do you wait for them to
> > > complete the seqlock?  Or does it save the old version for use in the
> > > duration?
> > 
> > Yes, it will have to only apply the change when seqlock is dropped.
> 
> If the seqlock is in normal memory, how does it get notified?  It would
> have to poll.  That's annoying, since you don't know when to give up and
> declare the device terminally broken.

OK, so you think all devices need a config vq then?
It actually has other benefits:
- devices don't need a design choice between configuration and vq
- minimum number of MSI vectors per device will be 1, not 2
- reduced PCI memory usage
- configuration updates become more lightweight

-- 
MST

^ permalink raw reply	[flat|nested] 106+ messages in thread

* [PATCHv2 RFC] pci: add pci_iomap_range
  2011-12-16  1:56     ` Rusty Russell
@ 2011-12-26 14:05         ` Michael S Tsirkin
  0 siblings, 0 replies; 106+ messages in thread
From: Michael S Tsirkin @ 2011-12-26 14:05 UTC (permalink / raw)
  Cc: Arnd Bergmann, Rusty Russell, Michael S. Tsirkin, Jonas Bonn,
	linux-arch, linux-kernel, Jesse Barnes

Virtio drivers should map the part of the range they need, not necessarily
all of it. They also need non-cacheable mapping even for
prefetchable BARs.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---

>Rusty Russell wrote:
>> I think that we should add a forcenocache flag.
>> This will let devices put the cap structure in
>> the prefetcheable BAR. That has an advantage that
>> it can be located anywhere in the 2^64 space,
>> while non-prefetcheable BARs are limited to lower 4G
>> for devices behind a PCI-to-PCI bridge.
>OK, want to respin that patch (or patch on top and I'll fold?)

Here comes (warning - untested). This is on top of master.
As I'm moving this function for portability into
a separate file, it might be a good idea for this
to be rebased on top of my linux-next tree?

The whole mmio rework is not 3.3 material yet, is it?

 include/asm-generic/io.h    |    5 ++++
 include/asm-generic/iomap.h |   13 ++++++++++++
 lib/iomap.c                 |   46 +++++++++++++++++++++++++++++++++++++-----
 3 files changed, 58 insertions(+), 6 deletions(-)

diff --git a/include/asm-generic/io.h b/include/asm-generic/io.h
index 9120887..78aa159 100644
--- a/include/asm-generic/io.h
+++ b/include/asm-generic/io.h
@@ -286,6 +286,11 @@ static inline void writesb(const void __iomem *addr, const void *buf, int len)
 /* Create a virtual mapping cookie for a PCI BAR (memory or IO) */
 struct pci_dev;
 extern void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long max);
+extern void __iomem *pci_iomap_range(struct pci_dev *dev, int bar,
+				     unsigned offset,
+				     unsigned long minlen,
+				     unsigned long maxlen,
+				     bool force_nocache);
 static inline void pci_iounmap(struct pci_dev *dev, void __iomem *p)
 {
 }
diff --git a/include/asm-generic/iomap.h b/include/asm-generic/iomap.h
index 98dcd76..584a2b6 100644
--- a/include/asm-generic/iomap.h
+++ b/include/asm-generic/iomap.h
@@ -70,8 +70,21 @@ extern void ioport_unmap(void __iomem *);
 /* Create a virtual mapping cookie for a PCI BAR (memory or IO) */
 struct pci_dev;
 extern void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long max);
+extern void __iomem *pci_iomap_range(struct pci_dev *dev, int bar,
+				     unsigned offset,
+				     unsigned long minlen,
+				     unsigned long maxlen,
+				     bool force_nocache);
 extern void pci_iounmap(struct pci_dev *dev, void __iomem *);
 #else
+static inline void __iomem *pci_iomap_range(struct pci_dev *dev, int bar,
+					    unsigned offset,
+					    unsigned long minlen,
+					    unsigned long maxlen,
+					    bool force_nocache)
+{
+	return NULL;
+}
 struct pci_dev;
 static inline void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long max)
 {
diff --git a/lib/iomap.c b/lib/iomap.c
index 5dbcb4b..efa7c29 100644
--- a/lib/iomap.c
+++ b/lib/iomap.c
@@ -243,33 +243,47 @@ EXPORT_SYMBOL(ioport_unmap);
 
 #ifdef CONFIG_PCI
 /**
- * pci_iomap - create a virtual mapping cookie for a PCI BAR
+ * pci_iomap_range - create a virtual mapping cookie for a PCI BAR
  * @dev: PCI device that owns the BAR
  * @bar: BAR number
- * @maxlen: length of the memory to map
+ * @offset: map memory at the given offset in BAR
+ * @minlen: min length of the memory to map
+ * @maxlen: max length of the memory to map
  *
  * Using this function you will get a __iomem address to your device BAR.
  * You can access it using ioread*() and iowrite*(). These functions hide
  * the details if this is a MMIO or PIO address space and will just do what
  * you expect from them in the correct way.
  *
+ * @minlen specifies the minimum length to map. We check that BAR is
+ * large enough.
  * @maxlen specifies the maximum length to map. If you want to get access to
- * the complete BAR without checking for its length first, pass %0 here.
+ * the complete BAR from offset to the end, pass %0 here.
+ * @force_nocache makes the mapping noncacheable even if the BAR
+ * is prefetchable. It has no effect otherwise.
  * */
-void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long maxlen)
+void __iomem *pci_iomap_range(struct pci_dev *dev, int bar,
+			      unsigned offset,
+			      unsigned long minlen,
+			      unsigned long maxlen,
+			      bool force_nocache)
 {
 	resource_size_t start = pci_resource_start(dev, bar);
 	resource_size_t len = pci_resource_len(dev, bar);
 	unsigned long flags = pci_resource_flags(dev, bar);
 
-	if (!len || !start)
+	if (len <= offset || !start)
+		return NULL;
+	len -= offset;
+	start += offset;
+	if (len < minlen)
 		return NULL;
 	if (maxlen && len > maxlen)
 		len = maxlen;
 	if (flags & IORESOURCE_IO)
 		return ioport_map(start, len);
 	if (flags & IORESOURCE_MEM) {
-		if (flags & IORESOURCE_CACHEABLE)
+		if (!force_nocache && (flags & IORESOURCE_CACHEABLE))
 			return ioremap(start, len);
 		return ioremap_nocache(start, len);
 	}
@@ -277,10 +291,30 @@ void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long maxlen)
 	return NULL;
 }
 
+/**
+ * pci_iomap - create a virtual mapping cookie for a PCI BAR
+ * @dev: PCI device that owns the BAR
+ * @bar: BAR number
+ * @maxlen: length of the memory to map
+ *
+ * Using this function you will get a __iomem address to your device BAR.
+ * You can access it using ioread*() and iowrite*(). These functions hide
+ * the details if this is a MMIO or PIO address space and will just do what
+ * you expect from them in the correct way.
+ *
+ * @maxlen specifies the maximum length to map. If you want to get access to
+ * the complete BAR without checking for its length first, pass %0 here.
+ * */
+void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long maxlen)
+{
+	return pci_iomap_range(dev, bar, 0, 0, maxlen, false);
+}
+
 void pci_iounmap(struct pci_dev *dev, void __iomem * addr)
 {
 	IO_COND(addr, /* nothing */, iounmap(addr));
 }
 EXPORT_SYMBOL(pci_iomap);
+EXPORT_SYMBOL(pci_iomap_range);
 EXPORT_SYMBOL(pci_iounmap);
 #endif /* CONFIG_PCI */
-- 
1.7.8.382.g3daff

^ permalink raw reply related	[flat|nested] 106+ messages in thread

* [PATCHv2 RFC] pci: add pci_iomap_range
@ 2011-12-26 14:05         ` Michael S Tsirkin
  0 siblings, 0 replies; 106+ messages in thread
From: Michael S Tsirkin @ 2011-12-26 14:05 UTC (permalink / raw)
  Cc: Arnd Bergmann, Rusty Russell, Michael S. Tsirkin, Jonas Bonn,
	linux-arch, linux-kernel, Jesse Barnes

Virtio drivers should map the part of the range they need, not necessarily
all of it. They also need non-cacheable mapping even for
prefetchable BARs.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---

>Rusty Russell wrote:
>> I think that we should add a forcenocache flag.
>> This will let devices put the cap structure in
>> the prefetcheable BAR. That has an advantage that
>> it can be located anywhere in the 2^64 space,
>> while non-prefetcheable BARs are limited to lower 4G
>> for devices behind a PCI-to-PCI bridge.
>OK, want to respin that patch (or patch on top and I'll fold?)

Here comes (warning - untested). This is on top of master.
As I'm moving this function for portability into
a separate file, it might be a good idea for this
to be rebased on top of my linux-next tree?

The whole mmio rework is not 3.3 material yet, is it?

 include/asm-generic/io.h    |    5 ++++
 include/asm-generic/iomap.h |   13 ++++++++++++
 lib/iomap.c                 |   46 +++++++++++++++++++++++++++++++++++++-----
 3 files changed, 58 insertions(+), 6 deletions(-)

diff --git a/include/asm-generic/io.h b/include/asm-generic/io.h
index 9120887..78aa159 100644
--- a/include/asm-generic/io.h
+++ b/include/asm-generic/io.h
@@ -286,6 +286,11 @@ static inline void writesb(const void __iomem *addr, const void *buf, int len)
 /* Create a virtual mapping cookie for a PCI BAR (memory or IO) */
 struct pci_dev;
 extern void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long max);
+extern void __iomem *pci_iomap_range(struct pci_dev *dev, int bar,
+				     unsigned offset,
+				     unsigned long minlen,
+				     unsigned long maxlen,
+				     bool force_nocache);
 static inline void pci_iounmap(struct pci_dev *dev, void __iomem *p)
 {
 }
diff --git a/include/asm-generic/iomap.h b/include/asm-generic/iomap.h
index 98dcd76..584a2b6 100644
--- a/include/asm-generic/iomap.h
+++ b/include/asm-generic/iomap.h
@@ -70,8 +70,21 @@ extern void ioport_unmap(void __iomem *);
 /* Create a virtual mapping cookie for a PCI BAR (memory or IO) */
 struct pci_dev;
 extern void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long max);
+extern void __iomem *pci_iomap_range(struct pci_dev *dev, int bar,
+				     unsigned offset,
+				     unsigned long minlen,
+				     unsigned long maxlen,
+				     bool force_nocache);
 extern void pci_iounmap(struct pci_dev *dev, void __iomem *);
 #else
+static inline void __iomem *pci_iomap_range(struct pci_dev *dev, int bar,
+					    unsigned offset,
+					    unsigned long minlen,
+					    unsigned long maxlen,
+					    bool force_nocache)
+{
+	return NULL;
+}
 struct pci_dev;
 static inline void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long max)
 {
diff --git a/lib/iomap.c b/lib/iomap.c
index 5dbcb4b..efa7c29 100644
--- a/lib/iomap.c
+++ b/lib/iomap.c
@@ -243,33 +243,47 @@ EXPORT_SYMBOL(ioport_unmap);
 
 #ifdef CONFIG_PCI
 /**
- * pci_iomap - create a virtual mapping cookie for a PCI BAR
+ * pci_iomap_range - create a virtual mapping cookie for a PCI BAR
  * @dev: PCI device that owns the BAR
  * @bar: BAR number
- * @maxlen: length of the memory to map
+ * @offset: map memory at the given offset in BAR
+ * @minlen: min length of the memory to map
+ * @maxlen: max length of the memory to map
  *
  * Using this function you will get a __iomem address to your device BAR.
  * You can access it using ioread*() and iowrite*(). These functions hide
  * the details if this is a MMIO or PIO address space and will just do what
  * you expect from them in the correct way.
  *
+ * @minlen specifies the minimum length to map. We check that BAR is
+ * large enough.
  * @maxlen specifies the maximum length to map. If you want to get access to
- * the complete BAR without checking for its length first, pass %0 here.
+ * the complete BAR from offset to the end, pass %0 here.
+ * @force_nocache makes the mapping noncacheable even if the BAR
+ * is prefetchable. It has no effect otherwise.
  * */
-void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long maxlen)
+void __iomem *pci_iomap_range(struct pci_dev *dev, int bar,
+			      unsigned offset,
+			      unsigned long minlen,
+			      unsigned long maxlen,
+			      bool force_nocache)
 {
 	resource_size_t start = pci_resource_start(dev, bar);
 	resource_size_t len = pci_resource_len(dev, bar);
 	unsigned long flags = pci_resource_flags(dev, bar);
 
-	if (!len || !start)
+	if (len <= offset || !start)
+		return NULL;
+	len -= offset;
+	start += offset;
+	if (len < minlen)
 		return NULL;
 	if (maxlen && len > maxlen)
 		len = maxlen;
 	if (flags & IORESOURCE_IO)
 		return ioport_map(start, len);
 	if (flags & IORESOURCE_MEM) {
-		if (flags & IORESOURCE_CACHEABLE)
+		if (!force_nocache && (flags & IORESOURCE_CACHEABLE))
 			return ioremap(start, len);
 		return ioremap_nocache(start, len);
 	}
@@ -277,10 +291,30 @@ void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long maxlen)
 	return NULL;
 }
 
+/**
+ * pci_iomap - create a virtual mapping cookie for a PCI BAR
+ * @dev: PCI device that owns the BAR
+ * @bar: BAR number
+ * @maxlen: length of the memory to map
+ *
+ * Using this function you will get a __iomem address to your device BAR.
+ * You can access it using ioread*() and iowrite*(). These functions hide
+ * the details if this is a MMIO or PIO address space and will just do what
+ * you expect from them in the correct way.
+ *
+ * @maxlen specifies the maximum length to map. If you want to get access to
+ * the complete BAR without checking for its length first, pass %0 here.
+ * */
+void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long maxlen)
+{
+	return pci_iomap_range(dev, bar, 0, 0, maxlen, false);
+}
+
 void pci_iounmap(struct pci_dev *dev, void __iomem * addr)
 {
 	IO_COND(addr, /* nothing */, iounmap(addr));
 }
 EXPORT_SYMBOL(pci_iomap);
+EXPORT_SYMBOL(pci_iomap_range);
 EXPORT_SYMBOL(pci_iounmap);
 #endif /* CONFIG_PCI */
-- 
1.7.8.382.g3daff

^ permalink raw reply related	[flat|nested] 106+ messages in thread

* Re: [RFC 7/11] virtio_pci: new, capability-aware driver.
  2011-12-21  0:33                         ` Rusty Russell
  2011-12-21  9:19                           ` Michael S. Tsirkin
@ 2012-01-10 17:03                           ` Michael S. Tsirkin
  2012-01-11  0:25                             ` Rusty Russell
  1 sibling, 1 reply; 106+ messages in thread
From: Michael S. Tsirkin @ 2012-01-10 17:03 UTC (permalink / raw)
  To: Rusty Russell
  Cc: Christian Borntraeger, Sasha Levin, Pawel Moll, virtualization

On Wed, Dec 21, 2011 at 11:03:25AM +1030, Rusty Russell wrote:
> On Tue, 20 Dec 2011 13:37:18 +0200, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > On Mon, Dec 19, 2011 at 09:38:42PM +1030, Rusty Russell wrote:
> > > On Mon, 19 Dec 2011 11:13:26 +0200, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > > 
> > > Actually, the host doesn't need anything, since it can always lock out
> > > the guest while it updates the area.
> > > It's the guest which can't do atomic updates.
> > 
> > There are 2 cases
> > - updates by guest accesses by host
> > - accesses by guest updates by host
> > 
> > Both are problematic because the guest accesses are split.
> > Consider the example I gave at the beginning was with capacity read
> > by guest. Host can not solve it without guest changes, right?
> 
> Indeed, my brain fart.  Let's pretend I didn't say that, and you didn't
> have to explain it to me in baby words :)
> 
> > > > The counter can be in guest memory, right? So we don't pay extra
> > > > exits.
> > > 
> > > Could be, but I'm not delighted about the design.
> > 
> > OK, so this is an argument for always using a control vq, right?
> 
> Yes.  The idea that we can alter fields in the device-specific config
> area is flawed.  There may be cases where it doesn't matter, but as an
> idea it was holed to begin with.
> 
> We can reduce probability by doing a double read to check, but there are
> still cases where it will fail.

Okay - want me to propose an interface for that?

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC 7/11] virtio_pci: new, capability-aware driver.
  2012-01-10 17:03                           ` Michael S. Tsirkin
@ 2012-01-11  0:25                             ` Rusty Russell
  2012-01-11  1:48                               ` Benjamin Herrenschmidt
                                                 ` (3 more replies)
  0 siblings, 4 replies; 106+ messages in thread
From: Rusty Russell @ 2012-01-11  0:25 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Christian Borntraeger, Benjamin Herrenschmidt, Sasha Levin,
	Pawel Moll, virtualization

On Tue, 10 Jan 2012 19:03:36 +0200, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> On Wed, Dec 21, 2011 at 11:03:25AM +1030, Rusty Russell wrote:
> > Yes.  The idea that we can alter fields in the device-specific config
> > area is flawed.  There may be cases where it doesn't matter, but as an
> > idea it was holed to begin with.
> > 
> > We can reduce probability by doing a double read to check, but there are
> > still cases where it will fail.
> 
> Okay - want me to propose an interface for that?

Had a brief chat with BenH (CC'd).

I think we should deprecate writing to the config space.  Only balloon
does it AFAICT, and I can't quite figure out *why* it has an 'active'
field.  This solves half the problem, of sync guest writes.  For the
other half, I suggest a generation counter; odd means inconsistent.  The
guest can poll.

BenH also convinced me we should finally make the config space LE if
we're going to change things.  Since PCI is the most common transport,
guest-endian confuses people.  And it sucks for really weird machines.

We should also change the ring (to a single ring, I think).  Descriptors
to 24 bytes long (8 byte cookie, 8 byte addr, 4 byte len, 4 byte flags).
We might be able to squeeze it into 20 bytes but that means packing.  We
should support inline, chained or indirect.  Let the other side ack by
setting flag, cookie and len (if written).

Moreover, I think we should make all these changes at once (at least, in
the spec).  That makes it a big change, and it'll take longer to
develop, but makes it easy in the long run to differentiate legacy and
modern virtio.

Thoughts?
Rusty.

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC 7/11] virtio_pci: new, capability-aware driver.
  2012-01-11  0:25                             ` Rusty Russell
@ 2012-01-11  1:48                               ` Benjamin Herrenschmidt
  2012-01-11  8:47                               ` Stefan Hajnoczi
                                                 ` (2 subsequent siblings)
  3 siblings, 0 replies; 106+ messages in thread
From: Benjamin Herrenschmidt @ 2012-01-11  1:48 UTC (permalink / raw)
  To: Rusty Russell
  Cc: Christian Borntraeger, virtualization, Sasha Levin, Pawel Moll,
	Michael S. Tsirkin

On Wed, 2012-01-11 at 10:55 +1030, Rusty Russell wrote:
> On Tue, 10 Jan 2012 19:03:36 +0200, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > On Wed, Dec 21, 2011 at 11:03:25AM +1030, Rusty Russell wrote:
> > > Yes.  The idea that we can alter fields in the device-specific config
> > > area is flawed.  There may be cases where it doesn't matter, but as an
> > > idea it was holed to begin with.
> > > 
> > > We can reduce probability by doing a double read to check, but there are
> > > still cases where it will fail.
> > 
> > Okay - want me to propose an interface for that?
> 
> Had a brief chat with BenH (CC'd).

Thanks Rusty :-)

> I think we should deprecate writing to the config space.  Only balloon
> does it AFAICT, and I can't quite figure out *why* it has an 'active'
> field.  This solves half the problem, of sync guest writes.  For the
> other half, I suggest a generation counter; odd means inconsistent.  The
> guest can poll.

Agreed. Commands are the way to go.

> BenH also convinced me we should finally make the config space LE if
> we're going to change things.  Since PCI is the most common transport,
> guest-endian confuses people.  And it sucks for really weird machines.

I have a few more "good" reasons but yes, basically it confuses things.
Just see the bugs we are still fixing in qemu :-)

A fixed endian makes everything simpler, there is no need to play games
when using virtio in heterogenous environment (slave CPUs), no need to
handle variable endian on CPUs that support both etc... and the cost of
byteswap is negligible (we do it routinely for normal PCI hardware and
it's never been a hot spot).

> We should also change the ring (to a single ring, I think).  Descriptors
> to 24 bytes long (8 byte cookie, 8 byte addr, 4 byte len, 4 byte flags).

Do we care about space waste ? 32 bytes is a nicely round power of two
and would make general math, but also debugging (visual inspection of
the ring) etc... easier.

If we support immediate data, that means we'd have enough for most
common commands and status straight in the ring.

Another approach that may or may not be worthwhile is to have the size
of the ring elements configurable (as power of two) so that it can be
adjusted to contain both immediate and a descriptor. This is done
typically in network hardware to put the routing headers in the
immediate part and the rest of the packet elsewhere which improves cache
affinity when processing large amount of packets.

> We might be able to squeeze it into 20 bytes but that means packing.  We
> should support inline, chained or indirect.  Let the other side ack by
> setting flag, cookie and len (if written).
> 
> Moreover, I think we should make all these changes at once (at least, in
> the spec).  That makes it a big change, and it'll take longer to
> develop, but makes it easy in the long run to differentiate legacy and
> modern virtio.

Agreed,

Cheers,
Ben.

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC 7/11] virtio_pci: new, capability-aware driver.
  2012-01-11  0:25                             ` Rusty Russell
  2012-01-11  1:48                               ` Benjamin Herrenschmidt
@ 2012-01-11  8:47                               ` Stefan Hajnoczi
  2012-01-11  9:10                                 ` Benjamin Herrenschmidt
  2012-01-11 10:21                               ` Michael S. Tsirkin
  2012-01-11 13:30                               ` Anthony Liguori
  3 siblings, 1 reply; 106+ messages in thread
From: Stefan Hajnoczi @ 2012-01-11  8:47 UTC (permalink / raw)
  To: Rusty Russell
  Cc: Pawel Moll, Michael S. Tsirkin, Benjamin Herrenschmidt,
	virtualization, Christian Borntraeger, Sasha Levin

On Wed, Jan 11, 2012 at 12:25 AM, Rusty Russell <rusty@rustcorp.com.au> wrote:
> On Tue, 10 Jan 2012 19:03:36 +0200, "Michael S. Tsirkin" <mst@redhat.com> wrote:
>> On Wed, Dec 21, 2011 at 11:03:25AM +1030, Rusty Russell wrote:
>> > Yes.  The idea that we can alter fields in the device-specific config
>> > area is flawed.  There may be cases where it doesn't matter, but as an
>> > idea it was holed to begin with.
>> >
>> > We can reduce probability by doing a double read to check, but there are
>> > still cases where it will fail.
>>
>> Okay - want me to propose an interface for that?
>
> Had a brief chat with BenH (CC'd).
>
> I think we should deprecate writing to the config space.  Only balloon
> does it AFAICT, and I can't quite figure out *why* it has an 'active'
> field.  This solves half the problem, of sync guest writes.  For the
> other half, I suggest a generation counter; odd means inconsistent.  The
> guest can poll.
>
> BenH also convinced me we should finally make the config space LE if
> we're going to change things.  Since PCI is the most common transport,
> guest-endian confuses people.  And it sucks for really weird machines.
>
> We should also change the ring (to a single ring, I think).  Descriptors
> to 24 bytes long (8 byte cookie, 8 byte addr, 4 byte len, 4 byte flags).
> We might be able to squeeze it into 20 bytes but that means packing.  We
> should support inline, chained or indirect.  Let the other side ack by
> setting flag, cookie and len (if written).
>
> Moreover, I think we should make all these changes at once (at least, in
> the spec).  That makes it a big change, and it'll take longer to
> develop, but makes it easy in the long run to differentiate legacy and
> modern virtio.

This is also an opportunity to stop using CPU physical addresses in
the ring and instead perform DMA like a normal PCI device (use bus
addresses).

Stefan

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC 7/11] virtio_pci: new, capability-aware driver.
  2012-01-11  8:47                               ` Stefan Hajnoczi
@ 2012-01-11  9:10                                 ` Benjamin Herrenschmidt
  2012-01-11 14:28                                   ` Stefan Hajnoczi
  0 siblings, 1 reply; 106+ messages in thread
From: Benjamin Herrenschmidt @ 2012-01-11  9:10 UTC (permalink / raw)
  To: Stefan Hajnoczi
  Cc: Pawel Moll, Michael S. Tsirkin, virtualization,
	Christian Borntraeger, Sasha Levin

On Wed, 2012-01-11 at 08:47 +0000, Stefan Hajnoczi wrote:
> 
> This is also an opportunity to stop using CPU physical addresses in
> the ring and instead perform DMA like a normal PCI device (use bus
> addresses).

Euh why ?

That would mean in many cases adding a layer of iommu, which will slow
things down a lot ... unless we create a special virtio bus which has
its own dma-ops that do a direct v:p translation that is. But anything
PCI doing dma_map_sg and co will involved emulated iommu mapping &
unmapping and will be a huge hit on performances.

Cheers,
Ben.

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC 7/11] virtio_pci: new, capability-aware driver.
  2012-01-11  0:25                             ` Rusty Russell
  2012-01-11  1:48                               ` Benjamin Herrenschmidt
  2012-01-11  8:47                               ` Stefan Hajnoczi
@ 2012-01-11 10:21                               ` Michael S. Tsirkin
  2012-01-11 21:13                                 ` Benjamin Herrenschmidt
  2012-01-12  2:01                                 ` Rusty Russell
  2012-01-11 13:30                               ` Anthony Liguori
  3 siblings, 2 replies; 106+ messages in thread
From: Michael S. Tsirkin @ 2012-01-11 10:21 UTC (permalink / raw)
  To: Rusty Russell
  Cc: Christian Borntraeger, Benjamin Herrenschmidt, Sasha Levin,
	Pawel Moll, virtualization

On Wed, Jan 11, 2012 at 10:55:52AM +1030, Rusty Russell wrote:
> On Tue, 10 Jan 2012 19:03:36 +0200, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > On Wed, Dec 21, 2011 at 11:03:25AM +1030, Rusty Russell wrote:
> > > Yes.  The idea that we can alter fields in the device-specific config
> > > area is flawed.  There may be cases where it doesn't matter, but as an
> > > idea it was holed to begin with.
> > > 
> > > We can reduce probability by doing a double read to check, but there are
> > > still cases where it will fail.
> > 
> > Okay - want me to propose an interface for that?
> 
> Had a brief chat with BenH (CC'd).
> 
> I think we should deprecate writing to the config space.  Only balloon
> does it AFAICT, and I can't quite figure out *why* it has an 'active'
> field.

Are you sure? I think net writes a mac address.

> This solves half the problem, of sync guest writes.  For the
> other half, I suggest a generation counter; odd means inconsistent.  The
> guest can poll.

So we get the counter until it's even, get the config, if it's changed
repeat? Yes it works. However, I would like to have a way to detect
config change just by looking at memory. ATM we need to read ISR to
know.  If we used a VQ, the advantage would be that the device can work
with a single MSIX vector shared by all VQs.

If we do require config VQ anyway, why not use it to notify
guest of config changes? Guest could pre-post an in buffer
and host uses that.


> BenH also convinced me we should finally make the config space LE if
> we're going to change things.  Since PCI is the most common transport,
> guest-endian confuses people.  And it sucks for really weird machines.

Are we going to keep guest endian for e.g. virtio net header?
If yes the benefit of switching config space is not that big.
And changes in devices would affect non-PCI transports.

> We should also change the ring (to a single ring, I think).  Descriptors
> to 24 bytes long (8 byte cookie, 8 byte addr, 4 byte len, 4 byte flags).
> We might be able to squeeze it into 20 bytes but that means packing.  We
> should support inline, chained or indirect.  Let the other side ack by
> setting flag, cookie and len (if written).

Quite possibly all or some of these things help performance
but do we have to change the spec before we have experimental
proof?

I did experiment with a single ring using tools/virtio and
I didn't see a measurable performance gain. Two rings
do have the advantage of not requiring host side copy,
which copy would surely add to cache pressure.  Since
host doesn't change descriptors, we could also
preformat some descriptors in the current design.
There is a fragmentation problem in theory but it can be alleviated with
a smart allocator. About inline - it can only help very small buffers.
Which workloads do you have in mind exactly?


> Moreover, I think we should make all these changes at once (at least, in
> the spec).  That makes it a big change, and it'll take longer to
> develop, but makes it easy in the long run to differentiate legacy and
> modern virtio.
> 
> Thoughts?
> Rusty.

BTW this seems to be the reverse from what you have in Mar 2011,
see 87mxkjls61.fsf@rustcorp.com.au :)

I am much less concerned with what we do for configuration,
but I do not believe we have learned all performance lessons
from virtio ring1. Is there any reason why we shouldn't be
able to experiment with inline within virtio1 and see
whether that gets us anything?
If we do a bunch of changes to the ring at once, we can't
figure out what's right, what's wrong, or back out of
mistakes later.

Since there are non PCI transports that use the ring,
we really shouldn't make both the configuration and
the ring changes depend on the same feature bit.

-- 
MST

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC 7/11] virtio_pci: new, capability-aware driver.
  2012-01-11  0:25                             ` Rusty Russell
                                                 ` (2 preceding siblings ...)
  2012-01-11 10:21                               ` Michael S. Tsirkin
@ 2012-01-11 13:30                               ` Anthony Liguori
  2012-01-11 15:12                                 ` Michael S. Tsirkin
  2012-01-12  1:35                                 ` Rusty Russell
  3 siblings, 2 replies; 106+ messages in thread
From: Anthony Liguori @ 2012-01-11 13:30 UTC (permalink / raw)
  To: Rusty Russell
  Cc: Pawel Moll, Michael S. Tsirkin, Benjamin Herrenschmidt,
	virtualization, Christian Borntraeger, Sasha Levin

On 01/10/2012 06:25 PM, Rusty Russell wrote:
> On Tue, 10 Jan 2012 19:03:36 +0200, "Michael S. Tsirkin"<mst@redhat.com>  wrote:
>> On Wed, Dec 21, 2011 at 11:03:25AM +1030, Rusty Russell wrote:
>>> Yes.  The idea that we can alter fields in the device-specific config
>>> area is flawed.  There may be cases where it doesn't matter, but as an
>>> idea it was holed to begin with.
>>>
>>> We can reduce probability by doing a double read to check, but there are
>>> still cases where it will fail.
>>
>> Okay - want me to propose an interface for that?
>
> Had a brief chat with BenH (CC'd).
>
> I think we should deprecate writing to the config space.  Only balloon
> does it AFAICT, and I can't quite figure out *why* it has an 'active'
> field.  This solves half the problem, of sync guest writes.  For the
> other half, I suggest a generation counter; odd means inconsistent.  The
> guest can poll.
>
> BenH also convinced me we should finally make the config space LE if
> we're going to change things.  Since PCI is the most common transport,
> guest-endian confuses people.  And it sucks for really weird machines

I think the more important thing to do is require accesses to integers in the 
config space to always be aligned and to use the appropriate accessor. 
Non-integer fields should be restricted to byte access.

That limits config space entries to 32-bit but also means that there is no need 
for a generation counter.  It's also easier to deal with endian conversion that way.

But it means the backend code ends up being much simpler to write (because it 
behaves more like a normal PCI device).

If we're already making the change, the endianness ought to be a feature bit.

> We should also change the ring (to a single ring, I think).

Ack.

> Descriptors
> to 24 bytes long (8 byte cookie, 8 byte addr, 4 byte len, 4 byte flags).
> We might be able to squeeze it into 20 bytes but that means packing.  We
> should support inline, chained or indirect.  Let the other side ack by
> setting flag, cookie and len (if written).
>
> Moreover, I think we should make all these changes at once (at least, in
> the spec).  That makes it a big change, and it'll take longer to
> develop, but makes it easy in the long run to differentiate legacy and
> modern virtio.

Ack.  Long live virtio2! :-)

Regards,

Anthony Liguori

>
> Thoughts?
> Rusty.

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC 7/11] virtio_pci: new, capability-aware driver.
  2012-01-11  9:10                                 ` Benjamin Herrenschmidt
@ 2012-01-11 14:28                                   ` Stefan Hajnoczi
  2012-01-11 15:39                                     ` Michael S. Tsirkin
  2012-01-11 20:46                                     ` Benjamin Herrenschmidt
  0 siblings, 2 replies; 106+ messages in thread
From: Stefan Hajnoczi @ 2012-01-11 14:28 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: Pawel Moll, Michael S. Tsirkin, virtualization,
	Christian Borntraeger, Sasha Levin

On Wed, Jan 11, 2012 at 9:10 AM, Benjamin Herrenschmidt
<benh@kernel.crashing.org> wrote:
> On Wed, 2012-01-11 at 08:47 +0000, Stefan Hajnoczi wrote:
>>
>> This is also an opportunity to stop using CPU physical addresses in
>> the ring and instead perform DMA like a normal PCI device (use bus
>> addresses).
>
> Euh why ?

Because it's a paravirt hack that ends up hitting corner cases.  It's
not possible to do virtio-pci passthrough under nested virtualization
unless we use an IOMMU.  Imagine passing virtio-net from L0 into the
L2 guest (i.e. PCI-passthrough).  If virtio-pci is really "PCI" this
should be possible but it's not when we use physical addresses instead
of bus addresses.

Stefan

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC 7/11] virtio_pci: new, capability-aware driver.
  2012-01-11 13:30                               ` Anthony Liguori
@ 2012-01-11 15:12                                 ` Michael S. Tsirkin
  2012-01-11 15:15                                   ` Anthony Liguori
  2012-01-11 20:50                                   ` Benjamin Herrenschmidt
  2012-01-12  1:35                                 ` Rusty Russell
  1 sibling, 2 replies; 106+ messages in thread
From: Michael S. Tsirkin @ 2012-01-11 15:12 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: Pawel Moll, Benjamin Herrenschmidt, virtualization,
	Christian Borntraeger, Sasha Levin

On Wed, Jan 11, 2012 at 07:30:34AM -0600, Anthony Liguori wrote:
> On 01/10/2012 06:25 PM, Rusty Russell wrote:
> >On Tue, 10 Jan 2012 19:03:36 +0200, "Michael S. Tsirkin"<mst@redhat.com>  wrote:
> >>On Wed, Dec 21, 2011 at 11:03:25AM +1030, Rusty Russell wrote:
> >>>Yes.  The idea that we can alter fields in the device-specific config
> >>>area is flawed.  There may be cases where it doesn't matter, but as an
> >>>idea it was holed to begin with.
> >>>
> >>>We can reduce probability by doing a double read to check, but there are
> >>>still cases where it will fail.
> >>
> >>Okay - want me to propose an interface for that?
> >
> >Had a brief chat with BenH (CC'd).
> >
> >I think we should deprecate writing to the config space.  Only balloon
> >does it AFAICT, and I can't quite figure out *why* it has an 'active'
> >field.  This solves half the problem, of sync guest writes.  For the
> >other half, I suggest a generation counter; odd means inconsistent.  The
> >guest can poll.
> >
> >BenH also convinced me we should finally make the config space LE if
> >we're going to change things.  Since PCI is the most common transport,
> >guest-endian confuses people.  And it sucks for really weird machines
> 
> I think the more important thing to do is require accesses to
> integers in the config space to always be aligned and to use the
> appropriate accessor. Non-integer fields should be restricted to
> byte access.
> 
> That limits config space entries to 32-bit but also means that there
> is no need for a generation counter.  It's also easier to deal with
> endian conversion that way.

This is similar to what we have now. But it's still buggy: e.g. if guest
updates MAC byte by byte, we have no way to know when it's done doing
so. 


> But it means the backend code ends up being much simpler to write
> (because it behaves more like a normal PCI device).
> 
> If we're already making the change, the endianness ought to be a feature bit.
> 
> >We should also change the ring (to a single ring, I think).
> 
> Ack.
> 
> >Descriptors
> >to 24 bytes long (8 byte cookie, 8 byte addr, 4 byte len, 4 byte flags).
> >We might be able to squeeze it into 20 bytes but that means packing.  We
> >should support inline, chained or indirect.  Let the other side ack by
> >setting flag, cookie and len (if written).
> >
> >Moreover, I think we should make all these changes at once (at least, in
> >the spec).  That makes it a big change, and it'll take longer to
> >develop, but makes it easy in the long run to differentiate legacy and
> >modern virtio.
> 
> Ack.  Long live virtio2! :-)
> 
> Regards,
> 
> Anthony Liguori
> 
> >
> >Thoughts?
> >Rusty.

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC 7/11] virtio_pci: new, capability-aware driver.
  2012-01-11 15:12                                 ` Michael S. Tsirkin
@ 2012-01-11 15:15                                   ` Anthony Liguori
  2012-01-11 15:21                                     ` Michael S. Tsirkin
  2012-01-11 20:50                                   ` Benjamin Herrenschmidt
  1 sibling, 1 reply; 106+ messages in thread
From: Anthony Liguori @ 2012-01-11 15:15 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Pawel Moll, Benjamin Herrenschmidt, virtualization,
	Christian Borntraeger, Sasha Levin

On 01/11/2012 09:12 AM, Michael S. Tsirkin wrote:
> On Wed, Jan 11, 2012 at 07:30:34AM -0600, Anthony Liguori wrote:
>> On 01/10/2012 06:25 PM, Rusty Russell wrote:
>>> On Tue, 10 Jan 2012 19:03:36 +0200, "Michael S. Tsirkin"<mst@redhat.com>   wrote:
>>>> On Wed, Dec 21, 2011 at 11:03:25AM +1030, Rusty Russell wrote:
>>>>> Yes.  The idea that we can alter fields in the device-specific config
>>>>> area is flawed.  There may be cases where it doesn't matter, but as an
>>>>> idea it was holed to begin with.
>>>>>
>>>>> We can reduce probability by doing a double read to check, but there are
>>>>> still cases where it will fail.
>>>>
>>>> Okay - want me to propose an interface for that?
>>>
>>> Had a brief chat with BenH (CC'd).
>>>
>>> I think we should deprecate writing to the config space.  Only balloon
>>> does it AFAICT, and I can't quite figure out *why* it has an 'active'
>>> field.  This solves half the problem, of sync guest writes.  For the
>>> other half, I suggest a generation counter; odd means inconsistent.  The
>>> guest can poll.
>>>
>>> BenH also convinced me we should finally make the config space LE if
>>> we're going to change things.  Since PCI is the most common transport,
>>> guest-endian confuses people.  And it sucks for really weird machines
>>
>> I think the more important thing to do is require accesses to
>> integers in the config space to always be aligned and to use the
>> appropriate accessor. Non-integer fields should be restricted to
>> byte access.
>>
>> That limits config space entries to 32-bit but also means that there
>> is no need for a generation counter.  It's also easier to deal with
>> endian conversion that way.
>
> This is similar to what we have now. But it's still buggy: e.g. if guest
> updates MAC byte by byte, we have no way to know when it's done doing
> so.

This is no different than a normal network card.  You have to use a secondary 
register to trigger an update.

Regards,

Anthony Liguori

>
>
>> But it means the backend code ends up being much simpler to write
>> (because it behaves more like a normal PCI device).
>>
>> If we're already making the change, the endianness ought to be a feature bit.
>>
>>> We should also change the ring (to a single ring, I think).
>>
>> Ack.
>>
>>> Descriptors
>>> to 24 bytes long (8 byte cookie, 8 byte addr, 4 byte len, 4 byte flags).
>>> We might be able to squeeze it into 20 bytes but that means packing.  We
>>> should support inline, chained or indirect.  Let the other side ack by
>>> setting flag, cookie and len (if written).
>>>
>>> Moreover, I think we should make all these changes at once (at least, in
>>> the spec).  That makes it a big change, and it'll take longer to
>>> develop, but makes it easy in the long run to differentiate legacy and
>>> modern virtio.
>>
>> Ack.  Long live virtio2! :-)
>>
>> Regards,
>>
>> Anthony Liguori
>>
>>>
>>> Thoughts?
>>> Rusty.

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC 7/11] virtio_pci: new, capability-aware driver.
  2012-01-11 15:15                                   ` Anthony Liguori
@ 2012-01-11 15:21                                     ` Michael S. Tsirkin
  2012-01-11 15:28                                       ` Anthony Liguori
  2012-01-11 20:51                                       ` Benjamin Herrenschmidt
  0 siblings, 2 replies; 106+ messages in thread
From: Michael S. Tsirkin @ 2012-01-11 15:21 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: Pawel Moll, Benjamin Herrenschmidt, virtualization,
	Christian Borntraeger, Sasha Levin

On Wed, Jan 11, 2012 at 09:15:49AM -0600, Anthony Liguori wrote:
> On 01/11/2012 09:12 AM, Michael S. Tsirkin wrote:
> >On Wed, Jan 11, 2012 at 07:30:34AM -0600, Anthony Liguori wrote:
> >>On 01/10/2012 06:25 PM, Rusty Russell wrote:
> >>>On Tue, 10 Jan 2012 19:03:36 +0200, "Michael S. Tsirkin"<mst@redhat.com>   wrote:
> >>>>On Wed, Dec 21, 2011 at 11:03:25AM +1030, Rusty Russell wrote:
> >>>>>Yes.  The idea that we can alter fields in the device-specific config
> >>>>>area is flawed.  There may be cases where it doesn't matter, but as an
> >>>>>idea it was holed to begin with.
> >>>>>
> >>>>>We can reduce probability by doing a double read to check, but there are
> >>>>>still cases where it will fail.
> >>>>
> >>>>Okay - want me to propose an interface for that?
> >>>
> >>>Had a brief chat with BenH (CC'd).
> >>>
> >>>I think we should deprecate writing to the config space.  Only balloon
> >>>does it AFAICT, and I can't quite figure out *why* it has an 'active'
> >>>field.  This solves half the problem, of sync guest writes.  For the
> >>>other half, I suggest a generation counter; odd means inconsistent.  The
> >>>guest can poll.
> >>>
> >>>BenH also convinced me we should finally make the config space LE if
> >>>we're going to change things.  Since PCI is the most common transport,
> >>>guest-endian confuses people.  And it sucks for really weird machines
> >>
> >>I think the more important thing to do is require accesses to
> >>integers in the config space to always be aligned and to use the
> >>appropriate accessor. Non-integer fields should be restricted to
> >>byte access.
> >>
> >>That limits config space entries to 32-bit but also means that there
> >>is no need for a generation counter.  It's also easier to deal with
> >>endian conversion that way.
> >
> >This is similar to what we have now. But it's still buggy: e.g. if guest
> >updates MAC byte by byte, we have no way to know when it's done doing
> >so.
> 
> This is no different than a normal network card.  You have to use a
> secondary register to trigger an update.
> 
> Regards,
> 
> Anthony Liguori

Possible but doesn't let us layer nicely to allow unchanged drivers
that work with all transports (new pci, old pci, non pci).
Something like a command VQ would be a generic transport
that can be hidden behind  config->set(...).

> >
> >
> >>But it means the backend code ends up being much simpler to write
> >>(because it behaves more like a normal PCI device).
> >>
> >>If we're already making the change, the endianness ought to be a feature bit.
> >>
> >>>We should also change the ring (to a single ring, I think).
> >>
> >>Ack.
> >>
> >>>Descriptors
> >>>to 24 bytes long (8 byte cookie, 8 byte addr, 4 byte len, 4 byte flags).
> >>>We might be able to squeeze it into 20 bytes but that means packing.  We
> >>>should support inline, chained or indirect.  Let the other side ack by
> >>>setting flag, cookie and len (if written).
> >>>
> >>>Moreover, I think we should make all these changes at once (at least, in
> >>>the spec).  That makes it a big change, and it'll take longer to
> >>>develop, but makes it easy in the long run to differentiate legacy and
> >>>modern virtio.
> >>
> >>Ack.  Long live virtio2! :-)
> >>
> >>Regards,
> >>
> >>Anthony Liguori
> >>
> >>>
> >>>Thoughts?
> >>>Rusty.

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC 7/11] virtio_pci: new, capability-aware driver.
  2012-01-11 15:21                                     ` Michael S. Tsirkin
@ 2012-01-11 15:28                                       ` Anthony Liguori
  2012-01-11 15:45                                         ` Michael S. Tsirkin
  2012-01-11 20:51                                       ` Benjamin Herrenschmidt
  1 sibling, 1 reply; 106+ messages in thread
From: Anthony Liguori @ 2012-01-11 15:28 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Pawel Moll, Benjamin Herrenschmidt, virtualization,
	Christian Borntraeger, Sasha Levin

On 01/11/2012 09:21 AM, Michael S. Tsirkin wrote:
> On Wed, Jan 11, 2012 at 09:15:49AM -0600, Anthony Liguori wrote:
>>> This is similar to what we have now. But it's still buggy: e.g. if guest
>>> updates MAC byte by byte, we have no way to know when it's done doing
>>> so.
>>
>> This is no different than a normal network card.  You have to use a
>> secondary register to trigger an update.
>>
>> Regards,
>>
>> Anthony Liguori
>
> Possible but doesn't let us layer nicely to allow unchanged drivers
> that work with all transports (new pci, old pci, non pci).

If we declare config space LE, then we have to touch all drivers.  There's no 
way around it because the virtio API is byte-based, not word based.

This is why I'm suggesting making the virtio API (and then the virtio-pci ABI) 
word based.  It gives us the flexibility to make endianness a property of the 
transport and not a property of the devices.

Regards,

Anthony Liguori

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC 7/11] virtio_pci: new, capability-aware driver.
  2012-01-11 14:28                                   ` Stefan Hajnoczi
@ 2012-01-11 15:39                                     ` Michael S. Tsirkin
  2012-01-11 17:21                                       ` Stefan Hajnoczi
  2012-01-11 20:46                                     ` Benjamin Herrenschmidt
  1 sibling, 1 reply; 106+ messages in thread
From: Michael S. Tsirkin @ 2012-01-11 15:39 UTC (permalink / raw)
  To: Stefan Hajnoczi
  Cc: Pawel Moll, Benjamin Herrenschmidt, virtualization,
	Christian Borntraeger, Sasha Levin

On Wed, Jan 11, 2012 at 02:28:48PM +0000, Stefan Hajnoczi wrote:
> On Wed, Jan 11, 2012 at 9:10 AM, Benjamin Herrenschmidt
> <benh@kernel.crashing.org> wrote:
> > On Wed, 2012-01-11 at 08:47 +0000, Stefan Hajnoczi wrote:
> >>
> >> This is also an opportunity to stop using CPU physical addresses in
> >> the ring and instead perform DMA like a normal PCI device (use bus
> >> addresses).
> >
> > Euh why ?
> 
> Because it's a paravirt hack that ends up hitting corner cases.  It's
> not possible to do virtio-pci passthrough under nested virtualization
> unless we use an IOMMU.  Imagine passing virtio-net from L0 into the
> L2 guest (i.e. PCI-passthrough).  If virtio-pci is really "PCI" this
> should be possible but it's not when we use physical addresses instead
> of bus addresses.
> 
> Stefan

It won't be hard to show significant performance regression if
we do this. Hard to justify for something as niche as nested virt.

-- 
MST

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC 7/11] virtio_pci: new, capability-aware driver.
  2012-01-11 15:28                                       ` Anthony Liguori
@ 2012-01-11 15:45                                         ` Michael S. Tsirkin
  2012-01-11 16:02                                           ` Anthony Liguori
  0 siblings, 1 reply; 106+ messages in thread
From: Michael S. Tsirkin @ 2012-01-11 15:45 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: Pawel Moll, Benjamin Herrenschmidt, virtualization,
	Christian Borntraeger, Sasha Levin

On Wed, Jan 11, 2012 at 09:28:27AM -0600, Anthony Liguori wrote:
> On 01/11/2012 09:21 AM, Michael S. Tsirkin wrote:
> >On Wed, Jan 11, 2012 at 09:15:49AM -0600, Anthony Liguori wrote:
> >>>This is similar to what we have now. But it's still buggy: e.g. if guest
> >>>updates MAC byte by byte, we have no way to know when it's done doing
> >>>so.
> >>
> >>This is no different than a normal network card.  You have to use a
> >>secondary register to trigger an update.
> >>
> >>Regards,
> >>
> >>Anthony Liguori
> >
> >Possible but doesn't let us layer nicely to allow unchanged drivers
> >that work with all transports (new pci, old pci, non pci).
> 
> If we declare config space LE, then we have to touch all drivers.
> There's no way around it because the virtio API is byte-based, not
> word based.

Fine but don't we want to be compatible with old hypervisors?

> This is why I'm suggesting making the virtio API (and then the
> virtio-pci ABI) word based.  It gives us the flexibility to make
> endianness a property of the transport and not a property of the
> devices.
> 
> Regards,
> 
> Anthony Liguori

Some fields are 64 bit, this is still tricky to do atomically.
What's the objection to using a config VQ?

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC 7/11] virtio_pci: new, capability-aware driver.
  2012-01-11 15:45                                         ` Michael S. Tsirkin
@ 2012-01-11 16:02                                           ` Anthony Liguori
  2012-01-11 17:08                                             ` Michael S. Tsirkin
  0 siblings, 1 reply; 106+ messages in thread
From: Anthony Liguori @ 2012-01-11 16:02 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Pawel Moll, Benjamin Herrenschmidt, virtualization,
	Christian Borntraeger, Sasha Levin

On 01/11/2012 09:45 AM, Michael S. Tsirkin wrote:
> On Wed, Jan 11, 2012 at 09:28:27AM -0600, Anthony Liguori wrote:
>> On 01/11/2012 09:21 AM, Michael S. Tsirkin wrote:
>>> On Wed, Jan 11, 2012 at 09:15:49AM -0600, Anthony Liguori wrote:
>>>>> This is similar to what we have now. But it's still buggy: e.g. if guest
>>>>> updates MAC byte by byte, we have no way to know when it's done doing
>>>>> so.
>>>>
>>>> This is no different than a normal network card.  You have to use a
>>>> secondary register to trigger an update.
>>>>
>>>> Regards,
>>>>
>>>> Anthony Liguori
>>>
>>> Possible but doesn't let us layer nicely to allow unchanged drivers
>>> that work with all transports (new pci, old pci, non pci).
>>
>> If we declare config space LE, then we have to touch all drivers.
>> There's no way around it because the virtio API is byte-based, not
>> word based.
>
> Fine but don't we want to be compatible with old hypervisors?

We can modify the drivers to work either with a virtio1 or virtio2 transport. 
If the only difference is that we move to word access instead of byte access for 
the config space, it's a nop because drivers don't rely on sub-word access today.

>> This is why I'm suggesting making the virtio API (and then the
>> virtio-pci ABI) word based.  It gives us the flexibility to make
>> endianness a property of the transport and not a property of the
>> devices.
>>
>> Regards,
>>
>> Anthony Liguori
>
> Some fields are 64 bit, this is still tricky to do atomically.
> What's the objection to using a config VQ?

Then we move very far away from something that looks like a PCI device.  The 
problem we're having here is specifically where we've deviated from what a 
normal PCI device would do.  Fixing that by deviating even further seems
counterintuitive to me.

Regards,

Anthony Liguori

>

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC 7/11] virtio_pci: new, capability-aware driver.
  2012-01-11 16:02                                           ` Anthony Liguori
@ 2012-01-11 17:08                                             ` Michael S. Tsirkin
  2012-01-11 19:42                                               ` Anthony Liguori
  0 siblings, 1 reply; 106+ messages in thread
From: Michael S. Tsirkin @ 2012-01-11 17:08 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: Pawel Moll, Benjamin Herrenschmidt, virtualization,
	Christian Borntraeger, Sasha Levin

On Wed, Jan 11, 2012 at 10:02:51AM -0600, Anthony Liguori wrote:
> On 01/11/2012 09:45 AM, Michael S. Tsirkin wrote:
> >On Wed, Jan 11, 2012 at 09:28:27AM -0600, Anthony Liguori wrote:
> >>On 01/11/2012 09:21 AM, Michael S. Tsirkin wrote:
> >>>On Wed, Jan 11, 2012 at 09:15:49AM -0600, Anthony Liguori wrote:
> >>>>>This is similar to what we have now. But it's still buggy: e.g. if guest
> >>>>>updates MAC byte by byte, we have no way to know when it's done doing
> >>>>>so.
> >>>>
> >>>>This is no different than a normal network card.  You have to use a
> >>>>secondary register to trigger an update.
> >>>>
> >>>>Regards,
> >>>>
> >>>>Anthony Liguori
> >>>
> >>>Possible but doesn't let us layer nicely to allow unchanged drivers
> >>>that work with all transports (new pci, old pci, non pci).
> >>
> >>If we declare config space LE, then we have to touch all drivers.
> >>There's no way around it because the virtio API is byte-based, not
> >>word based.
> >
> >Fine but don't we want to be compatible with old hypervisors?
> 
> We can modify the drivers to work either with a virtio1 or virtio2
> transport. If the only difference is that we move to word access
> instead of byte access for the config space, it's a nop because
> drivers don't rely on sub-word access today.
> 
> >>This is why I'm suggesting making the virtio API (and then the
> >>virtio-pci ABI) word based.  It gives us the flexibility to make
> >>endianness a property of the transport and not a property of the
> >>devices.
> >>
> >>Regards,
> >>
> >>Anthony Liguori
> >
> >Some fields are 64 bit, this is still tricky to do atomically.
> >What's the objection to using a config VQ?
> 
> Then we move very far away from something that looks like a PCI
> device.  The problem we're having here is specifically where we've
> deviated from what a normal PCI device would do.  Fixing that by
> deviating even further seems counterintuitive to me.
> 
> Regards,
> 
> Anthony Liguori


Not sure what you mean. Using VQ is DMA which is pretty common for PCI.

> >

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC 7/11] virtio_pci: new, capability-aware driver.
  2012-01-11 15:39                                     ` Michael S. Tsirkin
@ 2012-01-11 17:21                                       ` Stefan Hajnoczi
  2012-01-11 18:34                                         ` Michael S. Tsirkin
  2012-01-11 20:56                                         ` Benjamin Herrenschmidt
  0 siblings, 2 replies; 106+ messages in thread
From: Stefan Hajnoczi @ 2012-01-11 17:21 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Pawel Moll, Benjamin Herrenschmidt, virtualization,
	Christian Borntraeger, Sasha Levin

On Wed, Jan 11, 2012 at 3:39 PM, Michael S. Tsirkin <mst@redhat.com> wrote:
> On Wed, Jan 11, 2012 at 02:28:48PM +0000, Stefan Hajnoczi wrote:
>> On Wed, Jan 11, 2012 at 9:10 AM, Benjamin Herrenschmidt
>> <benh@kernel.crashing.org> wrote:
>> > On Wed, 2012-01-11 at 08:47 +0000, Stefan Hajnoczi wrote:
>> >>
>> >> This is also an opportunity to stop using CPU physical addresses in
>> >> the ring and instead perform DMA like a normal PCI device (use bus
>> >> addresses).
>> >
>> > Euh why ?
>>
>> Because it's a paravirt hack that ends up hitting corner cases.  It's
>> not possible to do virtio-pci passthrough under nested virtualization
>> unless we use an IOMMU.  Imagine passing virtio-net from L0 into the
>> L2 guest (i.e. PCI-passthrough).  If virtio-pci is really "PCI" this
>> should be possible but it's not when we use physical addresses instead
>> of bus addresses.
>>
>> Stefan
>
> It won't be hard to show significant performance regression if
> we do this. Hard to justify for something as niche as nested virt.

For x86 this should be mostly a nop.  For ppc and SPARC architectures
maybe you're right.  I still think it's a design flaw because if
virtio v2 doesn't use bus addresses then it will simply not be
possible to do passthrough for nested virt and other cases we haven't
hit yet.

Stefan

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC 7/11] virtio_pci: new, capability-aware driver.
  2012-01-11 17:21                                       ` Stefan Hajnoczi
@ 2012-01-11 18:34                                         ` Michael S. Tsirkin
  2012-01-11 20:56                                         ` Benjamin Herrenschmidt
  1 sibling, 0 replies; 106+ messages in thread
From: Michael S. Tsirkin @ 2012-01-11 18:34 UTC (permalink / raw)
  To: Stefan Hajnoczi
  Cc: Pawel Moll, Benjamin Herrenschmidt, virtualization,
	Christian Borntraeger, Sasha Levin

On Wed, Jan 11, 2012 at 05:21:53PM +0000, Stefan Hajnoczi wrote:
> On Wed, Jan 11, 2012 at 3:39 PM, Michael S. Tsirkin <mst@redhat.com> wrote:
> > On Wed, Jan 11, 2012 at 02:28:48PM +0000, Stefan Hajnoczi wrote:
> >> On Wed, Jan 11, 2012 at 9:10 AM, Benjamin Herrenschmidt
> >> <benh@kernel.crashing.org> wrote:
> >> > On Wed, 2012-01-11 at 08:47 +0000, Stefan Hajnoczi wrote:
> >> >>
> >> >> This is also an opportunity to stop using CPU physical addresses in
> >> >> the ring and instead perform DMA like a normal PCI device (use bus
> >> >> addresses).
> >> >
> >> > Euh why ?
> >>
> >> Because it's a paravirt hack that ends up hitting corner cases.  It's
> >> not possible to do virtio-pci passthrough under nested virtualization
> >> unless we use an IOMMU.  Imagine passing virtio-net from L0 into the
> >> L2 guest (i.e. PCI-passthrough).  If virtio-pci is really "PCI" this
> >> should be possible but it's not when we use physical addresses instead
> >> of bus addresses.
> >>
> >> Stefan
> >
> > It won't be hard to show significant performance regression if
> > we do this. Hard to justify for something as niche as nested virt.
> 
> For x86 this should be mostly a nop.

Maybe it should, but AFAIK it isn't.

> For ppc and SPARC architectures maybe you're right.  I still think
> it's a design flaw because if virtio v2 doesn't use bus addresses then
> it will simply not be possible to do passthrough for nested virt and
> other cases we haven't hit yet.
> 
> Stefan

virtio-pci does not implement things like SRIOV or FLR so it
won't work anyway.

If we ever fix this, and if we really want to pass through a virtio
device (why?) using an emulated iommu seems silly - we probably want a
PV IOMMU as well.

-- 
MST

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC 7/11] virtio_pci: new, capability-aware driver.
  2012-01-11 17:08                                             ` Michael S. Tsirkin
@ 2012-01-11 19:42                                               ` Anthony Liguori
  2012-01-11 20:14                                                 ` Michael S. Tsirkin
  2012-01-11 20:59                                                 ` Benjamin Herrenschmidt
  0 siblings, 2 replies; 106+ messages in thread
From: Anthony Liguori @ 2012-01-11 19:42 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Pawel Moll, Benjamin Herrenschmidt, virtualization,
	Christian Borntraeger, Sasha Levin

On 01/11/2012 11:08 AM, Michael S. Tsirkin wrote:
>
> Not sure what you mean. Using VQ is DMA which is pretty common for PCI.

Do you know of a network device that obtains its MAC address via a DMA transaction?

Regards,

Anthony Liguori

>
>>>

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC 7/11] virtio_pci: new, capability-aware driver.
  2012-01-11 19:42                                               ` Anthony Liguori
@ 2012-01-11 20:14                                                 ` Michael S. Tsirkin
  2012-01-11 20:26                                                   ` Anthony Liguori
  2012-01-11 20:59                                                 ` Benjamin Herrenschmidt
  1 sibling, 1 reply; 106+ messages in thread
From: Michael S. Tsirkin @ 2012-01-11 20:14 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: Pawel Moll, Benjamin Herrenschmidt, virtualization,
	Christian Borntraeger, Sasha Levin

On Wed, Jan 11, 2012 at 01:42:39PM -0600, Anthony Liguori wrote:
> On 01/11/2012 11:08 AM, Michael S. Tsirkin wrote:
> >
> >Not sure what you mean. Using VQ is DMA which is pretty common for PCI.
> 
> Do you know of a network device that obtains its MAC address via a DMA transaction?

Sure.
See mlx4_replace_mac in drivers/net/ethernet/mellanox/mlx4/port.c

> Regards,
> 
> Anthony Liguori
> 
> >
> >>>

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC 7/11] virtio_pci: new, capability-aware driver.
  2012-01-11 20:14                                                 ` Michael S. Tsirkin
@ 2012-01-11 20:26                                                   ` Anthony Liguori
  2012-01-11 21:02                                                     ` Benjamin Herrenschmidt
  2012-01-11 21:58                                                     ` Michael S. Tsirkin
  0 siblings, 2 replies; 106+ messages in thread
From: Anthony Liguori @ 2012-01-11 20:26 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Pawel Moll, Benjamin Herrenschmidt, virtualization,
	Christian Borntraeger, Sasha Levin

On 01/11/2012 02:14 PM, Michael S. Tsirkin wrote:
> On Wed, Jan 11, 2012 at 01:42:39PM -0600, Anthony Liguori wrote:
>> On 01/11/2012 11:08 AM, Michael S. Tsirkin wrote:
>>>
>>> Not sure what you mean. Using VQ is DMA which is pretty common for PCI.
>>
>> Do you know of a network device that obtains its MAC address via a DMA transaction?
>
> Sure.
> See mlx4_replace_mac in drivers/net/ethernet/mellanox/mlx4/port.c

I'd say that's a special case but I see what you're getting at here.

So what about keeping the config space read-only and using control queues for 
everything else?

Regards,

Anthony Liguori

>
>> Regards,
>>
>> Anthony Liguori
>>
>>>
>>>>>

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC 7/11] virtio_pci: new, capability-aware driver.
  2012-01-11 14:28                                   ` Stefan Hajnoczi
  2012-01-11 15:39                                     ` Michael S. Tsirkin
@ 2012-01-11 20:46                                     ` Benjamin Herrenschmidt
  2012-01-12 10:37                                       ` Stefan Hajnoczi
  1 sibling, 1 reply; 106+ messages in thread
From: Benjamin Herrenschmidt @ 2012-01-11 20:46 UTC (permalink / raw)
  To: Stefan Hajnoczi
  Cc: Pawel Moll, Michael S. Tsirkin, virtualization,
	Christian Borntraeger, Sasha Levin

On Wed, 2012-01-11 at 14:28 +0000, Stefan Hajnoczi wrote:
> On Wed, Jan 11, 2012 at 9:10 AM, Benjamin Herrenschmidt
> <benh@kernel.crashing.org> wrote:
> > On Wed, 2012-01-11 at 08:47 +0000, Stefan Hajnoczi wrote:
> >>
> >> This is also an opportunity to stop using CPU physical addresses in
> >> the ring and instead perform DMA like a normal PCI device (use bus
> >> addresses).
> >
> > Euh why ?
> 
> Because it's a paravirt hack that ends up hitting corner cases.  It's
> not possible to do virtio-pci passthrough under nested virtualization
> unless we use an IOMMU.  Imagine passing virtio-net from L0 into the
> L2 guest (i.e. PCI-passthrough).  If virtio-pci is really "PCI" this
> should be possible but it's not when we use physical addresses instead
> of bus addresses.

Is this just an academic exercise or is there any actual value in doing
this ?

Using an IOMMU is going to slaughter your performance, so at the very
least it should be kept an option.

Yes, it's a paravirt "hack" as you call it but that's what virtio is all
about.... paravirt. If you prefer you can emulate a real HW device :-)

Ben.

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC 7/11] virtio_pci: new, capability-aware driver.
  2012-01-11 15:12                                 ` Michael S. Tsirkin
  2012-01-11 15:15                                   ` Anthony Liguori
@ 2012-01-11 20:50                                   ` Benjamin Herrenschmidt
  1 sibling, 0 replies; 106+ messages in thread
From: Benjamin Herrenschmidt @ 2012-01-11 20:50 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Pawel Moll, virtualization, Christian Borntraeger, Sasha Levin,
	Anthony Liguori

On Wed, 2012-01-11 at 17:12 +0200, Michael S. Tsirkin wrote:

> This is similar to what we have now. But it's still buggy: e.g. if guest
> updates MAC byte by byte, we have no way to know when it's done doing
> so. 

Do like real HW, there's plenty of options:

 - (better) Have a command "update MAC" sent to a ring. A command ring
would be generally useful and could replace anything you do via writing
to config space today. The advantage of having a read-only config space
is that you significantly remove the need for synchronization. You could
also have an event ring and avoid the seqlock for reading. It's MUCH
better to have a fine granularity of what actually changed that having a
generic "something is changing in the config space".

With a new ring format allowing direct data in the ring descriptor that
would be trivial.

 - If you really don't like a command ring, you can have a command
"register", write the new MAC and send a command to make it "apply", but
that's not fantastic as there's going to be a possible discrepancy
between what's in the config and what's actually used.

 - Have a separate "MAC write" register set with the "hot" byte being
the low-order byte. Writing to it updates the MAC. 

Etc etc...

Cheers,
Ben.

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC 7/11] virtio_pci: new, capability-aware driver.
  2012-01-11 15:21                                     ` Michael S. Tsirkin
  2012-01-11 15:28                                       ` Anthony Liguori
@ 2012-01-11 20:51                                       ` Benjamin Herrenschmidt
  1 sibling, 0 replies; 106+ messages in thread
From: Benjamin Herrenschmidt @ 2012-01-11 20:51 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Pawel Moll, virtualization, Christian Borntraeger, Sasha Levin,
	Anthony Liguori

On Wed, 2012-01-11 at 17:21 +0200, Michael S. Tsirkin wrote:
> 
> Possible but doesn't let us layer nicely to allow unchanged drivers
> that work with all transports (new pci, old pci, non pci).
> Something like a command VQ would be a generic transport
> that can be hidden behind  config->set(...).

I agree, a command VQ (And possibly a status VQ) would be generally
useful.

Cheers,
Ben.

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC 7/11] virtio_pci: new, capability-aware driver.
  2012-01-11 17:21                                       ` Stefan Hajnoczi
  2012-01-11 18:34                                         ` Michael S. Tsirkin
@ 2012-01-11 20:56                                         ` Benjamin Herrenschmidt
  1 sibling, 0 replies; 106+ messages in thread
From: Benjamin Herrenschmidt @ 2012-01-11 20:56 UTC (permalink / raw)
  To: Stefan Hajnoczi
  Cc: Pawel Moll, Michael S. Tsirkin, virtualization,
	Christian Borntraeger, Sasha Levin

On Wed, 2012-01-11 at 17:21 +0000, Stefan Hajnoczi wrote:
> > It won't be hard to show significant performance regression if
> > we do this. Hard to justify for something as niche as nested virt.
> 
> For x86 this should be mostly a nop.

No it won't be. Or rather, it will be as long as you map your entire
guests in the iommu, which means your entire guest will have to be
pinned and you lost swap, ksm, yadadada... 

Since the only way to have non-pinned guests today on x86 is to use
virtio, you just shot yourself in the foot.

Eventually x86 will have to grow some kind of virtualized iommu or
paravir iommu to overcome that at which point you will pay that price.

>   For ppc and SPARC architectures
> maybe you're right.  I still think it's a design flaw because if
> virtio v2 doesn't use bus addresses then it will simply not be
> possible to do passthrough for nested virt and other cases we haven't
> hit yet.

Then don't use virtio in those cases, use real emulated HW. Seriously,
is nested virt that interesting ? 

At the very least, make that use of iommu a feature or something like
that so it can be negotiated down when not doing nesting which is going
to be 99% of your use cases.

Cheers,
Ben.

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC 7/11] virtio_pci: new, capability-aware driver.
  2012-01-11 19:42                                               ` Anthony Liguori
  2012-01-11 20:14                                                 ` Michael S. Tsirkin
@ 2012-01-11 20:59                                                 ` Benjamin Herrenschmidt
  1 sibling, 0 replies; 106+ messages in thread
From: Benjamin Herrenschmidt @ 2012-01-11 20:59 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: Pawel Moll, Michael S. Tsirkin, virtualization,
	Christian Borntraeger, Sasha Levin

On Wed, 2012-01-11 at 13:42 -0600, Anthony Liguori wrote:
> On 01/11/2012 11:08 AM, Michael S. Tsirkin wrote:
> >
> > Not sure what you mean. Using VQ is DMA which is pretty common for PCI.
> 
> Do you know of a network device that obtains its mac address via a DMA transaction?

I wouldn't be surprised if we could find one, but even if we don't why
is that an issue ?

Cheers,
Ben.

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC 7/11] virtio_pci: new, capability-aware driver.
  2012-01-11 20:26                                                   ` Anthony Liguori
@ 2012-01-11 21:02                                                     ` Benjamin Herrenschmidt
  2012-01-11 22:02                                                       ` Michael S. Tsirkin
  2012-01-11 21:58                                                     ` Michael S. Tsirkin
  1 sibling, 1 reply; 106+ messages in thread
From: Benjamin Herrenschmidt @ 2012-01-11 21:02 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: Pawel Moll, Michael S. Tsirkin, virtualization,
	Christian Borntraeger, Sasha Levin

On Wed, 2012-01-11 at 14:26 -0600, Anthony Liguori wrote:
> 
> I'd say that's a special case but I see what you're getting at here.
> 
> So what about keeping the config space read-only and using control
> queues for 
> everything else?

Which is exactly what Rusty and I are proposing :-) I would go further
and eliminate the idea of a seqlock and instead have a status queue with
precise messages indicating what changed.

I would couple that with the new queue format allowing immediate data in
the descriptor to avoid having to use indirect buffers for these, which
means no allocation, no buffer pool etc... which makes everything a lot
easier to deal with as well.

We could probably have a helper library for sending control messages
which could handle waiting for a ring slot to be free (practically
always the case on control queues), writing the message, sending it and
waiting for a status queue confirmation message.

Cheers,
Ben.

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC 7/11] virtio_pci: new, capability-aware driver.
  2012-01-11 10:21                               ` Michael S. Tsirkin
@ 2012-01-11 21:13                                 ` Benjamin Herrenschmidt
  2012-01-11 22:13                                   ` Michael S. Tsirkin
  2012-01-12  2:01                                 ` Rusty Russell
  1 sibling, 1 reply; 106+ messages in thread
From: Benjamin Herrenschmidt @ 2012-01-11 21:13 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Christian Borntraeger, Sasha Levin, Pawel Moll, virtualization

On Wed, 2012-01-11 at 12:21 +0200, Michael S. Tsirkin wrote:
> 
> > BenH also convinced me we should finally make the config space LE if
> > we're going to change things.  Since PCI is the most common transport,
> > guest-endian confuses people.  And it sucks for really weird machines.
> 
> Are we going to keep guest endian for e.g. virtio net header?
> If yes the benefit of switching config space is not that big.
> And changes in devices would affect non-PCI transports.

I think the concept of "guest endian" is broken by design. What does
that mean when running for example an ARM or a ppc 440 "guest" which
could be either endian ? Since you can't hard code your guest endian how
do you obtain/negotiate it ? Also you now have to deal with dual endian
in the host, makes everything trickier.

Just make everything LE.

> Quite possibly all or some of these things help performance
> but do we have to change the spec before we have experimental
> proof?

Well, I would argue that the network driver world has proven countless
times that those are good ideas :-) But by all mean, let's do a
prototype implementation with virtio-net for example and bench it.

I don't think you need a single ring. For multiqueue net, you definitely
want multiple rings and you do want rings to remain uni-directional.

One other thing that can be useful is to separate the completion ring
from the actual ring of DMA descriptors, making the former completely
read-only by the guest and the later completely read only by the host.

For example take the ehea ethernet rx model. It has 3 rx "rings" per
queue. One contains the completions, it's a toggle-valid model so we
never write back to clear valid, it contains infos from the parser, the
tokenID of the packet and the index as to where in which ring the data
is, which is either inline in the completion ring (small packet), header
inline & data in a data ring or completely in a data ring. Then you have
two data rings which are simply rings of SG list entries (more or less).

We typically pre-populate the data rings with skb's for 1500 and 9000
bytes packets. Small packets come in immediately in the completion ring,
and large packets via the data ring. 

That's just -an- example. There are many others to take inspiration from.
Network folks have beaten to death the problem of ring efficiency vs.
CPU caches.

> > Moreover, I think we should make all these changes at once (at least, in
> > the spec).  That makes it a big change, and it'll take longer to
> > develop, but makes it easy in the long run to differentiate legacy and
> > modern virtio.
> > 
> > Thoughts?
> > Rusty.
> 
> BTW this seems to be the reverse from what you have in Mar 2001,
> see 87mxkjls61.fsf@rustcorp.com.au :)

That was 10 years ago... 

> I am much less concerned with what we do for configuration,
> but I do not believe we have learned all performance lessons
> from virtio ring1. 

Maybe we have learned some more since then ? :-)

> Is there any reason why we shouldn't be
> able to experiment with inline within virtio1 and see
> whether that gets us anything?
> If we do a bunch of changes to the ring at once, we can't
> figure out what's right, what's wrong, or back out of
> mistakes later.
> 
> Since there are non PCI transports that use the ring,
> we really shouldn't make both the configuration and
> the ring changes depend on the same feature bit.

Another advantage of inline data is that it makes things a lot easier
for cases where only small amount of data need to be exchanged, such as
control/status rings, maybe virtio-tty (which I'm working on), etc... 

Cheers,
Ben.

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC 7/11] virtio_pci: new, capability-aware driver.
  2012-01-11 20:26                                                   ` Anthony Liguori
  2012-01-11 21:02                                                     ` Benjamin Herrenschmidt
@ 2012-01-11 21:58                                                     ` Michael S. Tsirkin
  1 sibling, 0 replies; 106+ messages in thread
From: Michael S. Tsirkin @ 2012-01-11 21:58 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: Pawel Moll, Benjamin Herrenschmidt, virtualization,
	Christian Borntraeger, Sasha Levin

On Wed, Jan 11, 2012 at 02:26:55PM -0600, Anthony Liguori wrote:
> On 01/11/2012 02:14 PM, Michael S. Tsirkin wrote:
> >On Wed, Jan 11, 2012 at 01:42:39PM -0600, Anthony Liguori wrote:
> >>On 01/11/2012 11:08 AM, Michael S. Tsirkin wrote:
> >>>
> >>>Not sure what you mean. Using VQ is DMA which is pretty common for PCI.
> >>
> >>Do you know of a network device that obtains its mac address via a DMA transaction?
> >
> >Sure.
> >See mlx4_replace_mac in drivers/net/ethernet/mellanox/mlx4/port.c
> 
> I'd say that's a special case but I see what you're getting at here.
> 
> So what about keeping the config space read-only and using control
> queues for everything else?

Not just read-only - constant.

> Regards,
> 
> Anthony Liguori
> 
> >
> >>Regards,
> >>
> >>Anthony Liguori
> >>
> >>>
> >>>>>

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC 7/11] virtio_pci: new, capability-aware driver.
  2012-01-11 21:02                                                     ` Benjamin Herrenschmidt
@ 2012-01-11 22:02                                                       ` Michael S. Tsirkin
  2012-01-11 22:16                                                         ` Benjamin Herrenschmidt
  2012-01-12  1:42                                                         ` Rusty Russell
  0 siblings, 2 replies; 106+ messages in thread
From: Michael S. Tsirkin @ 2012-01-11 22:02 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: Pawel Moll, virtualization, Christian Borntraeger, Sasha Levin,
	Anthony Liguori

On Thu, Jan 12, 2012 at 08:02:06AM +1100, Benjamin Herrenschmidt wrote:
> On Wed, 2012-01-11 at 14:26 -0600, Anthony Liguori wrote:
> > 
> > I'd say that's a special case but I see what you're getting at here.
> > 
> > So what about keeping the config space read-only and using control
> > queues for 
> > everything else?
> 
> Which is exactly what Rusty and I are proposing :-)
> I would go further
> and eliminate the idea of a seqlock and instead have a status queue with
> precise messages indicating what changed.
>
> I would couple that with the new queue format allowing immediate data in
> the descriptor to avoid having to use indirect buffers for these, which
> means no allocation, no buffer pool etc... which makes everything a lot
> easier to deal with as well.

We just need a couple of buffers outstanding. It can't be easier,
and a single buf descriptors already do not use indirection.

> We could probably have a helper library for sending control messages
> which could handle waiting for a ring slot to be free (practically
> always the case on control queues), writing the message, sending it and
> waiting for a status queue confirmation message.
> 
> Cheers,
> Ben.
> 

Look, we have a race currently. Let us not tie a bug fix to a huge
rewrite with unclear performance benefits, please.

-- 
MST

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC 7/11] virtio_pci: new, capability-aware driver.
  2012-01-11 21:13                                 ` Benjamin Herrenschmidt
@ 2012-01-11 22:13                                   ` Michael S. Tsirkin
  2012-01-11 22:19                                     ` Benjamin Herrenschmidt
  2012-01-11 22:56                                     ` Benjamin Herrenschmidt
  0 siblings, 2 replies; 106+ messages in thread
From: Michael S. Tsirkin @ 2012-01-11 22:13 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: Christian Borntraeger, Sasha Levin, Pawel Moll, virtualization

On Thu, Jan 12, 2012 at 08:13:42AM +1100, Benjamin Herrenschmidt wrote:
> On Wed, 2012-01-11 at 12:21 +0200, Michael S. Tsirkin wrote:
> > 
> > > BenH also convinced me we should finally make the config space LE if
> > > we're going to change things.  Since PCI is the most common transport,
> > > guest-endian confuses people.  And it sucks for really weird machines.
> > 
> > Are we going to keep guest endian for e.g. virtio net header?
> > If yes the benefit of switching config space is not that big.
> > And changes in devices would affect non-PCI transports.
> 
> I think the concept of "guest endian" is broken by design. What does
> that mean when running for example an ARM or a ppc 440 "guest" which
> could be either endian ? Since you can't hard code your guest endian how
> do you obtain/negotiate it ? Also you now have to deal with dual endian
> in the host, makes everything trickier.
> 
> Just make everything LE.

Yea. But it's not a pure transport issue, just fixing configuration
won't be enough.  E.g. we have structures like virtio net header.

> > Quite possibly all or some of these things help performance
> > but do we have to change the spec before we have experimental
> > proof?
> 
> Well, I would argue that the network driver world has proven countless
> times that those are good ideas :-)

Below you seem to suggest that separate rings like
virtio has now is better than a single ring like Rusty
suggested.

> But by all mean, let's do a
> prototype implementation with virtio-net for example and bench it.
> 
> I don't think you need a single ring. For multiqueue net, you definitely
> want multiple rings and you do want rings to remain uni-directional.
> 
> One other thing that can be useful is to separate the completion ring
> from the actual ring of DMA descriptors, making the former completely
> read-only by the guest and the later completely read only by the host.

Are you familiar with current virtio ring structure?  How is this
different?

> For example take the ehea ethernet rx model. It has 3 rx "rings" per
> queue. One contains the completions, it's a toggle-valid model so we
> never write back to clear valid, it contains infos from the parser, the
> tokenID of the packet and the index as to where in which ring the data
> is, which is either inline in the completion ring (small packet), header
> inline & data in a data ring or completely in a data ring. Then you have
> two data rings which are simply rings of SG list entries (more or less).
> 
> We typically pre-populate the data rings with skb's for 1500 and 9000
> bytes packets. Small packets come in immediately in the completion ring,
> and large packets via the data ring. 

Won't real workloads suffer from packet reordering?

> That's just -an- example. There are many others to take inspiration from.
> Network folks have beaten to death the problem of ring efficiency vs.
> CPU caches.
> 
> > > Moreover, I think we should make all these changes at once (at least, in
> > > the spec).  That makes it a big change, and it'll take longer to
> > > develop, but makes it easy in the long run to differentiate legacy and
> > > modern virtio.
> > > 
> > > Thoughts?
> > > Rusty.
> > 
> > BTW this seems to be the reverse from what you have in Mar 2001,
> > see 87mxkjls61.fsf@rustcorp.com.au :)
> 
> That was 10 years ago... 

Sorry, typo. It was Mar 2010 :)

> > I am much less concerned with what we do for configuration,
> > but I do not believe we have learned all performance lessons
> > from virtio ring1. 
> 
> Maybe we have learned some more since then ? :-)

There was 1 change in ring layout.

> > Is there any reason why we shouldn't be
> > able to experiment with inline within virtio1 and see
> > whether that gets us anything?
> > If we do a bunch of changes to the ring at once, we can't
> > figure out what's right, what's wrong, or back out of
> > mistakes later.
> > 
> > Since there are non PCI transports that use the ring,
> > we really shouldn't make both the configuration and
> > the ring changes depend on the same feature bit.
> 
> Another advantage of inline data is that it makes things a lot easier
> for cases where only small amount of data need to be exchanged, such as
> control/status rings, maybe virtio-tty (which I'm working on), etc... 
> 
> Cheers,
> Ben.

Is that getting you a lot of speedup? Note you want to add more code on
data path for everyone.  Why can't you have a fixed buffer in memory and
just point to that?

-- 
MST

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC 7/11] virtio_pci: new, capability-aware driver.
  2012-01-11 22:02                                                       ` Michael S. Tsirkin
@ 2012-01-11 22:16                                                         ` Benjamin Herrenschmidt
  2012-01-12  1:42                                                         ` Rusty Russell
  1 sibling, 0 replies; 106+ messages in thread
From: Benjamin Herrenschmidt @ 2012-01-11 22:16 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Pawel Moll, virtualization, Christian Borntraeger, Sasha Levin,
	Anthony Liguori

On Thu, 2012-01-12 at 00:02 +0200, Michael S. Tsirkin wrote:
> > We could probably have a helper library for sending control messages
> > which could handle waiting for a ring slot to be free (practically
> > always the case on control queues), writing the message, sending it
> and
> > waiting for a status queue confirmation message.
> > 
> > Cheers,
> > Ben.
> > 
> 
> Look, we have a race currently. Let us not tie a bug fix to a huge
> rewrite with unclear performance benefits, please.

Well, if we change endian, change the way config works, I think we are
doing enough dramatic changes to go all the way and change the ring
format too.

I don't think there's anything "unclear" about improving the rings in
the direction that all network cards have been choosing so far :-) But
for the control ring, performance is clearly not an issue.

I also like the simplicity of immediate data, it limits the failure
path, no allocation -does- make it easier, not having to wait for the
previous buffer to be complete if you pre-allocate etc...

In any case, if we do a lot of changes at the same time, we are probably
better off cutting the cord and making an incompatible virtio v2 with a
new set of drivers.

Having to deal with all the variants in a single driver will result in
unmaintainable drivers imho.

Cheers,
Ben.

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC 7/11] virtio_pci: new, capability-aware driver.
  2012-01-11 22:13                                   ` Michael S. Tsirkin
@ 2012-01-11 22:19                                     ` Benjamin Herrenschmidt
  2012-01-11 22:56                                     ` Benjamin Herrenschmidt
  1 sibling, 0 replies; 106+ messages in thread
From: Benjamin Herrenschmidt @ 2012-01-11 22:19 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Christian Borntraeger, Sasha Levin, Pawel Moll, virtualization

On Thu, 2012-01-12 at 00:13 +0200, Michael S. Tsirkin wrote:

> > We typically pre-populate the data rings with skb's for 1500 and 9000
> > bytes packets. Small packets come in immediately in the completion ring,
> > and large packets via the data ring. 
> 
> Won't real workloads suffer from packet reordering?

No, they aren't re-ordered. The completion ring has an entry for each
packet, in order. Those entries eventually reference the entry index in
the data rings if the data was put there instead of being immediate.

Cheers,
Ben.

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC 7/11] virtio_pci: new, capability-aware driver.
  2012-01-11 22:13                                   ` Michael S. Tsirkin
  2012-01-11 22:19                                     ` Benjamin Herrenschmidt
@ 2012-01-11 22:56                                     ` Benjamin Herrenschmidt
  2012-01-12  5:29                                       ` Michael S. Tsirkin
  1 sibling, 1 reply; 106+ messages in thread
From: Benjamin Herrenschmidt @ 2012-01-11 22:56 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Christian Borntraeger, Sasha Levin, Pawel Moll, virtualization

On Thu, 2012-01-12 at 00:13 +0200, Michael S. Tsirkin wrote:

> > Well, I would argue that the network driver world has proven countless
> > times that those are good ideas :-)
> 
> Below you seem to suggest that separate rings like
> virtio has now is better than a single ring like Rusty
> suggested.

I was merely pointing at examples. My understanding (but I may have
misparsed him) is that Rusty wants to collate the descriptors and the
"available" ring. I personally don't think the completions (the used
ring) should be merged, it should remain separate. If Rusty was hinting
at doing that then I disagree with him :-)

> Are you familiar with current virtio ring structure?  How is this
> different?

Vaguely yes :-) It does have this uni-directional model, I didn't
express myself very well or I got confused by the statements about a
single ring. Yes, I think we should keep the ring at least split into
two directions as it is today.

But we can certainly merge available ring and descriptors ring.

Another option is having the ring entry size be a configurable power of
two. This is trivial and comes at pretty much no cost. That would allow
virtio-net for example to do efficient things like putting the headers
as immediate data in the ring and the data as indirect.

This means a large part of the packet processing can happen without
touching additional cache lines.

Anyway, at this stage, I plan to sit down with Rusty next week and hash
out some kind of proposal, which we can experiment with & benchmark.

I'd suggest doing a simple user space app that creates such a ring,
forks, and produce / consume. We can then run that through perf and
analyze the cache behaviour, maximum throughput, etc....

Cheers,
Ben.

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC 7/11] virtio_pci: new, capability-aware driver.
  2012-01-11 13:30                               ` Anthony Liguori
  2012-01-11 15:12                                 ` Michael S. Tsirkin
@ 2012-01-12  1:35                                 ` Rusty Russell
  1 sibling, 0 replies; 106+ messages in thread
From: Rusty Russell @ 2012-01-12  1:35 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: Pawel Moll, Michael S. Tsirkin, Benjamin Herrenschmidt,
	virtualization, Christian Borntraeger, Sasha Levin

On Wed, 11 Jan 2012 07:30:34 -0600, Anthony Liguori <anthony@codemonkey.ws> wrote:
> I think the more important thing to do is require accesses to integers in the 
> config space to always be aligned and to use the appropriate accessor. 
> Non-integer fields should be restricted to byte access.
> 
> That limits config space entries to 32-bit but also means that there is no need 
> for a generation counter.

Unfortunately not, see virtio_blk.  capacity is 64 bits, and geometry is
multiple fields.

> If we're already making the change, the endianness ought to be a feature bit.

I'd rather tie it to the new PCI config layout (and ring format).  Then
we can simply have two backends, legacy and modern.

For non-PCI, we would need a feature bit.

Cheers,
Rusty.

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC 7/11] virtio_pci: new, capability-aware driver.
  2012-01-11 22:02                                                       ` Michael S. Tsirkin
  2012-01-11 22:16                                                         ` Benjamin Herrenschmidt
@ 2012-01-12  1:42                                                         ` Rusty Russell
  2012-01-13  2:19                                                           ` Michael S. Tsirkin
  1 sibling, 1 reply; 106+ messages in thread
From: Rusty Russell @ 2012-01-12  1:42 UTC (permalink / raw)
  To: Michael S. Tsirkin, Benjamin Herrenschmidt
  Cc: Christian Borntraeger, Pawel Moll, Sasha Levin, Anthony Liguori,
	virtualization

On Thu, 12 Jan 2012 00:02:33 +0200, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> Look, we have a race currently. Let us not tie a bug fix to a huge
> rewrite with unclear performance benefits, please.

In theory, yes.  In practice, we bandaid it.

I think in the short term we change ->get to get the entire sequence
twice, and check it's the same.  Theoretically, still racy, but it does
cut the window.  And we haven't seen the bug yet, either.

In the longer term, we fix it properly:

1) Make it readonly, prevents one class of problems.
2) Treat it as constant if drv->config_changed is NULL (we can do this
   now, in fact) and ignore the config interrupt.
3) Use a generation counter on the config, odd means wait.

Cheers,
Rusty.

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC 7/11] virtio_pci: new, capability-aware driver.
  2012-01-11 10:21                               ` Michael S. Tsirkin
  2012-01-11 21:13                                 ` Benjamin Herrenschmidt
@ 2012-01-12  2:01                                 ` Rusty Russell
  2012-01-12  4:31                                   ` Benjamin Herrenschmidt
  2012-01-12  6:00                                   ` Michael S. Tsirkin
  1 sibling, 2 replies; 106+ messages in thread
From: Rusty Russell @ 2012-01-12  2:01 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Christian Borntraeger, Benjamin Herrenschmidt, Sasha Levin,
	Pawel Moll, virtualization

On Wed, 11 Jan 2012 12:21:30 +0200, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> On Wed, Jan 11, 2012 at 10:55:52AM +1030, Rusty Russell wrote:
> > On Tue, 10 Jan 2012 19:03:36 +0200, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > > On Wed, Dec 21, 2011 at 11:03:25AM +1030, Rusty Russell wrote:
> > > > Yes.  The idea that we can alter fields in the device-specific config
> > > > area is flawed.  There may be cases where it doesn't matter, but as an
> > > > idea it was holed to begin with.
> > > > 
> > > > We can reduce probability by doing a double read to check, but there are
> > > > still cases where it will fail.
> > > 
> > > Okay - want me to propose an interface for that?
> > 
> > Had a brief chat with BenH (CC'd).
> > 
> > I think we should deprecate writing to the config space.  Only balloon
> > does it AFAICT, and I can't quite figure out *why* it has an 'active'
> > field.
> 
> Are you sure? I think net writes a mac address.

True.  We'll need to disable that, and come up with another mechanism if
we want it back (a new feature and a VIRTIO_NET_HDR_F_SET_MAC flag in
the virtio_net header?  Or would that mess up vhost_net?).

> > This solves half the problem, of sync guest writes.  For the
> > other half, I suggest a generation counter; odd means inconsistent.  The
> > guest can poll.
> 
> So we get the counter until it's even, get the config, if it's changed
> repeat? Yes it works. However, I would like to have a way to detect
> config change just by looking at memory. ATM we need to read ISR to
> know.  If we used a VQ, the advantage would be that the device can work
> with a single MSIX vector shared by all VQs.

If we use a 32-bit counter, we also get this though, right?

If counter has changed, it's a config interrupt...

> If we do require config VQ anyway, why not use it to notify
> guest of config changes? Guest could pre-post an in buffer
> and host uses that.

We could, but it's an additional burden on each device.  vqs are cheap,
but not free.  And the config area is so damn convenient...

> > BenH also convinced me we should finally make the config space LE if
> > we're going to change things.  Since PCI is the most common transport,
> > guest-endian confuses people.  And it sucks for really weird machines.
> 
> Are we going to keep guest endian for e.g. virtio net header?
> If yes the benefit of switching config space is not that big.
> And changes in devices would affect non-PCI transports.

Yep.  It would only make sense if we do it for everything.  And yes,
it'll mess up everyone who is BE, so it needs to be a feature bit for
them.

> > We should also change the ring (to a single ring, I think).  Descriptors
> > to 24 bytes long (8 byte cookie, 8 byte addr, 4 byte len, 4 byte flags).
> > We might be able to squeeze it into 20 bytes but that means packing.  We
> > should support inline, chained or indirect.  Let the other side ack by
> > setting flag, cookie and len (if written).
> 
> Quite possibly all or some of these things help performance
> but do we have to change the spec before we have experimental
> proof?

We change the spec last, once we know what we're doing, ideally.

> I did experiment with a single ring using tools/virtio and
> I didn't see a measurable performance gain.

Interesting.  It is simpler and more standard than our current design,
but that's not sufficient unless there are other reasons.  Needs further
discussion and testing.

> Two rings do have the advantage of not requiring host side copy, which
> copy would surely add to cache pressure.

Well, a simple host could process in-order and leave stuff in the ring I
guess.  A smarter host would copy and queue, maybe leave one queue entry
in so it doesn't get flooded?

>  Since
> host doesn't change descriptors, we could also
> preformat some descriptors in the current design.
>
> There is a fragmentation problem in theory but it can be alleviated with
> a smart allocator.

Yeah, the complexity scares me...

> About inline - it can only help very small buffers.
> Which workloads do you have in mind exactly?

It was suggested by others, but I think TCP Acks are the classic one.
12 + 14 + 20 + 40 = 86 bytes with virtio_net_hdr_mrg_rxbuf at the front.

> BTW this seems to be the reverse from what you have in Mar 2001,
> see 87mxkjls61.fsf@rustcorp.com.au :)

(s/2001/2011).  Indeed.  No one shared my optimism that having an open
process for a virtio2 would bring more players on board (my original
motivation).  But technical requirements are mounting up, which means
we're going to get there anyway.

> I am much less concerned with what we do for configuration,
> but I do not believe we have learned all performance lessons
> from virtio ring1. Is there any reason why we shouldn't be
> able to experiment with inline within virtio1 and see
> whether that gets us anything?

Inline in the used ring is possible, but those descriptors are 8 bytes, vs
24/32.

> If we do a bunch of changes to the ring at once, we can't
> figure out what's right, what's wrong, or back out of
> mistakes later.
> 
> Since there are non PCI transports that use the ring,
> we really shouldn't make both the configuration and
> the ring changes depend on the same feature bit.

Yes, I'm thinking #define VIRTIO_F_VIRTIO2 (-1).  For PCI, this gets
mapped into a "are we using the new config layout?".  For others, it
gets mapped into a transport-specific feature.

(I'm sure you get it, but for the others) This is because I want to be
draw a clear line between all the legacy stuff at the same time, not
have to support part of it later because someone might not flip the
feature bit.

Cheers,
Rusty.

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC 7/11] virtio_pci: new, capability-aware driver.
  2012-01-12  2:01                                 ` Rusty Russell
@ 2012-01-12  4:31                                   ` Benjamin Herrenschmidt
  2012-01-12  6:09                                     ` Michael S. Tsirkin
  2012-01-12  6:00                                   ` Michael S. Tsirkin
  1 sibling, 1 reply; 106+ messages in thread
From: Benjamin Herrenschmidt @ 2012-01-12  4:31 UTC (permalink / raw)
  To: Rusty Russell
  Cc: Christian Borntraeger, virtualization, Sasha Levin, Pawel Moll,
	Michael S. Tsirkin

On Thu, 2012-01-12 at 12:31 +1030, Rusty Russell wrote:

> > Are we going to keep guest endian for e.g. virtio net header?
> > If yes the benefit of switching config space is not that big.
> > And changes in devices would affect non-PCI transports.
> 
> Yep.  It would only make sense if we do it for everything.  And yes,
> it'll mess up everyone who is BE, so it needs to be a feature bit for
> them.

One thing we should do (I might give it a go after LCA) is start
providing sized accessors for it and start converting the guest drivers
in Linux at least to use those.

That way, the accessors could then do the byteswap in the future
transparently if they see the feature bit for the "fixed endian".

We'd have accessors for 8,16,32 and 64-bit quantities, and accessors for
"raw" blobs (already endian neutral such as MAC addresses).

> Interesting.  It is simpler and more standard than our current design,
> but that's not sufficient unless there are other reasons.  Needs further
> discussion and testing.

I think completions shall remain separate. As for having a separate
"available" vs. "descriptors", well, I can find pro and cons.

As it is today, it's more complex than it should be and it would make
things simpler to just have an available ring that contains descriptors
like mostly everything else does.

It would also be slightly more cache friendly (cache misses are a
significant part of the performance issues for things like high speed
networking).

However I can see at least one advantage of what you've done :-) You
never have to deal with holes in the ring.

For example, a typical network driver should always try to allocate a
new skb before it "consumes" one, because otherwise, there's a chance
that it fails to allocate it, leaving a hole in the ring. Many drivers
do it wrong with consequences going all the way to leaving stale DMA
pointers in there....

With your scheme, that problem doesn't exist, and you can batch the
refill which might be more efficient under some circumstances.

But is that worth the gain and the cost in cache line accesses ?
Probably not.

> > Two rings do have the advantage of not requiring host side copy, which
> > copy would surely add to cache pressure.
> 
> Well, a simple host could process in-order and leave stuff in the ring I
> guess.  A smarter host would copy and queue, maybe leave one queue entry
> in so it doesn't get flooded?

What's wrong with a ring of descriptors + a ring of completion, with a
single toggle valid bit to indicate whether a given descriptor is valid
or not (to avoid the nasty ping pong on the ring head/tails).

> > About inline - it can only help very small buffers.
> > Which workloads do you have in mind exactly?
> 
> It was suggested by others, but I think TCP Acks are the classic one.

Split headers + data too, tho that means supporting immediate +
indirect. 

It makes a lot of sense for command rings as well if we're going to go
down that route.

> 12 + 14 + 20 + 40 = 86 bytes with virtio_net_hdr_mrg_rxbuf at the front.
> 
> > BTW this seems to be the reverse from what you have in Mar 2001,
> > see 87mxkjls61.fsf@rustcorp.com.au :)
> 
> (s/2001/2011).  Indeed.  Noone shared my optimism that having an open
> process for a virtio2 would bring more players on board (my original
> motivation).  But technical requirements are mounting up, which means
> we're going to get there anyway.
> 
> > I am much less concerned with what we do for configuration,
> > but I do not believe we have learned all performance lessons
> > from virtio ring1. Is there any reason why we shouldn't be
> > able to experiment with inline within virtio1 and see
> > whether that gets us anything?
> 
> Inline in the used ring is possible, but those descriptors 8 bytes, vs
> 24/32.
> 
> > If we do a bunch of changes to the ring at once, we can't
> > figure out what's right, what's wrong, or back out of
> > mistakes later.
> > 
> > Since there are non PCI transports that use the ring,
> > we really shouldn't make both the configuration and
> > the ring changes depend on the same feature bit.
> 
> Yes, I'm thinking #define VIRTIO_F_VIRTIO2 (-1).  For PCI, this gets
> mapped into a "are we using the new config layout?".  For others, it
> gets mapped into a transport-specific feature.

Or we can use the PCI ProgIf to indicate a different programming
interface, that way we also use that as an excuse to say that the first
BAR can either be PIO or MMIO :-)
 
> (I'm sure you get it, but for the others) This is because I want to be
> draw a clear line between all the legacy stuff at the same time, not
> have to support part of it later because someone might not flip the
> feature bit.

Cheers,
Ben.

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC 7/11] virtio_pci: new, capability-aware driver.
  2012-01-11 22:56                                     ` Benjamin Herrenschmidt
@ 2012-01-12  5:29                                       ` Michael S. Tsirkin
  2012-01-12  6:13                                         ` Benjamin Herrenschmidt
  0 siblings, 1 reply; 106+ messages in thread
From: Michael S. Tsirkin @ 2012-01-12  5:29 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: Christian Borntraeger, Sasha Levin, Pawel Moll, virtualization

On Thu, Jan 12, 2012 at 09:56:39AM +1100, Benjamin Herrenschmidt wrote:
> I'd suggest doing a simple user space app that creates such a ring,
> forks, and produce / consume. We can then run that through perf and
> analyze the cache behaviour, maximum throughput, etc....
> 
> Cheers,
> Ben.

Sure. You can also use my tools/virtio hack: this rebuilds virtio
ring code in userspace, has the advantage of reusing
actual kernel code, so it's up to date, has same barriers, etc.

-- 
MST

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC 7/11] virtio_pci: new, capability-aware driver.
  2012-01-12  2:01                                 ` Rusty Russell
  2012-01-12  4:31                                   ` Benjamin Herrenschmidt
@ 2012-01-12  6:00                                   ` Michael S. Tsirkin
  2012-01-13  3:22                                     ` Rusty Russell
  1 sibling, 1 reply; 106+ messages in thread
From: Michael S. Tsirkin @ 2012-01-12  6:00 UTC (permalink / raw)
  To: Rusty Russell
  Cc: Christian Borntraeger, Benjamin Herrenschmidt, Sasha Levin,
	Pawel Moll, virtualization

On Thu, Jan 12, 2012 at 12:31:09PM +1030, Rusty Russell wrote:
> On Wed, 11 Jan 2012 12:21:30 +0200, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > On Wed, Jan 11, 2012 at 10:55:52AM +1030, Rusty Russell wrote:
> > > On Tue, 10 Jan 2012 19:03:36 +0200, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > > > On Wed, Dec 21, 2011 at 11:03:25AM +1030, Rusty Russell wrote:
> > > > > Yes.  The idea that we can alter fields in the device-specific config
> > > > > area is flawed.  There may be cases where it doesn't matter, but as an
> > > > > idea it was holed to begin with.
> > > > > 
> > > > > We can reduce probability by doing a double read to check, but there are
> > > > > still cases where it will fail.
> > > > 
> > > > Okay - want me to propose an interface for that?
> > > 
> > > Had a brief chat with BenH (CC'd).
> > > 
> > > I think we should deprecate writing to the config space.  Only balloon
> > > does it AFAICT, and I can't quite figure out *why* it has an 'active'
> > > field.
> > 
> > Are you sure? I think net writes a mac address.
> 
> True.  We'll need to disable that, and come up with another mechanism if
> we want it back (a new feature and a VIRTIO_NET_HDR_F_SET_MAC flag in
> the virtio_net header?  Or would that mess up vhost_net?).

vhost net is only datapath, so no problem. copying in host is tricky for
vhost_net.

> > > This solves half the problem, of sync guest writes.  For the
> > > other half, I suggest a generation counter; odd means inconsistent.  The
> > > guest can poll.
> > 
> > So we get the counter until it's even, get the config, if it's changed
> > repeat? Yes it works. However, I would like to have a way to detect
> > config change just by looking at memory. ATM we need to read ISR to
> > know.  If we used a VQ, the advantage would be that the device can work
> > with a single MSIX vector shared by all VQs.
> 
> If we use a 32-bit counter, we also get this though, right?
> 
> If counter has changed, it's a config interrupt...

But we need an exit to read the counter. We can put the counter
in memory but this looks suspiciously like a simplified VQ -
so why not use a VQ then?

> > If we do require config VQ anyway, why not use it to notify
> > guest of config changes? Guest could pre-post an in buffer
> > and host uses that.
> 
> We could, but it's an additional burden on each device.  vqs are cheap,
> but not free.  And the config area is so damn convenient...

Not if you start playing with counters, checking it twice,
reinvent all kind of barriers ...

> > > BenH also convinced me we should finally make the config space LE if
> > > we're going to change things.  Since PCI is the most common transport,
> > > guest-endian confuses people.  And it sucks for really weird machines.
> > 
> > Are we going to keep guest endian for e.g. virtio net header?
> > If yes the benefit of switching config space is not that big.
> > And changes in devices would affect non-PCI transports.
> 
> Yep.  It would only make sense if we do it for everything.  And yes,
> it'll mess up everyone who is BE, so it needs to be a feature bit for
> them.
> 
> > > We should also change the ring (to a single ring, I think).  Descriptors
> > > to 24 bytes long (8 byte cookie, 8 byte addr, 4 byte len, 4 byte flags).
> > > We might be able to squeeze it into 20 bytes but that means packing.  We
> > > should support inline, chained or indirect.  Let the other side ack by
> > > setting flag, cookie and len (if written).
> > 
> > Quite possibly all or some of these things help performance
> > but do we have to change the spec before we have experimental
> > proof?
> 
> We change the spec last, once we know what we're doing, ideally.
> 
> > I did experiment with a single ring using tools/virtio and
> > I didn't see a measureable performance gain.
> 
> Interesting.  It is simpler and more standard than our current design,
> but that's not sufficient unless there are other reasons.  Needs further
> discussion and testing.
> 
> > Two rings do have the advantage of not requiring host side copy, which
> > copy would surely add to cache pressure.
> 
> Well, a simple host could process in-order and leave stuff in the ring I
> guess.  A smarter host would copy and queue, maybe leave one queue entry
> in so it doesn't get flooded?
> 
> >  Since
> > host doesn't change descriptors, we could also
> > preformat some descriptors in the current design.
> >
> > There is a fragmentation problem in theory but it can be alleviated with
> > a smart allocator.
> 
> Yeah, the complexity scares me...
> 
> > About inline - it can only help very small buffers.
> > Which workloads do you have in mind exactly?
> 
> It was suggested by others, but I think TCP Acks are the classic one.
> 12 + 14 + 20 + 40 = 86 bytes with virtio_net_hdr_mrg_rxbuf at the front.

That's only the simplest IPv4, right?
Anyway, this spans multiple descriptors so this complicates allocation
significantly.

> > BTW this seems to be the reverse from what you have in Mar 2001,
> > see 87mxkjls61.fsf@rustcorp.com.au :)
> 
> (s/2001/2011).  Indeed.  Noone shared my optimism that having an open
> process for a virtio2 would bring more players on board (my original
> motivation).  But technical requirements are mounting up, which means
> we're going to get there anyway.
> 
> > I am much less concerned with what we do for configuration,
> > but I do not believe we have learned all performance lessons
> > from virtio ring1. Is there any reason why we shouldn't be
> > able to experiment with inline within virtio1 and see
> > whether that gets us anything?
> 
> Inline in the used ring is possible, but those descriptors 8 bytes, vs
> 24/32.

Hmm, 86 > 32 anyway, right?

> > If we do a bunch of changes to the ring at once, we can't
> > figure out what's right, what's wrong, or back out of
> > mistakes later.
> > 
> > Since there are non PCI transports that use the ring,
> > we really shouldn't make both the configuration and
> > the ring changes depend on the same feature bit.
> 
> Yes, I'm thinking #define VIRTIO_F_VIRTIO2 (-1).  For PCI, this gets
> mapped into a "are we using the new config layout?".  For others, it
> gets mapped into a transport-specific feature.
> 
> (I'm sure you get it, but for the others) This is because I want to be
> draw a clear line between all the legacy stuff at the same time, not
> have to support part of it later because someone might not flip the
> feature bit.
> 
> Cheers,
> Rusty.

So my point is, config stuff and ring are completely separate,
they are different layers.

-- 
MST

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC 7/11] virtio_pci: new, capability-aware driver.
  2012-01-12  4:31                                   ` Benjamin Herrenschmidt
@ 2012-01-12  6:09                                     ` Michael S. Tsirkin
  2012-01-12  6:28                                       ` Benjamin Herrenschmidt
  0 siblings, 1 reply; 106+ messages in thread
From: Michael S. Tsirkin @ 2012-01-12  6:09 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: Christian Borntraeger, Sasha Levin, Pawel Moll, virtualization

On Thu, Jan 12, 2012 at 03:31:59PM +1100, Benjamin Herrenschmidt wrote:
> However I can see at least one advantage of what you've done :-) You
> never have to deal with holes in the ring.

Another advantage is the design goal for that ring:
host never needs to copy even if it completes
descriptors out of order. And out of order is something that does not
happen at all with hardware drivers. This is where paravirt is
different.

> > > Two rings do have the advantage of not requiring host side copy, which
> > > copy would surely add to cache pressure.
> > 
> > Well, a simple host could process in-order and leave stuff in the ring I
> > guess.  A smarter host would copy and queue, maybe leave one queue entry
> > in so it doesn't get flooded?
> 
> What's wrong with a ring of descriptors + a ring of completion, with a
> single toggle valid bit to indicate whether a given descriptor is valid
> or not (to avoid the nasty ping pong on the ring head/tails).

First, I don't understand how a valid bit avoids ping pong on the last
descriptor. Second, how do you handle out of order completions?


> > > About inline - it can only help very small buffers.
> > > Which workloads do you have in mind exactly?
> > 
> > It was suggested by others, but I think TCP Acks are the classic one.
> 
> Split headers + data too, tho that means supporting immediate +
> indirect. 
> 
> It makes a lot of sense for command rings as well if we're going to go
> down that route.

I don't see why it makes sense for commands. It's a performance
optimization and commands are off the data path.

> > 12 + 14 + 20 + 40 = 86 bytes with virtio_net_hdr_mrg_rxbuf at the front.
> > 
> > > BTW this seems to be the reverse from what you have in Mar 2001,
> > > see 87mxkjls61.fsf@rustcorp.com.au :)
> > 
> > (s/2001/2011).  Indeed.  Noone shared my optimism that having an open
> > process for a virtio2 would bring more players on board (my original
> > motivation).  But technical requirements are mounting up, which means
> > we're going to get there anyway.
> > 
> > > I am much less concerned with what we do for configuration,
> > > but I do not believe we have learned all performance lessons
> > > from virtio ring1. Is there any reason why we shouldn't be
> > > able to experiment with inline within virtio1 and see
> > > whether that gets us anything?
> > 
> > Inline in the used ring is possible, but those descriptors 8 bytes, vs
> > 24/32.
> > 
> > > If we do a bunch of changes to the ring at once, we can't
> > > figure out what's right, what's wrong, or back out of
> > > mistakes later.
> > > 
> > > Since there are non PCI transports that use the ring,
> > > we really shouldn't make both the configuration and
> > > the ring changes depend on the same feature bit.
> > 
> > Yes, I'm thinking #define VIRTIO_F_VIRTIO2 (-1).  For PCI, this gets
> > mapped into a "are we using the new config layout?".  For others, it
> > gets mapped into a transport-specific feature.
> 
> Or we can use the PCI ProgIf to indicate a different programming
> interface, that way we also use that as an excuse to say that the first
> BAR can either be PIO or MMIO :-)

We can't, legal PCI ProgIf values are defined in PCI spec.

> > (I'm sure you get it, but for the others) This is because I want to be
> > draw a clear line between all the legacy stuff at the same time, not
> > have to support part of it later because someone might not flip the
> > feature bit.
> 
> Cheers,
> Ben.
> 

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC 7/11] virtio_pci: new, capability-aware driver.
  2012-01-12  5:29                                       ` Michael S. Tsirkin
@ 2012-01-12  6:13                                         ` Benjamin Herrenschmidt
  2012-01-12  6:37                                           ` Michael S. Tsirkin
  0 siblings, 1 reply; 106+ messages in thread
From: Benjamin Herrenschmidt @ 2012-01-12  6:13 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Christian Borntraeger, Sasha Levin, Pawel Moll, virtualization

On Thu, 2012-01-12 at 07:29 +0200, Michael S. Tsirkin wrote:
> On Thu, Jan 12, 2012 at 09:56:39AM +1100, Benjamin Herrenschmidt wrote:
> > I'd suggest doing a simple user space app that creates such a ring,
> > forks, and produce / consume. We can then run that through perf and
> > analyze the cache behaviour, maximum throughput, etc....
> > 
> > Cheers,
> > Ben.
> 
> Sure. You can also use my tools/virtio hack: this rebuilds virtio
> ring code in userspace, has the advantage of reusing
> actual kernel code, so it's up to date, has same barriers, etc.

Yes, Rusty mentioned it today :-)

We'll play around in the next couple of weeks. With LCA, I'm not sure
how much I'll get done next week but heh...

I'm curious to see if we can still have a one-size-fit-all between
unordered blk requests with large data/descriptor access ratio and
ordered network buffers with a much smaller one.

Cheers,
Ben.

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC 7/11] virtio_pci: new, capability-aware driver.
  2012-01-12  6:09                                     ` Michael S. Tsirkin
@ 2012-01-12  6:28                                       ` Benjamin Herrenschmidt
  2012-01-12  6:51                                         ` Michael S. Tsirkin
  0 siblings, 1 reply; 106+ messages in thread
From: Benjamin Herrenschmidt @ 2012-01-12  6:28 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Christian Borntraeger, Sasha Levin, Pawel Moll, virtualization

On Thu, 2012-01-12 at 08:09 +0200, Michael S. Tsirkin wrote:
> On Thu, Jan 12, 2012 at 03:31:59PM +1100, Benjamin Herrenschmidt wrote:
> > However I can see at least one advantage of what you've done :-) You
> > never have to deal with holes in the ring.
> 
> Another advantage is the design goal for that ring:
> host never needs to copy even if it completes
> descriptors out of order. And out of order is something that does not
> happen at all with hardware drivers. This is where paravirt is
> different.

Actually out of order can happen with tagged command queue for SCSI or
ATA, tho I'm not 100% familiar with how things like AHCI handle this.

> > > > Two rings do have the advantage of not requiring host side copy, which
> > > > copy would surely add to cache pressure.
> > > 
> > > Well, a simple host could process in-order and leave stuff in the ring I
> > > guess.  A smarter host would copy and queue, maybe leave one queue entry
> > > in so it doesn't get flooded?
> > 
> > What's wrong with a ring of descriptors + a ring of completion, with a
> > single toggle valid bit to indicate whether a given descriptor is valid
> > or not (to avoid the nasty ping pong on the ring head/tails).
> 
> First, I don't understand how a valid bit avoids ping poing on the last
> descriptor. Second, how do you handle out of order completions?

A toggle means the valid bit is never cleared, it just changes polarity
every time you go around the ring. So there's never a write back to 0.

Out of order is something I hadn't thought about (I was most probably
too focused on virtio-net) and is indeed a PITA. It's doable with rings
but can get nasty.

I'll give that more thought in the next week, and Rusty and I shall play
with userspace models based on your tool.

> > > > About inline - it can only help very small buffers.
> > > > Which workloads do you have in mind exactly?
> > > 
> > > It was suggested by others, but I think TCP Acks are the classic one.
> > 
> > Split headers + data too, tho that means supporting immediate +
> > indirect. 
> > 
> > It makes a lot of sense for command rings as well if we're going to go
> > down that route.
> 
> I don't see why it makes sense for commands. It's a performance
> optimization and commands are off the data path.

Oh just code simplification not having to dequeue a descriptor, allocate
a buffer, etc... but a lot of that can be buried in helpers indeed.

> We can't, legal PCI ProgIf values are defined in PCI spec.

Hrm, more or less yes, I suppose we should stay away from that then,
do we use revision ID for anything in virtio-land ?

Cheers,
Ben.

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC 7/11] virtio_pci: new, capability-aware driver.
  2012-01-12  6:13                                         ` Benjamin Herrenschmidt
@ 2012-01-12  6:37                                           ` Michael S. Tsirkin
  0 siblings, 0 replies; 106+ messages in thread
From: Michael S. Tsirkin @ 2012-01-12  6:37 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: Christian Borntraeger, Sasha Levin, Pawel Moll, virtualization

On Thu, Jan 12, 2012 at 05:13:42PM +1100, Benjamin Herrenschmidt wrote:
> On Thu, 2012-01-12 at 07:29 +0200, Michael S. Tsirkin wrote:
> > On Thu, Jan 12, 2012 at 09:56:39AM +1100, Benjamin Herrenschmidt wrote:
> > > I'd suggest doing a simple user space app that creates such a ring,
> > > forks, and produce / consume. We can then run that through perf and
> > > analyze the cache behaviour, maximum throughput, etc....
> > > 
> > > Cheers,
> > > Ben.
> > 
> > Sure. You can also use my tools/virtio hack: this rebuilds virtio
> > ring code in userspace, has the advantage of reusing
> > actual kernel code, so it's up to date, has same barriers, etc.
> 
> Yes, Rusty mentioned it today :-)
> 
> We'll play around in the next couple of weeks. With LCA, I'm not sure
> how much I'll get done next week but heh...
> 
> I'm curious to see if we can still have a one-size-fit-all between
> unordered blk requests with large data/descriptor access ratio and
> ordered network buffers with a much smaller one.
> 
> Cheers,
> Ben.
> 

network buffers aren't ordered when the backend is zero copy.
Another idea is to add different sized bufs in the RX ring,
and have the device use the size that fits best.
This can improve small packet memory utilization without
need for data copies, but it also means that
descriptors are completed out of order in -net.

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC 7/11] virtio_pci: new, capability-aware driver.
  2012-01-12  6:28                                       ` Benjamin Herrenschmidt
@ 2012-01-12  6:51                                         ` Michael S. Tsirkin
  2012-01-12  7:03                                           ` Benjamin Herrenschmidt
  0 siblings, 1 reply; 106+ messages in thread
From: Michael S. Tsirkin @ 2012-01-12  6:51 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: Christian Borntraeger, Sasha Levin, Pawel Moll, virtualization

On Thu, Jan 12, 2012 at 05:28:34PM +1100, Benjamin Herrenschmidt wrote:
> On Thu, 2012-01-12 at 08:09 +0200, Michael S. Tsirkin wrote:
> > On Thu, Jan 12, 2012 at 03:31:59PM +1100, Benjamin Herrenschmidt wrote:
> > > However I can see at least one advantage of what you've done :-) You
> > > never have to deal with holes in the ring.
> > 
> > Another advantage is the design goal for that ring:
> > host never needs to copy even if it completes
> > descriptors out of order. And out of order is something that does not
> > happen at all with hardware drivers. This is where paravirt is
> > different.
> 
> Actually out of order can happen with tagged command queue for SCSI or
> ATA, tho I'm not 100% familiar with how things like AHCI handle this.
> 
> > > > > Two rings do have the advantage of not requiring host side copy, which
> > > > > copy would surely add to cache pressure.
> > > > 
> > > > Well, a simple host could process in-order and leave stuff in the ring I
> > > > guess.  A smarter host would copy and queue, maybe leave one queue entry
> > > > in so it doesn't get flooded?
> > > 
> > > What's wrong with a ring of descriptors + a ring of completion, with a
> > > single toggle valid bit to indicate whether a given descriptor is valid
> > > or not (to avoid the nasty ping pong on the ring head/tails).
> > 
> > First, I don't understand how a valid bit avoids ping poing on the last
> > descriptor. Second, how do you handle out of order completions?
> 
> A toggle means the valid bit is never cleared, it just changes polarity
> every time you go around the ring. So there's never a write back to 0.

Ah, OK. Still we might have a cache line bouncing back
and forth as host rechecks this valid bit while guest
toggles it.

This also gets nasty with e.g. inline data.



> Out of order is something I hadn't thought about (I was most probably
> too focused on virtio-net) and is indeed a PITA. It's doable with rings
> but can get nasty.
> 
> I'll give that more thought in the next week, and Rusty and I shall play
> with userspace models based on your tool.

You might want to add more workloads - I have modeled stream transmit,
but e.g. ping pong is also interesting.

One other interesting thing is that
in practice host often runs on the same CPU as guest.
This is the fastest way to run the host for when it
does not do a lot of work as most data is cached. And there the amount of sharing
is less important than reducing cache consumption.

Again this is different from real hardware.

> > > > > About inline - it can only help very small buffers.
> > > > > Which workloads do you have in mind exactly?
> > > > 
> > > > It was suggested by others, but I think TCP Acks are the classic one.
> > > 
> > > Split headers + data too, tho that means supporting immediate +
> > > indirect. 
> > > 
> > > It makes a lot of sense for command rings as well if we're going to go
> > > down that route.
> > 
> > I don't see why it makes sense for commands. It's a performance
> > optimization and commands are off the data path.
> 
> Oh just code simplification not having to dequeue a descriptor, allocate
> a buffer, etc... but a lot of that can be buried in helpers indeed.
> 
> > We can't, legal PCI ProgIf values are defined in PCI spec.
> 
> Hrm, more or less yes, I suppose we should stay away from that then,
> do we use revision ID for anything in virtio-land ?
> 
> Cheers,
> Ben.
> 

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC 7/11] virtio_pci: new, capability-aware driver.
  2012-01-12  6:51                                         ` Michael S. Tsirkin
@ 2012-01-12  7:03                                           ` Benjamin Herrenschmidt
  0 siblings, 0 replies; 106+ messages in thread
From: Benjamin Herrenschmidt @ 2012-01-12  7:03 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Christian Borntraeger, Sasha Levin, Pawel Moll, virtualization

On Thu, 2012-01-12 at 08:51 +0200, Michael S. Tsirkin wrote:

> One othe rinteresting thing is that
> in practice host often runs on the same CPU as guest.

That's surprising. On Power with our huge exit cost, I would think that
would hurt more than anything else.

> This is the fastest way to run the host for when it
> does not do a lot of work as most data is cached. And there amount of sharing
> is less important than reducing cache consumption.
> 
> Again this is different from real hardware.

Yes indeed.

Cheers,
Ben.

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC 7/11] virtio_pci: new, capability-aware driver.
  2012-01-11 20:46                                     ` Benjamin Herrenschmidt
@ 2012-01-12 10:37                                       ` Stefan Hajnoczi
  0 siblings, 0 replies; 106+ messages in thread
From: Stefan Hajnoczi @ 2012-01-12 10:37 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: Pawel Moll, Michael S. Tsirkin, virtualization,
	Christian Borntraeger, Sasha Levin

On Wed, Jan 11, 2012 at 8:46 PM, Benjamin Herrenschmidt
<benh@kernel.crashing.org> wrote:
> On Wed, 2012-01-11 at 14:28 +0000, Stefan Hajnoczi wrote:
>> On Wed, Jan 11, 2012 at 9:10 AM, Benjamin Herrenschmidt
>> <benh@kernel.crashing.org> wrote:
>> > On Wed, 2012-01-11 at 08:47 +0000, Stefan Hajnoczi wrote:
>> >>
>> >> This is also an opportunity to stop using CPU physical addresses in
>> >> the ring and instead perform DMA like a normal PCI device (use bus
>> >> addresses).
>> >
>> > Euh why ?
>>
>> Because it's a paravirt hack that ends up hitting corner cases.  It's
>> not possible to do virtio-pci passthrough under nested virtualization
>> unless we use an IOMMU.  Imagine passing virtio-net from L0 into the
>> L2 guest (i.e. PCI-passthrough).  If virtio-pci is really "PCI" this
>> should be possible but it's not when we use physical addresses instead
>> of bus addresses.
>
> Is this just an academic exercise or is there any actual value in doing
> this ?

It's a corner case, the value is small.  I also hit this with virtio
on SPARC which is made difficult by the fact that the Solaris kernel
assumes there is an IOMMU for scatter-gather and doesn't provide
functions for allocating physically contiguous memory in drivers.
It's another instance where this shortcut comes up against problems
and behaving like a real PCI device would work fine.

> Using an iommu is going to slaugher your performance, so at the very
> least it should be kept an option.

That's a good idea.  By default it can continue to use physical addresses.

I guess there's no point in worrying about it until we have a real user.

Stefan

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC 7/11] virtio_pci: new, capability-aware driver.
  2012-01-12  1:42                                                         ` Rusty Russell
@ 2012-01-13  2:19                                                           ` Michael S. Tsirkin
  2012-01-13  2:32                                                             ` Benjamin Herrenschmidt
  2012-01-18  4:52                                                             ` Rusty Russell
  0 siblings, 2 replies; 106+ messages in thread
From: Michael S. Tsirkin @ 2012-01-13  2:19 UTC (permalink / raw)
  To: Rusty Russell
  Cc: Pawel Moll, Benjamin Herrenschmidt, virtualization,
	Christian Borntraeger, Sasha Levin, Anthony Liguori

On Thu, Jan 12, 2012 at 12:12:17PM +1030, Rusty Russell wrote:
> On Thu, 12 Jan 2012 00:02:33 +0200, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > Look, we have a race currently. Let us not tie a bug fix to a huge
> > rewrite with unclear performance benefits, please.
> 
> In theory, yes.  In practice, we bandaid it.
> 
> I think in the short term we change ->get to get the entire sequence
> twice, and check it's the same.  Theoretically, still racy, but it does
> cut the window.  And we haven't seen the bug yet, either.

I thought about this some more. Since we always get
an interrupt on config changes, it seems that a rather
robust method would be to just synchronize against that.
Something like the below (warning - completely untested).
Still need to think about memory barriers, overflow etc.
What do you think?

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>

diff --git a/drivers/virtio/virtio_pci.c b/drivers/virtio/virtio_pci.c
index 03d1984..b5df385 100644
--- a/drivers/virtio/virtio_pci.c
+++ b/drivers/virtio/virtio_pci.c
@@ -57,6 +57,7 @@ struct virtio_pci_device
 	unsigned msix_used_vectors;
 	/* Whether we have vector per vq */
 	bool per_vq_vectors;
+	atomic_t config_changes;
 };
 
 /* Constants for MSI-X */
@@ -125,6 +126,19 @@ static void vp_finalize_features(struct virtio_device *vdev)
 	iowrite32(vdev->features[0], vp_dev->ioaddr+VIRTIO_PCI_GUEST_FEATURES);
 }
 
+/* wait for pending irq handlers */
+static void vp_synchronize_vectors(struct virtio_device *vdev)
+{
+	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
+	int i;
+
+	if (vp_dev->intx_enabled)
+		synchronize_irq(vp_dev->pci_dev->irq);
+
+	for (i = 0; i < vp_dev->msix_vectors; ++i)
+		synchronize_irq(vp_dev->msix_entries[i].vector);
+}
+
 /* virtio config->get() implementation */
 static void vp_get(struct virtio_device *vdev, unsigned offset,
 		   void *buf, unsigned len)
@@ -134,9 +148,20 @@ static void vp_get(struct virtio_device *vdev, unsigned offset,
 				VIRTIO_PCI_CONFIG(vp_dev) + offset;
 	u8 *ptr = buf;
 	int i;
-
-	for (i = 0; i < len; i++)
-		ptr[i] = ioread8(ioaddr + i);
+	int uninitialized_var(c);
+	c = atomic_read(&vp_dev->config_changes);
+	/* Make sure read is done before we get the first config byte */
+	rmb();
+	do {
+		for (i = 0; i < len; i++)
+			ptr[i] = ioread8(ioaddr + i);
+		/* Synchronize with config interrupt */
+		vp_synchronize_vectors(vdev);
+		/*
+		 * For multi-byte fields, we might get a config change interrupt
+		 * between byte reads. If this happens, retry the read.
+		 */
+	} while (c != atomic_read(&vp_dev->config_changes))
 }
 
 /* the config->set() implementation.  it's symmetric to the config->get()
@@ -169,19 +194,6 @@ static void vp_set_status(struct virtio_device *vdev, u8 status)
 	iowrite8(status, vp_dev->ioaddr + VIRTIO_PCI_STATUS);
 }
 
-/* wait for pending irq handlers */
-static void vp_synchronize_vectors(struct virtio_device *vdev)
-{
-	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
-	int i;
-
-	if (vp_dev->intx_enabled)
-		synchronize_irq(vp_dev->pci_dev->irq);
-
-	for (i = 0; i < vp_dev->msix_vectors; ++i)
-		synchronize_irq(vp_dev->msix_entries[i].vector);
-}
-
 static void vp_reset(struct virtio_device *vdev)
 {
 	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
@@ -213,6 +225,8 @@ static irqreturn_t vp_config_changed(int irq, void *opaque)
 	drv = container_of(vp_dev->vdev.dev.driver,
 			   struct virtio_driver, driver);
 
+	atomic_inc(&vp_dev->config_changes);
+
 	if (drv && drv->config_changed)
 		drv->config_changed(&vp_dev->vdev);
 	return IRQ_HANDLED;
@@ -646,6 +660,7 @@ static int __devinit virtio_pci_probe(struct pci_dev *pci_dev,
 	vp_dev->vdev.config = &virtio_pci_config_ops;
 	vp_dev->pci_dev = pci_dev;
 	INIT_LIST_HEAD(&vp_dev->virtqueues);
+	atomic_set(&vp_dev->config_changes, 0);
 	spin_lock_init(&vp_dev->lock);
 
 	/* Disable MSI/MSIX to bring device to a known good state. */

^ permalink raw reply related	[flat|nested] 106+ messages in thread

* Re: [RFC 7/11] virtio_pci: new, capability-aware driver.
  2012-01-13  2:19                                                           ` Michael S. Tsirkin
@ 2012-01-13  2:32                                                             ` Benjamin Herrenschmidt
  2012-01-18 15:29                                                               ` Michael S. Tsirkin
  2012-01-18  4:52                                                             ` Rusty Russell
  1 sibling, 1 reply; 106+ messages in thread
From: Benjamin Herrenschmidt @ 2012-01-13  2:32 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Pawel Moll, virtualization, Christian Borntraeger, Sasha Levin,
	Anthony Liguori

On Fri, 2012-01-13 at 04:19 +0200, Michael S. Tsirkin wrote:
> On Thu, Jan 12, 2012 at 12:12:17PM +1030, Rusty Russell wrote:
> > On Thu, 12 Jan 2012 00:02:33 +0200, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > > Look, we have a race currently. Let us not tie a bug fix to a huge
> > > rewrite with unclear performance benefits, please.
> > 
> > In theory, yes.  In practice, we bandaid it.
> > 
> > I think in the short term we change ->get to get the entire sequence
> > twice, and check it's the same.  Theoretically, still racy, but it does
> > cut the window.  And we haven't seen the bug yet, either.
> 
> I thought about this some more. Since we always get
> an interrupt on config changes, it seems that a rather
> robust method would be to just synchronize against that.
> Something like the below (warning - completely untested).
> Still need to think about memory barriers, overflow etc.
> What do you think?

Your interrupt may take an unpredictable amount of time to arrive, I
don't see how you can use that as a guarantee.

Cheers,
Ben.

> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> 
> diff --git a/drivers/virtio/virtio_pci.c b/drivers/virtio/virtio_pci.c
> index 03d1984..b5df385 100644
> --- a/drivers/virtio/virtio_pci.c
> +++ b/drivers/virtio/virtio_pci.c
> @@ -57,6 +57,7 @@ struct virtio_pci_device
>  	unsigned msix_used_vectors;
>  	/* Whether we have vector per vq */
>  	bool per_vq_vectors;
> +	atomic_t config_changes;
>  };
>  
>  /* Constants for MSI-X */
> @@ -125,6 +126,19 @@ static void vp_finalize_features(struct virtio_device *vdev)
>  	iowrite32(vdev->features[0], vp_dev->ioaddr+VIRTIO_PCI_GUEST_FEATURES);
>  }
>  
> +/* wait for pending irq handlers */
> +static void vp_synchronize_vectors(struct virtio_device *vdev)
> +{
> +	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
> +	int i;
> +
> +	if (vp_dev->intx_enabled)
> +		synchronize_irq(vp_dev->pci_dev->irq);
> +
> +	for (i = 0; i < vp_dev->msix_vectors; ++i)
> +		synchronize_irq(vp_dev->msix_entries[i].vector);
> +}
> +
>  /* virtio config->get() implementation */
>  static void vp_get(struct virtio_device *vdev, unsigned offset,
>  		   void *buf, unsigned len)
> @@ -134,9 +148,20 @@ static void vp_get(struct virtio_device *vdev, unsigned offset,
>  				VIRTIO_PCI_CONFIG(vp_dev) + offset;
>  	u8 *ptr = buf;
>  	int i;
> -
> -	for (i = 0; i < len; i++)
> -		ptr[i] = ioread8(ioaddr + i);
> +	int uninitialized_var(c);
> +	c = atomic_read(&vp_dev->config_changes);
> +	/* Make sure read is done before we get the first config byte */
> +	rmb();
> +	do {
> +		for (i = 0; i < len; i++)
> +			ptr[i] = ioread8(ioaddr + i);
> +		/* Synchronize with config interrupt */
> +		vp_synchronize_vectors(vdev);
> +		/*
> +		 * For multi-byte fields, we might get a config change interrupt
> +		 * between byte reads. If this happens, retry the read.
> +		 */
> +	} while (c != atomic_read(&vp_dev->config_changes))
>  }
>  
>  /* the config->set() implementation.  it's symmetric to the config->get()
> @@ -169,19 +194,6 @@ static void vp_set_status(struct virtio_device *vdev, u8 status)
>  	iowrite8(status, vp_dev->ioaddr + VIRTIO_PCI_STATUS);
>  }
>  
> -/* wait for pending irq handlers */
> -static void vp_synchronize_vectors(struct virtio_device *vdev)
> -{
> -	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
> -	int i;
> -
> -	if (vp_dev->intx_enabled)
> -		synchronize_irq(vp_dev->pci_dev->irq);
> -
> -	for (i = 0; i < vp_dev->msix_vectors; ++i)
> -		synchronize_irq(vp_dev->msix_entries[i].vector);
> -}
> -
>  static void vp_reset(struct virtio_device *vdev)
>  {
>  	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
> @@ -213,6 +225,8 @@ static irqreturn_t vp_config_changed(int irq, void *opaque)
>  	drv = container_of(vp_dev->vdev.dev.driver,
>  			   struct virtio_driver, driver);
>  
> +	atomic_inc(&vp_dev->config_changes);
> +
>  	if (drv && drv->config_changed)
>  		drv->config_changed(&vp_dev->vdev);
>  	return IRQ_HANDLED;
> @@ -646,6 +660,7 @@ static int __devinit virtio_pci_probe(struct pci_dev *pci_dev,
>  	vp_dev->vdev.config = &virtio_pci_config_ops;
>  	vp_dev->pci_dev = pci_dev;
>  	INIT_LIST_HEAD(&vp_dev->virtqueues);
> +	atomic_set(&vp_dev->config_changes, 0);
>  	spin_lock_init(&vp_dev->lock);
>  
>  	/* Disable MSI/MSIX to bring device to a known good state. */

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC 7/11] virtio_pci: new, capability-aware driver.
  2012-01-12  6:00                                   ` Michael S. Tsirkin
@ 2012-01-13  3:22                                     ` Rusty Russell
  0 siblings, 0 replies; 106+ messages in thread
From: Rusty Russell @ 2012-01-13  3:22 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Christian Borntraeger, Benjamin Herrenschmidt, Sasha Levin,
	Pawel Moll, virtualization

On Thu, 12 Jan 2012 08:00:10 +0200, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> On Thu, Jan 12, 2012 at 12:31:09PM +1030, Rusty Russell wrote:
> > If we use a 32-bit counter, we also get this though, right?
> > 
> > If counter has changed, it's a config interrupt...
> 
> But we need an exit to read the counter. We can put the counter
> in memory but this looks suspiciously like a simplified VQ -
> so why not use a VQ then?

Because now a driver first gets the data from config space.  But from
then on, they have to get it from the vq, and ignore the config space.
That's a bit weird.

> > > If we do require config VQ anyway, why not use it to notify
> > > guest of config changes? Guest could pre-post an in buffer
> > > and host uses that.
> > 
> > We could, but it's an additional burden on each device.  vqs are cheap,
> > but not free.  And the config area is so damn convenient...
> 
> Not if you start playing with counters, checking it twice,
> reinvent all kind of barriers ...

None of that appears inside the driver, though.  And let's be honest,
it's not *that* bad (very approx code):

static u32 vp_get_gen(struct virtio_pci_device *vp_dev)
{
        u32 gen;
        do {
                gen = ioread32(vp_dev->ioaddr + VIRTIO_PCI_CONFIG_GEN);
        } while (unlikely((gen & 1) == 1));

        virtio_rmb();
        return gen;
}

static bool vp_check_gen(struct virtio_pci_device *vp_dev, u32 gen)
{
        virtio_rmb();
        return ioread32(vp_dev->ioaddr + VIRTIO_PCI_CONFIG_GEN) == gen;
}

static void vp_get32(struct virtio_device *vdev, unsigned offset, u32 *buf)
{
	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
        u32 gen;

        do {
                gen = vp_get_gen(vdev);
                *buf = ioread32(vp_dev->ioaddr + VIRTIO_PCI_CONFIG(vp_dev) + offset);
        } while (unlikely(!vp_check_gen(vp_dev, gen)));
}
...

> > It was suggested by others, but I think TCP Acks are the classic one.
> > 12 + 14 + 20 + 40 = 86 bytes with virtio_net_hdr_mrg_rxbuf at the front.
> 
> That's only the simplest IPv4, right?
> Anyway, this spans multiple descriptors so this complicates allocation
> significantly.

Yes, I think any general-but-useful inline will need to span multiple
descriptors.  That's part of the fun!

Let's get totally crazy and implement our ring in stripes, like:

00 04 08 12 01 05 09 13 02 06 10 14 03 07 11 15

That way consecutive (32-byte) descriptors don't share a cacheline!
(Not serious... quiet.)

> > Yes, I'm thinking #define VIRTIO_F_VIRTIO2 (-1).  For PCI, this gets
> > mapped into a "are we using the new config layout?".  For others, it
> > gets mapped into a transport-specific feature.
> > 
> > (I'm sure you get it, but for the others) This is because I want to be
> > draw a clear line between all the legacy stuff at the same time, not
> > have to support part of it later because someone might not flip the
> > feature bit.
> 
> So my point is, config stuff and ring are completely separate,
> they are different layers.

Absolutely, and we should analyze them separately as well as together.

*But* for maintenance is far easier if we only have to test new
config+new ring and old config+old ring.  They do interact, because
remember, the allocation of the ring changes with new config, too...

Cheers,
Rusty.

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC 7/11] virtio_pci: new, capability-aware driver.
  2012-01-13  2:19                                                           ` Michael S. Tsirkin
  2012-01-13  2:32                                                             ` Benjamin Herrenschmidt
@ 2012-01-18  4:52                                                             ` Rusty Russell
  1 sibling, 0 replies; 106+ messages in thread
From: Rusty Russell @ 2012-01-18  4:52 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Pawel Moll, Benjamin Herrenschmidt, virtualization,
	Christian Borntraeger, Sasha Levin, Anthony Liguori

On Fri, 13 Jan 2012 04:19:30 +0200, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> On Thu, Jan 12, 2012 at 12:12:17PM +1030, Rusty Russell wrote:
> > On Thu, 12 Jan 2012 00:02:33 +0200, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > > Look, we have a race currently. Let us not tie a bug fix to a huge
> > > rewrite with unclear performance benefits, please.
> > 
> > In theory, yes.  In practice, we bandaid it.
> > 
> > I think in the short term we change ->get to get the entire sequence
> > twice, and check it's the same.  Theoretically, still racy, but it does
> > cut the window.  And we haven't seen the bug yet, either.
> 
> I thought about this some more. Since we always get
> an interrupt on config changes, it seems that a rather
> robust method would be to just synchronize against that.
> Something like the below (warning - completely untested).
> Still need to think about memory barriers, overflow etc.
> What do you think?

Ben's point about arbitrary delays in irq delivery still applies.  If
qemu changes config space then injects, there's a window there, too.

But in practice, like my suggested double-read, it probably slows things
down enough that it would be hard to hit.  Testing required...

Obviously, I'd like to fix this properly, but for legacy this might work
fine.

Cheers,
Rusty.

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC 7/11] virtio_pci: new, capability-aware driver.
  2012-01-13  2:32                                                             ` Benjamin Herrenschmidt
@ 2012-01-18 15:29                                                               ` Michael S. Tsirkin
  2012-01-22 22:53                                                                 ` Rusty Russell
  0 siblings, 1 reply; 106+ messages in thread
From: Michael S. Tsirkin @ 2012-01-18 15:29 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: Pawel Moll, virtualization, Christian Borntraeger, Sasha Levin,
	Anthony Liguori

On Fri, Jan 13, 2012 at 01:32:42PM +1100, Benjamin Herrenschmidt wrote:
> On Fri, 2012-01-13 at 04:19 +0200, Michael S. Tsirkin wrote:
> > On Thu, Jan 12, 2012 at 12:12:17PM +1030, Rusty Russell wrote:
> > > On Thu, 12 Jan 2012 00:02:33 +0200, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > > > Look, we have a race currently. Let us not tie a bug fix to a huge
> > > > rewrite with unclear performance benefits, please.
> > > 
> > > In theory, yes.  In practice, we bandaid it.
> > > 
> > > I think in the short term we change ->get to get the entire sequence
> > > twice, and check it's the same.  Theoretically, still racy, but it does
> > > cut the window.  And we haven't seen the bug yet, either.
> > 
> > I thought about this some more. Since we always get
> > an interrupt on config changes, it seems that a rather
> > robust method would be to just synchronize against that.
> > Something like the below (warning - completely untested).
> > Still need to think about memory barriers, overflow etc.
> > What do you think?
> 
> Your interrupt may take an unpredictable amount of time to arrive, I
> don't see how you can use that as a guarantee.
> 
> Cheers,
> Ben.

In theory yes.
Practically, under current KVM, what would delay it?


> > Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> > 
> > diff --git a/drivers/virtio/virtio_pci.c b/drivers/virtio/virtio_pci.c
> > index 03d1984..b5df385 100644
> > --- a/drivers/virtio/virtio_pci.c
> > +++ b/drivers/virtio/virtio_pci.c
> > @@ -57,6 +57,7 @@ struct virtio_pci_device
> >  	unsigned msix_used_vectors;
> >  	/* Whether we have vector per vq */
> >  	bool per_vq_vectors;
> > +	atomic_t config_changes;
> >  };
> >  
> >  /* Constants for MSI-X */
> > @@ -125,6 +126,19 @@ static void vp_finalize_features(struct virtio_device *vdev)
> >  	iowrite32(vdev->features[0], vp_dev->ioaddr+VIRTIO_PCI_GUEST_FEATURES);
> >  }
> >  
> > +/* wait for pending irq handlers */
> > +static void vp_synchronize_vectors(struct virtio_device *vdev)
> > +{
> > +	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
> > +	int i;
> > +
> > +	if (vp_dev->intx_enabled)
> > +		synchronize_irq(vp_dev->pci_dev->irq);
> > +
> > +	for (i = 0; i < vp_dev->msix_vectors; ++i)
> > +		synchronize_irq(vp_dev->msix_entries[i].vector);
> > +}
> > +
> >  /* virtio config->get() implementation */
> >  static void vp_get(struct virtio_device *vdev, unsigned offset,
> >  		   void *buf, unsigned len)
> > @@ -134,9 +148,20 @@ static void vp_get(struct virtio_device *vdev, unsigned offset,
> >  				VIRTIO_PCI_CONFIG(vp_dev) + offset;
> >  	u8 *ptr = buf;
> >  	int i;
> > -
> > -	for (i = 0; i < len; i++)
> > -		ptr[i] = ioread8(ioaddr + i);
> > +	int uninitialized_var(c);
> > +	c = atomic_read(&vp_dev->config_changes);
> > +	/* Make sure read is done before we get the first config byte */
> > +	rmb();
> > +	do {
> > +		for (i = 0; i < len; i++)
> > +			ptr[i] = ioread8(ioaddr + i);
> > +		/* Synchronize with config interrupt */
> > +		vp_synchronize_vectors(vdev);
> > +		/*
> > +		 * For multi-byte fields, we might get a config change interrupt
> > +		 * between byte reads. If this happens, retry the read.
> > +		 */
> > +	} while (c != atomic_read(&vp_dev->config_changes))
> >  }
> >  
> >  /* the config->set() implementation.  it's symmetric to the config->get()
> > @@ -169,19 +194,6 @@ static void vp_set_status(struct virtio_device *vdev, u8 status)
> >  	iowrite8(status, vp_dev->ioaddr + VIRTIO_PCI_STATUS);
> >  }
> >  
> > -/* wait for pending irq handlers */
> > -static void vp_synchronize_vectors(struct virtio_device *vdev)
> > -{
> > -	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
> > -	int i;
> > -
> > -	if (vp_dev->intx_enabled)
> > -		synchronize_irq(vp_dev->pci_dev->irq);
> > -
> > -	for (i = 0; i < vp_dev->msix_vectors; ++i)
> > -		synchronize_irq(vp_dev->msix_entries[i].vector);
> > -}
> > -
> >  static void vp_reset(struct virtio_device *vdev)
> >  {
> >  	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
> > @@ -213,6 +225,8 @@ static irqreturn_t vp_config_changed(int irq, void *opaque)
> >  	drv = container_of(vp_dev->vdev.dev.driver,
> >  			   struct virtio_driver, driver);
> >  
> > +	atomic_inc(&vp_dev->config_changes);
> > +
> >  	if (drv && drv->config_changed)
> >  		drv->config_changed(&vp_dev->vdev);
> >  	return IRQ_HANDLED;
> > @@ -646,6 +660,7 @@ static int __devinit virtio_pci_probe(struct pci_dev *pci_dev,
> >  	vp_dev->vdev.config = &virtio_pci_config_ops;
> >  	vp_dev->pci_dev = pci_dev;
> >  	INIT_LIST_HEAD(&vp_dev->virtqueues);
> > +	atomic_set(&vp_dev->config_changes, 0);
> >  	spin_lock_init(&vp_dev->lock);
> >  
> >  	/* Disable MSI/MSIX to bring device to a known good state. */
> 

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC 7/11] virtio_pci: new, capability-aware driver.
  2012-01-18 15:29                                                               ` Michael S. Tsirkin
@ 2012-01-22 22:53                                                                 ` Rusty Russell
  0 siblings, 0 replies; 106+ messages in thread
From: Rusty Russell @ 2012-01-22 22:53 UTC (permalink / raw)
  To: Michael S. Tsirkin, Benjamin Herrenschmidt
  Cc: Christian Borntraeger, Pawel Moll, Sasha Levin, Anthony Liguori,
	virtualization

On Wed, 18 Jan 2012 17:29:49 +0200, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> On Fri, Jan 13, 2012 at 01:32:42PM +1100, Benjamin Herrenschmidt wrote:
> > Your interrupt may take an unpredictable amount of time to arrive, I
> > don't see how you can use that as a guarantee.
> 
> In theory yes.
> Practically, under current KVM, what would delay it?

Indeed, if we're talking about a bandaid, this is fine.  For the config
change, I do think we need something more robust.

Thanks,
Rusty.

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC 7/11] virtio_pci: new, capability-aware driver.
  2011-12-12 11:49       ` Michael S. Tsirkin
  2011-12-12 18:10         ` Don Dutile
@ 2013-05-28  7:56         ` Michael S. Tsirkin
  2013-05-29  1:17           ` Rusty Russell
  1 sibling, 1 reply; 106+ messages in thread
From: Michael S. Tsirkin @ 2013-05-28  7:56 UTC (permalink / raw)
  To: Rusty Russell; +Cc: Sasha Levin, virtualization

On Mon, Dec 12, 2011 at 01:49:13PM +0200, Michael S. Tsirkin wrote:
> On Mon, Dec 12, 2011 at 09:15:03AM +1030, Rusty Russell wrote:
> > On Sun, 11 Dec 2011 11:42:56 +0200, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > > On Thu, Dec 08, 2011 at 09:09:33PM +1030, Rusty Russell wrote:
> > > > +/* There is no iowrite64.  We use two 32-bit ops. */
> > > > +static void iowrite64(u64 val, const __le64 *addr)
> > > > +{
> > > > +	iowrite32((u32)val, (__le32 *)addr);
> > > > +	iowrite32(val >> 32, (__le32 *)addr + 1);
> > > > +}
> > > > +
> > > 
> > > Let's put addr_lo/addr_hi in the structure then,
> > > to make the fact this field is not atomic explicit?
> > 
> > Good point, assuming I haven't missed something.
> > 
> > Are 64-bit accesses actually unknown in PCI-land?  Or is this a limited
> > availability thing?
> > 
> > Thanks,
> > Rusty.
> 
> I think PCI can optionally support atomic 64 bit accesses, but not all
> architectures can generate them.

Ping. Going to change this in the layout struct?

> -- 
> MST

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC 7/11] virtio_pci: new, capability-aware driver.
  2013-05-28  7:56         ` Michael S. Tsirkin
@ 2013-05-29  1:17           ` Rusty Russell
  2013-05-29 10:21             ` Michael S. Tsirkin
  0 siblings, 1 reply; 106+ messages in thread
From: Rusty Russell @ 2013-05-29  1:17 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Sasha Levin, virtualization

"Michael S. Tsirkin" <mst@redhat.com> writes:

> On Mon, Dec 12, 2011 at 01:49:13PM +0200, Michael S. Tsirkin wrote:
>> On Mon, Dec 12, 2011 at 09:15:03AM +1030, Rusty Russell wrote:
>> > On Sun, 11 Dec 2011 11:42:56 +0200, "Michael S. Tsirkin" <mst@redhat.com> wrote:
>> > > On Thu, Dec 08, 2011 at 09:09:33PM +1030, Rusty Russell wrote:
>> > > > +/* There is no iowrite64.  We use two 32-bit ops. */
>> > > > +static void iowrite64(u64 val, const __le64 *addr)
>> > > > +{
>> > > > +	iowrite32((u32)val, (__le32 *)addr);
>> > > > +	iowrite32(val >> 32, (__le32 *)addr + 1);
>> > > > +}
>> > > > +
>> > > 
>> > > Let's put addr_lo/addr_hi in the structure then,
>> > > to make the fact this field is not atomic explicit?
>> > 
>> > Good point, assuming I haven't missed something.
>> > 
>> > Are 64-bit accesses actually unknown in PCI-land?  Or is this a limited
>> > availability thing?
>> > 
>> > Thanks,
>> > Rusty.
>> 
>> I think PCI can optionally support atomic 64 bit accesses, but not all
>> architectures can generate them.
>
> Ping. Going to change this in the layout struct?

Not sure what you mean....  We use a queue_enable field, so it doesn't
matter if the accesses aren't atomic for that.

Cheers,
Rusty.

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC 7/11] virtio_pci: new, capability-aware driver.
  2013-05-29  1:17           ` Rusty Russell
@ 2013-05-29 10:21             ` Michael S. Tsirkin
  2013-05-30  2:17               ` Rusty Russell
  0 siblings, 1 reply; 106+ messages in thread
From: Michael S. Tsirkin @ 2013-05-29 10:21 UTC (permalink / raw)
  To: Rusty Russell; +Cc: Sasha Levin, virtualization

On Wed, May 29, 2013 at 10:47:52AM +0930, Rusty Russell wrote:
> "Michael S. Tsirkin" <mst@redhat.com> writes:
> 
> > On Mon, Dec 12, 2011 at 01:49:13PM +0200, Michael S. Tsirkin wrote:
> >> On Mon, Dec 12, 2011 at 09:15:03AM +1030, Rusty Russell wrote:
> >> > On Sun, 11 Dec 2011 11:42:56 +0200, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> >> > > On Thu, Dec 08, 2011 at 09:09:33PM +1030, Rusty Russell wrote:
> >> > > > +/* There is no iowrite64.  We use two 32-bit ops. */
> >> > > > +static void iowrite64(u64 val, const __le64 *addr)
> >> > > > +{
> >> > > > +	iowrite32((u32)val, (__le32 *)addr);
> >> > > > +	iowrite32(val >> 32, (__le32 *)addr + 1);
> >> > > > +}
> >> > > > +
> >> > > 
> >> > > Let's put addr_lo/addr_hi in the structure then,
> >> > > to make the fact this field is not atomic explicit?
> >> > 
> >> > Good point, assuming I haven't missed something.
> >> > 
> >> > Are 64-bit accesses actually unknown in PCI-land?  Or is this a limited
> >> > availability thing?
> >> > 
> >> > Thanks,
> >> > Rusty.
> >> 
> >> I think PCI can optionally support atomic 64 bit accesses, but not all
> >> architectures can generate them.
> >
> > Ping. Going to change this in the layout struct?
> 
> Not sure what you mean....  We use a queue_enable field, so it doesn't
> matter if the accesses aren't atomic for that.
> 
> Cheers,
> Rusty.

I mean the struct should have separate _lo and _hi fields.

Otherwise I have to do:

+    case offsetof(struct virtio_pci_common_cfg, queue_desc):
+	 assert(size == 4);
+        return virtio_queue_get_desc_addr(vdev, vdev->queue_sel) & low;
+    case offsetof(struct virtio_pci_common_cfg, queue_desc) + 4:
+	 assert(size == 4);
+        return virtio_queue_get_desc_addr(vdev, vdev->queue_sel) >> 32;

Would be nicer as:

+    case offsetof(struct virtio_pci_common_cfg, queue_desc_lo):
+	 assert(size == sizeof cfg.queue_desc_lo);
+        return virtio_queue_get_desc_addr(vdev, vdev->queue_sel) & low;
+    case offsetof(struct virtio_pci_common_cfg, queue_desc_hi):
+	 assert(size ==  sizeof cfg.queue_desc_hi);
+        return virtio_queue_get_desc_addr(vdev, vdev->queue_sel) >> 32;

Agree?

-- 
MST

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC 7/11] virtio_pci: new, capability-aware driver.
  2013-05-29 10:21             ` Michael S. Tsirkin
@ 2013-05-30  2:17               ` Rusty Russell
  0 siblings, 0 replies; 106+ messages in thread
From: Rusty Russell @ 2013-05-30  2:17 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Sasha Levin, virtualization

"Michael S. Tsirkin" <mst@redhat.com> writes:
> On Wed, May 29, 2013 at 10:47:52AM +0930, Rusty Russell wrote:
>> "Michael S. Tsirkin" <mst@redhat.com> writes:
>> 
>> > On Mon, Dec 12, 2011 at 01:49:13PM +0200, Michael S. Tsirkin wrote:
>> >> On Mon, Dec 12, 2011 at 09:15:03AM +1030, Rusty Russell wrote:
>> >> > On Sun, 11 Dec 2011 11:42:56 +0200, "Michael S. Tsirkin" <mst@redhat.com> wrote:
>> >> > > On Thu, Dec 08, 2011 at 09:09:33PM +1030, Rusty Russell wrote:
>> >> > > > +/* There is no iowrite64.  We use two 32-bit ops. */
>> >> > > > +static void iowrite64(u64 val, const __le64 *addr)
>> >> > > > +{
>> >> > > > +	iowrite32((u32)val, (__le32 *)addr);
>> >> > > > +	iowrite32(val >> 32, (__le32 *)addr + 1);
>> >> > > > +}
>> >> > > > +
>> >> > > 
>> >> > > Let's put addr_lo/addr_hi in the structure then,
>> >> > > to make the fact this field is not atomic explicit?
>> >> > 
>> >> > Good point, assuming I haven't missed something.
>> >> > 
>> >> > Are 64-bit accesses actually unknown in PCI-land?  Or is this a limited
>> >> > availability thing?
>> >> > 
>> >> > Thanks,
>> >> > Rusty.
>> >> 
>> >> I think PCI can optionally support atomic 64 bit accesses, but not all
>> >> architectures can generate them.
>> >
>> > Ping. Going to change this in the layout struct?
>> 
>> Not sure what you mean....  We use a queue_enable field, so it doesn't
>> matter if the accesses aren't atomic for that.
>> 
>> Cheers,
>> Rusty.
>
> I mean the struct should have separate _lo and _hi fields.
>
> Otherwise I have to do:
>
> +    case offsetof(struct virtio_pci_common_cfg, queue_desc):
> +	 assert(size == 4);
> +        return virtio_queue_get_desc_addr(vdev, vdev->queue_sel) & low;
> +    case offsetof(struct virtio_pci_common_cfg, queue_desc) + 4:
> +	 assert(size == 4);
> +        return virtio_queue_get_desc_addr(vdev, vdev->queue_sel) >> 32;
>
> Would be nicer as:
>
> +    case offsetof(struct virtio_pci_common_cfg, queue_desc_lo):
> +	 assert(size == sizeof cfg.queue_desc_lo);
> +        return virtio_queue_get_desc_addr(vdev, vdev->queue_sel) & low;
> +    case offsetof(struct virtio_pci_common_cfg, queue_desc_hi):
> +	 assert(size ==  sizeof cfg.queue_desc_hi);
> +        return virtio_queue_get_desc_addr(vdev, vdev->queue_sel) >> 32;
>
> Agree?

Indeed.  Sorry for being obtuse!

Split into _lo and _hi fields:

Subject: virtio_pci: split u64 fields into u32.

MST points out that since we write them as two u32s, and the backend has
to handle that, let's not fool ourselves and just make it two u32s.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>

diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c
index 8b35c2e..b5bb348 100644
--- a/drivers/virtio/virtio_pci_modern.c
+++ b/drivers/virtio/virtio_pci_modern.c
@@ -37,17 +37,16 @@ static DEFINE_PCI_DEVICE_TABLE(virtio_pci_id_table) = {
 
 MODULE_DEVICE_TABLE(pci, virtio_pci_id_table);
 
-/* There is no general iowrite64.  We use two 32-bit ops. */
-static void iowrite64_twopart(u64 val, const __le64 *addr)
+static void iowrite64_twopart(u64 val, __le32 *lo, __le32 *hi)
 {
-	iowrite32((u32)val, (__le32 *)addr);
-	iowrite32(val >> 32, (__le32 *)addr + 1);
+	iowrite32((u32)val, lo);
+	iowrite32(val >> 32, hi);
 }
 
 /* There is no ioread64.  We use two 32-bit ops. */
-static u64 ioread64_twopart(__le64 *addr)
+static u64 ioread64_twopart(__le32 *lo, __le32 *hi)
 {
-	return ioread32(addr) | ((u64)ioread32((__le32 *)addr + 1) << 32);
+	return ioread32(lo) | ((u64)ioread32(hi) << 32);
 }
 
 /* config->{get,set}_status() implementations */
@@ -145,14 +144,16 @@ static u64 vp_get64(struct virtio_device *vdev, unsigned offset)
 {
 	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
 
-	return ioread64_twopart(vp_dev->device + offset);
+	return ioread64_twopart(vp_dev->device + offset,
+				vp_dev->device + offset + 4);
 }
 
 static void vp_set64(struct virtio_device *vdev, unsigned offset, u64 val)
 {
 	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
 
-	iowrite64_twopart(val, vp_dev->device + offset);
+	iowrite64_twopart(val, vp_dev->device + offset,
+			  vp_dev->device + offset + 4);
 }
 
 static void vp_reset(struct virtio_device *vdev)
@@ -198,17 +199,18 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev,
 {
 	struct virtio_pci_vq_info *info;
 	struct virtqueue *vq;
+	struct virtio_pci_common_cfg __iomem *cfg = vp_dev->common;
 	u16 num, off;
 	int err;
 
-	if (index >= ioread16(&vp_dev->common->num_queues))
+	if (index >= ioread16(&cfg->num_queues))
 		return ERR_PTR(-ENOENT);
 
 	/* Select the queue we're interested in */
-	iowrite16(index, &vp_dev->common->queue_select);
+	iowrite16(index, &cfg->queue_select);
 
 	/* Sanity check */
-	switch (ioread64_twopart(&vp_dev->common->queue_desc)) {
+	switch (ioread64_twopart(&cfg->queue_desc_lo, &cfg->queue_desc_hi)) {
 		/* Uninitialized.  Excellent. */
 		break;
 	default:
@@ -217,7 +219,7 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev,
 	}
 
 	/* Maximum size must be a power of 2. */
-	num = ioread16(&vp_dev->common->queue_size);
+	num = ioread16(&cfg->queue_size);
 	if (num & (num - 1)) {
 		dev_warn(&vp_dev->pci_dev->dev, "bad queue size %u", num);
 		return ERR_PTR(-EINVAL);
@@ -233,7 +235,7 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev,
 	info->desired_num = num;
 
 	/* get offset of notification word for this vq (shouldn't wrap) */
-	off = ioread16(&vp_dev->common->queue_notify_off);
+	off = ioread16(&cfg->queue_notify_off);
 	if ((u64)off * vp_dev->notify_offset_multiplier + 2
 	    > vp_dev->notify_len) {
 		dev_warn(&vp_dev->pci_dev->dev,
@@ -264,8 +266,8 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev,
 	info->vq = vq;
 
 	if (msix_vec != VIRTIO_MSI_NO_VECTOR) {
-		iowrite16(msix_vec, &vp_dev->common->queue_msix_vector);
-		msix_vec = ioread16(&vp_dev->common->queue_msix_vector);
+		iowrite16(msix_vec, &cfg->queue_msix_vector);
+		msix_vec = ioread16(&cfg->queue_msix_vector);
 		if (msix_vec == VIRTIO_MSI_NO_VECTOR) {
 			err = -EBUSY;
 			goto out_new_virtqueue;
@@ -282,14 +284,14 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev,
 	}
 
 	/* Activate the queue. */
-	iowrite16(num, &vp_dev->common->queue_size);
+	iowrite16(num, &cfg->queue_size);
 	iowrite64_twopart(virt_to_phys(vq->vring.desc),
-			  &vp_dev->common->queue_desc);
+			  &cfg->queue_desc_lo, &cfg->queue_desc_hi);
 	iowrite64_twopart(virt_to_phys(vq->vring.avail),
-			  &vp_dev->common->queue_avail);
+			  &cfg->queue_avail_lo, &cfg->queue_avail_hi);
 	iowrite64_twopart(virt_to_phys(vq->vring.used),
-			  &vp_dev->common->queue_used);
-	iowrite8(1, &vp_dev->common->queue_enable);
+			  &cfg->queue_used_lo, &cfg->queue_used_hi);
+	iowrite8(1, &cfg->queue_enable);
 
 	return vq;
 
@@ -352,9 +354,9 @@ static void del_vq(struct virtqueue *vq)
 
 	/* This is for our own benefit, not the device's! */
 	iowrite16(info->desired_num, &vp_dev->common->queue_size);
-	iowrite64_twopart(0, &vp_dev->common->queue_desc);
-	iowrite64_twopart(0, &vp_dev->common->queue_avail);
-	iowrite64_twopart(0, &vp_dev->common->queue_used);
+	iowrite64_twopart(0, &vp_dev->common->queue_desc_lo, &vp_dev->common->queue_desc_hi);
+	iowrite64_twopart(0, &vp_dev->common->queue_avail_lo, &vp_dev->common->queue_avail_hi);
+	iowrite64_twopart(0, &vp_dev->common->queue_used_lo, &vp_dev->common->queue_used_hi);
 
 	free_pages_exact(info->queue, size);
 	kfree(info);
diff --git a/include/uapi/linux/virtio_pci.h b/include/uapi/linux/virtio_pci.h
index 591eb0e5..a23226b 100644
--- a/include/uapi/linux/virtio_pci.h
+++ b/include/uapi/linux/virtio_pci.h
@@ -159,8 +159,11 @@ struct virtio_pci_common_cfg {
 	__le16 queue_msix_vector;	/* read-write */
 	__le16 queue_enable;		/* read-write */
 	__le16 queue_notify_off;	/* read-only */
-	__le64 queue_desc;		/* read-write */
-	__le64 queue_avail;		/* read-write */
-	__le64 queue_used;		/* read-write */
+	__le32 queue_desc_lo;		/* read-write */
+	__le32 queue_desc_hi;		/* read-write */
+	__le32 queue_avail_lo;		/* read-write */
+	__le32 queue_avail_hi;		/* read-write */
+	__le32 queue_used_lo;		/* read-write */
+	__le32 queue_used_hi;		/* read-write */
 };
 #endif /* _UAPI_LINUX_VIRTIO_PCI_H */

^ permalink raw reply related	[flat|nested] 106+ messages in thread

end of thread, other threads:[~2013-05-30  2:17 UTC | newest]

Thread overview: 106+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-12-08 10:22 [PATCH 0/11] RFC: PCI using capabilities Rusty Russell
2011-12-08 10:30 ` [RFC 1/11] virtio: use u32, not bitmap for struct virtio_device's features Rusty Russell
2011-12-08 10:31 ` [RFC 2/11] virtio: add support for 64 bit features Rusty Russell
2011-12-08 10:32 ` [PATCH 0/11] RFC: PCI using capabilitities Sasha Levin
2011-12-08 10:32 ` [RFC 3/11] pci: add pci_iomap_range Rusty Russell
2011-12-15  8:30   ` Michael S. Tsirkin
2011-12-16  1:56     ` Rusty Russell
2011-12-26 14:05       ` [PATCHv2 RFC] " Michael S Tsirkin
2011-12-26 14:05         ` Michael S Tsirkin
2011-12-08 10:34 ` [RFC 4/11] virtio-pci: define layout for virtio vendor-specific capabilities Rusty Russell
2011-12-10 21:14   ` Sasha Levin
2011-12-08 10:35 ` [RFC 6/11] virtio_pci: move old defines to legacy, introduce new structure Rusty Russell
2011-12-08 10:38 ` [RFC 6/11] virtio_pci: don't use the legacy driver if we find the new PCI capabilities Rusty Russell
2011-12-10 21:18   ` Sasha Levin
2011-12-11  5:15     ` Rusty Russell
2011-12-11  9:37     ` Michael S. Tsirkin
2011-12-08 10:39 ` [RFC 7/11] virtio_pci: new, capability-aware driver Rusty Russell
2011-12-11  9:42   ` Michael S. Tsirkin
2011-12-11 22:45     ` Rusty Russell
2011-12-12 11:49       ` Michael S. Tsirkin
2011-12-12 18:10         ` Don Dutile
2011-12-16  1:58           ` Rusty Russell
2013-05-28  7:56         ` Michael S. Tsirkin
2013-05-29  1:17           ` Rusty Russell
2013-05-29 10:21             ` Michael S. Tsirkin
2013-05-30  2:17               ` Rusty Russell
2011-12-12 18:25       ` Michael S. Tsirkin
2011-12-13  2:21         ` Rusty Russell
2011-12-15  8:27           ` Michael S. Tsirkin
2011-12-16  1:50             ` Rusty Russell
2011-12-18 10:18               ` Michael S. Tsirkin
2011-12-19  6:06                 ` Rusty Russell
2011-12-19  9:13                   ` Michael S. Tsirkin
2011-12-19 11:08                     ` Rusty Russell
2011-12-20 11:37                       ` Michael S. Tsirkin
2011-12-21  0:33                         ` Rusty Russell
2011-12-21  9:19                           ` Michael S. Tsirkin
2012-01-10 17:03                           ` Michael S. Tsirkin
2012-01-11  0:25                             ` Rusty Russell
2012-01-11  1:48                               ` Benjamin Herrenschmidt
2012-01-11  8:47                               ` Stefan Hajnoczi
2012-01-11  9:10                                 ` Benjamin Herrenschmidt
2012-01-11 14:28                                   ` Stefan Hajnoczi
2012-01-11 15:39                                     ` Michael S. Tsirkin
2012-01-11 17:21                                       ` Stefan Hajnoczi
2012-01-11 18:34                                         ` Michael S. Tsirkin
2012-01-11 20:56                                         ` Benjamin Herrenschmidt
2012-01-11 20:46                                     ` Benjamin Herrenschmidt
2012-01-12 10:37                                       ` Stefan Hajnoczi
2012-01-11 10:21                               ` Michael S. Tsirkin
2012-01-11 21:13                                 ` Benjamin Herrenschmidt
2012-01-11 22:13                                   ` Michael S. Tsirkin
2012-01-11 22:19                                     ` Benjamin Herrenschmidt
2012-01-11 22:56                                     ` Benjamin Herrenschmidt
2012-01-12  5:29                                       ` Michael S. Tsirkin
2012-01-12  6:13                                         ` Benjamin Herrenschmidt
2012-01-12  6:37                                           ` Michael S. Tsirkin
2012-01-12  2:01                                 ` Rusty Russell
2012-01-12  4:31                                   ` Benjamin Herrenschmidt
2012-01-12  6:09                                     ` Michael S. Tsirkin
2012-01-12  6:28                                       ` Benjamin Herrenschmidt
2012-01-12  6:51                                         ` Michael S. Tsirkin
2012-01-12  7:03                                           ` Benjamin Herrenschmidt
2012-01-12  6:00                                   ` Michael S. Tsirkin
2012-01-13  3:22                                     ` Rusty Russell
2012-01-11 13:30                               ` Anthony Liguori
2012-01-11 15:12                                 ` Michael S. Tsirkin
2012-01-11 15:15                                   ` Anthony Liguori
2012-01-11 15:21                                     ` Michael S. Tsirkin
2012-01-11 15:28                                       ` Anthony Liguori
2012-01-11 15:45                                         ` Michael S. Tsirkin
2012-01-11 16:02                                           ` Anthony Liguori
2012-01-11 17:08                                             ` Michael S. Tsirkin
2012-01-11 19:42                                               ` Anthony Liguori
2012-01-11 20:14                                                 ` Michael S. Tsirkin
2012-01-11 20:26                                                   ` Anthony Liguori
2012-01-11 21:02                                                     ` Benjamin Herrenschmidt
2012-01-11 22:02                                                       ` Michael S. Tsirkin
2012-01-11 22:16                                                         ` Benjamin Herrenschmidt
2012-01-12  1:42                                                         ` Rusty Russell
2012-01-13  2:19                                                           ` Michael S. Tsirkin
2012-01-13  2:32                                                             ` Benjamin Herrenschmidt
2012-01-18 15:29                                                               ` Michael S. Tsirkin
2012-01-22 22:53                                                                 ` Rusty Russell
2012-01-18  4:52                                                             ` Rusty Russell
2012-01-11 21:58                                                     ` Michael S. Tsirkin
2012-01-11 20:59                                                 ` Benjamin Herrenschmidt
2012-01-11 20:51                                       ` Benjamin Herrenschmidt
2012-01-11 20:50                                   ` Benjamin Herrenschmidt
2012-01-12  1:35                                 ` Rusty Russell
2011-12-08 10:40 ` [RFC 8/11] virtio_pci: share structure between legacy and modern Rusty Russell
2011-12-08 10:41 ` [RFC 9/11] virtio_pci: share interrupt/notify handlers " Rusty Russell
2011-12-08 10:42 ` [RFC 10/11] virtio_pci: share virtqueue setup/teardown between modern and legacy driver Rusty Russell
2011-12-08 10:44 ` [RFC 11/11] virtio_pci: simplify common helpers Rusty Russell
2011-12-08 15:37 ` [PATCH 0/11] RFC: PCI using capabilities Sasha Levin
2011-12-08 15:37 ` Sasha Levin
2011-12-09  6:17   ` Rusty Russell
2011-12-10 21:32     ` Sasha Levin
2011-12-11  9:05   ` Avi Kivity
2011-12-11 10:03     ` Sasha Levin
2011-12-11 12:30       ` Michael S. Tsirkin
2011-12-11 12:48         ` Sasha Levin
2011-12-11 12:48         ` Sasha Levin
2011-12-11 12:47   ` Michael S. Tsirkin
2011-12-11 12:53     ` Sasha Levin
2011-12-11 12:53     ` Sasha Levin

This is an external index of several public inboxes;
see the mirroring instructions for how to clone and mirror
all data and code used by this external index.