* [RFC v2] virtio: support packed ring
@ 2018-04-01 14:12 Tiwei Bie
  2018-04-10  2:55 ` Jason Wang
                   ` (3 more replies)
  0 siblings, 4 replies; 28+ messages in thread
From: Tiwei Bie @ 2018-04-01 14:12 UTC (permalink / raw)
  To: mst, jasowang, wexu, virtualization, linux-kernel, netdev
  Cc: jfreimann, tiwei.bie

Hello everyone,

This RFC implements packed ring support for the virtio driver.

The code was tested with DPDK vhost (testpmd/vhost-PMD) implemented
by Jens (http://dpdk.org/ml/archives/dev/2018-January/089417.html).
Minor changes are needed for the vhost code, e.g. to kick the guest.
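
For readers new to the packed layout, here is a minimal sketch (not
part of the patch itself, endianness conversions omitted) of how the
VRING_DESC_F_AVAIL/VRING_DESC_F_USED bits added by this patch encode
descriptor state against the driver's ring wrap counter:

	/* The driver's wrap counter starts at 1 and is toggled each
	 * time the driver wraps from the end of the ring back to
	 * index 0. */
	static u16 avail_flags(bool wrap_counter)
	{
		/* Making a descriptor available: the AVAIL bit mirrors
		 * the wrap counter and the USED bit holds its inverse,
		 * so the two bits differ. */
		return VRING_DESC_F_AVAIL(wrap_counter) |
		       VRING_DESC_F_USED(!wrap_counter);
	}

	static bool desc_is_used(u16 flags)
	{
		/* The device marks a descriptor used by writing both
		 * bits with the same value, so equal bits mean "used"
		 * (cf. more_used_packed() in the patch). */
		bool avail = flags & VRING_DESC_F_AVAIL(1);
		bool used = flags & VRING_DESC_F_USED(1);

		return avail == used;
	}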

TODO:
- Refinements and bug fixes;
- Split into small patches;
- Test indirect descriptor support;
- Test/fix event suppression support;
- Test devices other than net;

RFC v1 -> RFC v2:
- Add indirect descriptor support - compile test only;
- Add event suppression support - compile test only;
- Move vring_packed_init() out of uapi (Jason, MST);
- Merge two loops into one in virtqueue_add_packed() (Jason);
- Split vring_unmap_one() for packed ring and split ring (Jason);
- Avoid using '%' operator (Jason; see the sketch after this list);
- Rename free_head -> next_avail_idx (Jason);
- Add comments for virtio_wmb() in virtqueue_add_packed() (Jason);
- Some other refinements and bug fixes;
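
The '%' item above refers to advancing ring indexes with an explicit
compare-and-reset instead of a modulo, toggling the wrap counter on
wrap-around, as virtqueue_add_packed() in the patch now does
(simplified here from the non-indirect path):

	i++;
	if (i >= vq->vring_packed.num) {
		i = 0;
		vq->wrap_counter ^= 1;
	}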

Thanks!

Signed-off-by: Tiwei Bie <tiwei.bie@intel.com>
---
 drivers/virtio/virtio_ring.c       | 1094 +++++++++++++++++++++++++++++-------
 include/linux/virtio_ring.h        |    8 +-
 include/uapi/linux/virtio_config.h |   12 +-
 include/uapi/linux/virtio_ring.h   |   61 ++
 4 files changed, 980 insertions(+), 195 deletions(-)

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 71458f493cf8..0515dca34d77 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -58,14 +58,15 @@
 
 struct vring_desc_state {
 	void *data;			/* Data for callback. */
-	struct vring_desc *indir_desc;	/* Indirect descriptor, if any. */
+	void *indir_desc;		/* Indirect descriptor, if any. */
+	int num;			/* Descriptor list length. */
 };
 
 struct vring_virtqueue {
 	struct virtqueue vq;
 
-	/* Actual memory layout for this queue */
-	struct vring vring;
+	/* Is this a packed ring? */
+	bool packed;
 
 	/* Can we use weak barriers? */
 	bool weak_barriers;
@@ -79,19 +80,45 @@ struct vring_virtqueue {
 	/* Host publishes avail event idx */
 	bool event;
 
-	/* Head of free buffer list. */
-	unsigned int free_head;
 	/* Number we've added since last sync. */
 	unsigned int num_added;
 
 	/* Last used index we've seen. */
 	u16 last_used_idx;
 
-	/* Last written value to avail->flags */
-	u16 avail_flags_shadow;
+	union {
+		/* Available for split ring */
+		struct {
+			/* Actual memory layout for this queue. */
+			struct vring vring;
 
-	/* Last written value to avail->idx in guest byte order */
-	u16 avail_idx_shadow;
+			/* Head of free buffer list. */
+			unsigned int free_head;
+
+			/* Last written value to avail->flags */
+			u16 avail_flags_shadow;
+
+			/* Last written value to avail->idx in
+			 * guest byte order. */
+			u16 avail_idx_shadow;
+		};
+
+		/* Available for packed ring */
+		struct {
+			/* Actual memory layout for this queue. */
+			struct vring_packed vring_packed;
+
+			/* Driver ring wrap counter. */
+			u8 wrap_counter;
+
+			/* Index of the next avail descriptor. */
+			unsigned int next_avail_idx;
+
+			/* Last written value to driver->flags in
+			 * guest byte order. */
+			u16 event_flags_shadow;
+		};
+	};
 
 	/* How to notify other side. FIXME: commonalize hcalls! */
 	bool (*notify)(struct virtqueue *vq);
@@ -201,8 +228,33 @@ static dma_addr_t vring_map_single(const struct vring_virtqueue *vq,
 			      cpu_addr, size, direction);
 }
 
-static void vring_unmap_one(const struct vring_virtqueue *vq,
-			    struct vring_desc *desc)
+static void vring_unmap_one_split(const struct vring_virtqueue *vq,
+				  struct vring_desc *desc)
+{
+	u16 flags;
+
+	if (!vring_use_dma_api(vq->vq.vdev))
+		return;
+
+	flags = virtio16_to_cpu(vq->vq.vdev, desc->flags);
+
+	if (flags & VRING_DESC_F_INDIRECT) {
+		dma_unmap_single(vring_dma_dev(vq),
+				 virtio64_to_cpu(vq->vq.vdev, desc->addr),
+				 virtio32_to_cpu(vq->vq.vdev, desc->len),
+				 (flags & VRING_DESC_F_WRITE) ?
+				 DMA_FROM_DEVICE : DMA_TO_DEVICE);
+	} else {
+		dma_unmap_page(vring_dma_dev(vq),
+			       virtio64_to_cpu(vq->vq.vdev, desc->addr),
+			       virtio32_to_cpu(vq->vq.vdev, desc->len),
+			       (flags & VRING_DESC_F_WRITE) ?
+			       DMA_FROM_DEVICE : DMA_TO_DEVICE);
+	}
+}
+
+static void vring_unmap_one_packed(const struct vring_virtqueue *vq,
+				   struct vring_packed_desc *desc)
 {
 	u16 flags;
 
@@ -235,8 +287,9 @@ static int vring_mapping_error(const struct vring_virtqueue *vq,
 	return dma_mapping_error(vring_dma_dev(vq), addr);
 }
 
-static struct vring_desc *alloc_indirect(struct virtqueue *_vq,
-					 unsigned int total_sg, gfp_t gfp)
+static struct vring_desc *alloc_indirect_split(struct virtqueue *_vq,
+					       unsigned int total_sg,
+					       gfp_t gfp)
 {
 	struct vring_desc *desc;
 	unsigned int i;
@@ -257,14 +310,32 @@ static struct vring_desc *alloc_indirect(struct virtqueue *_vq,
 	return desc;
 }
 
-static inline int virtqueue_add(struct virtqueue *_vq,
-				struct scatterlist *sgs[],
-				unsigned int total_sg,
-				unsigned int out_sgs,
-				unsigned int in_sgs,
-				void *data,
-				void *ctx,
-				gfp_t gfp)
+static struct vring_packed_desc *alloc_indirect_packed(struct virtqueue *_vq,
+						       unsigned int total_sg,
+						       gfp_t gfp)
+{
+	struct vring_packed_desc *desc;
+
+	/*
+	 * We require lowmem mappings for the descriptors because
+	 * otherwise virt_to_phys will give us bogus addresses in the
+	 * virtqueue.
+	 */
+	gfp &= ~__GFP_HIGHMEM;
+
+	desc = kmalloc(total_sg * sizeof(struct vring_packed_desc), gfp);
+
+	return desc;
+}
+
+static inline int virtqueue_add_split(struct virtqueue *_vq,
+				      struct scatterlist *sgs[],
+				      unsigned int total_sg,
+				      unsigned int out_sgs,
+				      unsigned int in_sgs,
+				      void *data,
+				      void *ctx,
+				      gfp_t gfp)
 {
 	struct vring_virtqueue *vq = to_vvq(_vq);
 	struct scatterlist *sg;
@@ -303,7 +374,7 @@ static inline int virtqueue_add(struct virtqueue *_vq,
 	/* If the host supports indirect descriptor tables, and we have multiple
 	 * buffers, then go indirect. FIXME: tune this threshold */
 	if (vq->indirect && total_sg > 1 && vq->vq.num_free)
-		desc = alloc_indirect(_vq, total_sg, gfp);
+		desc = alloc_indirect_split(_vq, total_sg, gfp);
 	else {
 		desc = NULL;
 		WARN_ON_ONCE(total_sg > vq->vring.num && !vq->indirect);
@@ -424,7 +495,7 @@ static inline int virtqueue_add(struct virtqueue *_vq,
 	for (n = 0; n < total_sg; n++) {
 		if (i == err_idx)
 			break;
-		vring_unmap_one(vq, &desc[i]);
+		vring_unmap_one_split(vq, &desc[i]);
 		i = virtio16_to_cpu(_vq->vdev, vq->vring.desc[i].next);
 	}
 
@@ -435,6 +506,210 @@ static inline int virtqueue_add(struct virtqueue *_vq,
 	return -EIO;
 }
 
+static inline int virtqueue_add_packed(struct virtqueue *_vq,
+				       struct scatterlist *sgs[],
+				       unsigned int total_sg,
+				       unsigned int out_sgs,
+				       unsigned int in_sgs,
+				       void *data,
+				       void *ctx,
+				       gfp_t gfp)
+{
+	struct vring_virtqueue *vq = to_vvq(_vq);
+	struct vring_packed_desc *desc;
+	struct scatterlist *sg;
+	unsigned int i, n, descs_used, uninitialized_var(prev), err_idx;
+	__virtio16 uninitialized_var(head_flags), flags;
+	int head, wrap_counter;
+	bool indirect;
+
+	START_USE(vq);
+
+	BUG_ON(data == NULL);
+	BUG_ON(ctx && vq->indirect);
+
+	if (unlikely(vq->broken)) {
+		END_USE(vq);
+		return -EIO;
+	}
+
+#ifdef DEBUG
+	{
+		ktime_t now = ktime_get();
+
+		/* No kick or get, with .1 second between?  Warn. */
+		if (vq->last_add_time_valid)
+			WARN_ON(ktime_to_ms(ktime_sub(now, vq->last_add_time))
+					    > 100);
+		vq->last_add_time = now;
+		vq->last_add_time_valid = true;
+	}
+#endif
+
+	BUG_ON(total_sg == 0);
+
+	head = vq->next_avail_idx;
+	wrap_counter = vq->wrap_counter;
+
+	/* If the host supports indirect descriptor tables, and we have multiple
+	 * buffers, then go indirect. FIXME: tune this threshold */
+	if (vq->indirect && total_sg > 1 && vq->vq.num_free)
+		desc = alloc_indirect_packed(_vq, total_sg, gfp);
+	else {
+		desc = NULL;
+		WARN_ON_ONCE(total_sg > vq->vring_packed.num && !vq->indirect);
+	}
+
+	if (desc) {
+		/* Use a single buffer which doesn't continue */
+		indirect = true;
+		/* Set up rest to use this indirect table. */
+		i = 0;
+		descs_used = 1;
+	} else {
+		indirect = false;
+		desc = vq->vring_packed.desc;
+		i = head;
+		descs_used = total_sg;
+	}
+
+	if (vq->vq.num_free < descs_used) {
+		pr_debug("Can't add buf len %i - avail = %i\n",
+			 descs_used, vq->vq.num_free);
+		/* FIXME: for historical reasons, we force a notify here if
+		 * there are outgoing parts to the buffer.  Presumably the
+		 * host should service the ring ASAP. */
+		if (out_sgs)
+			vq->notify(&vq->vq);
+		if (indirect)
+			kfree(desc);
+		END_USE(vq);
+		return -ENOSPC;
+	}
+
+	for (n = 0; n < out_sgs + in_sgs; n++) {
+		for (sg = sgs[n]; sg; sg = sg_next(sg)) {
+			dma_addr_t addr = vring_map_one_sg(vq, sg, n < out_sgs ?
+						DMA_TO_DEVICE : DMA_FROM_DEVICE);
+			if (vring_mapping_error(vq, addr))
+				goto unmap_release;
+
+			flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_NEXT |
+					(n < out_sgs ? 0 : VRING_DESC_F_WRITE) |
+					VRING_DESC_F_AVAIL(vq->wrap_counter) |
+					VRING_DESC_F_USED(!vq->wrap_counter));
+			if (!indirect && i == head)
+				head_flags = flags;
+			else
+				desc[i].flags = flags;
+
+			desc[i].addr = cpu_to_virtio64(_vq->vdev, addr);
+			desc[i].len = cpu_to_virtio32(_vq->vdev, sg->length);
+			desc[i].id = cpu_to_virtio32(_vq->vdev, head);
+			prev = i;
+			i++;
+			if (!indirect && i >= vq->vring_packed.num) {
+				i = 0;
+				vq->wrap_counter ^= 1;
+			}
+		}
+	}
+	/* Last one doesn't continue. */
+	if (total_sg == 1)
+		head_flags &= cpu_to_virtio16(_vq->vdev, ~VRING_DESC_F_NEXT);
+	else
+		desc[prev].flags &= cpu_to_virtio16(_vq->vdev, ~VRING_DESC_F_NEXT);
+
+	if (indirect) {
+		/* Now that the indirect table is filled in, map it. */
+		dma_addr_t addr = vring_map_single(
+			vq, desc, total_sg * sizeof(struct vring_packed_desc),
+			DMA_TO_DEVICE);
+		if (vring_mapping_error(vq, addr))
+			goto unmap_release;
+
+		head_flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_INDIRECT |
+					     VRING_DESC_F_AVAIL(wrap_counter) |
+					     VRING_DESC_F_USED(!wrap_counter));
+		vq->vring_packed.desc[head].addr = cpu_to_virtio64(_vq->vdev, addr);
+		vq->vring_packed.desc[head].len = cpu_to_virtio32(_vq->vdev,
+				total_sg * sizeof(struct vring_packed_desc));
+		vq->vring_packed.desc[head].id = cpu_to_virtio32(_vq->vdev, head);
+	}
+
+	/* We're using some buffers from the free list. */
+	vq->vq.num_free -= descs_used;
+
+	/* Update free pointer */
+	if (indirect) {
+		n = head + 1;
+		if (n >= vq->vring_packed.num) {
+			n = 0;
+			vq->wrap_counter ^= 1;
+		}
+		vq->next_avail_idx = n;
+	} else
+		vq->next_avail_idx = i;
+
+	/* Store token and indirect buffer state. */
+	vq->desc_state[head].num = descs_used;
+	vq->desc_state[head].data = data;
+	if (indirect)
+		vq->desc_state[head].indir_desc = desc;
+	else
+		vq->desc_state[head].indir_desc = ctx;
+
+	/* A driver MUST NOT make the first descriptor in the list
+	 * available before all subsequent descriptors comprising
+	 * the list are made available. */
+	virtio_wmb(vq->weak_barriers);
+	vq->vring_packed.desc[head].flags = head_flags;
+	vq->num_added++;
+
+	pr_debug("Added buffer head %i to %p\n", head, vq);
+	END_USE(vq);
+
+	return 0;
+
+unmap_release:
+	err_idx = i;
+	i = head;
+
+	for (n = 0; n < total_sg; n++) {
+		if (i == err_idx)
+			break;
+		vring_unmap_one_packed(vq, &desc[i]);
+		i++;
+		if (!indirect && i >= vq->vring_packed.num)
+			i = 0;
+	}
+
+	vq->wrap_counter = wrap_counter;
+
+	if (indirect)
+		kfree(desc);
+
+	END_USE(vq);
+	return -EIO;
+}
+
+static inline int virtqueue_add(struct virtqueue *_vq,
+				struct scatterlist *sgs[],
+				unsigned int total_sg,
+				unsigned int out_sgs,
+				unsigned int in_sgs,
+				void *data,
+				void *ctx,
+				gfp_t gfp)
+{
+	struct vring_virtqueue *vq = to_vvq(_vq);
+
+	return vq->packed ? virtqueue_add_packed(_vq, sgs, total_sg, out_sgs,
+						 in_sgs, data, ctx, gfp) :
+			    virtqueue_add_split(_vq, sgs, total_sg, out_sgs,
+						in_sgs, data, ctx, gfp);
+}
+
 /**
  * virtqueue_add_sgs - expose buffers to other end
  * @vq: the struct virtqueue we're talking about.
@@ -537,18 +812,7 @@ int virtqueue_add_inbuf_ctx(struct virtqueue *vq,
 }
 EXPORT_SYMBOL_GPL(virtqueue_add_inbuf_ctx);
 
-/**
- * virtqueue_kick_prepare - first half of split virtqueue_kick call.
- * @vq: the struct virtqueue
- *
- * Instead of virtqueue_kick(), you can do:
- *	if (virtqueue_kick_prepare(vq))
- *		virtqueue_notify(vq);
- *
- * This is sometimes useful because the virtqueue_kick_prepare() needs
- * to be serialized, but the actual virtqueue_notify() call does not.
- */
-bool virtqueue_kick_prepare(struct virtqueue *_vq)
+static bool virtqueue_kick_prepare_split(struct virtqueue *_vq)
 {
 	struct vring_virtqueue *vq = to_vvq(_vq);
 	u16 new, old;
@@ -580,6 +844,62 @@ bool virtqueue_kick_prepare(struct virtqueue *_vq)
 	END_USE(vq);
 	return needs_kick;
 }
+
+static bool virtqueue_kick_prepare_packed(struct virtqueue *_vq)
+{
+	struct vring_virtqueue *vq = to_vvq(_vq);
+	u16 new, old, off_wrap;
+	bool needs_kick;
+
+	START_USE(vq);
+	/* We need to expose the new flags value before checking notification
+	 * suppressions. */
+	virtio_mb(vq->weak_barriers);
+
+	old = vq->next_avail_idx - vq->num_added;
+	new = vq->next_avail_idx;
+	vq->num_added = 0;
+
+#ifdef DEBUG
+	if (vq->last_add_time_valid) {
+		WARN_ON(ktime_to_ms(ktime_sub(ktime_get(),
+					      vq->last_add_time)) > 100);
+	}
+	vq->last_add_time_valid = false;
+#endif
+
+	off_wrap = virtio16_to_cpu(_vq->vdev, vq->vring_packed.device->off_wrap);
+
+	if (vq->event) {
+		// FIXME: fix this!
+		needs_kick = ((off_wrap >> 15) == vq->wrap_counter) &&
+			     vring_need_event(off_wrap & ~(1<<15), new, old);
+	} else {
+		needs_kick = (vq->vring_packed.device->flags !=
+			      cpu_to_virtio16(_vq->vdev, VRING_EVENT_F_DISABLE));
+	}
+	END_USE(vq);
+	return needs_kick;
+}
+
+/**
+ * virtqueue_kick_prepare - first half of split virtqueue_kick call.
+ * @vq: the struct virtqueue
+ *
+ * Instead of virtqueue_kick(), you can do:
+ *	if (virtqueue_kick_prepare(vq))
+ *		virtqueue_notify(vq);
+ *
+ * This is sometimes useful because the virtqueue_kick_prepare() needs
+ * to be serialized, but the actual virtqueue_notify() call does not.
+ */
+bool virtqueue_kick_prepare(struct virtqueue *_vq)
+{
+	struct vring_virtqueue *vq = to_vvq(_vq);
+
+	return vq->packed ? virtqueue_kick_prepare_packed(_vq) :
+			    virtqueue_kick_prepare_split(_vq);
+}
 EXPORT_SYMBOL_GPL(virtqueue_kick_prepare);
 
 /**
@@ -626,8 +946,8 @@ bool virtqueue_kick(struct virtqueue *vq)
 }
 EXPORT_SYMBOL_GPL(virtqueue_kick);
 
-static void detach_buf(struct vring_virtqueue *vq, unsigned int head,
-		       void **ctx)
+static void detach_buf_split(struct vring_virtqueue *vq, unsigned int head,
+			     void **ctx)
 {
 	unsigned int i, j;
 	__virtio16 nextflag = cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_NEXT);
@@ -639,12 +959,12 @@ static void detach_buf(struct vring_virtqueue *vq, unsigned int head,
 	i = head;
 
 	while (vq->vring.desc[i].flags & nextflag) {
-		vring_unmap_one(vq, &vq->vring.desc[i]);
+		vring_unmap_one_split(vq, &vq->vring.desc[i]);
 		i = virtio16_to_cpu(vq->vq.vdev, vq->vring.desc[i].next);
 		vq->vq.num_free++;
 	}
 
-	vring_unmap_one(vq, &vq->vring.desc[i]);
+	vring_unmap_one_split(vq, &vq->vring.desc[i]);
 	vq->vring.desc[i].next = cpu_to_virtio16(vq->vq.vdev, vq->free_head);
 	vq->free_head = head;
 
@@ -666,7 +986,7 @@ static void detach_buf(struct vring_virtqueue *vq, unsigned int head,
 		BUG_ON(len == 0 || len % sizeof(struct vring_desc));
 
 		for (j = 0; j < len / sizeof(struct vring_desc); j++)
-			vring_unmap_one(vq, &indir_desc[j]);
+			vring_unmap_one_split(vq, &indir_desc[j]);
 
 		kfree(indir_desc);
 		vq->desc_state[head].indir_desc = NULL;
@@ -675,11 +995,207 @@ static void detach_buf(struct vring_virtqueue *vq, unsigned int head,
 	}
 }
 
-static inline bool more_used(const struct vring_virtqueue *vq)
+static int detach_buf_packed(struct vring_virtqueue *vq, unsigned int head,
+			      void **ctx)
+{
+	struct vring_packed_desc *desc;
+	unsigned int i, j;
+
+	/* Clear data ptr. */
+	vq->desc_state[head].data = NULL;
+
+	i = head;
+
+	for (j = 0; j < vq->desc_state[head].num; j++) {
+		desc = &vq->vring_packed.desc[i];
+		vring_unmap_one_packed(vq, desc);
+		desc->flags = 0x0;
+		i++;
+		if (i >= vq->vring_packed.num)
+			i = 0;
+	}
+
+	vq->vq.num_free += vq->desc_state[head].num;
+
+	if (vq->indirect) {
+		u32 len;
+
+		desc = vq->desc_state[head].indir_desc;
+		/* Free the indirect table, if any, now that it's unmapped. */
+		if (!desc)
+			goto out;
+
+		len = virtio32_to_cpu(vq->vq.vdev,
+				      vq->vring_packed.desc[head].len);
+
+		BUG_ON(!(vq->vring_packed.desc[head].flags &
+			 cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_INDIRECT)));
+		BUG_ON(len == 0 || len % sizeof(struct vring_packed_desc));
+
+		for (j = 0; j < len / sizeof(struct vring_packed_desc); j++)
+			vring_unmap_one_packed(vq, &desc[j]);
+
+		kfree(desc);
+		vq->desc_state[head].indir_desc = NULL;
+	} else if (ctx) {
+		*ctx = vq->desc_state[head].indir_desc;
+	}
+
+out:
+	return vq->desc_state[head].num;
+}
+
+static inline bool more_used_split(const struct vring_virtqueue *vq)
 {
 	return vq->last_used_idx != virtio16_to_cpu(vq->vq.vdev, vq->vring.used->idx);
 }
 
+static inline bool more_used_packed(const struct vring_virtqueue *vq)
+{
+	u16 last_used, flags;
+	bool avail, used;
+
+	if (vq->vq.num_free == vq->vring_packed.num)
+		return false;
+
+	last_used = vq->last_used_idx;
+	flags = virtio16_to_cpu(vq->vq.vdev,
+				vq->vring_packed.desc[last_used].flags);
+	avail = flags & VRING_DESC_F_AVAIL(1);
+	used = flags & VRING_DESC_F_USED(1);
+
+	return avail == used;
+}
+
+static inline bool more_used(const struct vring_virtqueue *vq)
+{
+	return vq->packed ? more_used_packed(vq) : more_used_split(vq);
+}
+
+void *virtqueue_get_buf_ctx_split(struct virtqueue *_vq, unsigned int *len,
+				  void **ctx)
+{
+	struct vring_virtqueue *vq = to_vvq(_vq);
+	void *ret;
+	unsigned int i;
+	u16 last_used;
+
+	START_USE(vq);
+
+	if (unlikely(vq->broken)) {
+		END_USE(vq);
+		return NULL;
+	}
+
+	if (!more_used(vq)) {
+		pr_debug("No more buffers in queue\n");
+		END_USE(vq);
+		return NULL;
+	}
+
+	/* Only get used array entries after they have been exposed by host. */
+	virtio_rmb(vq->weak_barriers);
+
+	last_used = (vq->last_used_idx & (vq->vring.num - 1));
+	i = virtio32_to_cpu(_vq->vdev, vq->vring.used->ring[last_used].id);
+	*len = virtio32_to_cpu(_vq->vdev, vq->vring.used->ring[last_used].len);
+
+	if (unlikely(i >= vq->vring.num)) {
+		BAD_RING(vq, "id %u out of range\n", i);
+		return NULL;
+	}
+	if (unlikely(!vq->desc_state[i].data)) {
+		BAD_RING(vq, "id %u is not a head!\n", i);
+		return NULL;
+	}
+
+	/* detach_buf_split clears data, so grab it now. */
+	ret = vq->desc_state[i].data;
+	detach_buf_split(vq, i, ctx);
+	vq->last_used_idx++;
+	/* If we expect an interrupt for the next entry, tell host
+	 * by writing event index and flush out the write before
+	 * the read in the next get_buf call. */
+	if (!(vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT))
+		virtio_store_mb(vq->weak_barriers,
+				&vring_used_event(&vq->vring),
+				cpu_to_virtio16(_vq->vdev, vq->last_used_idx));
+
+#ifdef DEBUG
+	vq->last_add_time_valid = false;
+#endif
+
+	END_USE(vq);
+	return ret;
+}
+
+void *virtqueue_get_buf_ctx_packed(struct virtqueue *_vq, unsigned int *len,
+				   void **ctx)
+{
+	struct vring_virtqueue *vq = to_vvq(_vq);
+	uint16_t wrap_counter;
+	void *ret;
+	unsigned int i;
+	u16 last_used;
+
+	START_USE(vq);
+
+	if (unlikely(vq->broken)) {
+		END_USE(vq);
+		return NULL;
+	}
+
+	if (!more_used(vq)) {
+		pr_debug("No more buffers in queue\n");
+		END_USE(vq);
+		return NULL;
+	}
+
+	/* Only get used elements after they have been exposed by host. */
+	virtio_rmb(vq->weak_barriers);
+
+	last_used = vq->last_used_idx;
+	i = virtio32_to_cpu(_vq->vdev, vq->vring_packed.desc[last_used].id);
+	*len = virtio32_to_cpu(_vq->vdev, vq->vring_packed.desc[last_used].len);
+
+	if (unlikely(i >= vq->vring_packed.num)) {
+		BAD_RING(vq, "id %u out of range\n", i);
+		return NULL;
+	}
+	if (unlikely(!vq->desc_state[i].data)) {
+		BAD_RING(vq, "id %u is not a head!\n", i);
+		return NULL;
+	}
+
+	/* detach_buf_packed clears data, so grab it now. */
+	ret = vq->desc_state[i].data;
+	detach_buf_packed(vq, i, ctx);
+
+	vq->last_used_idx += vq->desc_state[i].num;
+	if (vq->last_used_idx >= vq->vring_packed.num)
+		vq->last_used_idx -= vq->vring_packed.num;
+
+	wrap_counter = vq->wrap_counter;
+	if (vq->last_used_idx > vq->next_avail_idx)
+		wrap_counter ^= 1;
+
+	/* If we expect an interrupt for the next entry, tell host
+	 * by writing event index and flush out the write before
+	 * the read in the next get_buf call. */
+	if (vq->event_flags_shadow == VRING_EVENT_F_DESC)
+		virtio_store_mb(vq->weak_barriers,
+				&vq->vring_packed.driver->off_wrap,
+				cpu_to_virtio16(_vq->vdev, vq->last_used_idx |
+						wrap_counter << 15));
+
+#ifdef DEBUG
+	vq->last_add_time_valid = false;
+#endif
+
+	END_USE(vq);
+	return ret;
+}
+
 /**
  * virtqueue_get_buf - get the next used buffer
  * @vq: the struct virtqueue we're talking about.
@@ -700,57 +1216,9 @@ void *virtqueue_get_buf_ctx(struct virtqueue *_vq, unsigned int *len,
 			    void **ctx)
 {
 	struct vring_virtqueue *vq = to_vvq(_vq);
-	void *ret;
-	unsigned int i;
-	u16 last_used;
 
-	START_USE(vq);
-
-	if (unlikely(vq->broken)) {
-		END_USE(vq);
-		return NULL;
-	}
-
-	if (!more_used(vq)) {
-		pr_debug("No more buffers in queue\n");
-		END_USE(vq);
-		return NULL;
-	}
-
-	/* Only get used array entries after they have been exposed by host. */
-	virtio_rmb(vq->weak_barriers);
-
-	last_used = (vq->last_used_idx & (vq->vring.num - 1));
-	i = virtio32_to_cpu(_vq->vdev, vq->vring.used->ring[last_used].id);
-	*len = virtio32_to_cpu(_vq->vdev, vq->vring.used->ring[last_used].len);
-
-	if (unlikely(i >= vq->vring.num)) {
-		BAD_RING(vq, "id %u out of range\n", i);
-		return NULL;
-	}
-	if (unlikely(!vq->desc_state[i].data)) {
-		BAD_RING(vq, "id %u is not a head!\n", i);
-		return NULL;
-	}
-
-	/* detach_buf clears data, so grab it now. */
-	ret = vq->desc_state[i].data;
-	detach_buf(vq, i, ctx);
-	vq->last_used_idx++;
-	/* If we expect an interrupt for the next entry, tell host
-	 * by writing event index and flush out the write before
-	 * the read in the next get_buf call. */
-	if (!(vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT))
-		virtio_store_mb(vq->weak_barriers,
-				&vring_used_event(&vq->vring),
-				cpu_to_virtio16(_vq->vdev, vq->last_used_idx));
-
-#ifdef DEBUG
-	vq->last_add_time_valid = false;
-#endif
-
-	END_USE(vq);
-	return ret;
+	return vq->packed ? virtqueue_get_buf_ctx_packed(_vq, len, ctx) :
+			    virtqueue_get_buf_ctx_split(_vq, len, ctx);
 }
 EXPORT_SYMBOL_GPL(virtqueue_get_buf_ctx);
 
@@ -759,6 +1227,29 @@ void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len)
 	return virtqueue_get_buf_ctx(_vq, len, NULL);
 }
 EXPORT_SYMBOL_GPL(virtqueue_get_buf);
+
+static void virtqueue_disable_cb_split(struct virtqueue *_vq)
+{
+	struct vring_virtqueue *vq = to_vvq(_vq);
+
+	if (!(vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT)) {
+		vq->avail_flags_shadow |= VRING_AVAIL_F_NO_INTERRUPT;
+		if (!vq->event)
+			vq->vring.avail->flags = cpu_to_virtio16(_vq->vdev, vq->avail_flags_shadow);
+	}
+}
+
+static void virtqueue_disable_cb_packed(struct virtqueue *_vq)
+{
+	struct vring_virtqueue *vq = to_vvq(_vq);
+
+	if (vq->event_flags_shadow != VRING_EVENT_F_DISABLE) {
+		vq->event_flags_shadow = VRING_EVENT_F_DISABLE;
+		vq->vring_packed.driver->flags = cpu_to_virtio16(_vq->vdev,
+							vq->event_flags_shadow);
+	}
+}
+
 /**
  * virtqueue_disable_cb - disable callbacks
  * @vq: the struct virtqueue we're talking about.
@@ -772,15 +1263,66 @@ void virtqueue_disable_cb(struct virtqueue *_vq)
 {
 	struct vring_virtqueue *vq = to_vvq(_vq);
 
-	if (!(vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT)) {
-		vq->avail_flags_shadow |= VRING_AVAIL_F_NO_INTERRUPT;
-		if (!vq->event)
-			vq->vring.avail->flags = cpu_to_virtio16(_vq->vdev, vq->avail_flags_shadow);
-	}
-
+	if (vq->packed)
+		virtqueue_disable_cb_packed(_vq);
+	else
+		virtqueue_disable_cb_split(_vq);
 }
 EXPORT_SYMBOL_GPL(virtqueue_disable_cb);
 
+static unsigned virtqueue_enable_cb_prepare_split(struct virtqueue *_vq)
+{
+	struct vring_virtqueue *vq = to_vvq(_vq);
+	u16 last_used_idx;
+
+	START_USE(vq);
+
+	/* We optimistically turn back on interrupts, then check if there was
+	 * more to do. */
+	/* Depending on the VIRTIO_RING_F_EVENT_IDX feature, we need to
+	 * either clear the flags bit or point the event index at the next
+	 * entry. Always do both to keep code simple. */
+	if (vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT) {
+		vq->avail_flags_shadow &= ~VRING_AVAIL_F_NO_INTERRUPT;
+		if (!vq->event)
+			vq->vring.avail->flags = cpu_to_virtio16(_vq->vdev, vq->avail_flags_shadow);
+	}
+	vring_used_event(&vq->vring) = cpu_to_virtio16(_vq->vdev, last_used_idx = vq->last_used_idx);
+	END_USE(vq);
+	return last_used_idx;
+}
+
+static unsigned virtqueue_enable_cb_prepare_packed(struct virtqueue *_vq)
+{
+	struct vring_virtqueue *vq = to_vvq(_vq);
+	u16 last_used_idx, wrap_counter, off_wrap;
+
+	START_USE(vq);
+
+	last_used_idx = vq->last_used_idx;
+	wrap_counter = vq->wrap_counter;
+
+	if (last_used_idx > vq->next_avail_idx)
+		wrap_counter ^= 1;
+
+	off_wrap = last_used_idx | (wrap_counter << 15);
+
+	/* We optimistically turn back on interrupts, then check if there was
+	 * more to do. */
+	/* Depending on the VIRTIO_RING_F_EVENT_IDX feature, we need to
+	 * either clear the flags bit or point the event index at the next
+	 * entry. Always do both to keep code simple. */
+	if (vq->event_flags_shadow == VRING_EVENT_F_DISABLE) {
+		vq->event_flags_shadow = vq->event ? VRING_EVENT_F_DESC:
+						     VRING_EVENT_F_ENABLE;
+		vq->vring_packed.driver->flags = cpu_to_virtio16(_vq->vdev,
+							vq->event_flags_shadow);
+	}
+	vq->vring_packed.driver->off_wrap = cpu_to_virtio16(_vq->vdev, off_wrap);
+	END_USE(vq);
+	return last_used_idx;
+}
+
 /**
  * virtqueue_enable_cb_prepare - restart callbacks after disable_cb
  * @vq: the struct virtqueue we're talking about.
@@ -796,26 +1338,34 @@ EXPORT_SYMBOL_GPL(virtqueue_disable_cb);
 unsigned virtqueue_enable_cb_prepare(struct virtqueue *_vq)
 {
 	struct vring_virtqueue *vq = to_vvq(_vq);
-	u16 last_used_idx;
 
-	START_USE(vq);
-
-	/* We optimistically turn back on interrupts, then check if there was
-	 * more to do. */
-	/* Depending on the VIRTIO_RING_F_EVENT_IDX feature, we need to
-	 * either clear the flags bit or point the event index at the next
-	 * entry. Always do both to keep code simple. */
-	if (vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT) {
-		vq->avail_flags_shadow &= ~VRING_AVAIL_F_NO_INTERRUPT;
-		if (!vq->event)
-			vq->vring.avail->flags = cpu_to_virtio16(_vq->vdev, vq->avail_flags_shadow);
-	}
-	vring_used_event(&vq->vring) = cpu_to_virtio16(_vq->vdev, last_used_idx = vq->last_used_idx);
-	END_USE(vq);
-	return last_used_idx;
+	return vq->packed ? virtqueue_enable_cb_prepare_packed(_vq) :
+			    virtqueue_enable_cb_prepare_split(_vq);
 }
 EXPORT_SYMBOL_GPL(virtqueue_enable_cb_prepare);
 
+static bool virtqueue_poll_split(struct virtqueue *_vq, unsigned last_used_idx)
+{
+	struct vring_virtqueue *vq = to_vvq(_vq);
+
+	virtio_mb(vq->weak_barriers);
+	return (u16)last_used_idx != virtio16_to_cpu(_vq->vdev, vq->vring.used->idx);
+}
+
+static bool virtqueue_poll_packed(struct virtqueue *_vq, unsigned last_used_idx)
+{
+	struct vring_virtqueue *vq = to_vvq(_vq);
+	bool avail, used;
+	u16 flags;
+
+	virtio_mb(vq->weak_barriers);
+	flags = virtio16_to_cpu(vq->vq.vdev,
+			vq->vring_packed.desc[last_used_idx].flags);
+	avail = flags & VRING_DESC_F_AVAIL(1);
+	used = flags & VRING_DESC_F_USED(1);
+	return avail == used;
+}
+
 /**
  * virtqueue_poll - query pending used buffers
  * @vq: the struct virtqueue we're talking about.
@@ -829,8 +1379,8 @@ bool virtqueue_poll(struct virtqueue *_vq, unsigned last_used_idx)
 {
 	struct vring_virtqueue *vq = to_vvq(_vq);
 
-	virtio_mb(vq->weak_barriers);
-	return (u16)last_used_idx != virtio16_to_cpu(_vq->vdev, vq->vring.used->idx);
+	return vq->packed ? virtqueue_poll_packed(_vq, last_used_idx) :
+			    virtqueue_poll_split(_vq, last_used_idx);
 }
 EXPORT_SYMBOL_GPL(virtqueue_poll);
 
@@ -852,6 +1402,83 @@ bool virtqueue_enable_cb(struct virtqueue *_vq)
 }
 EXPORT_SYMBOL_GPL(virtqueue_enable_cb);
 
+static bool virtqueue_enable_cb_delayed_split(struct virtqueue *_vq)
+{
+	struct vring_virtqueue *vq = to_vvq(_vq);
+	u16 bufs;
+
+	START_USE(vq);
+
+	/* We optimistically turn back on interrupts, then check if there was
+	 * more to do. */
+	/* Depending on the VIRTIO_RING_F_USED_EVENT_IDX feature, we need to
+	 * either clear the flags bit or point the event index at the next
+	 * entry. Always update the event index to keep code simple. */
+	if (vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT) {
+		vq->avail_flags_shadow &= ~VRING_AVAIL_F_NO_INTERRUPT;
+		if (!vq->event)
+			vq->vring.avail->flags = cpu_to_virtio16(_vq->vdev, vq->avail_flags_shadow);
+	}
+	/* TODO: tune this threshold */
+	bufs = (u16)(vq->avail_idx_shadow - vq->last_used_idx) * 3 / 4;
+
+	virtio_store_mb(vq->weak_barriers,
+			&vring_used_event(&vq->vring),
+			cpu_to_virtio16(_vq->vdev, vq->last_used_idx + bufs));
+
+	if (unlikely((u16)(virtio16_to_cpu(_vq->vdev, vq->vring.used->idx) - vq->last_used_idx) > bufs)) {
+		END_USE(vq);
+		return false;
+	}
+
+	END_USE(vq);
+	return true;
+}
+
+static bool virtqueue_enable_cb_delayed_packed(struct virtqueue *_vq)
+{
+	struct vring_virtqueue *vq = to_vvq(_vq);
+	u16 bufs, off_wrap, used_idx, wrap_counter;
+
+	START_USE(vq);
+
+	/* We optimistically turn back on interrupts, then check if there was
+	 * more to do. */
+	/* Depending on the VIRTIO_RING_F_USED_EVENT_IDX feature, we need to
+	 * either clear the flags bit or point the event index at the next
+	 * entry. Always update the event index to keep code simple. */
+	if (vq->event_flags_shadow == VRING_EVENT_F_DISABLE) {
+		vq->event_flags_shadow = vq->event ? VRING_EVENT_F_DESC:
+						     VRING_EVENT_F_ENABLE;
+		vq->vring_packed.driver->flags = cpu_to_virtio16(_vq->vdev,
+							vq->event_flags_shadow);
+	}
+
+	/* TODO: tune this threshold */
+	bufs = (u16)(vq->next_avail_idx - vq->last_used_idx) * 3 / 4;
+
+	used_idx = vq->last_used_idx + bufs;
+	if (used_idx >= vq->vring_packed.num)
+		used_idx -= vq->vring_packed.num;
+
+	wrap_counter = vq->wrap_counter;
+	if (used_idx > vq->next_avail_idx)
+		wrap_counter ^= 1;
+
+	off_wrap = used_idx | (wrap_counter << 15);
+
+	virtio_store_mb(vq->weak_barriers, &vq->vring_packed.driver->off_wrap,
+			cpu_to_virtio16(_vq->vdev, off_wrap));
+
+	if (more_used_packed(vq)) {
+		END_USE(vq);
+		return false;
+	}
+
+	END_USE(vq);
+	return true;
+}
+
 /**
  * virtqueue_enable_cb_delayed - restart callbacks after disable_cb.
  * @vq: the struct virtqueue we're talking about.
@@ -868,37 +1495,69 @@ EXPORT_SYMBOL_GPL(virtqueue_enable_cb);
 bool virtqueue_enable_cb_delayed(struct virtqueue *_vq)
 {
 	struct vring_virtqueue *vq = to_vvq(_vq);
-	u16 bufs;
 
-	START_USE(vq);
-
-	/* We optimistically turn back on interrupts, then check if there was
-	 * more to do. */
-	/* Depending on the VIRTIO_RING_F_USED_EVENT_IDX feature, we need to
-	 * either clear the flags bit or point the event index at the next
-	 * entry. Always update the event index to keep code simple. */
-	if (vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT) {
-		vq->avail_flags_shadow &= ~VRING_AVAIL_F_NO_INTERRUPT;
-		if (!vq->event)
-			vq->vring.avail->flags = cpu_to_virtio16(_vq->vdev, vq->avail_flags_shadow);
-	}
-	/* TODO: tune this threshold */
-	bufs = (u16)(vq->avail_idx_shadow - vq->last_used_idx) * 3 / 4;
-
-	virtio_store_mb(vq->weak_barriers,
-			&vring_used_event(&vq->vring),
-			cpu_to_virtio16(_vq->vdev, vq->last_used_idx + bufs));
-
-	if (unlikely((u16)(virtio16_to_cpu(_vq->vdev, vq->vring.used->idx) - vq->last_used_idx) > bufs)) {
-		END_USE(vq);
-		return false;
-	}
-
-	END_USE(vq);
-	return true;
+	return vq->packed ? virtqueue_enable_cb_delayed_packed(_vq) :
+			    virtqueue_enable_cb_delayed_split(_vq);
 }
 EXPORT_SYMBOL_GPL(virtqueue_enable_cb_delayed);
 
+static void *virtqueue_detach_unused_buf_split(struct virtqueue *_vq)
+{
+	struct vring_virtqueue *vq = to_vvq(_vq);
+	unsigned int i;
+	void *buf;
+
+	START_USE(vq);
+
+	for (i = 0; i < vq->vring.num; i++) {
+		if (!vq->desc_state[i].data)
+			continue;
+		/* detach_buf clears data, so grab it now. */
+		buf = vq->desc_state[i].data;
+		detach_buf_split(vq, i, NULL);
+		vq->avail_idx_shadow--;
+		vq->vring.avail->idx = cpu_to_virtio16(_vq->vdev, vq->avail_idx_shadow);
+		END_USE(vq);
+		return buf;
+	}
+	/* That should have freed everything. */
+	BUG_ON(vq->vq.num_free != vq->vring.num);
+
+	END_USE(vq);
+	return NULL;
+}
+
+static void *virtqueue_detach_unused_buf_packed(struct virtqueue *_vq)
+{
+	struct vring_virtqueue *vq = to_vvq(_vq);
+	unsigned int i, num;
+	void *buf;
+
+	START_USE(vq);
+
+	for (i = 0; i < vq->vring_packed.num; i++) {
+		if (!vq->desc_state[i].data)
+			continue;
+		/* detach_buf clears data, so grab it now. */
+		buf = vq->desc_state[i].data;
+		num = detach_buf_packed(vq, i, NULL);
+		if (vq->next_avail_idx < num) {
+			vq->next_avail_idx = vq->vring_packed.num -
+					(num - vq->next_avail_idx);
+			vq->wrap_counter ^= 1;
+		} else {
+			vq->next_avail_idx -= num;
+		}
+		END_USE(vq);
+		return buf;
+	}
+	/* That should have freed everything. */
+	BUG_ON(vq->vq.num_free != vq->vring_packed.num);
+
+	END_USE(vq);
+	return NULL;
+}
+
 /**
  * virtqueue_detach_unused_buf - detach first unused buffer
  * @vq: the struct virtqueue we're talking about.
@@ -910,27 +1569,9 @@ EXPORT_SYMBOL_GPL(virtqueue_enable_cb_delayed);
 void *virtqueue_detach_unused_buf(struct virtqueue *_vq)
 {
 	struct vring_virtqueue *vq = to_vvq(_vq);
-	unsigned int i;
-	void *buf;
 
-	START_USE(vq);
-
-	for (i = 0; i < vq->vring.num; i++) {
-		if (!vq->desc_state[i].data)
-			continue;
-		/* detach_buf clears data, so grab it now. */
-		buf = vq->desc_state[i].data;
-		detach_buf(vq, i, NULL);
-		vq->avail_idx_shadow--;
-		vq->vring.avail->idx = cpu_to_virtio16(_vq->vdev, vq->avail_idx_shadow);
-		END_USE(vq);
-		return buf;
-	}
-	/* That should have freed everything. */
-	BUG_ON(vq->vq.num_free != vq->vring.num);
-
-	END_USE(vq);
-	return NULL;
+	return vq->packed ? virtqueue_detach_unused_buf_packed(_vq) :
+			    virtqueue_detach_unused_buf_split(_vq);
 }
 EXPORT_SYMBOL_GPL(virtqueue_detach_unused_buf);
 
@@ -955,7 +1596,8 @@ irqreturn_t vring_interrupt(int irq, void *_vq)
 EXPORT_SYMBOL_GPL(vring_interrupt);
 
 struct virtqueue *__vring_new_virtqueue(unsigned int index,
-					struct vring vring,
+					union vring_union vring,
+					bool packed,
 					struct virtio_device *vdev,
 					bool weak_barriers,
 					bool context,
@@ -963,19 +1605,20 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index,
 					void (*callback)(struct virtqueue *),
 					const char *name)
 {
-	unsigned int i;
+	unsigned int num, i;
 	struct vring_virtqueue *vq;
 
-	vq = kmalloc(sizeof(*vq) + vring.num * sizeof(struct vring_desc_state),
+	num = packed ? vring.vring_packed.num : vring.vring_split.num;
+
+	vq = kmalloc(sizeof(*vq) + num * sizeof(struct vring_desc_state),
 		     GFP_KERNEL);
 	if (!vq)
 		return NULL;
 
-	vq->vring = vring;
 	vq->vq.callback = callback;
 	vq->vq.vdev = vdev;
 	vq->vq.name = name;
-	vq->vq.num_free = vring.num;
+	vq->vq.num_free = num;
 	vq->vq.index = index;
 	vq->we_own_ring = false;
 	vq->queue_dma_addr = 0;
@@ -984,9 +1627,8 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index,
 	vq->weak_barriers = weak_barriers;
 	vq->broken = false;
 	vq->last_used_idx = 0;
-	vq->avail_flags_shadow = 0;
-	vq->avail_idx_shadow = 0;
 	vq->num_added = 0;
+	vq->packed = packed;
 	list_add_tail(&vq->vq.list, &vdev->vqs);
 #ifdef DEBUG
 	vq->in_use = false;
@@ -997,18 +1639,37 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index,
 		!context;
 	vq->event = virtio_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX);
 
+	if (vq->packed) {
+		vq->vring_packed = vring.vring_packed;
+		vq->next_avail_idx = 0;
+		vq->wrap_counter = 1;
+		vq->event_flags_shadow = 0;
+	} else {
+		vq->vring = vring.vring_split;
+		vq->avail_flags_shadow = 0;
+		vq->avail_idx_shadow = 0;
+
+		/* Put everything in free lists. */
+		vq->free_head = 0;
+		for (i = 0; i < num-1; i++)
+			vq->vring.desc[i].next = cpu_to_virtio16(vdev, i + 1);
+	}
+
 	/* No callback?  Tell other side not to bother us. */
 	if (!callback) {
-		vq->avail_flags_shadow |= VRING_AVAIL_F_NO_INTERRUPT;
-		if (!vq->event)
-			vq->vring.avail->flags = cpu_to_virtio16(vdev, vq->avail_flags_shadow);
+		if (packed) {
+			vq->event_flags_shadow = VRING_EVENT_F_DISABLE;
+			vq->vring_packed.driver->flags = cpu_to_virtio16(vdev,
+						vq->event_flags_shadow);
+		} else {
+			vq->avail_flags_shadow |= VRING_AVAIL_F_NO_INTERRUPT;
+			if (!vq->event)
+				vq->vring.avail->flags = cpu_to_virtio16(vdev,
+						vq->avail_flags_shadow);
+		}
 	}
 
-	/* Put everything in free lists. */
-	vq->free_head = 0;
-	for (i = 0; i < vring.num-1; i++)
-		vq->vring.desc[i].next = cpu_to_virtio16(vdev, i + 1);
-	memset(vq->desc_state, 0, vring.num * sizeof(struct vring_desc_state));
+	memset(vq->desc_state, 0, num * sizeof(struct vring_desc_state));
 
 	return &vq->vq;
 }
@@ -1056,6 +1717,22 @@ static void vring_free_queue(struct virtio_device *vdev, size_t size,
 	}
 }
 
+static inline int
+__vring_size(unsigned int num, unsigned long align, bool packed)
+{
+	return packed ? vring_packed_size(num, align) : vring_size(num, align);
+}
+
+static inline void vring_packed_init(struct vring_packed *vr, unsigned int num,
+				     void *p, unsigned long align)
+{
+	vr->num = num;
+	vr->desc = p;
+	vr->driver = (void *)(((uintptr_t)p + sizeof(struct vring_packed_desc)
+		* num + align - 1) & ~(align - 1));
+	vr->device = vr->driver + 1;
+}
+
 struct virtqueue *vring_create_virtqueue(
 	unsigned int index,
 	unsigned int num,
@@ -1072,7 +1749,8 @@ struct virtqueue *vring_create_virtqueue(
 	void *queue = NULL;
 	dma_addr_t dma_addr;
 	size_t queue_size_in_bytes;
-	struct vring vring;
+	union vring_union vring;
+	bool packed;
 
 	/* We assume num is a power of 2. */
 	if (num & (num - 1)) {
@@ -1080,9 +1758,13 @@ struct virtqueue *vring_create_virtqueue(
 		return NULL;
 	}
 
+	packed = virtio_has_feature(vdev, VIRTIO_F_RING_PACKED);
+
 	/* TODO: allocate each queue chunk individually */
-	for (; num && vring_size(num, vring_align) > PAGE_SIZE; num /= 2) {
-		queue = vring_alloc_queue(vdev, vring_size(num, vring_align),
+	for (; num && __vring_size(num, vring_align, packed) > PAGE_SIZE;
+			num /= 2) {
+		queue = vring_alloc_queue(vdev, __vring_size(num, vring_align,
+							     packed),
 					  &dma_addr,
 					  GFP_KERNEL|__GFP_NOWARN|__GFP_ZERO);
 		if (queue)
@@ -1094,17 +1776,21 @@ struct virtqueue *vring_create_virtqueue(
 
 	if (!queue) {
 		/* Try to get a single page. You are my only hope! */
-		queue = vring_alloc_queue(vdev, vring_size(num, vring_align),
+		queue = vring_alloc_queue(vdev, __vring_size(num, vring_align,
+							     packed),
 					  &dma_addr, GFP_KERNEL|__GFP_ZERO);
 	}
 	if (!queue)
 		return NULL;
 
-	queue_size_in_bytes = vring_size(num, vring_align);
-	vring_init(&vring, num, queue, vring_align);
+	queue_size_in_bytes = __vring_size(num, vring_align, packed);
+	if (packed)
+		vring_packed_init(&vring.vring_packed, num, queue, vring_align);
+	else
+		vring_init(&vring.vring_split, num, queue, vring_align);
 
-	vq = __vring_new_virtqueue(index, vring, vdev, weak_barriers, context,
-				   notify, callback, name);
+	vq = __vring_new_virtqueue(index, vring, packed, vdev, weak_barriers,
+				   context, notify, callback, name);
 	if (!vq) {
 		vring_free_queue(vdev, queue_size_in_bytes, queue,
 				 dma_addr);
@@ -1130,10 +1816,17 @@ struct virtqueue *vring_new_virtqueue(unsigned int index,
 				      void (*callback)(struct virtqueue *vq),
 				      const char *name)
 {
-	struct vring vring;
-	vring_init(&vring, num, pages, vring_align);
-	return __vring_new_virtqueue(index, vring, vdev, weak_barriers, context,
-				     notify, callback, name);
+	union vring_union vring;
+	bool packed;
+
+	packed = virtio_has_feature(vdev, VIRTIO_F_RING_PACKED);
+	if (packed)
+		vring_packed_init(&vring.vring_packed, num, pages, vring_align);
+	else
+		vring_init(&vring.vring_split, num, pages, vring_align);
+
+	return __vring_new_virtqueue(index, vring, packed, vdev, weak_barriers,
+				     context, notify, callback, name);
 }
 EXPORT_SYMBOL_GPL(vring_new_virtqueue);
 
@@ -1143,7 +1836,9 @@ void vring_del_virtqueue(struct virtqueue *_vq)
 
 	if (vq->we_own_ring) {
 		vring_free_queue(vq->vq.vdev, vq->queue_size_in_bytes,
-				 vq->vring.desc, vq->queue_dma_addr);
+				 vq->packed ? (void *)vq->vring_packed.desc :
+					      (void *)vq->vring.desc,
+				 vq->queue_dma_addr);
 	}
 	list_del(&_vq->list);
 	kfree(vq);
@@ -1157,14 +1852,18 @@ void vring_transport_features(struct virtio_device *vdev)
 
 	for (i = VIRTIO_TRANSPORT_F_START; i < VIRTIO_TRANSPORT_F_END; i++) {
 		switch (i) {
-		case VIRTIO_RING_F_INDIRECT_DESC:
+#if 0
+		case VIRTIO_RING_F_INDIRECT_DESC: // FIXME not tested yet.
 			break;
-		case VIRTIO_RING_F_EVENT_IDX:
+		case VIRTIO_RING_F_EVENT_IDX: // FIXME probably not work.
 			break;
+#endif
 		case VIRTIO_F_VERSION_1:
 			break;
 		case VIRTIO_F_IOMMU_PLATFORM:
 			break;
+		case VIRTIO_F_RING_PACKED:
+			break;
 		default:
 			/* We don't understand this bit. */
 			__virtio_clear_bit(vdev, i);
@@ -1185,7 +1884,7 @@ unsigned int virtqueue_get_vring_size(struct virtqueue *_vq)
 
 	struct vring_virtqueue *vq = to_vvq(_vq);
 
-	return vq->vring.num;
+	return vq->packed ? vq->vring_packed.num : vq->vring.num;
 }
 EXPORT_SYMBOL_GPL(virtqueue_get_vring_size);
 
@@ -1228,6 +1927,10 @@ dma_addr_t virtqueue_get_avail_addr(struct virtqueue *_vq)
 
 	BUG_ON(!vq->we_own_ring);
 
+	if (vq->packed)
+		return vq->queue_dma_addr + ((char *)vq->vring_packed.driver -
+				(char *)vq->vring_packed.desc);
+
 	return vq->queue_dma_addr +
 		((char *)vq->vring.avail - (char *)vq->vring.desc);
 }
@@ -1239,11 +1942,16 @@ dma_addr_t virtqueue_get_used_addr(struct virtqueue *_vq)
 
 	BUG_ON(!vq->we_own_ring);
 
+	if (vq->packed)
+		return vq->queue_dma_addr + ((char *)vq->vring_packed.device -
+				(char *)vq->vring_packed.desc);
+
 	return vq->queue_dma_addr +
 		((char *)vq->vring.used - (char *)vq->vring.desc);
 }
 EXPORT_SYMBOL_GPL(virtqueue_get_used_addr);
 
+/* Only available for split ring */
 const struct vring *virtqueue_get_vring(struct virtqueue *vq)
 {
 	return &to_vvq(vq)->vring;
diff --git a/include/linux/virtio_ring.h b/include/linux/virtio_ring.h
index bbf32524ab27..a0075894ad16 100644
--- a/include/linux/virtio_ring.h
+++ b/include/linux/virtio_ring.h
@@ -60,6 +60,11 @@ static inline void virtio_store_mb(bool weak_barriers,
 struct virtio_device;
 struct virtqueue;
 
+union vring_union {
+	struct vring vring_split;
+	struct vring_packed vring_packed;
+};
+
 /*
  * Creates a virtqueue and allocates the descriptor ring.  If
  * may_reduce_num is set, then this may allocate a smaller ring than
@@ -79,7 +84,8 @@ struct virtqueue *vring_create_virtqueue(unsigned int index,
 
 /* Creates a virtqueue with a custom layout. */
 struct virtqueue *__vring_new_virtqueue(unsigned int index,
-					struct vring vring,
+					union vring_union vring,
+					bool packed,
 					struct virtio_device *vdev,
 					bool weak_barriers,
 					bool ctx,
diff --git a/include/uapi/linux/virtio_config.h b/include/uapi/linux/virtio_config.h
index 308e2096291f..a6e392325e3a 100644
--- a/include/uapi/linux/virtio_config.h
+++ b/include/uapi/linux/virtio_config.h
@@ -49,7 +49,7 @@
  * transport being used (eg. virtio_ring), the rest are per-device feature
  * bits. */
 #define VIRTIO_TRANSPORT_F_START	28
-#define VIRTIO_TRANSPORT_F_END		34
+#define VIRTIO_TRANSPORT_F_END		36
 
 #ifndef VIRTIO_CONFIG_NO_LEGACY
 /* Do we get callbacks when the ring is completely used, even if we've
@@ -71,4 +71,14 @@
  * this is for compatibility with legacy systems.
  */
 #define VIRTIO_F_IOMMU_PLATFORM		33
+
+/* This feature indicates support for the packed virtqueue layout. */
+#define VIRTIO_F_RING_PACKED		34
+
+/*
+ * This feature indicates that all buffers are used by the device
+ * in the same order in which they have been made available.
+ */
+#define VIRTIO_F_IN_ORDER		35
+
 #endif /* _UAPI_LINUX_VIRTIO_CONFIG_H */
diff --git a/include/uapi/linux/virtio_ring.h b/include/uapi/linux/virtio_ring.h
index 6d5d5faa989b..735d4207c988 100644
--- a/include/uapi/linux/virtio_ring.h
+++ b/include/uapi/linux/virtio_ring.h
@@ -44,6 +44,9 @@
 /* This means the buffer contains a list of buffer descriptors. */
 #define VRING_DESC_F_INDIRECT	4
 
+#define VRING_DESC_F_AVAIL(b)	((b) << 7)
+#define VRING_DESC_F_USED(b)	((b) << 15)
+
 /* The Host uses this in used->flags to advise the Guest: don't kick me when
  * you add a buffer.  It's unreliable, so it's simply an optimization.  Guest
  * will still kick if it's out of buffers. */
@@ -53,6 +56,10 @@
  * optimization.  */
 #define VRING_AVAIL_F_NO_INTERRUPT	1
 
+#define VRING_EVENT_F_ENABLE	0x0
+#define VRING_EVENT_F_DISABLE	0x1
+#define VRING_EVENT_F_DESC	0x2
+
 /* We support indirect buffer descriptors */
 #define VIRTIO_RING_F_INDIRECT_DESC	28
 
@@ -171,4 +178,58 @@ static inline int vring_need_event(__u16 event_idx, __u16 new_idx, __u16 old)
 	return (__u16)(new_idx - event_idx - 1) < (__u16)(new_idx - old);
 }
 
+struct vring_packed_desc_event {
+	/* __virtio16 off  : 15; // Descriptor Event Offset
+	 * __virtio16 wrap : 1;  // Descriptor Event Wrap Counter */
+	__virtio16 off_wrap;
+	/* __virtio16 flags : 2; // Descriptor Event Flags */
+	__virtio16 flags;
+};
+
+struct vring_packed_desc {
+	/* Buffer Address. */
+	__virtio64 addr;
+	/* Buffer Length. */
+	__virtio32 len;
+	/* Buffer ID. */
+	__virtio16 id;
+	/* The flags depending on descriptor type. */
+	__virtio16 flags;
+};
+
+struct vring_packed {
+	unsigned int num;
+
+	struct vring_packed_desc *desc;
+
+	struct vring_packed_desc_event *driver;
+
+	struct vring_packed_desc_event *device;
+};
+
+/* The standard layout for the packed ring is a continuous chunk of memory
+ * which looks like this.
+ *
+ * struct vring_packed
+ * {
+ *	// The actual descriptors (16 bytes each)
+ *	struct vring_packed_desc desc[num];
+ *
+ *	// Padding to the next align boundary.
+ *	char pad[];
+ *
+ *	// Driver Event Suppression
+ *	struct vring_packed_desc_event driver;
+ *
+ *	// Device Event Suppression
+ *	struct vring_packed_desc_event device;
+ * };
+ */
+
+static inline unsigned vring_packed_size(unsigned int num, unsigned long align)
+{
+	return ((sizeof(struct vring_packed_desc) * num + align - 1)
+		& ~(align - 1)) + sizeof(struct vring_packed_desc_event) * 2;
+}
+
 #endif /* _UAPI_LINUX_VIRTIO_RING_H */
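
As a worked example of the sizing helper at the end of the diff (a
sketch assuming num = 256 descriptors and the 4096-byte alignment that
legacy virtio uses):

	/* desc area : sizeof(struct vring_packed_desc) * 256
	 *           = 16 * 256 = 4096 bytes (already 4096-aligned)
	 * events    : 2 * sizeof(struct vring_packed_desc_event)
	 *           = 2 * 4 = 8 bytes
	 * total     : vring_packed_size(256, 4096) = 4104 bytes */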
-- 
2.11.0


* Re: [RFC v2] virtio: support packed ring
  2018-04-01 14:12 [RFC v2] virtio: support packed ring Tiwei Bie
@ 2018-04-10  2:55 ` Jason Wang
  2018-04-10  3:21   ` Tiwei Bie
  2018-04-13  4:30 ` Jason Wang
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 28+ messages in thread
From: Jason Wang @ 2018-04-10  2:55 UTC (permalink / raw)
  To: Tiwei Bie, mst, wexu, virtualization, linux-kernel, netdev; +Cc: jfreimann



On 2018-04-01 22:12, Tiwei Bie wrote:
> Hello everyone,
>
> This RFC implements packed ring support for the virtio driver.
>
> The code was tested with DPDK vhost (testpmd/vhost-PMD) implemented
> by Jens (http://dpdk.org/ml/archives/dev/2018-January/089417.html).
> Minor changes are needed for the vhost code, e.g. to kick the guest.
>
> TODO:
> - Refinements and bug fixes;
> - Split into small patches;
> - Test indirect descriptor support;
> - Test/fix event suppression support;
> - Test devices other than net;
>
> RFC v1 -> RFC v2:
> - Add indirect descriptor support - compile test only;
> - Add event suppression support - compile test only;
> - Move vring_packed_init() out of uapi (Jason, MST);
> - Merge two loops into one in virtqueue_add_packed() (Jason);
> - Split vring_unmap_one() for packed ring and split ring (Jason);
> - Avoid using '%' operator (Jason);
> - Rename free_head -> next_avail_idx (Jason);
> - Add comments for virtio_wmb() in virtqueue_add_packed() (Jason);
> - Some other refinements and bug fixes;
>
> Thanks!

Will try to review this later.

But it would be better if you could split it (more than 1000 lines is
too big to review easily). E.g. you could at least split it into three
patches: new structures, datapath, and event suppression.

Thanks


>
> Signed-off-by: Tiwei Bie <tiwei.bie@intel.com>
> ---
>   drivers/virtio/virtio_ring.c       | 1094 +++++++++++++++++++++++++++++-------
>   include/linux/virtio_ring.h        |    8 +-
>   include/uapi/linux/virtio_config.h |   12 +-
>   include/uapi/linux/virtio_ring.h   |   61 ++
>   4 files changed, 980 insertions(+), 195 deletions(-)
>
> diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> index 71458f493cf8..0515dca34d77 100644
> --- a/drivers/virtio/virtio_ring.c
> +++ b/drivers/virtio/virtio_ring.c
> @@ -58,14 +58,15 @@
>   
>   struct vring_desc_state {
>   	void *data;			/* Data for callback. */
> -	struct vring_desc *indir_desc;	/* Indirect descriptor, if any. */
> +	void *indir_desc;		/* Indirect descriptor, if any. */
> +	int num;			/* Descriptor list length. */
>   };
>   
>   struct vring_virtqueue {
>   	struct virtqueue vq;
>   
> -	/* Actual memory layout for this queue */
> -	struct vring vring;
> +	/* Is this a packed ring? */
> +	bool packed;
>   
>   	/* Can we use weak barriers? */
>   	bool weak_barriers;
> @@ -79,19 +80,45 @@ struct vring_virtqueue {
>   	/* Host publishes avail event idx */
>   	bool event;
>   
> -	/* Head of free buffer list. */
> -	unsigned int free_head;
>   	/* Number we've added since last sync. */
>   	unsigned int num_added;
>   
>   	/* Last used index we've seen. */
>   	u16 last_used_idx;
>   
> -	/* Last written value to avail->flags */
> -	u16 avail_flags_shadow;
> +	union {
> +		/* Available for split ring */
> +		struct {
> +			/* Actual memory layout for this queue. */
> +			struct vring vring;
>   
> -	/* Last written value to avail->idx in guest byte order */
> -	u16 avail_idx_shadow;
> +			/* Head of free buffer list. */
> +			unsigned int free_head;
> +
> +			/* Last written value to avail->flags */
> +			u16 avail_flags_shadow;
> +
> +			/* Last written value to avail->idx in
> +			 * guest byte order. */
> +			u16 avail_idx_shadow;
> +		};
> +
> +		/* Available for packed ring */
> +		struct {
> +			/* Actual memory layout for this queue. */
> +			struct vring_packed vring_packed;
> +
> +			/* Driver ring wrap counter. */
> +			u8 wrap_counter;
> +
> +			/* Index of the next avail descriptor. */
> +			unsigned int next_avail_idx;
> +
> +			/* Last written value to driver->flags in
> +			 * guest byte order. */
> +			u16 event_flags_shadow;
> +		};
> +	};
>   
>   	/* How to notify other side. FIXME: commonalize hcalls! */
>   	bool (*notify)(struct virtqueue *vq);
> @@ -201,8 +228,33 @@ static dma_addr_t vring_map_single(const struct vring_virtqueue *vq,
>   			      cpu_addr, size, direction);
>   }
>   
> -static void vring_unmap_one(const struct vring_virtqueue *vq,
> -			    struct vring_desc *desc)
> +static void vring_unmap_one_split(const struct vring_virtqueue *vq,
> +				  struct vring_desc *desc)
> +{
> +	u16 flags;
> +
> +	if (!vring_use_dma_api(vq->vq.vdev))
> +		return;
> +
> +	flags = virtio16_to_cpu(vq->vq.vdev, desc->flags);
> +
> +	if (flags & VRING_DESC_F_INDIRECT) {
> +		dma_unmap_single(vring_dma_dev(vq),
> +				 virtio64_to_cpu(vq->vq.vdev, desc->addr),
> +				 virtio32_to_cpu(vq->vq.vdev, desc->len),
> +				 (flags & VRING_DESC_F_WRITE) ?
> +				 DMA_FROM_DEVICE : DMA_TO_DEVICE);
> +	} else {
> +		dma_unmap_page(vring_dma_dev(vq),
> +			       virtio64_to_cpu(vq->vq.vdev, desc->addr),
> +			       virtio32_to_cpu(vq->vq.vdev, desc->len),
> +			       (flags & VRING_DESC_F_WRITE) ?
> +			       DMA_FROM_DEVICE : DMA_TO_DEVICE);
> +	}
> +}
> +
> +static void vring_unmap_one_packed(const struct vring_virtqueue *vq,
> +				   struct vring_packed_desc *desc)
>   {
>   	u16 flags;
>   
> @@ -235,8 +287,9 @@ static int vring_mapping_error(const struct vring_virtqueue *vq,
>   	return dma_mapping_error(vring_dma_dev(vq), addr);
>   }
>   
> -static struct vring_desc *alloc_indirect(struct virtqueue *_vq,
> -					 unsigned int total_sg, gfp_t gfp)
> +static struct vring_desc *alloc_indirect_split(struct virtqueue *_vq,
> +					       unsigned int total_sg,
> +					       gfp_t gfp)
>   {
>   	struct vring_desc *desc;
>   	unsigned int i;
> @@ -257,14 +310,32 @@ static struct vring_desc *alloc_indirect(struct virtqueue *_vq,
>   	return desc;
>   }
>   
> -static inline int virtqueue_add(struct virtqueue *_vq,
> -				struct scatterlist *sgs[],
> -				unsigned int total_sg,
> -				unsigned int out_sgs,
> -				unsigned int in_sgs,
> -				void *data,
> -				void *ctx,
> -				gfp_t gfp)
> +static struct vring_packed_desc *alloc_indirect_packed(struct virtqueue *_vq,
> +						       unsigned int total_sg,
> +						       gfp_t gfp)
> +{
> +	struct vring_packed_desc *desc;
> +
> +	/*
> +	 * We require lowmem mappings for the descriptors because
> +	 * otherwise virt_to_phys will give us bogus addresses in the
> +	 * virtqueue.
> +	 */
> +	gfp &= ~__GFP_HIGHMEM;
> +
> +	desc = kmalloc(total_sg * sizeof(struct vring_packed_desc), gfp);
> +
> +	return desc;
> +}
> +
> +static inline int virtqueue_add_split(struct virtqueue *_vq,
> +				      struct scatterlist *sgs[],
> +				      unsigned int total_sg,
> +				      unsigned int out_sgs,
> +				      unsigned int in_sgs,
> +				      void *data,
> +				      void *ctx,
> +				      gfp_t gfp)
>   {
>   	struct vring_virtqueue *vq = to_vvq(_vq);
>   	struct scatterlist *sg;
> @@ -303,7 +374,7 @@ static inline int virtqueue_add(struct virtqueue *_vq,
>   	/* If the host supports indirect descriptor tables, and we have multiple
>   	 * buffers, then go indirect. FIXME: tune this threshold */
>   	if (vq->indirect && total_sg > 1 && vq->vq.num_free)
> -		desc = alloc_indirect(_vq, total_sg, gfp);
> +		desc = alloc_indirect_split(_vq, total_sg, gfp);
>   	else {
>   		desc = NULL;
>   		WARN_ON_ONCE(total_sg > vq->vring.num && !vq->indirect);
> @@ -424,7 +495,7 @@ static inline int virtqueue_add(struct virtqueue *_vq,
>   	for (n = 0; n < total_sg; n++) {
>   		if (i == err_idx)
>   			break;
> -		vring_unmap_one(vq, &desc[i]);
> +		vring_unmap_one_split(vq, &desc[i]);
>   		i = virtio16_to_cpu(_vq->vdev, vq->vring.desc[i].next);
>   	}
>   
> @@ -435,6 +506,210 @@ static inline int virtqueue_add(struct virtqueue *_vq,
>   	return -EIO;
>   }
>   
> +static inline int virtqueue_add_packed(struct virtqueue *_vq,
> +				       struct scatterlist *sgs[],
> +				       unsigned int total_sg,
> +				       unsigned int out_sgs,
> +				       unsigned int in_sgs,
> +				       void *data,
> +				       void *ctx,
> +				       gfp_t gfp)
> +{
> +	struct vring_virtqueue *vq = to_vvq(_vq);
> +	struct vring_packed_desc *desc;
> +	struct scatterlist *sg;
> +	unsigned int i, n, descs_used, uninitialized_var(prev), err_idx;
> +	__virtio16 uninitialized_var(head_flags), flags;
> +	int head, wrap_counter;
> +	bool indirect;
> +
> +	START_USE(vq);
> +
> +	BUG_ON(data == NULL);
> +	BUG_ON(ctx && vq->indirect);
> +
> +	if (unlikely(vq->broken)) {
> +		END_USE(vq);
> +		return -EIO;
> +	}
> +
> +#ifdef DEBUG
> +	{
> +		ktime_t now = ktime_get();
> +
> +		/* No kick or get, with .1 second between?  Warn. */
> +		if (vq->last_add_time_valid)
> +			WARN_ON(ktime_to_ms(ktime_sub(now, vq->last_add_time))
> +					    > 100);
> +		vq->last_add_time = now;
> +		vq->last_add_time_valid = true;
> +	}
> +#endif
> +
> +	BUG_ON(total_sg == 0);
> +
> +	head = vq->next_avail_idx;
> +	wrap_counter = vq->wrap_counter;
> +
> +	/* If the host supports indirect descriptor tables, and we have multiple
> +	 * buffers, then go indirect. FIXME: tune this threshold */
> +	if (vq->indirect && total_sg > 1 && vq->vq.num_free)
> +		desc = alloc_indirect_packed(_vq, total_sg, gfp);
> +	else {
> +		desc = NULL;
> +		WARN_ON_ONCE(total_sg > vq->vring_packed.num && !vq->indirect);
> +	}
> +
> +	if (desc) {
> +		/* Use a single buffer which doesn't continue */
> +		indirect = true;
> +		/* Set up rest to use this indirect table. */
> +		i = 0;
> +		descs_used = 1;
> +	} else {
> +		indirect = false;
> +		desc = vq->vring_packed.desc;
> +		i = head;
> +		descs_used = total_sg;
> +	}
> +
> +	if (vq->vq.num_free < descs_used) {
> +		pr_debug("Can't add buf len %i - avail = %i\n",
> +			 descs_used, vq->vq.num_free);
> +		/* FIXME: for historical reasons, we force a notify here if
> +		 * there are outgoing parts to the buffer.  Presumably the
> +		 * host should service the ring ASAP. */
> +		if (out_sgs)
> +			vq->notify(&vq->vq);
> +		if (indirect)
> +			kfree(desc);
> +		END_USE(vq);
> +		return -ENOSPC;
> +	}
> +
> +	for (n = 0; n < out_sgs + in_sgs; n++) {
> +		for (sg = sgs[n]; sg; sg = sg_next(sg)) {
> +			dma_addr_t addr = vring_map_one_sg(vq, sg, n < out_sgs ?
> +						DMA_TO_DEVICE : DMA_FROM_DEVICE);
> +			if (vring_mapping_error(vq, addr))
> +				goto unmap_release;
> +
> +			flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_NEXT |
> +					(n < out_sgs ? 0 : VRING_DESC_F_WRITE) |
> +					VRING_DESC_F_AVAIL(vq->wrap_counter) |
> +					VRING_DESC_F_USED(!vq->wrap_counter));
> +			if (!indirect && i == head)
> +				head_flags = flags;
> +			else
> +				desc[i].flags = flags;
> +
> +			desc[i].addr = cpu_to_virtio64(_vq->vdev, addr);
> +			desc[i].len = cpu_to_virtio32(_vq->vdev, sg->length);
> +			desc[i].id = cpu_to_virtio32(_vq->vdev, head);
> +			prev = i;
> +			i++;
> +			if (!indirect && i >= vq->vring_packed.num) {
> +				i = 0;
> +				vq->wrap_counter ^= 1;
> +			}
> +		}
> +	}
> +	/* Last one doesn't continue. */
> +	if (total_sg == 1)
> +		head_flags &= cpu_to_virtio16(_vq->vdev, ~VRING_DESC_F_NEXT);
> +	else
> +		desc[prev].flags &= cpu_to_virtio16(_vq->vdev, ~VRING_DESC_F_NEXT);
> +
> +	if (indirect) {
> +		/* Now that the indirect table is filled in, map it. */
> +		dma_addr_t addr = vring_map_single(
> +			vq, desc, total_sg * sizeof(struct vring_packed_desc),
> +			DMA_TO_DEVICE);
> +		if (vring_mapping_error(vq, addr))
> +			goto unmap_release;
> +
> +		head_flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_INDIRECT |
> +					     VRING_DESC_F_AVAIL(wrap_counter) |
> +					     VRING_DESC_F_USED(!wrap_counter));
> +		vq->vring_packed.desc[head].addr = cpu_to_virtio64(_vq->vdev, addr);
> +		vq->vring_packed.desc[head].len = cpu_to_virtio32(_vq->vdev,
> +				total_sg * sizeof(struct vring_packed_desc));
> +		vq->vring_packed.desc[head].id = cpu_to_virtio32(_vq->vdev, head);
> +	}
> +
> +	/* We're using some buffers from the free list. */
> +	vq->vq.num_free -= descs_used;
> +
> +	/* Update free pointer */
> +	if (indirect) {
> +		n = head + 1;
> +		if (n >= vq->vring_packed.num) {
> +			n = 0;
> +			vq->wrap_counter ^= 1;
> +		}
> +		vq->next_avail_idx = n;
> +	} else
> +		vq->next_avail_idx = i;
> +
> +	/* Store token and indirect buffer state. */
> +	vq->desc_state[head].num = descs_used;
> +	vq->desc_state[head].data = data;
> +	if (indirect)
> +		vq->desc_state[head].indir_desc = desc;
> +	else
> +		vq->desc_state[head].indir_desc = ctx;
> +
> +	/* A driver MUST NOT make the first descriptor in the list
> +	 * available before all subsequent descriptors comprising
> +	 * the list are made available. */
> +	virtio_wmb(vq->weak_barriers);
> +	vq->vring_packed.desc[head].flags = head_flags;
> +	vq->num_added++;
> +
> +	pr_debug("Added buffer head %i to %p\n", head, vq);
> +	END_USE(vq);
> +
> +	return 0;
> +
> +unmap_release:
> +	err_idx = i;
> +	i = head;
> +
> +	for (n = 0; n < total_sg; n++) {
> +		if (i == err_idx)
> +			break;
> +		vring_unmap_one_packed(vq, &desc[i]);
> +		i++;
> +		if (!indirect && i >= vq->vring_packed.num)
> +			i = 0;
> +	}
> +
> +	vq->wrap_counter = wrap_counter;
> +
> +	if (indirect)
> +		kfree(desc);
> +
> +	END_USE(vq);
> +	return -EIO;
> +}
> +
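The flag arithmetic above is easy to get lost in, so here is a minimal
standalone sketch (userspace C, my own illustration rather than kernel
code) of how the driver's wrap counter maps onto the AVAIL/USED bits
that this patch defines in virtio_ring.h:

#include <stdint.h>
#include <stdio.h>

#define VRING_DESC_F_AVAIL(b)	((uint16_t)(b) << 7)
#define VRING_DESC_F_USED(b)	((uint16_t)(b) << 15)

/* The driver publishes a descriptor by setting AVAIL to its current
 * wrap counter and USED to the inverse; the device later marks the
 * descriptor used by making USED equal to AVAIL. */
static uint16_t avail_flags(int wrap_counter)
{
	return VRING_DESC_F_AVAIL(wrap_counter) |
	       VRING_DESC_F_USED(!wrap_counter);
}

int main(void)
{
	printf("wrap=1: 0x%04x\n", avail_flags(1));	/* 0x0080 */
	printf("wrap=0: 0x%04x\n", avail_flags(0));	/* 0x8000 */
	return 0;
}
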
> +static inline int virtqueue_add(struct virtqueue *_vq,
> +				struct scatterlist *sgs[],
> +				unsigned int total_sg,
> +				unsigned int out_sgs,
> +				unsigned int in_sgs,
> +				void *data,
> +				void *ctx,
> +				gfp_t gfp)
> +{
> +	struct vring_virtqueue *vq = to_vvq(_vq);
> +
> +	return vq->packed ? virtqueue_add_packed(_vq, sgs, total_sg, out_sgs,
> +						 in_sgs, data, ctx, gfp) :
> +			    virtqueue_add_split(_vq, sgs, total_sg, out_sgs,
> +						in_sgs, data, ctx, gfp);
> +}
> +
>   /**
>    * virtqueue_add_sgs - expose buffers to other end
>    * @vq: the struct virtqueue we're talking about.
> @@ -537,18 +812,7 @@ int virtqueue_add_inbuf_ctx(struct virtqueue *vq,
>   }
>   EXPORT_SYMBOL_GPL(virtqueue_add_inbuf_ctx);
>   
> -/**
> - * virtqueue_kick_prepare - first half of split virtqueue_kick call.
> - * @vq: the struct virtqueue
> - *
> - * Instead of virtqueue_kick(), you can do:
> - *	if (virtqueue_kick_prepare(vq))
> - *		virtqueue_notify(vq);
> - *
> - * This is sometimes useful because the virtqueue_kick_prepare() needs
> - * to be serialized, but the actual virtqueue_notify() call does not.
> - */
> -bool virtqueue_kick_prepare(struct virtqueue *_vq)
> +static bool virtqueue_kick_prepare_split(struct virtqueue *_vq)
>   {
>   	struct vring_virtqueue *vq = to_vvq(_vq);
>   	u16 new, old;
> @@ -580,6 +844,62 @@ bool virtqueue_kick_prepare(struct virtqueue *_vq)
>   	END_USE(vq);
>   	return needs_kick;
>   }
> +
> +static bool virtqueue_kick_prepare_packed(struct virtqueue *_vq)
> +{
> +	struct vring_virtqueue *vq = to_vvq(_vq);
> +	u16 new, old, off_wrap;
> +	bool needs_kick;
> +
> +	START_USE(vq);
> +	/* We need to expose the new flags value before checking notification
> +	 * suppressions. */
> +	virtio_mb(vq->weak_barriers);
> +
> +	old = vq->next_avail_idx - vq->num_added;
> +	new = vq->next_avail_idx;
> +	vq->num_added = 0;
> +
> +#ifdef DEBUG
> +	if (vq->last_add_time_valid) {
> +		WARN_ON(ktime_to_ms(ktime_sub(ktime_get(),
> +					      vq->last_add_time)) > 100);
> +	}
> +	vq->last_add_time_valid = false;
> +#endif
> +
> +	off_wrap = virtio16_to_cpu(_vq->vdev, vq->vring_packed.device->off_wrap);
> +
> +	if (vq->event) {
> +		// FIXME: fix this!
> +		needs_kick = ((off_wrap >> 15) == vq->wrap_counter) &&
> +			     vring_need_event(off_wrap & ~(1<<15), new, old);
> +	} else {
> +		needs_kick = (vq->vring_packed.device->flags !=
> +			      cpu_to_virtio16(_vq->vdev, VRING_EVENT_F_DISABLE));
> +	}
> +	END_USE(vq);
> +	return needs_kick;
> +}
> +
> +/**
> + * virtqueue_kick_prepare - first half of split virtqueue_kick call.
> + * @vq: the struct virtqueue
> + *
> + * Instead of virtqueue_kick(), you can do:
> + *	if (virtqueue_kick_prepare(vq))
> + *		virtqueue_notify(vq);
> + *
> + * This is sometimes useful because the virtqueue_kick_prepare() needs
> + * to be serialized, but the actual virtqueue_notify() call does not.
> + */
> +bool virtqueue_kick_prepare(struct virtqueue *_vq)
> +{
> +	struct vring_virtqueue *vq = to_vvq(_vq);
> +
> +	return vq->packed ? virtqueue_kick_prepare_packed(_vq) :
> +			    virtqueue_kick_prepare_split(_vq);
> +}
>   EXPORT_SYMBOL_GPL(virtqueue_kick_prepare);
>   
>   /**
> @@ -626,8 +946,8 @@ bool virtqueue_kick(struct virtqueue *vq)
>   }
>   EXPORT_SYMBOL_GPL(virtqueue_kick);
>   
> -static void detach_buf(struct vring_virtqueue *vq, unsigned int head,
> -		       void **ctx)
> +static void detach_buf_split(struct vring_virtqueue *vq, unsigned int head,
> +			     void **ctx)
>   {
>   	unsigned int i, j;
>   	__virtio16 nextflag = cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_NEXT);
> @@ -639,12 +959,12 @@ static void detach_buf(struct vring_virtqueue *vq, unsigned int head,
>   	i = head;
>   
>   	while (vq->vring.desc[i].flags & nextflag) {
> -		vring_unmap_one(vq, &vq->vring.desc[i]);
> +		vring_unmap_one_split(vq, &vq->vring.desc[i]);
>   		i = virtio16_to_cpu(vq->vq.vdev, vq->vring.desc[i].next);
>   		vq->vq.num_free++;
>   	}
>   
> -	vring_unmap_one(vq, &vq->vring.desc[i]);
> +	vring_unmap_one_split(vq, &vq->vring.desc[i]);
>   	vq->vring.desc[i].next = cpu_to_virtio16(vq->vq.vdev, vq->free_head);
>   	vq->free_head = head;
>   
> @@ -666,7 +986,7 @@ static void detach_buf(struct vring_virtqueue *vq, unsigned int head,
>   		BUG_ON(len == 0 || len % sizeof(struct vring_desc));
>   
>   		for (j = 0; j < len / sizeof(struct vring_desc); j++)
> -			vring_unmap_one(vq, &indir_desc[j]);
> +			vring_unmap_one_split(vq, &indir_desc[j]);
>   
>   		kfree(indir_desc);
>   		vq->desc_state[head].indir_desc = NULL;
> @@ -675,11 +995,207 @@ static void detach_buf(struct vring_virtqueue *vq, unsigned int head,
>   	}
>   }
>   
> -static inline bool more_used(const struct vring_virtqueue *vq)
> +static int detach_buf_packed(struct vring_virtqueue *vq, unsigned int head,
> +			      void **ctx)
> +{
> +	struct vring_packed_desc *desc;
> +	unsigned int i, j;
> +
> +	/* Clear data ptr. */
> +	vq->desc_state[head].data = NULL;
> +
> +	i = head;
> +
> +	for (j = 0; j < vq->desc_state[head].num; j++) {
> +		desc = &vq->vring_packed.desc[i];
> +		vring_unmap_one_packed(vq, desc);
> +		desc->flags = 0x0;
> +		i++;
> +		if (i >= vq->vring_packed.num)
> +			i = 0;
> +	}
> +
> +	vq->vq.num_free += vq->desc_state[head].num;
> +
> +	if (vq->indirect) {
> +		u32 len;
> +
> +		desc = vq->desc_state[head].indir_desc;
> +		/* Free the indirect table, if any, now that it's unmapped. */
> +		if (!desc)
> +			goto out;
> +
> +		len = virtio32_to_cpu(vq->vq.vdev,
> +				      vq->vring_packed.desc[head].len);
> +
> +		BUG_ON(!(vq->vring_packed.desc[head].flags &
> +			 cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_INDIRECT)));
> +		BUG_ON(len == 0 || len % sizeof(struct vring_packed_desc));
> +
> +		for (j = 0; j < len / sizeof(struct vring_packed_desc); j++)
> +			vring_unmap_one_packed(vq, &desc[j]);
> +
> +		kfree(desc);
> +		vq->desc_state[head].indir_desc = NULL;
> +	} else if (ctx) {
> +		*ctx = vq->desc_state[head].indir_desc;
> +	}
> +
> +out:
> +	return vq->desc_state[head].num;
> +}
> +
> +static inline bool more_used_split(const struct vring_virtqueue *vq)
>   {
>   	return vq->last_used_idx != virtio16_to_cpu(vq->vq.vdev, vq->vring.used->idx);
>   }
>   
> +static inline bool more_used_packed(const struct vring_virtqueue *vq)
> +{
> +	u16 last_used, flags;
> +	bool avail, used;
> +
> +	if (vq->vq.num_free == vq->vring_packed.num)
> +		return false;
> +
> +	last_used = vq->last_used_idx;
> +	flags = virtio16_to_cpu(vq->vq.vdev,
> +				vq->vring_packed.desc[last_used].flags);
> +	avail = flags & VRING_DESC_F_AVAIL(1);
> +	used = flags & VRING_DESC_F_USED(1);
> +
> +	return avail == used;
> +}
> +
> +static inline bool more_used(const struct vring_virtqueue *vq)
> +{
> +	return vq->packed ? more_used_packed(vq) : more_used_split(vq);
> +}
> +
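The avail == used test in more_used_packed() above is the packed-ring
analogue of comparing used->idx on the split ring: the driver publishes
a descriptor with AVAIL != USED, and the device completes it by writing
the flags back with the two bits equal. A tiny sketch of the predicate,
assuming the same bit positions as in this patch:

#include <stdbool.h>
#include <stdint.h>

#define F_AVAIL	(1u << 7)
#define F_USED	(1u << 15)

static bool desc_is_used(uint16_t flags)
{
	bool avail = flags & F_AVAIL;
	bool used = flags & F_USED;

	return avail == used;
}
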
> +static void *virtqueue_get_buf_ctx_split(struct virtqueue *_vq,
> +					 unsigned int *len, void **ctx)
> +{
> +	struct vring_virtqueue *vq = to_vvq(_vq);
> +	void *ret;
> +	unsigned int i;
> +	u16 last_used;
> +
> +	START_USE(vq);
> +
> +	if (unlikely(vq->broken)) {
> +		END_USE(vq);
> +		return NULL;
> +	}
> +
> +	if (!more_used(vq)) {
> +		pr_debug("No more buffers in queue\n");
> +		END_USE(vq);
> +		return NULL;
> +	}
> +
> +	/* Only get used array entries after they have been exposed by host. */
> +	virtio_rmb(vq->weak_barriers);
> +
> +	last_used = (vq->last_used_idx & (vq->vring.num - 1));
> +	i = virtio32_to_cpu(_vq->vdev, vq->vring.used->ring[last_used].id);
> +	*len = virtio32_to_cpu(_vq->vdev, vq->vring.used->ring[last_used].len);
> +
> +	if (unlikely(i >= vq->vring.num)) {
> +		BAD_RING(vq, "id %u out of range\n", i);
> +		return NULL;
> +	}
> +	if (unlikely(!vq->desc_state[i].data)) {
> +		BAD_RING(vq, "id %u is not a head!\n", i);
> +		return NULL;
> +	}
> +
> +	/* detach_buf_split clears data, so grab it now. */
> +	ret = vq->desc_state[i].data;
> +	detach_buf_split(vq, i, ctx);
> +	vq->last_used_idx++;
> +	/* If we expect an interrupt for the next entry, tell host
> +	 * by writing event index and flush out the write before
> +	 * the read in the next get_buf call. */
> +	if (!(vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT))
> +		virtio_store_mb(vq->weak_barriers,
> +				&vring_used_event(&vq->vring),
> +				cpu_to_virtio16(_vq->vdev, vq->last_used_idx));
> +
> +#ifdef DEBUG
> +	vq->last_add_time_valid = false;
> +#endif
> +
> +	END_USE(vq);
> +	return ret;
> +}
> +
> +static void *virtqueue_get_buf_ctx_packed(struct virtqueue *_vq,
> +					  unsigned int *len, void **ctx)
> +{
> +	struct vring_virtqueue *vq = to_vvq(_vq);
> +	uint16_t wrap_counter;
> +	void *ret;
> +	unsigned int i;
> +	u16 last_used;
> +
> +	START_USE(vq);
> +
> +	if (unlikely(vq->broken)) {
> +		END_USE(vq);
> +		return NULL;
> +	}
> +
> +	if (!more_used(vq)) {
> +		pr_debug("No more buffers in queue\n");
> +		END_USE(vq);
> +		return NULL;
> +	}
> +
> +	/* Only get used elements after they have been exposed by host. */
> +	virtio_rmb(vq->weak_barriers);
> +
> +	last_used = vq->last_used_idx;
> +	i = virtio32_to_cpu(_vq->vdev, vq->vring_packed.desc[last_used].id);
> +	*len = virtio32_to_cpu(_vq->vdev, vq->vring_packed.desc[last_used].len);
> +
> +	if (unlikely(i >= vq->vring_packed.num)) {
> +		BAD_RING(vq, "id %u out of range\n", i);
> +		return NULL;
> +	}
> +	if (unlikely(!vq->desc_state[i].data)) {
> +		BAD_RING(vq, "id %u is not a head!\n", i);
> +		return NULL;
> +	}
> +
> +	/* detach_buf_packed clears data, so grab it now. */
> +	ret = vq->desc_state[i].data;
> +	detach_buf_packed(vq, i, ctx);
> +
> +	vq->last_used_idx += vq->desc_state[i].num;
> +	if (vq->last_used_idx >= vq->vring_packed.num)
> +		vq->last_used_idx -= vq->vring_packed.num;
> +
> +	wrap_counter = vq->wrap_counter;
> +	if (vq->last_used_idx > vq->next_avail_idx)
> +		wrap_counter ^= 1;
> +
> +	/* If we expect an interrupt for the next entry, tell host
> +	 * by writing event index and flush out the write before
> +	 * the read in the next get_buf call. */
> +	if (vq->event_flags_shadow == VRING_EVENT_F_DESC)
> +		virtio_store_mb(vq->weak_barriers,
> +				&vq->vring_packed.driver->off_wrap,
> +				cpu_to_virtio16(_vq->vdev, vq->last_used_idx |
> +						wrap_counter << 15));
> +
> +#ifdef DEBUG
> +	vq->last_add_time_valid = false;
> +#endif
> +
> +	END_USE(vq);
> +	return ret;
> +}
> +
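The last_used_idx | wrap_counter << 15 store above uses the off_wrap
encoding of struct vring_packed_desc_event: bits 0..14 carry the event
offset and bit 15 the wrap counter. A standalone sketch of the
pack/unpack arithmetic (helper names are mine, not from the patch):

#include <stdint.h>

static uint16_t pack_off_wrap(uint16_t off, int wrap)
{
	return (uint16_t)(off | ((uint16_t)wrap << 15));
}

static uint16_t unpack_off(uint16_t off_wrap)
{
	return off_wrap & ~(uint16_t)(1u << 15);	/* strip the wrap bit */
}

static int unpack_wrap(uint16_t off_wrap)
{
	return off_wrap >> 15;
}
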
>   /**
>    * virtqueue_get_buf - get the next used buffer
>    * @vq: the struct virtqueue we're talking about.
> @@ -700,57 +1216,9 @@ void *virtqueue_get_buf_ctx(struct virtqueue *_vq, unsigned int *len,
>   			    void **ctx)
>   {
>   	struct vring_virtqueue *vq = to_vvq(_vq);
> -	void *ret;
> -	unsigned int i;
> -	u16 last_used;
>   
> -	START_USE(vq);
> -
> -	if (unlikely(vq->broken)) {
> -		END_USE(vq);
> -		return NULL;
> -	}
> -
> -	if (!more_used(vq)) {
> -		pr_debug("No more buffers in queue\n");
> -		END_USE(vq);
> -		return NULL;
> -	}
> -
> -	/* Only get used array entries after they have been exposed by host. */
> -	virtio_rmb(vq->weak_barriers);
> -
> -	last_used = (vq->last_used_idx & (vq->vring.num - 1));
> -	i = virtio32_to_cpu(_vq->vdev, vq->vring.used->ring[last_used].id);
> -	*len = virtio32_to_cpu(_vq->vdev, vq->vring.used->ring[last_used].len);
> -
> -	if (unlikely(i >= vq->vring.num)) {
> -		BAD_RING(vq, "id %u out of range\n", i);
> -		return NULL;
> -	}
> -	if (unlikely(!vq->desc_state[i].data)) {
> -		BAD_RING(vq, "id %u is not a head!\n", i);
> -		return NULL;
> -	}
> -
> -	/* detach_buf clears data, so grab it now. */
> -	ret = vq->desc_state[i].data;
> -	detach_buf(vq, i, ctx);
> -	vq->last_used_idx++;
> -	/* If we expect an interrupt for the next entry, tell host
> -	 * by writing event index and flush out the write before
> -	 * the read in the next get_buf call. */
> -	if (!(vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT))
> -		virtio_store_mb(vq->weak_barriers,
> -				&vring_used_event(&vq->vring),
> -				cpu_to_virtio16(_vq->vdev, vq->last_used_idx));
> -
> -#ifdef DEBUG
> -	vq->last_add_time_valid = false;
> -#endif
> -
> -	END_USE(vq);
> -	return ret;
> +	return vq->packed ? virtqueue_get_buf_ctx_packed(_vq, len, ctx) :
> +			    virtqueue_get_buf_ctx_split(_vq, len, ctx);
>   }
>   EXPORT_SYMBOL_GPL(virtqueue_get_buf_ctx);
>   
> @@ -759,6 +1227,29 @@ void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len)
>   	return virtqueue_get_buf_ctx(_vq, len, NULL);
>   }
>   EXPORT_SYMBOL_GPL(virtqueue_get_buf);
> +
> +static void virtqueue_disable_cb_split(struct virtqueue *_vq)
> +{
> +	struct vring_virtqueue *vq = to_vvq(_vq);
> +
> +	if (!(vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT)) {
> +		vq->avail_flags_shadow |= VRING_AVAIL_F_NO_INTERRUPT;
> +		if (!vq->event)
> +			vq->vring.avail->flags = cpu_to_virtio16(_vq->vdev, vq->avail_flags_shadow);
> +	}
> +}
> +
> +static void virtqueue_disable_cb_packed(struct virtqueue *_vq)
> +{
> +	struct vring_virtqueue *vq = to_vvq(_vq);
> +
> +	if (vq->event_flags_shadow != VRING_EVENT_F_DISABLE) {
> +		vq->event_flags_shadow = VRING_EVENT_F_DISABLE;
> +		vq->vring_packed.driver->flags = cpu_to_virtio16(_vq->vdev,
> +							vq->event_flags_shadow);
> +	}
> +}
> +
>   /**
>    * virtqueue_disable_cb - disable callbacks
>    * @vq: the struct virtqueue we're talking about.
> @@ -772,15 +1263,66 @@ void virtqueue_disable_cb(struct virtqueue *_vq)
>   {
>   	struct vring_virtqueue *vq = to_vvq(_vq);
>   
> -	if (!(vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT)) {
> -		vq->avail_flags_shadow |= VRING_AVAIL_F_NO_INTERRUPT;
> -		if (!vq->event)
> -			vq->vring.avail->flags = cpu_to_virtio16(_vq->vdev, vq->avail_flags_shadow);
> -	}
> -
> +	if (vq->packed)
> +		virtqueue_disable_cb_packed(_vq);
> +	else
> +		virtqueue_disable_cb_split(_vq);
>   }
>   EXPORT_SYMBOL_GPL(virtqueue_disable_cb);
>   
> +static unsigned virtqueue_enable_cb_prepare_split(struct virtqueue *_vq)
> +{
> +	struct vring_virtqueue *vq = to_vvq(_vq);
> +	u16 last_used_idx;
> +
> +	START_USE(vq);
> +
> +	/* We optimistically turn back on interrupts, then check if there was
> +	 * more to do. */
> +	/* Depending on the VIRTIO_RING_F_EVENT_IDX feature, we need to
> +	 * either clear the flags bit or point the event index at the next
> +	 * entry. Always do both to keep code simple. */
> +	if (vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT) {
> +		vq->avail_flags_shadow &= ~VRING_AVAIL_F_NO_INTERRUPT;
> +		if (!vq->event)
> +			vq->vring.avail->flags = cpu_to_virtio16(_vq->vdev, vq->avail_flags_shadow);
> +	}
> +	vring_used_event(&vq->vring) = cpu_to_virtio16(_vq->vdev, last_used_idx = vq->last_used_idx);
> +	END_USE(vq);
> +	return last_used_idx;
> +}
> +
> +static unsigned virtqueue_enable_cb_prepare_packed(struct virtqueue *_vq)
> +{
> +	struct vring_virtqueue *vq = to_vvq(_vq);
> +	u16 last_used_idx, wrap_counter, off_wrap;
> +
> +	START_USE(vq);
> +
> +	last_used_idx = vq->last_used_idx;
> +	wrap_counter = vq->wrap_counter;
> +
> +	if (last_used_idx > vq->next_avail_idx)
> +		wrap_counter ^= 1;
> +
> +	off_wrap = last_used_idx | (wrap_counter << 15);
> +
> +	/* We optimistically turn back on interrupts, then check if there was
> +	 * more to do. */
> +	/* Depending on the VIRTIO_RING_F_EVENT_IDX feature, we need to
> +	 * either clear the flags bit or point the event index at the next
> +	 * entry. Always do both to keep code simple. */
> +	if (vq->event_flags_shadow == VRING_EVENT_F_DISABLE) {
> +		vq->event_flags_shadow = vq->event ? VRING_EVENT_F_DESC:
> +						     VRING_EVENT_F_ENABLE;
> +		vq->vring_packed.driver->flags = cpu_to_virtio16(_vq->vdev,
> +							vq->event_flags_shadow);
> +	}
> +	vq->vring_packed.driver->off_wrap = cpu_to_virtio16(_vq->vdev, off_wrap);
> +	END_USE(vq);
> +	return last_used_idx;
> +}
> +
>   /**
>    * virtqueue_enable_cb_prepare - restart callbacks after disable_cb
>    * @vq: the struct virtqueue we're talking about.
> @@ -796,26 +1338,34 @@ EXPORT_SYMBOL_GPL(virtqueue_disable_cb);
>   unsigned virtqueue_enable_cb_prepare(struct virtqueue *_vq)
>   {
>   	struct vring_virtqueue *vq = to_vvq(_vq);
> -	u16 last_used_idx;
>   
> -	START_USE(vq);
> -
> -	/* We optimistically turn back on interrupts, then check if there was
> -	 * more to do. */
> -	/* Depending on the VIRTIO_RING_F_EVENT_IDX feature, we need to
> -	 * either clear the flags bit or point the event index at the next
> -	 * entry. Always do both to keep code simple. */
> -	if (vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT) {
> -		vq->avail_flags_shadow &= ~VRING_AVAIL_F_NO_INTERRUPT;
> -		if (!vq->event)
> -			vq->vring.avail->flags = cpu_to_virtio16(_vq->vdev, vq->avail_flags_shadow);
> -	}
> -	vring_used_event(&vq->vring) = cpu_to_virtio16(_vq->vdev, last_used_idx = vq->last_used_idx);
> -	END_USE(vq);
> -	return last_used_idx;
> +	return vq->packed ? virtqueue_enable_cb_prepare_packed(_vq) :
> +			    virtqueue_enable_cb_prepare_split(_vq);
>   }
>   EXPORT_SYMBOL_GPL(virtqueue_enable_cb_prepare);
>   
> +static bool virtqueue_poll_split(struct virtqueue *_vq, unsigned last_used_idx)
> +{
> +	struct vring_virtqueue *vq = to_vvq(_vq);
> +
> +	virtio_mb(vq->weak_barriers);
> +	return (u16)last_used_idx != virtio16_to_cpu(_vq->vdev, vq->vring.used->idx);
> +}
> +
> +static bool virtqueue_poll_packed(struct virtqueue *_vq, unsigned last_used_idx)
> +{
> +	struct vring_virtqueue *vq = to_vvq(_vq);
> +	bool avail, used;
> +	u16 flags;
> +
> +	virtio_mb(vq->weak_barriers);
> +	flags = virtio16_to_cpu(vq->vq.vdev,
> +			vq->vring_packed.desc[last_used_idx].flags);
> +	avail = flags & VRING_DESC_F_AVAIL(1);
> +	used = flags & VRING_DESC_F_USED(1);
> +	return avail == used;
> +}
> +
>   /**
>    * virtqueue_poll - query pending used buffers
>    * @vq: the struct virtqueue we're talking about.
> @@ -829,8 +1379,8 @@ bool virtqueue_poll(struct virtqueue *_vq, unsigned last_used_idx)
>   {
>   	struct vring_virtqueue *vq = to_vvq(_vq);
>   
> -	virtio_mb(vq->weak_barriers);
> -	return (u16)last_used_idx != virtio16_to_cpu(_vq->vdev, vq->vring.used->idx);
> +	return vq->packed ? virtqueue_poll_packed(_vq, last_used_idx) :
> +			    virtqueue_poll_split(_vq, last_used_idx);
>   }
>   EXPORT_SYMBOL_GPL(virtqueue_poll);
>   
> @@ -852,6 +1402,83 @@ bool virtqueue_enable_cb(struct virtqueue *_vq)
>   }
>   EXPORT_SYMBOL_GPL(virtqueue_enable_cb);
>   
> +static bool virtqueue_enable_cb_delayed_split(struct virtqueue *_vq)
> +{
> +	struct vring_virtqueue *vq = to_vvq(_vq);
> +	u16 bufs;
> +
> +	START_USE(vq);
> +
> +	/* We optimistically turn back on interrupts, then check if there was
> +	 * more to do. */
> +	/* Depending on the VIRTIO_RING_F_USED_EVENT_IDX feature, we need to
> +	 * either clear the flags bit or point the event index at the next
> +	 * entry. Always update the event index to keep code simple. */
> +	if (vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT) {
> +		vq->avail_flags_shadow &= ~VRING_AVAIL_F_NO_INTERRUPT;
> +		if (!vq->event)
> +			vq->vring.avail->flags = cpu_to_virtio16(_vq->vdev, vq->avail_flags_shadow);
> +	}
> +	/* TODO: tune this threshold */
> +	bufs = (u16)(vq->avail_idx_shadow - vq->last_used_idx) * 3 / 4;
> +
> +	virtio_store_mb(vq->weak_barriers,
> +			&vring_used_event(&vq->vring),
> +			cpu_to_virtio16(_vq->vdev, vq->last_used_idx + bufs));
> +
> +	if (unlikely((u16)(virtio16_to_cpu(_vq->vdev, vq->vring.used->idx) - vq->last_used_idx) > bufs)) {
> +		END_USE(vq);
> +		return false;
> +	}
> +
> +	END_USE(vq);
> +	return true;
> +}
> +
> +static bool virtqueue_enable_cb_delayed_packed(struct virtqueue *_vq)
> +{
> +	struct vring_virtqueue *vq = to_vvq(_vq);
> +	u16 bufs, off_wrap, used_idx, wrap_counter;
> +
> +	START_USE(vq);
> +
> +	/* We optimistically turn back on interrupts, then check if there was
> +	 * more to do. */
> +	/* Depending on the VIRTIO_RING_F_USED_EVENT_IDX feature, we need to
> +	 * either clear the flags bit or point the event index at the next
> +	 * entry. Always update the event index to keep code simple. */
> +	if (vq->event_flags_shadow == VRING_EVENT_F_DISABLE) {
> +		vq->event_flags_shadow = vq->event ? VRING_EVENT_F_DESC:
> +						     VRING_EVENT_F_ENABLE;
> +		vq->vring_packed.driver->flags = cpu_to_virtio16(_vq->vdev,
> +							vq->event_flags_shadow);
> +	}
> +
> +	/* TODO: tune this threshold */
> +	bufs = (u16)(vq->next_avail_idx - vq->last_used_idx) * 3 / 4;
> +
> +	used_idx = vq->last_used_idx + bufs;
> +	if (used_idx >= vq->vring_packed.num)
> +		used_idx -= vq->vring_packed.num;
> +
> +	wrap_counter = vq->wrap_counter;
> +	if (used_idx > vq->next_avail_idx)
> +		wrap_counter ^= 1;
> +
> +	off_wrap = used_idx | (wrap_counter << 15);
> +
> +	virtio_store_mb(vq->weak_barriers, &vq->vring_packed.driver->off_wrap,
> +			cpu_to_virtio16(_vq->vdev, off_wrap));
> +
> +	if (more_used_packed(vq)) {
> +		END_USE(vq);
> +		return false;
> +	}
> +
> +	END_USE(vq);
> +	return true;
> +}
> +
>   /**
>    * virtqueue_enable_cb_delayed - restart callbacks after disable_cb.
>    * @vq: the struct virtqueue we're talking about.
> @@ -868,37 +1495,69 @@ EXPORT_SYMBOL_GPL(virtqueue_enable_cb);
>   bool virtqueue_enable_cb_delayed(struct virtqueue *_vq)
>   {
>   	struct vring_virtqueue *vq = to_vvq(_vq);
> -	u16 bufs;
>   
> -	START_USE(vq);
> -
> -	/* We optimistically turn back on interrupts, then check if there was
> -	 * more to do. */
> -	/* Depending on the VIRTIO_RING_F_USED_EVENT_IDX feature, we need to
> -	 * either clear the flags bit or point the event index at the next
> -	 * entry. Always update the event index to keep code simple. */
> -	if (vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT) {
> -		vq->avail_flags_shadow &= ~VRING_AVAIL_F_NO_INTERRUPT;
> -		if (!vq->event)
> -			vq->vring.avail->flags = cpu_to_virtio16(_vq->vdev, vq->avail_flags_shadow);
> -	}
> -	/* TODO: tune this threshold */
> -	bufs = (u16)(vq->avail_idx_shadow - vq->last_used_idx) * 3 / 4;
> -
> -	virtio_store_mb(vq->weak_barriers,
> -			&vring_used_event(&vq->vring),
> -			cpu_to_virtio16(_vq->vdev, vq->last_used_idx + bufs));
> -
> -	if (unlikely((u16)(virtio16_to_cpu(_vq->vdev, vq->vring.used->idx) - vq->last_used_idx) > bufs)) {
> -		END_USE(vq);
> -		return false;
> -	}
> -
> -	END_USE(vq);
> -	return true;
> +	return vq->packed ? virtqueue_enable_cb_delayed_packed(_vq) :
> +			    virtqueue_enable_cb_delayed_split(_vq);
>   }
>   EXPORT_SYMBOL_GPL(virtqueue_enable_cb_delayed);
>   
> +static void *virtqueue_detach_unused_buf_split(struct virtqueue *_vq)
> +{
> +	struct vring_virtqueue *vq = to_vvq(_vq);
> +	unsigned int i;
> +	void *buf;
> +
> +	START_USE(vq);
> +
> +	for (i = 0; i < vq->vring.num; i++) {
> +		if (!vq->desc_state[i].data)
> +			continue;
> +		/* detach_buf_split clears data, so grab it now. */
> +		buf = vq->desc_state[i].data;
> +		detach_buf_split(vq, i, NULL);
> +		vq->avail_idx_shadow--;
> +		vq->vring.avail->idx = cpu_to_virtio16(_vq->vdev, vq->avail_idx_shadow);
> +		END_USE(vq);
> +		return buf;
> +	}
> +	/* That should have freed everything. */
> +	BUG_ON(vq->vq.num_free != vq->vring.num);
> +
> +	END_USE(vq);
> +	return NULL;
> +}
> +
> +static void *virtqueue_detach_unused_buf_packed(struct virtqueue *_vq)
> +{
> +	struct vring_virtqueue *vq = to_vvq(_vq);
> +	unsigned int i, num;
> +	void *buf;
> +
> +	START_USE(vq);
> +
> +	for (i = 0; i < vq->vring_packed.num; i++) {
> +		if (!vq->desc_state[i].data)
> +			continue;
> +		/* detach_buf_packed clears data, so grab it now. */
> +		buf = vq->desc_state[i].data;
> +		num = detach_buf_packed(vq, i, NULL);
> +		if (vq->next_avail_idx < num) {
> +			vq->next_avail_idx = vq->vring_packed.num -
> +					(num - vq->next_avail_idx);
> +			vq->wrap_counter ^= 1;
> +		} else {
> +			vq->next_avail_idx -= num;
> +		}
> +		END_USE(vq);
> +		return buf;
> +	}
> +	/* That should have freed everything. */
> +	BUG_ON(vq->vq.num_free != vq->vring_packed.num);
> +
> +	END_USE(vq);
> +	return NULL;
> +}
> +
>   /**
>    * virtqueue_detach_unused_buf - detach first unused buffer
>    * @vq: the struct virtqueue we're talking about.
> @@ -910,27 +1569,9 @@ EXPORT_SYMBOL_GPL(virtqueue_enable_cb_delayed);
>   void *virtqueue_detach_unused_buf(struct virtqueue *_vq)
>   {
>   	struct vring_virtqueue *vq = to_vvq(_vq);
> -	unsigned int i;
> -	void *buf;
>   
> -	START_USE(vq);
> -
> -	for (i = 0; i < vq->vring.num; i++) {
> -		if (!vq->desc_state[i].data)
> -			continue;
> -		/* detach_buf clears data, so grab it now. */
> -		buf = vq->desc_state[i].data;
> -		detach_buf(vq, i, NULL);
> -		vq->avail_idx_shadow--;
> -		vq->vring.avail->idx = cpu_to_virtio16(_vq->vdev, vq->avail_idx_shadow);
> -		END_USE(vq);
> -		return buf;
> -	}
> -	/* That should have freed everything. */
> -	BUG_ON(vq->vq.num_free != vq->vring.num);
> -
> -	END_USE(vq);
> -	return NULL;
> +	return vq->packed ? virtqueue_detach_unused_buf_packed(_vq) :
> +			    virtqueue_detach_unused_buf_split(_vq);
>   }
>   EXPORT_SYMBOL_GPL(virtqueue_detach_unused_buf);
>   
> @@ -955,7 +1596,8 @@ irqreturn_t vring_interrupt(int irq, void *_vq)
>   EXPORT_SYMBOL_GPL(vring_interrupt);
>   
>   struct virtqueue *__vring_new_virtqueue(unsigned int index,
> -					struct vring vring,
> +					union vring_union vring,
> +					bool packed,
>   					struct virtio_device *vdev,
>   					bool weak_barriers,
>   					bool context,
> @@ -963,19 +1605,20 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index,
>   					void (*callback)(struct virtqueue *),
>   					const char *name)
>   {
> -	unsigned int i;
> +	unsigned int num, i;
>   	struct vring_virtqueue *vq;
>   
> -	vq = kmalloc(sizeof(*vq) + vring.num * sizeof(struct vring_desc_state),
> +	num = packed ? vring.vring_packed.num : vring.vring_split.num;
> +
> +	vq = kmalloc(sizeof(*vq) + num * sizeof(struct vring_desc_state),
>   		     GFP_KERNEL);
>   	if (!vq)
>   		return NULL;
>   
> -	vq->vring = vring;
>   	vq->vq.callback = callback;
>   	vq->vq.vdev = vdev;
>   	vq->vq.name = name;
> -	vq->vq.num_free = vring.num;
> +	vq->vq.num_free = num;
>   	vq->vq.index = index;
>   	vq->we_own_ring = false;
>   	vq->queue_dma_addr = 0;
> @@ -984,9 +1627,8 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index,
>   	vq->weak_barriers = weak_barriers;
>   	vq->broken = false;
>   	vq->last_used_idx = 0;
> -	vq->avail_flags_shadow = 0;
> -	vq->avail_idx_shadow = 0;
>   	vq->num_added = 0;
> +	vq->packed = packed;
>   	list_add_tail(&vq->vq.list, &vdev->vqs);
>   #ifdef DEBUG
>   	vq->in_use = false;
> @@ -997,18 +1639,37 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index,
>   		!context;
>   	vq->event = virtio_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX);
>   
> +	if (vq->packed) {
> +		vq->vring_packed = vring.vring_packed;
> +		vq->next_avail_idx = 0;
> +		vq->wrap_counter = 1;
> +		vq->event_flags_shadow = 0;
> +	} else {
> +		vq->vring = vring.vring_split;
> +		vq->avail_flags_shadow = 0;
> +		vq->avail_idx_shadow = 0;
> +
> +		/* Put everything in free lists. */
> +		vq->free_head = 0;
> +		for (i = 0; i < num-1; i++)
> +			vq->vring.desc[i].next = cpu_to_virtio16(vdev, i + 1);
> +	}
> +
>   	/* No callback?  Tell other side not to bother us. */
>   	if (!callback) {
> -		vq->avail_flags_shadow |= VRING_AVAIL_F_NO_INTERRUPT;
> -		if (!vq->event)
> -			vq->vring.avail->flags = cpu_to_virtio16(vdev, vq->avail_flags_shadow);
> +		if (packed) {
> +			vq->event_flags_shadow = VRING_EVENT_F_DISABLE;
> +			vq->vring_packed.driver->flags = cpu_to_virtio16(vdev,
> +						vq->event_flags_shadow);
> +		} else {
> +			vq->avail_flags_shadow |= VRING_AVAIL_F_NO_INTERRUPT;
> +			if (!vq->event)
> +				vq->vring.avail->flags = cpu_to_virtio16(vdev,
> +						vq->avail_flags_shadow);
> +		}
>   	}
>   
> -	/* Put everything in free lists. */
> -	vq->free_head = 0;
> -	for (i = 0; i < vring.num-1; i++)
> -		vq->vring.desc[i].next = cpu_to_virtio16(vdev, i + 1);
> -	memset(vq->desc_state, 0, vring.num * sizeof(struct vring_desc_state));
> +	memset(vq->desc_state, 0, num * sizeof(struct vring_desc_state));
>   
>   	return &vq->vq;
>   }
> @@ -1056,6 +1717,22 @@ static void vring_free_queue(struct virtio_device *vdev, size_t size,
>   	}
>   }
>   
> +static inline int
> +__vring_size(unsigned int num, unsigned long align, bool packed)
> +{
> +	return packed ? vring_packed_size(num, align) : vring_size(num, align);
> +}
> +
> +static inline void vring_packed_init(struct vring_packed *vr, unsigned int num,
> +				     void *p, unsigned long align)
> +{
> +	vr->num = num;
> +	vr->desc = p;
> +	vr->driver = (void *)(((uintptr_t)p + sizeof(struct vring_packed_desc)
> +		* num + align - 1) & ~(align - 1));
> +	vr->device = vr->driver + 1;
> +}
> +
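As a worked example of what vring_packed_init() and vring_packed_size()
compute, take num = 256 and a 64-byte alignment (the constants follow
from sizeof(struct vring_packed_desc) == 16 and
sizeof(struct vring_packed_desc_event) == 4; userspace sketch only):

#include <stdio.h>

#define DESC_SIZE	16	/* sizeof(struct vring_packed_desc) */
#define EVENT_SIZE	4	/* sizeof(struct vring_packed_desc_event) */

static unsigned long packed_size(unsigned int num, unsigned long align)
{
	/* Align the descriptor array, then append the driver and
	 * device event suppression structures. */
	return ((DESC_SIZE * num + align - 1) & ~(align - 1))
		+ 2 * EVENT_SIZE;
}

int main(void)
{
	/* desc: 256 * 16 = 4096 bytes (already aligned); driver event
	 * at offset 4096, device event at 4100; total 4104. */
	printf("%lu\n", packed_size(256, 64));
	return 0;
}
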
>   struct virtqueue *vring_create_virtqueue(
>   	unsigned int index,
>   	unsigned int num,
> @@ -1072,7 +1749,8 @@ struct virtqueue *vring_create_virtqueue(
>   	void *queue = NULL;
>   	dma_addr_t dma_addr;
>   	size_t queue_size_in_bytes;
> -	struct vring vring;
> +	union vring_union vring;
> +	bool packed;
>   
>   	/* We assume num is a power of 2. */
>   	if (num & (num - 1)) {
> @@ -1080,9 +1758,13 @@ struct virtqueue *vring_create_virtqueue(
>   		return NULL;
>   	}
>   
> +	packed = virtio_has_feature(vdev, VIRTIO_F_RING_PACKED);
> +
>   	/* TODO: allocate each queue chunk individually */
> -	for (; num && vring_size(num, vring_align) > PAGE_SIZE; num /= 2) {
> -		queue = vring_alloc_queue(vdev, vring_size(num, vring_align),
> +	for (; num && __vring_size(num, vring_align, packed) > PAGE_SIZE;
> +			num /= 2) {
> +		queue = vring_alloc_queue(vdev, __vring_size(num, vring_align,
> +							     packed),
>   					  &dma_addr,
>   					  GFP_KERNEL|__GFP_NOWARN|__GFP_ZERO);
>   		if (queue)
> @@ -1094,17 +1776,21 @@ struct virtqueue *vring_create_virtqueue(
>   
>   	if (!queue) {
>   		/* Try to get a single page. You are my only hope! */
> -		queue = vring_alloc_queue(vdev, vring_size(num, vring_align),
> +		queue = vring_alloc_queue(vdev, __vring_size(num, vring_align,
> +							     packed),
>   					  &dma_addr, GFP_KERNEL|__GFP_ZERO);
>   	}
>   	if (!queue)
>   		return NULL;
>   
> -	queue_size_in_bytes = vring_size(num, vring_align);
> -	vring_init(&vring, num, queue, vring_align);
> +	queue_size_in_bytes = __vring_size(num, vring_align, packed);
> +	if (packed)
> +		vring_packed_init(&vring.vring_packed, num, queue, vring_align);
> +	else
> +		vring_init(&vring.vring_split, num, queue, vring_align);
>   
> -	vq = __vring_new_virtqueue(index, vring, vdev, weak_barriers, context,
> -				   notify, callback, name);
> +	vq = __vring_new_virtqueue(index, vring, packed, vdev, weak_barriers,
> +				   context, notify, callback, name);
>   	if (!vq) {
>   		vring_free_queue(vdev, queue_size_in_bytes, queue,
>   				 dma_addr);
> @@ -1130,10 +1816,17 @@ struct virtqueue *vring_new_virtqueue(unsigned int index,
>   				      void (*callback)(struct virtqueue *vq),
>   				      const char *name)
>   {
> -	struct vring vring;
> -	vring_init(&vring, num, pages, vring_align);
> -	return __vring_new_virtqueue(index, vring, vdev, weak_barriers, context,
> -				     notify, callback, name);
> +	union vring_union vring;
> +	bool packed;
> +
> +	packed = virtio_has_feature(vdev, VIRTIO_F_RING_PACKED);
> +	if (packed)
> +		vring_packed_init(&vring.vring_packed, num, pages, vring_align);
> +	else
> +		vring_init(&vring.vring_split, num, pages, vring_align);
> +
> +	return __vring_new_virtqueue(index, vring, packed, vdev, weak_barriers,
> +				     context, notify, callback, name);
>   }
>   EXPORT_SYMBOL_GPL(vring_new_virtqueue);
>   
> @@ -1143,7 +1836,9 @@ void vring_del_virtqueue(struct virtqueue *_vq)
>   
>   	if (vq->we_own_ring) {
>   		vring_free_queue(vq->vq.vdev, vq->queue_size_in_bytes,
> -				 vq->vring.desc, vq->queue_dma_addr);
> +				 vq->packed ? (void *)vq->vring_packed.desc :
> +					      (void *)vq->vring.desc,
> +				 vq->queue_dma_addr);
>   	}
>   	list_del(&_vq->list);
>   	kfree(vq);
> @@ -1157,14 +1852,18 @@ void vring_transport_features(struct virtio_device *vdev)
>   
>   	for (i = VIRTIO_TRANSPORT_F_START; i < VIRTIO_TRANSPORT_F_END; i++) {
>   		switch (i) {
> -		case VIRTIO_RING_F_INDIRECT_DESC:
> +#if 0
> +		case VIRTIO_RING_F_INDIRECT_DESC: // FIXME not tested yet.
>   			break;
> -		case VIRTIO_RING_F_EVENT_IDX:
> +		case VIRTIO_RING_F_EVENT_IDX: // FIXME: probably doesn't work.
>   			break;
> +#endif
>   		case VIRTIO_F_VERSION_1:
>   			break;
>   		case VIRTIO_F_IOMMU_PLATFORM:
>   			break;
> +		case VIRTIO_F_RING_PACKED:
> +			break;
>   		default:
>   			/* We don't understand this bit. */
>   			__virtio_clear_bit(vdev, i);
> @@ -1185,7 +1884,7 @@ unsigned int virtqueue_get_vring_size(struct virtqueue *_vq)
>   
>   	struct vring_virtqueue *vq = to_vvq(_vq);
>   
> -	return vq->vring.num;
> +	return vq->packed ? vq->vring_packed.num : vq->vring.num;
>   }
>   EXPORT_SYMBOL_GPL(virtqueue_get_vring_size);
>   
> @@ -1228,6 +1927,10 @@ dma_addr_t virtqueue_get_avail_addr(struct virtqueue *_vq)
>   
>   	BUG_ON(!vq->we_own_ring);
>   
> +	if (vq->packed)
> +		return vq->queue_dma_addr + ((char *)vq->vring_packed.driver -
> +				(char *)vq->vring_packed.desc);
> +
>   	return vq->queue_dma_addr +
>   		((char *)vq->vring.avail - (char *)vq->vring.desc);
>   }
> @@ -1239,11 +1942,16 @@ dma_addr_t virtqueue_get_used_addr(struct virtqueue *_vq)
>   
>   	BUG_ON(!vq->we_own_ring);
>   
> +	if (vq->packed)
> +		return vq->queue_dma_addr + ((char *)vq->vring_packed.device -
> +				(char *)vq->vring_packed.desc);
> +
>   	return vq->queue_dma_addr +
>   		((char *)vq->vring.used - (char *)vq->vring.desc);
>   }
>   EXPORT_SYMBOL_GPL(virtqueue_get_used_addr);
>   
> +/* Only available for split ring */
>   const struct vring *virtqueue_get_vring(struct virtqueue *vq)
>   {
>   	return &to_vvq(vq)->vring;
> diff --git a/include/linux/virtio_ring.h b/include/linux/virtio_ring.h
> index bbf32524ab27..a0075894ad16 100644
> --- a/include/linux/virtio_ring.h
> +++ b/include/linux/virtio_ring.h
> @@ -60,6 +60,11 @@ static inline void virtio_store_mb(bool weak_barriers,
>   struct virtio_device;
>   struct virtqueue;
>   
> +union vring_union {
> +	struct vring vring_split;
> +	struct vring_packed vring_packed;
> +};
> +
>   /*
>    * Creates a virtqueue and allocates the descriptor ring.  If
>    * may_reduce_num is set, then this may allocate a smaller ring than
> @@ -79,7 +84,8 @@ struct virtqueue *vring_create_virtqueue(unsigned int index,
>   
>   /* Creates a virtqueue with a custom layout. */
>   struct virtqueue *__vring_new_virtqueue(unsigned int index,
> -					struct vring vring,
> +					union vring_union vring,
> +					bool packed,
>   					struct virtio_device *vdev,
>   					bool weak_barriers,
>   					bool ctx,
> diff --git a/include/uapi/linux/virtio_config.h b/include/uapi/linux/virtio_config.h
> index 308e2096291f..a6e392325e3a 100644
> --- a/include/uapi/linux/virtio_config.h
> +++ b/include/uapi/linux/virtio_config.h
> @@ -49,7 +49,7 @@
>    * transport being used (eg. virtio_ring), the rest are per-device feature
>    * bits. */
>   #define VIRTIO_TRANSPORT_F_START	28
> -#define VIRTIO_TRANSPORT_F_END		34
> +#define VIRTIO_TRANSPORT_F_END		36
>   
>   #ifndef VIRTIO_CONFIG_NO_LEGACY
>   /* Do we get callbacks when the ring is completely used, even if we've
> @@ -71,4 +71,14 @@
>    * this is for compatibility with legacy systems.
>    */
>   #define VIRTIO_F_IOMMU_PLATFORM		33
> +
> +/* This feature indicates support for the packed virtqueue layout. */
> +#define VIRTIO_F_RING_PACKED		34
> +
> +/*
> + * This feature indicates that all buffers are used by the device
> + * in the same order in which they have been made available.
> + */
> +#define VIRTIO_F_IN_ORDER		35
> +
>   #endif /* _UAPI_LINUX_VIRTIO_CONFIG_H */
> diff --git a/include/uapi/linux/virtio_ring.h b/include/uapi/linux/virtio_ring.h
> index 6d5d5faa989b..735d4207c988 100644
> --- a/include/uapi/linux/virtio_ring.h
> +++ b/include/uapi/linux/virtio_ring.h
> @@ -44,6 +44,9 @@
>   /* This means the buffer contains a list of buffer descriptors. */
>   #define VRING_DESC_F_INDIRECT	4
>   
> +#define VRING_DESC_F_AVAIL(b)	((b) << 7)
> +#define VRING_DESC_F_USED(b)	((b) << 15)
> +
>   /* The Host uses this in used->flags to advise the Guest: don't kick me when
>    * you add a buffer.  It's unreliable, so it's simply an optimization.  Guest
>    * will still kick if it's out of buffers. */
> @@ -53,6 +56,10 @@
>    * optimization.  */
>   #define VRING_AVAIL_F_NO_INTERRUPT	1
>   
> +#define VRING_EVENT_F_ENABLE	0x0
> +#define VRING_EVENT_F_DISABLE	0x1
> +#define VRING_EVENT_F_DESC	0x2
> +
>   /* We support indirect buffer descriptors */
>   #define VIRTIO_RING_F_INDIRECT_DESC	28
>   
> @@ -171,4 +178,58 @@ static inline int vring_need_event(__u16 event_idx, __u16 new_idx, __u16 old)
>   	return (__u16)(new_idx - event_idx - 1) < (__u16)(new_idx - old);
>   }
>   
> +struct vring_packed_desc_event {
> +	/* __virtio16 off  : 15; // Descriptor Event Offset
> +	 * __virtio16 wrap : 1;  // Descriptor Event Wrap Counter */
> +	__virtio16 off_wrap;
> +	/* __virtio16 flags : 2; // Descriptor Event Flags */
> +	__virtio16 flags;
> +};
> +
> +struct vring_packed_desc {
> +	/* Buffer Address. */
> +	__virtio64 addr;
> +	/* Buffer Length. */
> +	__virtio32 len;
> +	/* Buffer ID. */
> +	__virtio16 id;
> +	/* The flags depending on descriptor type. */
> +	__virtio16 flags;
> +};
> +
> +struct vring_packed {
> +	unsigned int num;
> +
> +	struct vring_packed_desc *desc;
> +
> +	struct vring_packed_desc_event *driver;
> +
> +	struct vring_packed_desc_event *device;
> +};
> +
> +/* The standard layout for the packed ring is a continuous chunk of memory
> + * which looks like this.
> + *
> + * struct vring_packed
> + * {
> + *	// The actual descriptors (16 bytes each)
> + *	struct vring_packed_desc desc[num];
> + *
> + *	// Padding to the next align boundary.
> + *	char pad[];
> + *
> + *	// Driver Event Suppression
> + *	struct vring_packed_desc_event driver;
> + *
> + *	// Device Event Suppression
> + *	struct vring_packed_desc_event device;
> + * };
> + */
> +
> +static inline unsigned vring_packed_size(unsigned int num, unsigned long align)
> +{
> +	return ((sizeof(struct vring_packed_desc) * num + align - 1)
> +		& ~(align - 1)) + sizeof(struct vring_packed_desc_event) * 2;
> +}
> +
>   #endif /* _UAPI_LINUX_VIRTIO_RING_H */


* Re: [RFC v2] virtio: support packed ring
  2018-04-10  2:55 ` Jason Wang
@ 2018-04-10  3:21   ` Tiwei Bie
  0 siblings, 0 replies; 28+ messages in thread
From: Tiwei Bie @ 2018-04-10  3:21 UTC (permalink / raw)
  To: Jason Wang; +Cc: mst, wexu, virtualization, linux-kernel, netdev, jfreimann

On Tue, Apr 10, 2018 at 10:55:25AM +0800, Jason Wang wrote:
On 2018-04-01 22:12, Tiwei Bie wrote:
> > Hello everyone,
> > 
> > This RFC implements packed ring support for virtio driver.
> > 
> > The code was tested with DPDK vhost (testpmd/vhost-PMD) implemented
> > by Jens at http://dpdk.org/ml/archives/dev/2018-January/089417.html
> > Minor changes are needed for the vhost code, e.g. to kick the guest.
> > 
> > TODO:
> > - Refinements and bug fixes;
> > - Split into small patches;
> > - Test indirect descriptor support;
> > - Test/fix event suppression support;
> > - Test devices other than net;
> > 
> > RFC v1 -> RFC v2:
> > - Add indirect descriptor support - compile test only;
> > - Add event suppression support - compile test only;
> > - Move vring_packed_init() out of uapi (Jason, MST);
> > - Merge two loops into one in virtqueue_add_packed() (Jason);
> > - Split vring_unmap_one() for packed ring and split ring (Jason);
> > - Avoid using '%' operator (Jason);
> > - Rename free_head -> next_avail_idx (Jason);
> > - Add comments for virtio_wmb() in virtqueue_add_packed() (Jason);
> > - Some other refinements and bug fixes;
> > 
> > Thanks!
> 
> Will try to review this later.
> 
> But it would be better if you could split it (more than 1000 lines is
> too big to review easily). E.g. you could at least split it into three
> patches: new structures, datapath, and event suppression.
> 

No problem! It's on my TODO list. I'll get it done in the next version.

Thanks!


* Re: [RFC v2] virtio: support packed ring
  2018-04-01 14:12 [RFC v2] virtio: support packed ring Tiwei Bie
  2018-04-10  2:55 ` Jason Wang
@ 2018-04-13  4:30 ` Jason Wang
  2018-04-13  7:15   ` Tiwei Bie
  2018-04-13 15:22 ` Michael S. Tsirkin
  2018-04-23  5:42 ` Jason Wang
  3 siblings, 1 reply; 28+ messages in thread
From: Jason Wang @ 2018-04-13  4:30 UTC (permalink / raw)
  To: Tiwei Bie, mst, wexu, virtualization, linux-kernel, netdev; +Cc: jfreimann



On 2018-04-01 22:12, Tiwei Bie wrote:
> Hello everyone,
>
> This RFC implements packed ring support for virtio driver.
>
> The code was tested with DPDK vhost (testpmd/vhost-PMD) implemented
> by Jens at http://dpdk.org/ml/archives/dev/2018-January/089417.html
> Minor changes are needed for the vhost code, e.g. to kick the guest.
>
> TODO:
> - Refinements and bug fixes;
> - Split into small patches;
> - Test indirect descriptor support;
> - Test/fix event suppression support;
> - Test devices other than net;
>
> RFC v1 -> RFC v2:
> - Add indirect descriptor support - compile test only;
> > - Add event suppression support - compile test only;
> - Move vring_packed_init() out of uapi (Jason, MST);
> - Merge two loops into one in virtqueue_add_packed() (Jason);
> - Split vring_unmap_one() for packed ring and split ring (Jason);
> - Avoid using '%' operator (Jason);
> - Rename free_head -> next_avail_idx (Jason);
> - Add comments for virtio_wmb() in virtqueue_add_packed() (Jason);
> - Some other refinements and bug fixes;
>
> Thanks!
>
> Signed-off-by: Tiwei Bie <tiwei.bie@intel.com>
> ---
>   drivers/virtio/virtio_ring.c       | 1094 +++++++++++++++++++++++++++++-------
>   include/linux/virtio_ring.h        |    8 +-
>   include/uapi/linux/virtio_config.h |   12 +-
>   include/uapi/linux/virtio_ring.h   |   61 ++
>   4 files changed, 980 insertions(+), 195 deletions(-)
>
> diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> index 71458f493cf8..0515dca34d77 100644
> --- a/drivers/virtio/virtio_ring.c
> +++ b/drivers/virtio/virtio_ring.c
> @@ -58,14 +58,15 @@
>   
>   struct vring_desc_state {
>   	void *data;			/* Data for callback. */
> -	struct vring_desc *indir_desc;	/* Indirect descriptor, if any. */
> +	void *indir_desc;		/* Indirect descriptor, if any. */
> +	int num;			/* Descriptor list length. */
>   };
>   
>   struct vring_virtqueue {
>   	struct virtqueue vq;
>   
> -	/* Actual memory layout for this queue */
> -	struct vring vring;
> +	/* Is this a packed ring? */
> +	bool packed;
>   
>   	/* Can we use weak barriers? */
>   	bool weak_barriers;
> @@ -79,19 +80,45 @@ struct vring_virtqueue {
>   	/* Host publishes avail event idx */
>   	bool event;
>   
> -	/* Head of free buffer list. */
> -	unsigned int free_head;
>   	/* Number we've added since last sync. */
>   	unsigned int num_added;
>   
>   	/* Last used index we've seen. */
>   	u16 last_used_idx;
>   
> -	/* Last written value to avail->flags */
> -	u16 avail_flags_shadow;
> +	union {
> +		/* Available for split ring */
> +		struct {
> +			/* Actual memory layout for this queue. */
> +			struct vring vring;
>   
> -	/* Last written value to avail->idx in guest byte order */
> -	u16 avail_idx_shadow;
> +			/* Head of free buffer list. */
> +			unsigned int free_head;
> +
> +			/* Last written value to avail->flags */
> +			u16 avail_flags_shadow;
> +
> +			/* Last written value to avail->idx in
> +			 * guest byte order. */
> +			u16 avail_idx_shadow;
> +		};
> +
> +		/* Available for packed ring */
> +		struct {
> +			/* Actual memory layout for this queue. */
> +			struct vring_packed vring_packed;
> +
> +			/* Driver ring wrap counter. */
> +			u8 wrap_counter;
> +
> +			/* Index of the next avail descriptor. */
> +			unsigned int next_avail_idx;
> +
> +			/* Last written value to driver->flags in
> +			 * guest byte order. */
> +			u16 event_flags_shadow;
> +		};
> +	};
>   
>   	/* How to notify other side. FIXME: commonalize hcalls! */
>   	bool (*notify)(struct virtqueue *vq);
> @@ -201,8 +228,33 @@ static dma_addr_t vring_map_single(const struct vring_virtqueue *vq,
>   			      cpu_addr, size, direction);
>   }
>   
> -static void vring_unmap_one(const struct vring_virtqueue *vq,
> -			    struct vring_desc *desc)
> +static void vring_unmap_one_split(const struct vring_virtqueue *vq,
> +				  struct vring_desc *desc)
> +{
> +	u16 flags;
> +
> +	if (!vring_use_dma_api(vq->vq.vdev))
> +		return;
> +
> +	flags = virtio16_to_cpu(vq->vq.vdev, desc->flags);
> +
> +	if (flags & VRING_DESC_F_INDIRECT) {
> +		dma_unmap_single(vring_dma_dev(vq),
> +				 virtio64_to_cpu(vq->vq.vdev, desc->addr),
> +				 virtio32_to_cpu(vq->vq.vdev, desc->len),
> +				 (flags & VRING_DESC_F_WRITE) ?
> +				 DMA_FROM_DEVICE : DMA_TO_DEVICE);
> +	} else {
> +		dma_unmap_page(vring_dma_dev(vq),
> +			       virtio64_to_cpu(vq->vq.vdev, desc->addr),
> +			       virtio32_to_cpu(vq->vq.vdev, desc->len),
> +			       (flags & VRING_DESC_F_WRITE) ?
> +			       DMA_FROM_DEVICE : DMA_TO_DEVICE);
> +	}
> +}
> +
> +static void vring_unmap_one_packed(const struct vring_virtqueue *vq,
> +				   struct vring_packed_desc *desc)
>   {
>   	u16 flags;
>   
> @@ -235,8 +287,9 @@ static int vring_mapping_error(const struct vring_virtqueue *vq,
>   	return dma_mapping_error(vring_dma_dev(vq), addr);
>   }
>   
> -static struct vring_desc *alloc_indirect(struct virtqueue *_vq,
> -					 unsigned int total_sg, gfp_t gfp)
> +static struct vring_desc *alloc_indirect_split(struct virtqueue *_vq,
> +					       unsigned int total_sg,
> +					       gfp_t gfp)
>   {
>   	struct vring_desc *desc;
>   	unsigned int i;
> @@ -257,14 +310,32 @@ static struct vring_desc *alloc_indirect(struct virtqueue *_vq,
>   	return desc;
>   }
>   
> -static inline int virtqueue_add(struct virtqueue *_vq,
> -				struct scatterlist *sgs[],
> -				unsigned int total_sg,
> -				unsigned int out_sgs,
> -				unsigned int in_sgs,
> -				void *data,
> -				void *ctx,
> -				gfp_t gfp)
> +static struct vring_packed_desc *alloc_indirect_packed(struct virtqueue *_vq,
> +						       unsigned int total_sg,
> +						       gfp_t gfp)
> +{
> +	struct vring_packed_desc *desc;
> +
> +	/*
> +	 * We require lowmem mappings for the descriptors because
> +	 * otherwise virt_to_phys will give us bogus addresses in the
> +	 * virtqueue.
> +	 */
> +	gfp &= ~__GFP_HIGHMEM;
> +
> +	desc = kmalloc(total_sg * sizeof(struct vring_packed_desc), gfp);

Can we simply check vq->packed here to avoid duplicating helpers?
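E.g. the two could collapse into something like this (sketch only, not
compile-tested; the split ring additionally pre-links the chain through
->next, so that part stays conditional):

static void *alloc_indirect(struct virtqueue *_vq, unsigned int total_sg,
			    gfp_t gfp)
{
	struct vring_virtqueue *vq = to_vvq(_vq);
	void *p;

	/* Lowmem mappings are required either way; see the comment in
	 * alloc_indirect_split(). */
	gfp &= ~__GFP_HIGHMEM;

	p = kmalloc(total_sg * (vq->packed ?
				sizeof(struct vring_packed_desc) :
				sizeof(struct vring_desc)), gfp);
	if (!p)
		return NULL;

	if (!vq->packed) {
		struct vring_desc *desc = p;
		unsigned int i;

		for (i = 0; i < total_sg; i++)
			desc[i].next = cpu_to_virtio16(_vq->vdev, i + 1);
	}

	return p;
}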

> +
> +	return desc;
> +}
> +
> +static inline int virtqueue_add_split(struct virtqueue *_vq,
> +				      struct scatterlist *sgs[],
> +				      unsigned int total_sg,
> +				      unsigned int out_sgs,
> +				      unsigned int in_sgs,
> +				      void *data,
> +				      void *ctx,
> +				      gfp_t gfp)
>   {
>   	struct vring_virtqueue *vq = to_vvq(_vq);
>   	struct scatterlist *sg;
> @@ -303,7 +374,7 @@ static inline int virtqueue_add(struct virtqueue *_vq,
>   	/* If the host supports indirect descriptor tables, and we have multiple
>   	 * buffers, then go indirect. FIXME: tune this threshold */
>   	if (vq->indirect && total_sg > 1 && vq->vq.num_free)
> -		desc = alloc_indirect(_vq, total_sg, gfp);
> +		desc = alloc_indirect_split(_vq, total_sg, gfp);
>   	else {
>   		desc = NULL;
>   		WARN_ON_ONCE(total_sg > vq->vring.num && !vq->indirect);
> @@ -424,7 +495,7 @@ static inline int virtqueue_add(struct virtqueue *_vq,
>   	for (n = 0; n < total_sg; n++) {
>   		if (i == err_idx)
>   			break;
> -		vring_unmap_one(vq, &desc[i]);
> +		vring_unmap_one_split(vq, &desc[i]);
>   		i = virtio16_to_cpu(_vq->vdev, vq->vring.desc[i].next);
>   	}
>   
> @@ -435,6 +506,210 @@ static inline int virtqueue_add(struct virtqueue *_vq,
>   	return -EIO;
>   }
>   
> +static inline int virtqueue_add_packed(struct virtqueue *_vq,
> +				       struct scatterlist *sgs[],
> +				       unsigned int total_sg,
> +				       unsigned int out_sgs,
> +				       unsigned int in_sgs,
> +				       void *data,
> +				       void *ctx,
> +				       gfp_t gfp)
> +{
> +	struct vring_virtqueue *vq = to_vvq(_vq);
> +	struct vring_packed_desc *desc;
> +	struct scatterlist *sg;
> +	unsigned int i, n, descs_used, uninitialized_var(prev), err_idx;
> +	__virtio16 uninitialized_var(head_flags), flags;
> +	int head, wrap_counter;
> +	bool indirect;
> +
> +	START_USE(vq);
> +
> +	BUG_ON(data == NULL);
> +	BUG_ON(ctx && vq->indirect);
> +
> +	if (unlikely(vq->broken)) {
> +		END_USE(vq);
> +		return -EIO;
> +	}
> +
> +#ifdef DEBUG
> +	{
> +		ktime_t now = ktime_get();
> +
> +		/* No kick or get, with .1 second between?  Warn. */
> +		if (vq->last_add_time_valid)
> +			WARN_ON(ktime_to_ms(ktime_sub(now, vq->last_add_time))
> +					    > 100);
> +		vq->last_add_time = now;
> +		vq->last_add_time_valid = true;
> +	}
> +#endif
> +
> +	BUG_ON(total_sg == 0);
> +
> +	head = vq->next_avail_idx;
> +	wrap_counter = vq->wrap_counter;
> +
> +	/* If the host supports indirect descriptor tables, and we have multiple
> +	 * buffers, then go indirect. FIXME: tune this threshold */
> +	if (vq->indirect && total_sg > 1 && vq->vq.num_free)

Let's introduce a helper like virtqueue_need_indirect() to avoid 
duplicating the code and the FIXME.
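Something like this, perhaps (sketch; the helper name is made up):

/* FIXME: tune this threshold */
static inline bool virtqueue_need_indirect(struct vring_virtqueue *vq,
					   unsigned int total_sg)
{
	return vq->indirect && total_sg > 1 && vq->vq.num_free;
}

Both virtqueue_add_split() and virtqueue_add_packed() could then call it
in place of the open-coded condition, keeping the FIXME in one place.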

> +		desc = alloc_indirect_packed(_vq, total_sg, gfp);
> +	else {
> +		desc = NULL;
> +		WARN_ON_ONCE(total_sg > vq->vring_packed.num && !vq->indirect);
> +	}
> +
> +	if (desc) {
> +		/* Use a single buffer which doesn't continue */
> +		indirect = true;
> +		/* Set up rest to use this indirect table. */
> +		i = 0;
> +		descs_used = 1;
> +	} else {
> +		indirect = false;
> +		desc = vq->vring_packed.desc;
> +		i = head;
> +		descs_used = total_sg;
> +	}
> +
> +	if (vq->vq.num_free < descs_used) {
> +		pr_debug("Can't add buf len %i - avail = %i\n",
> +			 descs_used, vq->vq.num_free);
> +		/* FIXME: for historical reasons, we force a notify here if
> +		 * there are outgoing parts to the buffer.  Presumably the
> +		 * host should service the ring ASAP. */
> +		if (out_sgs)
> +			vq->notify(&vq->vq);
> +		if (indirect)
> +			kfree(desc);
> +		END_USE(vq);
> +		return -ENOSPC;
> +	}
> +
> +	for (n = 0; n < out_sgs + in_sgs; n++) {
> +		for (sg = sgs[n]; sg; sg = sg_next(sg)) {
> +			dma_addr_t addr = vring_map_one_sg(vq, sg, n < out_sgs ?
> +						DMA_TO_DEVICE : DMA_FROM_DEVICE);
> +			if (vring_mapping_error(vq, addr))
> +				goto unmap_release;
> +
> +			flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_NEXT |
> +					(n < out_sgs ? 0 : VRING_DESC_F_WRITE) |
> +					VRING_DESC_F_AVAIL(vq->wrap_counter) |
> +					VRING_DESC_F_USED(!vq->wrap_counter));
> +			if (!indirect && i == head)
> +				head_flags = flags;
> +			else
> +				desc[i].flags = flags;
> +
> +			desc[i].addr = cpu_to_virtio64(_vq->vdev, addr);
> +			desc[i].len = cpu_to_virtio32(_vq->vdev, sg->length);
> +			desc[i].id = cpu_to_virtio32(_vq->vdev, head);

Similar to V1, we only need this for the last descriptor.
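I.e. the store could move out of the loop and be done once, e.g. (sketch,
reusing the loop's final value of prev):

	desc[prev].id = cpu_to_virtio32(_vq->vdev, head);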

> +			prev = i;

It looks to me like there's no need to track prev inside the loop here.

> +			i++;
> +			if (!indirect && i >= vq->vring_packed.num) {
> +				i = 0;
> +				vq->wrap_counter ^= 1;
> +			}
> +		}
> +	}
> +	/* Last one doesn't continue. */
> +	if (total_sg == 1)
> +		head_flags &= cpu_to_virtio16(_vq->vdev, ~VRING_DESC_F_NEXT);
> +	else
> +		desc[prev].flags &= cpu_to_virtio16(_vq->vdev, ~VRING_DESC_F_NEXT);

The only case where prev != i - 1 is when i == 0, so we can add an if here.
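E.g. (sketch, not tested; the indirect table never wraps, so i == 0 can
only happen on the ring itself):

	if (total_sg == 1) {
		head_flags &= cpu_to_virtio16(_vq->vdev,
					      ~VRING_DESC_F_NEXT);
	} else {
		unsigned int last = i ? i - 1 : vq->vring_packed.num - 1;

		desc[last].flags &= cpu_to_virtio16(_vq->vdev,
						    ~VRING_DESC_F_NEXT);
	}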

> +
> +	if (indirect) {
> +		/* Now that the indirect table is filled in, map it. */
> +		dma_addr_t addr = vring_map_single(
> +			vq, desc, total_sg * sizeof(struct vring_packed_desc),
> +			DMA_TO_DEVICE);
> +		if (vring_mapping_error(vq, addr))
> +			goto unmap_release;
> +
> +		head_flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_INDIRECT |
> +					     VRING_DESC_F_AVAIL(wrap_counter) |
> +					     VRING_DESC_F_USED(!wrap_counter));
> +		vq->vring_packed.desc[head].addr = cpu_to_virtio64(_vq->vdev, addr);
> +		vq->vring_packed.desc[head].len = cpu_to_virtio32(_vq->vdev,
> +				total_sg * sizeof(struct vring_packed_desc));
> +		vq->vring_packed.desc[head].id = cpu_to_virtio32(_vq->vdev, head);
> +	}
> +
> +	/* We're using some buffers from the free list. */
> +	vq->vq.num_free -= descs_used;
> +
> +	/* Update free pointer */
> +	if (indirect) {
> +		n = head + 1;
> +		if (n >= vq->vring_packed.num) {
> +			n = 0;
> +			vq->wrap_counter ^= 1;
> +		}
> +		vq->next_avail_idx = n;
> +	} else
> +		vq->next_avail_idx = i;
> +
> +	/* Store token and indirect buffer state. */
> +	vq->desc_state[head].num = descs_used;
> +	vq->desc_state[head].data = data;
> +	if (indirect)
> +		vq->desc_state[head].indir_desc = desc;
> +	else
> +		vq->desc_state[head].indir_desc = ctx;
> +
> +	/* A driver MUST NOT make the first descriptor in the list
> +	 * available before all subsequent descriptors comprising
> +	 * the list are made available. */
> +	virtio_wmb(vq->weak_barriers);
> +	vq->vring_packed.desc[head].flags = head_flags;
> +	vq->num_added++;
> +
> +	pr_debug("Added buffer head %i to %p\n", head, vq);
> +	END_USE(vq);
> +
> +	return 0;
> +
> +unmap_release:
> +	err_idx = i;
> +	i = head;
> +
> +	for (n = 0; n < total_sg; n++) {
> +		if (i == err_idx)
> +			break;
> +		vring_unmap_one_packed(vq, &desc[i]);
> +		i++;
> +		if (!indirect && i >= vq->vring_packed.num)
> +			i = 0;
> +	}
> +
> +	vq->wrap_counter = wrap_counter;
> +
> +	if (indirect)
> +		kfree(desc);
> +
> +	END_USE(vq);
> +	return -EIO;
> +}
> +
> +static inline int virtqueue_add(struct virtqueue *_vq,
> +				struct scatterlist *sgs[],
> +				unsigned int total_sg,
> +				unsigned int out_sgs,
> +				unsigned int in_sgs,
> +				void *data,
> +				void *ctx,
> +				gfp_t gfp)
> +{
> +	struct vring_virtqueue *vq = to_vvq(_vq);
> +
> +	return vq->packed ? virtqueue_add_packed(_vq, sgs, total_sg, out_sgs,
> +						 in_sgs, data, ctx, gfp) :
> +			    virtqueue_add_split(_vq, sgs, total_sg, out_sgs,
> +						in_sgs, data, ctx, gfp);
> +}
> +
>   /**
>    * virtqueue_add_sgs - expose buffers to other end
>    * @vq: the struct virtqueue we're talking about.
> @@ -537,18 +812,7 @@ int virtqueue_add_inbuf_ctx(struct virtqueue *vq,
>   }
>   EXPORT_SYMBOL_GPL(virtqueue_add_inbuf_ctx);
>   
> -/**
> - * virtqueue_kick_prepare - first half of split virtqueue_kick call.
> - * @vq: the struct virtqueue
> - *
> - * Instead of virtqueue_kick(), you can do:
> - *	if (virtqueue_kick_prepare(vq))
> - *		virtqueue_notify(vq);
> - *
> - * This is sometimes useful because the virtqueue_kick_prepare() needs
> - * to be serialized, but the actual virtqueue_notify() call does not.
> - */
> -bool virtqueue_kick_prepare(struct virtqueue *_vq)
> +static bool virtqueue_kick_prepare_split(struct virtqueue *_vq)
>   {
>   	struct vring_virtqueue *vq = to_vvq(_vq);
>   	u16 new, old;
> @@ -580,6 +844,62 @@ bool virtqueue_kick_prepare(struct virtqueue *_vq)
>   	END_USE(vq);
>   	return needs_kick;
>   }
> +
> +static bool virtqueue_kick_prepare_packed(struct virtqueue *_vq)
> +{
> +	struct vring_virtqueue *vq = to_vvq(_vq);
> +	u16 new, old, off_wrap;
> +	bool needs_kick;
> +
> +	START_USE(vq);
> +	/* We need to expose the new flags value before checking notification
> +	 * suppressions. */
> +	virtio_mb(vq->weak_barriers);
> +
> +	old = vq->next_avail_idx - vq->num_added;
> +	new = vq->next_avail_idx;
> +	vq->num_added = 0;
> +
> +#ifdef DEBUG
> +	if (vq->last_add_time_valid) {
> +		WARN_ON(ktime_to_ms(ktime_sub(ktime_get(),
> +					      vq->last_add_time)) > 100);
> +	}
> +	vq->last_add_time_valid = false;
> +#endif
> +
> +	off_wrap = virtio16_to_cpu(_vq->vdev, vq->vring_packed.device->off_wrap);
> +
> +	if (vq->event) {

It looks to me we should examine RING_EVENT_FLAGS_DESC in 
desc_event_flags instead of vq->event here. The spec does not force 
the driver to use event_off and event_wrap if event index is enabled.
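
E.g. something along these lines (an untested fragment that reuses the
patch's own vring_need_event() check; the FIXME quoted below still
applies):

	flags = virtio16_to_cpu(_vq->vdev,
				vq->vring_packed.device->flags);
	if (flags == VRING_EVENT_F_DESC) {
		off_wrap = virtio16_to_cpu(_vq->vdev,
				vq->vring_packed.device->off_wrap);
		needs_kick = ((off_wrap >> 15) == vq->wrap_counter) &&
			     vring_need_event(off_wrap & ~(1 << 15),
					      new, old);
	} else {
		needs_kick = (flags != VRING_EVENT_F_DISABLE);
	}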

> +		// FIXME: fix this!
> +		needs_kick = ((off_wrap >> 15) == vq->wrap_counter) &&
> +			     vring_need_event(off_wrap & ~(1<<15), new, old);

Why is the & needed here?

> +	} else {

Need a smp_rmb() to make sure desc_event_flags was checked before flags.

> +		needs_kick = (vq->vring_packed.device->flags !=
> +			      cpu_to_virtio16(_vq->vdev, VRING_EVENT_F_DISABLE));
> +	}
> +	END_USE(vq);
> +	return needs_kick;
> +}
> +
> +/**
> + * virtqueue_kick_prepare - first half of split virtqueue_kick call.
> + * @vq: the struct virtqueue
> + *
> + * Instead of virtqueue_kick(), you can do:
> + *	if (virtqueue_kick_prepare(vq))
> + *		virtqueue_notify(vq);
> + *
> + * This is sometimes useful because the virtqueue_kick_prepare() needs
> + * to be serialized, but the actual virtqueue_notify() call does not.
> + */
> +bool virtqueue_kick_prepare(struct virtqueue *_vq)
> +{
> +	struct vring_virtqueue *vq = to_vvq(_vq);
> +
> +	return vq->packed ? virtqueue_kick_prepare_packed(_vq) :
> +			    virtqueue_kick_prepare_split(_vq);
> +}
>   EXPORT_SYMBOL_GPL(virtqueue_kick_prepare);
>   
>   /**
> @@ -626,8 +946,8 @@ bool virtqueue_kick(struct virtqueue *vq)
>   }
>   EXPORT_SYMBOL_GPL(virtqueue_kick);
>   
> -static void detach_buf(struct vring_virtqueue *vq, unsigned int head,
> -		       void **ctx)
> +static void detach_buf_split(struct vring_virtqueue *vq, unsigned int head,
> +			     void **ctx)
>   {
>   	unsigned int i, j;
>   	__virtio16 nextflag = cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_NEXT);
> @@ -639,12 +959,12 @@ static void detach_buf(struct vring_virtqueue *vq, unsigned int head,
>   	i = head;
>   
>   	while (vq->vring.desc[i].flags & nextflag) {
> -		vring_unmap_one(vq, &vq->vring.desc[i]);
> +		vring_unmap_one_split(vq, &vq->vring.desc[i]);
>   		i = virtio16_to_cpu(vq->vq.vdev, vq->vring.desc[i].next);
>   		vq->vq.num_free++;
>   	}
>   
> -	vring_unmap_one(vq, &vq->vring.desc[i]);
> +	vring_unmap_one_split(vq, &vq->vring.desc[i]);
>   	vq->vring.desc[i].next = cpu_to_virtio16(vq->vq.vdev, vq->free_head);
>   	vq->free_head = head;
>   
> @@ -666,7 +986,7 @@ static void detach_buf(struct vring_virtqueue *vq, unsigned int head,
>   		BUG_ON(len == 0 || len % sizeof(struct vring_desc));
>   
>   		for (j = 0; j < len / sizeof(struct vring_desc); j++)
> -			vring_unmap_one(vq, &indir_desc[j]);
> +			vring_unmap_one_split(vq, &indir_desc[j]);
>   
>   		kfree(indir_desc);
>   		vq->desc_state[head].indir_desc = NULL;
> @@ -675,11 +995,207 @@ static void detach_buf(struct vring_virtqueue *vq, unsigned int head,
>   	}
>   }
>   
> -static inline bool more_used(const struct vring_virtqueue *vq)
> +static int detach_buf_packed(struct vring_virtqueue *vq, unsigned int head,
> +			      void **ctx)
> +{
> +	struct vring_packed_desc *desc;
> +	unsigned int i, j;
> +
> +	/* Clear data ptr. */
> +	vq->desc_state[head].data = NULL;
> +
> +	i = head;
> +
> +	for (j = 0; j < vq->desc_state[head].num; j++) {
> +		desc = &vq->vring_packed.desc[i];
> +		vring_unmap_one_packed(vq, desc);
> +		desc->flags = 0x0;

Looks like this is unnecessary.

> +		i++;
> +		if (i >= vq->vring_packed.num)
> +			i = 0;
> +	}
> +
> +	vq->vq.num_free += vq->desc_state[head].num;
> +
> +	if (vq->indirect) {
> +		u32 len;
> +
> +		desc = vq->desc_state[head].indir_desc;
> +		/* Free the indirect table, if any, now that it's unmapped. */
> +		if (!desc)
> +			goto out;
> +
> +		len = virtio32_to_cpu(vq->vq.vdev,
> +				      vq->vring_packed.desc[head].len);
> +
> +		BUG_ON(!(vq->vring_packed.desc[head].flags &
> +			 cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_INDIRECT)));
> +		BUG_ON(len == 0 || len % sizeof(struct vring_packed_desc));
> +
> +		for (j = 0; j < len / sizeof(struct vring_packed_desc); j++)
> +			vring_unmap_one_packed(vq, &desc[j]);
> +
> +		kfree(desc);
> +		vq->desc_state[head].indir_desc = NULL;
> +	} else if (ctx) {
> +		*ctx = vq->desc_state[head].indir_desc;
> +	}
> +
> +out:
> +	return vq->desc_state[head].num;
> +}
> +
> +static inline bool more_used_split(const struct vring_virtqueue *vq)
>   {
>   	return vq->last_used_idx != virtio16_to_cpu(vq->vq.vdev, vq->vring.used->idx);
>   }
>   
> +static inline bool more_used_packed(const struct vring_virtqueue *vq)
> +{
> +	u16 last_used, flags;
> +	bool avail, used;
> +
> +	if (vq->vq.num_free == vq->vring_packed.num)
> +		return false;
> +
> +	last_used = vq->last_used_idx;
> +	flags = virtio16_to_cpu(vq->vq.vdev,
> +				vq->vring_packed.desc[last_used].flags);
> +	avail = flags & VRING_DESC_F_AVAIL(1);
> +	used = flags & VRING_DESC_F_USED(1);
> +
> +	return avail == used;
> +}
> +
> +static inline bool more_used(const struct vring_virtqueue *vq)
> +{
> +	return vq->packed ? more_used_packed(vq) : more_used_split(vq);
> +}
> +
> +void *virtqueue_get_buf_ctx_split(struct virtqueue *_vq, unsigned int *len,
> +				  void **ctx)
> +{
> +	struct vring_virtqueue *vq = to_vvq(_vq);
> +	void *ret;
> +	unsigned int i;
> +	u16 last_used;
> +
> +	START_USE(vq);
> +
> +	if (unlikely(vq->broken)) {
> +		END_USE(vq);
> +		return NULL;
> +	}
> +
> +	if (!more_used(vq)) {
> +		pr_debug("No more buffers in queue\n");
> +		END_USE(vq);
> +		return NULL;
> +	}
> +
> +	/* Only get used array entries after they have been exposed by host. */
> +	virtio_rmb(vq->weak_barriers);
> +
> +	last_used = (vq->last_used_idx & (vq->vring.num - 1));
> +	i = virtio32_to_cpu(_vq->vdev, vq->vring.used->ring[last_used].id);
> +	*len = virtio32_to_cpu(_vq->vdev, vq->vring.used->ring[last_used].len);
> +
> +	if (unlikely(i >= vq->vring.num)) {
> +		BAD_RING(vq, "id %u out of range\n", i);
> +		return NULL;
> +	}
> +	if (unlikely(!vq->desc_state[i].data)) {
> +		BAD_RING(vq, "id %u is not a head!\n", i);
> +		return NULL;
> +	}
> +
> +	/* detach_buf_split clears data, so grab it now. */
> +	ret = vq->desc_state[i].data;
> +	detach_buf_split(vq, i, ctx);
> +	vq->last_used_idx++;
> +	/* If we expect an interrupt for the next entry, tell host
> +	 * by writing event index and flush out the write before
> +	 * the read in the next get_buf call. */
> +	if (!(vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT))
> +		virtio_store_mb(vq->weak_barriers,
> +				&vring_used_event(&vq->vring),
> +				cpu_to_virtio16(_vq->vdev, vq->last_used_idx));
> +
> +#ifdef DEBUG
> +	vq->last_add_time_valid = false;
> +#endif
> +
> +	END_USE(vq);
> +	return ret;
> +}
> +
> +void *virtqueue_get_buf_ctx_packed(struct virtqueue *_vq, unsigned int *len,
> +				   void **ctx)
> +{
> +	struct vring_virtqueue *vq = to_vvq(_vq);
> +	uint16_t wrap_counter;
> +	void *ret;
> +	unsigned int i;
> +	u16 last_used;
> +
> +	START_USE(vq);
> +
> +	if (unlikely(vq->broken)) {
> +		END_USE(vq);
> +		return NULL;
> +	}
> +
> +	if (!more_used(vq)) {
> +		pr_debug("No more buffers in queue\n");
> +		END_USE(vq);
> +		return NULL;
> +	}
> +
> +	/* Only get used elements after they have been exposed by host. */
> +	virtio_rmb(vq->weak_barriers);
> +
> +	last_used = vq->last_used_idx;
> +	i = virtio32_to_cpu(_vq->vdev, vq->vring_packed.desc[last_used].id);
> +	*len = virtio32_to_cpu(_vq->vdev, vq->vring_packed.desc[last_used].len);
> +
> +	if (unlikely(i >= vq->vring_packed.num)) {
> +		BAD_RING(vq, "id %u out of range\n", i);
> +		return NULL;
> +	}
> +	if (unlikely(!vq->desc_state[i].data)) {
> +		BAD_RING(vq, "id %u is not a head!\n", i);
> +		return NULL;
> +	}
> +
> +	/* detach_buf_packed clears data, so grab it now. */
> +	ret = vq->desc_state[i].data;
> +	detach_buf_packed(vq, i, ctx);
> +
> +	vq->last_used_idx += vq->desc_state[i].num;
> +	if (vq->last_used_idx >= vq->vring_packed.num)
> +		vq->last_used_idx -= vq->vring_packed.num;
> +
> +	wrap_counter = vq->wrap_counter;
> +	if (vq->last_used_idx > vq->next_avail_idx)
> +		wrap_counter ^= 1;
> +
> +	/* If we expect an interrupt for the next entry, tell host
> +	 * by writing event index and flush out the write before
> +	 * the read in the next get_buf call. */
> +	if (vq->event_flags_shadow == VRING_EVENT_F_DESC)
> +		virtio_store_mb(vq->weak_barriers,
> +				&vq->vring_packed.driver->off_wrap,
> +				cpu_to_virtio16(_vq->vdev, vq->last_used_idx |
> +						wrap_counter << 15));
> +
> +#ifdef DEBUG
> +	vq->last_add_time_valid = false;
> +#endif
> +
> +	END_USE(vq);
> +	return ret;
> +}
> +
>   /**
>    * virtqueue_get_buf - get the next used buffer
>    * @vq: the struct virtqueue we're talking about.
> @@ -700,57 +1216,9 @@ void *virtqueue_get_buf_ctx(struct virtqueue *_vq, unsigned int *len,
>   			    void **ctx)
>   {
>   	struct vring_virtqueue *vq = to_vvq(_vq);
> -	void *ret;
> -	unsigned int i;
> -	u16 last_used;
>   
> -	START_USE(vq);
> -
> -	if (unlikely(vq->broken)) {
> -		END_USE(vq);
> -		return NULL;
> -	}
> -
> -	if (!more_used(vq)) {
> -		pr_debug("No more buffers in queue\n");
> -		END_USE(vq);
> -		return NULL;
> -	}
> -
> -	/* Only get used array entries after they have been exposed by host. */
> -	virtio_rmb(vq->weak_barriers);
> -
> -	last_used = (vq->last_used_idx & (vq->vring.num - 1));
> -	i = virtio32_to_cpu(_vq->vdev, vq->vring.used->ring[last_used].id);
> -	*len = virtio32_to_cpu(_vq->vdev, vq->vring.used->ring[last_used].len);
> -
> -	if (unlikely(i >= vq->vring.num)) {
> -		BAD_RING(vq, "id %u out of range\n", i);
> -		return NULL;
> -	}
> -	if (unlikely(!vq->desc_state[i].data)) {
> -		BAD_RING(vq, "id %u is not a head!\n", i);
> -		return NULL;
> -	}
> -
> -	/* detach_buf clears data, so grab it now. */
> -	ret = vq->desc_state[i].data;
> -	detach_buf(vq, i, ctx);
> -	vq->last_used_idx++;
> -	/* If we expect an interrupt for the next entry, tell host
> -	 * by writing event index and flush out the write before
> -	 * the read in the next get_buf call. */
> -	if (!(vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT))
> -		virtio_store_mb(vq->weak_barriers,
> -				&vring_used_event(&vq->vring),
> -				cpu_to_virtio16(_vq->vdev, vq->last_used_idx));
> -
> -#ifdef DEBUG
> -	vq->last_add_time_valid = false;
> -#endif
> -
> -	END_USE(vq);
> -	return ret;
> +	return vq->packed ? virtqueue_get_buf_ctx_packed(_vq, len, ctx) :
> +			    virtqueue_get_buf_ctx_split(_vq, len, ctx);
>   }
>   EXPORT_SYMBOL_GPL(virtqueue_get_buf_ctx);
>   
> @@ -759,6 +1227,29 @@ void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len)
>   	return virtqueue_get_buf_ctx(_vq, len, NULL);
>   }
>   EXPORT_SYMBOL_GPL(virtqueue_get_buf);
> +
> +static void virtqueue_disable_cb_split(struct virtqueue *_vq)
> +{
> +	struct vring_virtqueue *vq = to_vvq(_vq);
> +
> +	if (!(vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT)) {
> +		vq->avail_flags_shadow |= VRING_AVAIL_F_NO_INTERRUPT;
> +		if (!vq->event)
> +			vq->vring.avail->flags = cpu_to_virtio16(_vq->vdev, vq->avail_flags_shadow);
> +	}
> +}
> +
> +static void virtqueue_disable_cb_packed(struct virtqueue *_vq)
> +{
> +	struct vring_virtqueue *vq = to_vvq(_vq);
> +
> +	if (vq->event_flags_shadow != VRING_EVENT_F_DISABLE) {
> +		vq->event_flags_shadow = VRING_EVENT_F_DISABLE;
> +		vq->vring_packed.driver->flags = cpu_to_virtio16(_vq->vdev,
> +							vq->event_flags_shadow);
> +	}
> +}
> +
>   /**
>    * virtqueue_disable_cb - disable callbacks
>    * @vq: the struct virtqueue we're talking about.
> @@ -772,15 +1263,66 @@ void virtqueue_disable_cb(struct virtqueue *_vq)
>   {
>   	struct vring_virtqueue *vq = to_vvq(_vq);
>   
> -	if (!(vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT)) {
> -		vq->avail_flags_shadow |= VRING_AVAIL_F_NO_INTERRUPT;
> -		if (!vq->event)
> -			vq->vring.avail->flags = cpu_to_virtio16(_vq->vdev, vq->avail_flags_shadow);
> -	}
> -
> +	if (vq->packed)
> +		virtqueue_disable_cb_packed(_vq);
> +	else
> +		virtqueue_disable_cb_split(_vq);
>   }
>   EXPORT_SYMBOL_GPL(virtqueue_disable_cb);
>   
> +static unsigned virtqueue_enable_cb_prepare_split(struct virtqueue *_vq)
> +{
> +	struct vring_virtqueue *vq = to_vvq(_vq);
> +	u16 last_used_idx;
> +
> +	START_USE(vq);
> +
> +	/* We optimistically turn back on interrupts, then check if there was
> +	 * more to do. */
> +	/* Depending on the VIRTIO_RING_F_EVENT_IDX feature, we need to
> +	 * either clear the flags bit or point the event index at the next
> +	 * entry. Always do both to keep code simple. */
> +	if (vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT) {
> +		vq->avail_flags_shadow &= ~VRING_AVAIL_F_NO_INTERRUPT;
> +		if (!vq->event)
> +			vq->vring.avail->flags = cpu_to_virtio16(_vq->vdev, vq->avail_flags_shadow);
> +	}
> +	vring_used_event(&vq->vring) = cpu_to_virtio16(_vq->vdev, last_used_idx = vq->last_used_idx);
> +	END_USE(vq);
> +	return last_used_idx;
> +}
> +
> +static unsigned virtqueue_enable_cb_prepare_packed(struct virtqueue *_vq)
> +{
> +	struct vring_virtqueue *vq = to_vvq(_vq);
> +	u16 last_used_idx, wrap_counter, off_wrap;
> +
> +	START_USE(vq);
> +
> +	last_used_idx = vq->last_used_idx;
> +	wrap_counter = vq->wrap_counter;
> +
> +	if (last_used_idx > vq->next_avail_idx)
> +		wrap_counter ^= 1;
> +
> +	off_wrap = last_used_idx | (wrap_counter << 15);
> +
> +	/* We optimistically turn back on interrupts, then check if there was
> +	 * more to do. */
> +	/* Depending on the VIRTIO_RING_F_EVENT_IDX feature, we need to
> +	 * either clear the flags bit or point the event index at the next
> +	 * entry. Always do both to keep code simple. */
> +	if (vq->event_flags_shadow == VRING_EVENT_F_DISABLE) {
> +		vq->event_flags_shadow = vq->event ? VRING_EVENT_F_DESC:
> +						     VRING_EVENT_F_ENABLE;
> +		vq->vring_packed.driver->flags = cpu_to_virtio16(_vq->vdev,
> +							vq->event_flags_shadow);
> +	}

Is an smp_wmb() missing here?

> +	vq->vring_packed.driver->off_wrap = cpu_to_virtio16(_vq->vdev, off_wrap);

And according to the spec, it looks to me that writing 
VRING_EVENT_F_ENABLE is sufficient here.
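
I.e. presumably just (an untested sketch following the comment above;
with no off_wrap write on this path, no extra barrier would be needed
either):

	if (vq->event_flags_shadow == VRING_EVENT_F_DISABLE) {
		vq->event_flags_shadow = VRING_EVENT_F_ENABLE;
		vq->vring_packed.driver->flags = cpu_to_virtio16(_vq->vdev,
						vq->event_flags_shadow);
	}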

> +	END_USE(vq);
> +	return last_used_idx;
> +}
> +
>   /**
>    * virtqueue_enable_cb_prepare - restart callbacks after disable_cb
>    * @vq: the struct virtqueue we're talking about.
> @@ -796,26 +1338,34 @@ EXPORT_SYMBOL_GPL(virtqueue_disable_cb);
>   unsigned virtqueue_enable_cb_prepare(struct virtqueue *_vq)
>   {
>   	struct vring_virtqueue *vq = to_vvq(_vq);
> -	u16 last_used_idx;
>   
> -	START_USE(vq);
> -
> -	/* We optimistically turn back on interrupts, then check if there was
> -	 * more to do. */
> -	/* Depending on the VIRTIO_RING_F_EVENT_IDX feature, we need to
> -	 * either clear the flags bit or point the event index at the next
> -	 * entry. Always do both to keep code simple. */
> -	if (vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT) {
> -		vq->avail_flags_shadow &= ~VRING_AVAIL_F_NO_INTERRUPT;
> -		if (!vq->event)
> -			vq->vring.avail->flags = cpu_to_virtio16(_vq->vdev, vq->avail_flags_shadow);
> -	}
> -	vring_used_event(&vq->vring) = cpu_to_virtio16(_vq->vdev, last_used_idx = vq->last_used_idx);
> -	END_USE(vq);
> -	return last_used_idx;
> +	return vq->packed ? virtqueue_enable_cb_prepare_packed(_vq) :
> +			    virtqueue_enable_cb_prepare_split(_vq);
>   }
>   EXPORT_SYMBOL_GPL(virtqueue_enable_cb_prepare);
>   
> +static bool virtqueue_poll_split(struct virtqueue *_vq, unsigned last_used_idx)
> +{
> +	struct vring_virtqueue *vq = to_vvq(_vq);
> +
> +	virtio_mb(vq->weak_barriers);
> +	return (u16)last_used_idx != virtio16_to_cpu(_vq->vdev, vq->vring.used->idx);
> +}
> +
> +static bool virtqueue_poll_packed(struct virtqueue *_vq, unsigned last_used_idx)
> +{
> +	struct vring_virtqueue *vq = to_vvq(_vq);
> +	bool avail, used;
> +	u16 flags;
> +
> +	virtio_mb(vq->weak_barriers);
> +	flags = virtio16_to_cpu(vq->vq.vdev,
> +			vq->vring_packed.desc[last_used_idx].flags);
> +	avail = flags & VRING_DESC_F_AVAIL(1);
> +	used = flags & VRING_DESC_F_USED(1);
> +	return avail == used;
> +}
> +
>   /**
>    * virtqueue_poll - query pending used buffers
>    * @vq: the struct virtqueue we're talking about.
> @@ -829,8 +1379,8 @@ bool virtqueue_poll(struct virtqueue *_vq, unsigned last_used_idx)
>   {
>   	struct vring_virtqueue *vq = to_vvq(_vq);
>   
> -	virtio_mb(vq->weak_barriers);
> -	return (u16)last_used_idx != virtio16_to_cpu(_vq->vdev, vq->vring.used->idx);
> +	return vq->packed ? virtqueue_poll_packed(_vq, last_used_idx) :
> +			    virtqueue_poll_split(_vq, last_used_idx);
>   }
>   EXPORT_SYMBOL_GPL(virtqueue_poll);
>   
> @@ -852,6 +1402,83 @@ bool virtqueue_enable_cb(struct virtqueue *_vq)
>   }
>   EXPORT_SYMBOL_GPL(virtqueue_enable_cb);
>   
> +static bool virtqueue_enable_cb_delayed_split(struct virtqueue *_vq)
> +{
> +	struct vring_virtqueue *vq = to_vvq(_vq);
> +	u16 bufs;
> +
> +	START_USE(vq);
> +
> +	/* We optimistically turn back on interrupts, then check if there was
> +	 * more to do. */
> +	/* Depending on the VIRTIO_RING_F_USED_EVENT_IDX feature, we need to
> +	 * either clear the flags bit or point the event index at the next
> +	 * entry. Always update the event index to keep code simple. */
> +	if (vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT) {
> +		vq->avail_flags_shadow &= ~VRING_AVAIL_F_NO_INTERRUPT;
> +		if (!vq->event)
> +			vq->vring.avail->flags = cpu_to_virtio16(_vq->vdev, vq->avail_flags_shadow);
> +	}
> +	/* TODO: tune this threshold */
> +	bufs = (u16)(vq->avail_idx_shadow - vq->last_used_idx) * 3 / 4;
> +
> +	virtio_store_mb(vq->weak_barriers,
> +			&vring_used_event(&vq->vring),
> +			cpu_to_virtio16(_vq->vdev, vq->last_used_idx + bufs));
> +
> +	if (unlikely((u16)(virtio16_to_cpu(_vq->vdev, vq->vring.used->idx) - vq->last_used_idx) > bufs)) {
> +		END_USE(vq);
> +		return false;
> +	}
> +
> +	END_USE(vq);
> +	return true;
> +}
> +
> +static bool virtqueue_enable_cb_delayed_packed(struct virtqueue *_vq)
> +{
> +	struct vring_virtqueue *vq = to_vvq(_vq);
> +	u16 bufs, off_wrap, used_idx, wrap_counter;
> +
> +	START_USE(vq);
> +
> +	/* We optimistically turn back on interrupts, then check if there was
> +	 * more to do. */
> +	/* Depending on the VIRTIO_RING_F_USED_EVENT_IDX feature, we need to
> +	 * either clear the flags bit or point the event index at the next
> +	 * entry. Always update the event index to keep code simple. */
> +	if (vq->event_flags_shadow == VRING_EVENT_F_DISABLE) {
> +		vq->event_flags_shadow = vq->event ? VRING_EVENT_F_DESC:
> +						     VRING_EVENT_F_ENABLE;
> +		vq->vring_packed.driver->flags = cpu_to_virtio16(_vq->vdev,
> +							vq->event_flags_shadow);
> +	}
> +
> +	/* TODO: tune this threshold */
> +	bufs = (u16)(vq->next_avail_idx - vq->last_used_idx) * 3 / 4;
> +
> +	used_idx = vq->last_used_idx + bufs;
> +	if (used_idx >= vq->vring_packed.num)
> +		used_idx -= vq->vring_packed.num;
> +
> +	wrap_counter = vq->wrap_counter;
> +	if (used_idx > vq->next_avail_idx)
> +		wrap_counter ^= 1;
> +
> +	off_wrap = used_idx | (wrap_counter << 15);
> +
> +	virtio_store_mb(vq->weak_barriers, &vq->vring_packed.driver->off_wrap,
> +			cpu_to_virtio16(_vq->vdev, off_wrap));
> +
> +	if (more_used_packed(vq)) {
> +		END_USE(vq);
> +		return false;
> +	}
> +
> +	END_USE(vq);
> +	return true;
> +}
> +
>   /**
>    * virtqueue_enable_cb_delayed - restart callbacks after disable_cb.
>    * @vq: the struct virtqueue we're talking about.
> @@ -868,37 +1495,69 @@ EXPORT_SYMBOL_GPL(virtqueue_enable_cb);
>   bool virtqueue_enable_cb_delayed(struct virtqueue *_vq)
>   {
>   	struct vring_virtqueue *vq = to_vvq(_vq);
> -	u16 bufs;
>   
> -	START_USE(vq);
> -
> -	/* We optimistically turn back on interrupts, then check if there was
> -	 * more to do. */
> -	/* Depending on the VIRTIO_RING_F_USED_EVENT_IDX feature, we need to
> -	 * either clear the flags bit or point the event index at the next
> -	 * entry. Always update the event index to keep code simple. */
> -	if (vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT) {
> -		vq->avail_flags_shadow &= ~VRING_AVAIL_F_NO_INTERRUPT;
> -		if (!vq->event)
> -			vq->vring.avail->flags = cpu_to_virtio16(_vq->vdev, vq->avail_flags_shadow);
> -	}
> -	/* TODO: tune this threshold */
> -	bufs = (u16)(vq->avail_idx_shadow - vq->last_used_idx) * 3 / 4;
> -
> -	virtio_store_mb(vq->weak_barriers,
> -			&vring_used_event(&vq->vring),
> -			cpu_to_virtio16(_vq->vdev, vq->last_used_idx + bufs));
> -
> -	if (unlikely((u16)(virtio16_to_cpu(_vq->vdev, vq->vring.used->idx) - vq->last_used_idx) > bufs)) {
> -		END_USE(vq);
> -		return false;
> -	}
> -
> -	END_USE(vq);
> -	return true;
> +	return vq->packed ? virtqueue_enable_cb_delayed_packed(_vq) :
> +			    virtqueue_enable_cb_delayed_split(_vq);
>   }
>   EXPORT_SYMBOL_GPL(virtqueue_enable_cb_delayed);
>   
> +static void *virtqueue_detach_unused_buf_split(struct virtqueue *_vq)
> +{
> +	struct vring_virtqueue *vq = to_vvq(_vq);
> +	unsigned int i;
> +	void *buf;
> +
> +	START_USE(vq);
> +
> +	for (i = 0; i < vq->vring.num; i++) {
> +		if (!vq->desc_state[i].data)
> +			continue;
> +		/* detach_buf clears data, so grab it now. */
> +		buf = vq->desc_state[i].data;
> +		detach_buf_split(vq, i, NULL);
> +		vq->avail_idx_shadow--;
> +		vq->vring.avail->idx = cpu_to_virtio16(_vq->vdev, vq->avail_idx_shadow);
> +		END_USE(vq);
> +		return buf;
> +	}
> +	/* That should have freed everything. */
> +	BUG_ON(vq->vq.num_free != vq->vring.num);
> +
> +	END_USE(vq);
> +	return NULL;
> +}
> +
> +static void *virtqueue_detach_unused_buf_packed(struct virtqueue *_vq)
> +{
> +	struct vring_virtqueue *vq = to_vvq(_vq);
> +	unsigned int i, num;
> +	void *buf;
> +
> +	START_USE(vq);
> +
> +	for (i = 0; i < vq->vring_packed.num; i++) {
> +		if (!vq->desc_state[i].data)
> +			continue;
> +		/* detach_buf clears data, so grab it now. */
> +		buf = vq->desc_state[i].data;
> +		num = detach_buf_packed(vq, i, NULL);
> +		if (vq->next_avail_idx < num) {
> +			vq->next_avail_idx = vq->vring_packed.num -
> +					(num - vq->next_avail_idx);
> +			vq->wrap_counter ^= 1;
> +		} else {
> +			vq->next_avail_idx -= num;
> +		}
> +		END_USE(vq);
> +		return buf;
> +	}
> +	/* That should have freed everything. */
> +	BUG_ON(vq->vq.num_free != vq->vring_packed.num);
> +
> +	END_USE(vq);
> +	return NULL;
> +}
> +
>   /**
>    * virtqueue_detach_unused_buf - detach first unused buffer
>    * @vq: the struct virtqueue we're talking about.
> @@ -910,27 +1569,9 @@ EXPORT_SYMBOL_GPL(virtqueue_enable_cb_delayed);
>   void *virtqueue_detach_unused_buf(struct virtqueue *_vq)
>   {
>   	struct vring_virtqueue *vq = to_vvq(_vq);
> -	unsigned int i;
> -	void *buf;
>   
> -	START_USE(vq);
> -
> -	for (i = 0; i < vq->vring.num; i++) {
> -		if (!vq->desc_state[i].data)
> -			continue;
> -		/* detach_buf clears data, so grab it now. */
> -		buf = vq->desc_state[i].data;
> -		detach_buf(vq, i, NULL);
> -		vq->avail_idx_shadow--;
> -		vq->vring.avail->idx = cpu_to_virtio16(_vq->vdev, vq->avail_idx_shadow);
> -		END_USE(vq);
> -		return buf;
> -	}
> -	/* That should have freed everything. */
> -	BUG_ON(vq->vq.num_free != vq->vring.num);
> -
> -	END_USE(vq);
> -	return NULL;
> +	return vq->packed ? virtqueue_detach_unused_buf_packed(_vq) :
> +			    virtqueue_detach_unused_buf_split(_vq);
>   }
>   EXPORT_SYMBOL_GPL(virtqueue_detach_unused_buf);
>   
> @@ -955,7 +1596,8 @@ irqreturn_t vring_interrupt(int irq, void *_vq)
>   EXPORT_SYMBOL_GPL(vring_interrupt);
>   
>   struct virtqueue *__vring_new_virtqueue(unsigned int index,
> -					struct vring vring,
> +					union vring_union vring,
> +					bool packed,
>   					struct virtio_device *vdev,
>   					bool weak_barriers,
>   					bool context,
> @@ -963,19 +1605,20 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index,
>   					void (*callback)(struct virtqueue *),
>   					const char *name)
>   {
> -	unsigned int i;
> +	unsigned int num, i;
>   	struct vring_virtqueue *vq;
>   
> -	vq = kmalloc(sizeof(*vq) + vring.num * sizeof(struct vring_desc_state),
> +	num = packed ? vring.vring_packed.num : vring.vring_split.num;
> +
> +	vq = kmalloc(sizeof(*vq) + num * sizeof(struct vring_desc_state),
>   		     GFP_KERNEL);
>   	if (!vq)
>   		return NULL;
>   
> -	vq->vring = vring;
>   	vq->vq.callback = callback;
>   	vq->vq.vdev = vdev;
>   	vq->vq.name = name;
> -	vq->vq.num_free = vring.num;
> +	vq->vq.num_free = num;
>   	vq->vq.index = index;
>   	vq->we_own_ring = false;
>   	vq->queue_dma_addr = 0;
> @@ -984,9 +1627,8 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index,
>   	vq->weak_barriers = weak_barriers;
>   	vq->broken = false;
>   	vq->last_used_idx = 0;
> -	vq->avail_flags_shadow = 0;
> -	vq->avail_idx_shadow = 0;
>   	vq->num_added = 0;
> +	vq->packed = packed;
>   	list_add_tail(&vq->vq.list, &vdev->vqs);
>   #ifdef DEBUG
>   	vq->in_use = false;
> @@ -997,18 +1639,37 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index,
>   		!context;
>   	vq->event = virtio_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX);
>   
> +	if (vq->packed) {
> +		vq->vring_packed = vring.vring_packed;
> +		vq->next_avail_idx = 0;
> +		vq->wrap_counter = 1;
> +		vq->event_flags_shadow = 0;
> +	} else {
> +		vq->vring = vring.vring_split;
> +		vq->avail_flags_shadow = 0;
> +		vq->avail_idx_shadow = 0;
> +
> +		/* Put everything in free lists. */
> +		vq->free_head = 0;
> +		for (i = 0; i < num-1; i++)
> +			vq->vring.desc[i].next = cpu_to_virtio16(vdev, i + 1);
> +	}
> +
>   	/* No callback?  Tell other side not to bother us. */
>   	if (!callback) {
> -		vq->avail_flags_shadow |= VRING_AVAIL_F_NO_INTERRUPT;
> -		if (!vq->event)
> -			vq->vring.avail->flags = cpu_to_virtio16(vdev, vq->avail_flags_shadow);
> +		if (packed) {
> +			vq->event_flags_shadow = VRING_EVENT_F_DISABLE;
> +			vq->vring_packed.driver->flags = cpu_to_virtio16(vdev,
> +						vq->event_flags_shadow);
> +		} else {
> +			vq->avail_flags_shadow |= VRING_AVAIL_F_NO_INTERRUPT;
> +			if (!vq->event)
> +				vq->vring.avail->flags = cpu_to_virtio16(vdev,
> +						vq->avail_flags_shadow);
> +		}
>   	}
>   
> -	/* Put everything in free lists. */
> -	vq->free_head = 0;
> -	for (i = 0; i < vring.num-1; i++)
> -		vq->vring.desc[i].next = cpu_to_virtio16(vdev, i + 1);
> -	memset(vq->desc_state, 0, vring.num * sizeof(struct vring_desc_state));
> +	memset(vq->desc_state, 0, num * sizeof(struct vring_desc_state));
>   
>   	return &vq->vq;
>   }
> @@ -1056,6 +1717,22 @@ static void vring_free_queue(struct virtio_device *vdev, size_t size,
>   	}
>   }
>   
> +static inline int
> +__vring_size(unsigned int num, unsigned long align, bool packed)
> +{
> +	return packed ? vring_packed_size(num, align) : vring_size(num, align);
> +}
> +
> +static inline void vring_packed_init(struct vring_packed *vr, unsigned int num,
> +				     void *p, unsigned long align)
> +{
> +	vr->num = num;
> +	vr->desc = p;
> +	vr->driver = (void *)(((uintptr_t)p + sizeof(struct vring_packed_desc)
> +		* num + align - 1) & ~(align - 1));
> +	vr->device = vr->driver + 1;
> +}
> +
>   struct virtqueue *vring_create_virtqueue(
>   	unsigned int index,
>   	unsigned int num,
> @@ -1072,7 +1749,8 @@ struct virtqueue *vring_create_virtqueue(
>   	void *queue = NULL;
>   	dma_addr_t dma_addr;
>   	size_t queue_size_in_bytes;
> -	struct vring vring;
> +	union vring_union vring;
> +	bool packed;
>   
>   	/* We assume num is a power of 2. */
>   	if (num & (num - 1)) {
> @@ -1080,9 +1758,13 @@ struct virtqueue *vring_create_virtqueue(
>   		return NULL;
>   	}
>   
> +	packed = virtio_has_feature(vdev, VIRTIO_F_RING_PACKED);
> +
>   	/* TODO: allocate each queue chunk individually */
> -	for (; num && vring_size(num, vring_align) > PAGE_SIZE; num /= 2) {
> -		queue = vring_alloc_queue(vdev, vring_size(num, vring_align),
> +	for (; num && __vring_size(num, vring_align, packed) > PAGE_SIZE;
> +			num /= 2) {
> +		queue = vring_alloc_queue(vdev, __vring_size(num, vring_align,
> +							     packed),
>   					  &dma_addr,
>   					  GFP_KERNEL|__GFP_NOWARN|__GFP_ZERO);
>   		if (queue)
> @@ -1094,17 +1776,21 @@ struct virtqueue *vring_create_virtqueue(
>   
>   	if (!queue) {
>   		/* Try to get a single page. You are my only hope! */
> -		queue = vring_alloc_queue(vdev, vring_size(num, vring_align),
> +		queue = vring_alloc_queue(vdev, __vring_size(num, vring_align,
> +							     packed),
>   					  &dma_addr, GFP_KERNEL|__GFP_ZERO);
>   	}
>   	if (!queue)
>   		return NULL;
>   
> -	queue_size_in_bytes = vring_size(num, vring_align);
> -	vring_init(&vring, num, queue, vring_align);
> +	queue_size_in_bytes = __vring_size(num, vring_align, packed);
> +	if (packed)
> +		vring_packed_init(&vring.vring_packed, num, queue, vring_align);
> +	else
> +		vring_init(&vring.vring_split, num, queue, vring_align);
>   
> -	vq = __vring_new_virtqueue(index, vring, vdev, weak_barriers, context,
> -				   notify, callback, name);
> +	vq = __vring_new_virtqueue(index, vring, packed, vdev, weak_barriers,
> +				   context, notify, callback, name);
>   	if (!vq) {
>   		vring_free_queue(vdev, queue_size_in_bytes, queue,
>   				 dma_addr);
> @@ -1130,10 +1816,17 @@ struct virtqueue *vring_new_virtqueue(unsigned int index,
>   				      void (*callback)(struct virtqueue *vq),
>   				      const char *name)
>   {
> -	struct vring vring;
> -	vring_init(&vring, num, pages, vring_align);
> -	return __vring_new_virtqueue(index, vring, vdev, weak_barriers, context,
> -				     notify, callback, name);
> +	union vring_union vring;
> +	bool packed;
> +
> +	packed = virtio_has_feature(vdev, VIRTIO_F_RING_PACKED);
> +	if (packed)
> +		vring_packed_init(&vring.vring_packed, num, pages, vring_align);
> +	else
> +		vring_init(&vring.vring_split, num, pages, vring_align);
> +
> +	return __vring_new_virtqueue(index, vring, packed, vdev, weak_barriers,
> +				     context, notify, callback, name);
>   }
>   EXPORT_SYMBOL_GPL(vring_new_virtqueue);
>   
> @@ -1143,7 +1836,9 @@ void vring_del_virtqueue(struct virtqueue *_vq)
>   
>   	if (vq->we_own_ring) {
>   		vring_free_queue(vq->vq.vdev, vq->queue_size_in_bytes,
> -				 vq->vring.desc, vq->queue_dma_addr);
> +				 vq->packed ? (void *)vq->vring_packed.desc :
> +					      (void *)vq->vring.desc,
> +				 vq->queue_dma_addr);
>   	}
>   	list_del(&_vq->list);
>   	kfree(vq);
> @@ -1157,14 +1852,18 @@ void vring_transport_features(struct virtio_device *vdev)
>   
>   	for (i = VIRTIO_TRANSPORT_F_START; i < VIRTIO_TRANSPORT_F_END; i++) {
>   		switch (i) {
> -		case VIRTIO_RING_F_INDIRECT_DESC:
> +#if 0
> +		case VIRTIO_RING_F_INDIRECT_DESC: // FIXME not tested yet.
>   			break;
> -		case VIRTIO_RING_F_EVENT_IDX:
> +		case VIRTIO_RING_F_EVENT_IDX: // FIXME probably not work.
>   			break;
> +#endif

It would be better if you can split EVENT_IDX and INDIRECT_DESC into 
separate patches too.

Thanks

>   		case VIRTIO_F_VERSION_1:
>   			break;
>   		case VIRTIO_F_IOMMU_PLATFORM:
>   			break;
> +		case VIRTIO_F_RING_PACKED:
> +			break;
>   		default:
>   			/* We don't understand this bit. */
>   			__virtio_clear_bit(vdev, i);
> @@ -1185,7 +1884,7 @@ unsigned int virtqueue_get_vring_size(struct virtqueue *_vq)
>   
>   	struct vring_virtqueue *vq = to_vvq(_vq);
>   
> -	return vq->vring.num;
> +	return vq->packed ? vq->vring_packed.num : vq->vring.num;
>   }
>   EXPORT_SYMBOL_GPL(virtqueue_get_vring_size);
>   
> @@ -1228,6 +1927,10 @@ dma_addr_t virtqueue_get_avail_addr(struct virtqueue *_vq)
>   
>   	BUG_ON(!vq->we_own_ring);
>   
> +	if (vq->packed)
> +		return vq->queue_dma_addr + ((char *)vq->vring_packed.driver -
> +				(char *)vq->vring_packed.desc);
> +
>   	return vq->queue_dma_addr +
>   		((char *)vq->vring.avail - (char *)vq->vring.desc);
>   }
> @@ -1239,11 +1942,16 @@ dma_addr_t virtqueue_get_used_addr(struct virtqueue *_vq)
>   
>   	BUG_ON(!vq->we_own_ring);
>   
> +	if (vq->packed)
> +		return vq->queue_dma_addr + ((char *)vq->vring_packed.device -
> +				(char *)vq->vring_packed.desc);
> +
>   	return vq->queue_dma_addr +
>   		((char *)vq->vring.used - (char *)vq->vring.desc);
>   }
>   EXPORT_SYMBOL_GPL(virtqueue_get_used_addr);
>   
> +/* Only available for split ring */
>   const struct vring *virtqueue_get_vring(struct virtqueue *vq)
>   {
>   	return &to_vvq(vq)->vring;
> diff --git a/include/linux/virtio_ring.h b/include/linux/virtio_ring.h
> index bbf32524ab27..a0075894ad16 100644
> --- a/include/linux/virtio_ring.h
> +++ b/include/linux/virtio_ring.h
> @@ -60,6 +60,11 @@ static inline void virtio_store_mb(bool weak_barriers,
>   struct virtio_device;
>   struct virtqueue;
>   
> +union vring_union {
> +	struct vring vring_split;
> +	struct vring_packed vring_packed;
> +};
> +
>   /*
>    * Creates a virtqueue and allocates the descriptor ring.  If
>    * may_reduce_num is set, then this may allocate a smaller ring than
> @@ -79,7 +84,8 @@ struct virtqueue *vring_create_virtqueue(unsigned int index,
>   
>   /* Creates a virtqueue with a custom layout. */
>   struct virtqueue *__vring_new_virtqueue(unsigned int index,
> -					struct vring vring,
> +					union vring_union vring,
> +					bool packed,
>   					struct virtio_device *vdev,
>   					bool weak_barriers,
>   					bool ctx,
> diff --git a/include/uapi/linux/virtio_config.h b/include/uapi/linux/virtio_config.h
> index 308e2096291f..a6e392325e3a 100644
> --- a/include/uapi/linux/virtio_config.h
> +++ b/include/uapi/linux/virtio_config.h
> @@ -49,7 +49,7 @@
>    * transport being used (eg. virtio_ring), the rest are per-device feature
>    * bits. */
>   #define VIRTIO_TRANSPORT_F_START	28
> -#define VIRTIO_TRANSPORT_F_END		34
> +#define VIRTIO_TRANSPORT_F_END		36
>   
>   #ifndef VIRTIO_CONFIG_NO_LEGACY
>   /* Do we get callbacks when the ring is completely used, even if we've
> @@ -71,4 +71,14 @@
>    * this is for compatibility with legacy systems.
>    */
>   #define VIRTIO_F_IOMMU_PLATFORM		33
> +
> +/* This feature indicates support for the packed virtqueue layout. */
> +#define VIRTIO_F_RING_PACKED		34
> +
> +/*
> + * This feature indicates that all buffers are used by the device
> + * in the same order in which they have been made available.
> + */
> +#define VIRTIO_F_IN_ORDER		35
> +
>   #endif /* _UAPI_LINUX_VIRTIO_CONFIG_H */
> diff --git a/include/uapi/linux/virtio_ring.h b/include/uapi/linux/virtio_ring.h
> index 6d5d5faa989b..735d4207c988 100644
> --- a/include/uapi/linux/virtio_ring.h
> +++ b/include/uapi/linux/virtio_ring.h
> @@ -44,6 +44,9 @@
>   /* This means the buffer contains a list of buffer descriptors. */
>   #define VRING_DESC_F_INDIRECT	4
>   
> +#define VRING_DESC_F_AVAIL(b)	((b) << 7)
> +#define VRING_DESC_F_USED(b)	((b) << 15)
> +
>   /* The Host uses this in used->flags to advise the Guest: don't kick me when
>    * you add a buffer.  It's unreliable, so it's simply an optimization.  Guest
>    * will still kick if it's out of buffers. */
> @@ -53,6 +56,10 @@
>    * optimization.  */
>   #define VRING_AVAIL_F_NO_INTERRUPT	1
>   
> +#define VRING_EVENT_F_ENABLE	0x0
> +#define VRING_EVENT_F_DISABLE	0x1
> +#define VRING_EVENT_F_DESC	0x2
> +
>   /* We support indirect buffer descriptors */
>   #define VIRTIO_RING_F_INDIRECT_DESC	28
>   
> @@ -171,4 +178,58 @@ static inline int vring_need_event(__u16 event_idx, __u16 new_idx, __u16 old)
>   	return (__u16)(new_idx - event_idx - 1) < (__u16)(new_idx - old);
>   }
>   
> +struct vring_packed_desc_event {
> +	/* __virtio16 off  : 15; // Descriptor Event Offset
> +	 * __virtio16 wrap : 1;  // Descriptor Event Wrap Counter */
> +	__virtio16 off_wrap;
> +	/* __virtio16 flags : 2; // Descriptor Event Flags */
> +	__virtio16 flags;
> +};
> +
> +struct vring_packed_desc {
> +	/* Buffer Address. */
> +	__virtio64 addr;
> +	/* Buffer Length. */
> +	__virtio32 len;
> +	/* Buffer ID. */
> +	__virtio16 id;
> +	/* The flags depending on descriptor type. */
> +	__virtio16 flags;
> +};
> +
> +struct vring_packed {
> +	unsigned int num;
> +
> +	struct vring_packed_desc *desc;
> +
> +	struct vring_packed_desc_event *driver;
> +
> +	struct vring_packed_desc_event *device;
> +};
> +
> +/* The standard layout for the packed ring is a continuous chunk of memory
> + * which looks like this.
> + *
> + * struct vring_packed
> + * {
> + *	// The actual descriptors (16 bytes each)
> + *	struct vring_packed_desc desc[num];
> + *
> + *	// Padding to the next align boundary.
> + *	char pad[];
> + *
> + *	// Driver Event Suppression
> + *	struct vring_packed_desc_event driver;
> + *
> + *	// Device Event Suppression
> + *	struct vring_packed_desc_event device;
> + * };
> + */
> +
> +static inline unsigned vring_packed_size(unsigned int num, unsigned long align)
> +{
> +	return ((sizeof(struct vring_packed_desc) * num + align - 1)
> +		& ~(align - 1)) + sizeof(struct vring_packed_desc_event) * 2;
> +}
> +
>   #endif /* _UAPI_LINUX_VIRTIO_RING_H */


* Re: [RFC v2] virtio: support packed ring
  2018-04-13  4:30 ` Jason Wang
@ 2018-04-13  7:15   ` Tiwei Bie
  2018-04-17  2:11     ` Jason Wang
  0 siblings, 1 reply; 28+ messages in thread
From: Tiwei Bie @ 2018-04-13  7:15 UTC (permalink / raw)
  To: Jason Wang; +Cc: mst, wexu, virtualization, linux-kernel, netdev, jfreimann

On Fri, Apr 13, 2018 at 12:30:24PM +0800, Jason Wang wrote:
> On 2018-04-01 22:12, Tiwei Bie wrote:
> > Hello everyone,
> > 
> > This RFC implements packed ring support for virtio driver.
> > 
> > The code was tested with DPDK vhost (testpmd/vhost-PMD) implemented
> > by Jens at http://dpdk.org/ml/archives/dev/2018-January/089417.html
> > Minor changes are needed for the vhost code, e.g. to kick the guest.
> > 
> > TODO:
> > - Refinements and bug fixes;
> > - Split into small patches;
> > - Test indirect descriptor support;
> > - Test/fix event suppression support;
> > - Test devices other than net;
> > 
> > RFC v1 -> RFC v2:
> > - Add indirect descriptor support - compile test only;
> > - Add event suppression support - compile test only;
> > - Move vring_packed_init() out of uapi (Jason, MST);
> > - Merge two loops into one in virtqueue_add_packed() (Jason);
> > - Split vring_unmap_one() for packed ring and split ring (Jason);
> > - Avoid using '%' operator (Jason);
> > - Rename free_head -> next_avail_idx (Jason);
> > - Add comments for virtio_wmb() in virtqueue_add_packed() (Jason);
> > - Some other refinements and bug fixes;
> > 
> > Thanks!
> > 
> > Signed-off-by: Tiwei Bie <tiwei.bie@intel.com>
> > ---
> >   drivers/virtio/virtio_ring.c       | 1094 +++++++++++++++++++++++++++++-------
> >   include/linux/virtio_ring.h        |    8 +-
> >   include/uapi/linux/virtio_config.h |   12 +-
> >   include/uapi/linux/virtio_ring.h   |   61 ++
> >   4 files changed, 980 insertions(+), 195 deletions(-)
[...]
> > +static struct vring_packed_desc *alloc_indirect_packed(struct virtqueue *_vq,
> > +						       unsigned int total_sg,
> > +						       gfp_t gfp)
> > +{
> > +	struct vring_packed_desc *desc;
> > +
> > +	/*
> > +	 * We require lowmem mappings for the descriptors because
> > +	 * otherwise virt_to_phys will give us bogus addresses in the
> > +	 * virtqueue.
> > +	 */
> > +	gfp &= ~__GFP_HIGHMEM;
> > +
> > +	desc = kmalloc(total_sg * sizeof(struct vring_packed_desc), gfp);
> 
> Can we simply check vq->packed here to avoid duplicating helpers?

Then it would be something like this:

static void *alloc_indirect(struct virtqueue *_vq, unsigned int total_sg,
			    gfp_t gfp)
{
	struct vring_virtqueue *vq = to_vvq(_vq);
	void *data;

	/*
	 * We require lowmem mappings for the descriptors because
	 * otherwise virt_to_phys will give us bogus addresses in the
	 * virtqueue.
	 */
	gfp &= ~__GFP_HIGHMEM;

	if (vq->packed) {
		data = kmalloc(total_sg * sizeof(struct vring_packed_desc),
				gfp);
		if (!data)
			return NULL;
	} else {
		struct vring_desc *desc;
		unsigned int i;

		desc = kmalloc(total_sg * sizeof(struct vring_desc), gfp);
		if (!desc)
			return NULL;

		for (i = 0; i < total_sg; i++)
			desc[i].next = cpu_to_virtio16(_vq->vdev, i + 1);

		data = desc;
	}

	return data;
}

I would prefer to have two simpler helpers (for the callers, it's
already very clear which one they should call), i.e. keep the
current implementation:

static struct vring_packed_desc *alloc_indirect_packed(struct virtqueue *_vq,
						       unsigned int total_sg,
						       gfp_t gfp)
{
	struct vring_packed_desc *desc;

	/*
	 * We require lowmem mappings for the descriptors because
	 * otherwise virt_to_phys will give us bogus addresses in the
	 * virtqueue.
	 */
	gfp &= ~__GFP_HIGHMEM;

	desc = kmalloc(total_sg * sizeof(struct vring_packed_desc), gfp);

	return desc;
}

static struct vring_desc *alloc_indirect_split(struct virtqueue *_vq,
					       unsigned int total_sg,
					       gfp_t gfp)
{
	struct vring_desc *desc;
	unsigned int i;

	/*
	 * We require lowmem mappings for the descriptors because
	 * otherwise virt_to_phys will give us bogus addresses in the
	 * virtqueue.
	 */
	gfp &= ~__GFP_HIGHMEM;

	desc = kmalloc(total_sg * sizeof(struct vring_desc), gfp);
	if (!desc)
		return NULL;

	for (i = 0; i < total_sg; i++)
		desc[i].next = cpu_to_virtio16(_vq->vdev, i + 1);
	return desc;
}

> 
> > +
> > +	return desc;
> > +}
[...]
> > +static inline int virtqueue_add_packed(struct virtqueue *_vq,
> > +				       struct scatterlist *sgs[],
> > +				       unsigned int total_sg,
> > +				       unsigned int out_sgs,
> > +				       unsigned int in_sgs,
> > +				       void *data,
> > +				       void *ctx,
> > +				       gfp_t gfp)
> > +{
> > +	struct vring_virtqueue *vq = to_vvq(_vq);
> > +	struct vring_packed_desc *desc;
> > +	struct scatterlist *sg;
> > +	unsigned int i, n, descs_used, uninitialized_var(prev), err_idx;
> > +	__virtio16 uninitialized_var(head_flags), flags;
> > +	int head, wrap_counter;
> > +	bool indirect;
> > +
> > +	START_USE(vq);
> > +
> > +	BUG_ON(data == NULL);
> > +	BUG_ON(ctx && vq->indirect);
> > +
> > +	if (unlikely(vq->broken)) {
> > +		END_USE(vq);
> > +		return -EIO;
> > +	}
> > +
> > +#ifdef DEBUG
> > +	{
> > +		ktime_t now = ktime_get();
> > +
> > +		/* No kick or get, with .1 second between?  Warn. */
> > +		if (vq->last_add_time_valid)
> > +			WARN_ON(ktime_to_ms(ktime_sub(now, vq->last_add_time))
> > +					    > 100);
> > +		vq->last_add_time = now;
> > +		vq->last_add_time_valid = true;
> > +	}
> > +#endif
> > +
> > +	BUG_ON(total_sg == 0);
> > +
> > +	head = vq->next_avail_idx;
> > +	wrap_counter = vq->wrap_counter;
> > +
> > +	/* If the host supports indirect descriptor tables, and we have multiple
> > +	 * buffers, then go indirect. FIXME: tune this threshold */
> > +	if (vq->indirect && total_sg > 1 && vq->vq.num_free)
> 
> Let's introduce a helper like virtqueue_need_indirect() to avoid duplicating
> the code and the FIXME.

Okay.

> 
> > +		desc = alloc_indirect_packed(_vq, total_sg, gfp);
> > +	else {
> > +		desc = NULL;
> > +		WARN_ON_ONCE(total_sg > vq->vring_packed.num && !vq->indirect);
> > +	}
> > +
> > +	if (desc) {
> > +		/* Use a single buffer which doesn't continue */
> > +		indirect = true;
> > +		/* Set up rest to use this indirect table. */
> > +		i = 0;
> > +		descs_used = 1;
> > +	} else {
> > +		indirect = false;
> > +		desc = vq->vring_packed.desc;
> > +		i = head;
> > +		descs_used = total_sg;
> > +	}
> > +
> > +	if (vq->vq.num_free < descs_used) {
> > +		pr_debug("Can't add buf len %i - avail = %i\n",
> > +			 descs_used, vq->vq.num_free);
> > +		/* FIXME: for historical reasons, we force a notify here if
> > +		 * there are outgoing parts to the buffer.  Presumably the
> > +		 * host should service the ring ASAP. */
> > +		if (out_sgs)
> > +			vq->notify(&vq->vq);
> > +		if (indirect)
> > +			kfree(desc);
> > +		END_USE(vq);
> > +		return -ENOSPC;
> > +	}
> > +
> > +	for (n = 0; n < out_sgs + in_sgs; n++) {
> > +		for (sg = sgs[n]; sg; sg = sg_next(sg)) {
> > +			dma_addr_t addr = vring_map_one_sg(vq, sg, n < out_sgs ?
> > +						DMA_TO_DEVICE : DMA_FROM_DEVICE);
> > +			if (vring_mapping_error(vq, addr))
> > +				goto unmap_release;
> > +
> > +			flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_NEXT |
> > +					(n < out_sgs ? 0 : VRING_DESC_F_WRITE) |
> > +					VRING_DESC_F_AVAIL(vq->wrap_counter) |
> > +					VRING_DESC_F_USED(!vq->wrap_counter));
> > +			if (!indirect && i == head)
> > +				head_flags = flags;
> > +			else
> > +				desc[i].flags = flags;
> > +
> > +			desc[i].addr = cpu_to_virtio64(_vq->vdev, addr);
> > +			desc[i].len = cpu_to_virtio32(_vq->vdev, sg->length);
> > +			desc[i].id = cpu_to_virtio32(_vq->vdev, head);
> 
> Similar to V1, we only need this for the last descriptor.

Okay, will just set it for the last desc.

> 
> > +			prev = i;
> 
> It looks to me there's no need to track prev inside the loop here.
> 
> > +			i++;
> > +			if (!indirect && i >= vq->vring_packed.num) {
> > +				i = 0;
> > +				vq->wrap_counter ^= 1;
> > +			}
> > +		}
> > +	}
> > +	/* Last one doesn't continue. */
> > +	if (total_sg == 1)
> > +		head_flags &= cpu_to_virtio16(_vq->vdev, ~VRING_DESC_F_NEXT);
> > +	else
> > +		desc[prev].flags &= cpu_to_virtio16(_vq->vdev, ~VRING_DESC_F_NEXT);
> 
> The only case where prev != i - 1 is i == 0, so we can add an if here.

It's just a mirror of the existing implementation in the split ring.
It seems the split ring implementation needs this just because it's
much harder for it to find prev there, which is not true for the
packed ring. So I'll take your suggestion. Thanks!

> 
[...]
> > +static bool virtqueue_kick_prepare_packed(struct virtqueue *_vq)
> > +{
> > +	struct vring_virtqueue *vq = to_vvq(_vq);
> > +	u16 new, old, off_wrap;
> > +	bool needs_kick;
> > +
> > +	START_USE(vq);
> > +	/* We need to expose the new flags value before checking notification
> > +	 * suppressions. */
> > +	virtio_mb(vq->weak_barriers);
> > +
> > +	old = vq->next_avail_idx - vq->num_added;
> > +	new = vq->next_avail_idx;
> > +	vq->num_added = 0;
> > +
> > +#ifdef DEBUG
> > +	if (vq->last_add_time_valid) {
> > +		WARN_ON(ktime_to_ms(ktime_sub(ktime_get(),
> > +					      vq->last_add_time)) > 100);
> > +	}
> > +	vq->last_add_time_valid = false;
> > +#endif
> > +
> > +	off_wrap = virtio16_to_cpu(_vq->vdev, vq->vring_packed.device->off_wrap);
> > +
> > +	if (vq->event) {
> 
> It looks to me we should examine RING_EVENT_FLAGS_DESC in desc_event_flags
> instead of vq->event here. The spec does not force the driver to use
> event_off and event_wrap if event index is enabled.
> 
> > +		// FIXME: fix this!
> > +		needs_kick = ((off_wrap >> 15) == vq->wrap_counter) &&
> > +			     vring_need_event(off_wrap & ~(1<<15), new, old);
> 
> Why is the & needed here?

Because wrap_counter (the most significant bit in off_wrap)
isn't part of the index.
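
I.e. the encoding is (following the vring_packed_desc_event layout
in the patch):

	wrap = off_wrap >> 15;         /* bit 15: wrap counter */
	off  = off_wrap & ~(1 << 15);  /* bits 0-14: event offset */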

> 
> > +	} else {
> 
> Need a smp_rmb() to make sure desc_event_flags was checked before flags.

I don't get your point. If my understanding is correct,
desc_event_flags is vq->vring_packed.device->flags. So
which "flags" do you mean?

> 
> > +		needs_kick = (vq->vring_packed.device->flags !=
> > +			      cpu_to_virtio16(_vq->vdev, VRING_EVENT_F_DISABLE));
> > +	}
> > +	END_USE(vq);
> > +	return needs_kick;
> > +}
[...]
> > +static int detach_buf_packed(struct vring_virtqueue *vq, unsigned int head,
> > +			      void **ctx)
> > +{
> > +	struct vring_packed_desc *desc;
> > +	unsigned int i, j;
> > +
> > +	/* Clear data ptr. */
> > +	vq->desc_state[head].data = NULL;
> > +
> > +	i = head;
> > +
> > +	for (j = 0; j < vq->desc_state[head].num; j++) {
> > +		desc = &vq->vring_packed.desc[i];
> > +		vring_unmap_one_packed(vq, desc);
> > +		desc->flags = 0x0;
> 
> Looks like this is unnecessary.

It's safer to zero it. If we don't zero it, then after we
call virtqueue_detach_unused_buf_packed() (which calls this
function), the desc would still look available to the device.

> 
> > +		i++;
> > +		if (i >= vq->vring_packed.num)
> > +			i = 0;
> > +	}
[...]
> > +static unsigned virtqueue_enable_cb_prepare_packed(struct virtqueue *_vq)
> > +{
> > +	struct vring_virtqueue *vq = to_vvq(_vq);
> > +	u16 last_used_idx, wrap_counter, off_wrap;
> > +
> > +	START_USE(vq);
> > +
> > +	last_used_idx = vq->last_used_idx;
> > +	wrap_counter = vq->wrap_counter;
> > +
> > +	if (last_used_idx > vq->next_avail_idx)
> > +		wrap_counter ^= 1;
> > +
> > +	off_wrap = last_used_idx | (wrap_counter << 15);
> > +
> > +	/* We optimistically turn back on interrupts, then check if there was
> > +	 * more to do. */
> > +	/* Depending on the VIRTIO_RING_F_EVENT_IDX feature, we need to
> > +	 * either clear the flags bit or point the event index at the next
> > +	 * entry. Always do both to keep code simple. */
> > +	if (vq->event_flags_shadow == VRING_EVENT_F_DISABLE) {
> > +		vq->event_flags_shadow = vq->event ? VRING_EVENT_F_DESC:
> > +						     VRING_EVENT_F_ENABLE;
> > +		vq->vring_packed.driver->flags = cpu_to_virtio16(_vq->vdev,
> > +							vq->event_flags_shadow);
> > +	}
> 
> Is an smp_wmb() missing here?
> 
> > +	vq->vring_packed.driver->off_wrap = cpu_to_virtio16(_vq->vdev, off_wrap);
> 
> And according to the spec, it looks to me that writing VRING_EVENT_F_ENABLE
> is sufficient here.

I didn't think about this much when implementing the event
suppression for the packed ring previously. After seeing your
comments, I noticed something new. Indeed, unlike the split ring,
for the packed ring the spec doesn't say we must use
VRING_EVENT_F_DESC when EVENT_IDX is negotiated. So do you think
the idea below is right or makes sense (a rough sketch follows
the two points)?

- For virtqueue_enable_cb_prepare(), we just need to enable
  the ring by setting flags to VRING_EVENT_F_ENABLE in any
  case.

- We will try to use VRING_EVENT_F_DESC (if EVENT_IDX is
  negotiated) only when we want to delay the interrupts
  virtqueue_enable_cb_delayed().
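
Roughly (just a sketch to illustrate the idea, untested):

	/* In virtqueue_enable_cb_prepare_packed(): plain enable. */
	vq->event_flags_shadow = VRING_EVENT_F_ENABLE;
	vq->vring_packed.driver->flags = cpu_to_virtio16(_vq->vdev,
						vq->event_flags_shadow);

	/* In virtqueue_enable_cb_delayed_packed(): delay via the
	 * event index only when EVENT_IDX was negotiated. */
	vq->event_flags_shadow = vq->event ? VRING_EVENT_F_DESC :
					     VRING_EVENT_F_ENABLE;
	vq->vring_packed.driver->flags = cpu_to_virtio16(_vq->vdev,
						vq->event_flags_shadow);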

> 
> > +	END_USE(vq);
> > +	return last_used_idx;
> > +}
> > +
[...]
> > @@ -1157,14 +1852,18 @@ void vring_transport_features(struct virtio_device *vdev)
> >   	for (i = VIRTIO_TRANSPORT_F_START; i < VIRTIO_TRANSPORT_F_END; i++) {
> >   		switch (i) {
> > -		case VIRTIO_RING_F_INDIRECT_DESC:
> > +#if 0
> > +		case VIRTIO_RING_F_INDIRECT_DESC: // FIXME not tested yet.
> >   			break;
> > -		case VIRTIO_RING_F_EVENT_IDX:
> > +		case VIRTIO_RING_F_EVENT_IDX: // FIXME probably not work.
> >   			break;
> > +#endif
> 
> It would be better if you can split EVENT_IDX and INDIRECT_DESC into
> separate patches too.

Sure. Will do it in the next version.

Thanks for the review!

> 
> Thanks
> 


* Re: [RFC v2] virtio: support packed ring
  2018-04-01 14:12 [RFC v2] virtio: support packed ring Tiwei Bie
  2018-04-10  2:55 ` Jason Wang
  2018-04-13  4:30 ` Jason Wang
@ 2018-04-13 15:22 ` Michael S. Tsirkin
  2018-04-14 11:22   ` Tiwei Bie
  2018-04-23  5:42 ` Jason Wang
  3 siblings, 1 reply; 28+ messages in thread
From: Michael S. Tsirkin @ 2018-04-13 15:22 UTC (permalink / raw)
  To: Tiwei Bie; +Cc: jasowang, wexu, virtualization, linux-kernel, netdev, jfreimann

On Sun, Apr 01, 2018 at 10:12:16PM +0800, Tiwei Bie wrote:
> +static inline bool more_used(const struct vring_virtqueue *vq)
> +{
> +	return vq->packed ? more_used_packed(vq) : more_used_split(vq);
> +}
> +
> +void *virtqueue_get_buf_ctx_split(struct virtqueue *_vq, unsigned int *len,
> +				  void **ctx)
> +{
> +	struct vring_virtqueue *vq = to_vvq(_vq);
> +	void *ret;
> +	unsigned int i;
> +	u16 last_used;
> +
> +	START_USE(vq);
> +
> +	if (unlikely(vq->broken)) {
> +		END_USE(vq);
> +		return NULL;
> +	}
> +
> +	if (!more_used(vq)) {
> +		pr_debug("No more buffers in queue\n");
> +		END_USE(vq);
> +		return NULL;
> +	}

So virtqueue_get_buf_ctx_split should only call more_used_split.

To avoid such issues, I think we should lay out the code like this:

XXX_split

XXX_packed

XXX wrappers

> +/* The standard layout

I'd drop standard here.

> for the packed ring is a continuous chunk of memory
> + * which looks like this.
> + *
> + * struct vring_packed
> + * {

Can the opening bracket go on the prev line pls?

> + *	// The actual descriptors (16 bytes each)
> + *	struct vring_packed_desc desc[num];
> + *
> + *	// Padding to the next align boundary.
> + *	char pad[];
> + *
> + *	// Driver Event Suppression
> + *	struct vring_packed_desc_event driver;
> + *
> + *	// Device Event Suppression
> + *	struct vring_packed_desc_event device;

Maybe that's how our driver does it, but it's not based on the spec,
so I don't think this belongs in the header.

> + * };
> + */
> +
> +static inline unsigned vring_packed_size(unsigned int num, unsigned long align)
> +{
> +	return ((sizeof(struct vring_packed_desc) * num + align - 1)
> +		& ~(align - 1)) + sizeof(struct vring_packed_desc_event) * 2;
> +}
> +

Can't say this API makes sense to me.


>  #endif /* _UAPI_LINUX_VIRTIO_RING_H */
> -- 
> 2.11.0


* Re: [RFC v2] virtio: support packed ring
  2018-04-13 15:22 ` Michael S. Tsirkin
@ 2018-04-14 11:22   ` Tiwei Bie
  0 siblings, 0 replies; 28+ messages in thread
From: Tiwei Bie @ 2018-04-14 11:22 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: jasowang, wexu, virtualization, linux-kernel, netdev, jfreimann

On Fri, Apr 13, 2018 at 06:22:45PM +0300, Michael S. Tsirkin wrote:
> On Sun, Apr 01, 2018 at 10:12:16PM +0800, Tiwei Bie wrote:
> > +static inline bool more_used(const struct vring_virtqueue *vq)
> > +{
> > +	return vq->packed ? more_used_packed(vq) : more_used_split(vq);
> > +}
> > +
> > +void *virtqueue_get_buf_ctx_split(struct virtqueue *_vq, unsigned int *len,
> > +				  void **ctx)
> > +{
> > +	struct vring_virtqueue *vq = to_vvq(_vq);
> > +	void *ret;
> > +	unsigned int i;
> > +	u16 last_used;
> > +
> > +	START_USE(vq);
> > +
> > +	if (unlikely(vq->broken)) {
> > +		END_USE(vq);
> > +		return NULL;
> > +	}
> > +
> > +	if (!more_used(vq)) {
> > +		pr_debug("No more buffers in queue\n");
> > +		END_USE(vq);
> > +		return NULL;
> > +	}
> 
> So virtqueue_get_buf_ctx_split should only call more_used_split.

Yeah, you're right! Will fix this in the next version.

> 
> To avoid such issues, I think we should lay out the code like this:
> 
> XXX_split
> 
> XXX_packed
> 
> XXX wrappers

I'll do it. Thanks for the suggestion!

> 
> > +/* The standard layout
> 
> I'd drop standard here.

Got it. I'll drop the word "standard".

> 
> > for the packed ring is a continuous chunk of memory
> > + * which looks like this.
> > + *
> > + * struct vring_packed
> > + * {
> 
> Can the opening bracket go on the prev line pls?

Sure.

> 
> > + *	// The actual descriptors (16 bytes each)
> > + *	struct vring_packed_desc desc[num];
> > + *
> > + *	// Padding to the next align boundary.
> > + *	char pad[];
> > + *
> > + *	// Driver Event Suppression
> > + *	struct vring_packed_desc_event driver;
> > + *
> > + *	// Device Event Suppression
> > + *	struct vring_packed_desc_event device;
> 
> Maybe that's how our driver does it, but it's not based on the spec,
> so I don't think this belongs in the header.

I will move it to the place where vring_packed_init()
is defined.

> 
> > + * };
> > + */
> > +
> > +static inline unsigned vring_packed_size(unsigned int num, unsigned long align)
> > +{
> > +	return ((sizeof(struct vring_packed_desc) * num + align - 1)
> > +		& ~(align - 1)) + sizeof(struct vring_packed_desc_event) * 2;
> > +}
> > +
> 
> Can't say this API makes sense to me.

Hmm, do you have any suggestions? Should I also move it out of this header?

Thanks for the review! :)

Best regards,
Tiwei Bie

> 
> 
> >  #endif /* _UAPI_LINUX_VIRTIO_RING_H */
> > -- 
> > 2.11.0


* Re: [RFC v2] virtio: support packed ring
  2018-04-13  7:15   ` Tiwei Bie
@ 2018-04-17  2:11     ` Jason Wang
  2018-04-17  2:17       ` Michael S. Tsirkin
  2018-04-17  2:51       ` Tiwei Bie
  0 siblings, 2 replies; 28+ messages in thread
From: Jason Wang @ 2018-04-17  2:11 UTC (permalink / raw)
  To: Tiwei Bie; +Cc: mst, wexu, virtualization, linux-kernel, netdev, jfreimann



On 2018-04-13 15:15, Tiwei Bie wrote:
> On Fri, Apr 13, 2018 at 12:30:24PM +0800, Jason Wang wrote:
>> On 2018-04-01 22:12, Tiwei Bie wrote:
>>> [...]
> [...]
>>> +static struct vring_packed_desc *alloc_indirect_packed(struct virtqueue *_vq,
>>> +						       unsigned int total_sg,
>>> +						       gfp_t gfp)
>>> +{
>>> +	struct vring_packed_desc *desc;
>>> +
>>> +	/*
>>> +	 * We require lowmem mappings for the descriptors because
>>> +	 * otherwise virt_to_phys will give us bogus addresses in the
>>> +	 * virtqueue.
>>> +	 */
>>> +	gfp &= ~__GFP_HIGHMEM;
>>> +
>>> +	desc = kmalloc(total_sg * sizeof(struct vring_packed_desc), gfp);
>> Can we simply check vq->packed here to avoid duplicating helpers?
> Then it would be something like this:
>
> static void *alloc_indirect(struct virtqueue *_vq, unsigned int total_sg,
> 			    gfp_t gfp)
> {
> 	struct vring_virtqueue *vq = to_vvq(_vq);
> 	void *data;
>
> 	/*
> 	 * We require lowmem mappings for the descriptors because
> 	 * otherwise virt_to_phys will give us bogus addresses in the
> 	 * virtqueue.
> 	 */
> 	gfp &= ~__GFP_HIGHMEM;
>
> 	if (vq->packed) {
> 		data = kmalloc(total_sg * sizeof(struct vring_packed_desc),
> 				gfp);
> 		if (!data)
> 			return NULL;
> 	} else {
> 		struct vring_desc *desc;
> 		unsigned int i;
>
> 		desc = kmalloc(total_sg * sizeof(struct vring_desc), gfp);
> 		if (!desc)
> 			return NULL;
>
> 		for (i = 0; i < total_sg; i++)
> 			desc[i].next = cpu_to_virtio16(_vq->vdev, i + 1);
>
> 		data = desc;
> 	}
>
> 	return data;
> }
>
> I would prefer to have two simpler helpers (and to the callers,
> it's already very clear which one they should call), i.e.
> the current implementation:
>
> static struct vring_packed_desc *alloc_indirect_packed(struct virtqueue *_vq,
> 						       unsigned int total_sg,
> 						       gfp_t gfp)
> {
> 	struct vring_packed_desc *desc;
>
> 	/*
> 	 * We require lowmem mappings for the descriptors because
> 	 * otherwise virt_to_phys will give us bogus addresses in the
> 	 * virtqueue.
> 	 */
> 	gfp &= ~__GFP_HIGHMEM;
>
> 	desc = kmalloc(total_sg * sizeof(struct vring_packed_desc), gfp);
>
> 	return desc;
> }
>
> static struct vring_desc *alloc_indirect_split(struct virtqueue *_vq,
> 					       unsigned int total_sg,
> 					       gfp_t gfp)
> {
> 	struct vring_desc *desc;
> 	unsigned int i;
>
> 	/*
> 	 * We require lowmem mappings for the descriptors because
> 	 * otherwise virt_to_phys will give us bogus addresses in the
> 	 * virtqueue.
> 	 */
> 	gfp &= ~__GFP_HIGHMEM;
>
> 	desc = kmalloc(total_sg * sizeof(struct vring_desc), gfp);
> 	if (!desc)
> 		return NULL;
>
> 	for (i = 0; i < total_sg; i++)
> 		desc[i].next = cpu_to_virtio16(_vq->vdev, i + 1);
> 	return desc;
> }

Yeah, I missed that the split version needs a desc list.

>
>>> +
>>> +	return desc;
>>> +}
> [...]
>>> +static inline int virtqueue_add_packed(struct virtqueue *_vq,
>>> +				       struct scatterlist *sgs[],
>>> +				       unsigned int total_sg,
>>> +				       unsigned int out_sgs,
>>> +				       unsigned int in_sgs,
>>> +				       void *data,
>>> +				       void *ctx,
>>> +				       gfp_t gfp)
>>> +{
>>> +	struct vring_virtqueue *vq = to_vvq(_vq);
>>> +	struct vring_packed_desc *desc;
>>> +	struct scatterlist *sg;
>>> +	unsigned int i, n, descs_used, uninitialized_var(prev), err_idx;
>>> +	__virtio16 uninitialized_var(head_flags), flags;
>>> +	int head, wrap_counter;
>>> +	bool indirect;
>>> +
>>> +	START_USE(vq);
>>> +
>>> +	BUG_ON(data == NULL);
>>> +	BUG_ON(ctx && vq->indirect);
>>> +
>>> +	if (unlikely(vq->broken)) {
>>> +		END_USE(vq);
>>> +		return -EIO;
>>> +	}
>>> +
>>> +#ifdef DEBUG
>>> +	{
>>> +		ktime_t now = ktime_get();
>>> +
>>> +		/* No kick or get, with .1 second between?  Warn. */
>>> +		if (vq->last_add_time_valid)
>>> +			WARN_ON(ktime_to_ms(ktime_sub(now, vq->last_add_time))
>>> +					    > 100);
>>> +		vq->last_add_time = now;
>>> +		vq->last_add_time_valid = true;
>>> +	}
>>> +#endif
>>> +
>>> +	BUG_ON(total_sg == 0);
>>> +
>>> +	head = vq->next_avail_idx;
>>> +	wrap_counter = vq->wrap_counter;
>>> +
>>> +	/* If the host supports indirect descriptor tables, and we have multiple
>>> +	 * buffers, then go indirect. FIXME: tune this threshold */
>>> +	if (vq->indirect && total_sg > 1 && vq->vq.num_free)
>> Let's introduce a helper like virtqueue_need_indirect() to avoid duplicating
>> the code and the FIXME.
> Okay.
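
A sketch of such a helper (the name comes from the comment above; the
condition is taken verbatim from the patch):

static inline bool virtqueue_need_indirect(struct vring_virtqueue *vq,
					   unsigned int total_sg)
{
	/* FIXME: tune this threshold */
	return vq->indirect && total_sg > 1 && vq->vq.num_free;
}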
>
>>> +		desc = alloc_indirect_packed(_vq, total_sg, gfp);
>>> +	else {
>>> +		desc = NULL;
>>> +		WARN_ON_ONCE(total_sg > vq->vring_packed.num && !vq->indirect);
>>> +	}
>>> +
>>> +	if (desc) {
>>> +		/* Use a single buffer which doesn't continue */
>>> +		indirect = true;
>>> +		/* Set up rest to use this indirect table. */
>>> +		i = 0;
>>> +		descs_used = 1;
>>> +	} else {
>>> +		indirect = false;
>>> +		desc = vq->vring_packed.desc;
>>> +		i = head;
>>> +		descs_used = total_sg;
>>> +	}
>>> +
>>> +	if (vq->vq.num_free < descs_used) {
>>> +		pr_debug("Can't add buf len %i - avail = %i\n",
>>> +			 descs_used, vq->vq.num_free);
>>> +		/* FIXME: for historical reasons, we force a notify here if
>>> +		 * there are outgoing parts to the buffer.  Presumably the
>>> +		 * host should service the ring ASAP. */
>>> +		if (out_sgs)
>>> +			vq->notify(&vq->vq);
>>> +		if (indirect)
>>> +			kfree(desc);
>>> +		END_USE(vq);
>>> +		return -ENOSPC;
>>> +	}
>>> +
>>> +	for (n = 0; n < out_sgs + in_sgs; n++) {
>>> +		for (sg = sgs[n]; sg; sg = sg_next(sg)) {
>>> +			dma_addr_t addr = vring_map_one_sg(vq, sg, n < out_sgs ?
>>> +						DMA_TO_DEVICE : DMA_FROM_DEVICE);
>>> +			if (vring_mapping_error(vq, addr))
>>> +				goto unmap_release;
>>> +
>>> +			flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_NEXT |
>>> +					(n < out_sgs ? 0 : VRING_DESC_F_WRITE) |
>>> +					VRING_DESC_F_AVAIL(vq->wrap_counter) |
>>> +					VRING_DESC_F_USED(!vq->wrap_counter));
>>> +			if (!indirect && i == head)
>>> +				head_flags = flags;
>>> +			else
>>> +				desc[i].flags = flags;
>>> +
>>> +			desc[i].addr = cpu_to_virtio64(_vq->vdev, addr);
>>> +			desc[i].len = cpu_to_virtio32(_vq->vdev, sg->length);
>>> +			desc[i].id = cpu_to_virtio32(_vq->vdev, head);
>> Similar to V1, we only need this for the last descriptor.
> Okay, will just set it for the last desc.
>
>>> +			prev = i;
>> It looks to me there's no need to track prev inside the loop here.
>>
>>> +			i++;
>>> +			if (!indirect && i >= vq->vring_packed.num) {
>>> +				i = 0;
>>> +				vq->wrap_counter ^= 1;
>>> +			}
>>> +		}
>>> +	}
>>> +	/* Last one doesn't continue. */
>>> +	if (total_sg == 1)
>>> +		head_flags &= cpu_to_virtio16(_vq->vdev, ~VRING_DESC_F_NEXT);
>>> +	else
>>> +		desc[prev].flags &= cpu_to_virtio16(_vq->vdev, ~VRING_DESC_F_NEXT);
>> The only case when prev != i - 1 is i == 0, so we can add an if here.
> It's just a mirror of the existing implementation in split ring.
> It seems that split ring implementation needs this just because
> it's much harder for it to find the prev, which is not true for
> packed ring. So I'll take your suggestion. Thanks!
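
A sketch of that simplification for the non-indirect path (helper name
assumed); since the packed ring only wraps back to index 0, prev can be
derived after the loop instead of being tracked inside it:

static inline unsigned int packed_prev_idx(unsigned int i,
					   unsigned int num)
{
	return i ? i - 1 : num - 1;
}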
>
> [...]
>>> +static bool virtqueue_kick_prepare_packed(struct virtqueue *_vq)
>>> +{
>>> +	struct vring_virtqueue *vq = to_vvq(_vq);
>>> +	u16 new, old, off_wrap;
>>> +	bool needs_kick;
>>> +
>>> +	START_USE(vq);
>>> +	/* We need to expose the new flags value before checking notification
>>> +	 * suppressions. */
>>> +	virtio_mb(vq->weak_barriers);
>>> +
>>> +	old = vq->next_avail_idx - vq->num_added;
>>> +	new = vq->next_avail_idx;
>>> +	vq->num_added = 0;
>>> +
>>> +#ifdef DEBUG
>>> +	if (vq->last_add_time_valid) {
>>> +		WARN_ON(ktime_to_ms(ktime_sub(ktime_get(),
>>> +					      vq->last_add_time)) > 100);
>>> +	}
>>> +	vq->last_add_time_valid = false;
>>> +#endif
>>> +
>>> +	off_wrap = virtio16_to_cpu(_vq->vdev, vq->vring_packed.device->off_wrap);
>>> +
>>> +	if (vq->event) {
>> It looks to me we should examine RING_EVENT_FLAGS_DESC in desc_event_flags
>> instead of vq->event here. The spec does not force the use of event_off and
>> event_wrap if event index is enabled.
>>
>>> +		// FIXME: fix this!
>>> +		needs_kick = ((off_wrap >> 15) == vq->wrap_counter) &&
>>> +			     vring_need_event(off_wrap & ~(1<<15), new, old);
>> Why do we need the & here?
> Because wrap_counter (the most significant bit in off_wrap)
> isn't part of the index.
>
>>> +	} else {
>> Need a smp_rmb() to make sure desc_event_flags was checked before flags.
> I don't get your point. If my understanding is correct,
> desc_event_flags is vq->vring_packed.device->flags. So
> what's the "flags"?

Sorry, I mean we need to check device.flags before off_wrap. So it needs an 
smp_rmb() in the middle. It looks to me there's no guarantee that 
VRING_EVENT_F_DESC is set if event index is supported.
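
Something like this sketch, with field names assumed from the RFC, is
the ordering being asked for: load device->flags first, then off_wrap,
with a read barrier in between so the two loads cannot be reordered:

static u16 read_device_off_wrap(struct vring_virtqueue *vq, u16 *flags)
{
	*flags = virtio16_to_cpu(vq->vq.vdev,
				 READ_ONCE(vq->vring_packed.device->flags));
	smp_rmb();	/* read flags before off_wrap */
	return virtio16_to_cpu(vq->vq.vdev,
			       READ_ONCE(vq->vring_packed.device->off_wrap));
}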

>
>>> +		needs_kick = (vq->vring_packed.device->flags !=
>>> +			      cpu_to_virtio16(_vq->vdev, VRING_EVENT_F_DISABLE));
>>> +	}
>>> +	END_USE(vq);
>>> +	return needs_kick;
>>> +}
> [...]
>>> +static int detach_buf_packed(struct vring_virtqueue *vq, unsigned int head,
>>> +			      void **ctx)
>>> +{
>>> +	struct vring_packed_desc *desc;
>>> +	unsigned int i, j;
>>> +
>>> +	/* Clear data ptr. */
>>> +	vq->desc_state[head].data = NULL;
>>> +
>>> +	i = head;
>>> +
>>> +	for (j = 0; j < vq->desc_state[head].num; j++) {
>>> +		desc = &vq->vring_packed.desc[i];
>>> +		vring_unmap_one_packed(vq, desc);
>>> +		desc->flags = 0x0;
>> Looks like this is unnecessary.
> It's safer to zero it. If we don't zero it, after we
> call virtqueue_detach_unused_buf_packed() which calls
> this function, the desc is still available to the
> device.

Well, detach_unused_buf_packed() should be called after the device is 
stopped; otherwise, even if you try to clear it, there will still be a 
window where the device may use it.

>
>>> +		i++;
>>> +		if (i >= vq->vring_packed.num)
>>> +			i = 0;
>>> +	}
> [...]
>>> +static unsigned virtqueue_enable_cb_prepare_packed(struct virtqueue *_vq)
>>> +{
>>> +	struct vring_virtqueue *vq = to_vvq(_vq);
>>> +	u16 last_used_idx, wrap_counter, off_wrap;
>>> +
>>> +	START_USE(vq);
>>> +
>>> +	last_used_idx = vq->last_used_idx;
>>> +	wrap_counter = vq->wrap_counter;
>>> +
>>> +	if (last_used_idx > vq->next_avail_idx)
>>> +		wrap_counter ^= 1;
>>> +
>>> +	off_wrap = last_used_idx | (wrap_counter << 15);
>>> +
>>> +	/* We optimistically turn back on interrupts, then check if there was
>>> +	 * more to do. */
>>> +	/* Depending on the VIRTIO_RING_F_EVENT_IDX feature, we need to
>>> +	 * either clear the flags bit or point the event index at the next
>>> +	 * entry. Always do both to keep code simple. */
>>> +	if (vq->event_flags_shadow == VRING_EVENT_F_DISABLE) {
>>> +		vq->event_flags_shadow = vq->event ? VRING_EVENT_F_DESC:
>>> +						     VRING_EVENT_F_ENABLE;
>>> +		vq->vring_packed.driver->flags = cpu_to_virtio16(_vq->vdev,
>>> +							vq->event_flags_shadow);
>>> +	}
>> Is an smp_wmb() missing here?
>>
>>> +	vq->vring_packed.driver->off_wrap = cpu_to_virtio16(_vq->vdev, off_wrap);
>> And according to the spec, it looks to me writing VRING_EVENT_F_ENABLE is
>> sufficient here.
> I didn't think much when implementing the event suppression
> for packed ring previously. After I saw your comments, I found
> something new. Indeed, unlike the split ring, for the packed
> ring, spec doesn't say we must use VRING_EVENT_F_DESC when
> EVENT_IDX is negotiated. So do you think below thought is
> right or makes sense?
>
> - For virtqueue_enable_cb_prepare(), we just need to enable
>    the ring by setting flags to VRING_EVENT_F_ENABLE in any
>    case.
>
> - We will try to use VRING_EVENT_F_DESC (if EVENT_IDX is
>    negotiated) only when we want to delay the interrupts
>    virtqueue_enable_cb_delayed().

This looks good to me.
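
Concretely, the proposal might boil down to something like this sketch
(the helper and its name are assumed, not taken from the patch):

static void packed_set_event_flags(struct vring_virtqueue *vq, bool delayed)
{
	u16 flags;

	if (vq->event && delayed)
		flags = VRING_EVENT_F_DESC;	/* interrupt at the event index */
	else
		flags = VRING_EVENT_F_ENABLE;	/* interrupt on any used desc */

	if (vq->event_flags_shadow != flags) {
		vq->event_flags_shadow = flags;
		vq->vring_packed.driver->flags =
			cpu_to_virtio16(vq->vq.vdev, vq->event_flags_shadow);
	}
}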

>
>>> +	END_USE(vq);
>>> +	return last_used_idx;
>>> +}
>>> +
> [...]
>>> @@ -1157,14 +1852,18 @@ void vring_transport_features(struct virtio_device *vdev)
>>>    	for (i = VIRTIO_TRANSPORT_F_START; i < VIRTIO_TRANSPORT_F_END; i++) {
>>>    		switch (i) {
>>> -		case VIRTIO_RING_F_INDIRECT_DESC:
>>> +#if 0
>>> +		case VIRTIO_RING_F_INDIRECT_DESC: // FIXME not tested yet.
>>>    			break;
>>> -		case VIRTIO_RING_F_EVENT_IDX:
>>> +		case VIRTIO_RING_F_EVENT_IDX: // FIXME probably not work.
>>>    			break;
>>> +#endif
>> It would be better if you can split EVENT_IDX and INDIRECT_DESC into
>> separate patches too.
> Sure. Will do it in the next version.
>
> Thanks for the review!

Thanks.

>> Thanks
>>


* Re: [RFC v2] virtio: support packed ring
  2018-04-17  2:11     ` Jason Wang
@ 2018-04-17  2:17       ` Michael S. Tsirkin
  2018-04-17  2:24         ` Jason Wang
  2018-04-17  2:51       ` Tiwei Bie
  1 sibling, 1 reply; 28+ messages in thread
From: Michael S. Tsirkin @ 2018-04-17  2:17 UTC (permalink / raw)
  To: Jason Wang
  Cc: Tiwei Bie, wexu, virtualization, linux-kernel, netdev, jfreimann

On Tue, Apr 17, 2018 at 10:11:58AM +0800, Jason Wang wrote:
> 
> 
> On 2018-04-13 15:15, Tiwei Bie wrote:
> > On Fri, Apr 13, 2018 at 12:30:24PM +0800, Jason Wang wrote:
> > > On 2018-04-01 22:12, Tiwei Bie wrote:
> > > > [...]
> > 
> > [...]
> > > > +static bool virtqueue_kick_prepare_packed(struct virtqueue *_vq)
> > > > +{
> > > > +	struct vring_virtqueue *vq = to_vvq(_vq);
> > > > +	u16 new, old, off_wrap;
> > > > +	bool needs_kick;
> > > > +
> > > > +	START_USE(vq);
> > > > +	/* We need to expose the new flags value before checking notification
> > > > +	 * suppressions. */
> > > > +	virtio_mb(vq->weak_barriers);
> > > > +
> > > > +	old = vq->next_avail_idx - vq->num_added;
> > > > +	new = vq->next_avail_idx;
> > > > +	vq->num_added = 0;
> > > > +
> > > > +#ifdef DEBUG
> > > > +	if (vq->last_add_time_valid) {
> > > > +		WARN_ON(ktime_to_ms(ktime_sub(ktime_get(),
> > > > +					      vq->last_add_time)) > 100);
> > > > +	}
> > > > +	vq->last_add_time_valid = false;
> > > > +#endif
> > > > +
> > > > +	off_wrap = virtio16_to_cpu(_vq->vdev, vq->vring_packed.device->off_wrap);
> > > > +
> > > > +	if (vq->event) {
> > > It looks to me we should examine RING_EVENT_FLAGS_DESC in desc_event_flags
> > > instead of vq->event here. The spec does not force the use of event_off and
> > > event_wrap if event index is enabled.
> > > 
> > > > +		// FIXME: fix this!
> > > > +		needs_kick = ((off_wrap >> 15) == vq->wrap_counter) &&
> > > > +			     vring_need_event(off_wrap & ~(1<<15), new, old);
> > > Why do we need the & here?
> > Because wrap_counter (the most significant bit in off_wrap)
> > isn't part of the index.
> > 
> > > > +	} else {
> > > Need a smp_rmb() to make sure desc_event_flags was checked before flags.
> > I don't get your point. If my understanding is correct,
> > desc_event_flags is vq->vring_packed.device->flags. So
> > what's the "flags"?
> 
> Sorry, I mean we need to check device.flags before off_wrap. So it needs an
> smp_rmb() in the middle.

It's best to just read all flags atomically as u32.
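
A sketch of that idea (the field order in the union is an assumption
about the event suppression layout): snapshot the whole suppression
word in one 32-bit load, so no barrier is needed between the halves.

union packed_event_snapshot {
	u32 u32;
	struct {
		__virtio16 off_wrap;
		__virtio16 flags;
	};
};

static union packed_event_snapshot
read_device_event(const struct vring_virtqueue *vq)
{
	union packed_event_snapshot snap;

	/* one 32-bit load: flags and off_wrap are read together */
	snap.u32 = READ_ONCE(*(const u32 *)vq->vring_packed.device);
	return snap;
}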

> It looks to me there's no guarantee that
> VRING_EVENT_F_DESC is set if event index is supported.
> 
> > 
> > > > +		needs_kick = (vq->vring_packed.device->flags !=
> > > > +			      cpu_to_virtio16(_vq->vdev, VRING_EVENT_F_DISABLE));
> > > > +	}
> > > > +	END_USE(vq);
> > > > +	return needs_kick;
> > > > +}
> > [...]
> > > > +static unsigned virtqueue_enable_cb_prepare_packed(struct virtqueue *_vq)
> > > > +{
> > > > +	struct vring_virtqueue *vq = to_vvq(_vq);
> > > > +	u16 last_used_idx, wrap_counter, off_wrap;
> > > > +
> > > > +	START_USE(vq);
> > > > +
> > > > +	last_used_idx = vq->last_used_idx;
> > > > +	wrap_counter = vq->wrap_counter;
> > > > +
> > > > +	if (last_used_idx > vq->next_avail_idx)
> > > > +		wrap_counter ^= 1;
> > > > +
> > > > +	off_wrap = last_used_idx | (wrap_counter << 15);
> > > > +
> > > > +	/* We optimistically turn back on interrupts, then check if there was
> > > > +	 * more to do. */
> > > > +	/* Depending on the VIRTIO_RING_F_EVENT_IDX feature, we need to
> > > > +	 * either clear the flags bit or point the event index at the next
> > > > +	 * entry. Always do both to keep code simple. */
> > > > +	if (vq->event_flags_shadow == VRING_EVENT_F_DISABLE) {
> > > > +		vq->event_flags_shadow = vq->event ? VRING_EVENT_F_DESC:
> > > > +						     VRING_EVENT_F_ENABLE;
> > > > +		vq->vring_packed.driver->flags = cpu_to_virtio16(_vq->vdev,
> > > > +							vq->event_flags_shadow);
> > > > +	}
> > > Is an smp_wmb() missing here?
> > > 
> > > > +	vq->vring_packed.driver->off_wrap = cpu_to_virtio16(_vq->vdev, off_wrap);
> > > And according to the spec, it looks to me writing VRING_EVENT_F_ENABLE is
> > > sufficient here.
> > I didn't think much when implementing the event suppression
> > for packed ring previously. After I saw your comments, I found
> > something new. Indeed, unlike the split ring, for the packed
> > ring, spec doesn't say we must use VRING_EVENT_F_DESC when
> > EVENT_IDX is negotiated. So do you think below thought is
> > right or makes sense?
> > 
> > - For virtqueue_enable_cb_prepare(), we just need to enable
> >    the ring by setting flags to VRING_EVENT_F_ENABLE in any
> >    case.
> > 
> > - We will try to use VRING_EVENT_F_DESC (if EVENT_IDX is
> >    negotiated) only when we want to delay the interrupts
> >    virtqueue_enable_cb_delayed().
> 
> This looks good to me.

I suspect this will lead to extra interrupts if host is fast.
So I think for now we should always use VRING_EVENT_F_DESC
if EVENT_IDX is negotiated.

VRING_EVENT_F_DISABLE makes more sense to me.



* Re: [RFC v2] virtio: support packed ring
  2018-04-17  2:17       ` Michael S. Tsirkin
@ 2018-04-17  2:24         ` Jason Wang
  2018-04-17  2:37           ` Michael S. Tsirkin
  0 siblings, 1 reply; 28+ messages in thread
From: Jason Wang @ 2018-04-17  2:24 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Tiwei Bie, wexu, virtualization, linux-kernel, netdev, jfreimann



On 2018-04-17 10:17, Michael S. Tsirkin wrote:
> On Tue, Apr 17, 2018 at 10:11:58AM +0800, Jason Wang wrote:
>>
>> On 2018-04-13 15:15, Tiwei Bie wrote:
>>> On Fri, Apr 13, 2018 at 12:30:24PM +0800, Jason Wang wrote:
>>>> On 2018-04-01 22:12, Tiwei Bie wrote:
>>>>> [...]
>>> [...]

[...]

>>>> It looks to me we should examine RING_EVENT_FLAGS_DESC in desc_event_flags
>>>> instead of vq->event here. The spec does not force the use of event_off and
>>>> event_wrap if event index is enabled.
>>>>
>>>>> +		// FIXME: fix this!
>>>>> +		needs_kick = ((off_wrap >> 15) == vq->wrap_counter) &&
>>>>> +			     vring_need_event(off_wrap & ~(1<<15), new, old);
>>>> Why do we need the & here?
>>> Because wrap_counter (the most significant bit in off_wrap)
>>> isn't part of the index.
>>>
>>>>> +	} else {
>>>> Need a smp_rmb() to make sure desc_event_flags was checked before flags.
>>> I don't get your point. If my understanding is correct,
>>> desc_event_flags is vq->vring_packed.device->flags. So
>>> what's the "flags"?
>> Sorry, I mean we need to check device.flags before off_wrap. So it needs an
>> smp_rmb() in the middle.
> It's best to just read all flags atomically as u32.

Yes it is.

>
>> It looks to me there's no guarantee that
>> VRING_EVENT_F_DESC is set if event index is supported.
>>
>>>>> +		needs_kick = (vq->vring_packed.device->flags !=
>>>>> +			      cpu_to_virtio16(_vq->vdev, VRING_EVENT_F_DISABLE));
>>>>> +	}
>>>>> +	END_USE(vq);
>>>>> +	return needs_kick;
>>>>> +}
>>> [...]
>>>>> +static unsigned virtqueue_enable_cb_prepare_packed(struct virtqueue *_vq)
>>>>> +{
>>>>> +	struct vring_virtqueue *vq = to_vvq(_vq);
>>>>> +	u16 last_used_idx, wrap_counter, off_wrap;
>>>>> +
>>>>> +	START_USE(vq);
>>>>> +
>>>>> +	last_used_idx = vq->last_used_idx;
>>>>> +	wrap_counter = vq->wrap_counter;
>>>>> +
>>>>> +	if (last_used_idx > vq->next_avail_idx)
>>>>> +		wrap_counter ^= 1;
>>>>> +
>>>>> +	off_wrap = last_used_idx | (wrap_counter << 15);
>>>>> +
>>>>> +	/* We optimistically turn back on interrupts, then check if there was
>>>>> +	 * more to do. */
>>>>> +	/* Depending on the VIRTIO_RING_F_EVENT_IDX feature, we need to
>>>>> +	 * either clear the flags bit or point the event index at the next
>>>>> +	 * entry. Always do both to keep code simple. */
>>>>> +	if (vq->event_flags_shadow == VRING_EVENT_F_DISABLE) {
>>>>> +		vq->event_flags_shadow = vq->event ? VRING_EVENT_F_DESC:
>>>>> +						     VRING_EVENT_F_ENABLE;
>>>>> +		vq->vring_packed.driver->flags = cpu_to_virtio16(_vq->vdev,
>>>>> +							vq->event_flags_shadow);
>>>>> +	}
>>>> Is an smp_wmb() missing here?
>>>>
>>>>> +	vq->vring_packed.driver->off_wrap = cpu_to_virtio16(_vq->vdev, off_wrap);
>>>> And according to the spec, it looks to me writing VRING_EVENT_F_ENABLE is
>>>> sufficient here.
>>> I didn't think much when implementing the event suppression
>>> for packed ring previously. After I saw your comments, I found
>>> something new. Indeed, unlike the split ring, for the packed
>>> ring, spec doesn't say we must use VRING_EVENT_F_DESC when
>>> EVENT_IDX is negotiated. So do you think below thought is
>>> right or makes sense?
>>>
>>> - For virtqueue_enable_cb_prepare(), we just need to enable
>>>     the ring by setting flags to VRING_EVENT_F_ENABLE in any
>>>     case.
>>>
>>> - We will try to use VRING_EVENT_F_DESC (if EVENT_IDX is
>>>     negotiated) only when we want to delay the interrupts
>>>     virtqueue_enable_cb_delayed().
>> This looks good to me.
> I suspect this will lead to extra interrupts if host is fast.
> So I think for now we should always use VRING_EVENT_F_DESC
> if EVENT_IDX is negotiated.

Right, so if this is true, maybe we'd better force this in the spec?

Thanks

>
> VRING_EVENT_F_DISABLE makes more sense to me.
>

[...]


* Re: [RFC v2] virtio: support packed ring
  2018-04-17  2:24         ` Jason Wang
@ 2018-04-17  2:37           ` Michael S. Tsirkin
  0 siblings, 0 replies; 28+ messages in thread
From: Michael S. Tsirkin @ 2018-04-17  2:37 UTC (permalink / raw)
  To: Jason Wang
  Cc: Tiwei Bie, wexu, virtualization, linux-kernel, netdev, jfreimann

On Tue, Apr 17, 2018 at 10:24:32AM +0800, Jason Wang wrote:
> 
> 
> On 2018-04-17 10:17, Michael S. Tsirkin wrote:
> > On Tue, Apr 17, 2018 at 10:11:58AM +0800, Jason Wang wrote:
> > > 
> > > On 2018-04-13 15:15, Tiwei Bie wrote:
> > > > On Fri, Apr 13, 2018 at 12:30:24PM +0800, Jason Wang wrote:
> > > > > On 2018-04-01 22:12, Tiwei Bie wrote:
> > > > > > [...]
> > > > I didn't think much when implementing the event suppression
> > > > for packed ring previously. After I saw your comments, I found
> > > > something new. Indeed, unlike the split ring, for the packed
> > > > ring, spec doesn't say we must use VRING_EVENT_F_DESC when
> > > > EVENT_IDX is negotiated. So do you think below thought is
> > > > right or makes sense?
> > > > 
> > > > - For virtqueue_enable_cb_prepare(), we just need to enable
> > > >     the ring by setting flags to VRING_EVENT_F_ENABLE in any
> > > >     case.
> > > > 
> > > > - We will try to use VRING_EVENT_F_DESC (if EVENT_IDX is
> > > >     negotiated) only when we want to delay the interrupts
> > > >     virtqueue_enable_cb_delayed().
> > > This looks good to me.
> > I suspect this will lead to extra interrupts if host is fast.
> > So I think for now we should always use VRING_EVENT_F_DESC
> > if EVENT_IDX is negotiated.
> 
> Right, so if this is true, maybe we'd better force this in the spec?
> 
> Thanks

I did consider this.

Why it is the way it is:

- if we allow DISABLE it seems nicer to allow ENABLE as well
for symmetry.

- ENABLE is handy for hardware offload devices,
where kicks do not trigger an exit so they're not worth
bothering with, but interrupts still slow the system down,
so the event index might be worth it.

> > 
> > VRING_EVENT_F_DISABLE makes more sense to me.
> > 
> 
> [...]


* Re: [RFC v2] virtio: support packed ring
  2018-04-17  2:11     ` Jason Wang
  2018-04-17  2:17       ` Michael S. Tsirkin
@ 2018-04-17  2:51       ` Tiwei Bie
  2018-04-17 12:17         ` Michael S. Tsirkin
  1 sibling, 1 reply; 28+ messages in thread
From: Tiwei Bie @ 2018-04-17  2:51 UTC (permalink / raw)
  To: Jason Wang; +Cc: mst, wexu, virtualization, linux-kernel, netdev, jfreimann

On Tue, Apr 17, 2018 at 10:11:58AM +0800, Jason Wang wrote:
> On 2018-04-13 15:15, Tiwei Bie wrote:
> > On Fri, Apr 13, 2018 at 12:30:24PM +0800, Jason Wang wrote:
> > > On 2018-04-01 22:12, Tiwei Bie wrote:
[...]
> > > > +static int detach_buf_packed(struct vring_virtqueue *vq, unsigned int head,
> > > > +			      void **ctx)
> > > > +{
> > > > +	struct vring_packed_desc *desc;
> > > > +	unsigned int i, j;
> > > > +
> > > > +	/* Clear data ptr. */
> > > > +	vq->desc_state[head].data = NULL;
> > > > +
> > > > +	i = head;
> > > > +
> > > > +	for (j = 0; j < vq->desc_state[head].num; j++) {
> > > > +		desc = &vq->vring_packed.desc[i];
> > > > +		vring_unmap_one_packed(vq, desc);
> > > > +		desc->flags = 0x0;
> > > Looks like this is unnecessary.
> > It's safer to zero it. If we don't zero it, after we
> > call virtqueue_detach_unused_buf_packed() which calls
> > this function, the desc is still available to the
> > device.
> 
> Well, detach_unused_buf_packed() should be called after the device is stopped;
> otherwise, even if you try to clear it, there will still be a window where the
> device may use it.

This is not about whether the device has been stopped or
not. We don't have other places to re-initialize the ring
descriptors and wrap_counter. So they need to be set to
the correct values when doing detach_unused_buf.

Best regards,
Tiwei Bie


* Re: [RFC v2] virtio: support packed ring
  2018-04-17  2:51       ` Tiwei Bie
@ 2018-04-17 12:17         ` Michael S. Tsirkin
  2018-04-17 12:47           ` Tiwei Bie
  0 siblings, 1 reply; 28+ messages in thread
From: Michael S. Tsirkin @ 2018-04-17 12:17 UTC (permalink / raw)
  To: Tiwei Bie
  Cc: Jason Wang, wexu, virtualization, linux-kernel, netdev, jfreimann

On Tue, Apr 17, 2018 at 10:51:33AM +0800, Tiwei Bie wrote:
> On Tue, Apr 17, 2018 at 10:11:58AM +0800, Jason Wang wrote:
> > On 2018-04-13 15:15, Tiwei Bie wrote:
> > > On Fri, Apr 13, 2018 at 12:30:24PM +0800, Jason Wang wrote:
> > > > On 2018-04-01 22:12, Tiwei Bie wrote:
> [...]
> > > > > +static int detach_buf_packed(struct vring_virtqueue *vq, unsigned int head,
> > > > > +			      void **ctx)
> > > > > +{
> > > > > +	struct vring_packed_desc *desc;
> > > > > +	unsigned int i, j;
> > > > > +
> > > > > +	/* Clear data ptr. */
> > > > > +	vq->desc_state[head].data = NULL;
> > > > > +
> > > > > +	i = head;
> > > > > +
> > > > > +	for (j = 0; j < vq->desc_state[head].num; j++) {
> > > > > +		desc = &vq->vring_packed.desc[i];
> > > > > +		vring_unmap_one_packed(vq, desc);
> > > > > +		desc->flags = 0x0;
> > > > Looks like this is unnecessary.
> > > It's safer to zero it. If we don't zero it, after we
> > > call virtqueue_detach_unused_buf_packed() which calls
> > > this function, the desc is still available to the
> > > device.
> > 
> > Well, detach_unused_buf_packed() should be called after the device is stopped;
> > otherwise, even if you try to clear it, there will still be a window where the
> > device may use it.
> 
> This is not about whether the device has been stopped or
> not. We don't have other places to re-initialize the ring
> descriptors and wrap_counter. So they need to be set to
> the correct values when doing detach_unused_buf.
> 
> Best regards,
> Tiwei Bie

find_vqs is the time to do it.

-- 
MST


* Re: [RFC v2] virtio: support packed ring
  2018-04-17 12:17         ` Michael S. Tsirkin
@ 2018-04-17 12:47           ` Tiwei Bie
  2018-04-17 14:04             ` Michael S. Tsirkin
  0 siblings, 1 reply; 28+ messages in thread
From: Tiwei Bie @ 2018-04-17 12:47 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Jason Wang, wexu, virtualization, linux-kernel, netdev, jfreimann

On Tue, Apr 17, 2018 at 03:17:41PM +0300, Michael S. Tsirkin wrote:
> On Tue, Apr 17, 2018 at 10:51:33AM +0800, Tiwei Bie wrote:
> > [...]
> > This is not about whether the device has been stopped or
> > not. We don't have other places to re-initialize the ring
> > descriptors and wrap_counter. So they need to be set to
> > the correct values when doing detach_unused_buf.
> > 
> > Best regards,
> > Tiwei Bie
> 
> find_vqs is the time to do it.

The .find_vqs() will call .setup_vq(), which will eventually
call vring_create_virtqueue(). It's a different case. Here
we're talking about re-initializing the descs and updating
the wrap counter when detaching the unused descs (in this
case, the split ring just needs to decrease vring.avail->idx).

Best regards,
Tiwei Bie


* Re: [RFC v2] virtio: support packed ring
  2018-04-17 12:47           ` Tiwei Bie
@ 2018-04-17 14:04             ` Michael S. Tsirkin
  2018-04-17 14:56               ` Tiwei Bie
  0 siblings, 1 reply; 28+ messages in thread
From: Michael S. Tsirkin @ 2018-04-17 14:04 UTC (permalink / raw)
  To: Tiwei Bie
  Cc: Jason Wang, wexu, virtualization, linux-kernel, netdev, jfreimann

On Tue, Apr 17, 2018 at 08:47:16PM +0800, Tiwei Bie wrote:
> On Tue, Apr 17, 2018 at 03:17:41PM +0300, Michael S. Tsirkin wrote:
> > [...]
> > 
> > find_vqs is the time to do it.
> 
> The .find_vqs() will call .setup_vq(), which will eventually
> call vring_create_virtqueue(). It's a different case. Here
> we're talking about re-initializing the descs and updating
> the wrap counter when detaching the unused descs (in this
> case, the split ring just needs to decrease vring.avail->idx).
> 
> Best regards,
> Tiwei Bie

There's no requirement that virtqueue_detach_unused_buf re-initializes
the descs. It happens on the cleanup path just before drivers delete the
vqs.

-- 
MST


* Re: [RFC v2] virtio: support packed ring
  2018-04-17 14:04             ` Michael S. Tsirkin
@ 2018-04-17 14:56               ` Tiwei Bie
  2018-04-17 15:54                 ` Michael S. Tsirkin
  0 siblings, 1 reply; 28+ messages in thread
From: Tiwei Bie @ 2018-04-17 14:56 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Jason Wang, wexu, virtualization, linux-kernel, netdev, jfreimann

On Tue, Apr 17, 2018 at 05:04:59PM +0300, Michael S. Tsirkin wrote:
> [...]
> There's no requirement that  virtqueue_detach_unused_buf re-initializes
> the descs. It happens on cleanup path just before drivers delete the
> vqs.

Cool, I wasn't aware of it. I saw the split ring decrease
vring.avail->idx after detaching an unused desc, so I
thought detaching an unused desc also needs to make sure
that the ring state is updated correspondingly.

If there is no such requirement, do you think it's OK
to remove the below two lines:

vq->avail_idx_shadow--;
vq->vring.avail->idx = cpu_to_virtio16(_vq->vdev, vq->avail_idx_shadow);

from virtqueue_detach_unused_buf(), and we could have
one generic function to handle both rings:

void *virtqueue_detach_unused_buf(struct virtqueue *_vq)
{
	struct vring_virtqueue *vq = to_vvq(_vq);
	unsigned int num, i;
	void *buf;

	START_USE(vq);

	num = vq->packed ? vq->vring_packed.num : vq->vring.num;

	for (i = 0; i < num; i++) {
		if (!vq->desc_state[i].data)
			continue;
		/* detach_buf clears data, so grab it now. */
		buf = vq->desc_state[i].data;
		detach_buf(vq, i, NULL);
		END_USE(vq);
		return buf;
	}
	/* That should have freed everything. */
	BUG_ON(vq->vq.num_free != num);

	END_USE(vq);
	return NULL;
}
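
(Here detach_buf() would be a small dispatcher -- just a rough sketch
of what I have in mind, assuming the split path gets factored into a
detach_buf_split() with the same signature:)

static int detach_buf(struct vring_virtqueue *vq, unsigned int head,
		      void **ctx)
{
	/* Dispatch on the ring layout; both variants clear
	 * desc_state[head].data and unmap the buffers. */
	if (vq->packed)
		return detach_buf_packed(vq, head, ctx);
	return detach_buf_split(vq, head, ctx);
}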

Best regards,
Tiwei Bie


* Re: [RFC v2] virtio: support packed ring
  2018-04-17 14:56               ` Tiwei Bie
@ 2018-04-17 15:54                 ` Michael S. Tsirkin
  2018-04-18  1:17                   ` Tiwei Bie
  0 siblings, 1 reply; 28+ messages in thread
From: Michael S. Tsirkin @ 2018-04-17 15:54 UTC (permalink / raw)
  To: Tiwei Bie
  Cc: Jason Wang, wexu, virtualization, linux-kernel, netdev, jfreimann

On Tue, Apr 17, 2018 at 10:56:26PM +0800, Tiwei Bie wrote:
> On Tue, Apr 17, 2018 at 05:04:59PM +0300, Michael S. Tsirkin wrote:
> > On Tue, Apr 17, 2018 at 08:47:16PM +0800, Tiwei Bie wrote:
> > > On Tue, Apr 17, 2018 at 03:17:41PM +0300, Michael S. Tsirkin wrote:
> > > > On Tue, Apr 17, 2018 at 10:51:33AM +0800, Tiwei Bie wrote:
> > > > > On Tue, Apr 17, 2018 at 10:11:58AM +0800, Jason Wang wrote:
> > > > > > On 2018年04月13日 15:15, Tiwei Bie wrote:
> > > > > > > On Fri, Apr 13, 2018 at 12:30:24PM +0800, Jason Wang wrote:
> > > > > > > > On 2018年04月01日 22:12, Tiwei Bie wrote:
> > > > > [...]
> > > > > > > > > +static int detach_buf_packed(struct vring_virtqueue *vq, unsigned int head,
> > > > > > > > > +			      void **ctx)
> > > > > > > > > +{
> > > > > > > > > +	struct vring_packed_desc *desc;
> > > > > > > > > +	unsigned int i, j;
> > > > > > > > > +
> > > > > > > > > +	/* Clear data ptr. */
> > > > > > > > > +	vq->desc_state[head].data = NULL;
> > > > > > > > > +
> > > > > > > > > +	i = head;
> > > > > > > > > +
> > > > > > > > > +	for (j = 0; j < vq->desc_state[head].num; j++) {
> > > > > > > > > +		desc = &vq->vring_packed.desc[i];
> > > > > > > > > +		vring_unmap_one_packed(vq, desc);
> > > > > > > > > +		desc->flags = 0x0;
> > > > > > > > Looks like this is unnecessary.
> > > > > > > It's safer to zero it. If we don't zero it, after we
> > > > > > > call virtqueue_detach_unused_buf_packed() which calls
> > > > > > > this function, the desc is still available to the
> > > > > > > device.
> > > > > > 
> > > > > > Well detach_unused_buf_packed() should be called after device is stopped,
> > > > > > otherwise even if you try to clear, there will still be a window that device
> > > > > > may use it.
> > > > > 
> > > > > This is not about whether the device has been stopped or
> > > > > not. We don't have other places to re-initialize the ring
> > > > > descriptors and wrap_counter. So they need to be set to
> > > > > the correct values when doing detach_unused_buf.
> > > > > 
> > > > > Best regards,
> > > > > Tiwei Bie
> > > > 
> > > > find vqs is the time to do it.
> > > 
> > > The .find_vqs() will call .setup_vq() which will eventually
> > > call vring_create_virtqueue(). It's a different case. Here
> > > we're talking about re-initializing the descs and updating
> > > the wrap counter when detaching the unused descs (In this
> > > case, split ring just needs to decrease vring.avail->idx).
> > > 
> > > Best regards,
> > > Tiwei Bie
> > 
> > There's no requirement that  virtqueue_detach_unused_buf re-initializes
> > the descs. It happens on cleanup path just before drivers delete the
> > vqs.
> 
> Cool, I wasn't aware of it. I saw split ring decrease
> vring.avail->idx after detaching an unused desc, so I
> thought detaching unused desc also needs to make sure
> that the ring state will be updated correspondingly.


Hmm. You are right. Seems to be our console driver being out of spec.
Will have to look at how to fix that :(

It was done here:

Commit b3258ff1d6086bd2b9eeb556844a868ad7d49bc8
Author: Amit Shah <amit.shah@redhat.com>
Date:   Wed Mar 16 19:12:10 2011 +0530

    virtio: Decrement avail idx on buffer detach
    
    When detaching a buffer from a vq, the avail.idx value should be
    decremented as well.
    
    This was noticed by hot-unplugging a virtio console port and then
    plugging in a new one on the same number (re-using the vqs which were
    just 'disowned').  qemu reported
    
       'Guest moved used index from 0 to 256'
    
    when any IO was attempted on the new port.
    
    CC: stable@kernel.org
    Reported-by: juzhang <juzhang@redhat.com>
    Signed-off-by: Amit Shah <amit.shah@redhat.com>
    Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>

The spec is quite explicit though:
	A driver MUST NOT decrement the available idx on a live virtqueue (ie. there is no way to “unexpose”
	buffers).

> If there is no such requirement, do you think it's OK
> to remove below two lines:
> 
> vq->avail_idx_shadow--;
> vq->vring.avail->idx = cpu_to_virtio16(_vq->vdev, vq->avail_idx_shadow);
> 
> from virtqueue_detach_unused_buf(), and we could have
> one generic function to handle both rings:
> 
> void *virtqueue_detach_unused_buf(struct virtqueue *_vq)
> {
> 	struct vring_virtqueue *vq = to_vvq(_vq);
> 	unsigned int num, i;
> 	void *buf;
> 
> 	START_USE(vq);
> 
> 	num = vq->packed ? vq->vring_packed.num : vq->vring.num;
> 
> 	for (i = 0; i < num; i++) {
> 		if (!vq->desc_state[i].data)
> 			continue;
> 		/* detach_buf clears data, so grab it now. */
> 		buf = vq->desc_state[i].data;
> 		detach_buf(vq, i, NULL);
> 		END_USE(vq);
> 		return buf;
> 	}
> 	/* That should have freed everything. */
> 	BUG_ON(vq->vq.num_free != num);
> 
> 	END_USE(vq);
> 	return NULL;
> }
> 
> Best regards,
> Tiwei Bie


* Re: [RFC v2] virtio: support packed ring
  2018-04-17 15:54                 ` Michael S. Tsirkin
@ 2018-04-18  1:17                   ` Tiwei Bie
  0 siblings, 0 replies; 28+ messages in thread
From: Tiwei Bie @ 2018-04-18  1:17 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Jason Wang, wexu, virtualization, linux-kernel, netdev, jfreimann

On Tue, Apr 17, 2018 at 06:54:51PM +0300, Michael S. Tsirkin wrote:
> On Tue, Apr 17, 2018 at 10:56:26PM +0800, Tiwei Bie wrote:
> > On Tue, Apr 17, 2018 at 05:04:59PM +0300, Michael S. Tsirkin wrote:
> > > On Tue, Apr 17, 2018 at 08:47:16PM +0800, Tiwei Bie wrote:
> > > > On Tue, Apr 17, 2018 at 03:17:41PM +0300, Michael S. Tsirkin wrote:
> > > > > On Tue, Apr 17, 2018 at 10:51:33AM +0800, Tiwei Bie wrote:
> > > > > > On Tue, Apr 17, 2018 at 10:11:58AM +0800, Jason Wang wrote:
> > > > > > > On 2018年04月13日 15:15, Tiwei Bie wrote:
> > > > > > > > On Fri, Apr 13, 2018 at 12:30:24PM +0800, Jason Wang wrote:
> > > > > > > > > On 2018年04月01日 22:12, Tiwei Bie wrote:
> > > > > > [...]
> > > > > > > > > > +static int detach_buf_packed(struct vring_virtqueue *vq, unsigned int head,
> > > > > > > > > > +			      void **ctx)
> > > > > > > > > > +{
> > > > > > > > > > +	struct vring_packed_desc *desc;
> > > > > > > > > > +	unsigned int i, j;
> > > > > > > > > > +
> > > > > > > > > > +	/* Clear data ptr. */
> > > > > > > > > > +	vq->desc_state[head].data = NULL;
> > > > > > > > > > +
> > > > > > > > > > +	i = head;
> > > > > > > > > > +
> > > > > > > > > > +	for (j = 0; j < vq->desc_state[head].num; j++) {
> > > > > > > > > > +		desc = &vq->vring_packed.desc[i];
> > > > > > > > > > +		vring_unmap_one_packed(vq, desc);
> > > > > > > > > > +		desc->flags = 0x0;
> > > > > > > > > Looks like this is unnecessary.
> > > > > > > > It's safer to zero it. If we don't zero it, after we
> > > > > > > > call virtqueue_detach_unused_buf_packed() which calls
> > > > > > > > this function, the desc is still available to the
> > > > > > > > device.
> > > > > > > 
> > > > > > > Well detach_unused_buf_packed() should be called after device is stopped,
> > > > > > > otherwise even if you try to clear, there will still be a window that device
> > > > > > > may use it.
> > > > > > 
> > > > > > This is not about whether the device has been stopped or
> > > > > > not. We don't have other places to re-initialize the ring
> > > > > > descriptors and wrap_counter. So they need to be set to
> > > > > > the correct values when doing detach_unused_buf.
> > > > > > 
> > > > > > Best regards,
> > > > > > Tiwei Bie
> > > > > 
> > > > > find vqs is the time to do it.
> > > > 
> > > > The .find_vqs() will call .setup_vq() which will eventually
> > > > call vring_create_virtqueue(). It's a different case. Here
> > > > we're talking about re-initializing the descs and updating
> > > > the wrap counter when detaching the unused descs (In this
> > > > case, split ring just needs to decrease vring.avail->idx).
> > > > 
> > > > Best regards,
> > > > Tiwei Bie
> > > 
> > > There's no requirement that  virtqueue_detach_unused_buf re-initializes
> > > the descs. It happens on cleanup path just before drivers delete the
> > > vqs.
> > 
> > Cool, I wasn't aware of it. I saw split ring decrease
> > vring.avail->idx after detaching an unused desc, so I
> > thought detaching unused desc also needs to make sure
> > that the ring state will be updated correspondingly.
> 
> 
> Hmm. You are right. Seems to be our console driver being out of spec.
> Will have to look at how to fix that :(
> 
> It was done here:
> 
> Commit b3258ff1d6086bd2b9eeb556844a868ad7d49bc8
> Author: Amit Shah <amit.shah@redhat.com>
> Date:   Wed Mar 16 19:12:10 2011 +0530
> 
>     virtio: Decrement avail idx on buffer detach
>     
>     When detaching a buffer from a vq, the avail.idx value should be
>     decremented as well.
>     
>     This was noticed by hot-unplugging a virtio console port and then
>     plugging in a new one on the same number (re-using the vqs which were
>     just 'disowned').  qemu reported
>     
>        'Guest moved used index from 0 to 256'
>     
>     when any IO was attempted on the new port.
>     
>     CC: stable@kernel.org
>     Reported-by: juzhang <juzhang@redhat.com>
>     Signed-off-by: Amit Shah <amit.shah@redhat.com>
>     Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
> 
> The spec is quite explicit though:
> 	A driver MUST NOT decrement the available idx on a live virtqueue (ie. there is no way to “unexpose”
> 	buffers).
> 

Hmm.. Got it. Thanks!

Best regards,
Tiwei Bie


> 
> 
> 
> 
> > If there is no such requirement, do you think it's OK
> > to remove below two lines:
> > 
> > vq->avail_idx_shadow--;
> > vq->vring.avail->idx = cpu_to_virtio16(_vq->vdev, vq->avail_idx_shadow);
> > 
> > from virtqueue_detach_unused_buf(), and we could have
> > one generic function to handle both rings:
> > 
> > void *virtqueue_detach_unused_buf(struct virtqueue *_vq)
> > {
> > 	struct vring_virtqueue *vq = to_vvq(_vq);
> > 	unsigned int num, i;
> > 	void *buf;
> > 
> > 	START_USE(vq);
> > 
> > 	num = vq->packed ? vq->vring_packed.num : vq->vring.num;
> > 
> > 	for (i = 0; i < num; i++) {
> > 		if (!vq->desc_state[i].data)
> > 			continue;
> > 		/* detach_buf clears data, so grab it now. */
> > 		buf = vq->desc_state[i].data;
> > 		detach_buf(vq, i, NULL);
> > 		END_USE(vq);
> > 		return buf;
> > 	}
> > 	/* That should have freed everything. */
> > 	BUG_ON(vq->vq.num_free != num);
> > 
> > 	END_USE(vq);
> > 	return NULL;
> > }
> > 
> > Best regards,
> > Tiwei Bie


* Re: [RFC v2] virtio: support packed ring
  2018-04-01 14:12 [RFC v2] virtio: support packed ring Tiwei Bie
                   ` (2 preceding siblings ...)
  2018-04-13 15:22 ` Michael S. Tsirkin
@ 2018-04-23  5:42 ` Jason Wang
  2018-04-23  9:29   ` Tiwei Bie
  3 siblings, 1 reply; 28+ messages in thread
From: Jason Wang @ 2018-04-23  5:42 UTC (permalink / raw)
  To: Tiwei Bie, mst, wexu, virtualization, linux-kernel, netdev; +Cc: jfreimann



On 2018年04月01日 22:12, Tiwei Bie wrote:
> Hello everyone,
>
> This RFC implements packed ring support for virtio driver.
>
> The code was tested with DPDK vhost (testpmd/vhost-PMD) implemented
> by Jens at http://dpdk.org/ml/archives/dev/2018-January/089417.html
> Minor changes are needed for the vhost code, e.g. to kick the guest.
>

[...]

> +
> +	if (vq->indirect) {
> +		u32 len;
> +
> +		desc = vq->desc_state[head].indir_desc;
> +		/* Free the indirect table, if any, now that it's unmapped. */
> +		if (!desc)
> +			goto out;
> +
> +		len = virtio32_to_cpu(vq->vq.vdev,
> +				      vq->vring_packed.desc[head].len);
> +
> +		BUG_ON(!(vq->vring_packed.desc[head].flags &
> +			 cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_INDIRECT)));

It looks to me the spec does not force keeping VRING_DESC_F_INDIRECT here,
so we can safely remove this BUG_ON().

> +		BUG_ON(len == 0 || len % sizeof(struct vring_packed_desc));

Len could be ignored for a used descriptor according to the spec, so we
need to remove this BUG_ON() too.

The reason is that we don't touch the descriptor ring in the split case,
so the BUG_ON()s may help there.

> +
> +		for (j = 0; j < len / sizeof(struct vring_packed_desc); j++)
> +			vring_unmap_one_packed(vq, &desc[j]);
> +
> +		kfree(desc);
> +		vq->desc_state[head].indir_desc = NULL;
> +	} else if (ctx) {
> +		*ctx = vq->desc_state[head].indir_desc;
> +	}
> +
> +out:
> +	return vq->desc_state[head].num;
> +}
> +
> +static inline bool more_used_split(const struct vring_virtqueue *vq)
>   {
>   	return vq->last_used_idx != virtio16_to_cpu(vq->vq.vdev, vq->vring.used->idx);
>   }
>   
> +static inline bool more_used_packed(const struct vring_virtqueue *vq)
> +{
> +	u16 last_used, flags;
> +	bool avail, used;
> +
> +	if (vq->vq.num_free == vq->vring_packed.num)
> +		return false;
> +
> +	last_used = vq->last_used_idx;
> +	flags = virtio16_to_cpu(vq->vq.vdev,
> +				vq->vring_packed.desc[last_used].flags);
> +	avail = flags & VRING_DESC_F_AVAIL(1);
> +	used = flags & VRING_DESC_F_USED(1);
> +
> +	return avail == used;
> +}

This looks interesting, the spec says:

"
Thus VIRTQ_DESC_F_AVAIL and VIRTQ_DESC_F_USED bits are different for an
available descriptor and equal for a used descriptor.
Note that this observation is mostly useful for sanity-checking as these
are necessary but not sufficient conditions - for example, all
descriptors are zero-initialized. To detect used and available
descriptors it is possible for drivers and devices to keep track of the
last observed value of VIRTQ_DESC_F_USED/VIRTQ_DESC_F_AVAIL. Other
techniques to detect VIRTQ_DESC_F_AVAIL/VIRTQ_DESC_F_USED bit changes
might also be possible.
"

So it looks to me this is not sufficient. Looking at the example code in
the spec, do we need to track the last seen used_wrap_counter here?
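
Something like the following, perhaps (an untested sketch only --
used_wrap_counter would be a new driver field, toggled whenever
last_used_idx wraps back to zero):

static inline bool more_used_packed(const struct vring_virtqueue *vq)
{
	u16 flags;
	bool avail, used;

	flags = virtio16_to_cpu(vq->vq.vdev,
			vq->vring_packed.desc[vq->last_used_idx].flags);
	avail = flags & VRING_DESC_F_AVAIL(1);
	used = flags & VRING_DESC_F_USED(1);

	/* A descriptor is used only when both bits match the last
	 * observed used wrap counter, not merely each other. */
	return avail == used && used == vq->used_wrap_counter;
}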

Thanks


* Re: [RFC v2] virtio: support packed ring
  2018-04-23  5:42 ` Jason Wang
@ 2018-04-23  9:29   ` Tiwei Bie
  2018-04-24  0:54     ` Jason Wang
  0 siblings, 1 reply; 28+ messages in thread
From: Tiwei Bie @ 2018-04-23  9:29 UTC (permalink / raw)
  To: Jason Wang; +Cc: mst, wexu, virtualization, linux-kernel, netdev, jfreimann

On Mon, Apr 23, 2018 at 01:42:14PM +0800, Jason Wang wrote:
> On 2018年04月01日 22:12, Tiwei Bie wrote:
> > Hello everyone,
> > 
> > This RFC implements packed ring support for virtio driver.
> > 
> > The code was tested with DPDK vhost (testpmd/vhost-PMD) implemented
> > by Jens at http://dpdk.org/ml/archives/dev/2018-January/089417.html
> > Minor changes are needed for the vhost code, e.g. to kick the guest.
> > 
> 
> [...]
> 
> > +
> > +	if (vq->indirect) {
> > +		u32 len;
> > +
> > +		desc = vq->desc_state[head].indir_desc;
> > +		/* Free the indirect table, if any, now that it's unmapped. */
> > +		if (!desc)
> > +			goto out;
> > +
> > +		len = virtio32_to_cpu(vq->vq.vdev,
> > +				      vq->vring_packed.desc[head].len);
> > +
> > +		BUG_ON(!(vq->vring_packed.desc[head].flags &
> > +			 cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_INDIRECT)));
> 
> It looks to me spec does not force to keep VRING_DESC_F_INDIRECT here. So we
> can safely remove this BUG_ON() here.
> 
> > +		BUG_ON(len == 0 || len % sizeof(struct vring_packed_desc));
> 
> Len could be ignored for used descriptor according to the spec, so we need
> remove this BUG_ON() too.

Yeah, you're right! The BUG_ON() isn't right. I'll remove it.
And I think something related to this in the spec isn't very
clear currently.

In the spec, there are the following words:

https://github.com/oasis-tcs/virtio-spec/blob/d4fec517dfcf/packed-ring.tex#L272
"""
In descriptors with VIRTQ_DESC_F_INDIRECT set VIRTQ_DESC_F_WRITE
is reserved and is ignored by the device.
"""

So when the device writes back a used descriptor in this case,
the device may not set the VIRTQ_DESC_F_WRITE flag, as the flag
is reserved and should be ignored.

https://github.com/oasis-tcs/virtio-spec/blob/d4fec517dfcf/packed-ring.tex#L170
"""
Element Length is reserved for used descriptors without the
VIRTQ_DESC_F_WRITE flag, and is ignored by drivers.
"""

And this is how the driver ignores the `len` in a used
descriptor.

https://github.com/oasis-tcs/virtio-spec/blob/d4fec517dfcf/packed-ring.tex#L241
"""
To increase ring capacity the driver can store a (read-only
by the device) table of indirect descriptors anywhere in memory,
and insert a descriptor in the main virtqueue (with \field{Flags}
bit VIRTQ_DESC_F_INDIRECT on) that refers to a buffer element
containing this indirect descriptor table;
"""

So the indirect descriptors in the table are read-only by
the device. And the only descriptor which is writeable by
the device is the descriptor in the main virtqueue (with
Flags bit VIRTQ_DESC_F_INDIRECT on). So if we ignore the
`len` in this descriptor, we won't be able to get the
length of the data written by the device.

So I think the `len` in this descriptor will carry the
length of the data written by the device (if the buffers
are writable to the device) even if VIRTQ_DESC_F_WRITE
isn't set by the device. What do you think?


> 
> The reason is we don't touch descriptor ring in the case of split, so
> BUG_ON()s may help there.
> 
> > +
> > +		for (j = 0; j < len / sizeof(struct vring_packed_desc); j++)
> > +			vring_unmap_one_packed(vq, &desc[j]);
> > +
> > +		kfree(desc);
> > +		vq->desc_state[head].indir_desc = NULL;
> > +	} else if (ctx) {
> > +		*ctx = vq->desc_state[head].indir_desc;
> > +	}
> > +
> > +out:
> > +	return vq->desc_state[head].num;
> > +}
> > +
> > +static inline bool more_used_split(const struct vring_virtqueue *vq)
> >   {
> >   	return vq->last_used_idx != virtio16_to_cpu(vq->vq.vdev, vq->vring.used->idx);
> >   }
> > +static inline bool more_used_packed(const struct vring_virtqueue *vq)
> > +{
> > +	u16 last_used, flags;
> > +	bool avail, used;
> > +
> > +	if (vq->vq.num_free == vq->vring_packed.num)
> > +		return false;
> > +
> > +	last_used = vq->last_used_idx;
> > +	flags = virtio16_to_cpu(vq->vq.vdev,
> > +				vq->vring_packed.desc[last_used].flags);
> > +	avail = flags & VRING_DESC_F_AVAIL(1);
> > +	used = flags & VRING_DESC_F_USED(1);
> > +
> > +	return avail == used;
> > +}
> 
> This looks interesting, spec said:
> 
> "
> Thus VIRTQ_DESC_F_AVAIL and VIRTQ_DESC_F_USED bits are different for an
> available descriptor and
> equal for a used descriptor.
> Note that this observation is mostly useful for sanity-checking as these are
> necessary but not sufficient
> conditions - for example, all descriptors are zero-initialized. To detect
> used and available descriptors it is
> possible for drivers and devices to keep track of the last observed value of
> VIRTQ_DESC_F_USED/VIRTQ_DESC_F_AVAIL. Other techniques to detect
> VIRTQ_DESC_F_AVAIL/VIRTQ_DESC_F_USED bit changes
> might also be possible.
> "
> 
> So it looks to me it was not sufficient, looking at the example codes in
> spec, do we need to track last seen used_wrap_counter here?

I don't think we have to track used_wrap_counter in
the driver. There was a discussion on this:

https://lists.oasis-open.org/archives/virtio-dev/201802/msg00177.html

And after that, the below sentence was added (it's
also in the words you quoted above):

"""
Other techniques to detect
VIRTQ_DESC_F_AVAIL/VIRTQ_DESC_F_USED bit changes
might also be possible.
"""

Best regards,
Tiwei Bie


> 
> Thanks


* Re: [RFC v2] virtio: support packed ring
  2018-04-23  9:29   ` Tiwei Bie
@ 2018-04-24  0:54     ` Jason Wang
  2018-04-24  1:05       ` Michael S. Tsirkin
  0 siblings, 1 reply; 28+ messages in thread
From: Jason Wang @ 2018-04-24  0:54 UTC (permalink / raw)
  To: Tiwei Bie; +Cc: mst, wexu, virtualization, linux-kernel, netdev, jfreimann



On 2018年04月23日 17:29, Tiwei Bie wrote:
> On Mon, Apr 23, 2018 at 01:42:14PM +0800, Jason Wang wrote:
>> On 2018年04月01日 22:12, Tiwei Bie wrote:
>>> Hello everyone,
>>>
>>> This RFC implements packed ring support for virtio driver.
>>>
>>> The code was tested with DPDK vhost (testpmd/vhost-PMD) implemented
>>> by Jens at http://dpdk.org/ml/archives/dev/2018-January/089417.html
>>> Minor changes are needed for the vhost code, e.g. to kick the guest.
>>>
>> [...]
>>
>>> +
>>> +	if (vq->indirect) {
>>> +		u32 len;
>>> +
>>> +		desc = vq->desc_state[head].indir_desc;
>>> +		/* Free the indirect table, if any, now that it's unmapped. */
>>> +		if (!desc)
>>> +			goto out;
>>> +
>>> +		len = virtio32_to_cpu(vq->vq.vdev,
>>> +				      vq->vring_packed.desc[head].len);
>>> +
>>> +		BUG_ON(!(vq->vring_packed.desc[head].flags &
>>> +			 cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_INDIRECT)));
>> It looks to me spec does not force to keep VRING_DESC_F_INDIRECT here. So we
>> can safely remove this BUG_ON() here.
>>
>>> +		BUG_ON(len == 0 || len % sizeof(struct vring_packed_desc));
>> Len could be ignored for used descriptor according to the spec, so we need
>> remove this BUG_ON() too.
> Yeah, you're right! The BUG_ON() isn't right. I'll remove it.
> And I think something related to this in the spec isn't very
> clear currently.
>
> In the spec, there are below words:
>
> https://github.com/oasis-tcs/virtio-spec/blob/d4fec517dfcf/packed-ring.tex#L272
> """
> In descriptors with VIRTQ_DESC_F_INDIRECT set VIRTQ_DESC_F_WRITE
> is reserved and is ignored by the device.
> """
>
> So when device writes back an used descriptor in this case,
> device may not set the VIRTQ_DESC_F_WRITE flag as the flag
> is reserved and should be ignored.
>
> https://github.com/oasis-tcs/virtio-spec/blob/d4fec517dfcf/packed-ring.tex#L170
> """
> Element Length is reserved for used descriptors without the
> VIRTQ_DESC_F_WRITE flag, and is ignored by drivers.
> """
>
> And this is the way how driver ignores the `len` in an used
> descriptor.
>
> https://github.com/oasis-tcs/virtio-spec/blob/d4fec517dfcf/packed-ring.tex#L241
> """
> To increase ring capacity the driver can store a (read-only
> by the device) table of indirect descriptors anywhere in memory,
> and insert a descriptor in the main virtqueue (with \field{Flags}
> bit VIRTQ_DESC_F_INDIRECT on) that refers to a buffer element
> containing this indirect descriptor table;
> """
>
> So the indirect descriptors in the table are read-only by
> the device. And the only descriptor which is writeable by
> the device is the descriptor in the main virtqueue (with
> Flags bit VIRTQ_DESC_F_INDIRECT on). So if we ignore the
> `len` in this descriptor, we won't be able to get the
> length of the data written by the device.
>
> So I think the `len` in this descriptor will carry the
> length of the data written by the device (if the buffers
> are writable to the device) even if the VIRTQ_DESC_F_WRITE
> isn't set by the device. How do you think?

Yes, I think so, but we'd better get clarification from Michael.

>
>
>> The reason is we don't touch descriptor ring in the case of split, so
>> BUG_ON()s may help there.
>>
>>> +
>>> +		for (j = 0; j < len / sizeof(struct vring_packed_desc); j++)
>>> +			vring_unmap_one_packed(vq, &desc[j]);
>>> +
>>> +		kfree(desc);
>>> +		vq->desc_state[head].indir_desc = NULL;
>>> +	} else if (ctx) {
>>> +		*ctx = vq->desc_state[head].indir_desc;
>>> +	}
>>> +
>>> +out:
>>> +	return vq->desc_state[head].num;
>>> +}
>>> +
>>> +static inline bool more_used_split(const struct vring_virtqueue *vq)
>>>    {
>>>    	return vq->last_used_idx != virtio16_to_cpu(vq->vq.vdev, vq->vring.used->idx);
>>>    }
>>> +static inline bool more_used_packed(const struct vring_virtqueue *vq)
>>> +{
>>> +	u16 last_used, flags;
>>> +	bool avail, used;
>>> +
>>> +	if (vq->vq.num_free == vq->vring_packed.num)
>>> +		return false;
>>> +
>>> +	last_used = vq->last_used_idx;
>>> +	flags = virtio16_to_cpu(vq->vq.vdev,
>>> +				vq->vring_packed.desc[last_used].flags);
>>> +	avail = flags & VRING_DESC_F_AVAIL(1);
>>> +	used = flags & VRING_DESC_F_USED(1);
>>> +
>>> +	return avail == used;
>>> +}
>> This looks interesting, spec said:
>>
>> "
>> Thus VIRTQ_DESC_F_AVAIL and VIRTQ_DESC_F_USED bits are different for an
>> available descriptor and
>> equal for a used descriptor.
>> Note that this observation is mostly useful for sanity-checking as these are
>> necessary but not sufficient
>> conditions - for example, all descriptors are zero-initialized. To detect
>> used and available descriptors it is
>> possible for drivers and devices to keep track of the last observed value of
>> VIRTQ_DESC_F_USED/VIRTQ_DESC_F_AVAIL. Other techniques to detect
>> VIRTQ_DESC_F_AVAIL/VIRTQ_DESC_F_USED bit changes
>> might also be possible.
>> "
>>
>> So it looks to me it was not sufficient, looking at the example codes in
>> spec, do we need to track last seen used_wrap_counter here?
> I don't think we have to track used_wrap_counter in
> driver. There was a discussion on this:
>
> https://lists.oasis-open.org/archives/virtio-dev/201802/msg00177.html
>
> And after that, below sentence was added (it's also
> in the above words you quoted):
>
> """
> Other techniques to detect
> VIRTQ_DESC_F_AVAIL/VIRTQ_DESC_F_USED bit changes
> might also be possible.
> """
>
> Best regards,
> Tiwei Bie

I see, the extra condition "if (vq->vq.num_free ==
vq->vring_packed.num)" helps in this case: with a fully free ring
there are no outstanding buffers, so the zero-initialized flags
(where avail == used) can't be mistaken for a used descriptor.

Thanks

>
>> Thanks


* Re: [RFC v2] virtio: support packed ring
  2018-04-24  0:54     ` Jason Wang
@ 2018-04-24  1:05       ` Michael S. Tsirkin
  2018-04-24  1:14         ` Jason Wang
  2018-04-24  1:16         ` Tiwei Bie
  0 siblings, 2 replies; 28+ messages in thread
From: Michael S. Tsirkin @ 2018-04-24  1:05 UTC (permalink / raw)
  To: Jason Wang
  Cc: Tiwei Bie, wexu, virtualization, linux-kernel, netdev, jfreimann

On Tue, Apr 24, 2018 at 08:54:52AM +0800, Jason Wang wrote:
> 
> 
> On 2018年04月23日 17:29, Tiwei Bie wrote:
> > On Mon, Apr 23, 2018 at 01:42:14PM +0800, Jason Wang wrote:
> > > On 2018年04月01日 22:12, Tiwei Bie wrote:
> > > > Hello everyone,
> > > > 
> > > > This RFC implements packed ring support for virtio driver.
> > > > 
> > > > The code was tested with DPDK vhost (testpmd/vhost-PMD) implemented
> > > > by Jens at http://dpdk.org/ml/archives/dev/2018-January/089417.html
> > > > Minor changes are needed for the vhost code, e.g. to kick the guest.
> > > > 
> > > [...]
> > > 
> > > > +
> > > > +	if (vq->indirect) {
> > > > +		u32 len;
> > > > +
> > > > +		desc = vq->desc_state[head].indir_desc;
> > > > +		/* Free the indirect table, if any, now that it's unmapped. */
> > > > +		if (!desc)
> > > > +			goto out;
> > > > +
> > > > +		len = virtio32_to_cpu(vq->vq.vdev,
> > > > +				      vq->vring_packed.desc[head].len);
> > > > +
> > > > +		BUG_ON(!(vq->vring_packed.desc[head].flags &
> > > > +			 cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_INDIRECT)));
> > > It looks to me spec does not force to keep VRING_DESC_F_INDIRECT here. So we
> > > can safely remove this BUG_ON() here.
> > > 
> > > > +		BUG_ON(len == 0 || len % sizeof(struct vring_packed_desc));
> > > Len could be ignored for used descriptor according to the spec, so we need
> > > remove this BUG_ON() too.
> > Yeah, you're right! The BUG_ON() isn't right. I'll remove it.
> > And I think something related to this in the spec isn't very
> > clear currently.
> > 
> > In the spec, there are below words:
> > 
> > https://github.com/oasis-tcs/virtio-spec/blob/d4fec517dfcf/packed-ring.tex#L272
> > """
> > In descriptors with VIRTQ_DESC_F_INDIRECT set VIRTQ_DESC_F_WRITE
> > is reserved and is ignored by the device.
> > """
> > 
> > So when device writes back an used descriptor in this case,
> > device may not set the VIRTQ_DESC_F_WRITE flag as the flag
> > is reserved and should be ignored.
> > 
> > https://github.com/oasis-tcs/virtio-spec/blob/d4fec517dfcf/packed-ring.tex#L170
> > """
> > Element Length is reserved for used descriptors without the
> > VIRTQ_DESC_F_WRITE flag, and is ignored by drivers.
> > """
> > 
> > And this is the way how driver ignores the `len` in an used
> > descriptor.
> > 
> > https://github.com/oasis-tcs/virtio-spec/blob/d4fec517dfcf/packed-ring.tex#L241
> > """
> > To increase ring capacity the driver can store a (read-only
> > by the device) table of indirect descriptors anywhere in memory,
> > and insert a descriptor in the main virtqueue (with \field{Flags}
> > bit VIRTQ_DESC_F_INDIRECT on) that refers to a buffer element
> > containing this indirect descriptor table;
> > """
> > 
> > So the indirect descriptors in the table are read-only by
> > the device. And the only descriptor which is writeable by
> > the device is the descriptor in the main virtqueue (with
> > Flags bit VIRTQ_DESC_F_INDIRECT on). So if we ignore the
> > `len` in this descriptor, we won't be able to get the
> > length of the data written by the device.
> > 
> > So I think the `len` in this descriptor will carry the
> > length of the data written by the device (if the buffers
> > are writable to the device) even if the VIRTQ_DESC_F_WRITE
> > isn't set by the device. How do you think?
> 
> Yes I think so. But we'd better need clarification from Michael.

I think if you use a descriptor, and you want to supply len
to the guest, you set VIRTQ_DESC_F_WRITE in the used descriptor.
The spec also says you must not set VIRTQ_DESC_F_INDIRECT then.
If that's a problem we could look at relaxing that last requirement -
does the driver want INDIRECT in the used descriptor to match
the value in the avail descriptor for some reason?

> > 
> > 
> > > The reason is we don't touch descriptor ring in the case of split, so
> > > BUG_ON()s may help there.
> > > 
> > > > +
> > > > +		for (j = 0; j < len / sizeof(struct vring_packed_desc); j++)
> > > > +			vring_unmap_one_packed(vq, &desc[j]);
> > > > +
> > > > +		kfree(desc);
> > > > +		vq->desc_state[head].indir_desc = NULL;
> > > > +	} else if (ctx) {
> > > > +		*ctx = vq->desc_state[head].indir_desc;
> > > > +	}
> > > > +
> > > > +out:
> > > > +	return vq->desc_state[head].num;
> > > > +}
> > > > +
> > > > +static inline bool more_used_split(const struct vring_virtqueue *vq)
> > > >    {
> > > >    	return vq->last_used_idx != virtio16_to_cpu(vq->vq.vdev, vq->vring.used->idx);
> > > >    }
> > > > +static inline bool more_used_packed(const struct vring_virtqueue *vq)
> > > > +{
> > > > +	u16 last_used, flags;
> > > > +	bool avail, used;
> > > > +
> > > > +	if (vq->vq.num_free == vq->vring_packed.num)
> > > > +		return false;
> > > > +
> > > > +	last_used = vq->last_used_idx;
> > > > +	flags = virtio16_to_cpu(vq->vq.vdev,
> > > > +				vq->vring_packed.desc[last_used].flags);
> > > > +	avail = flags & VRING_DESC_F_AVAIL(1);
> > > > +	used = flags & VRING_DESC_F_USED(1);
> > > > +
> > > > +	return avail == used;
> > > > +}
> > > This looks interesting, spec said:
> > > 
> > > "
> > > Thus VIRTQ_DESC_F_AVAIL and VIRTQ_DESC_F_USED bits are different for an
> > > available descriptor and
> > > equal for a used descriptor.
> > > Note that this observation is mostly useful for sanity-checking as these are
> > > necessary but not sufficient
> > > conditions - for example, all descriptors are zero-initialized. To detect
> > > used and available descriptors it is
> > > possible for drivers and devices to keep track of the last observed value of
> > > VIRTQ_DESC_F_USED/VIRTQ_DESC_F_AVAIL. Other techniques to detect
> > > VIRTQ_DESC_F_AVAIL/VIRTQ_DESC_F_USED bit changes
> > > might also be possible.
> > > "
> > > 
> > > So it looks to me it was not sufficient, looking at the example codes in
> > > spec, do we need to track last seen used_wrap_counter here?
> > I don't think we have to track used_wrap_counter in
> > driver. There was a discussion on this:
> > 
> > https://lists.oasis-open.org/archives/virtio-dev/201802/msg00177.html
> > 
> > And after that, below sentence was added (it's also
> > in the above words you quoted):
> > 
> > """
> > Other techniques to detect
> > VIRTQ_DESC_F_AVAIL/VIRTQ_DESC_F_USED bit changes
> > might also be possible.
> > """
> > 
> > Best regards,
> > Tiwei Bie
> 
> I see, the extra condition "if (vq->vq.num_free == vq->vring_packed.num)"
> help in this case.
> 
> Thanks

I still think tracking a wrap counter is better.

> > 
> > > Thanks


* Re: [RFC v2] virtio: support packed ring
  2018-04-24  1:05       ` Michael S. Tsirkin
@ 2018-04-24  1:14         ` Jason Wang
  2018-04-24  1:16         ` Tiwei Bie
  1 sibling, 0 replies; 28+ messages in thread
From: Jason Wang @ 2018-04-24  1:14 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Tiwei Bie, wexu, virtualization, linux-kernel, netdev, jfreimann



On 2018年04月24日 09:05, Michael S. Tsirkin wrote:
>>>>> +	if (vq->indirect) {
>>>>> +		u32 len;
>>>>> +
>>>>> +		desc = vq->desc_state[head].indir_desc;
>>>>> +		/* Free the indirect table, if any, now that it's unmapped. */
>>>>> +		if (!desc)
>>>>> +			goto out;
>>>>> +
>>>>> +		len = virtio32_to_cpu(vq->vq.vdev,
>>>>> +				      vq->vring_packed.desc[head].len);
>>>>> +
>>>>> +		BUG_ON(!(vq->vring_packed.desc[head].flags &
>>>>> +			 cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_INDIRECT)));
>>>> It looks to me spec does not force to keep VRING_DESC_F_INDIRECT here. So we
>>>> can safely remove this BUG_ON() here.
>>>>
>>>>> +		BUG_ON(len == 0 || len % sizeof(struct vring_packed_desc));
>>>> Len could be ignored for used descriptor according to the spec, so we need
>>>> remove this BUG_ON() too.
>>> Yeah, you're right! The BUG_ON() isn't right. I'll remove it.
>>> And I think something related to this in the spec isn't very
>>> clear currently.
>>>
>>> In the spec, there are below words:
>>>
>>> https://github.com/oasis-tcs/virtio-spec/blob/d4fec517dfcf/packed-ring.tex#L272
>>> """
>>> In descriptors with VIRTQ_DESC_F_INDIRECT set VIRTQ_DESC_F_WRITE
>>> is reserved and is ignored by the device.
>>> """
>>>
>>> So when device writes back an used descriptor in this case,
>>> device may not set the VIRTQ_DESC_F_WRITE flag as the flag
>>> is reserved and should be ignored.
>>>
>>> https://github.com/oasis-tcs/virtio-spec/blob/d4fec517dfcf/packed-ring.tex#L170
>>> """
>>> Element Length is reserved for used descriptors without the
>>> VIRTQ_DESC_F_WRITE flag, and is ignored by drivers.
>>> """
>>>
>>> And this is the way how driver ignores the `len` in an used
>>> descriptor.
>>>
>>> https://github.com/oasis-tcs/virtio-spec/blob/d4fec517dfcf/packed-ring.tex#L241
>>> """
>>> To increase ring capacity the driver can store a (read-only
>>> by the device) table of indirect descriptors anywhere in memory,
>>> and insert a descriptor in the main virtqueue (with \field{Flags}
>>> bit VIRTQ_DESC_F_INDIRECT on) that refers to a buffer element
>>> containing this indirect descriptor table;
>>> """
>>>
>>> So the indirect descriptors in the table are read-only by
>>> the device. And the only descriptor which is writeable by
>>> the device is the descriptor in the main virtqueue (with
>>> Flags bit VIRTQ_DESC_F_INDIRECT on). So if we ignore the
>>> `len` in this descriptor, we won't be able to get the
>>> length of the data written by the device.
>>>
>>> So I think the `len` in this descriptor will carry the
>>> length of the data written by the device (if the buffers
>>> are writable to the device) even if the VIRTQ_DESC_F_WRITE
>>> isn't set by the device. How do you think?
>> Yes I think so. But we'd better need clarification from Michael.
> I think if you use a descriptor, and you want to supply len
> to guest, you set VIRTQ_DESC_F_WRITE in the used descriptor.
> Spec also says you must not set VIRTQ_DESC_F_INDIRECT then.
> If that's a problem we could look at relaxing that last requirement -
> does driver want INDIRECT in used descriptor to match
> the value in the avail descriptor for some reason?
>

Looks not. So what I get is:

- the device sets the VIRTQ_DESC_F_WRITE flag for a used descriptor when needed
- there is no need to keep the INDIRECT flag in the used descriptor

So for the above case, we can just have a used descriptor with _F_WRITE
but without the INDIRECT flag.
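
On the device side that would look roughly like this (an illustration
only, not from the patch -- vdev, head, bytes_written and
used_wrap_counter are placeholders):

	/* Write back one used descriptor: fill id/len first, flags last. */
	desc->id = cpu_to_virtio16(vdev, head);
	desc->len = cpu_to_virtio32(vdev, bytes_written);
	flags = VRING_DESC_F_WRITE;	/* len is valid for the driver */
	/* VRING_DESC_F_INDIRECT is deliberately not carried over. */
	if (used_wrap_counter)
		flags |= VRING_DESC_F_AVAIL(1) | VRING_DESC_F_USED(1);
	smp_wmb();			/* make id/len visible before flags */
	desc->flags = cpu_to_virtio16(vdev, flags);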

Thanks


* Re: [RFC v2] virtio: support packed ring
  2018-04-24  1:05       ` Michael S. Tsirkin
  2018-04-24  1:14         ` Jason Wang
@ 2018-04-24  1:16         ` Tiwei Bie
  2018-04-24  1:29           ` Michael S. Tsirkin
  1 sibling, 1 reply; 28+ messages in thread
From: Tiwei Bie @ 2018-04-24  1:16 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Jason Wang, wexu, virtualization, linux-kernel, netdev, jfreimann

On Tue, Apr 24, 2018 at 04:05:07AM +0300, Michael S. Tsirkin wrote:
> On Tue, Apr 24, 2018 at 08:54:52AM +0800, Jason Wang wrote:
> > 
> > 
> > On 2018年04月23日 17:29, Tiwei Bie wrote:
> > > On Mon, Apr 23, 2018 at 01:42:14PM +0800, Jason Wang wrote:
> > > > On 2018年04月01日 22:12, Tiwei Bie wrote:
> > > > > Hello everyone,
> > > > > 
> > > > > This RFC implements packed ring support for virtio driver.
> > > > > 
> > > > > The code was tested with DPDK vhost (testpmd/vhost-PMD) implemented
> > > > > by Jens at http://dpdk.org/ml/archives/dev/2018-January/089417.html
> > > > > Minor changes are needed for the vhost code, e.g. to kick the guest.
> > > > > 
> > > > [...]
> > > > 
> > > > > +
> > > > > +	if (vq->indirect) {
> > > > > +		u32 len;
> > > > > +
> > > > > +		desc = vq->desc_state[head].indir_desc;
> > > > > +		/* Free the indirect table, if any, now that it's unmapped. */
> > > > > +		if (!desc)
> > > > > +			goto out;
> > > > > +
> > > > > +		len = virtio32_to_cpu(vq->vq.vdev,
> > > > > +				      vq->vring_packed.desc[head].len);
> > > > > +
> > > > > +		BUG_ON(!(vq->vring_packed.desc[head].flags &
> > > > > +			 cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_INDIRECT)));
> > > > It looks to me spec does not force to keep VRING_DESC_F_INDIRECT here. So we
> > > > can safely remove this BUG_ON() here.
> > > > 
> > > > > +		BUG_ON(len == 0 || len % sizeof(struct vring_packed_desc));
> > > > Len could be ignored for used descriptor according to the spec, so we need
> > > > remove this BUG_ON() too.
> > > Yeah, you're right! The BUG_ON() isn't right. I'll remove it.
> > > And I think something related to this in the spec isn't very
> > > clear currently.
> > > 
> > > In the spec, there are below words:
> > > 
> > > https://github.com/oasis-tcs/virtio-spec/blob/d4fec517dfcf/packed-ring.tex#L272
> > > """
> > > In descriptors with VIRTQ_DESC_F_INDIRECT set VIRTQ_DESC_F_WRITE
> > > is reserved and is ignored by the device.
> > > """
> > > 
> > > So when device writes back an used descriptor in this case,
> > > device may not set the VIRTQ_DESC_F_WRITE flag as the flag
> > > is reserved and should be ignored.
> > > 
> > > https://github.com/oasis-tcs/virtio-spec/blob/d4fec517dfcf/packed-ring.tex#L170
> > > """
> > > Element Length is reserved for used descriptors without the
> > > VIRTQ_DESC_F_WRITE flag, and is ignored by drivers.
> > > """
> > > 
> > > And this is the way how driver ignores the `len` in an used
> > > descriptor.
> > > 
> > > https://github.com/oasis-tcs/virtio-spec/blob/d4fec517dfcf/packed-ring.tex#L241
> > > """
> > > To increase ring capacity the driver can store a (read-only
> > > by the device) table of indirect descriptors anywhere in memory,
> > > and insert a descriptor in the main virtqueue (with \field{Flags}
> > > bit VIRTQ_DESC_F_INDIRECT on) that refers to a buffer element
> > > containing this indirect descriptor table;
> > > """
> > > 
> > > So the indirect descriptors in the table are read-only by
> > > the device. And the only descriptor which is writeable by
> > > the device is the descriptor in the main virtqueue (with
> > > Flags bit VIRTQ_DESC_F_INDIRECT on). So if we ignore the
> > > `len` in this descriptor, we won't be able to get the
> > > length of the data written by the device.
> > > 
> > > So I think the `len` in this descriptor will carry the
> > > length of the data written by the device (if the buffers
> > > are writable to the device) even if the VIRTQ_DESC_F_WRITE
> > > isn't set by the device. How do you think?
> > 
> > Yes I think so. But we'd better need clarification from Michael.
> 
> I think if you use a descriptor, and you want to supply len
> to guest, you set VIRTQ_DESC_F_WRITE in the used descriptor.
> Spec also says you must not set VIRTQ_DESC_F_INDIRECT then.
> If that's a problem we could look at relaxing that last requirement -
> does driver want INDIRECT in used descriptor to match
> the value in the avail descriptor for some reason?

For indirect, the driver needs some way to get the length
of the data written by the device. And the descriptors
in the indirect table are read-only, so the only place
the device could put this value is the descriptor with the
VIRTQ_DESC_F_INDIRECT flag set.

> 
> > > 
> > > 
> > > > The reason is we don't touch descriptor ring in the case of split, so
> > > > BUG_ON()s may help there.
> > > > 
> > > > > +
> > > > > +		for (j = 0; j < len / sizeof(struct vring_packed_desc); j++)
> > > > > +			vring_unmap_one_packed(vq, &desc[j]);
> > > > > +
> > > > > +		kfree(desc);
> > > > > +		vq->desc_state[head].indir_desc = NULL;
> > > > > +	} else if (ctx) {
> > > > > +		*ctx = vq->desc_state[head].indir_desc;
> > > > > +	}
> > > > > +
> > > > > +out:
> > > > > +	return vq->desc_state[head].num;
> > > > > +}
> > > > > +
> > > > > +static inline bool more_used_split(const struct vring_virtqueue *vq)
> > > > >    {
> > > > >    	return vq->last_used_idx != virtio16_to_cpu(vq->vq.vdev, vq->vring.used->idx);
> > > > >    }
> > > > > +static inline bool more_used_packed(const struct vring_virtqueue *vq)
> > > > > +{
> > > > > +	u16 last_used, flags;
> > > > > +	bool avail, used;
> > > > > +
> > > > > +	if (vq->vq.num_free == vq->vring_packed.num)
> > > > > +		return false;
> > > > > +
> > > > > +	last_used = vq->last_used_idx;
> > > > > +	flags = virtio16_to_cpu(vq->vq.vdev,
> > > > > +				vq->vring_packed.desc[last_used].flags);
> > > > > +	avail = flags & VRING_DESC_F_AVAIL(1);
> > > > > +	used = flags & VRING_DESC_F_USED(1);
> > > > > +
> > > > > +	return avail == used;
> > > > > +}
> > > > This looks interesting, spec said:
> > > > 
> > > > "
> > > > Thus VIRTQ_DESC_F_AVAIL and VIRTQ_DESC_F_USED bits are different for an
> > > > available descriptor and
> > > > equal for a used descriptor.
> > > > Note that this observation is mostly useful for sanity-checking as these are
> > > > necessary but not sufficient
> > > > conditions - for example, all descriptors are zero-initialized. To detect
> > > > used and available descriptors it is
> > > > possible for drivers and devices to keep track of the last observed value of
> > > > VIRTQ_DESC_F_USED/VIRTQ_DESC_F_AVAIL. Other techniques to detect
> > > > VIRTQ_DESC_F_AVAIL/VIRTQ_DESC_F_USED bit changes
> > > > might also be possible.
> > > > "
> > > > 
> > > > So it looks to me it was not sufficient, looking at the example codes in
> > > > spec, do we need to track last seen used_wrap_counter here?
> > > I don't think we have to track used_wrap_counter in
> > > driver. There was a discussion on this:
> > > 
> > > https://lists.oasis-open.org/archives/virtio-dev/201802/msg00177.html
> > > 
> > > And after that, below sentence was added (it's also
> > > in the above words you quoted):
> > > 
> > > """
> > > Other techniques to detect
> > > VIRTQ_DESC_F_AVAIL/VIRTQ_DESC_F_USED bit changes
> > > might also be possible.
> > > """
> > > 
> > > Best regards,
> > > Tiwei Bie
> > 
> > I see, the extra condition "if (vq->vq.num_free == vq->vring_packed.num)"
> > help in this case.
> > 
> > Thanks
> 
> I still think tracking a wrap counter is better.

From my understanding, a wrap counter is only needed when
one side wants to update just parts of the status bit(s);
it's something like the "report status" or "write back"
feature [1] in a hardware NIC. And in the driver, all
the status bits must be updated, and that's why I don't
want to track the used wrap counter.

[1] https://github.com/btw616/dpdk-virtio1.1/commit/ca837865bd10

Best regards,
Tiwei Bie

> 
> > > 
> > > > Thanks


* Re: [RFC v2] virtio: support packed ring
  2018-04-24  1:16         ` Tiwei Bie
@ 2018-04-24  1:29           ` Michael S. Tsirkin
  2018-04-24  1:37             ` Tiwei Bie
  0 siblings, 1 reply; 28+ messages in thread
From: Michael S. Tsirkin @ 2018-04-24  1:29 UTC (permalink / raw)
  To: Tiwei Bie
  Cc: Jason Wang, wexu, virtualization, linux-kernel, netdev, jfreimann

On Tue, Apr 24, 2018 at 09:16:38AM +0800, Tiwei Bie wrote:
> On Tue, Apr 24, 2018 at 04:05:07AM +0300, Michael S. Tsirkin wrote:
> > On Tue, Apr 24, 2018 at 08:54:52AM +0800, Jason Wang wrote:
> > > 
> > > 
> > > On 2018年04月23日 17:29, Tiwei Bie wrote:
> > > > On Mon, Apr 23, 2018 at 01:42:14PM +0800, Jason Wang wrote:
> > > > > On 2018年04月01日 22:12, Tiwei Bie wrote:
> > > > > > Hello everyone,
> > > > > > 
> > > > > > This RFC implements packed ring support for virtio driver.
> > > > > > 
> > > > > > The code was tested with DPDK vhost (testpmd/vhost-PMD) implemented
> > > > > > by Jens at http://dpdk.org/ml/archives/dev/2018-January/089417.html
> > > > > > Minor changes are needed for the vhost code, e.g. to kick the guest.
> > > > > > 
> > > > > > TODO:
> > > > > > - Refinements and bug fixes;
> > > > > > - Split into small patches;
> > > > > > - Test indirect descriptor support;
> > > > > > - Test/fix event suppression support;
> > > > > > - Test devices other than net;
> > > > > > 
> > > > > > RFC v1 -> RFC v2:
> > > > > > - Add indirect descriptor support - compile test only;
> > > > > > - Add event suppression supprt - compile test only;
> > > > > > - Move vring_packed_init() out of uapi (Jason, MST);
> > > > > > - Merge two loops into one in virtqueue_add_packed() (Jason);
> > > > > > - Split vring_unmap_one() for packed ring and split ring (Jason);
> > > > > > - Avoid using '%' operator (Jason);
> > > > > > - Rename free_head -> next_avail_idx (Jason);
> > > > > > - Add comments for virtio_wmb() in virtqueue_add_packed() (Jason);
> > > > > > - Some other refinements and bug fixes;
> > > > > > 
> > > > > > Thanks!
> > > > > > 
> > > > > > Signed-off-by: Tiwei Bie <tiwei.bie@intel.com>
> > > > > > ---
> > > > > >    drivers/virtio/virtio_ring.c       | 1094 +++++++++++++++++++++++++++++-------
> > > > > >    include/linux/virtio_ring.h        |    8 +-
> > > > > >    include/uapi/linux/virtio_config.h |   12 +-
> > > > > >    include/uapi/linux/virtio_ring.h   |   61 ++
> > > > > >    4 files changed, 980 insertions(+), 195 deletions(-)
> > > > > > 
> > > > > > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > > > > > index 71458f493cf8..0515dca34d77 100644
> > > > > > --- a/drivers/virtio/virtio_ring.c
> > > > > > +++ b/drivers/virtio/virtio_ring.c
> > > > > > @@ -58,14 +58,15 @@
> > > > > [...]
> > > > > 
> > > > > > +
> > > > > > +	if (vq->indirect) {
> > > > > > +		u32 len;
> > > > > > +
> > > > > > +		desc = vq->desc_state[head].indir_desc;
> > > > > > +		/* Free the indirect table, if any, now that it's unmapped. */
> > > > > > +		if (!desc)
> > > > > > +			goto out;
> > > > > > +
> > > > > > +		len = virtio32_to_cpu(vq->vq.vdev,
> > > > > > +				      vq->vring_packed.desc[head].len);
> > > > > > +
> > > > > > +		BUG_ON(!(vq->vring_packed.desc[head].flags &
> > > > > > +			 cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_INDIRECT)));
> > > > > It looks to me the spec does not force keeping VRING_DESC_F_INDIRECT
> > > > > here, so we can safely remove this BUG_ON().
> > > > > 
> > > > > > +		BUG_ON(len == 0 || len % sizeof(struct vring_packed_desc));
> > > > > Len could be ignored for a used descriptor according to the spec, so
> > > > > we need to remove this BUG_ON() too.
> > > > Yeah, you're right! The BUG_ON() isn't right. I'll remove it.
> > > > And I think something related to this in the spec isn't very
> > > > clear currently.
> > > > 
> > > > In the spec, there are the following words:
> > > > 
> > > > https://github.com/oasis-tcs/virtio-spec/blob/d4fec517dfcf/packed-ring.tex#L272
> > > > """
> > > > In descriptors with VIRTQ_DESC_F_INDIRECT set VIRTQ_DESC_F_WRITE
> > > > is reserved and is ignored by the device.
> > > > """
> > > > 
> > > > So when the device writes back a used descriptor in this case,
> > > > it may not set the VIRTQ_DESC_F_WRITE flag, as the flag is
> > > > reserved and should be ignored.
> > > > 
> > > > https://github.com/oasis-tcs/virtio-spec/blob/d4fec517dfcf/packed-ring.tex#L170
> > > > """
> > > > Element Length is reserved for used descriptors without the
> > > > VIRTQ_DESC_F_WRITE flag, and is ignored by drivers.
> > > > """
> > > > 
> > > > And this is how the driver ignores the `len` in a used
> > > > descriptor.
> > > > 
> > > > https://github.com/oasis-tcs/virtio-spec/blob/d4fec517dfcf/packed-ring.tex#L241
> > > > """
> > > > To increase ring capacity the driver can store a (read-only
> > > > by the device) table of indirect descriptors anywhere in memory,
> > > > and insert a descriptor in the main virtqueue (with \field{Flags}
> > > > bit VIRTQ_DESC_F_INDIRECT on) that refers to a buffer element
> > > > containing this indirect descriptor table;
> > > > """
> > > > 
> > > > So the indirect descriptors in the table are read-only by
> > > > the device. And the only descriptor which is writeable by
> > > > the device is the descriptor in the main virtqueue (with
> > > > Flags bit VIRTQ_DESC_F_INDIRECT on). So if we ignore the
> > > > `len` in this descriptor, we won't be able to get the
> > > > length of the data written by the device.
> > > > 
> > > > So I think the `len` in this descriptor will carry the
> > > > length of the data written by the device (if the buffers
> > > > are writable to the device) even if the VIRTQ_DESC_F_WRITE
> > > > isn't set by the device. What do you think?
> > > 
> > > Yes, I think so. But we'd better get clarification from Michael.
> > 
> > I think if you use a descriptor, and you want to supply len
> > to guest, you set VIRTQ_DESC_F_WRITE in the used descriptor.
> > Spec also says you must not set VIRTQ_DESC_F_INDIRECT then.
> > If that's a problem we could look at relaxing that last requirement -
> > does driver want INDIRECT in used descriptor to match
> > the value in the avail descriptor for some reason?
> 
> For indirect, the driver needs some way to get the length
> of the data written by the device. And the descriptors
> in the indirect table are read-only, so the only place
> the device could put this value is the descriptor with the
> VIRTQ_DESC_F_INDIRECT flag set.

When writing out a used descriptor, the device should set VIRTQ_DESC_F_WRITE
(and clear VIRTQ_DESC_F_INDIRECT).
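
As a sketch of what this rule looks like on the device side
(hypothetical stand-alone C, not taken from any existing vhost
implementation; flag values follow the packed ring spec draft):

#include <stdbool.h>
#include <stdint.h>

#define DESC_F_WRITE    (1u << 1)       /* len field is valid */
#define DESC_F_INDIRECT (1u << 2)
#define DESC_F_AVAIL    (1u << 7)
#define DESC_F_USED     (1u << 15)

struct packed_desc {
        uint64_t addr;
        uint32_t len;
        uint16_t id;
        uint16_t flags;
};

/*
 * Device-side write-back of one used descriptor: report the written
 * length with DESC_F_WRITE set, do not carry DESC_F_INDIRECT over
 * from the available descriptor, and make AVAIL == USED (both
 * tracking the device's wrap counter) to mark the descriptor used.
 * A real device must order the id/len stores before the flags store
 * so the driver never observes a half-written descriptor.
 */
static void write_used_desc(struct packed_desc *d, uint16_t buffer_id,
                            uint32_t written_len, bool wrap_counter)
{
        uint16_t flags = DESC_F_WRITE;

        if (wrap_counter)
                flags |= DESC_F_AVAIL | DESC_F_USED;

        d->id = buffer_id;
        d->len = written_len;
        d->flags = flags;       /* write barrier needed here in real code */
}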



* Re: [RFC v2] virtio: support packed ring
  2018-04-24  1:29           ` Michael S. Tsirkin
@ 2018-04-24  1:37             ` Tiwei Bie
  2018-04-24  1:43               ` Michael S. Tsirkin
  0 siblings, 1 reply; 28+ messages in thread
From: Tiwei Bie @ 2018-04-24  1:37 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Jason Wang, wexu, virtualization, linux-kernel, netdev, jfreimann

On Tue, Apr 24, 2018 at 04:29:51AM +0300, Michael S. Tsirkin wrote:
> On Tue, Apr 24, 2018 at 09:16:38AM +0800, Tiwei Bie wrote:
> > On Tue, Apr 24, 2018 at 04:05:07AM +0300, Michael S. Tsirkin wrote:
> > > On Tue, Apr 24, 2018 at 08:54:52AM +0800, Jason Wang wrote:
> > > > 
> > > > 
> > > > On 2018年04月23日 17:29, Tiwei Bie wrote:
> > > > > On Mon, Apr 23, 2018 at 01:42:14PM +0800, Jason Wang wrote:
> > > > > > On 2018年04月01日 22:12, Tiwei Bie wrote:
> > > > [...]
> > > 
> > > I think if you use a descriptor, and you want to supply len
> > > to guest, you set VIRTQ_DESC_F_WRITE in the used descriptor.
> > > Spec also says you must not set VIRTQ_DESC_F_INDIRECT then.
> > > If that's a problem we could look at relaxing that last requirement -
> > > does driver want INDIRECT in used descriptor to match
> > > the value in the avail descriptor for some reason?
> > 
> > For indirect, the driver needs some way to get the length
> > of the data written by the device. And the descriptors
> > in the indirect table are read-only, so the only place
> > the device could put this value is the descriptor with the
> > VIRTQ_DESC_F_INDIRECT flag set.
> 
> When writing out a used descriptor, the device should set VIRTQ_DESC_F_WRITE
> (and clear VIRTQ_DESC_F_INDIRECT).

So the spec allows the device to set the VIRTQ_DESC_F_WRITE bit
when writing out a used descriptor even if the corresponding
descriptors it just parsed don't have the VIRTQ_DESC_F_WRITE
bit set?

Best regards,
Tiwei Bie


* Re: [RFC v2] virtio: support packed ring
  2018-04-24  1:37             ` Tiwei Bie
@ 2018-04-24  1:43               ` Michael S. Tsirkin
  2018-04-24  1:49                 ` Tiwei Bie
  0 siblings, 1 reply; 28+ messages in thread
From: Michael S. Tsirkin @ 2018-04-24  1:43 UTC (permalink / raw)
  To: Tiwei Bie
  Cc: Jason Wang, wexu, virtualization, linux-kernel, netdev, jfreimann

On Tue, Apr 24, 2018 at 09:37:47AM +0800, Tiwei Bie wrote:
> On Tue, Apr 24, 2018 at 04:29:51AM +0300, Michael S. Tsirkin wrote:
> > On Tue, Apr 24, 2018 at 09:16:38AM +0800, Tiwei Bie wrote:
> > > On Tue, Apr 24, 2018 at 04:05:07AM +0300, Michael S. Tsirkin wrote:
> > > > On Tue, Apr 24, 2018 at 08:54:52AM +0800, Jason Wang wrote:
> > > > > 
> > > > > 
> > > > > On 2018年04月23日 17:29, Tiwei Bie wrote:
> > > > > > On Mon, Apr 23, 2018 at 01:42:14PM +0800, Jason Wang wrote:
> > > > > > > On 2018年04月01日 22:12, Tiwei Bie wrote:
> > > > [...]
> > > 
> > > For indirect, the driver needs some way to get the length
> > > of the data written by the device. And the descriptors
> > > in the indirect table are read-only, so the only place
> > > the device could put this value is the descriptor with the
> > > VIRTQ_DESC_F_INDIRECT flag set.
> > 
> > When writing out a used descriptor, the device should set VIRTQ_DESC_F_WRITE
> > (and clear VIRTQ_DESC_F_INDIRECT).
> 
> So the spec allows the device to set the VIRTQ_DESC_F_WRITE bit
> when writing out a used descriptor even if the corresponding
> descriptors it just parsed don't have the VIRTQ_DESC_F_WRITE
> bit set?
> 
> Best regards,
> Tiwei Bie

I think so. In a used descriptor, VIRTQ_DESC_F_WRITE just means the length
is valid.
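
On the driver side this boils down to trusting `len` only when
VIRTQ_DESC_F_WRITE is set -- which also covers the indirect case
debated above, since the written length always comes from the used
descriptor in the main ring. A hypothetical helper (not from the
patch):

#include <stdint.h>

#define DESC_F_WRITE (1u << 1)

struct packed_desc {
        uint64_t addr;
        uint32_t len;
        uint16_t id;
        uint16_t flags;
};

/* In a used descriptor, DESC_F_WRITE means len is valid; otherwise
 * the driver must ignore len and treat it as zero written bytes. */
static uint32_t used_desc_len(const struct packed_desc *d)
{
        return (d->flags & DESC_F_WRITE) ? d->len : 0;
}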

-- 
MST


* Re: [RFC v2] virtio: support packed ring
  2018-04-24  1:43               ` Michael S. Tsirkin
@ 2018-04-24  1:49                 ` Tiwei Bie
  0 siblings, 0 replies; 28+ messages in thread
From: Tiwei Bie @ 2018-04-24  1:49 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Jason Wang, wexu, virtualization, linux-kernel, netdev, jfreimann

On Tue, Apr 24, 2018 at 04:43:22AM +0300, Michael S. Tsirkin wrote:
> On Tue, Apr 24, 2018 at 09:37:47AM +0800, Tiwei Bie wrote:
> > On Tue, Apr 24, 2018 at 04:29:51AM +0300, Michael S. Tsirkin wrote:
> > > On Tue, Apr 24, 2018 at 09:16:38AM +0800, Tiwei Bie wrote:
> > > > On Tue, Apr 24, 2018 at 04:05:07AM +0300, Michael S. Tsirkin wrote:
> > > > > On Tue, Apr 24, 2018 at 08:54:52AM +0800, Jason Wang wrote:
> > > > > > 
> > > > > > 
> > > > > > On 2018年04月23日 17:29, Tiwei Bie wrote:
> > > > > > > On Mon, Apr 23, 2018 at 01:42:14PM +0800, Jason Wang wrote:
> > > > > > > > On 2018年04月01日 22:12, Tiwei Bie wrote:
> > > > [...]
> > > 
> > > When writing out a used descriptor, the device should set VIRTQ_DESC_F_WRITE
> > > (and clear VIRTQ_DESC_F_INDIRECT).
> > 
> > So the spec allows the device to set the VIRTQ_DESC_F_WRITE bit
> > when writing out a used descriptor even if the corresponding
> > descriptors it just parsed don't have the VIRTQ_DESC_F_WRITE
> > bit set?
> > 
> > Best regards,
> > Tiwei Bie
> 
> I think so. In a used descriptor, VIRTQ_DESC_F_WRITE just means the length
> is valid.

Got it. It's very neat. Thanks! :)

Best regards,
Tiwei Bie




Thread overview: 28+ messages
2018-04-01 14:12 [RFC v2] virtio: support packed ring Tiwei Bie
2018-04-10  2:55 ` Jason Wang
2018-04-10  3:21   ` Tiwei Bie
2018-04-13  4:30 ` Jason Wang
2018-04-13  7:15   ` Tiwei Bie
2018-04-17  2:11     ` Jason Wang
2018-04-17  2:17       ` Michael S. Tsirkin
2018-04-17  2:24         ` Jason Wang
2018-04-17  2:37           ` Michael S. Tsirkin
2018-04-17  2:51       ` Tiwei Bie
2018-04-17 12:17         ` Michael S. Tsirkin
2018-04-17 12:47           ` Tiwei Bie
2018-04-17 14:04             ` Michael S. Tsirkin
2018-04-17 14:56               ` Tiwei Bie
2018-04-17 15:54                 ` Michael S. Tsirkin
2018-04-18  1:17                   ` Tiwei Bie
2018-04-13 15:22 ` Michael S. Tsirkin
2018-04-14 11:22   ` Tiwei Bie
2018-04-23  5:42 ` Jason Wang
2018-04-23  9:29   ` Tiwei Bie
2018-04-24  0:54     ` Jason Wang
2018-04-24  1:05       ` Michael S. Tsirkin
2018-04-24  1:14         ` Jason Wang
2018-04-24  1:16         ` Tiwei Bie
2018-04-24  1:29           ` Michael S. Tsirkin
2018-04-24  1:37             ` Tiwei Bie
2018-04-24  1:43               ` Michael S. Tsirkin
2018-04-24  1:49                 ` Tiwei Bie
