* [PATCH RFC v6 01/11] vhost: option to fetch descriptors through an independent struct
2020-06-08 12:52 [PATCH RFC v6 00/11] vhost: ring format independence Michael S. Tsirkin
@ 2020-06-08 12:52 ` Michael S. Tsirkin
2020-06-08 12:52 ` [PATCH RFC v6 02/11] vhost: use batched get_vq_desc version Michael S. Tsirkin
` (10 subsequent siblings)
11 siblings, 0 replies; 20+ messages in thread
From: Michael S. Tsirkin @ 2020-06-08 12:52 UTC (permalink / raw)
To: linux-kernel; +Cc: kvm, virtualization, netdev, Jason Wang, eperezma
The idea is to support multiple ring formats by converting
to a format-independent array of descriptors.
This costs extra cycles, but we gain in ability
to fetch a batch of descriptors in one go, which
is good for code cache locality.
When used, this causes a minor performance degradation,
it's been kept as simple as possible for ease of review.
A follow-up patch gets us back the performance by adding batching.
To simplify benchmarking, I kept the old code around so one can switch
back and forth between old and new code. This will go away in the final
submission.
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
Link: https://lore.kernel.org/r/20200401183118.8334-2-eperezma@redhat.com
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
drivers/vhost/vhost.c | 305 +++++++++++++++++++++++++++++++++++++++++-
drivers/vhost/vhost.h | 16 +++
2 files changed, 320 insertions(+), 1 deletion(-)
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 172da092107e..180b7b58c76b 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -303,6 +303,7 @@ static void vhost_vq_reset(struct vhost_dev *dev,
struct vhost_virtqueue *vq)
{
vq->num = 1;
+ vq->ndescs = 0;
vq->desc = NULL;
vq->avail = NULL;
vq->used = NULL;
@@ -373,6 +374,9 @@ static int vhost_worker(void *data)
static void vhost_vq_free_iovecs(struct vhost_virtqueue *vq)
{
+ kfree(vq->descs);
+ vq->descs = NULL;
+ vq->max_descs = 0;
kfree(vq->indirect);
vq->indirect = NULL;
kfree(vq->log);
@@ -389,6 +393,10 @@ static long vhost_dev_alloc_iovecs(struct vhost_dev *dev)
for (i = 0; i < dev->nvqs; ++i) {
vq = dev->vqs[i];
+ vq->max_descs = dev->iov_limit;
+ vq->descs = kmalloc_array(vq->max_descs,
+ sizeof(*vq->descs),
+ GFP_KERNEL);
vq->indirect = kmalloc_array(UIO_MAXIOV,
sizeof(*vq->indirect),
GFP_KERNEL);
@@ -396,7 +404,7 @@ static long vhost_dev_alloc_iovecs(struct vhost_dev *dev)
GFP_KERNEL);
vq->heads = kmalloc_array(dev->iov_limit, sizeof(*vq->heads),
GFP_KERNEL);
- if (!vq->indirect || !vq->log || !vq->heads)
+ if (!vq->indirect || !vq->log || !vq->heads || !vq->descs)
goto err_nomem;
}
return 0;
@@ -488,6 +496,8 @@ void vhost_dev_init(struct vhost_dev *dev,
for (i = 0; i < dev->nvqs; ++i) {
vq = dev->vqs[i];
+ vq->descs = NULL;
+ vq->max_descs = 0;
vq->log = NULL;
vq->indirect = NULL;
vq->heads = NULL;
@@ -2315,6 +2325,299 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
}
EXPORT_SYMBOL_GPL(vhost_get_vq_desc);
+static struct vhost_desc *peek_split_desc(struct vhost_virtqueue *vq)
+{
+ BUG_ON(!vq->ndescs);
+ return &vq->descs[vq->ndescs - 1];
+}
+
+static void pop_split_desc(struct vhost_virtqueue *vq)
+{
+ BUG_ON(!vq->ndescs);
+ --vq->ndescs;
+}
+
+#define VHOST_DESC_FLAGS (VRING_DESC_F_INDIRECT | VRING_DESC_F_WRITE | \
+ VRING_DESC_F_NEXT)
+static int push_split_desc(struct vhost_virtqueue *vq, struct vring_desc *desc, u16 id)
+{
+ struct vhost_desc *h;
+
+ if (unlikely(vq->ndescs >= vq->max_descs))
+ return -EINVAL;
+ h = &vq->descs[vq->ndescs++];
+ h->addr = vhost64_to_cpu(vq, desc->addr);
+ h->len = vhost32_to_cpu(vq, desc->len);
+ h->flags = vhost16_to_cpu(vq, desc->flags) & VHOST_DESC_FLAGS;
+ h->id = id;
+
+ return 0;
+}
+
+static int fetch_indirect_descs(struct vhost_virtqueue *vq,
+ struct vhost_desc *indirect,
+ u16 head)
+{
+ struct vring_desc desc;
+ unsigned int i = 0, count, found = 0;
+ u32 len = indirect->len;
+ struct iov_iter from;
+ int ret;
+
+ /* Sanity check */
+ if (unlikely(len % sizeof desc)) {
+ vq_err(vq, "Invalid length in indirect descriptor: "
+ "len 0x%llx not multiple of 0x%zx\n",
+ (unsigned long long)len,
+ sizeof desc);
+ return -EINVAL;
+ }
+
+ ret = translate_desc(vq, indirect->addr, len, vq->indirect,
+ UIO_MAXIOV, VHOST_ACCESS_RO);
+ if (unlikely(ret < 0)) {
+ if (ret != -EAGAIN)
+ vq_err(vq, "Translation failure %d in indirect.\n", ret);
+ return ret;
+ }
+ iov_iter_init(&from, READ, vq->indirect, ret, len);
+
+ /* We will use the result as an address to read from, so most
+ * architectures only need a compiler barrier here. */
+ read_barrier_depends();
+
+ count = len / sizeof desc;
+ /* Buffers are chained via a 16 bit next field, so
+ * we can have at most 2^16 of these. */
+ if (unlikely(count > USHRT_MAX + 1)) {
+ vq_err(vq, "Indirect buffer length too big: %d\n",
+ indirect->len);
+ return -E2BIG;
+ }
+ if (unlikely(vq->ndescs + count > vq->max_descs)) {
+ vq_err(vq, "Too many indirect + direct descs: %d + %d\n",
+ vq->ndescs, indirect->len);
+ return -E2BIG;
+ }
+
+ do {
+ if (unlikely(++found > count)) {
+ vq_err(vq, "Loop detected: last one at %u "
+ "indirect size %u\n",
+ i, count);
+ return -EINVAL;
+ }
+ if (unlikely(!copy_from_iter_full(&desc, sizeof(desc), &from))) {
+ vq_err(vq, "Failed indirect descriptor: idx %d, %zx\n",
+ i, (size_t)indirect->addr + i * sizeof desc);
+ return -EINVAL;
+ }
+ if (unlikely(desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_INDIRECT))) {
+ vq_err(vq, "Nested indirect descriptor: idx %d, %zx\n",
+ i, (size_t)indirect->addr + i * sizeof desc);
+ return -EINVAL;
+ }
+
+ /* Note: push_split_desc can't fail here:
+ * we never fetch unless there's space. */
+ ret = push_split_desc(vq, &desc, head);
+ WARN_ON(ret);
+ } while ((i = next_desc(vq, &desc)) != -1);
+ return 0;
+}
+
+/* This function returns a value > 0 if a descriptor was found, or 0 if none were found.
+ * A negative code is returned on error. */
+static int fetch_descs(struct vhost_virtqueue *vq)
+{
+ unsigned int i, head, found = 0;
+ struct vhost_desc *last;
+ struct vring_desc desc;
+ __virtio16 avail_idx;
+ __virtio16 ring_head;
+ u16 last_avail_idx;
+ int ret;
+
+ /* Check it isn't doing very strange things with descriptor numbers. */
+ last_avail_idx = vq->last_avail_idx;
+
+ if (vq->avail_idx == vq->last_avail_idx) {
+ if (unlikely(vhost_get_avail_idx(vq, &avail_idx))) {
+ vq_err(vq, "Failed to access avail idx at %p\n",
+ &vq->avail->idx);
+ return -EFAULT;
+ }
+ vq->avail_idx = vhost16_to_cpu(vq, avail_idx);
+
+ if (unlikely((u16)(vq->avail_idx - last_avail_idx) > vq->num)) {
+ vq_err(vq, "Guest moved used index from %u to %u",
+ last_avail_idx, vq->avail_idx);
+ return -EFAULT;
+ }
+
+ /* If there's nothing new since last we looked, return
+ * invalid.
+ */
+ if (vq->avail_idx == last_avail_idx)
+ return 0;
+
+ /* Only get avail ring entries after they have been
+ * exposed by guest.
+ */
+ smp_rmb();
+ }
+
+ /* Grab the next descriptor number they're advertising */
+ if (unlikely(vhost_get_avail_head(vq, &ring_head, last_avail_idx))) {
+ vq_err(vq, "Failed to read head: idx %d address %p\n",
+ last_avail_idx,
+ &vq->avail->ring[last_avail_idx % vq->num]);
+ return -EFAULT;
+ }
+
+ head = vhost16_to_cpu(vq, ring_head);
+
+ /* If their number is silly, that's an error. */
+ if (unlikely(head >= vq->num)) {
+ vq_err(vq, "Guest says index %u > %u is available",
+ head, vq->num);
+ return -EINVAL;
+ }
+
+ i = head;
+ do {
+ if (unlikely(i >= vq->num)) {
+ vq_err(vq, "Desc index is %u > %u, head = %u",
+ i, vq->num, head);
+ return -EINVAL;
+ }
+ if (unlikely(++found > vq->num)) {
+ vq_err(vq, "Loop detected: last one at %u "
+ "vq size %u head %u\n",
+ i, vq->num, head);
+ return -EINVAL;
+ }
+ ret = vhost_get_desc(vq, &desc, i);
+ if (unlikely(ret)) {
+ vq_err(vq, "Failed to get descriptor: idx %d addr %p\n",
+ i, vq->desc + i);
+ return -EFAULT;
+ }
+ ret = push_split_desc(vq, &desc, head);
+ if (unlikely(ret)) {
+ vq_err(vq, "Failed to save descriptor: idx %d\n", i);
+ return -EINVAL;
+ }
+ } while ((i = next_desc(vq, &desc)) != -1);
+
+ last = peek_split_desc(vq);
+ if (unlikely(last->flags & VRING_DESC_F_INDIRECT)) {
+ pop_split_desc(vq);
+ ret = fetch_indirect_descs(vq, last, head);
+ if (unlikely(ret < 0)) {
+ if (ret != -EAGAIN)
+ vq_err(vq, "Failure detected "
+ "in indirect descriptor at idx %d\n", head);
+ return ret;
+ }
+ }
+
+ /* Assume notifications from guest are disabled at this point,
+ * if they aren't we would need to update avail_event index. */
+ BUG_ON(!(vq->used_flags & VRING_USED_F_NO_NOTIFY));
+
+ /* On success, increment avail index. */
+ vq->last_avail_idx++;
+
+ return 1;
+}
+
+/* This looks in the virtqueue and for the first available buffer, and converts
+ * it to an iovec for convenient access. Since descriptors consist of some
+ * number of output then some number of input descriptors, it's actually two
+ * iovecs, but we pack them into one and note how many of each there were.
+ *
+ * This function returns the descriptor number found, or vq->num (which is
+ * never a valid descriptor number) if none was found. A negative code is
+ * returned on error. */
+int vhost_get_vq_desc_batch(struct vhost_virtqueue *vq,
+ struct iovec iov[], unsigned int iov_size,
+ unsigned int *out_num, unsigned int *in_num,
+ struct vhost_log *log, unsigned int *log_num)
+{
+ int ret = fetch_descs(vq);
+ int i;
+
+ if (ret <= 0)
+ goto err_fetch;
+
+ /* Now convert to IOV */
+ /* When we start there are none of either input nor output. */
+ *out_num = *in_num = 0;
+ if (unlikely(log))
+ *log_num = 0;
+
+ for (i = 0; i < vq->ndescs; ++i) {
+ unsigned iov_count = *in_num + *out_num;
+ struct vhost_desc *desc = &vq->descs[i];
+ int access;
+
+ if (desc->flags & ~VHOST_DESC_FLAGS) {
+ vq_err(vq, "Unexpected flags: 0x%x at descriptor id 0x%x\n",
+ desc->flags, desc->id);
+ ret = -EINVAL;
+ goto err;
+ }
+ if (desc->flags & VRING_DESC_F_WRITE)
+ access = VHOST_ACCESS_WO;
+ else
+ access = VHOST_ACCESS_RO;
+ ret = translate_desc(vq, desc->addr,
+ desc->len, iov + iov_count,
+ iov_size - iov_count, access);
+ if (unlikely(ret < 0)) {
+ if (ret != -EAGAIN)
+ vq_err(vq, "Translation failure %d descriptor idx %d\n",
+ ret, i);
+ goto err;
+ }
+ if (access == VHOST_ACCESS_WO) {
+ /* If this is an input descriptor,
+ * increment that count. */
+ *in_num += ret;
+ if (unlikely(log && ret)) {
+ log[*log_num].addr = desc->addr;
+ log[*log_num].len = desc->len;
+ ++*log_num;
+ }
+ } else {
+ /* If it's an output descriptor, they're all supposed
+ * to come before any input descriptors. */
+ if (unlikely(*in_num)) {
+ vq_err(vq, "Descriptor has out after in: "
+ "idx %d\n", i);
+ ret = -EINVAL;
+ goto err;
+ }
+ *out_num += ret;
+ }
+
+ ret = desc->id;
+ }
+
+ vq->ndescs = 0;
+
+ return ret;
+
+err:
+ vhost_discard_vq_desc(vq, 1);
+err_fetch:
+ vq->ndescs = 0;
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(vhost_get_vq_desc_batch);
+
/* Reverse the effect of vhost_get_vq_desc. Useful for error handling. */
void vhost_discard_vq_desc(struct vhost_virtqueue *vq, int n)
{
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index c8e96a095d3b..87089d51490d 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -60,6 +60,13 @@ enum vhost_uaddr_type {
VHOST_NUM_ADDRS = 3,
};
+struct vhost_desc {
+ u64 addr;
+ u32 len;
+ u16 flags; /* VRING_DESC_F_WRITE, VRING_DESC_F_NEXT */
+ u16 id;
+};
+
/* The virtqueue structure describes a queue attached to a device. */
struct vhost_virtqueue {
struct vhost_dev *dev;
@@ -71,6 +78,11 @@ struct vhost_virtqueue {
vring_avail_t __user *avail;
vring_used_t __user *used;
const struct vhost_iotlb_map *meta_iotlb[VHOST_NUM_ADDRS];
+
+ struct vhost_desc *descs;
+ int ndescs;
+ int max_descs;
+
struct file *kick;
struct eventfd_ctx *call_ctx;
struct eventfd_ctx *error_ctx;
@@ -177,6 +189,10 @@ long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *arg
bool vhost_vq_access_ok(struct vhost_virtqueue *vq);
bool vhost_log_access_ok(struct vhost_dev *);
+int vhost_get_vq_desc_batch(struct vhost_virtqueue *,
+ struct iovec iov[], unsigned int iov_count,
+ unsigned int *out_num, unsigned int *in_num,
+ struct vhost_log *log, unsigned int *log_num);
int vhost_get_vq_desc(struct vhost_virtqueue *,
struct iovec iov[], unsigned int iov_count,
unsigned int *out_num, unsigned int *in_num,
--
MST
^ permalink raw reply related [flat|nested] 20+ messages in thread
* [PATCH RFC v6 02/11] vhost: use batched get_vq_desc version
2020-06-08 12:52 [PATCH RFC v6 00/11] vhost: ring format independence Michael S. Tsirkin
2020-06-08 12:52 ` [PATCH RFC v6 01/11] vhost: option to fetch descriptors through an independent struct Michael S. Tsirkin
@ 2020-06-08 12:52 ` Michael S. Tsirkin
2020-06-10 3:14 ` Jason Wang
2020-06-10 11:24 ` Eugenio Perez Martin
2020-06-08 12:52 ` [PATCH RFC v6 03/11] vhost/net: pass net specific struct pointer Michael S. Tsirkin
` (9 subsequent siblings)
11 siblings, 2 replies; 20+ messages in thread
From: Michael S. Tsirkin @ 2020-06-08 12:52 UTC (permalink / raw)
To: linux-kernel; +Cc: kvm, virtualization, netdev, Jason Wang, eperezma
As testing shows no performance change, switch to that now.
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
Link: https://lore.kernel.org/r/20200401183118.8334-3-eperezma@redhat.com
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
drivers/vhost/test.c | 2 +-
drivers/vhost/vhost.c | 318 ++++++++----------------------------------
drivers/vhost/vhost.h | 7 +-
3 files changed, 65 insertions(+), 262 deletions(-)
diff --git a/drivers/vhost/test.c b/drivers/vhost/test.c
index 0466921f4772..7d69778aaa26 100644
--- a/drivers/vhost/test.c
+++ b/drivers/vhost/test.c
@@ -119,7 +119,7 @@ static int vhost_test_open(struct inode *inode, struct file *f)
dev = &n->dev;
vqs[VHOST_TEST_VQ] = &n->vqs[VHOST_TEST_VQ];
n->vqs[VHOST_TEST_VQ].handle_kick = handle_vq_kick;
- vhost_dev_init(dev, vqs, VHOST_TEST_VQ_MAX, UIO_MAXIOV,
+ vhost_dev_init(dev, vqs, VHOST_TEST_VQ_MAX, UIO_MAXIOV + 64,
VHOST_TEST_PKT_WEIGHT, VHOST_TEST_WEIGHT, true, NULL);
f->private_data = n;
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 180b7b58c76b..41d6b132c234 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -304,6 +304,7 @@ static void vhost_vq_reset(struct vhost_dev *dev,
{
vq->num = 1;
vq->ndescs = 0;
+ vq->first_desc = 0;
vq->desc = NULL;
vq->avail = NULL;
vq->used = NULL;
@@ -372,6 +373,11 @@ static int vhost_worker(void *data)
return 0;
}
+static int vhost_vq_num_batch_descs(struct vhost_virtqueue *vq)
+{
+ return vq->max_descs - UIO_MAXIOV;
+}
+
static void vhost_vq_free_iovecs(struct vhost_virtqueue *vq)
{
kfree(vq->descs);
@@ -394,6 +400,9 @@ static long vhost_dev_alloc_iovecs(struct vhost_dev *dev)
for (i = 0; i < dev->nvqs; ++i) {
vq = dev->vqs[i];
vq->max_descs = dev->iov_limit;
+ if (vhost_vq_num_batch_descs(vq) < 0) {
+ return -EINVAL;
+ }
vq->descs = kmalloc_array(vq->max_descs,
sizeof(*vq->descs),
GFP_KERNEL);
@@ -1610,6 +1619,7 @@ long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *arg
vq->last_avail_idx = s.num;
/* Forget the cached index value. */
vq->avail_idx = vq->last_avail_idx;
+ vq->ndescs = vq->first_desc = 0;
break;
case VHOST_GET_VRING_BASE:
s.index = idx;
@@ -2078,253 +2088,6 @@ static unsigned next_desc(struct vhost_virtqueue *vq, struct vring_desc *desc)
return next;
}
-static int get_indirect(struct vhost_virtqueue *vq,
- struct iovec iov[], unsigned int iov_size,
- unsigned int *out_num, unsigned int *in_num,
- struct vhost_log *log, unsigned int *log_num,
- struct vring_desc *indirect)
-{
- struct vring_desc desc;
- unsigned int i = 0, count, found = 0;
- u32 len = vhost32_to_cpu(vq, indirect->len);
- struct iov_iter from;
- int ret, access;
-
- /* Sanity check */
- if (unlikely(len % sizeof desc)) {
- vq_err(vq, "Invalid length in indirect descriptor: "
- "len 0x%llx not multiple of 0x%zx\n",
- (unsigned long long)len,
- sizeof desc);
- return -EINVAL;
- }
-
- ret = translate_desc(vq, vhost64_to_cpu(vq, indirect->addr), len, vq->indirect,
- UIO_MAXIOV, VHOST_ACCESS_RO);
- if (unlikely(ret < 0)) {
- if (ret != -EAGAIN)
- vq_err(vq, "Translation failure %d in indirect.\n", ret);
- return ret;
- }
- iov_iter_init(&from, READ, vq->indirect, ret, len);
-
- /* We will use the result as an address to read from, so most
- * architectures only need a compiler barrier here. */
- read_barrier_depends();
-
- count = len / sizeof desc;
- /* Buffers are chained via a 16 bit next field, so
- * we can have at most 2^16 of these. */
- if (unlikely(count > USHRT_MAX + 1)) {
- vq_err(vq, "Indirect buffer length too big: %d\n",
- indirect->len);
- return -E2BIG;
- }
-
- do {
- unsigned iov_count = *in_num + *out_num;
- if (unlikely(++found > count)) {
- vq_err(vq, "Loop detected: last one at %u "
- "indirect size %u\n",
- i, count);
- return -EINVAL;
- }
- if (unlikely(!copy_from_iter_full(&desc, sizeof(desc), &from))) {
- vq_err(vq, "Failed indirect descriptor: idx %d, %zx\n",
- i, (size_t)vhost64_to_cpu(vq, indirect->addr) + i * sizeof desc);
- return -EINVAL;
- }
- if (unlikely(desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_INDIRECT))) {
- vq_err(vq, "Nested indirect descriptor: idx %d, %zx\n",
- i, (size_t)vhost64_to_cpu(vq, indirect->addr) + i * sizeof desc);
- return -EINVAL;
- }
-
- if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_WRITE))
- access = VHOST_ACCESS_WO;
- else
- access = VHOST_ACCESS_RO;
-
- ret = translate_desc(vq, vhost64_to_cpu(vq, desc.addr),
- vhost32_to_cpu(vq, desc.len), iov + iov_count,
- iov_size - iov_count, access);
- if (unlikely(ret < 0)) {
- if (ret != -EAGAIN)
- vq_err(vq, "Translation failure %d indirect idx %d\n",
- ret, i);
- return ret;
- }
- /* If this is an input descriptor, increment that count. */
- if (access == VHOST_ACCESS_WO) {
- *in_num += ret;
- if (unlikely(log && ret)) {
- log[*log_num].addr = vhost64_to_cpu(vq, desc.addr);
- log[*log_num].len = vhost32_to_cpu(vq, desc.len);
- ++*log_num;
- }
- } else {
- /* If it's an output descriptor, they're all supposed
- * to come before any input descriptors. */
- if (unlikely(*in_num)) {
- vq_err(vq, "Indirect descriptor "
- "has out after in: idx %d\n", i);
- return -EINVAL;
- }
- *out_num += ret;
- }
- } while ((i = next_desc(vq, &desc)) != -1);
- return 0;
-}
-
-/* This looks in the virtqueue and for the first available buffer, and converts
- * it to an iovec for convenient access. Since descriptors consist of some
- * number of output then some number of input descriptors, it's actually two
- * iovecs, but we pack them into one and note how many of each there were.
- *
- * This function returns the descriptor number found, or vq->num (which is
- * never a valid descriptor number) if none was found. A negative code is
- * returned on error. */
-int vhost_get_vq_desc(struct vhost_virtqueue *vq,
- struct iovec iov[], unsigned int iov_size,
- unsigned int *out_num, unsigned int *in_num,
- struct vhost_log *log, unsigned int *log_num)
-{
- struct vring_desc desc;
- unsigned int i, head, found = 0;
- u16 last_avail_idx;
- __virtio16 avail_idx;
- __virtio16 ring_head;
- int ret, access;
-
- /* Check it isn't doing very strange things with descriptor numbers. */
- last_avail_idx = vq->last_avail_idx;
-
- if (vq->avail_idx == vq->last_avail_idx) {
- if (unlikely(vhost_get_avail_idx(vq, &avail_idx))) {
- vq_err(vq, "Failed to access avail idx at %p\n",
- &vq->avail->idx);
- return -EFAULT;
- }
- vq->avail_idx = vhost16_to_cpu(vq, avail_idx);
-
- if (unlikely((u16)(vq->avail_idx - last_avail_idx) > vq->num)) {
- vq_err(vq, "Guest moved used index from %u to %u",
- last_avail_idx, vq->avail_idx);
- return -EFAULT;
- }
-
- /* If there's nothing new since last we looked, return
- * invalid.
- */
- if (vq->avail_idx == last_avail_idx)
- return vq->num;
-
- /* Only get avail ring entries after they have been
- * exposed by guest.
- */
- smp_rmb();
- }
-
- /* Grab the next descriptor number they're advertising, and increment
- * the index we've seen. */
- if (unlikely(vhost_get_avail_head(vq, &ring_head, last_avail_idx))) {
- vq_err(vq, "Failed to read head: idx %d address %p\n",
- last_avail_idx,
- &vq->avail->ring[last_avail_idx % vq->num]);
- return -EFAULT;
- }
-
- head = vhost16_to_cpu(vq, ring_head);
-
- /* If their number is silly, that's an error. */
- if (unlikely(head >= vq->num)) {
- vq_err(vq, "Guest says index %u > %u is available",
- head, vq->num);
- return -EINVAL;
- }
-
- /* When we start there are none of either input nor output. */
- *out_num = *in_num = 0;
- if (unlikely(log))
- *log_num = 0;
-
- i = head;
- do {
- unsigned iov_count = *in_num + *out_num;
- if (unlikely(i >= vq->num)) {
- vq_err(vq, "Desc index is %u > %u, head = %u",
- i, vq->num, head);
- return -EINVAL;
- }
- if (unlikely(++found > vq->num)) {
- vq_err(vq, "Loop detected: last one at %u "
- "vq size %u head %u\n",
- i, vq->num, head);
- return -EINVAL;
- }
- ret = vhost_get_desc(vq, &desc, i);
- if (unlikely(ret)) {
- vq_err(vq, "Failed to get descriptor: idx %d addr %p\n",
- i, vq->desc + i);
- return -EFAULT;
- }
- if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_INDIRECT)) {
- ret = get_indirect(vq, iov, iov_size,
- out_num, in_num,
- log, log_num, &desc);
- if (unlikely(ret < 0)) {
- if (ret != -EAGAIN)
- vq_err(vq, "Failure detected "
- "in indirect descriptor at idx %d\n", i);
- return ret;
- }
- continue;
- }
-
- if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_WRITE))
- access = VHOST_ACCESS_WO;
- else
- access = VHOST_ACCESS_RO;
- ret = translate_desc(vq, vhost64_to_cpu(vq, desc.addr),
- vhost32_to_cpu(vq, desc.len), iov + iov_count,
- iov_size - iov_count, access);
- if (unlikely(ret < 0)) {
- if (ret != -EAGAIN)
- vq_err(vq, "Translation failure %d descriptor idx %d\n",
- ret, i);
- return ret;
- }
- if (access == VHOST_ACCESS_WO) {
- /* If this is an input descriptor,
- * increment that count. */
- *in_num += ret;
- if (unlikely(log && ret)) {
- log[*log_num].addr = vhost64_to_cpu(vq, desc.addr);
- log[*log_num].len = vhost32_to_cpu(vq, desc.len);
- ++*log_num;
- }
- } else {
- /* If it's an output descriptor, they're all supposed
- * to come before any input descriptors. */
- if (unlikely(*in_num)) {
- vq_err(vq, "Descriptor has out after in: "
- "idx %d\n", i);
- return -EINVAL;
- }
- *out_num += ret;
- }
- } while ((i = next_desc(vq, &desc)) != -1);
-
- /* On success, increment avail index. */
- vq->last_avail_idx++;
-
- /* Assume notifications from guest are disabled at this point,
- * if they aren't we would need to update avail_event index. */
- BUG_ON(!(vq->used_flags & VRING_USED_F_NO_NOTIFY));
- return head;
-}
-EXPORT_SYMBOL_GPL(vhost_get_vq_desc);
-
static struct vhost_desc *peek_split_desc(struct vhost_virtqueue *vq)
{
BUG_ON(!vq->ndescs);
@@ -2428,7 +2191,7 @@ static int fetch_indirect_descs(struct vhost_virtqueue *vq,
/* This function returns a value > 0 if a descriptor was found, or 0 if none were found.
* A negative code is returned on error. */
-static int fetch_descs(struct vhost_virtqueue *vq)
+static int fetch_buf(struct vhost_virtqueue *vq)
{
unsigned int i, head, found = 0;
struct vhost_desc *last;
@@ -2441,7 +2204,11 @@ static int fetch_descs(struct vhost_virtqueue *vq)
/* Check it isn't doing very strange things with descriptor numbers. */
last_avail_idx = vq->last_avail_idx;
- if (vq->avail_idx == vq->last_avail_idx) {
+ if (unlikely(vq->avail_idx == vq->last_avail_idx)) {
+ /* If we already have work to do, don't bother re-checking. */
+ if (likely(vq->ndescs))
+ return 1;
+
if (unlikely(vhost_get_avail_idx(vq, &avail_idx))) {
vq_err(vq, "Failed to access avail idx at %p\n",
&vq->avail->idx);
@@ -2532,6 +2299,41 @@ static int fetch_descs(struct vhost_virtqueue *vq)
return 1;
}
+/* This function returns a value > 0 if a descriptor was found, or 0 if none were found.
+ * A negative code is returned on error. */
+static int fetch_descs(struct vhost_virtqueue *vq)
+{
+ int ret;
+
+ if (unlikely(vq->first_desc >= vq->ndescs)) {
+ vq->first_desc = 0;
+ vq->ndescs = 0;
+ }
+
+ if (vq->ndescs)
+ return 1;
+
+ for (ret = 1;
+ ret > 0 && vq->ndescs <= vhost_vq_num_batch_descs(vq);
+ ret = fetch_buf(vq))
+ ;
+
+ /* On success we expect some descs */
+ BUG_ON(ret > 0 && !vq->ndescs);
+ return ret;
+}
+
+/* Reverse the effects of fetch_descs */
+static void unfetch_descs(struct vhost_virtqueue *vq)
+{
+ int i;
+
+ for (i = vq->first_desc; i < vq->ndescs; ++i)
+ if (!(vq->descs[i].flags & VRING_DESC_F_NEXT))
+ vq->last_avail_idx -= 1;
+ vq->ndescs = 0;
+}
+
/* This looks in the virtqueue and for the first available buffer, and converts
* it to an iovec for convenient access. Since descriptors consist of some
* number of output then some number of input descriptors, it's actually two
@@ -2540,7 +2342,7 @@ static int fetch_descs(struct vhost_virtqueue *vq)
* This function returns the descriptor number found, or vq->num (which is
* never a valid descriptor number) if none was found. A negative code is
* returned on error. */
-int vhost_get_vq_desc_batch(struct vhost_virtqueue *vq,
+int vhost_get_vq_desc(struct vhost_virtqueue *vq,
struct iovec iov[], unsigned int iov_size,
unsigned int *out_num, unsigned int *in_num,
struct vhost_log *log, unsigned int *log_num)
@@ -2549,7 +2351,7 @@ int vhost_get_vq_desc_batch(struct vhost_virtqueue *vq,
int i;
if (ret <= 0)
- goto err_fetch;
+ goto err;
/* Now convert to IOV */
/* When we start there are none of either input nor output. */
@@ -2557,7 +2359,7 @@ int vhost_get_vq_desc_batch(struct vhost_virtqueue *vq,
if (unlikely(log))
*log_num = 0;
- for (i = 0; i < vq->ndescs; ++i) {
+ for (i = vq->first_desc; i < vq->ndescs; ++i) {
unsigned iov_count = *in_num + *out_num;
struct vhost_desc *desc = &vq->descs[i];
int access;
@@ -2603,24 +2405,26 @@ int vhost_get_vq_desc_batch(struct vhost_virtqueue *vq,
}
ret = desc->id;
+
+ if (!(desc->flags & VRING_DESC_F_NEXT))
+ break;
}
- vq->ndescs = 0;
+ vq->first_desc = i + 1;
return ret;
err:
- vhost_discard_vq_desc(vq, 1);
-err_fetch:
- vq->ndescs = 0;
+ unfetch_descs(vq);
return ret;
}
-EXPORT_SYMBOL_GPL(vhost_get_vq_desc_batch);
+EXPORT_SYMBOL_GPL(vhost_get_vq_desc);
/* Reverse the effect of vhost_get_vq_desc. Useful for error handling. */
void vhost_discard_vq_desc(struct vhost_virtqueue *vq, int n)
{
+ unfetch_descs(vq);
vq->last_avail_idx -= n;
}
EXPORT_SYMBOL_GPL(vhost_discard_vq_desc);
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 87089d51490d..fed36af5c444 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -81,6 +81,7 @@ struct vhost_virtqueue {
struct vhost_desc *descs;
int ndescs;
+ int first_desc;
int max_descs;
struct file *kick;
@@ -189,10 +190,6 @@ long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *arg
bool vhost_vq_access_ok(struct vhost_virtqueue *vq);
bool vhost_log_access_ok(struct vhost_dev *);
-int vhost_get_vq_desc_batch(struct vhost_virtqueue *,
- struct iovec iov[], unsigned int iov_count,
- unsigned int *out_num, unsigned int *in_num,
- struct vhost_log *log, unsigned int *log_num);
int vhost_get_vq_desc(struct vhost_virtqueue *,
struct iovec iov[], unsigned int iov_count,
unsigned int *out_num, unsigned int *in_num,
@@ -261,6 +258,8 @@ static inline void vhost_vq_set_backend(struct vhost_virtqueue *vq,
void *private_data)
{
vq->private_data = private_data;
+ vq->ndescs = 0;
+ vq->first_desc = 0;
}
/**
--
MST
^ permalink raw reply related [flat|nested] 20+ messages in thread
* Re: [PATCH RFC v6 02/11] vhost: use batched get_vq_desc version
2020-06-08 12:52 ` [PATCH RFC v6 02/11] vhost: use batched get_vq_desc version Michael S. Tsirkin
@ 2020-06-10 3:14 ` Jason Wang
2020-06-10 11:05 ` Michael S. Tsirkin
2020-06-10 11:24 ` Eugenio Perez Martin
1 sibling, 1 reply; 20+ messages in thread
From: Jason Wang @ 2020-06-10 3:14 UTC (permalink / raw)
To: Michael S. Tsirkin, linux-kernel; +Cc: kvm, virtualization, netdev, eperezma
On 2020/6/8 下午8:52, Michael S. Tsirkin wrote:
> As testing shows no performance change, switch to that now.
>
> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
> Link: https://lore.kernel.org/r/20200401183118.8334-3-eperezma@redhat.com
> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> ---
> drivers/vhost/test.c | 2 +-
> drivers/vhost/vhost.c | 318 ++++++++----------------------------------
> drivers/vhost/vhost.h | 7 +-
> 3 files changed, 65 insertions(+), 262 deletions(-)
>
> diff --git a/drivers/vhost/test.c b/drivers/vhost/test.c
> index 0466921f4772..7d69778aaa26 100644
> --- a/drivers/vhost/test.c
> +++ b/drivers/vhost/test.c
> @@ -119,7 +119,7 @@ static int vhost_test_open(struct inode *inode, struct file *f)
> dev = &n->dev;
> vqs[VHOST_TEST_VQ] = &n->vqs[VHOST_TEST_VQ];
> n->vqs[VHOST_TEST_VQ].handle_kick = handle_vq_kick;
> - vhost_dev_init(dev, vqs, VHOST_TEST_VQ_MAX, UIO_MAXIOV,
> + vhost_dev_init(dev, vqs, VHOST_TEST_VQ_MAX, UIO_MAXIOV + 64,
> VHOST_TEST_PKT_WEIGHT, VHOST_TEST_WEIGHT, true, NULL);
>
> f->private_data = n;
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index 180b7b58c76b..41d6b132c234 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -304,6 +304,7 @@ static void vhost_vq_reset(struct vhost_dev *dev,
> {
> vq->num = 1;
> vq->ndescs = 0;
> + vq->first_desc = 0;
> vq->desc = NULL;
> vq->avail = NULL;
> vq->used = NULL;
> @@ -372,6 +373,11 @@ static int vhost_worker(void *data)
> return 0;
> }
>
> +static int vhost_vq_num_batch_descs(struct vhost_virtqueue *vq)
> +{
> + return vq->max_descs - UIO_MAXIOV;
> +}
> +
> static void vhost_vq_free_iovecs(struct vhost_virtqueue *vq)
> {
> kfree(vq->descs);
> @@ -394,6 +400,9 @@ static long vhost_dev_alloc_iovecs(struct vhost_dev *dev)
> for (i = 0; i < dev->nvqs; ++i) {
> vq = dev->vqs[i];
> vq->max_descs = dev->iov_limit;
> + if (vhost_vq_num_batch_descs(vq) < 0) {
> + return -EINVAL;
> + }
> vq->descs = kmalloc_array(vq->max_descs,
> sizeof(*vq->descs),
> GFP_KERNEL);
> @@ -1610,6 +1619,7 @@ long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *arg
> vq->last_avail_idx = s.num;
> /* Forget the cached index value. */
> vq->avail_idx = vq->last_avail_idx;
> + vq->ndescs = vq->first_desc = 0;
> break;
> case VHOST_GET_VRING_BASE:
> s.index = idx;
> @@ -2078,253 +2088,6 @@ static unsigned next_desc(struct vhost_virtqueue *vq, struct vring_desc *desc)
> return next;
> }
>
> -static int get_indirect(struct vhost_virtqueue *vq,
> - struct iovec iov[], unsigned int iov_size,
> - unsigned int *out_num, unsigned int *in_num,
> - struct vhost_log *log, unsigned int *log_num,
> - struct vring_desc *indirect)
> -{
> - struct vring_desc desc;
> - unsigned int i = 0, count, found = 0;
> - u32 len = vhost32_to_cpu(vq, indirect->len);
> - struct iov_iter from;
> - int ret, access;
> -
> - /* Sanity check */
> - if (unlikely(len % sizeof desc)) {
> - vq_err(vq, "Invalid length in indirect descriptor: "
> - "len 0x%llx not multiple of 0x%zx\n",
> - (unsigned long long)len,
> - sizeof desc);
> - return -EINVAL;
> - }
> -
> - ret = translate_desc(vq, vhost64_to_cpu(vq, indirect->addr), len, vq->indirect,
> - UIO_MAXIOV, VHOST_ACCESS_RO);
> - if (unlikely(ret < 0)) {
> - if (ret != -EAGAIN)
> - vq_err(vq, "Translation failure %d in indirect.\n", ret);
> - return ret;
> - }
> - iov_iter_init(&from, READ, vq->indirect, ret, len);
> -
> - /* We will use the result as an address to read from, so most
> - * architectures only need a compiler barrier here. */
> - read_barrier_depends();
> -
> - count = len / sizeof desc;
> - /* Buffers are chained via a 16 bit next field, so
> - * we can have at most 2^16 of these. */
> - if (unlikely(count > USHRT_MAX + 1)) {
> - vq_err(vq, "Indirect buffer length too big: %d\n",
> - indirect->len);
> - return -E2BIG;
> - }
> -
> - do {
> - unsigned iov_count = *in_num + *out_num;
> - if (unlikely(++found > count)) {
> - vq_err(vq, "Loop detected: last one at %u "
> - "indirect size %u\n",
> - i, count);
> - return -EINVAL;
> - }
> - if (unlikely(!copy_from_iter_full(&desc, sizeof(desc), &from))) {
> - vq_err(vq, "Failed indirect descriptor: idx %d, %zx\n",
> - i, (size_t)vhost64_to_cpu(vq, indirect->addr) + i * sizeof desc);
> - return -EINVAL;
> - }
> - if (unlikely(desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_INDIRECT))) {
> - vq_err(vq, "Nested indirect descriptor: idx %d, %zx\n",
> - i, (size_t)vhost64_to_cpu(vq, indirect->addr) + i * sizeof desc);
> - return -EINVAL;
> - }
> -
> - if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_WRITE))
> - access = VHOST_ACCESS_WO;
> - else
> - access = VHOST_ACCESS_RO;
> -
> - ret = translate_desc(vq, vhost64_to_cpu(vq, desc.addr),
> - vhost32_to_cpu(vq, desc.len), iov + iov_count,
> - iov_size - iov_count, access);
> - if (unlikely(ret < 0)) {
> - if (ret != -EAGAIN)
> - vq_err(vq, "Translation failure %d indirect idx %d\n",
> - ret, i);
> - return ret;
> - }
> - /* If this is an input descriptor, increment that count. */
> - if (access == VHOST_ACCESS_WO) {
> - *in_num += ret;
> - if (unlikely(log && ret)) {
> - log[*log_num].addr = vhost64_to_cpu(vq, desc.addr);
> - log[*log_num].len = vhost32_to_cpu(vq, desc.len);
> - ++*log_num;
> - }
> - } else {
> - /* If it's an output descriptor, they're all supposed
> - * to come before any input descriptors. */
> - if (unlikely(*in_num)) {
> - vq_err(vq, "Indirect descriptor "
> - "has out after in: idx %d\n", i);
> - return -EINVAL;
> - }
> - *out_num += ret;
> - }
> - } while ((i = next_desc(vq, &desc)) != -1);
> - return 0;
> -}
> -
> -/* This looks in the virtqueue and for the first available buffer, and converts
> - * it to an iovec for convenient access. Since descriptors consist of some
> - * number of output then some number of input descriptors, it's actually two
> - * iovecs, but we pack them into one and note how many of each there were.
> - *
> - * This function returns the descriptor number found, or vq->num (which is
> - * never a valid descriptor number) if none was found. A negative code is
> - * returned on error. */
> -int vhost_get_vq_desc(struct vhost_virtqueue *vq,
> - struct iovec iov[], unsigned int iov_size,
> - unsigned int *out_num, unsigned int *in_num,
> - struct vhost_log *log, unsigned int *log_num)
> -{
> - struct vring_desc desc;
> - unsigned int i, head, found = 0;
> - u16 last_avail_idx;
> - __virtio16 avail_idx;
> - __virtio16 ring_head;
> - int ret, access;
> -
> - /* Check it isn't doing very strange things with descriptor numbers. */
> - last_avail_idx = vq->last_avail_idx;
> -
> - if (vq->avail_idx == vq->last_avail_idx) {
> - if (unlikely(vhost_get_avail_idx(vq, &avail_idx))) {
> - vq_err(vq, "Failed to access avail idx at %p\n",
> - &vq->avail->idx);
> - return -EFAULT;
> - }
> - vq->avail_idx = vhost16_to_cpu(vq, avail_idx);
> -
> - if (unlikely((u16)(vq->avail_idx - last_avail_idx) > vq->num)) {
> - vq_err(vq, "Guest moved used index from %u to %u",
> - last_avail_idx, vq->avail_idx);
> - return -EFAULT;
> - }
> -
> - /* If there's nothing new since last we looked, return
> - * invalid.
> - */
> - if (vq->avail_idx == last_avail_idx)
> - return vq->num;
> -
> - /* Only get avail ring entries after they have been
> - * exposed by guest.
> - */
> - smp_rmb();
> - }
> -
> - /* Grab the next descriptor number they're advertising, and increment
> - * the index we've seen. */
> - if (unlikely(vhost_get_avail_head(vq, &ring_head, last_avail_idx))) {
> - vq_err(vq, "Failed to read head: idx %d address %p\n",
> - last_avail_idx,
> - &vq->avail->ring[last_avail_idx % vq->num]);
> - return -EFAULT;
> - }
> -
> - head = vhost16_to_cpu(vq, ring_head);
> -
> - /* If their number is silly, that's an error. */
> - if (unlikely(head >= vq->num)) {
> - vq_err(vq, "Guest says index %u > %u is available",
> - head, vq->num);
> - return -EINVAL;
> - }
> -
> - /* When we start there are none of either input nor output. */
> - *out_num = *in_num = 0;
> - if (unlikely(log))
> - *log_num = 0;
> -
> - i = head;
> - do {
> - unsigned iov_count = *in_num + *out_num;
> - if (unlikely(i >= vq->num)) {
> - vq_err(vq, "Desc index is %u > %u, head = %u",
> - i, vq->num, head);
> - return -EINVAL;
> - }
> - if (unlikely(++found > vq->num)) {
> - vq_err(vq, "Loop detected: last one at %u "
> - "vq size %u head %u\n",
> - i, vq->num, head);
> - return -EINVAL;
> - }
> - ret = vhost_get_desc(vq, &desc, i);
> - if (unlikely(ret)) {
> - vq_err(vq, "Failed to get descriptor: idx %d addr %p\n",
> - i, vq->desc + i);
> - return -EFAULT;
> - }
> - if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_INDIRECT)) {
> - ret = get_indirect(vq, iov, iov_size,
> - out_num, in_num,
> - log, log_num, &desc);
> - if (unlikely(ret < 0)) {
> - if (ret != -EAGAIN)
> - vq_err(vq, "Failure detected "
> - "in indirect descriptor at idx %d\n", i);
> - return ret;
> - }
> - continue;
> - }
> -
> - if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_WRITE))
> - access = VHOST_ACCESS_WO;
> - else
> - access = VHOST_ACCESS_RO;
> - ret = translate_desc(vq, vhost64_to_cpu(vq, desc.addr),
> - vhost32_to_cpu(vq, desc.len), iov + iov_count,
> - iov_size - iov_count, access);
> - if (unlikely(ret < 0)) {
> - if (ret != -EAGAIN)
> - vq_err(vq, "Translation failure %d descriptor idx %d\n",
> - ret, i);
> - return ret;
> - }
> - if (access == VHOST_ACCESS_WO) {
> - /* If this is an input descriptor,
> - * increment that count. */
> - *in_num += ret;
> - if (unlikely(log && ret)) {
> - log[*log_num].addr = vhost64_to_cpu(vq, desc.addr);
> - log[*log_num].len = vhost32_to_cpu(vq, desc.len);
> - ++*log_num;
> - }
> - } else {
> - /* If it's an output descriptor, they're all supposed
> - * to come before any input descriptors. */
> - if (unlikely(*in_num)) {
> - vq_err(vq, "Descriptor has out after in: "
> - "idx %d\n", i);
> - return -EINVAL;
> - }
> - *out_num += ret;
> - }
> - } while ((i = next_desc(vq, &desc)) != -1);
> -
> - /* On success, increment avail index. */
> - vq->last_avail_idx++;
> -
> - /* Assume notifications from guest are disabled at this point,
> - * if they aren't we would need to update avail_event index. */
> - BUG_ON(!(vq->used_flags & VRING_USED_F_NO_NOTIFY));
> - return head;
> -}
> -EXPORT_SYMBOL_GPL(vhost_get_vq_desc);
> -
> static struct vhost_desc *peek_split_desc(struct vhost_virtqueue *vq)
> {
> BUG_ON(!vq->ndescs);
> @@ -2428,7 +2191,7 @@ static int fetch_indirect_descs(struct vhost_virtqueue *vq,
>
> /* This function returns a value > 0 if a descriptor was found, or 0 if none were found.
> * A negative code is returned on error. */
> -static int fetch_descs(struct vhost_virtqueue *vq)
> +static int fetch_buf(struct vhost_virtqueue *vq)
> {
> unsigned int i, head, found = 0;
> struct vhost_desc *last;
> @@ -2441,7 +2204,11 @@ static int fetch_descs(struct vhost_virtqueue *vq)
> /* Check it isn't doing very strange things with descriptor numbers. */
> last_avail_idx = vq->last_avail_idx;
>
> - if (vq->avail_idx == vq->last_avail_idx) {
> + if (unlikely(vq->avail_idx == vq->last_avail_idx)) {
> + /* If we already have work to do, don't bother re-checking. */
> + if (likely(vq->ndescs))
> + return 1;
> +
> if (unlikely(vhost_get_avail_idx(vq, &avail_idx))) {
> vq_err(vq, "Failed to access avail idx at %p\n",
> &vq->avail->idx);
> @@ -2532,6 +2299,41 @@ static int fetch_descs(struct vhost_virtqueue *vq)
> return 1;
> }
>
> +/* This function returns a value > 0 if a descriptor was found, or 0 if none were found.
> + * A negative code is returned on error. */
> +static int fetch_descs(struct vhost_virtqueue *vq)
> +{
> + int ret;
> +
> + if (unlikely(vq->first_desc >= vq->ndescs)) {
> + vq->first_desc = 0;
> + vq->ndescs = 0;
> + }
> +
> + if (vq->ndescs)
> + return 1;
> +
> + for (ret = 1;
> + ret > 0 && vq->ndescs <= vhost_vq_num_batch_descs(vq);
> + ret = fetch_buf(vq))
> + ;
> +
> + /* On success we expect some descs */
> + BUG_ON(ret > 0 && !vq->ndescs);
> + return ret;
> +}
> +
> +/* Reverse the effects of fetch_descs */
> +static void unfetch_descs(struct vhost_virtqueue *vq)
> +{
> + int i;
> +
> + for (i = vq->first_desc; i < vq->ndescs; ++i)
> + if (!(vq->descs[i].flags & VRING_DESC_F_NEXT))
> + vq->last_avail_idx -= 1;
> + vq->ndescs = 0;
> +}
Is it better to set first_desc to zero here?
> +
> /* This looks in the virtqueue and for the first available buffer, and converts
> * it to an iovec for convenient access. Since descriptors consist of some
> * number of output then some number of input descriptors, it's actually two
> @@ -2540,7 +2342,7 @@ static int fetch_descs(struct vhost_virtqueue *vq)
> * This function returns the descriptor number found, or vq->num (which is
> * never a valid descriptor number) if none was found. A negative code is
> * returned on error. */
> -int vhost_get_vq_desc_batch(struct vhost_virtqueue *vq,
> +int vhost_get_vq_desc(struct vhost_virtqueue *vq,
> struct iovec iov[], unsigned int iov_size,
> unsigned int *out_num, unsigned int *in_num,
> struct vhost_log *log, unsigned int *log_num)
> @@ -2549,7 +2351,7 @@ int vhost_get_vq_desc_batch(struct vhost_virtqueue *vq,
> int i;
>
> if (ret <= 0)
> - goto err_fetch;
> + goto err;
>
> /* Now convert to IOV */
> /* When we start there are none of either input nor output. */
> @@ -2557,7 +2359,7 @@ int vhost_get_vq_desc_batch(struct vhost_virtqueue *vq,
> if (unlikely(log))
> *log_num = 0;
>
> - for (i = 0; i < vq->ndescs; ++i) {
> + for (i = vq->first_desc; i < vq->ndescs; ++i) {
> unsigned iov_count = *in_num + *out_num;
> struct vhost_desc *desc = &vq->descs[i];
> int access;
> @@ -2603,24 +2405,26 @@ int vhost_get_vq_desc_batch(struct vhost_virtqueue *vq,
> }
>
> ret = desc->id;
> +
> + if (!(desc->flags & VRING_DESC_F_NEXT))
> + break;
> }
>
> - vq->ndescs = 0;
> + vq->first_desc = i + 1;
>
> return ret;
>
> err:
> - vhost_discard_vq_desc(vq, 1);
> -err_fetch:
> - vq->ndescs = 0;
> + unfetch_descs(vq);
>
> return ret;
> }
> -EXPORT_SYMBOL_GPL(vhost_get_vq_desc_batch);
> +EXPORT_SYMBOL_GPL(vhost_get_vq_desc);
>
> /* Reverse the effect of vhost_get_vq_desc. Useful for error handling. */
> void vhost_discard_vq_desc(struct vhost_virtqueue *vq, int n)
> {
> + unfetch_descs(vq);
> vq->last_avail_idx -= n;
So unfetch_descs() has decreased last_avail_idx.
Can we fix this by letting unfetch_descs() return the number and then we
can do:
int d = unfetch_descs(vq);
vq->last_avail_idx -= (n > d) ? n - d: 0;
Thanks
> }
> EXPORT_SYMBOL_GPL(vhost_discard_vq_desc);
> diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> index 87089d51490d..fed36af5c444 100644
> --- a/drivers/vhost/vhost.h
> +++ b/drivers/vhost/vhost.h
> @@ -81,6 +81,7 @@ struct vhost_virtqueue {
>
> struct vhost_desc *descs;
> int ndescs;
> + int first_desc;
> int max_descs;
>
> struct file *kick;
> @@ -189,10 +190,6 @@ long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *arg
> bool vhost_vq_access_ok(struct vhost_virtqueue *vq);
> bool vhost_log_access_ok(struct vhost_dev *);
>
> -int vhost_get_vq_desc_batch(struct vhost_virtqueue *,
> - struct iovec iov[], unsigned int iov_count,
> - unsigned int *out_num, unsigned int *in_num,
> - struct vhost_log *log, unsigned int *log_num);
> int vhost_get_vq_desc(struct vhost_virtqueue *,
> struct iovec iov[], unsigned int iov_count,
> unsigned int *out_num, unsigned int *in_num,
> @@ -261,6 +258,8 @@ static inline void vhost_vq_set_backend(struct vhost_virtqueue *vq,
> void *private_data)
> {
> vq->private_data = private_data;
> + vq->ndescs = 0;
> + vq->first_desc = 0;
> }
>
> /**
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH RFC v6 02/11] vhost: use batched get_vq_desc version
2020-06-10 3:14 ` Jason Wang
@ 2020-06-10 11:05 ` Michael S. Tsirkin
2020-06-11 3:02 ` Jason Wang
0 siblings, 1 reply; 20+ messages in thread
From: Michael S. Tsirkin @ 2020-06-10 11:05 UTC (permalink / raw)
To: Jason Wang; +Cc: linux-kernel, kvm, virtualization, netdev, eperezma
On Wed, Jun 10, 2020 at 11:14:49AM +0800, Jason Wang wrote:
>
> On 2020/6/8 下午8:52, Michael S. Tsirkin wrote:
> > As testing shows no performance change, switch to that now.
> >
> > Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> > Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
> > Link: https://lore.kernel.org/r/20200401183118.8334-3-eperezma@redhat.com
> > Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> > ---
> > drivers/vhost/test.c | 2 +-
> > drivers/vhost/vhost.c | 318 ++++++++----------------------------------
> > drivers/vhost/vhost.h | 7 +-
> > 3 files changed, 65 insertions(+), 262 deletions(-)
> >
> > diff --git a/drivers/vhost/test.c b/drivers/vhost/test.c
> > index 0466921f4772..7d69778aaa26 100644
> > --- a/drivers/vhost/test.c
> > +++ b/drivers/vhost/test.c
> > @@ -119,7 +119,7 @@ static int vhost_test_open(struct inode *inode, struct file *f)
> > dev = &n->dev;
> > vqs[VHOST_TEST_VQ] = &n->vqs[VHOST_TEST_VQ];
> > n->vqs[VHOST_TEST_VQ].handle_kick = handle_vq_kick;
> > - vhost_dev_init(dev, vqs, VHOST_TEST_VQ_MAX, UIO_MAXIOV,
> > + vhost_dev_init(dev, vqs, VHOST_TEST_VQ_MAX, UIO_MAXIOV + 64,
> > VHOST_TEST_PKT_WEIGHT, VHOST_TEST_WEIGHT, true, NULL);
> > f->private_data = n;
> > diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> > index 180b7b58c76b..41d6b132c234 100644
> > --- a/drivers/vhost/vhost.c
> > +++ b/drivers/vhost/vhost.c
> > @@ -304,6 +304,7 @@ static void vhost_vq_reset(struct vhost_dev *dev,
> > {
> > vq->num = 1;
> > vq->ndescs = 0;
> > + vq->first_desc = 0;
> > vq->desc = NULL;
> > vq->avail = NULL;
> > vq->used = NULL;
> > @@ -372,6 +373,11 @@ static int vhost_worker(void *data)
> > return 0;
> > }
> > +static int vhost_vq_num_batch_descs(struct vhost_virtqueue *vq)
> > +{
> > + return vq->max_descs - UIO_MAXIOV;
> > +}
> > +
> > static void vhost_vq_free_iovecs(struct vhost_virtqueue *vq)
> > {
> > kfree(vq->descs);
> > @@ -394,6 +400,9 @@ static long vhost_dev_alloc_iovecs(struct vhost_dev *dev)
> > for (i = 0; i < dev->nvqs; ++i) {
> > vq = dev->vqs[i];
> > vq->max_descs = dev->iov_limit;
> > + if (vhost_vq_num_batch_descs(vq) < 0) {
> > + return -EINVAL;
> > + }
> > vq->descs = kmalloc_array(vq->max_descs,
> > sizeof(*vq->descs),
> > GFP_KERNEL);
> > @@ -1610,6 +1619,7 @@ long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *arg
> > vq->last_avail_idx = s.num;
> > /* Forget the cached index value. */
> > vq->avail_idx = vq->last_avail_idx;
> > + vq->ndescs = vq->first_desc = 0;
> > break;
> > case VHOST_GET_VRING_BASE:
> > s.index = idx;
> > @@ -2078,253 +2088,6 @@ static unsigned next_desc(struct vhost_virtqueue *vq, struct vring_desc *desc)
> > return next;
> > }
> > -static int get_indirect(struct vhost_virtqueue *vq,
> > - struct iovec iov[], unsigned int iov_size,
> > - unsigned int *out_num, unsigned int *in_num,
> > - struct vhost_log *log, unsigned int *log_num,
> > - struct vring_desc *indirect)
> > -{
> > - struct vring_desc desc;
> > - unsigned int i = 0, count, found = 0;
> > - u32 len = vhost32_to_cpu(vq, indirect->len);
> > - struct iov_iter from;
> > - int ret, access;
> > -
> > - /* Sanity check */
> > - if (unlikely(len % sizeof desc)) {
> > - vq_err(vq, "Invalid length in indirect descriptor: "
> > - "len 0x%llx not multiple of 0x%zx\n",
> > - (unsigned long long)len,
> > - sizeof desc);
> > - return -EINVAL;
> > - }
> > -
> > - ret = translate_desc(vq, vhost64_to_cpu(vq, indirect->addr), len, vq->indirect,
> > - UIO_MAXIOV, VHOST_ACCESS_RO);
> > - if (unlikely(ret < 0)) {
> > - if (ret != -EAGAIN)
> > - vq_err(vq, "Translation failure %d in indirect.\n", ret);
> > - return ret;
> > - }
> > - iov_iter_init(&from, READ, vq->indirect, ret, len);
> > -
> > - /* We will use the result as an address to read from, so most
> > - * architectures only need a compiler barrier here. */
> > - read_barrier_depends();
> > -
> > - count = len / sizeof desc;
> > - /* Buffers are chained via a 16 bit next field, so
> > - * we can have at most 2^16 of these. */
> > - if (unlikely(count > USHRT_MAX + 1)) {
> > - vq_err(vq, "Indirect buffer length too big: %d\n",
> > - indirect->len);
> > - return -E2BIG;
> > - }
> > -
> > - do {
> > - unsigned iov_count = *in_num + *out_num;
> > - if (unlikely(++found > count)) {
> > - vq_err(vq, "Loop detected: last one at %u "
> > - "indirect size %u\n",
> > - i, count);
> > - return -EINVAL;
> > - }
> > - if (unlikely(!copy_from_iter_full(&desc, sizeof(desc), &from))) {
> > - vq_err(vq, "Failed indirect descriptor: idx %d, %zx\n",
> > - i, (size_t)vhost64_to_cpu(vq, indirect->addr) + i * sizeof desc);
> > - return -EINVAL;
> > - }
> > - if (unlikely(desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_INDIRECT))) {
> > - vq_err(vq, "Nested indirect descriptor: idx %d, %zx\n",
> > - i, (size_t)vhost64_to_cpu(vq, indirect->addr) + i * sizeof desc);
> > - return -EINVAL;
> > - }
> > -
> > - if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_WRITE))
> > - access = VHOST_ACCESS_WO;
> > - else
> > - access = VHOST_ACCESS_RO;
> > -
> > - ret = translate_desc(vq, vhost64_to_cpu(vq, desc.addr),
> > - vhost32_to_cpu(vq, desc.len), iov + iov_count,
> > - iov_size - iov_count, access);
> > - if (unlikely(ret < 0)) {
> > - if (ret != -EAGAIN)
> > - vq_err(vq, "Translation failure %d indirect idx %d\n",
> > - ret, i);
> > - return ret;
> > - }
> > - /* If this is an input descriptor, increment that count. */
> > - if (access == VHOST_ACCESS_WO) {
> > - *in_num += ret;
> > - if (unlikely(log && ret)) {
> > - log[*log_num].addr = vhost64_to_cpu(vq, desc.addr);
> > - log[*log_num].len = vhost32_to_cpu(vq, desc.len);
> > - ++*log_num;
> > - }
> > - } else {
> > - /* If it's an output descriptor, they're all supposed
> > - * to come before any input descriptors. */
> > - if (unlikely(*in_num)) {
> > - vq_err(vq, "Indirect descriptor "
> > - "has out after in: idx %d\n", i);
> > - return -EINVAL;
> > - }
> > - *out_num += ret;
> > - }
> > - } while ((i = next_desc(vq, &desc)) != -1);
> > - return 0;
> > -}
> > -
> > -/* This looks in the virtqueue and for the first available buffer, and converts
> > - * it to an iovec for convenient access. Since descriptors consist of some
> > - * number of output then some number of input descriptors, it's actually two
> > - * iovecs, but we pack them into one and note how many of each there were.
> > - *
> > - * This function returns the descriptor number found, or vq->num (which is
> > - * never a valid descriptor number) if none was found. A negative code is
> > - * returned on error. */
> > -int vhost_get_vq_desc(struct vhost_virtqueue *vq,
> > - struct iovec iov[], unsigned int iov_size,
> > - unsigned int *out_num, unsigned int *in_num,
> > - struct vhost_log *log, unsigned int *log_num)
> > -{
> > - struct vring_desc desc;
> > - unsigned int i, head, found = 0;
> > - u16 last_avail_idx;
> > - __virtio16 avail_idx;
> > - __virtio16 ring_head;
> > - int ret, access;
> > -
> > - /* Check it isn't doing very strange things with descriptor numbers. */
> > - last_avail_idx = vq->last_avail_idx;
> > -
> > - if (vq->avail_idx == vq->last_avail_idx) {
> > - if (unlikely(vhost_get_avail_idx(vq, &avail_idx))) {
> > - vq_err(vq, "Failed to access avail idx at %p\n",
> > - &vq->avail->idx);
> > - return -EFAULT;
> > - }
> > - vq->avail_idx = vhost16_to_cpu(vq, avail_idx);
> > -
> > - if (unlikely((u16)(vq->avail_idx - last_avail_idx) > vq->num)) {
> > - vq_err(vq, "Guest moved used index from %u to %u",
> > - last_avail_idx, vq->avail_idx);
> > - return -EFAULT;
> > - }
> > -
> > - /* If there's nothing new since last we looked, return
> > - * invalid.
> > - */
> > - if (vq->avail_idx == last_avail_idx)
> > - return vq->num;
> > -
> > - /* Only get avail ring entries after they have been
> > - * exposed by guest.
> > - */
> > - smp_rmb();
> > - }
> > -
> > - /* Grab the next descriptor number they're advertising, and increment
> > - * the index we've seen. */
> > - if (unlikely(vhost_get_avail_head(vq, &ring_head, last_avail_idx))) {
> > - vq_err(vq, "Failed to read head: idx %d address %p\n",
> > - last_avail_idx,
> > - &vq->avail->ring[last_avail_idx % vq->num]);
> > - return -EFAULT;
> > - }
> > -
> > - head = vhost16_to_cpu(vq, ring_head);
> > -
> > - /* If their number is silly, that's an error. */
> > - if (unlikely(head >= vq->num)) {
> > - vq_err(vq, "Guest says index %u > %u is available",
> > - head, vq->num);
> > - return -EINVAL;
> > - }
> > -
> > - /* When we start there are none of either input nor output. */
> > - *out_num = *in_num = 0;
> > - if (unlikely(log))
> > - *log_num = 0;
> > -
> > - i = head;
> > - do {
> > - unsigned iov_count = *in_num + *out_num;
> > - if (unlikely(i >= vq->num)) {
> > - vq_err(vq, "Desc index is %u > %u, head = %u",
> > - i, vq->num, head);
> > - return -EINVAL;
> > - }
> > - if (unlikely(++found > vq->num)) {
> > - vq_err(vq, "Loop detected: last one at %u "
> > - "vq size %u head %u\n",
> > - i, vq->num, head);
> > - return -EINVAL;
> > - }
> > - ret = vhost_get_desc(vq, &desc, i);
> > - if (unlikely(ret)) {
> > - vq_err(vq, "Failed to get descriptor: idx %d addr %p\n",
> > - i, vq->desc + i);
> > - return -EFAULT;
> > - }
> > - if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_INDIRECT)) {
> > - ret = get_indirect(vq, iov, iov_size,
> > - out_num, in_num,
> > - log, log_num, &desc);
> > - if (unlikely(ret < 0)) {
> > - if (ret != -EAGAIN)
> > - vq_err(vq, "Failure detected "
> > - "in indirect descriptor at idx %d\n", i);
> > - return ret;
> > - }
> > - continue;
> > - }
> > -
> > - if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_WRITE))
> > - access = VHOST_ACCESS_WO;
> > - else
> > - access = VHOST_ACCESS_RO;
> > - ret = translate_desc(vq, vhost64_to_cpu(vq, desc.addr),
> > - vhost32_to_cpu(vq, desc.len), iov + iov_count,
> > - iov_size - iov_count, access);
> > - if (unlikely(ret < 0)) {
> > - if (ret != -EAGAIN)
> > - vq_err(vq, "Translation failure %d descriptor idx %d\n",
> > - ret, i);
> > - return ret;
> > - }
> > - if (access == VHOST_ACCESS_WO) {
> > - /* If this is an input descriptor,
> > - * increment that count. */
> > - *in_num += ret;
> > - if (unlikely(log && ret)) {
> > - log[*log_num].addr = vhost64_to_cpu(vq, desc.addr);
> > - log[*log_num].len = vhost32_to_cpu(vq, desc.len);
> > - ++*log_num;
> > - }
> > - } else {
> > - /* If it's an output descriptor, they're all supposed
> > - * to come before any input descriptors. */
> > - if (unlikely(*in_num)) {
> > - vq_err(vq, "Descriptor has out after in: "
> > - "idx %d\n", i);
> > - return -EINVAL;
> > - }
> > - *out_num += ret;
> > - }
> > - } while ((i = next_desc(vq, &desc)) != -1);
> > -
> > - /* On success, increment avail index. */
> > - vq->last_avail_idx++;
> > -
> > - /* Assume notifications from guest are disabled at this point,
> > - * if they aren't we would need to update avail_event index. */
> > - BUG_ON(!(vq->used_flags & VRING_USED_F_NO_NOTIFY));
> > - return head;
> > -}
> > -EXPORT_SYMBOL_GPL(vhost_get_vq_desc);
> > -
> > static struct vhost_desc *peek_split_desc(struct vhost_virtqueue *vq)
> > {
> > BUG_ON(!vq->ndescs);
> > @@ -2428,7 +2191,7 @@ static int fetch_indirect_descs(struct vhost_virtqueue *vq,
> > /* This function returns a value > 0 if a descriptor was found, or 0 if none were found.
> > * A negative code is returned on error. */
> > -static int fetch_descs(struct vhost_virtqueue *vq)
> > +static int fetch_buf(struct vhost_virtqueue *vq)
> > {
> > unsigned int i, head, found = 0;
> > struct vhost_desc *last;
> > @@ -2441,7 +2204,11 @@ static int fetch_descs(struct vhost_virtqueue *vq)
> > /* Check it isn't doing very strange things with descriptor numbers. */
> > last_avail_idx = vq->last_avail_idx;
> > - if (vq->avail_idx == vq->last_avail_idx) {
> > + if (unlikely(vq->avail_idx == vq->last_avail_idx)) {
> > + /* If we already have work to do, don't bother re-checking. */
> > + if (likely(vq->ndescs))
> > + return 1;
> > +
> > if (unlikely(vhost_get_avail_idx(vq, &avail_idx))) {
> > vq_err(vq, "Failed to access avail idx at %p\n",
> > &vq->avail->idx);
> > @@ -2532,6 +2299,41 @@ static int fetch_descs(struct vhost_virtqueue *vq)
> > return 1;
> > }
> > +/* This function returns a value > 0 if a descriptor was found, or 0 if none were found.
> > + * A negative code is returned on error. */
> > +static int fetch_descs(struct vhost_virtqueue *vq)
> > +{
> > + int ret;
> > +
> > + if (unlikely(vq->first_desc >= vq->ndescs)) {
> > + vq->first_desc = 0;
> > + vq->ndescs = 0;
> > + }
> > +
> > + if (vq->ndescs)
> > + return 1;
> > +
> > + for (ret = 1;
> > + ret > 0 && vq->ndescs <= vhost_vq_num_batch_descs(vq);
> > + ret = fetch_buf(vq))
> > + ;
> > +
> > + /* On success we expect some descs */
> > + BUG_ON(ret > 0 && !vq->ndescs);
> > + return ret;
> > +}
> > +
> > +/* Reverse the effects of fetch_descs */
> > +static void unfetch_descs(struct vhost_virtqueue *vq)
> > +{
> > + int i;
> > +
> > + for (i = vq->first_desc; i < vq->ndescs; ++i)
> > + if (!(vq->descs[i].flags & VRING_DESC_F_NEXT))
> > + vq->last_avail_idx -= 1;
> > + vq->ndescs = 0;
> > +}
>
>
> Is it better to set first_desc to zero here?
>
>
> > +
> > /* This looks in the virtqueue and for the first available buffer, and converts
> > * it to an iovec for convenient access. Since descriptors consist of some
> > * number of output then some number of input descriptors, it's actually two
> > @@ -2540,7 +2342,7 @@ static int fetch_descs(struct vhost_virtqueue *vq)
> > * This function returns the descriptor number found, or vq->num (which is
> > * never a valid descriptor number) if none was found. A negative code is
> > * returned on error. */
> > -int vhost_get_vq_desc_batch(struct vhost_virtqueue *vq,
> > +int vhost_get_vq_desc(struct vhost_virtqueue *vq,
> > struct iovec iov[], unsigned int iov_size,
> > unsigned int *out_num, unsigned int *in_num,
> > struct vhost_log *log, unsigned int *log_num)
> > @@ -2549,7 +2351,7 @@ int vhost_get_vq_desc_batch(struct vhost_virtqueue *vq,
> > int i;
> > if (ret <= 0)
> > - goto err_fetch;
> > + goto err;
> > /* Now convert to IOV */
> > /* When we start there are none of either input nor output. */
> > @@ -2557,7 +2359,7 @@ int vhost_get_vq_desc_batch(struct vhost_virtqueue *vq,
> > if (unlikely(log))
> > *log_num = 0;
> > - for (i = 0; i < vq->ndescs; ++i) {
> > + for (i = vq->first_desc; i < vq->ndescs; ++i) {
> > unsigned iov_count = *in_num + *out_num;
> > struct vhost_desc *desc = &vq->descs[i];
> > int access;
> > @@ -2603,24 +2405,26 @@ int vhost_get_vq_desc_batch(struct vhost_virtqueue *vq,
> > }
> > ret = desc->id;
> > +
> > + if (!(desc->flags & VRING_DESC_F_NEXT))
> > + break;
> > }
> > - vq->ndescs = 0;
> > + vq->first_desc = i + 1;
> > return ret;
> > err:
> > - vhost_discard_vq_desc(vq, 1);
> > -err_fetch:
> > - vq->ndescs = 0;
> > + unfetch_descs(vq);
> > return ret;
> > }
> > -EXPORT_SYMBOL_GPL(vhost_get_vq_desc_batch);
> > +EXPORT_SYMBOL_GPL(vhost_get_vq_desc);
> > /* Reverse the effect of vhost_get_vq_desc. Useful for error handling. */
> > void vhost_discard_vq_desc(struct vhost_virtqueue *vq, int n)
> > {
> > + unfetch_descs(vq);
> > vq->last_avail_idx -= n;
>
>
> So unfetch_descs() has decreased last_avail_idx.
> Can we fix this by letting unfetch_descs() return the number and then we can
> do:
>
> int d = unfetch_descs(vq);
> vq->last_avail_idx -= (n > d) ? n - d: 0;
>
> Thanks
That's intentional I think - we need both.
Unfetch_descs drops the descriptors in the cache that were
*not returned to caller* through get_vq_desc.
vhost_discard_vq_desc drops the ones that were returned through get_vq_desc.
Did I miss anything?
>
> > }
> > EXPORT_SYMBOL_GPL(vhost_discard_vq_desc);
> > diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> > index 87089d51490d..fed36af5c444 100644
> > --- a/drivers/vhost/vhost.h
> > +++ b/drivers/vhost/vhost.h
> > @@ -81,6 +81,7 @@ struct vhost_virtqueue {
> > struct vhost_desc *descs;
> > int ndescs;
> > + int first_desc;
> > int max_descs;
> > struct file *kick;
> > @@ -189,10 +190,6 @@ long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *arg
> > bool vhost_vq_access_ok(struct vhost_virtqueue *vq);
> > bool vhost_log_access_ok(struct vhost_dev *);
> > -int vhost_get_vq_desc_batch(struct vhost_virtqueue *,
> > - struct iovec iov[], unsigned int iov_count,
> > - unsigned int *out_num, unsigned int *in_num,
> > - struct vhost_log *log, unsigned int *log_num);
> > int vhost_get_vq_desc(struct vhost_virtqueue *,
> > struct iovec iov[], unsigned int iov_count,
> > unsigned int *out_num, unsigned int *in_num,
> > @@ -261,6 +258,8 @@ static inline void vhost_vq_set_backend(struct vhost_virtqueue *vq,
> > void *private_data)
> > {
> > vq->private_data = private_data;
> > + vq->ndescs = 0;
> > + vq->first_desc = 0;
> > }
> > /**
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH RFC v6 02/11] vhost: use batched get_vq_desc version
2020-06-10 11:05 ` Michael S. Tsirkin
@ 2020-06-11 3:02 ` Jason Wang
2020-06-11 9:06 ` Michael S. Tsirkin
0 siblings, 1 reply; 20+ messages in thread
From: Jason Wang @ 2020-06-11 3:02 UTC (permalink / raw)
To: Michael S. Tsirkin; +Cc: linux-kernel, kvm, virtualization, netdev, eperezma
On 2020/6/10 下午7:05, Michael S. Tsirkin wrote:
>>> +EXPORT_SYMBOL_GPL(vhost_get_vq_desc);
>>> /* Reverse the effect of vhost_get_vq_desc. Useful for error handling. */
>>> void vhost_discard_vq_desc(struct vhost_virtqueue *vq, int n)
>>> {
>>> + unfetch_descs(vq);
>>> vq->last_avail_idx -= n;
>> So unfetch_descs() has decreased last_avail_idx.
>> Can we fix this by letting unfetch_descs() return the number and then we can
>> do:
>>
>> int d = unfetch_descs(vq);
>> vq->last_avail_idx -= (n > d) ? n - d: 0;
>>
>> Thanks
> That's intentional I think - we need both.
Yes, but:
>
> Unfetch_descs drops the descriptors in the cache that were
> *not returned to caller* through get_vq_desc.
>
> vhost_discard_vq_desc drops the ones that were returned through get_vq_desc.
>
> Did I miss anything?
We could count some descriptors twice, consider the case e.g we only
cache on descriptor:
fetch_descs()
fetch_buf()
last_avail_idx++;
Then we want do discard it:
vhost_discard_avail_buf(1)
unfetch_descs()
last_avail_idx--;
last_avail_idx -= 1;
Thanks
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH RFC v6 02/11] vhost: use batched get_vq_desc version
2020-06-11 3:02 ` Jason Wang
@ 2020-06-11 9:06 ` Michael S. Tsirkin
2020-06-15 2:43 ` Jason Wang
0 siblings, 1 reply; 20+ messages in thread
From: Michael S. Tsirkin @ 2020-06-11 9:06 UTC (permalink / raw)
To: Jason Wang; +Cc: linux-kernel, kvm, virtualization, netdev, eperezma
On Thu, Jun 11, 2020 at 11:02:57AM +0800, Jason Wang wrote:
>
> On 2020/6/10 下午7:05, Michael S. Tsirkin wrote:
> > > > +EXPORT_SYMBOL_GPL(vhost_get_vq_desc);
> > > > /* Reverse the effect of vhost_get_vq_desc. Useful for error handling. */
> > > > void vhost_discard_vq_desc(struct vhost_virtqueue *vq, int n)
> > > > {
> > > > + unfetch_descs(vq);
> > > > vq->last_avail_idx -= n;
> > > So unfetch_descs() has decreased last_avail_idx.
> > > Can we fix this by letting unfetch_descs() return the number and then we can
> > > do:
> > >
> > > int d = unfetch_descs(vq);
> > > vq->last_avail_idx -= (n > d) ? n - d: 0;
> > >
> > > Thanks
> > That's intentional I think - we need both.
>
>
> Yes, but:
>
>
> >
> > Unfetch_descs drops the descriptors in the cache that were
> > *not returned to caller* through get_vq_desc.
> >
> > vhost_discard_vq_desc drops the ones that were returned through get_vq_desc.
> >
> > Did I miss anything?
>
> We could count some descriptors twice, consider the case e.g we only cache
> on descriptor:
>
> fetch_descs()
> fetch_buf()
> last_avail_idx++;
>
> Then we want do discard it:
> vhost_discard_avail_buf(1)
> unfetch_descs()
> last_avail_idx--;
> last_avail_idx -= 1;
>
> Thanks
I don't think that happens. vhost_discard_avail_buf(1) is only called
after get vhost_get_avail_buf. vhost_get_avail_buf increments
first_desc. unfetch_descs only counts from first_desc to ndescs.
If I'm wrong, could you show values of first_desc and ndescs in this
scenario?
--
MST
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH RFC v6 02/11] vhost: use batched get_vq_desc version
2020-06-11 9:06 ` Michael S. Tsirkin
@ 2020-06-15 2:43 ` Jason Wang
0 siblings, 0 replies; 20+ messages in thread
From: Jason Wang @ 2020-06-15 2:43 UTC (permalink / raw)
To: Michael S. Tsirkin; +Cc: linux-kernel, kvm, virtualization, netdev, eperezma
On 2020/6/11 下午5:06, Michael S. Tsirkin wrote:
> On Thu, Jun 11, 2020 at 11:02:57AM +0800, Jason Wang wrote:
>> On 2020/6/10 下午7:05, Michael S. Tsirkin wrote:
>>>>> +EXPORT_SYMBOL_GPL(vhost_get_vq_desc);
>>>>> /* Reverse the effect of vhost_get_vq_desc. Useful for error handling. */
>>>>> void vhost_discard_vq_desc(struct vhost_virtqueue *vq, int n)
>>>>> {
>>>>> + unfetch_descs(vq);
>>>>> vq->last_avail_idx -= n;
>>>> So unfetch_descs() has decreased last_avail_idx.
>>>> Can we fix this by letting unfetch_descs() return the number and then we can
>>>> do:
>>>>
>>>> int d = unfetch_descs(vq);
>>>> vq->last_avail_idx -= (n > d) ? n - d: 0;
>>>>
>>>> Thanks
>>> That's intentional I think - we need both.
>>
>> Yes, but:
>>
>>
>>> Unfetch_descs drops the descriptors in the cache that were
>>> *not returned to caller* through get_vq_desc.
>>>
>>> vhost_discard_vq_desc drops the ones that were returned through get_vq_desc.
>>>
>>> Did I miss anything?
>> We could count some descriptors twice, consider the case e.g we only cache
>> on descriptor:
>>
>> fetch_descs()
>> fetch_buf()
>> last_avail_idx++;
>>
>> Then we want do discard it:
>> vhost_discard_avail_buf(1)
>> unfetch_descs()
>> last_avail_idx--;
>> last_avail_idx -= 1;
>>
>> Thanks
>
> I don't think that happens. vhost_discard_avail_buf(1) is only called
> after get vhost_get_avail_buf. vhost_get_avail_buf increments
> first_desc. unfetch_descs only counts from first_desc to ndescs.
>
> If I'm wrong, could you show values of first_desc and ndescs in this
> scenario?
You're right, fetch_descriptor could not be called directly from the
device code.
Thanks
>
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH RFC v6 02/11] vhost: use batched get_vq_desc version
2020-06-08 12:52 ` [PATCH RFC v6 02/11] vhost: use batched get_vq_desc version Michael S. Tsirkin
2020-06-10 3:14 ` Jason Wang
@ 2020-06-10 11:24 ` Eugenio Perez Martin
1 sibling, 0 replies; 20+ messages in thread
From: Eugenio Perez Martin @ 2020-06-10 11:24 UTC (permalink / raw)
To: Michael S. Tsirkin
Cc: linux-kernel, kvm list, virtualization, netdev, Jason Wang
On Mon, Jun 8, 2020 at 2:53 PM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> As testing shows no performance change, switch to that now.
>
> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
> Link: https://lore.kernel.org/r/20200401183118.8334-3-eperezma@redhat.com
> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> ---
> drivers/vhost/test.c | 2 +-
> drivers/vhost/vhost.c | 318 ++++++++----------------------------------
> drivers/vhost/vhost.h | 7 +-
> 3 files changed, 65 insertions(+), 262 deletions(-)
>
> diff --git a/drivers/vhost/test.c b/drivers/vhost/test.c
> index 0466921f4772..7d69778aaa26 100644
> --- a/drivers/vhost/test.c
> +++ b/drivers/vhost/test.c
> @@ -119,7 +119,7 @@ static int vhost_test_open(struct inode *inode, struct file *f)
> dev = &n->dev;
> vqs[VHOST_TEST_VQ] = &n->vqs[VHOST_TEST_VQ];
> n->vqs[VHOST_TEST_VQ].handle_kick = handle_vq_kick;
> - vhost_dev_init(dev, vqs, VHOST_TEST_VQ_MAX, UIO_MAXIOV,
> + vhost_dev_init(dev, vqs, VHOST_TEST_VQ_MAX, UIO_MAXIOV + 64,
> VHOST_TEST_PKT_WEIGHT, VHOST_TEST_WEIGHT, true, NULL);
>
> f->private_data = n;
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index 180b7b58c76b..41d6b132c234 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -304,6 +304,7 @@ static void vhost_vq_reset(struct vhost_dev *dev,
> {
> vq->num = 1;
> vq->ndescs = 0;
> + vq->first_desc = 0;
> vq->desc = NULL;
> vq->avail = NULL;
> vq->used = NULL;
> @@ -372,6 +373,11 @@ static int vhost_worker(void *data)
> return 0;
> }
>
> +static int vhost_vq_num_batch_descs(struct vhost_virtqueue *vq)
> +{
> + return vq->max_descs - UIO_MAXIOV;
> +}
> +
> static void vhost_vq_free_iovecs(struct vhost_virtqueue *vq)
> {
> kfree(vq->descs);
> @@ -394,6 +400,9 @@ static long vhost_dev_alloc_iovecs(struct vhost_dev *dev)
> for (i = 0; i < dev->nvqs; ++i) {
> vq = dev->vqs[i];
> vq->max_descs = dev->iov_limit;
> + if (vhost_vq_num_batch_descs(vq) < 0) {
> + return -EINVAL;
> + }
> vq->descs = kmalloc_array(vq->max_descs,
> sizeof(*vq->descs),
> GFP_KERNEL);
> @@ -1610,6 +1619,7 @@ long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *arg
> vq->last_avail_idx = s.num;
> /* Forget the cached index value. */
> vq->avail_idx = vq->last_avail_idx;
> + vq->ndescs = vq->first_desc = 0;
> break;
> case VHOST_GET_VRING_BASE:
> s.index = idx;
> @@ -2078,253 +2088,6 @@ static unsigned next_desc(struct vhost_virtqueue *vq, struct vring_desc *desc)
> return next;
> }
>
> -static int get_indirect(struct vhost_virtqueue *vq,
> - struct iovec iov[], unsigned int iov_size,
> - unsigned int *out_num, unsigned int *in_num,
> - struct vhost_log *log, unsigned int *log_num,
> - struct vring_desc *indirect)
> -{
> - struct vring_desc desc;
> - unsigned int i = 0, count, found = 0;
> - u32 len = vhost32_to_cpu(vq, indirect->len);
> - struct iov_iter from;
> - int ret, access;
> -
> - /* Sanity check */
> - if (unlikely(len % sizeof desc)) {
> - vq_err(vq, "Invalid length in indirect descriptor: "
> - "len 0x%llx not multiple of 0x%zx\n",
> - (unsigned long long)len,
> - sizeof desc);
> - return -EINVAL;
> - }
> -
> - ret = translate_desc(vq, vhost64_to_cpu(vq, indirect->addr), len, vq->indirect,
> - UIO_MAXIOV, VHOST_ACCESS_RO);
> - if (unlikely(ret < 0)) {
> - if (ret != -EAGAIN)
> - vq_err(vq, "Translation failure %d in indirect.\n", ret);
> - return ret;
> - }
> - iov_iter_init(&from, READ, vq->indirect, ret, len);
> -
> - /* We will use the result as an address to read from, so most
> - * architectures only need a compiler barrier here. */
> - read_barrier_depends();
> -
> - count = len / sizeof desc;
> - /* Buffers are chained via a 16 bit next field, so
> - * we can have at most 2^16 of these. */
> - if (unlikely(count > USHRT_MAX + 1)) {
> - vq_err(vq, "Indirect buffer length too big: %d\n",
> - indirect->len);
> - return -E2BIG;
> - }
> -
> - do {
> - unsigned iov_count = *in_num + *out_num;
> - if (unlikely(++found > count)) {
> - vq_err(vq, "Loop detected: last one at %u "
> - "indirect size %u\n",
> - i, count);
> - return -EINVAL;
> - }
> - if (unlikely(!copy_from_iter_full(&desc, sizeof(desc), &from))) {
> - vq_err(vq, "Failed indirect descriptor: idx %d, %zx\n",
> - i, (size_t)vhost64_to_cpu(vq, indirect->addr) + i * sizeof desc);
> - return -EINVAL;
> - }
> - if (unlikely(desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_INDIRECT))) {
> - vq_err(vq, "Nested indirect descriptor: idx %d, %zx\n",
> - i, (size_t)vhost64_to_cpu(vq, indirect->addr) + i * sizeof desc);
> - return -EINVAL;
> - }
> -
> - if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_WRITE))
> - access = VHOST_ACCESS_WO;
> - else
> - access = VHOST_ACCESS_RO;
> -
> - ret = translate_desc(vq, vhost64_to_cpu(vq, desc.addr),
> - vhost32_to_cpu(vq, desc.len), iov + iov_count,
> - iov_size - iov_count, access);
> - if (unlikely(ret < 0)) {
> - if (ret != -EAGAIN)
> - vq_err(vq, "Translation failure %d indirect idx %d\n",
> - ret, i);
> - return ret;
> - }
> - /* If this is an input descriptor, increment that count. */
> - if (access == VHOST_ACCESS_WO) {
> - *in_num += ret;
> - if (unlikely(log && ret)) {
> - log[*log_num].addr = vhost64_to_cpu(vq, desc.addr);
> - log[*log_num].len = vhost32_to_cpu(vq, desc.len);
> - ++*log_num;
> - }
> - } else {
> - /* If it's an output descriptor, they're all supposed
> - * to come before any input descriptors. */
> - if (unlikely(*in_num)) {
> - vq_err(vq, "Indirect descriptor "
> - "has out after in: idx %d\n", i);
> - return -EINVAL;
> - }
> - *out_num += ret;
> - }
> - } while ((i = next_desc(vq, &desc)) != -1);
> - return 0;
> -}
> -
> -/* This looks in the virtqueue and for the first available buffer, and converts
> - * it to an iovec for convenient access. Since descriptors consist of some
> - * number of output then some number of input descriptors, it's actually two
> - * iovecs, but we pack them into one and note how many of each there were.
> - *
> - * This function returns the descriptor number found, or vq->num (which is
> - * never a valid descriptor number) if none was found. A negative code is
> - * returned on error. */
> -int vhost_get_vq_desc(struct vhost_virtqueue *vq,
> - struct iovec iov[], unsigned int iov_size,
> - unsigned int *out_num, unsigned int *in_num,
> - struct vhost_log *log, unsigned int *log_num)
> -{
> - struct vring_desc desc;
> - unsigned int i, head, found = 0;
> - u16 last_avail_idx;
> - __virtio16 avail_idx;
> - __virtio16 ring_head;
> - int ret, access;
> -
> - /* Check it isn't doing very strange things with descriptor numbers. */
> - last_avail_idx = vq->last_avail_idx;
> -
> - if (vq->avail_idx == vq->last_avail_idx) {
> - if (unlikely(vhost_get_avail_idx(vq, &avail_idx))) {
> - vq_err(vq, "Failed to access avail idx at %p\n",
> - &vq->avail->idx);
> - return -EFAULT;
> - }
> - vq->avail_idx = vhost16_to_cpu(vq, avail_idx);
> -
> - if (unlikely((u16)(vq->avail_idx - last_avail_idx) > vq->num)) {
> - vq_err(vq, "Guest moved used index from %u to %u",
> - last_avail_idx, vq->avail_idx);
> - return -EFAULT;
> - }
> -
> - /* If there's nothing new since last we looked, return
> - * invalid.
> - */
> - if (vq->avail_idx == last_avail_idx)
> - return vq->num;
> -
> - /* Only get avail ring entries after they have been
> - * exposed by guest.
> - */
> - smp_rmb();
> - }
> -
> - /* Grab the next descriptor number they're advertising, and increment
> - * the index we've seen. */
> - if (unlikely(vhost_get_avail_head(vq, &ring_head, last_avail_idx))) {
> - vq_err(vq, "Failed to read head: idx %d address %p\n",
> - last_avail_idx,
> - &vq->avail->ring[last_avail_idx % vq->num]);
> - return -EFAULT;
> - }
> -
> - head = vhost16_to_cpu(vq, ring_head);
> -
> - /* If their number is silly, that's an error. */
> - if (unlikely(head >= vq->num)) {
> - vq_err(vq, "Guest says index %u > %u is available",
> - head, vq->num);
> - return -EINVAL;
> - }
> -
> - /* When we start there are none of either input nor output. */
> - *out_num = *in_num = 0;
> - if (unlikely(log))
> - *log_num = 0;
> -
> - i = head;
> - do {
> - unsigned iov_count = *in_num + *out_num;
> - if (unlikely(i >= vq->num)) {
> - vq_err(vq, "Desc index is %u > %u, head = %u",
> - i, vq->num, head);
> - return -EINVAL;
> - }
> - if (unlikely(++found > vq->num)) {
> - vq_err(vq, "Loop detected: last one at %u "
> - "vq size %u head %u\n",
> - i, vq->num, head);
> - return -EINVAL;
> - }
> - ret = vhost_get_desc(vq, &desc, i);
> - if (unlikely(ret)) {
> - vq_err(vq, "Failed to get descriptor: idx %d addr %p\n",
> - i, vq->desc + i);
> - return -EFAULT;
> - }
> - if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_INDIRECT)) {
> - ret = get_indirect(vq, iov, iov_size,
> - out_num, in_num,
> - log, log_num, &desc);
> - if (unlikely(ret < 0)) {
> - if (ret != -EAGAIN)
> - vq_err(vq, "Failure detected "
> - "in indirect descriptor at idx %d\n", i);
> - return ret;
> - }
> - continue;
> - }
> -
> - if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_WRITE))
> - access = VHOST_ACCESS_WO;
> - else
> - access = VHOST_ACCESS_RO;
> - ret = translate_desc(vq, vhost64_to_cpu(vq, desc.addr),
> - vhost32_to_cpu(vq, desc.len), iov + iov_count,
> - iov_size - iov_count, access);
> - if (unlikely(ret < 0)) {
> - if (ret != -EAGAIN)
> - vq_err(vq, "Translation failure %d descriptor idx %d\n",
> - ret, i);
> - return ret;
> - }
> - if (access == VHOST_ACCESS_WO) {
> - /* If this is an input descriptor,
> - * increment that count. */
> - *in_num += ret;
> - if (unlikely(log && ret)) {
> - log[*log_num].addr = vhost64_to_cpu(vq, desc.addr);
> - log[*log_num].len = vhost32_to_cpu(vq, desc.len);
> - ++*log_num;
> - }
> - } else {
> - /* If it's an output descriptor, they're all supposed
> - * to come before any input descriptors. */
> - if (unlikely(*in_num)) {
> - vq_err(vq, "Descriptor has out after in: "
> - "idx %d\n", i);
> - return -EINVAL;
> - }
> - *out_num += ret;
> - }
> - } while ((i = next_desc(vq, &desc)) != -1);
> -
> - /* On success, increment avail index. */
> - vq->last_avail_idx++;
> -
> - /* Assume notifications from guest are disabled at this point,
> - * if they aren't we would need to update avail_event index. */
> - BUG_ON(!(vq->used_flags & VRING_USED_F_NO_NOTIFY));
> - return head;
> -}
> -EXPORT_SYMBOL_GPL(vhost_get_vq_desc);
> -
> static struct vhost_desc *peek_split_desc(struct vhost_virtqueue *vq)
> {
> BUG_ON(!vq->ndescs);
> @@ -2428,7 +2191,7 @@ static int fetch_indirect_descs(struct vhost_virtqueue *vq,
>
> /* This function returns a value > 0 if a descriptor was found, or 0 if none were found.
> * A negative code is returned on error. */
> -static int fetch_descs(struct vhost_virtqueue *vq)
> +static int (struct vhost_virtqueue *vq)
> {
> unsigned int i, head, found = 0;
> struct vhost_desc *last;
> @@ -2441,7 +2204,11 @@ static int fetch_descs(struct vhost_virtqueue *vq)
> /* Check it isn't doing very strange things with descriptor numbers. */
> last_avail_idx = vq->last_avail_idx;
>
> - if (vq->avail_idx == vq->last_avail_idx) {
> + if (unlikely(vq->avail_idx == vq->last_avail_idx)) {
> + /* If we already have work to do, don't bother re-checking. */
> + if (likely(vq->ndescs))
> + return 1;
If this path is taken and vq->ndescs < vhost_vq_num_batch_descs, the
for{} in fetch_descs will never exit. Do we want to accumulate as many
buffers as possible, or to return them early? Maybe to find a proper
threshold?
(Still testing next commits)
> +
> if (unlikely(vhost_get_avail_idx(vq, &avail_idx))) {
> vq_err(vq, "Failed to access avail idx at %p\n",
> &vq->avail->idx);
> @@ -2532,6 +2299,41 @@ static int fetch_descs(struct vhost_virtqueue *vq)
> return 1;
> }
>
> +/* This function returns a value > 0 if a descriptor was found, or 0 if none were found.
> + * A negative code is returned on error. */
> +static int fetch_descs(struct vhost_virtqueue *vq)
> +{
> + int ret;
> +
> + if (unlikely(vq->first_desc >= vq->ndescs)) {
> + vq->first_desc = 0;
> + vq->ndescs = 0;
> + }
> +
> + if (vq->ndescs)
> + return 1;
> +
> + for (ret = 1;
> + ret > 0 && vq->ndescs <= vhost_vq_num_batch_descs(vq);
> + ret = fetch_buf(vq))
> + ;
> +
> + /* On success we expect some descs */
> + BUG_ON(ret > 0 && !vq->ndescs);
> + return ret;
> +}
> +
> +/* Reverse the effects of fetch_descs */
> +static void unfetch_descs(struct vhost_virtqueue *vq)
> +{
> + int i;
> +
> + for (i = vq->first_desc; i < vq->ndescs; ++i)
> + if (!(vq->descs[i].flags & VRING_DESC_F_NEXT))
> + vq->last_avail_idx -= 1;
> + vq->ndescs = 0;
> +}
> +
> /* This looks in the virtqueue and for the first available buffer, and converts
> * it to an iovec for convenient access. Since descriptors consist of some
> * number of output then some number of input descriptors, it's actually two
> @@ -2540,7 +2342,7 @@ static int fetch_descs(struct vhost_virtqueue *vq)
> * This function returns the descriptor number found, or vq->num (which is
> * never a valid descriptor number) if none was found. A negative code is
> * returned on error. */
> -int vhost_get_vq_desc_batch(struct vhost_virtqueue *vq,
> +int vhost_get_vq_desc(struct vhost_virtqueue *vq,
> struct iovec iov[], unsigned int iov_size,
> unsigned int *out_num, unsigned int *in_num,
> struct vhost_log *log, unsigned int *log_num)
> @@ -2549,7 +2351,7 @@ int vhost_get_vq_desc_batch(struct vhost_virtqueue *vq,
> int i;
>
> if (ret <= 0)
> - goto err_fetch;
> + goto err;
This return needs to differentiate between fetch_desc returning 0 (in
that case, it needs to return vq->num) and fetch_descs < 0 (goto err).
(Already noted by you in different thread, but pointing here just to
not forget it).
>
> /* Now convert to IOV */
> /* When we start there are none of either input nor output. */
> @@ -2557,7 +2359,7 @@ int vhost_get_vq_desc_batch(struct vhost_virtqueue *vq,
> if (unlikely(log))
> *log_num = 0;
>
> - for (i = 0; i < vq->ndescs; ++i) {
> + for (i = vq->first_desc; i < vq->ndescs; ++i) {
> unsigned iov_count = *in_num + *out_num;
> struct vhost_desc *desc = &vq->descs[i];
> int access;
> @@ -2603,24 +2405,26 @@ int vhost_get_vq_desc_batch(struct vhost_virtqueue *vq,
> }
>
> ret = desc->id;
> +
> + if (!(desc->flags & VRING_DESC_F_NEXT))
> + break;
> }
>
> - vq->ndescs = 0;
> + vq->first_desc = i + 1;
>
> return ret;
>
> err:
> - vhost_discard_vq_desc(vq, 1);
> -err_fetch:
> - vq->ndescs = 0;
> + unfetch_descs(vq);
>
> return ret;
> }
> -EXPORT_SYMBOL_GPL(vhost_get_vq_desc_batch);
> +EXPORT_SYMBOL_GPL(vhost_get_vq_desc);
>
> /* Reverse the effect of vhost_get_vq_desc. Useful for error handling. */
> void vhost_discard_vq_desc(struct vhost_virtqueue *vq, int n)
> {
> + unfetch_descs(vq);
> vq->last_avail_idx -= n;
> }
> EXPORT_SYMBOL_GPL(vhost_discard_vq_desc);
> diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> index 87089d51490d..fed36af5c444 100644
> --- a/drivers/vhost/vhost.h
> +++ b/drivers/vhost/vhost.h
> @@ -81,6 +81,7 @@ struct vhost_virtqueue {
>
> struct vhost_desc *descs;
> int ndescs;
> + int first_desc;
> int max_descs;
>
> struct file *kick;
> @@ -189,10 +190,6 @@ long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *arg
> bool vhost_vq_access_ok(struct vhost_virtqueue *vq);
> bool vhost_log_access_ok(struct vhost_dev *);
>
> -int vhost_get_vq_desc_batch(struct vhost_virtqueue *,
> - struct iovec iov[], unsigned int iov_count,
> - unsigned int *out_num, unsigned int *in_num,
> - struct vhost_log *log, unsigned int *log_num);
> int vhost_get_vq_desc(struct vhost_virtqueue *,
> struct iovec iov[], unsigned int iov_count,
> unsigned int *out_num, unsigned int *in_num,
> @@ -261,6 +258,8 @@ static inline void vhost_vq_set_backend(struct vhost_virtqueue *vq,
> void *private_data)
> {
> vq->private_data = private_data;
> + vq->ndescs = 0;
> + vq->first_desc = 0;
> }
>
> /**
> --
> MST
>
^ permalink raw reply [flat|nested] 20+ messages in thread
* [PATCH RFC v6 03/11] vhost/net: pass net specific struct pointer
2020-06-08 12:52 [PATCH RFC v6 00/11] vhost: ring format independence Michael S. Tsirkin
2020-06-08 12:52 ` [PATCH RFC v6 01/11] vhost: option to fetch descriptors through an independent struct Michael S. Tsirkin
2020-06-08 12:52 ` [PATCH RFC v6 02/11] vhost: use batched get_vq_desc version Michael S. Tsirkin
@ 2020-06-08 12:52 ` Michael S. Tsirkin
2020-06-08 12:53 ` [PATCH RFC v6 04/11] vhost: reorder functions Michael S. Tsirkin
` (8 subsequent siblings)
11 siblings, 0 replies; 20+ messages in thread
From: Michael S. Tsirkin @ 2020-06-08 12:52 UTC (permalink / raw)
To: linux-kernel; +Cc: kvm, virtualization, netdev, Jason Wang, eperezma
In preparation for further cleanup, pass net specific pointer
to ubuf callbacks so we can move net specific fields
out to net structures.
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
drivers/vhost/net.c | 14 +++++++-------
1 file changed, 7 insertions(+), 7 deletions(-)
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index bf5e1d81ae25..ff594eec8ae3 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -94,7 +94,7 @@ struct vhost_net_ubuf_ref {
*/
atomic_t refcount;
wait_queue_head_t wait;
- struct vhost_virtqueue *vq;
+ struct vhost_net_virtqueue *nvq;
};
#define VHOST_NET_BATCH 64
@@ -231,7 +231,7 @@ static void vhost_net_enable_zcopy(int vq)
}
static struct vhost_net_ubuf_ref *
-vhost_net_ubuf_alloc(struct vhost_virtqueue *vq, bool zcopy)
+vhost_net_ubuf_alloc(struct vhost_net_virtqueue *nvq, bool zcopy)
{
struct vhost_net_ubuf_ref *ubufs;
/* No zero copy backend? Nothing to count. */
@@ -242,7 +242,7 @@ vhost_net_ubuf_alloc(struct vhost_virtqueue *vq, bool zcopy)
return ERR_PTR(-ENOMEM);
atomic_set(&ubufs->refcount, 1);
init_waitqueue_head(&ubufs->wait);
- ubufs->vq = vq;
+ ubufs->nvq = nvq;
return ubufs;
}
@@ -384,13 +384,13 @@ static void vhost_zerocopy_signal_used(struct vhost_net *net,
static void vhost_zerocopy_callback(struct ubuf_info *ubuf, bool success)
{
struct vhost_net_ubuf_ref *ubufs = ubuf->ctx;
- struct vhost_virtqueue *vq = ubufs->vq;
+ struct vhost_net_virtqueue *nvq = ubufs->nvq;
int cnt;
rcu_read_lock_bh();
/* set len to mark this desc buffers done DMA */
- vq->heads[ubuf->desc].len = success ?
+ nvq->vq.heads[ubuf->desc].in_len = success ?
VHOST_DMA_DONE_LEN : VHOST_DMA_FAILED_LEN;
cnt = vhost_net_ubuf_put(ubufs);
@@ -402,7 +402,7 @@ static void vhost_zerocopy_callback(struct ubuf_info *ubuf, bool success)
* less than 10% of times).
*/
if (cnt <= 1 || !(cnt % 16))
- vhost_poll_queue(&vq->poll);
+ vhost_poll_queue(&nvq->vq.poll);
rcu_read_unlock_bh();
}
@@ -1525,7 +1525,7 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
/* start polling new socket */
oldsock = vhost_vq_get_backend(vq);
if (sock != oldsock) {
- ubufs = vhost_net_ubuf_alloc(vq,
+ ubufs = vhost_net_ubuf_alloc(nvq,
sock && vhost_sock_zcopy(sock));
if (IS_ERR(ubufs)) {
r = PTR_ERR(ubufs);
--
MST
^ permalink raw reply related [flat|nested] 20+ messages in thread
* [PATCH RFC v6 04/11] vhost: reorder functions
2020-06-08 12:52 [PATCH RFC v6 00/11] vhost: ring format independence Michael S. Tsirkin
` (2 preceding siblings ...)
2020-06-08 12:52 ` [PATCH RFC v6 03/11] vhost/net: pass net specific struct pointer Michael S. Tsirkin
@ 2020-06-08 12:53 ` Michael S. Tsirkin
2020-06-08 12:53 ` [PATCH RFC v6 05/11] vhost: format-independent API for used buffers Michael S. Tsirkin
` (7 subsequent siblings)
11 siblings, 0 replies; 20+ messages in thread
From: Michael S. Tsirkin @ 2020-06-08 12:53 UTC (permalink / raw)
To: linux-kernel; +Cc: kvm, virtualization, netdev, Jason Wang, eperezma
Reorder functions in the file to not rely on forward
declarations, in preparation to making them static
down the road.
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
drivers/vhost/vhost.c | 26 +++++++++++++-------------
1 file changed, 13 insertions(+), 13 deletions(-)
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 41d6b132c234..334529ebecab 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -2429,19 +2429,6 @@ void vhost_discard_vq_desc(struct vhost_virtqueue *vq, int n)
}
EXPORT_SYMBOL_GPL(vhost_discard_vq_desc);
-/* After we've used one of their buffers, we tell them about it. We'll then
- * want to notify the guest, using eventfd. */
-int vhost_add_used(struct vhost_virtqueue *vq, unsigned int head, int len)
-{
- struct vring_used_elem heads = {
- cpu_to_vhost32(vq, head),
- cpu_to_vhost32(vq, len)
- };
-
- return vhost_add_used_n(vq, &heads, 1);
-}
-EXPORT_SYMBOL_GPL(vhost_add_used);
-
static int __vhost_add_used_n(struct vhost_virtqueue *vq,
struct vring_used_elem *heads,
unsigned count)
@@ -2511,6 +2498,19 @@ int vhost_add_used_n(struct vhost_virtqueue *vq, struct vring_used_elem *heads,
}
EXPORT_SYMBOL_GPL(vhost_add_used_n);
+/* After we've used one of their buffers, we tell them about it. We'll then
+ * want to notify the guest, using eventfd. */
+int vhost_add_used(struct vhost_virtqueue *vq, unsigned int head, int len)
+{
+ struct vring_used_elem heads = {
+ cpu_to_vhost32(vq, head),
+ cpu_to_vhost32(vq, len)
+ };
+
+ return vhost_add_used_n(vq, &heads, 1);
+}
+EXPORT_SYMBOL_GPL(vhost_add_used);
+
static bool vhost_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
{
__u16 old, new;
--
MST
^ permalink raw reply related [flat|nested] 20+ messages in thread
* [PATCH RFC v6 05/11] vhost: format-independent API for used buffers
2020-06-08 12:52 [PATCH RFC v6 00/11] vhost: ring format independence Michael S. Tsirkin
` (3 preceding siblings ...)
2020-06-08 12:53 ` [PATCH RFC v6 04/11] vhost: reorder functions Michael S. Tsirkin
@ 2020-06-08 12:53 ` Michael S. Tsirkin
2020-06-08 12:53 ` [PATCH RFC v6 06/11] vhost/net: convert to new API: heads->bufs Michael S. Tsirkin
` (6 subsequent siblings)
11 siblings, 0 replies; 20+ messages in thread
From: Michael S. Tsirkin @ 2020-06-08 12:53 UTC (permalink / raw)
To: linux-kernel; +Cc: kvm, virtualization, netdev, Jason Wang, eperezma
Add a new API that doesn't assume used ring, heads, etc.
For now, we keep the old APIs around to make it easier
to convert drivers.
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
drivers/vhost/vhost.c | 52 ++++++++++++++++++++++++++++++++++---------
drivers/vhost/vhost.h | 17 +++++++++++++-
2 files changed, 58 insertions(+), 11 deletions(-)
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 334529ebecab..f4a6ff9ef77a 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -2339,13 +2339,12 @@ static void unfetch_descs(struct vhost_virtqueue *vq)
* number of output then some number of input descriptors, it's actually two
* iovecs, but we pack them into one and note how many of each there were.
*
- * This function returns the descriptor number found, or vq->num (which is
- * never a valid descriptor number) if none was found. A negative code is
- * returned on error. */
-int vhost_get_vq_desc(struct vhost_virtqueue *vq,
- struct iovec iov[], unsigned int iov_size,
- unsigned int *out_num, unsigned int *in_num,
- struct vhost_log *log, unsigned int *log_num)
+ * This function returns a value > 0 if a descriptor was found, or 0 if none were found.
+ * A negative code is returned on error. */
+int vhost_get_avail_buf(struct vhost_virtqueue *vq, struct vhost_buf *buf,
+ struct iovec iov[], unsigned int iov_size,
+ unsigned int *out_num, unsigned int *in_num,
+ struct vhost_log *log, unsigned int *log_num)
{
int ret = fetch_descs(vq);
int i;
@@ -2358,6 +2357,8 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
*out_num = *in_num = 0;
if (unlikely(log))
*log_num = 0;
+ buf->in_len = buf->out_len = 0;
+ buf->descs = 0;
for (i = vq->first_desc; i < vq->ndescs; ++i) {
unsigned iov_count = *in_num + *out_num;
@@ -2387,6 +2388,7 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
/* If this is an input descriptor,
* increment that count. */
*in_num += ret;
+ buf->in_len += desc->len;
if (unlikely(log && ret)) {
log[*log_num].addr = desc->addr;
log[*log_num].len = desc->len;
@@ -2402,9 +2404,11 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
goto err;
}
*out_num += ret;
+ buf->out_len += desc->len;
}
- ret = desc->id;
+ buf->id = desc->id;
+ ++buf->descs;
if (!(desc->flags & VRING_DESC_F_NEXT))
break;
@@ -2412,14 +2416,22 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
vq->first_desc = i + 1;
- return ret;
+ return 1;
err:
unfetch_descs(vq);
return ret;
}
-EXPORT_SYMBOL_GPL(vhost_get_vq_desc);
+EXPORT_SYMBOL_GPL(vhost_get_avail_buf);
+
+/* Reverse the effect of vhost_get_avail_buf. Useful for error handling. */
+void vhost_discard_avail_bufs(struct vhost_virtqueue *vq,
+ struct vhost_buf *buf, unsigned count)
+{
+ vhost_discard_vq_desc(vq, count);
+}
+EXPORT_SYMBOL_GPL(vhost_discard_avail_bufs);
/* Reverse the effect of vhost_get_vq_desc. Useful for error handling. */
void vhost_discard_vq_desc(struct vhost_virtqueue *vq, int n)
@@ -2511,6 +2523,26 @@ int vhost_add_used(struct vhost_virtqueue *vq, unsigned int head, int len)
}
EXPORT_SYMBOL_GPL(vhost_add_used);
+int vhost_put_used_buf(struct vhost_virtqueue *vq, struct vhost_buf *buf)
+{
+ return vhost_add_used(vq, buf->id, buf->in_len);
+}
+EXPORT_SYMBOL_GPL(vhost_put_used_buf);
+
+int vhost_put_used_n_bufs(struct vhost_virtqueue *vq,
+ struct vhost_buf *bufs, unsigned count)
+{
+ unsigned i;
+
+ for (i = 0; i < count; ++i) {
+ vq->heads[i].id = cpu_to_vhost32(vq, bufs[i].id);
+ vq->heads[i].len = cpu_to_vhost32(vq, bufs[i].in_len);
+ }
+
+ return vhost_add_used_n(vq, vq->heads, count);
+}
+EXPORT_SYMBOL_GPL(vhost_put_used_n_bufs);
+
static bool vhost_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
{
__u16 old, new;
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index fed36af5c444..28eea0155efb 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -67,6 +67,13 @@ struct vhost_desc {
u16 id;
};
+struct vhost_buf {
+ u32 out_len;
+ u32 in_len;
+ u16 descs;
+ u16 id;
+};
+
/* The virtqueue structure describes a queue attached to a device. */
struct vhost_virtqueue {
struct vhost_dev *dev;
@@ -195,7 +202,12 @@ int vhost_get_vq_desc(struct vhost_virtqueue *,
unsigned int *out_num, unsigned int *in_num,
struct vhost_log *log, unsigned int *log_num);
void vhost_discard_vq_desc(struct vhost_virtqueue *, int n);
-
+int vhost_get_avail_buf(struct vhost_virtqueue *, struct vhost_buf *buf,
+ struct iovec iov[], unsigned int iov_count,
+ unsigned int *out_num, unsigned int *in_num,
+ struct vhost_log *log, unsigned int *log_num);
+void vhost_discard_avail_bufs(struct vhost_virtqueue *,
+ struct vhost_buf *, unsigned count);
int vhost_vq_init_access(struct vhost_virtqueue *);
int vhost_add_used(struct vhost_virtqueue *, unsigned int head, int len);
int vhost_add_used_n(struct vhost_virtqueue *, struct vring_used_elem *heads,
@@ -204,6 +216,9 @@ void vhost_add_used_and_signal(struct vhost_dev *, struct vhost_virtqueue *,
unsigned int id, int len);
void vhost_add_used_and_signal_n(struct vhost_dev *, struct vhost_virtqueue *,
struct vring_used_elem *heads, unsigned count);
+int vhost_put_used_buf(struct vhost_virtqueue *, struct vhost_buf *buf);
+int vhost_put_used_n_bufs(struct vhost_virtqueue *,
+ struct vhost_buf *bufs, unsigned count);
void vhost_signal(struct vhost_dev *, struct vhost_virtqueue *);
void vhost_disable_notify(struct vhost_dev *, struct vhost_virtqueue *);
bool vhost_vq_avail_empty(struct vhost_dev *, struct vhost_virtqueue *);
--
MST
^ permalink raw reply related [flat|nested] 20+ messages in thread
* [PATCH RFC v6 06/11] vhost/net: convert to new API: heads->bufs
2020-06-08 12:52 [PATCH RFC v6 00/11] vhost: ring format independence Michael S. Tsirkin
` (4 preceding siblings ...)
2020-06-08 12:53 ` [PATCH RFC v6 05/11] vhost: format-independent API for used buffers Michael S. Tsirkin
@ 2020-06-08 12:53 ` Michael S. Tsirkin
2020-06-08 12:53 ` [PATCH RFC v6 07/11] vhost/net: avoid iov length math Michael S. Tsirkin
` (5 subsequent siblings)
11 siblings, 0 replies; 20+ messages in thread
From: Michael S. Tsirkin @ 2020-06-08 12:53 UTC (permalink / raw)
To: linux-kernel; +Cc: kvm, virtualization, netdev, Jason Wang, eperezma
Convert vhost net to use the new format-agnostic API.
In particular, don't poke at vq internals such as the
heads array.
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
drivers/vhost/net.c | 154 +++++++++++++++++++++++---------------------
1 file changed, 82 insertions(+), 72 deletions(-)
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index ff594eec8ae3..830fe84912a5 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -59,13 +59,13 @@ MODULE_PARM_DESC(experimental_zcopytx, "Enable Zero Copy TX;"
* status internally; used for zerocopy tx only.
*/
/* Lower device DMA failed */
-#define VHOST_DMA_FAILED_LEN ((__force __virtio32)3)
+#define VHOST_DMA_FAILED_LEN (3)
/* Lower device DMA done */
-#define VHOST_DMA_DONE_LEN ((__force __virtio32)2)
+#define VHOST_DMA_DONE_LEN (2)
/* Lower device DMA in progress */
-#define VHOST_DMA_IN_PROGRESS ((__force __virtio32)1)
+#define VHOST_DMA_IN_PROGRESS (1)
/* Buffer unused */
-#define VHOST_DMA_CLEAR_LEN ((__force __virtio32)0)
+#define VHOST_DMA_CLEAR_LEN (0)
#define VHOST_DMA_IS_DONE(len) ((__force u32)(len) >= (__force u32)VHOST_DMA_DONE_LEN)
@@ -112,9 +112,12 @@ struct vhost_net_virtqueue {
/* last used idx for outstanding DMA zerocopy buffers */
int upend_idx;
/* For TX, first used idx for DMA done zerocopy buffers
- * For RX, number of batched heads
+ * For RX, number of batched bufs
*/
int done_idx;
+ /* Outstanding user bufs. UIO_MAXIOV in length. */
+ /* TODO: we can make this smaller for sure. */
+ struct vhost_buf *bufs;
/* Number of XDP frames batched */
int batched_xdp;
/* an array of userspace buffers info */
@@ -271,6 +274,8 @@ static void vhost_net_clear_ubuf_info(struct vhost_net *n)
int i;
for (i = 0; i < VHOST_NET_VQ_MAX; ++i) {
+ kfree(n->vqs[i].bufs);
+ n->vqs[i].bufs = NULL;
kfree(n->vqs[i].ubuf_info);
n->vqs[i].ubuf_info = NULL;
}
@@ -282,6 +287,12 @@ static int vhost_net_set_ubuf_info(struct vhost_net *n)
int i;
for (i = 0; i < VHOST_NET_VQ_MAX; ++i) {
+ n->vqs[i].bufs = kmalloc_array(UIO_MAXIOV,
+ sizeof(*n->vqs[i].bufs),
+ GFP_KERNEL);
+ if (!n->vqs[i].bufs)
+ goto err;
+
zcopy = vhost_net_zcopy_mask & (0x1 << i);
if (!zcopy)
continue;
@@ -364,18 +375,18 @@ static void vhost_zerocopy_signal_used(struct vhost_net *net,
int j = 0;
for (i = nvq->done_idx; i != nvq->upend_idx; i = (i + 1) % UIO_MAXIOV) {
- if (vq->heads[i].len == VHOST_DMA_FAILED_LEN)
+ if (nvq->bufs[i].in_len == VHOST_DMA_FAILED_LEN)
vhost_net_tx_err(net);
- if (VHOST_DMA_IS_DONE(vq->heads[i].len)) {
- vq->heads[i].len = VHOST_DMA_CLEAR_LEN;
+ if (VHOST_DMA_IS_DONE(nvq->bufs[i].in_len)) {
+ nvq->bufs[i].in_len = VHOST_DMA_CLEAR_LEN;
++j;
} else
break;
}
while (j) {
add = min(UIO_MAXIOV - nvq->done_idx, j);
- vhost_add_used_and_signal_n(vq->dev, vq,
- &vq->heads[nvq->done_idx], add);
+ vhost_put_used_n_bufs(vq, &nvq->bufs[nvq->done_idx], add);
+ vhost_signal(vq->dev, vq);
nvq->done_idx = (nvq->done_idx + add) % UIO_MAXIOV;
j -= add;
}
@@ -390,7 +401,7 @@ static void vhost_zerocopy_callback(struct ubuf_info *ubuf, bool success)
rcu_read_lock_bh();
/* set len to mark this desc buffers done DMA */
- nvq->vq.heads[ubuf->desc].in_len = success ?
+ nvq->bufs[ubuf->desc].in_len = success ?
VHOST_DMA_DONE_LEN : VHOST_DMA_FAILED_LEN;
cnt = vhost_net_ubuf_put(ubufs);
@@ -452,7 +463,8 @@ static void vhost_net_signal_used(struct vhost_net_virtqueue *nvq)
if (!nvq->done_idx)
return;
- vhost_add_used_and_signal_n(dev, vq, vq->heads, nvq->done_idx);
+ vhost_put_used_n_bufs(vq, nvq->bufs, nvq->done_idx);
+ vhost_signal(dev, vq);
nvq->done_idx = 0;
}
@@ -558,6 +570,7 @@ static void vhost_net_busy_poll(struct vhost_net *net,
static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
struct vhost_net_virtqueue *tnvq,
+ struct vhost_buf *buf,
unsigned int *out_num, unsigned int *in_num,
struct msghdr *msghdr, bool *busyloop_intr)
{
@@ -565,10 +578,10 @@ static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
struct vhost_virtqueue *rvq = &rnvq->vq;
struct vhost_virtqueue *tvq = &tnvq->vq;
- int r = vhost_get_vq_desc(tvq, tvq->iov, ARRAY_SIZE(tvq->iov),
- out_num, in_num, NULL, NULL);
+ int r = vhost_get_avail_buf(tvq, buf, tvq->iov, ARRAY_SIZE(tvq->iov),
+ out_num, in_num, NULL, NULL);
- if (r == tvq->num && tvq->busyloop_timeout) {
+ if (!r && tvq->busyloop_timeout) {
/* Flush batched packets first */
if (!vhost_sock_zcopy(vhost_vq_get_backend(tvq)))
vhost_tx_batch(net, tnvq,
@@ -577,8 +590,8 @@ static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
vhost_net_busy_poll(net, rvq, tvq, busyloop_intr, false);
- r = vhost_get_vq_desc(tvq, tvq->iov, ARRAY_SIZE(tvq->iov),
- out_num, in_num, NULL, NULL);
+ r = vhost_get_avail_buf(tvq, buf, tvq->iov, ARRAY_SIZE(tvq->iov),
+ out_num, in_num, NULL, NULL);
}
return r;
@@ -607,6 +620,7 @@ static size_t init_iov_iter(struct vhost_virtqueue *vq, struct iov_iter *iter,
static int get_tx_bufs(struct vhost_net *net,
struct vhost_net_virtqueue *nvq,
+ struct vhost_buf *buf,
struct msghdr *msg,
unsigned int *out, unsigned int *in,
size_t *len, bool *busyloop_intr)
@@ -614,9 +628,9 @@ static int get_tx_bufs(struct vhost_net *net,
struct vhost_virtqueue *vq = &nvq->vq;
int ret;
- ret = vhost_net_tx_get_vq_desc(net, nvq, out, in, msg, busyloop_intr);
+ ret = vhost_net_tx_get_vq_desc(net, nvq, buf, out, in, msg, busyloop_intr);
- if (ret < 0 || ret == vq->num)
+ if (ret <= 0)
return ret;
if (*in) {
@@ -761,7 +775,7 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
struct vhost_virtqueue *vq = &nvq->vq;
unsigned out, in;
- int head;
+ int ret;
struct msghdr msg = {
.msg_name = NULL,
.msg_namelen = 0,
@@ -773,6 +787,7 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
int err;
int sent_pkts = 0;
bool sock_can_batch = (sock->sk->sk_sndbuf == INT_MAX);
+ struct vhost_buf buf;
do {
bool busyloop_intr = false;
@@ -780,13 +795,13 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
if (nvq->done_idx == VHOST_NET_BATCH)
vhost_tx_batch(net, nvq, sock, &msg);
- head = get_tx_bufs(net, nvq, &msg, &out, &in, &len,
- &busyloop_intr);
+ ret = get_tx_bufs(net, nvq, &buf, &msg, &out, &in, &len,
+ &busyloop_intr);
/* On error, stop handling until the next kick. */
- if (unlikely(head < 0))
+ if (unlikely(ret < 0))
break;
/* Nothing new? Wait for eventfd to tell us they refilled. */
- if (head == vq->num) {
+ if (!ret) {
if (unlikely(busyloop_intr)) {
vhost_poll_queue(&vq->poll);
} else if (unlikely(vhost_enable_notify(&net->dev,
@@ -808,7 +823,7 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
goto done;
} else if (unlikely(err != -ENOSPC)) {
vhost_tx_batch(net, nvq, sock, &msg);
- vhost_discard_vq_desc(vq, 1);
+ vhost_discard_avail_bufs(vq, &buf, 1);
vhost_net_enable_vq(net, vq);
break;
}
@@ -829,7 +844,7 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
/* TODO: Check specific error and bomb out unless ENOBUFS? */
err = sock->ops->sendmsg(sock, &msg, len);
if (unlikely(err < 0)) {
- vhost_discard_vq_desc(vq, 1);
+ vhost_discard_avail_bufs(vq, &buf, 1);
vhost_net_enable_vq(net, vq);
break;
}
@@ -837,8 +852,7 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
pr_debug("Truncated TX packet: len %d != %zd\n",
err, len);
done:
- vq->heads[nvq->done_idx].id = cpu_to_vhost32(vq, head);
- vq->heads[nvq->done_idx].len = 0;
+ nvq->bufs[nvq->done_idx] = buf;
++nvq->done_idx;
} while (likely(!vhost_exceeds_weight(vq, ++sent_pkts, total_len)));
@@ -850,7 +864,7 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
struct vhost_virtqueue *vq = &nvq->vq;
unsigned out, in;
- int head;
+ int ret;
struct msghdr msg = {
.msg_name = NULL,
.msg_namelen = 0,
@@ -864,6 +878,7 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
struct vhost_net_ubuf_ref *uninitialized_var(ubufs);
bool zcopy_used;
int sent_pkts = 0;
+ struct vhost_buf buf;
do {
bool busyloop_intr;
@@ -872,13 +887,13 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
vhost_zerocopy_signal_used(net, vq);
busyloop_intr = false;
- head = get_tx_bufs(net, nvq, &msg, &out, &in, &len,
- &busyloop_intr);
+ ret = get_tx_bufs(net, nvq, &buf, &msg, &out, &in, &len,
+ &busyloop_intr);
/* On error, stop handling until the next kick. */
- if (unlikely(head < 0))
+ if (unlikely(ret < 0))
break;
/* Nothing new? Wait for eventfd to tell us they refilled. */
- if (head == vq->num) {
+ if (!ret) {
if (unlikely(busyloop_intr)) {
vhost_poll_queue(&vq->poll);
} else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
@@ -897,8 +912,8 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
struct ubuf_info *ubuf;
ubuf = nvq->ubuf_info + nvq->upend_idx;
- vq->heads[nvq->upend_idx].id = cpu_to_vhost32(vq, head);
- vq->heads[nvq->upend_idx].len = VHOST_DMA_IN_PROGRESS;
+ nvq->bufs[nvq->upend_idx] = buf;
+ nvq->bufs[nvq->upend_idx].in_len = VHOST_DMA_IN_PROGRESS;
ubuf->callback = vhost_zerocopy_callback;
ubuf->ctx = nvq->ubufs;
ubuf->desc = nvq->upend_idx;
@@ -930,17 +945,19 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
nvq->upend_idx = ((unsigned)nvq->upend_idx - 1)
% UIO_MAXIOV;
}
- vhost_discard_vq_desc(vq, 1);
+ vhost_discard_avail_bufs(vq, &buf, 1);
vhost_net_enable_vq(net, vq);
break;
}
if (err != len)
pr_debug("Truncated TX packet: "
" len %d != %zd\n", err, len);
- if (!zcopy_used)
- vhost_add_used_and_signal(&net->dev, vq, head, 0);
- else
+ if (!zcopy_used) {
+ vhost_put_used_buf(vq, &buf);
+ vhost_signal(&net->dev, vq);
+ } else {
vhost_zerocopy_signal_used(net, vq);
+ }
vhost_net_tx_packet(net);
} while (likely(!vhost_exceeds_weight(vq, ++sent_pkts, total_len)));
}
@@ -1004,7 +1021,7 @@ static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk,
int len = peek_head_len(rnvq, sk);
if (!len && rvq->busyloop_timeout) {
- /* Flush batched heads first */
+ /* Flush batched bufs first */
vhost_net_signal_used(rnvq);
/* Both tx vq and rx socket were polled here */
vhost_net_busy_poll(net, rvq, tvq, busyloop_intr, true);
@@ -1022,11 +1039,11 @@ static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk,
* @iovcount - returned count of io vectors we fill
* @log - vhost log
* @log_num - log offset
- * @quota - headcount quota, 1 for big buffer
- * returns number of buffer heads allocated, negative on error
+ * @quota - bufcount quota, 1 for big buffer
+ * returns number of buffers allocated, negative on error
*/
static int get_rx_bufs(struct vhost_virtqueue *vq,
- struct vring_used_elem *heads,
+ struct vhost_buf *bufs,
int datalen,
unsigned *iovcount,
struct vhost_log *log,
@@ -1035,30 +1052,24 @@ static int get_rx_bufs(struct vhost_virtqueue *vq,
{
unsigned int out, in;
int seg = 0;
- int headcount = 0;
- unsigned d;
+ int bufcount = 0;
int r, nlogs = 0;
/* len is always initialized before use since we are always called with
* datalen > 0.
*/
u32 uninitialized_var(len);
- while (datalen > 0 && headcount < quota) {
+ while (datalen > 0 && bufcount < quota) {
if (unlikely(seg >= UIO_MAXIOV)) {
r = -ENOBUFS;
goto err;
}
- r = vhost_get_vq_desc(vq, vq->iov + seg,
- ARRAY_SIZE(vq->iov) - seg, &out,
- &in, log, log_num);
- if (unlikely(r < 0))
+ r = vhost_get_avail_buf(vq, bufs + bufcount, vq->iov + seg,
+ ARRAY_SIZE(vq->iov) - seg, &out,
+ &in, log, log_num);
+ if (unlikely(r <= 0))
goto err;
- d = r;
- if (d == vq->num) {
- r = 0;
- goto err;
- }
if (unlikely(out || in <= 0)) {
vq_err(vq, "unexpected descriptor format for RX: "
"out %d, in %d\n", out, in);
@@ -1069,14 +1080,12 @@ static int get_rx_bufs(struct vhost_virtqueue *vq,
nlogs += *log_num;
log += *log_num;
}
- heads[headcount].id = cpu_to_vhost32(vq, d);
len = iov_length(vq->iov + seg, in);
- heads[headcount].len = cpu_to_vhost32(vq, len);
datalen -= len;
- ++headcount;
+ ++bufcount;
seg += in;
}
- heads[headcount - 1].len = cpu_to_vhost32(vq, len + datalen);
+ bufs[bufcount - 1].in_len = len + datalen;
*iovcount = seg;
if (unlikely(log))
*log_num = nlogs;
@@ -1086,9 +1095,9 @@ static int get_rx_bufs(struct vhost_virtqueue *vq,
r = UIO_MAXIOV + 1;
goto err;
}
- return headcount;
+ return bufcount;
err:
- vhost_discard_vq_desc(vq, headcount);
+ vhost_discard_avail_bufs(vq, bufs, bufcount);
return r;
}
@@ -1113,7 +1122,7 @@ static void handle_rx(struct vhost_net *net)
};
size_t total_len = 0;
int err, mergeable;
- s16 headcount;
+ int bufcount;
size_t vhost_hlen, sock_hlen;
size_t vhost_len, sock_len;
bool busyloop_intr = false;
@@ -1147,14 +1156,14 @@ static void handle_rx(struct vhost_net *net)
break;
sock_len += sock_hlen;
vhost_len = sock_len + vhost_hlen;
- headcount = get_rx_bufs(vq, vq->heads + nvq->done_idx,
- vhost_len, &in, vq_log, &log,
- likely(mergeable) ? UIO_MAXIOV : 1);
+ bufcount = get_rx_bufs(vq, nvq->bufs + nvq->done_idx,
+ vhost_len, &in, vq_log, &log,
+ likely(mergeable) ? UIO_MAXIOV : 1);
/* On error, stop handling until the next kick. */
- if (unlikely(headcount < 0))
+ if (unlikely(bufcount < 0))
goto out;
/* OK, now we need to know about added descriptors. */
- if (!headcount) {
+ if (!bufcount) {
if (unlikely(busyloop_intr)) {
vhost_poll_queue(&vq->poll);
} else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
@@ -1171,7 +1180,7 @@ static void handle_rx(struct vhost_net *net)
if (nvq->rx_ring)
msg.msg_control = vhost_net_buf_consume(&nvq->rxq);
/* On overrun, truncate and discard */
- if (unlikely(headcount > UIO_MAXIOV)) {
+ if (unlikely(bufcount > UIO_MAXIOV)) {
iov_iter_init(&msg.msg_iter, READ, vq->iov, 1, 1);
err = sock->ops->recvmsg(sock, &msg,
1, MSG_DONTWAIT | MSG_TRUNC);
@@ -1195,7 +1204,7 @@ static void handle_rx(struct vhost_net *net)
if (unlikely(err != sock_len)) {
pr_debug("Discarded rx packet: "
" len %d, expected %zd\n", err, sock_len);
- vhost_discard_vq_desc(vq, headcount);
+ vhost_discard_avail_bufs(vq, nvq->bufs + nvq->done_idx, bufcount);
continue;
}
/* Supply virtio_net_hdr if VHOST_NET_F_VIRTIO_NET_HDR */
@@ -1214,15 +1223,15 @@ static void handle_rx(struct vhost_net *net)
}
/* TODO: Should check and handle checksum. */
- num_buffers = cpu_to_vhost16(vq, headcount);
+ num_buffers = cpu_to_vhost16(vq, bufcount);
if (likely(mergeable) &&
copy_to_iter(&num_buffers, sizeof num_buffers,
&fixup) != sizeof num_buffers) {
vq_err(vq, "Failed num_buffers write");
- vhost_discard_vq_desc(vq, headcount);
+ vhost_discard_avail_bufs(vq, nvq->bufs + nvq->done_idx, bufcount);
goto out;
}
- nvq->done_idx += headcount;
+ nvq->done_idx += bufcount;
if (nvq->done_idx > VHOST_NET_BATCH)
vhost_net_signal_used(nvq);
if (unlikely(vq_log))
@@ -1314,6 +1323,7 @@ static int vhost_net_open(struct inode *inode, struct file *f)
n->vqs[VHOST_NET_VQ_TX].vq.handle_kick = handle_tx_kick;
n->vqs[VHOST_NET_VQ_RX].vq.handle_kick = handle_rx_kick;
for (i = 0; i < VHOST_NET_VQ_MAX; i++) {
+ n->vqs[i].bufs = NULL;
n->vqs[i].ubufs = NULL;
n->vqs[i].ubuf_info = NULL;
n->vqs[i].upend_idx = 0;
--
MST
^ permalink raw reply related [flat|nested] 20+ messages in thread
* [PATCH RFC v6 07/11] vhost/net: avoid iov length math
2020-06-08 12:52 [PATCH RFC v6 00/11] vhost: ring format independence Michael S. Tsirkin
` (5 preceding siblings ...)
2020-06-08 12:53 ` [PATCH RFC v6 06/11] vhost/net: convert to new API: heads->bufs Michael S. Tsirkin
@ 2020-06-08 12:53 ` Michael S. Tsirkin
2020-06-08 12:53 ` [PATCH RFC v6 08/11] vhost/test: convert to the buf API Michael S. Tsirkin
` (4 subsequent siblings)
11 siblings, 0 replies; 20+ messages in thread
From: Michael S. Tsirkin @ 2020-06-08 12:53 UTC (permalink / raw)
To: linux-kernel; +Cc: kvm, virtualization, netdev, Jason Wang, eperezma
Now that API exposes buffer length, we no longer need to
scan IOVs to figure it out.
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
drivers/vhost/net.c | 8 +++-----
1 file changed, 3 insertions(+), 5 deletions(-)
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 830fe84912a5..0b509be8d7b1 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -607,11 +607,9 @@ static bool vhost_exceeds_maxpend(struct vhost_net *net)
}
static size_t init_iov_iter(struct vhost_virtqueue *vq, struct iov_iter *iter,
- size_t hdr_size, int out)
+ size_t len, size_t hdr_size, int out)
{
/* Skip header. TODO: support TSO. */
- size_t len = iov_length(vq->iov, out);
-
iov_iter_init(iter, WRITE, vq->iov, out, len);
iov_iter_advance(iter, hdr_size);
@@ -640,7 +638,7 @@ static int get_tx_bufs(struct vhost_net *net,
}
/* Sanity check */
- *len = init_iov_iter(vq, &msg->msg_iter, nvq->vhost_hlen, *out);
+ *len = init_iov_iter(vq, &msg->msg_iter, buf->out_len, nvq->vhost_hlen, *out);
if (*len == 0) {
vq_err(vq, "Unexpected header len for TX: %zd expected %zd\n",
*len, nvq->vhost_hlen);
@@ -1080,7 +1078,7 @@ static int get_rx_bufs(struct vhost_virtqueue *vq,
nlogs += *log_num;
log += *log_num;
}
- len = iov_length(vq->iov + seg, in);
+ len = bufs[bufcount].in_len;
datalen -= len;
++bufcount;
seg += in;
--
MST
^ permalink raw reply related [flat|nested] 20+ messages in thread
* [PATCH RFC v6 08/11] vhost/test: convert to the buf API
2020-06-08 12:52 [PATCH RFC v6 00/11] vhost: ring format independence Michael S. Tsirkin
` (6 preceding siblings ...)
2020-06-08 12:53 ` [PATCH RFC v6 07/11] vhost/net: avoid iov length math Michael S. Tsirkin
@ 2020-06-08 12:53 ` Michael S. Tsirkin
2020-06-08 12:53 ` [PATCH RFC v6 09/11] vhost/scsi: switch to buf APIs Michael S. Tsirkin
` (3 subsequent siblings)
11 siblings, 0 replies; 20+ messages in thread
From: Michael S. Tsirkin @ 2020-06-08 12:53 UTC (permalink / raw)
To: linux-kernel; +Cc: kvm, virtualization, netdev, Jason Wang, eperezma
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
drivers/vhost/test.c | 20 +++++++++++---------
1 file changed, 11 insertions(+), 9 deletions(-)
diff --git a/drivers/vhost/test.c b/drivers/vhost/test.c
index 7d69778aaa26..12304eb8da15 100644
--- a/drivers/vhost/test.c
+++ b/drivers/vhost/test.c
@@ -44,9 +44,10 @@ static void handle_vq(struct vhost_test *n)
{
struct vhost_virtqueue *vq = &n->vqs[VHOST_TEST_VQ];
unsigned out, in;
- int head;
+ int ret;
size_t len, total_len = 0;
void *private;
+ struct vhost_buf buf;
mutex_lock(&vq->mutex);
private = vhost_vq_get_backend(vq);
@@ -58,15 +59,15 @@ static void handle_vq(struct vhost_test *n)
vhost_disable_notify(&n->dev, vq);
for (;;) {
- head = vhost_get_vq_desc(vq, vq->iov,
- ARRAY_SIZE(vq->iov),
- &out, &in,
- NULL, NULL);
+ ret = vhost_get_avail_buf(vq, &buf, vq->iov,
+ ARRAY_SIZE(vq->iov),
+ &out, &in,
+ NULL, NULL);
/* On error, stop handling until the next kick. */
- if (unlikely(head < 0))
+ if (unlikely(ret < 0))
break;
/* Nothing new? Wait for eventfd to tell us they refilled. */
- if (head == vq->num) {
+ if (!ret) {
if (unlikely(vhost_enable_notify(&n->dev, vq))) {
vhost_disable_notify(&n->dev, vq);
continue;
@@ -78,13 +79,14 @@ static void handle_vq(struct vhost_test *n)
"out %d, int %d\n", out, in);
break;
}
- len = iov_length(vq->iov, out);
+ len = buf.out_len;
/* Sanity check */
if (!len) {
vq_err(vq, "Unexpected 0 len for TX\n");
break;
}
- vhost_add_used_and_signal(&n->dev, vq, head, 0);
+ vhost_put_used_buf(vq, &buf);
+ vhost_signal(&n->dev, vq);
total_len += len;
if (unlikely(vhost_exceeds_weight(vq, 0, total_len)))
break;
--
MST
^ permalink raw reply related [flat|nested] 20+ messages in thread
* [PATCH RFC v6 09/11] vhost/scsi: switch to buf APIs
2020-06-08 12:52 [PATCH RFC v6 00/11] vhost: ring format independence Michael S. Tsirkin
` (7 preceding siblings ...)
2020-06-08 12:53 ` [PATCH RFC v6 08/11] vhost/test: convert to the buf API Michael S. Tsirkin
@ 2020-06-08 12:53 ` Michael S. Tsirkin
2020-06-08 12:53 ` [PATCH RFC v6 10/11] vhost/vsock: switch to the buf API Michael S. Tsirkin
` (2 subsequent siblings)
11 siblings, 0 replies; 20+ messages in thread
From: Michael S. Tsirkin @ 2020-06-08 12:53 UTC (permalink / raw)
To: linux-kernel
Cc: kvm, virtualization, netdev, Jason Wang, eperezma, Paolo Bonzini,
Stefan Hajnoczi
Switch to buf APIs. Doing this exposes a spec violation in vhost scsi:
all used bufs are marked with length 0.
Fix that is left for another day.
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
drivers/vhost/scsi.c | 73 ++++++++++++++++++++++++++------------------
1 file changed, 44 insertions(+), 29 deletions(-)
diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
index 0cbaa0b3893d..a5cdd4c01a3a 100644
--- a/drivers/vhost/scsi.c
+++ b/drivers/vhost/scsi.c
@@ -71,8 +71,8 @@ struct vhost_scsi_inflight {
};
struct vhost_scsi_cmd {
- /* Descriptor from vhost_get_vq_desc() for virt_queue segment */
- int tvc_vq_desc;
+ /* Descriptor from vhost_get_avail_buf() for virt_queue segment */
+ struct vhost_buf tvc_vq_desc;
/* virtio-scsi initiator task attribute */
int tvc_task_attr;
/* virtio-scsi response incoming iovecs */
@@ -213,7 +213,7 @@ struct vhost_scsi {
* Context for processing request and control queue operations.
*/
struct vhost_scsi_ctx {
- int head;
+ struct vhost_buf buf;
unsigned int out, in;
size_t req_size, rsp_size;
size_t out_size, in_size;
@@ -443,6 +443,20 @@ static int vhost_scsi_check_stop_free(struct se_cmd *se_cmd)
return target_put_sess_cmd(se_cmd);
}
+/* Signal to guest that request finished with no input buffer. */
+/* TODO calling this when writing into buffer and most likely a bug */
+static void vhost_scsi_signal_noinput(struct vhost_dev *vdev,
+ struct vhost_virtqueue *vq,
+ struct vhost_buf *bufp)
+{
+ struct vhost_buf buf = *bufp;
+
+ buf.in_len = 0;
+ vhost_put_used_buf(vq, &buf);
+ vhost_signal(vdev, vq);
+}
+
+
static void
vhost_scsi_do_evt_work(struct vhost_scsi *vs, struct vhost_scsi_evt *evt)
{
@@ -450,7 +464,8 @@ vhost_scsi_do_evt_work(struct vhost_scsi *vs, struct vhost_scsi_evt *evt)
struct virtio_scsi_event *event = &evt->event;
struct virtio_scsi_event __user *eventp;
unsigned out, in;
- int head, ret;
+ struct vhost_buf buf;
+ int ret;
if (!vhost_vq_get_backend(vq)) {
vs->vs_events_missed = true;
@@ -459,14 +474,14 @@ vhost_scsi_do_evt_work(struct vhost_scsi *vs, struct vhost_scsi_evt *evt)
again:
vhost_disable_notify(&vs->dev, vq);
- head = vhost_get_vq_desc(vq, vq->iov,
- ARRAY_SIZE(vq->iov), &out, &in,
- NULL, NULL);
- if (head < 0) {
+ ret = vhost_get_avail_buf(vq, &buf,
+ vq->iov, ARRAY_SIZE(vq->iov), &out, &in,
+ NULL, NULL);
+ if (ret < 0) {
vs->vs_events_missed = true;
return;
}
- if (head == vq->num) {
+ if (!ret) {
if (vhost_enable_notify(&vs->dev, vq))
goto again;
vs->vs_events_missed = true;
@@ -488,7 +503,7 @@ vhost_scsi_do_evt_work(struct vhost_scsi *vs, struct vhost_scsi_evt *evt)
eventp = vq->iov[out].iov_base;
ret = __copy_to_user(eventp, event, sizeof(*event));
if (!ret)
- vhost_add_used_and_signal(&vs->dev, vq, head, 0);
+ vhost_scsi_signal_noinput(&vs->dev, vq, &buf);
else
vq_err(vq, "Faulted on vhost_scsi_send_event\n");
}
@@ -549,7 +564,7 @@ static void vhost_scsi_complete_cmd_work(struct vhost_work *work)
ret = copy_to_iter(&v_rsp, sizeof(v_rsp), &iov_iter);
if (likely(ret == sizeof(v_rsp))) {
struct vhost_scsi_virtqueue *q;
- vhost_add_used(cmd->tvc_vq, cmd->tvc_vq_desc, 0);
+ vhost_put_used_buf(cmd->tvc_vq, &cmd->tvc_vq_desc);
q = container_of(cmd->tvc_vq, struct vhost_scsi_virtqueue, vq);
vq = q - vs->vqs;
__set_bit(vq, signal);
@@ -793,7 +808,7 @@ static void vhost_scsi_submission_work(struct work_struct *work)
static void
vhost_scsi_send_bad_target(struct vhost_scsi *vs,
struct vhost_virtqueue *vq,
- int head, unsigned out)
+ struct vhost_buf *buf, unsigned out)
{
struct virtio_scsi_cmd_resp __user *resp;
struct virtio_scsi_cmd_resp rsp;
@@ -804,7 +819,7 @@ vhost_scsi_send_bad_target(struct vhost_scsi *vs,
resp = vq->iov[out].iov_base;
ret = __copy_to_user(resp, &rsp, sizeof(rsp));
if (!ret)
- vhost_add_used_and_signal(&vs->dev, vq, head, 0);
+ vhost_scsi_signal_noinput(&vs->dev, vq, buf);
else
pr_err("Faulted on virtio_scsi_cmd_resp\n");
}
@@ -813,21 +828,21 @@ static int
vhost_scsi_get_desc(struct vhost_scsi *vs, struct vhost_virtqueue *vq,
struct vhost_scsi_ctx *vc)
{
- int ret = -ENXIO;
+ int r, ret = -ENXIO;
- vc->head = vhost_get_vq_desc(vq, vq->iov,
- ARRAY_SIZE(vq->iov), &vc->out, &vc->in,
- NULL, NULL);
+ r = vhost_get_avail_buf(vq, &vc->buf,
+ vq->iov, ARRAY_SIZE(vq->iov), &vc->out, &vc->in,
+ NULL, NULL);
- pr_debug("vhost_get_vq_desc: head: %d, out: %u in: %u\n",
- vc->head, vc->out, vc->in);
+ pr_debug("vhost_get_avail_buf: buf: %d, out: %u in: %u\n",
+ vc->buf.id, vc->out, vc->in);
/* On error, stop handling until the next kick. */
- if (unlikely(vc->head < 0))
+ if (unlikely(r < 0))
goto done;
/* Nothing new? Wait for eventfd to tell us they refilled. */
- if (vc->head == vq->num) {
+ if (!r) {
if (unlikely(vhost_enable_notify(&vs->dev, vq))) {
vhost_disable_notify(&vs->dev, vq);
ret = -EAGAIN;
@@ -1093,11 +1108,11 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq)
}
}
/*
- * Save the descriptor from vhost_get_vq_desc() to be used to
+ * Save the descriptor from vhost_get_avail_buf() to be used to
* complete the virtio-scsi request in TCM callback context via
* vhost_scsi_queue_data_in() and vhost_scsi_queue_status()
*/
- cmd->tvc_vq_desc = vc.head;
+ cmd->tvc_vq_desc = vc.buf;
/*
* Dispatch cmd descriptor for cmwq execution in process
* context provided by vhost_scsi_workqueue. This also ensures
@@ -1117,7 +1132,7 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq)
if (ret == -ENXIO)
break;
else if (ret == -EIO)
- vhost_scsi_send_bad_target(vs, vq, vc.head, vc.out);
+ vhost_scsi_send_bad_target(vs, vq, &vc.buf, vc.out);
} while (likely(!vhost_exceeds_weight(vq, ++c, 0)));
out:
mutex_unlock(&vq->mutex);
@@ -1139,9 +1154,9 @@ vhost_scsi_send_tmf_reject(struct vhost_scsi *vs,
iov_iter_init(&iov_iter, READ, &vq->iov[vc->out], vc->in, sizeof(rsp));
ret = copy_to_iter(&rsp, sizeof(rsp), &iov_iter);
- if (likely(ret == sizeof(rsp)))
- vhost_add_used_and_signal(&vs->dev, vq, vc->head, 0);
- else
+ if (likely(ret == sizeof(rsp))) {
+ vhost_scsi_signal_noinput(&vs->dev, vq, &vc->buf);
+ } else
pr_err("Faulted on virtio_scsi_ctrl_tmf_resp\n");
}
@@ -1162,7 +1177,7 @@ vhost_scsi_send_an_resp(struct vhost_scsi *vs,
ret = copy_to_iter(&rsp, sizeof(rsp), &iov_iter);
if (likely(ret == sizeof(rsp)))
- vhost_add_used_and_signal(&vs->dev, vq, vc->head, 0);
+ vhost_scsi_signal_noinput(&vs->dev, vq, &vc->buf);
else
pr_err("Faulted on virtio_scsi_ctrl_an_resp\n");
}
@@ -1269,7 +1284,7 @@ vhost_scsi_ctl_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq)
if (ret == -ENXIO)
break;
else if (ret == -EIO)
- vhost_scsi_send_bad_target(vs, vq, vc.head, vc.out);
+ vhost_scsi_send_bad_target(vs, vq, &vc.buf, vc.out);
} while (likely(!vhost_exceeds_weight(vq, ++c, 0)));
out:
mutex_unlock(&vq->mutex);
--
MST
^ permalink raw reply related [flat|nested] 20+ messages in thread
* [PATCH RFC v6 10/11] vhost/vsock: switch to the buf API
2020-06-08 12:52 [PATCH RFC v6 00/11] vhost: ring format independence Michael S. Tsirkin
` (8 preceding siblings ...)
2020-06-08 12:53 ` [PATCH RFC v6 09/11] vhost/scsi: switch to buf APIs Michael S. Tsirkin
@ 2020-06-08 12:53 ` Michael S. Tsirkin
2020-06-08 17:08 ` Stefano Garzarella
2020-06-08 12:53 ` [PATCH RFC v6 11/11] vhost: drop head based APIs Michael S. Tsirkin
2020-06-08 17:30 ` [PATCH RFC v6 00/11] vhost: ring format independence Stefano Garzarella
11 siblings, 1 reply; 20+ messages in thread
From: Michael S. Tsirkin @ 2020-06-08 12:53 UTC (permalink / raw)
To: linux-kernel
Cc: kvm, virtualization, netdev, Jason Wang, eperezma,
Stefan Hajnoczi, Stefano Garzarella
A straight-forward conversion.
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
drivers/vhost/vsock.c | 30 ++++++++++++++++++------------
1 file changed, 18 insertions(+), 12 deletions(-)
diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index a483cec31d5c..61c6d3dd2ae3 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -103,7 +103,8 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
unsigned out, in;
size_t nbytes;
size_t iov_len, payload_len;
- int head;
+ struct vhost_buf buf;
+ int ret;
spin_lock_bh(&vsock->send_pkt_list_lock);
if (list_empty(&vsock->send_pkt_list)) {
@@ -117,16 +118,17 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
list_del_init(&pkt->list);
spin_unlock_bh(&vsock->send_pkt_list_lock);
- head = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
- &out, &in, NULL, NULL);
- if (head < 0) {
+ ret = vhost_get_avail_buf(vq, &buf,
+ vq->iov, ARRAY_SIZE(vq->iov),
+ &out, &in, NULL, NULL);
+ if (ret < 0) {
spin_lock_bh(&vsock->send_pkt_list_lock);
list_add(&pkt->list, &vsock->send_pkt_list);
spin_unlock_bh(&vsock->send_pkt_list_lock);
break;
}
- if (head == vq->num) {
+ if (!ret) {
spin_lock_bh(&vsock->send_pkt_list_lock);
list_add(&pkt->list, &vsock->send_pkt_list);
spin_unlock_bh(&vsock->send_pkt_list_lock);
@@ -186,7 +188,8 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
*/
virtio_transport_deliver_tap_pkt(pkt);
- vhost_add_used(vq, head, sizeof(pkt->hdr) + payload_len);
+ buf.in_len = sizeof(pkt->hdr) + payload_len;
+ vhost_put_used_buf(vq, &buf);
added = true;
pkt->off += payload_len;
@@ -440,7 +443,8 @@ static void vhost_vsock_handle_tx_kick(struct vhost_work *work)
struct vhost_vsock *vsock = container_of(vq->dev, struct vhost_vsock,
dev);
struct virtio_vsock_pkt *pkt;
- int head, pkts = 0, total_len = 0;
+ int ret, pkts = 0, total_len = 0;
+ struct vhost_buf buf;
unsigned int out, in;
bool added = false;
@@ -461,12 +465,13 @@ static void vhost_vsock_handle_tx_kick(struct vhost_work *work)
goto no_more_replies;
}
- head = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
- &out, &in, NULL, NULL);
- if (head < 0)
+ ret = vhost_get_avail_buf(vq, &buf,
+ vq->iov, ARRAY_SIZE(vq->iov),
+ &out, &in, NULL, NULL);
+ if (ret < 0)
break;
- if (head == vq->num) {
+ if (!ret) {
if (unlikely(vhost_enable_notify(&vsock->dev, vq))) {
vhost_disable_notify(&vsock->dev, vq);
continue;
@@ -494,7 +499,8 @@ static void vhost_vsock_handle_tx_kick(struct vhost_work *work)
virtio_transport_free_pkt(pkt);
len += sizeof(pkt->hdr);
- vhost_add_used(vq, head, len);
+ buf.in_len = len;
+ vhost_put_used_buf(vq, &buf);
total_len += len;
added = true;
} while(likely(!vhost_exceeds_weight(vq, ++pkts, total_len)));
--
MST
^ permalink raw reply related [flat|nested] 20+ messages in thread
* Re: [PATCH RFC v6 10/11] vhost/vsock: switch to the buf API
2020-06-08 12:53 ` [PATCH RFC v6 10/11] vhost/vsock: switch to the buf API Michael S. Tsirkin
@ 2020-06-08 17:08 ` Stefano Garzarella
0 siblings, 0 replies; 20+ messages in thread
From: Stefano Garzarella @ 2020-06-08 17:08 UTC (permalink / raw)
To: Michael S. Tsirkin
Cc: linux-kernel, kvm, virtualization, netdev, Jason Wang, eperezma,
Stefan Hajnoczi
On Mon, Jun 08, 2020 at 08:53:13AM -0400, Michael S. Tsirkin wrote:
> A straight-forward conversion.
>
> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> ---
> drivers/vhost/vsock.c | 30 ++++++++++++++++++------------
> 1 file changed, 18 insertions(+), 12 deletions(-)
I ran the vsock tests again with this new series and everything seems
to go well:
Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
Tested-by: Stefano Garzarella <sgarzare@redhat.com>
Thanks,
Stefano
>
> diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
> index a483cec31d5c..61c6d3dd2ae3 100644
> --- a/drivers/vhost/vsock.c
> +++ b/drivers/vhost/vsock.c
> @@ -103,7 +103,8 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
> unsigned out, in;
> size_t nbytes;
> size_t iov_len, payload_len;
> - int head;
> + struct vhost_buf buf;
> + int ret;
>
> spin_lock_bh(&vsock->send_pkt_list_lock);
> if (list_empty(&vsock->send_pkt_list)) {
> @@ -117,16 +118,17 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
> list_del_init(&pkt->list);
> spin_unlock_bh(&vsock->send_pkt_list_lock);
>
> - head = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
> - &out, &in, NULL, NULL);
> - if (head < 0) {
> + ret = vhost_get_avail_buf(vq, &buf,
> + vq->iov, ARRAY_SIZE(vq->iov),
> + &out, &in, NULL, NULL);
> + if (ret < 0) {
> spin_lock_bh(&vsock->send_pkt_list_lock);
> list_add(&pkt->list, &vsock->send_pkt_list);
> spin_unlock_bh(&vsock->send_pkt_list_lock);
> break;
> }
>
> - if (head == vq->num) {
> + if (!ret) {
> spin_lock_bh(&vsock->send_pkt_list_lock);
> list_add(&pkt->list, &vsock->send_pkt_list);
> spin_unlock_bh(&vsock->send_pkt_list_lock);
> @@ -186,7 +188,8 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
> */
> virtio_transport_deliver_tap_pkt(pkt);
>
> - vhost_add_used(vq, head, sizeof(pkt->hdr) + payload_len);
> + buf.in_len = sizeof(pkt->hdr) + payload_len;
> + vhost_put_used_buf(vq, &buf);
> added = true;
>
> pkt->off += payload_len;
> @@ -440,7 +443,8 @@ static void vhost_vsock_handle_tx_kick(struct vhost_work *work)
> struct vhost_vsock *vsock = container_of(vq->dev, struct vhost_vsock,
> dev);
> struct virtio_vsock_pkt *pkt;
> - int head, pkts = 0, total_len = 0;
> + int ret, pkts = 0, total_len = 0;
> + struct vhost_buf buf;
> unsigned int out, in;
> bool added = false;
>
> @@ -461,12 +465,13 @@ static void vhost_vsock_handle_tx_kick(struct vhost_work *work)
> goto no_more_replies;
> }
>
> - head = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
> - &out, &in, NULL, NULL);
> - if (head < 0)
> + ret = vhost_get_avail_buf(vq, &buf,
> + vq->iov, ARRAY_SIZE(vq->iov),
> + &out, &in, NULL, NULL);
> + if (ret < 0)
> break;
>
> - if (head == vq->num) {
> + if (!ret) {
> if (unlikely(vhost_enable_notify(&vsock->dev, vq))) {
> vhost_disable_notify(&vsock->dev, vq);
> continue;
> @@ -494,7 +499,8 @@ static void vhost_vsock_handle_tx_kick(struct vhost_work *work)
> virtio_transport_free_pkt(pkt);
>
> len += sizeof(pkt->hdr);
> - vhost_add_used(vq, head, len);
> + buf.in_len = len;
> + vhost_put_used_buf(vq, &buf);
> total_len += len;
> added = true;
> } while(likely(!vhost_exceeds_weight(vq, ++pkts, total_len)));
> --
> MST
>
^ permalink raw reply [flat|nested] 20+ messages in thread
* [PATCH RFC v6 11/11] vhost: drop head based APIs
2020-06-08 12:52 [PATCH RFC v6 00/11] vhost: ring format independence Michael S. Tsirkin
` (9 preceding siblings ...)
2020-06-08 12:53 ` [PATCH RFC v6 10/11] vhost/vsock: switch to the buf API Michael S. Tsirkin
@ 2020-06-08 12:53 ` Michael S. Tsirkin
2020-06-08 17:30 ` [PATCH RFC v6 00/11] vhost: ring format independence Stefano Garzarella
11 siblings, 0 replies; 20+ messages in thread
From: Michael S. Tsirkin @ 2020-06-08 12:53 UTC (permalink / raw)
To: linux-kernel; +Cc: kvm, virtualization, netdev, Jason Wang, eperezma
Everyone's using buf APIs, no need for head based ones anymore.
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
drivers/vhost/vhost.c | 43 ++++++++-----------------------------------
drivers/vhost/vhost.h | 12 ------------
2 files changed, 8 insertions(+), 47 deletions(-)
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index f4a6ff9ef77a..7bd4bc581fc9 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -2429,18 +2429,11 @@ EXPORT_SYMBOL_GPL(vhost_get_avail_buf);
void vhost_discard_avail_bufs(struct vhost_virtqueue *vq,
struct vhost_buf *buf, unsigned count)
{
- vhost_discard_vq_desc(vq, count);
+ unfetch_descs(vq);
+ vq->last_avail_idx -= count;
}
EXPORT_SYMBOL_GPL(vhost_discard_avail_bufs);
-/* Reverse the effect of vhost_get_vq_desc. Useful for error handling. */
-void vhost_discard_vq_desc(struct vhost_virtqueue *vq, int n)
-{
- unfetch_descs(vq);
- vq->last_avail_idx -= n;
-}
-EXPORT_SYMBOL_GPL(vhost_discard_vq_desc);
-
static int __vhost_add_used_n(struct vhost_virtqueue *vq,
struct vring_used_elem *heads,
unsigned count)
@@ -2473,8 +2466,7 @@ static int __vhost_add_used_n(struct vhost_virtqueue *vq,
return 0;
}
-/* After we've used one of their buffers, we tell them about it. We'll then
- * want to notify the guest, using eventfd. */
+static
int vhost_add_used_n(struct vhost_virtqueue *vq, struct vring_used_elem *heads,
unsigned count)
{
@@ -2508,10 +2500,8 @@ int vhost_add_used_n(struct vhost_virtqueue *vq, struct vring_used_elem *heads,
}
return r;
}
-EXPORT_SYMBOL_GPL(vhost_add_used_n);
-/* After we've used one of their buffers, we tell them about it. We'll then
- * want to notify the guest, using eventfd. */
+static
int vhost_add_used(struct vhost_virtqueue *vq, unsigned int head, int len)
{
struct vring_used_elem heads = {
@@ -2521,14 +2511,17 @@ int vhost_add_used(struct vhost_virtqueue *vq, unsigned int head, int len)
return vhost_add_used_n(vq, &heads, 1);
}
-EXPORT_SYMBOL_GPL(vhost_add_used);
+/* After we've used one of their buffers, we tell them about it. We'll then
+ * want to notify the guest, using vhost_signal. */
int vhost_put_used_buf(struct vhost_virtqueue *vq, struct vhost_buf *buf)
{
return vhost_add_used(vq, buf->id, buf->in_len);
}
EXPORT_SYMBOL_GPL(vhost_put_used_buf);
+/* After we've used one of their buffers, we tell them about it. We'll then
+ * want to notify the guest, using vhost_signal. */
int vhost_put_used_n_bufs(struct vhost_virtqueue *vq,
struct vhost_buf *bufs, unsigned count)
{
@@ -2589,26 +2582,6 @@ void vhost_signal(struct vhost_dev *dev, struct vhost_virtqueue *vq)
}
EXPORT_SYMBOL_GPL(vhost_signal);
-/* And here's the combo meal deal. Supersize me! */
-void vhost_add_used_and_signal(struct vhost_dev *dev,
- struct vhost_virtqueue *vq,
- unsigned int head, int len)
-{
- vhost_add_used(vq, head, len);
- vhost_signal(dev, vq);
-}
-EXPORT_SYMBOL_GPL(vhost_add_used_and_signal);
-
-/* multi-buffer version of vhost_add_used_and_signal */
-void vhost_add_used_and_signal_n(struct vhost_dev *dev,
- struct vhost_virtqueue *vq,
- struct vring_used_elem *heads, unsigned count)
-{
- vhost_add_used_n(vq, heads, count);
- vhost_signal(dev, vq);
-}
-EXPORT_SYMBOL_GPL(vhost_add_used_and_signal_n);
-
/* return true if we're sure that avaiable ring is empty */
bool vhost_vq_avail_empty(struct vhost_dev *dev, struct vhost_virtqueue *vq)
{
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 28eea0155efb..264a2a2fae97 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -197,11 +197,6 @@ long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *arg
bool vhost_vq_access_ok(struct vhost_virtqueue *vq);
bool vhost_log_access_ok(struct vhost_dev *);
-int vhost_get_vq_desc(struct vhost_virtqueue *,
- struct iovec iov[], unsigned int iov_count,
- unsigned int *out_num, unsigned int *in_num,
- struct vhost_log *log, unsigned int *log_num);
-void vhost_discard_vq_desc(struct vhost_virtqueue *, int n);
int vhost_get_avail_buf(struct vhost_virtqueue *, struct vhost_buf *buf,
struct iovec iov[], unsigned int iov_count,
unsigned int *out_num, unsigned int *in_num,
@@ -209,13 +204,6 @@ int vhost_get_avail_buf(struct vhost_virtqueue *, struct vhost_buf *buf,
void vhost_discard_avail_bufs(struct vhost_virtqueue *,
struct vhost_buf *, unsigned count);
int vhost_vq_init_access(struct vhost_virtqueue *);
-int vhost_add_used(struct vhost_virtqueue *, unsigned int head, int len);
-int vhost_add_used_n(struct vhost_virtqueue *, struct vring_used_elem *heads,
- unsigned count);
-void vhost_add_used_and_signal(struct vhost_dev *, struct vhost_virtqueue *,
- unsigned int id, int len);
-void vhost_add_used_and_signal_n(struct vhost_dev *, struct vhost_virtqueue *,
- struct vring_used_elem *heads, unsigned count);
int vhost_put_used_buf(struct vhost_virtqueue *, struct vhost_buf *buf);
int vhost_put_used_n_bufs(struct vhost_virtqueue *,
struct vhost_buf *bufs, unsigned count);
--
MST
^ permalink raw reply related [flat|nested] 20+ messages in thread
* Re: [PATCH RFC v6 00/11] vhost: ring format independence
2020-06-08 12:52 [PATCH RFC v6 00/11] vhost: ring format independence Michael S. Tsirkin
` (10 preceding siblings ...)
2020-06-08 12:53 ` [PATCH RFC v6 11/11] vhost: drop head based APIs Michael S. Tsirkin
@ 2020-06-08 17:30 ` Stefano Garzarella
11 siblings, 0 replies; 20+ messages in thread
From: Stefano Garzarella @ 2020-06-08 17:30 UTC (permalink / raw)
To: Michael S. Tsirkin; +Cc: linux-kernel, netdev, eperezma, kvm, virtualization
Hi Michael,
On Mon, Jun 08, 2020 at 08:52:51AM -0400, Michael S. Tsirkin wrote:
>
>
> This adds infrastructure required for supporting
> multiple ring formats.
>
> The idea is as follows: we convert descriptors to an
> independent format first, and process that converting to
> iov later.
>
> Used ring is similar: we fetch into an independent struct first,
> convert that to IOV later.
>
> The point is that we have a tight loop that fetches
> descriptors, which is good for cache utilization.
> This will also allow all kind of batching tricks -
> e.g. it seems possible to keep SMAP disabled while
> we are fetching multiple descriptors.
>
> For used descriptors, this allows keeping track of the buffer length
> without need to rescan IOV.
>
> This seems to perform exactly the same as the original
> code based on a microbenchmark.
> Lightly tested.
> More testing would be very much appreciated.
while testing the vhost-vsock I found some issues in vhost-net (the VM
had also a virtio-net device).
This is the dmesg of the host (it is a QEMU VM):
[ 171.860074] CPU: 0 PID: 16613 Comm: vhost-16595 Not tainted 5.7.0-ste-12703-gaf7b4801030c-dirty #6
[ 171.862210] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ?-20190727_073836-buildvm-ppc64le-16.ppc.fedoraproject.org-3.fc31 04/01/2014
[ 171.865998] Call Trace:
[ 171.866440] <IRQ>
[ 171.866817] dump_stack+0x57/0x7a
[ 171.867440] nmi_cpu_backtrace.cold+0x14/0x54
[ 171.868233] ? lapic_can_unplug_cpu.cold+0x3b/0x3b
[ 171.869153] nmi_trigger_cpumask_backtrace+0x85/0x92
[ 171.870143] arch_trigger_cpumask_backtrace+0x19/0x20
[ 171.871134] rcu_dump_cpu_stacks+0xa0/0xd2
[ 171.872203] rcu_sched_clock_irq.cold+0x23a/0x41c
[ 171.873098] update_process_times+0x2c/0x60
[ 171.874119] tick_sched_timer+0x59/0x160
[ 171.874777] ? tick_switch_to_oneshot.cold+0x79/0x79
[ 171.875602] __hrtimer_run_queues+0x10d/0x290
[ 171.876317] hrtimer_interrupt+0x109/0x220
[ 171.877025] smp_apic_timer_interrupt+0x76/0x150
[ 171.877875] apic_timer_interrupt+0xf/0x20
[ 171.878563] </IRQ>
[ 171.878897] RIP: 0010:vhost_get_avail_buf+0x5f8/0x860 [vhost]
[ 171.879951] Code: 48 8b bb 88 00 00 00 48 85 ff 0f 84 ad 00 00 00 be 01 00 00 00 44 89 45 80 e8 24 52 08 c1 8b 43 68 44 8b 45 80 e9 e9 fb ff ff <45> 85 c0 0f 85 48 fd ff ff 48 8b 43 38 48 83 bb 38 45 00 00 00 48
[ 171.889938] RSP: 0018:ffffc90000397c40 EFLAGS: 00000246 ORIG_RAX: ffffffffffffff13
[ 171.896828] RAX: 0000000000000040 RBX: ffff88822c3f4688 RCX: ffff888231090000
[ 171.898903] RDX: 0000000000000440 RSI: ffff888231090000 RDI: ffffc90000397c80
[ 171.901025] RBP: ffffc90000397ce8 R08: 0000000000000001 R09: ffffc90000397dc4
[ 171.903136] R10: 000000231edc461f R11: 0000000000000003 R12: 0000000000000001
[ 171.905213] R13: 0000000000000001 R14: ffffc90000397dd4 R15: ffff88822c3f87a8
[ 171.907553] get_tx_bufs+0x49/0x180 [vhost_net]
[ 171.909142] handle_tx_copy+0xb4/0x5c0 [vhost_net]
[ 171.911495] ? update_curr+0x67/0x160
[ 171.913376] handle_tx+0xb0/0xe0 [vhost_net]
[ 171.916451] handle_tx_kick+0x15/0x20 [vhost_net]
[ 171.919912] vhost_worker+0xb3/0x110 [vhost]
[ 171.923379] kthread+0x106/0x140
[ 171.925314] ? __vhost_add_used_n+0x1c0/0x1c0 [vhost]
[ 171.933388] ? kthread_park+0x90/0x90
[ 171.936148] ret_from_fork+0x22/0x30
[ 234.859212] rcu: INFO: rcu_sched self-detected stall on CPU
[ 234.860036] rcu: 0-....: (20981 ticks this GP) idle=962/1/0x4000000000000002 softirq=15513/15513 fqs=10340
[ 234.861547] (t=21003 jiffies g=24773 q=2390)
[ 234.862158] NMI backtrace for cpu 0
[ 234.862638] CPU: 0 PID: 16613 Comm: vhost-16595 Not tainted 5.7.0-ste-12703-gaf7b4801030c-dirty #6
[ 234.864008] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ?-20190727_073836-buildvm-ppc64le-16.ppc.fedoraproject.org-3.fc31 04/01/2014
[ 234.866084] Call Trace:
[ 234.866395] <IRQ>
[ 234.866648] dump_stack+0x57/0x7a
[ 234.867079] nmi_cpu_backtrace.cold+0x14/0x54
[ 234.867679] ? lapic_can_unplug_cpu.cold+0x3b/0x3b
[ 234.868322] nmi_trigger_cpumask_backtrace+0x85/0x92
[ 234.869013] arch_trigger_cpumask_backtrace+0x19/0x20
[ 234.869747] rcu_dump_cpu_stacks+0xa0/0xd2
[ 234.870267] rcu_sched_clock_irq.cold+0x23a/0x41c
[ 234.870960] update_process_times+0x2c/0x60
[ 234.871578] tick_sched_timer+0x59/0x160
[ 234.872148] ? tick_switch_to_oneshot.cold+0x79/0x79
[ 234.872949] __hrtimer_run_queues+0x10d/0x290
[ 234.873711] hrtimer_interrupt+0x109/0x220
[ 234.874271] smp_apic_timer_interrupt+0x76/0x150
[ 234.874913] apic_timer_interrupt+0xf/0x20
[ 234.876507] </IRQ>
[ 234.876799] RIP: 0010:vhost_get_avail_buf+0x8a/0x860 [vhost]
[ 234.877828] Code: 8d 72 06 00 00 85 c0 0f 85 fb 02 00 00 8b 57 70 89 d0 2d 00 04 00 00 0f 88 72 06 00 00 45 31 c0 4c 8d bb 20 41 00 00 4d 89 ee <44> 0f b7 a3 08 01 00 00 66 44 3b a3 0a 01 00 00 0f 84 58 05 00 00
[ 234.882059] RSP: 0018:ffffc90000397c40 EFLAGS: 00000283 ORIG_RAX: ffffffffffffff13
[ 234.883227] RAX: 0000000000000040 RBX: ffff88822c3f4688 RCX: ffff888231090000
[ 234.884317] RDX: 0000000000000440 RSI: ffff888231090000 RDI: ffffc90000397c80
[ 234.886531] RBP: ffffc90000397ce8 R08: 0000000000000001 R09: ffffc90000397dc4
[ 234.891840] R10: 000000231edc461f R11: 0000000000000003 R12: 0000000000000001
[ 234.896670] R13: 0000000000000001 R14: ffffc90000397dd4 R15: ffff88822c3f87a8
[ 234.900918] get_tx_bufs+0x49/0x180 [vhost_net]
[ 234.904280] handle_tx_copy+0xb4/0x5c0 [vhost_net]
[ 234.916402] ? update_curr+0x67/0x160
[ 234.917688] handle_tx+0xb0/0xe0 [vhost_net]
[ 234.918865] handle_tx_kick+0x15/0x20 [vhost_net]
[ 234.920366] vhost_worker+0xb3/0x110 [vhost]
[ 234.921500] kthread+0x106/0x140
[ 234.922219] ? __vhost_add_used_n+0x1c0/0x1c0 [vhost]
[ 234.923595] ? kthread_park+0x90/0x90
[ 234.924442] ret_from_fork+0x22/0x30
[ 297.870095] rcu: INFO: rcu_sched self-detected stall on CPU
[ 297.871352] rcu: 0-....: (36719 ticks this GP) idle=962/1/0x4000000000000002 softirq=15513/15513 fqs=18087
[ 297.873585] (t=36756 jiffies g=24773 q=2853)
[ 297.874478] NMI backtrace for cpu 0
[ 297.875229] CPU: 0 PID: 16613 Comm: vhost-16595 Not tainted 5.7.0-ste-12703-gaf7b4801030c-dirty #6
[ 297.877204] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ?-20190727_073836-buildvm-ppc64le-16.ppc.fedoraproject.org-3.fc31 04/01/2014
[ 297.881644] Call Trace:
[ 297.882185] <IRQ>
[ 297.882621] dump_stack+0x57/0x7a
[ 297.883387] nmi_cpu_backtrace.cold+0x14/0x54
[ 297.884390] ? lapic_can_unplug_cpu.cold+0x3b/0x3b
[ 297.885568] nmi_trigger_cpumask_backtrace+0x85/0x92
[ 297.886746] arch_trigger_cpumask_backtrace+0x19/0x20
[ 297.888260] rcu_dump_cpu_stacks+0xa0/0xd2
[ 297.889508] rcu_sched_clock_irq.cold+0x23a/0x41c
[ 297.890803] update_process_times+0x2c/0x60
[ 297.893357] tick_sched_timer+0x59/0x160
[ 297.895143] ? tick_switch_to_oneshot.cold+0x79/0x79
[ 297.897832] __hrtimer_run_queues+0x10d/0x290
[ 297.899841] hrtimer_interrupt+0x109/0x220
[ 297.900909] smp_apic_timer_interrupt+0x76/0x150
[ 297.903543] apic_timer_interrupt+0xf/0x20
[ 297.906509] </IRQ>
[ 297.908004] RIP: 0010:vhost_get_avail_buf+0x92/0x860 [vhost]
[ 297.911536] Code: 85 fb 02 00 00 8b 57 70 89 d0 2d 00 04 00 00 0f 88 72 06 00 00 45 31 c0 4c 8d bb 20 41 00 00 4d 89 ee 44 0f b7 a3 08 01 00 00 <66> 44 3b a3 0a 01 00 00 0f 84 58 05 00 00 8b 43 28 83 e8 01 41 21
[ 297.930274] RSP: 0018:ffffc90000397c40 EFLAGS: 00000283 ORIG_RAX: ffffffffffffff13
[ 297.934056] RAX: 0000000000000040 RBX: ffff88822c3f4688 RCX: ffff888231090000
[ 297.938371] RDX: 0000000000000440 RSI: ffff888231090000 RDI: ffffc90000397c80
[ 297.944222] RBP: ffffc90000397ce8 R08: 0000000000000001 R09: ffffc90000397dc4
[ 297.953817] R10: 000000231edc461f R11: 0000000000000003 R12: 0000000000000001
[ 297.956453] R13: 0000000000000001 R14: ffffc90000397dd4 R15: ffff88822c3f87a8
[ 297.960873] get_tx_bufs+0x49/0x180 [vhost_net]
[ 297.964163] handle_tx_copy+0xb4/0x5c0 [vhost_net]
[ 297.965871] ? update_curr+0x67/0x160
[ 297.966893] handle_tx+0xb0/0xe0 [vhost_net]
[ 297.968442] handle_tx_kick+0x15/0x20 [vhost_net]
[ 297.971327] vhost_worker+0xb3/0x110 [vhost]
[ 297.974275] kthread+0x106/0x140
[ 297.976141] ? __vhost_add_used_n+0x1c0/0x1c0 [vhost]
[ 297.979518] ? kthread_park+0x90/0x90
[ 297.981665] ret_from_fork+0x22/0x30
^ permalink raw reply [flat|nested] 20+ messages in thread