From: Kevin Wolf <kwolf@redhat.com>
To: Coiby Xu <coiby.xu@gmail.com>
Cc: bharatlkmlkvm@gmail.com, qemu-devel@nongnu.org, stefanha@redhat.com
Subject: Re: [PATCH v4 3/5] vhost-user block device backend server
Date: Tue, 25 Feb 2020 17:09:58 +0100	[thread overview]
Message-ID: <20200225160958.GD7632@linux.fritz.box> (raw)
In-Reply-To: <20200218050711.8133-4-coiby.xu@gmail.com>

Am 18.02.2020 um 06:07 hat Coiby Xu geschrieben:
> By making use of libvhost, multiple block device drives can be exported
> and each drive can serve multiple clients simultaneously.
> Since vhost-user-server needs a block drive to be created first, delay
> the creation of this object.
> 
> Signed-off-by: Coiby Xu <coiby.xu@gmail.com>
> ---
>  Makefile.target                  |   1 +
>  backends/Makefile.objs           |   2 +
>  backends/vhost-user-blk-server.c | 718 +++++++++++++++++++++++++++++++
>  backends/vhost-user-blk-server.h |  21 +
>  vl.c                             |   4 +
>  5 files changed, 746 insertions(+)
>  create mode 100644 backends/vhost-user-blk-server.c
>  create mode 100644 backends/vhost-user-blk-server.h
> 
> diff --git a/Makefile.target b/Makefile.target
> index 6e61f607b1..8c6c01eb3a 100644
> --- a/Makefile.target
> +++ b/Makefile.target
> @@ -159,6 +159,7 @@ obj-y += monitor/
>  obj-y += qapi/
>  obj-y += memory.o
>  obj-y += memory_mapping.o
> +obj-$(CONFIG_LINUX) += ../contrib/libvhost-user/libvhost-user.o
>  obj-y += migration/ram.o
>  LIBS := $(libs_softmmu) $(LIBS)
>  
> diff --git a/backends/Makefile.objs b/backends/Makefile.objs
> index 28a847cd57..4e7be731e0 100644
> --- a/backends/Makefile.objs
> +++ b/backends/Makefile.objs
> @@ -14,6 +14,8 @@ common-obj-y += cryptodev-vhost.o
>  common-obj-$(CONFIG_VHOST_CRYPTO) += cryptodev-vhost-user.o
>  endif
>  
> +common-obj-$(CONFIG_LINUX) += vhost-user-blk-server.o
> +
>  common-obj-$(call land,$(CONFIG_VHOST_USER),$(CONFIG_VIRTIO)) += vhost-user.o
>  
>  common-obj-$(CONFIG_LINUX) += hostmem-memfd.o
> diff --git a/backends/vhost-user-blk-server.c b/backends/vhost-user-blk-server.c
> new file mode 100644
> index 0000000000..1bf7f7b544
> --- /dev/null
> +++ b/backends/vhost-user-blk-server.c

Please move this to block/export/vhost-user-blk.c. This will
automatically have it covered as a block layer component in MAINTAINERS,
and it will provide a place where other exports can be put, too.

> @@ -0,0 +1,718 @@
> +/*
> + * Sharing QEMU block devices via vhost-user protocol
> + *
> + * Author: Coiby Xu <coiby.xu@gmail.com>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2 or
> + * later.  See the COPYING file in the top-level directory.
> + */
> +#include "qemu/osdep.h"
> +#include "block/block.h"
> +#include "vhost-user-blk-server.h"
> +#include "qapi/error.h"
> +#include "qom/object_interfaces.h"
> +#include "sysemu/block-backend.h"
> +
> +enum {
> +    VHOST_USER_BLK_MAX_QUEUES = 1,
> +};
> +struct virtio_blk_inhdr {
> +    unsigned char status;
> +};
> +
> +static QTAILQ_HEAD(, VuBlockDev) vu_block_devs =
> +                                 QTAILQ_HEAD_INITIALIZER(vu_block_devs);
> +
> +
> +typedef struct VuBlockReq {
> +    VuVirtqElement *elem;
> +    int64_t sector_num;
> +    size_t size;
> +    struct virtio_blk_inhdr *in;
> +    struct virtio_blk_outhdr out;
> +    VuClient *client;
> +    struct VuVirtq *vq;
> +} VuBlockReq;
> +
> +
> +static void vu_block_req_complete(VuBlockReq *req)
> +{
> +    VuDev *vu_dev = &req->client->parent;
> +
> +    /* IO size with 1 extra status byte */
> +    vu_queue_push(vu_dev, req->vq, req->elem, req->size + 1);
> +    vu_queue_notify(vu_dev, req->vq);
> +
> +    if (req->elem) {
> +        free(req->elem);
> +    }
> +
> +    g_free(req);
> +}
> +
> +static VuBlockDev *get_vu_block_device_by_client(VuClient *client)
> +{
> +    return container_of(client->server->ptr_in_device, VuBlockDev, vu_server);
> +}
> +
> +static int coroutine_fn
> +vu_block_discard_write_zeroes(VuBlockReq *req, struct iovec *iov,
> +                              uint32_t iovcnt, uint32_t type)
> +{
> +    struct virtio_blk_discard_write_zeroes desc;
> +    ssize_t size = iov_to_buf(iov, iovcnt, 0, &desc, sizeof(desc));
> +    if (unlikely(size != sizeof(desc))) {
> +        error_report("Invalid size %ld, expect %ld", size, sizeof(desc));
> +        return -EINVAL;
> +    }
> +
> +    VuBlockDev *vdev_blk = get_vu_block_device_by_client(req->client);
> +    uint64_t range[2] = { le64toh(desc.sector) << 9,
> +                          le32toh(desc.num_sectors) << 9 };
> +    if (type == VIRTIO_BLK_T_DISCARD) {
> +        if (blk_co_pdiscard(vdev_blk->backend, range[0], range[1]) == 0) {
> +            return 0;
> +        }
> +    } else if (type == VIRTIO_BLK_T_WRITE_ZEROES) {
> +        if (blk_co_pwrite_zeroes(vdev_blk->backend,
> +                                 range[0], range[1], 0) == 0) {
> +            return 0;
> +        }
> +    }
> +
> +    return -EINVAL;
> +}
> +
> +
> +static void coroutine_fn vu_block_flush(VuBlockReq *req)
> +{
> +    VuBlockDev *vdev_blk = get_vu_block_device_by_client(req->client);
> +    BlockBackend *backend = vdev_blk->backend;
> +    blk_co_flush(backend);
> +}
> +
> +
> +static int coroutine_fn vu_block_virtio_process_req(VuClient *client,
> +                                                    VuVirtq *vq)
> +{
> +    VuDev *vu_dev = &client->parent;
> +    VuVirtqElement *elem;
> +    uint32_t type;
> +    VuBlockReq *req;
> +
> +    VuBlockDev *vdev_blk = get_vu_block_device_by_client(client);
> +    BlockBackend *backend = vdev_blk->backend;
> +    elem = vu_queue_pop(vu_dev, vq, sizeof(VuVirtqElement) +
> +                                    sizeof(VuBlockReq));
> +    if (!elem) {
> +        return -1;
> +    }
> +
> +    struct iovec *in_iov = elem->in_sg;
> +    struct iovec *out_iov = elem->out_sg;
> +    unsigned in_num = elem->in_num;
> +    unsigned out_num = elem->out_num;
> +    /* refer to hw/block/virtio_blk.c */
> +    if (elem->out_num < 1 || elem->in_num < 1) {
> +        error_report("virtio-blk request missing headers");
> +        free(elem);
> +        return -EINVAL;
> +    }
> +
> +    req = g_new0(VuBlockReq, 1);
> +    req->client = client;
> +    req->vq = vq;
> +    req->elem = elem;

You can keep req on the stack; it will be freed before this function
returns anyway.
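
Something like this (just a sketch; vu_block_req_complete() would then
of course have to drop its g_free(req)):

    VuBlockReq req = {
        .client = client,
        .vq     = vq,
        .elem   = elem,
    };

The rest of the function then passes &req around instead of the heap
pointer.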

> +    if (unlikely(iov_to_buf(out_iov, out_num, 0, &req->out,
> +                            sizeof(req->out)) != sizeof(req->out))) {
> +        error_report("virtio-blk request outhdr too short");
> +        goto err;
> +    }
> +
> +    iov_discard_front(&out_iov, &out_num, sizeof(req->out));
> +
> +    if (in_iov[in_num - 1].iov_len < sizeof(struct virtio_blk_inhdr)) {
> +        error_report("virtio-blk request inhdr too short");
> +        goto err;
> +    }
> +
> +    /* We always touch the last byte, so just see how big in_iov is.  */
> +    req->in = (void *)in_iov[in_num - 1].iov_base
> +              + in_iov[in_num - 1].iov_len
> +              - sizeof(struct virtio_blk_inhdr);
> +    iov_discard_back(in_iov, &in_num, sizeof(struct virtio_blk_inhdr));
> +
> +
> +    type = le32toh(req->out.type);
> +    switch (type & ~VIRTIO_BLK_T_BARRIER) {
> +    case VIRTIO_BLK_T_IN:
> +    case VIRTIO_BLK_T_OUT: {
> +        ssize_t ret = 0;
> +        bool is_write = type & VIRTIO_BLK_T_OUT;
> +        req->sector_num = le64toh(req->out.sector);
> +
> +        int64_t offset = req->sector_num * vdev_blk->blk_size;
> +        QEMUIOVector *qiov = g_new0(QEMUIOVector, 1);
> +        if (is_write) {
> +            qemu_iovec_init_external(qiov, out_iov, out_num);
> +            ret = blk_co_pwritev(backend, offset, qiov->size,
> +                                 qiov, 0);
> +        } else {
> +            qemu_iovec_init_external(qiov, in_iov, in_num);
> +            ret = blk_co_preadv(backend, offset, qiov->size,
> +                                 qiov, 0);
> +        }
> +        aio_wait_kick();

What purpose does this aio_wait_kick() serve? Usually you do this if you
want to signal completion of something to a different thread, but I
don't see that we changed any control state that could be visible to
other threads.

> +        if (ret >= 0) {
> +            req->in->status = VIRTIO_BLK_S_OK;
> +        } else {
> +            req->in->status = VIRTIO_BLK_S_IOERR;
> +        }
> +        vu_block_req_complete(req);
> +        break;
> +    }
> +    case VIRTIO_BLK_T_FLUSH:
> +        vu_block_flush(req);
> +        req->in->status = VIRTIO_BLK_S_OK;
> +        vu_block_req_complete(req);
> +        break;
> +    case VIRTIO_BLK_T_GET_ID: {
> +        size_t size = MIN(iov_size(&elem->in_sg[0], in_num),
> +                          VIRTIO_BLK_ID_BYTES);
> +        snprintf(elem->in_sg[0].iov_base, size, "%s", "vhost_user_blk_server");
> +        req->in->status = VIRTIO_BLK_S_OK;
> +        req->size = elem->in_sg[0].iov_len;
> +        vu_block_req_complete(req);
> +        break;
> +    }
> +    case VIRTIO_BLK_T_DISCARD:
> +    case VIRTIO_BLK_T_WRITE_ZEROES: {
> +        int rc;
> +        rc = vu_block_discard_write_zeroes(req, &elem->out_sg[1],
> +                                           out_num, type);
> +        if (rc == 0) {
> +            req->in->status = VIRTIO_BLK_S_OK;
> +        } else {
> +            req->in->status = VIRTIO_BLK_S_IOERR;
> +        }
> +        vu_block_req_complete(req);
> +        break;
> +    }
> +    default:
> +        req->in->status = VIRTIO_BLK_S_UNSUPP;
> +        vu_block_req_complete(req);
> +        break;
> +    }

You have vu_block_req_complete() as the last thing in every branch. You
could have it just once after the switch block.
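
Something along these lines (sketch only):

    switch (type & ~VIRTIO_BLK_T_BARRIER) {
    case VIRTIO_BLK_T_FLUSH:
        vu_block_flush(req);
        req->in->status = VIRTIO_BLK_S_OK;
        break;
    /* same for the other cases, just without the completion call */
    default:
        req->in->status = VIRTIO_BLK_S_UNSUPP;
        break;
    }

    vu_block_req_complete(req);
    return 0;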

> +    return 0;
> +
> +err:
> +    free(elem);
> +    g_free(req);
> +    return -EINVAL;
> +}

Okay, so vu_block_virtio_process_req() takes a single request from the
virtqueue and processes it. This makes sense as a building block, but it
doesn't allow parallelism yet. I expect to see the creation of multiple
coroutines below that can run in parallel.
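
In other words, roughly something like this as the coroutine entry
point (just a sketch, the helper name is made up):

    /* Processes a single, already popped request in its own coroutine */
    static void coroutine_fn vu_block_virtio_process_req_co(void *opaque)
    {
        VuBlockReq *req = opaque;

        /* everything vu_block_virtio_process_req() does after the pop */
        vu_block_handle_req(req);
    }

with the kick callback popping the requests and starting one such
coroutine per request (see my comment on vu_block_process_vq() below).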

> +
> +static void vu_block_process_vq(VuDev *vu_dev, int idx)
> +{
> +    VuClient *client;
> +    VuVirtq *vq;
> +    int ret;
> +
> +    client = container_of(vu_dev, VuClient, parent);
> +    assert(client);
> +
> +    vq = vu_get_queue(vu_dev, idx);
> +    assert(vq);
> +
> +    while (1) {
> +        ret = vu_block_virtio_process_req(client, vq);

If you call vu_block_virtio_process_req(), then this one needs to be a
coroutine_fn, too.

> +        if (ret) {
> +            break;
> +        }
> +    }
> +}
> +
> +static void vu_block_queue_set_started(VuDev *vu_dev, int idx, bool started)
> +{
> +    VuVirtq *vq;
> +
> +    assert(vu_dev);
> +
> +    vq = vu_get_queue(vu_dev, idx);
> +    vu_set_queue_handler(vu_dev, vq, started ? vu_block_process_vq : NULL);
> +}

If vu_block_process_vq() is used as a vq->handler, will it not be called
outside coroutine context? How can this work when
vu_block_virtio_process_req() is required to run in coroutine context?

Another problem is that vu_block_process_vq() runs a loop as long as
there are requests that aren't completed yet. During this time,
vu_kick_cb() will block. I don't think this is what was intended.
Instead, we should start the requests (without waiting for their
completion) and return.
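
Roughly like this (sketch only, assuming the per-request processing is
split out into a coroutine entry point as sketched above; I'm keeping
the heap-allocated req here just for brevity):

    static void vu_block_process_vq(VuDev *vu_dev, int idx)
    {
        VuClient *client = container_of(vu_dev, VuClient, parent);
        VuVirtq *vq = vu_get_queue(vu_dev, idx);
        VuVirtqElement *elem;

        while ((elem = vu_queue_pop(vu_dev, vq, sizeof(VuVirtqElement) +
                                                sizeof(VuBlockReq)))) {
            VuBlockReq *req = g_new0(VuBlockReq, 1);

            req->client = client;
            req->vq = vq;
            req->elem = elem;

            /* Start the request, but don't wait for its completion */
            Coroutine *co =
                qemu_coroutine_create(vu_block_virtio_process_req_co, req);
            qemu_coroutine_enter(co);
        }
    }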

> diff --git a/vl.c b/vl.c
> index 794f2e5733..0332ab70da 100644
> --- a/vl.c
> +++ b/vl.c
> @@ -2538,6 +2538,10 @@ static bool object_create_initial(const char *type, QemuOpts *opts)
>      }
>  #endif
>  
> +    /* Reason: vhost-user-server property "name" */

"node-name" now.

> +    if (g_str_equal(type, "vhost-user-blk-server")) {
> +        return false;
> +    }
>      /*
>       * Reason: filter-* property "netdev" etc.
>       */

Kevin


