From: Kevin Wolf <kwolf@redhat.com>
To: Coiby Xu <coiby.xu@gmail.com>
Cc: bharatlkmlkvm@gmail.com, qemu-devel@nongnu.org, stefanha@redhat.com
Subject: Re: [PATCH v4 3/5] vhost-user block device backend server
Date: Tue, 25 Feb 2020 17:09:58 +0100	[thread overview]
Message-ID: <20200225160958.GD7632@linux.fritz.box> (raw)
In-Reply-To: <20200218050711.8133-4-coiby.xu@gmail.com>

Am 18.02.2020 um 06:07 hat Coiby Xu geschrieben:
> By making use of libvhost, multiple block device drives can be exported
> and each drive can serve multiple clients simultaneously.
> Since vhost-user-server needs a block drive to be created first, delay
> the creation of this object.
> 
> Signed-off-by: Coiby Xu <coiby.xu@gmail.com>
> ---
>  Makefile.target                  |   1 +
>  backends/Makefile.objs           |   2 +
>  backends/vhost-user-blk-server.c | 718 +++++++++++++++++++++++++++++++
>  backends/vhost-user-blk-server.h |  21 +
>  vl.c                             |   4 +
>  5 files changed, 746 insertions(+)
>  create mode 100644 backends/vhost-user-blk-server.c
>  create mode 100644 backends/vhost-user-blk-server.h
> 
> diff --git a/Makefile.target b/Makefile.target
> index 6e61f607b1..8c6c01eb3a 100644
> --- a/Makefile.target
> +++ b/Makefile.target
> @@ -159,6 +159,7 @@ obj-y += monitor/
>  obj-y += qapi/
>  obj-y += memory.o
>  obj-y += memory_mapping.o
> +obj-$(CONFIG_LINUX) += ../contrib/libvhost-user/libvhost-user.o
>  obj-y += migration/ram.o
>  LIBS := $(libs_softmmu) $(LIBS)
>  
> diff --git a/backends/Makefile.objs b/backends/Makefile.objs
> index 28a847cd57..4e7be731e0 100644
> --- a/backends/Makefile.objs
> +++ b/backends/Makefile.objs
> @@ -14,6 +14,8 @@ common-obj-y += cryptodev-vhost.o
>  common-obj-$(CONFIG_VHOST_CRYPTO) += cryptodev-vhost-user.o
>  endif
>  
> +common-obj-$(CONFIG_LINUX) += vhost-user-blk-server.o
> +
>  common-obj-$(call land,$(CONFIG_VHOST_USER),$(CONFIG_VIRTIO)) += vhost-user.o
>  
>  common-obj-$(CONFIG_LINUX) += hostmem-memfd.o
> diff --git a/backends/vhost-user-blk-server.c b/backends/vhost-user-blk-server.c
> new file mode 100644
> index 0000000000..1bf7f7b544
> --- /dev/null
> +++ b/backends/vhost-user-blk-server.c

Please move this to block/export/vhost-user-blk.c. This will
automatically have it covered as a block layer component in MAINTAINERS,
and it will provide a place where other exports can be put, too.

> @@ -0,0 +1,718 @@
> +/*
> + * Sharing QEMU block devices via vhost-user protocol
> + *
> + * Author: Coiby Xu <coiby.xu@gmail.com>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2 or
> + * later.  See the COPYING file in the top-level directory.
> + */
> +#include "qemu/osdep.h"
> +#include "block/block.h"
> +#include "vhost-user-blk-server.h"
> +#include "qapi/error.h"
> +#include "qom/object_interfaces.h"
> +#include "sysemu/block-backend.h"
> +
> +enum {
> +    VHOST_USER_BLK_MAX_QUEUES = 1,
> +};
> +struct virtio_blk_inhdr {
> +    unsigned char status;
> +};
> +
> +static QTAILQ_HEAD(, VuBlockDev) vu_block_devs =
> +                                 QTAILQ_HEAD_INITIALIZER(vu_block_devs);
> +
> +
> +typedef struct VuBlockReq {
> +    VuVirtqElement *elem;
> +    int64_t sector_num;
> +    size_t size;
> +    struct virtio_blk_inhdr *in;
> +    struct virtio_blk_outhdr out;
> +    VuClient *client;
> +    struct VuVirtq *vq;
> +} VuBlockReq;
> +
> +
> +static void vu_block_req_complete(VuBlockReq *req)
> +{
> +    VuDev *vu_dev = &req->client->parent;
> +
> +    /* IO size with 1 extra status byte */
> +    vu_queue_push(vu_dev, req->vq, req->elem, req->size + 1);
> +    vu_queue_notify(vu_dev, req->vq);
> +
> +    if (req->elem) {
> +        free(req->elem);
> +    }
> +
> +    g_free(req);
> +}
> +
> +static VuBlockDev *get_vu_block_device_by_client(VuClient *client)
> +{
> +    return container_of(client->server->ptr_in_device, VuBlockDev, vu_server);
> +}
> +
> +static int coroutine_fn
> +vu_block_discard_write_zeroes(VuBlockReq *req, struct iovec *iov,
> +                              uint32_t iovcnt, uint32_t type)
> +{
> +    struct virtio_blk_discard_write_zeroes desc;
> +    ssize_t size = iov_to_buf(iov, iovcnt, 0, &desc, sizeof(desc));
> +    if (unlikely(size != sizeof(desc))) {
> +        error_report("Invalid size %ld, expect %ld", size, sizeof(desc));
> +        return -EINVAL;
> +    }
> +
> +    VuBlockDev *vdev_blk = get_vu_block_device_by_client(req->client);
> +    uint64_t range[2] = { le64toh(desc.sector) << 9,
> +                          le32toh(desc.num_sectors) << 9 };
> +    if (type == VIRTIO_BLK_T_DISCARD) {
> +        if (blk_co_pdiscard(vdev_blk->backend, range[0], range[1]) == 0) {
> +            return 0;
> +        }
> +    } else if (type == VIRTIO_BLK_T_WRITE_ZEROES) {
> +        if (blk_co_pwrite_zeroes(vdev_blk->backend,
> +                                 range[0], range[1], 0) == 0) {
> +            return 0;
> +        }
> +    }
> +
> +    return -EINVAL;
> +}
> +
> +
> +static void coroutine_fn vu_block_flush(VuBlockReq *req)
> +{
> +    VuBlockDev *vdev_blk = get_vu_block_device_by_client(req->client);
> +    BlockBackend *backend = vdev_blk->backend;
> +    blk_co_flush(backend);
> +}
> +
> +
> +static int coroutine_fn vu_block_virtio_process_req(VuClient *client,
> +                                                    VuVirtq *vq)
> +{
> +    VuDev *vu_dev = &client->parent;
> +    VuVirtqElement *elem;
> +    uint32_t type;
> +    VuBlockReq *req;
> +
> +    VuBlockDev *vdev_blk = get_vu_block_device_by_client(client);
> +    BlockBackend *backend = vdev_blk->backend;
> +    elem = vu_queue_pop(vu_dev, vq, sizeof(VuVirtqElement) +
> +                                    sizeof(VuBlockReq));
> +    if (!elem) {
> +        return -1;
> +    }
> +
> +    struct iovec *in_iov = elem->in_sg;
> +    struct iovec *out_iov = elem->out_sg;
> +    unsigned in_num = elem->in_num;
> +    unsigned out_num = elem->out_num;
> +    /* refer to hw/block/virtio_blk.c */
> +    if (elem->out_num < 1 || elem->in_num < 1) {
> +        error_report("virtio-blk request missing headers");
> +        free(elem);
> +        return -EINVAL;
> +    }
> +
> +    req = g_new0(VuBlockReq, 1);
> +    req->client = client;
> +    req->vq = vq;
> +    req->elem = elem;

You can keep req on the stack; it will be freed before this function
returns anyway.
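
Something like this (just a sketch; vu_block_req_complete() would then
of course have to drop its g_free(req)):

    VuBlockReq req = {
        .client = client,
        .vq     = vq,
        .elem   = elem,
    };

The rest of the function then passes &req around instead of the heap
pointer.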

> +    if (unlikely(iov_to_buf(out_iov, out_num, 0, &req->out,
> +                            sizeof(req->out)) != sizeof(req->out))) {
> +        error_report("virtio-blk request outhdr too short");
> +        goto err;
> +    }
> +
> +    iov_discard_front(&out_iov, &out_num, sizeof(req->out));
> +
> +    if (in_iov[in_num - 1].iov_len < sizeof(struct virtio_blk_inhdr)) {
> +        error_report("virtio-blk request inhdr too short");
> +        goto err;
> +    }
> +
> +    /* We always touch the last byte, so just see how big in_iov is.  */
> +    req->in = (void *)in_iov[in_num - 1].iov_base
> +              + in_iov[in_num - 1].iov_len
> +              - sizeof(struct virtio_blk_inhdr);
> +    iov_discard_back(in_iov, &in_num, sizeof(struct virtio_blk_inhdr));
> +
> +
> +    type = le32toh(req->out.type);
> +    switch (type & ~VIRTIO_BLK_T_BARRIER) {
> +    case VIRTIO_BLK_T_IN:
> +    case VIRTIO_BLK_T_OUT: {
> +        ssize_t ret = 0;
> +        bool is_write = type & VIRTIO_BLK_T_OUT;
> +        req->sector_num = le64toh(req->out.sector);
> +
> +        int64_t offset = req->sector_num * vdev_blk->blk_size;
> +        QEMUIOVector *qiov = g_new0(QEMUIOVector, 1);
> +        if (is_write) {
> +            qemu_iovec_init_external(qiov, out_iov, out_num);
> +            ret = blk_co_pwritev(backend, offset, qiov->size,
> +                                 qiov, 0);
> +        } else {
> +            qemu_iovec_init_external(qiov, in_iov, in_num);
> +            ret = blk_co_preadv(backend, offset, qiov->size,
> +                                 qiov, 0);
> +        }
> +        aio_wait_kick();

What purpose does this aio_wait_kick() serve? Usually you do this if you
want to signal completion of something to a different thread, but I
don't see that we changed any control state that could be visible to
other threads.

> +        if (ret >= 0) {
> +            req->in->status = VIRTIO_BLK_S_OK;
> +        } else {
> +            req->in->status = VIRTIO_BLK_S_IOERR;
> +        }
> +        vu_block_req_complete(req);
> +        break;
> +    }
> +    case VIRTIO_BLK_T_FLUSH:
> +        vu_block_flush(req);
> +        req->in->status = VIRTIO_BLK_S_OK;
> +        vu_block_req_complete(req);
> +        break;
> +    case VIRTIO_BLK_T_GET_ID: {
> +        size_t size = MIN(iov_size(&elem->in_sg[0], in_num),
> +                          VIRTIO_BLK_ID_BYTES);
> +        snprintf(elem->in_sg[0].iov_base, size, "%s", "vhost_user_blk_server");
> +        req->in->status = VIRTIO_BLK_S_OK;
> +        req->size = elem->in_sg[0].iov_len;
> +        vu_block_req_complete(req);
> +        break;
> +    }
> +    case VIRTIO_BLK_T_DISCARD:
> +    case VIRTIO_BLK_T_WRITE_ZEROES: {
> +        int rc;
> +        rc = vu_block_discard_write_zeroes(req, &elem->out_sg[1],
> +                                           out_num, type);
> +        if (rc == 0) {
> +            req->in->status = VIRTIO_BLK_S_OK;
> +        } else {
> +            req->in->status = VIRTIO_BLK_S_IOERR;
> +        }
> +        vu_block_req_complete(req);
> +        break;
> +    }
> +    default:
> +        req->in->status = VIRTIO_BLK_S_UNSUPP;
> +        vu_block_req_complete(req);
> +        break;
> +    }

You have vu_block_req_complete() as the last thing in every branch. You
could have it just once after the switch block.
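
Something along these lines (sketch only):

    switch (type & ~VIRTIO_BLK_T_BARRIER) {
    case VIRTIO_BLK_T_FLUSH:
        vu_block_flush(req);
        req->in->status = VIRTIO_BLK_S_OK;
        break;
    /* same for the other cases, just without the completion call */
    default:
        req->in->status = VIRTIO_BLK_S_UNSUPP;
        break;
    }

    vu_block_req_complete(req);
    return 0;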

> +    return 0;
> +
> +err:
> +    free(elem);
> +    g_free(req);
> +    return -EINVAL;
> +}

Okay, so vu_block_virtio_process_req() takes a single request from the
virtqueue and processes it. This makes sense as a building block, but it
doesn't allow parallelism yet. I expect to see the creation of multiple
coroutines below that can run in parallel.
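
In other words, roughly something like this as the coroutine entry
point (just a sketch, the helper name is made up):

    /* Processes a single, already popped request in its own coroutine */
    static void coroutine_fn vu_block_virtio_process_req_co(void *opaque)
    {
        VuBlockReq *req = opaque;

        /* everything vu_block_virtio_process_req() does after the pop */
        vu_block_handle_req(req);
    }

with the kick callback popping the requests and starting one such
coroutine per request (see my comment on vu_block_process_vq() below).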

> +
> +static void vu_block_process_vq(VuDev *vu_dev, int idx)
> +{
> +    VuClient *client;
> +    VuVirtq *vq;
> +    int ret;
> +
> +    client = container_of(vu_dev, VuClient, parent);
> +    assert(client);
> +
> +    vq = vu_get_queue(vu_dev, idx);
> +    assert(vq);
> +
> +    while (1) {
> +        ret = vu_block_virtio_process_req(client, vq);

If you call vu_block_virtio_process_req(), then this one needs to be a
coroutine_fn, too.

> +        if (ret) {
> +            break;
> +        }
> +    }
> +}
> +
> +static void vu_block_queue_set_started(VuDev *vu_dev, int idx, bool started)
> +{
> +    VuVirtq *vq;
> +
> +    assert(vu_dev);
> +
> +    vq = vu_get_queue(vu_dev, idx);
> +    vu_set_queue_handler(vu_dev, vq, started ? vu_block_process_vq : NULL);
> +}

If vu_block_process_vq() is used as a vq->handler, will it not be called
outside coroutine context? How can this work when
vu_block_virtio_process_req() is required to run in coroutine context?

Another problem is that vu_block_process_vq() runs a loop as long as
there are requests that aren't completed yet. During this time,
vu_kick_cb() will block. I don't think this is what was intended.
Instead, we should start the requests (without waiting for their
completion) and return.
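
Roughly like this (sketch only, assuming the per-request processing is
split out into a coroutine entry point as sketched above; I'm keeping
the heap-allocated req here just for brevity):

    static void vu_block_process_vq(VuDev *vu_dev, int idx)
    {
        VuClient *client = container_of(vu_dev, VuClient, parent);
        VuVirtq *vq = vu_get_queue(vu_dev, idx);
        VuVirtqElement *elem;

        while ((elem = vu_queue_pop(vu_dev, vq, sizeof(VuVirtqElement) +
                                                sizeof(VuBlockReq)))) {
            VuBlockReq *req = g_new0(VuBlockReq, 1);

            req->client = client;
            req->vq = vq;
            req->elem = elem;

            /* Start the request, but don't wait for its completion */
            Coroutine *co =
                qemu_coroutine_create(vu_block_virtio_process_req_co, req);
            qemu_coroutine_enter(co);
        }
    }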

> diff --git a/vl.c b/vl.c
> index 794f2e5733..0332ab70da 100644
> --- a/vl.c
> +++ b/vl.c
> @@ -2538,6 +2538,10 @@ static bool object_create_initial(const char *type, QemuOpts *opts)
>      }
>  #endif
>  
> +    /* Reason: vhost-user-server property "name" */

"node-name" now.

> +    if (g_str_equal(type, "vhost-user-blk-server")) {
> +        return false;
> +    }
>      /*
>       * Reason: filter-* property "netdev" etc.
>       */

Kevin


