qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v2 0/5] vhost-user block device backend implementation
@ 2020-01-14 14:06 Coiby Xu
  2020-01-14 14:06 ` [PATCH v2 1/5] vhost-user block device backend Coiby Xu
                   ` (4 more replies)
  0 siblings, 5 replies; 19+ messages in thread
From: Coiby Xu @ 2020-01-14 14:06 UTC (permalink / raw)
  To: qemu-devel; +Cc: kwolf, bharatlkmlkvm, Coiby Xu, stefanha

v2:
 * Only enable this feature on Linux, because eventfd is a Linux-specific
   feature

This patch series implements a vhost-user-backend server. Thanks to Stefan for his guidance after reviewing the draft version, and to the instructions on https://wiki.qemu.org/Google_Summer_of_Code_2019#vhost-user-blk_device_backend.

Vhost-user-backend server is a UserCreatable object and can be started using object_add,

 (qemu) object_add vhost-user-server,id=ID,unix_socket=/tmp/vhost-user-blk_vhost.socket,name=DRIVE_NAME,writable=off
 (qemu) object_del ID

or appending the "-object" option when starting QEMU,

  $ -object vhost-user-server,id=disk,unix_socket=/tmp/vhost-user-blk_vhost.socket,name=disk,writable=off

Then vhost-user client can connect to the server backend. For example, QEMU could act as a client,
  $ -m 256 -object memory-backend-memfd,id=mem,size=256M,share=on -numa node,memdev=mem -chardev socket,id=char1,path=/tmp/vhost-user-blk_vhost.socket -device vhost-user-blk-pci,id=blk0,chardev=char1

The guest OS can then access this vhost-user block device after mounting it.

patches are against commit 035eed4c0d257c905a556fa0f4865a0c077b4e7f.

Coiby Xu (5):
  vhost-user block device backend
  extend libvhost to support IOThread
  a stand-alone tool to directly share disk image file via vhost-user
    protocol
  new qTest case for the vhost-user-blk device backend
  building configuration files changes

 Makefile                              |    1 +
 Makefile.objs                         |    2 +-
 Makefile.target                       |    1 +
 blockdev-vu.c                         | 1008 +++++++++++++++++++++++++
 configure                             |    2 +-
 contrib/libvhost-user/libvhost-user.c |   64 +-
 contrib/libvhost-user/libvhost-user.h |   36 +-
 include/block/vhost-user.h            |   46 ++
 qemu-vu.c                             |  264 +++++++
 tests/Makefile.include                |    5 +-
 tests/libqos/vhost-user-blk.c         |  125 +++
 tests/libqos/vhost-user-blk.h         |   44 ++
 tests/vhost-user-blk-test.c           |  691 +++++++++++++++++
 vl.c                                  |    4 +
 14 files changed, 2277 insertions(+), 16 deletions(-)
 create mode 100644 blockdev-vu.c
 create mode 100644 include/block/vhost-user.h
 create mode 100644 qemu-vu.c
 create mode 100644 tests/libqos/vhost-user-blk.c
 create mode 100644 tests/libqos/vhost-user-blk.h
 create mode 100644 tests/vhost-user-blk-test.c

--
2.24.1



^ permalink raw reply	[flat|nested] 19+ messages in thread

* [PATCH v2 1/5] vhost-user block device backend
  2020-01-14 14:06 [PATCH v2 0/5] vhost-user block device backend implementation Coiby Xu
@ 2020-01-14 14:06 ` Coiby Xu
  2020-01-16 13:51   ` Stefan Hajnoczi
  2020-01-16 13:56   ` Kevin Wolf
  2020-01-14 14:06 ` [PATCH v2 2/5] extend libvhost to support IOThread Coiby Xu
                   ` (3 subsequent siblings)
  4 siblings, 2 replies; 19+ messages in thread
From: Coiby Xu @ 2020-01-14 14:06 UTC (permalink / raw)
  To: qemu-devel; +Cc: kwolf, bharatlkmlkvm, Coiby Xu, stefanha

By making use of libvhost, multiple block device drives can be exported and each drive can serve multiple clients simultaneously. Since vhost-user-server needs a block drive to be created first, delay the creation of this object.

Signed-off-by: Coiby Xu <coiby.xu@gmail.com>
---
 blockdev-vu.c              | 1008 ++++++++++++++++++++++++++++++++++++
 include/block/vhost-user.h |   46 ++
 vl.c                       |    4 +
 3 files changed, 1058 insertions(+)
 create mode 100644 blockdev-vu.c
 create mode 100644 include/block/vhost-user.h

diff --git a/blockdev-vu.c b/blockdev-vu.c
new file mode 100644
index 0000000000..45f0bb43a7
--- /dev/null
+++ b/blockdev-vu.c
@@ -0,0 +1,1008 @@
+#include "qemu/osdep.h"
+#include "block/vhost-user.h"
+#include "qapi/error.h"
+#include "qapi/qapi-types-sockets.h"
+#include "qapi/qapi-commands-block.h"
+
+#include "sysemu/block-backend.h"
+#include "qemu/main-loop.h"
+
+#include "qemu/units.h"
+
+#include "block/block.h"
+
+#include "qom/object_interfaces.h"
+
+#include <sys/eventfd.h>
+
+#include "hw/qdev-properties.h"
+/* Queue limit handed to vu_init_packed_data() for every accepted client. */
+enum {
+    VHOST_USER_BLK_MAX_QUEUES = 8,
+};
+
+/*
+ * Status trailer of a virtio-blk request: the request handler points this
+ * at the tail of the last device-writable segment and stores a
+ * VIRTIO_BLK_S_* code in it.
+ */
+struct virtio_blk_inhdr {
+    unsigned char status;
+};
+
+
+/* List of vhost-user block servers; entries are added once a server starts. */
+static QTAILQ_HEAD(, VubDev) vub_devs = QTAILQ_HEAD_INITIALIZER(vub_devs);
+
+
+/* Per-request state, allocated when a request is popped from a virtqueue. */
+typedef struct VubReq {
+    /* Descriptor chain popped from the queue; freed on completion. */
+    VuVirtqElement *elem;
+    /* Starting sector of a read/write, in 512-byte units. */
+    int64_t sector_num;
+    /* Payload byte count; the status byte is added when pushing. */
+    size_t size;
+    /* Points at the status byte inside the guest-provided buffer. */
+    struct virtio_blk_inhdr *in;
+    /* Copy of the request header read from the driver. */
+    struct virtio_blk_outhdr out;
+    /* Client connection and virtqueue the request arrived on. */
+    VuClient *client;
+    struct VuVirtq *vq;
+} VubReq;
+
+/*
+ * libvhost-user "remove watch" callback: register an all-NULL handler set
+ * for @fd in the AioContext of the export that owns @vu_dev.
+ */
+static void
+remove_watch(VuDev *vu_dev, int fd)
+{
+    g_assert(vu_dev);
+    g_assert(fd >= 0);
+
+    VuClient *owner = container_of(vu_dev, VuClient, parent);
+    AioContext *aio_ctx = owner->blk->ctx;
+
+    aio_set_fd_handler(aio_ctx, fd, false, NULL, NULL, NULL, NULL);
+}
+
+/*
+ * Shut down one client connection: de-initialise its libvhost-user state,
+ * drop the reference held through client->ioc and mark the client closed.
+ *
+ * NOTE(review): client->sioc is only cleared, not unreferenced, although
+ * vub_accept() takes references through both pointers.  Confirm whether the
+ * remaining reference is intentional before balancing it: the client
+ * coroutine may still be waiting on the channel when this runs.
+ */
+static void close_client(VuClient *client)
+{
+    vu_deinit(&client->parent);
+    /** g_source_destroy(vub_device->parent.src); */
+    client->sioc = NULL;
+    object_unref(OBJECT(client->ioc));
+    client->closed = true;
+
+}
+
+/*
+ * Panic callback given to libvhost-user.  Logs the reason when one is
+ * supplied, flags the export for closing if exit_panic is set, and closes
+ * the client unless that has already happened.
+ */
+static void vub_panic_cb(VuDev *vu_dev, const char *buf)
+{
+    VuClient *client = container_of(vu_dev, VuClient, parent);
+
+    if (buf != NULL) {
+        g_warning("vu_panic: %s", buf);
+    }
+
+    VubDev *vub = client->blk;
+    if (vub->exit_panic) {
+        vub->close = true;
+    }
+
+    if (client->closed) {
+        return;
+    }
+    close_client(client);
+}
+
+
+/*
+ * Hand a finished request back to the driver and release it.
+ *
+ * The length pushed to the used ring is the payload size plus the one-byte
+ * status trailer.  Both the virtqueue element and @req itself are freed, so
+ * @req must not be used afterwards.
+ */
+static void vub_req_complete(VubReq *req)
+{
+    VuDev *vu_dev = &req->client->parent;
+
+    /* IO size with 1 extra status byte */
+    vu_queue_push(vu_dev, req->vq, req->elem,
+                  req->size + 1);
+    vu_queue_notify(vu_dev, req->vq);
+
+    /*
+     * The element was already handed to vu_queue_push() above, so a NULL
+     * check here would be dead code; free(NULL) is a no-op anyway.
+     */
+    free(req->elem);
+    g_free(req);
+}
+
+
+
+/*
+ * Execute a VIRTIO_BLK_T_DISCARD or VIRTIO_BLK_T_WRITE_ZEROES request.
+ *
+ * @iov/@iovcnt must describe exactly one
+ * struct virtio_blk_discard_write_zeroes.  @type selects the operation.
+ *
+ * Returns 0 on success; -1 if the descriptor has the wrong size, the range
+ * does not fit the block layer's int byte count, the operation fails, or
+ * support is compiled out.
+ */
+static int
+vub_discard_write_zeroes(VubReq *req, struct iovec *iov, uint32_t iovcnt,
+                         uint32_t type)
+{
+    struct virtio_blk_discard_write_zeroes desc;
+    size_t size;
+
+    size = iov_size(iov, iovcnt);
+    if (size != sizeof(desc)) {
+        fprintf(stderr, "Invalid size %zu, expect %zu\n", size, sizeof(desc));
+        return -1;
+    }
+
+    /* The descriptor is small and fixed-size: copy it onto the stack. */
+    iov_to_buf_full(iov, iovcnt, 0, &desc, sizeof(desc));
+
+    #if defined(__linux__) && defined(BLKDISCARD) && defined(BLKZEROOUT)
+    BlockBackend *backend = req->client->blk->backend;
+    uint64_t offset = le64toh(desc.sector) << 9;
+    /* Widen before shifting so a large sector count cannot wrap in 32 bits. */
+    uint64_t bytes = (uint64_t)le32toh(desc.num_sectors) << 9;
+
+    if (bytes > INT_MAX) {
+        /* blk_pdiscard()/blk_pwrite_zeroes() take the length as an int. */
+        return -1;
+    }
+
+    if (type == VIRTIO_BLK_T_DISCARD) {
+        if (blk_pdiscard(backend, offset, bytes) == 0) {
+            return 0;
+        }
+    } else if (type == VIRTIO_BLK_T_WRITE_ZEROES) {
+        if (blk_pwrite_zeroes(backend, offset, bytes, 0) == 0) {
+            return 0;
+        }
+    }
+    #endif
+
+    return -1;
+}
+
+
+/*
+ * Flush the block backend of the export that @req belongs to.
+ *
+ * Requests are processed from the virtqueue kick handler, which is a plain
+ * fd callback rather than a coroutine, so the synchronous blk_flush()
+ * wrapper is used instead of the coroutine-only blk_co_flush().  A failure
+ * is logged; the signature is kept void for the existing caller.
+ */
+static void
+vub_flush(VubReq *req)
+{
+    VuClient *client = req->client;
+    int ret = blk_flush(client->blk->backend);
+
+    if (ret < 0) {
+        fprintf(stderr, "%s: flush failed with %s\n",
+                client->blk->name, strerror(-ret));
+    }
+}
+
+
+/* Sentinel stored in BlkRwCo.ret until the coroutine has produced a result. */
+#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
+/* Arguments and result slot shared between blk_prw() and its coroutine. */
+typedef struct BlkRwCo {
+    BlockBackend *blk;
+    int64_t offset;
+    /* Points at the QEMUIOVector describing the transfer. */
+    void *iobuf;
+    /* NOT_DONE while in flight, then the blk_co_preadv/pwritev result. */
+    int ret;
+    BdrvRequestFlags flags;
+} BlkRwCo;
+
+/*
+ * Coroutine entry point for reads: run blk_co_preadv() with the parameters
+ * in the BlkRwCo passed as @opaque, store the result and kick any waiter.
+ */
+static void blk_read_entry(void *opaque)
+{
+    BlkRwCo *op = opaque;
+    QEMUIOVector *vec = op->iobuf;
+    int status;
+
+    status = blk_co_preadv(op->blk, op->offset, vec->size, vec, op->flags);
+    op->ret = status;
+    aio_wait_kick();
+}
+
+
+/*
+ * Coroutine entry point for writes: run blk_co_pwritev() with the
+ * parameters in the BlkRwCo passed as @opaque, store the result and kick
+ * any waiter.
+ */
+static void blk_write_entry(void *opaque)
+{
+    BlkRwCo *op = opaque;
+    QEMUIOVector *vec = op->iobuf;
+    int status;
+
+    status = blk_co_pwritev(op->blk, op->offset, vec->size, vec, op->flags);
+    op->ret = status;
+    aio_wait_kick();
+}
+
+
+/*
+ * Run @co_entry over @qiov at @offset and wait for its result.
+ *
+ * Inside a coroutine the entry function is called directly; otherwise a new
+ * coroutine is created, entered and polled until it replaces NOT_DONE with
+ * a result.  Returns whatever the entry function stored in BlkRwCo.ret.
+ */
+static int blk_prw(BlockBackend *blk, QEMUIOVector *qiov, int64_t offset,
+                   CoroutineEntry co_entry, BdrvRequestFlags flags)
+{
+    BlkRwCo state = {
+        .ret    = NOT_DONE,
+        .blk    = blk,
+        .iobuf  = qiov,
+        .offset = offset,
+        .flags  = flags,
+    };
+
+    if (qemu_in_coroutine()) {
+        /* Already in coroutine context: no need to spawn one. */
+        co_entry(&state);
+        return state.ret;
+    }
+
+    Coroutine *worker = qemu_coroutine_create(co_entry, &state);
+    bdrv_coroutine_enter(blk_bs(blk), worker);
+    BDRV_POLL_WHILE(blk_bs(blk), state.ret == NOT_DONE);
+
+    return state.ret;
+}
+
+
+/*
+ * Read or write the data segments of @req.
+ *
+ * @iov/@iovcnt are the data segments with the virtio-blk headers already
+ * stripped; @co_entry is blk_read_entry or blk_write_entry.  req->size is
+ * set to the total segment length.
+ *
+ * Returns the non-negative block-layer result on success and -1 on an
+ * empty iovec or an I/O error.
+ */
+static ssize_t
+vub_rwv(VubReq *req, struct iovec *iov,
+        uint32_t iovcnt,
+        CoroutineEntry co_entry)
+{
+    VuClient *client = req->client;
+    /*
+     * blk_prw() only returns once the transfer has finished, so the vector
+     * can live on the stack instead of being heap-allocated (and leaked).
+     */
+    QEMUIOVector qiov;
+    int64_t offset;
+    int rc;
+
+    if (!iovcnt) {
+        fprintf(stderr, "Invalid Read/Write IOV count\n");
+        return -1;
+    }
+
+    offset = req->sector_num * 512;
+    qemu_iovec_init_external(&qiov, iov, iovcnt);
+    rc = blk_prw(client->blk->backend, &qiov, offset, co_entry, 0);
+
+    req->size = iov_size(iov, iovcnt);
+    if (rc < 0) {
+        /* The block layer reports -errno in rc; errno itself is unrelated. */
+        fprintf(stderr, "%s, Sector %" PRId64 ", Size %zu failed with %s\n",
+                client->blk->name, req->sector_num, req->size,
+                strerror(-rc));
+        return -1;
+    }
+
+    return rc;
+}
+
+/*
+ * Pop one request from @vq, execute it synchronously and complete it.
+ *
+ * Returns 0 when a request was popped and completed (I/O failures are
+ * reported to the driver through the status byte) and -1 when the queue is
+ * empty or the request headers are malformed.
+ */
+static int vub_virtio_process_req(VuClient *client,
+                                     VuVirtq *vq)
+{
+    VuDev *vu_dev = &client->parent;
+    VuVirtqElement *elem;
+    uint32_t type;
+    VubReq *req;
+
+    elem = vu_queue_pop(vu_dev, vq, sizeof(VuVirtqElement) + sizeof(VubReq));
+    if (!elem) {
+        return -1;
+    }
+
+    struct iovec *in_iov = elem->in_sg;
+    struct iovec *out_iov = elem->out_sg;
+    unsigned in_num = elem->in_num;
+    unsigned out_num = elem->out_num;
+    /* refer to hw/block/virtio_blk.c */
+    if (elem->out_num < 1 || elem->in_num < 1) {
+        fprintf(stderr, "virtio-blk request missing headers\n");
+        free(elem);
+        return -1;
+    }
+
+    req = g_new0(VubReq, 1);
+    req->client = client;
+    req->vq = vq;
+    req->elem = elem;
+
+    if (unlikely(iov_to_buf(out_iov, out_num, 0, &req->out,
+                            sizeof(req->out)) != sizeof(req->out))) {
+        fprintf(stderr, "virtio-blk request outhdr too short\n");
+        goto err;
+    }
+
+    /* From here on out_iov/out_num describe only the data after the header */
+    iov_discard_front(&out_iov, &out_num, sizeof(req->out));
+
+    if (in_iov[in_num - 1].iov_len < sizeof(struct virtio_blk_inhdr)) {
+        fprintf(stderr, "virtio-blk request inhdr too short\n");
+        goto err;
+    }
+
+    /* We always touch the last byte, so just see how big in_iov is.  */
+    req->in = (void *)in_iov[in_num - 1].iov_base
+              + in_iov[in_num - 1].iov_len
+              - sizeof(struct virtio_blk_inhdr);
+    iov_discard_back(in_iov, &in_num, sizeof(struct virtio_blk_inhdr));
+
+
+    type = le32toh(req->out.type);
+    switch (type & ~VIRTIO_BLK_T_BARRIER) {
+    case VIRTIO_BLK_T_IN:
+    case VIRTIO_BLK_T_OUT: {
+        ssize_t ret = 0;
+        bool is_write = type & VIRTIO_BLK_T_OUT;
+        req->sector_num = le64toh(req->out.sector);
+        if (is_write && !client->blk->writable) {
+            /* The export was opened without write permission. */
+            ret = -1;
+        } else if (is_write) {
+            ret = vub_rwv(req, out_iov, out_num, blk_write_entry);
+        } else {
+            ret = vub_rwv(req, in_iov, in_num, blk_read_entry);
+        }
+        if (ret >= 0) {
+            req->in->status = VIRTIO_BLK_S_OK;
+        } else {
+            req->in->status = VIRTIO_BLK_S_IOERR;
+        }
+        vub_req_complete(req);
+        break;
+    }
+    case VIRTIO_BLK_T_FLUSH:
+        vub_flush(req);
+        req->in->status = VIRTIO_BLK_S_OK;
+        vub_req_complete(req);
+        break;
+    case VIRTIO_BLK_T_GET_ID: {
+        if (in_num > 0) {
+            /* Never write past the first segment, whatever the total is. */
+            size_t size = MIN(in_iov[0].iov_len, VIRTIO_BLK_ID_BYTES);
+            snprintf(in_iov[0].iov_base, size, "%s", "vhost_user_blk");
+            req->size = in_iov[0].iov_len;
+        } else {
+            /* Only the status byte was supplied: there is no ID buffer. */
+            req->size = 0;
+        }
+        req->in->status = VIRTIO_BLK_S_OK;
+        vub_req_complete(req);
+        break;
+    }
+    case VIRTIO_BLK_T_DISCARD:
+    case VIRTIO_BLK_T_WRITE_ZEROES: {
+        int rc = -1;
+        /* Both operations modify the image, so refuse them when read-only. */
+        if (client->blk->writable) {
+            rc = vub_discard_write_zeroes(req, out_iov, out_num, type);
+        }
+        if (rc == 0) {
+            req->in->status = VIRTIO_BLK_S_OK;
+        } else {
+            req->in->status = VIRTIO_BLK_S_IOERR;
+        }
+        vub_req_complete(req);
+        break;
+    }
+    default:
+        req->in->status = VIRTIO_BLK_S_UNSUPP;
+        vub_req_complete(req);
+        break;
+    }
+
+    return 0;
+
+err:
+    free(elem);
+    g_free(req);
+    return -1;
+}
+
+
+/*
+ * Virtqueue handler: keep processing requests from queue @idx until
+ * vub_virtio_process_req() returns non-zero.
+ */
+static void vub_process_vq(VuDev *vu_dev, int idx)
+{
+    VuClient *client = container_of(vu_dev, VuClient, parent);
+    assert(client);
+
+    VuVirtq *vq = vu_get_queue(vu_dev, idx);
+    assert(vq);
+
+    while (vub_virtio_process_req(client, vq) == 0) {
+        continue;
+    }
+}
+
+
+/*
+ * libvhost-user callback invoked when queue @idx is started or stopped:
+ * install vub_process_vq as its handler on start, clear it on stop.
+ */
+static void vub_queue_set_started(VuDev *vu_dev, int idx, bool started)
+{
+    assert(vu_dev);
+
+    VuVirtq *queue = vu_get_queue(vu_dev, idx);
+
+    if (started) {
+        vu_set_queue_handler(vu_dev, queue, vub_process_vq);
+    } else {
+        vu_set_queue_handler(vu_dev, queue, NULL);
+    }
+}
+
+/*
+ * Report the feature bits offered to the driver.  VIRTIO_BLK_F_RO is added
+ * when the export is not writable; discard and write-zeroes are offered
+ * only when the build supports them.
+ */
+static uint64_t
+vub_get_features(VuDev *dev)
+{
+    VuClient *client = container_of(dev, VuClient, parent);
+    uint64_t features = 0;
+
+    features |= 1ull << VIRTIO_BLK_F_SIZE_MAX;
+    features |= 1ull << VIRTIO_BLK_F_SEG_MAX;
+    features |= 1ull << VIRTIO_BLK_F_TOPOLOGY;
+    features |= 1ull << VIRTIO_BLK_F_BLK_SIZE;
+    features |= 1ull << VIRTIO_BLK_F_FLUSH;
+    #if defined(__linux__) && defined(BLKDISCARD) && defined(BLKZEROOUT)
+    features |= 1ull << VIRTIO_BLK_F_DISCARD;
+    features |= 1ull << VIRTIO_BLK_F_WRITE_ZEROES;
+    #endif
+    features |= 1ull << VIRTIO_BLK_F_CONFIG_WCE;
+    features |= 1ull << VIRTIO_F_VERSION_1;
+    features |= 1ull << VIRTIO_RING_F_INDIRECT_DESC;
+    features |= 1ull << VIRTIO_RING_F_EVENT_IDX;
+    features |= 1ull << VHOST_USER_F_PROTOCOL_FEATURES;
+
+    if (client->blk->writable) {
+        return features;
+    }
+    return features | 1ull << VIRTIO_BLK_F_RO;
+}
+
+/* Report the vhost-user protocol features: CONFIG and INFLIGHT_SHMFD. */
+static uint64_t
+vub_get_protocol_features(VuDev *dev)
+{
+    uint64_t protocol_features = 0;
+
+    protocol_features |= 1ull << VHOST_USER_PROTOCOL_F_CONFIG;
+    protocol_features |= 1ull << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD;
+
+    return protocol_features;
+}
+
+/*
+ * Copy the first @len bytes of the export's virtio-blk config space into
+ * @config.
+ *
+ * Returns 0 on success, or -1 when @len exceeds the size of
+ * struct virtio_blk_config.  Without that check an over-long request would
+ * copy unrelated memory following blkcfg out to the client.
+ */
+static int
+vub_get_config(VuDev *vu_dev, uint8_t *config, uint32_t len)
+{
+    VuClient *client = container_of(vu_dev, VuClient, parent);
+    VubDev *vdev_blk = client->blk;
+
+    if (len > sizeof(vdev_blk->blkcfg)) {
+        return -1;
+    }
+
+    memcpy(config, &vdev_blk->blkcfg, len);
+
+    return 0;
+}
+
+/*
+ * Handle a config-space write from the vhost-user master.
+ *
+ * Only a one-byte write to the "wce" (write cache enable) field with
+ * VHOST_SET_CONFIG_TYPE_MASTER is accepted; anything else returns -1.
+ * Returns 0 when the value was applied or was already in effect.
+ */
+static int
+vub_set_config(VuDev *vu_dev, const uint8_t *data,
+               uint32_t offset, uint32_t size, uint32_t flags)
+{
+    VubDev *vdev_blk;
+
+    VuClient *client = container_of(vu_dev, VuClient, parent);
+    vdev_blk = client->blk;
+    uint8_t wce;
+
+    /* don't support live migration */
+    if (flags != VHOST_SET_CONFIG_TYPE_MASTER) {
+        return -1;
+    }
+
+
+    if (offset != offsetof(struct virtio_blk_config, wce) ||
+        size != 1) {
+        return -1;
+    }
+
+    wce = *data;
+    if (wce == vdev_blk->blkcfg.wce) {
+        /* Do nothing as same with old configuration */
+        return 0;
+    }
+
+    vdev_blk->blkcfg.wce = wce;
+    /* Apply the requested value; it may be turning the cache off. */
+    blk_set_enable_write_cache(vdev_blk->backend, wce);
+    return 0;
+}
+
+
+/*
+ * When a client disconnects, a VHOST_USER_NONE request is seen.  Left to
+ * vu_process_message(), that request would simply call exit() and take the
+ * whole VM down abruptly.  To avoid this, VHOST_USER_NONE is intercepted
+ * here, ahead of vu_process_message(), and routed to the device's panic
+ * callback instead.
+ *
+ * Returns non-zero when the message has been consumed.
+ */
+static int vub_process_msg(VuDev *dev, VhostUserMsg *vmsg, int *do_reply)
+{
+    if (vmsg->request != VHOST_USER_NONE) {
+        return false;
+    }
+
+    dev->panic(dev, "disconnect");
+    return true;
+}
+
+/* Close every file descriptor that was received along with @vmsg. */
+static void
+vmsg_close_fds(VhostUserMsg *vmsg)
+{
+    int idx = 0;
+
+    while (idx < vmsg->fd_num) {
+        close(vmsg->fds[idx]);
+        idx++;
+    }
+}
+
+/*
+ * Read one vhost-user message (header, passed file descriptors and payload)
+ * from @conn_fd into @vmsg.
+ *
+ * The socket is non-blocking: on EAGAIN the function yields when running in
+ * a coroutine and otherwise waits on the client's channel.  Any failure is
+ * reported through vub_panic_cb().
+ *
+ * Returns true when a complete message was read, false otherwise.  File
+ * descriptors already received are closed on the failure paths that follow
+ * their extraction.
+ */
+static bool
+vu_message_read_co(VuDev *vu_dev, int conn_fd, VhostUserMsg *vmsg)
+{
+    char control[CMSG_SPACE(VHOST_MEMORY_MAX_NREGIONS * sizeof(int))] = { };
+    struct iovec iov = {
+        .iov_base = (char *)vmsg,
+        .iov_len = VHOST_USER_HDR_SIZE,
+    };
+    struct msghdr msg = {
+        .msg_iov = &iov,
+        .msg_iovlen = 1,
+        .msg_control = control,
+        .msg_controllen = sizeof(control),
+    };
+    size_t fd_size;
+    struct cmsghdr *cmsg;
+    int rc;
+    /*
+     * Big enough for the longest diagnostic below; every write is bounded
+     * with snprintf() so a long strerror() text cannot overflow it.
+     */
+    char buffer[256];
+    VuClient *client = container_of(vu_dev, VuClient, parent);
+    QIOChannel *ioc = client->ioc;
+    do {
+        rc = recvmsg(conn_fd, &msg, 0);
+        if (rc < 0) {
+            if (errno == EAGAIN) {
+                if (qemu_in_coroutine()) {
+                    qio_channel_yield(ioc, G_IO_IN);
+                } else {
+                    qio_channel_wait(ioc, G_IO_IN);
+                }
+                continue;
+            } else if (errno == EINTR) {
+                continue;
+            }
+        }
+        break;
+    } while (true);
+
+    if (rc < 0) {
+        snprintf(buffer, sizeof(buffer), "Error while recvmsg: %s",
+                 strerror(errno));
+        vub_panic_cb(vu_dev, buffer);
+        return false;
+    }
+
+    assert(rc == VHOST_USER_HDR_SIZE || rc == 0);
+
+    vmsg->fd_num = 0;
+    for (cmsg = CMSG_FIRSTHDR(&msg);
+         cmsg != NULL;
+         cmsg = CMSG_NXTHDR(&msg, cmsg))
+    {
+        if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS) {
+            fd_size = cmsg->cmsg_len - CMSG_LEN(0);
+            vmsg->fd_num = fd_size / sizeof(int);
+            memcpy(vmsg->fds, CMSG_DATA(cmsg), fd_size);
+            break;
+        }
+    }
+
+    if (vmsg->size > sizeof(vmsg->payload)) {
+        snprintf(buffer, sizeof(buffer),
+                 "Error: too big message request: %d, size: vmsg->size: %u, "
+                 "while sizeof(vmsg->payload) = %zu\n",
+                 vmsg->request, vmsg->size, sizeof(vmsg->payload));
+        vub_panic_cb(vu_dev, buffer);
+        goto fail;
+    }
+
+    if (vmsg->size) {
+        do {
+            rc = read(conn_fd, &vmsg->payload, vmsg->size);
+            if (rc < 0) {
+                if (errno == EAGAIN) {
+                    if (qemu_in_coroutine()) {
+                        qio_channel_yield(ioc, G_IO_IN);
+                    } else {
+                        qio_channel_wait(ioc, G_IO_IN);
+                    }
+                    continue;
+                } else if (errno == EINTR) {
+                    continue;
+                }
+            }
+            break;
+        } while (true);
+
+        if (rc <= 0) {
+            snprintf(buffer, sizeof(buffer), "Error while reading: %s",
+                     strerror(errno));
+            vub_panic_cb(vu_dev, buffer);
+            goto fail;
+        }
+
+        assert(rc == vmsg->size);
+    }
+
+    return true;
+
+fail:
+    vmsg_close_fds(vmsg);
+
+    return false;
+}
+
+/*
+ * fd handler for a virtqueue kick eventfd.  @opaque is the
+ * vu_watch_cb_data allocated by set_watch().
+ *
+ * On a successful read the queue's handler (if any) is run.  On failure the
+ * watch is removed, the callback data is freed and the panic callback is
+ * invoked.
+ */
+static void vub_kick_cb(void *opaque)
+{
+    vu_watch_cb_data *data = (vu_watch_cb_data *) opaque;
+    int index = data->index;
+    VuDev *dev = data->vu_dev;
+    VuVirtq *vq = &dev->vq[index];
+    int sock = vq->kick_fd;
+    eventfd_t kick_data;
+    ssize_t rc;
+
+    rc = eventfd_read(sock, &kick_data);
+    if (rc == -1) {
+        char buffer[100];
+
+        /* Format first: the calls below may clobber errno. */
+        snprintf(buffer, sizeof(buffer), "kick eventfd_read(): %s",
+                 strerror(errno));
+        /*
+         * Unregister using the fd saved above and release the callback data
+         * before panicking.  vub_panic_cb() can end in vu_deinit(), after
+         * which dev->vq and its kick_fd must not be read any more.
+         */
+        if (sock >= 0) {
+            dev->remove_watch(dev, sock);
+        }
+        g_free(data);
+        vub_panic_cb(dev, buffer);
+        return;
+    }
+
+    if (vq->handler) {
+        vq->handler(dev, index);
+    }
+}
+
+/*
+ * Callback table passed to vu_init_packed_data() for every client: feature
+ * negotiation, config space access, queue start/stop, plus the message
+ * reader and kick handler implemented in this file.
+ */
+static const VuDevIface vub_iface = {
+    .get_features = vub_get_features,
+    .queue_set_started = vub_queue_set_started,
+    .get_protocol_features = vub_get_protocol_features,
+    .get_config = vub_get_config,
+    .set_config = vub_set_config,
+    .process_msg = vub_process_msg,
+    .read_msg = vu_message_read_co,
+    .kick_callback = vub_kick_cb,
+};
+
+
+/*
+ * Release what a VubDev owns: the block backend reference, the name and
+ * socket path strings, and its entry in the global server list.
+ *
+ * @called_by_QOM: true when invoked from the object's finalizer, in which
+ * case the structure itself is not freed here because QOM does the
+ * clean-up work.
+ */
+void vub_free(VubDev *vub_dev, bool called_by_QOM)
+{
+    if (vub_dev == NULL) {
+        return;
+    }
+
+    blk_unref(vub_dev->backend);
+    g_free(vub_dev->name);
+    g_free(vub_dev->unix_socket);
+
+    /*
+     * A NULL tql_prev means the device was never inserted into vub_devs
+     * (vub_free is being called by obj->instance_finalize), so there is
+     * nothing to unlink.
+     */
+    bool is_listed = vub_dev->next.tqe_circ.tql_prev != NULL;
+    if (is_listed) {
+        QTAILQ_REMOVE(&vub_devs, vub_dev, next);
+    }
+
+    if (called_by_QOM) {
+        return;
+    }
+    g_free(vub_dev);
+}
+
+/*
+ * Per-client coroutine: dispatch vhost-user messages until the client is
+ * marked closed, then unlink it from its export's client list.
+ */
+static coroutine_fn void vu_client_trip(void *opaque)
+{
+    VuClient *client = opaque;
+
+    for (;;) {
+        if (client->closed) {
+            break;
+        }
+        vu_dispatch(&client->parent);
+    }
+
+    QTAILQ_REMOVE(&client->blk->clients, client, next);
+}
+
+/* Create the message-dispatch coroutine for @client and enter it. */
+static void vu_client_start(VuClient *client)
+{
+    qemu_coroutine_enter(qemu_coroutine_create(vu_client_trip, client));
+}
+
+
+G_STATIC_ASSERT((int)G_IO_IN == (int)VU_WATCH_IN);
+G_STATIC_ASSERT((int)G_IO_OUT == (int)VU_WATCH_OUT);
+G_STATIC_ASSERT((int)G_IO_PRI == (int)VU_WATCH_PRI);
+G_STATIC_ASSERT((int)G_IO_ERR == (int)VU_WATCH_ERR);
+G_STATIC_ASSERT((int)G_IO_HUP == (int)VU_WATCH_HUP);
+
+/*
+ * libvhost-user "set watch" callback: register @cb as the read handler for
+ * @fd in the export's AioContext.
+ *
+ * aio_dispatch can pass only one user data pointer to the callback, so the
+ * VuDev and @pvt (stored as a queue index) are packed into a freshly
+ * allocated vu_watch_cb_data.  @vu_evt is not consulted.
+ */
+static void
+set_watch(VuDev *vu_dev, int fd, int vu_evt,
+          vu_watch_cb_packed_data cb, void *pvt)
+{
+    g_assert(vu_dev);
+    g_assert(fd >= 0);
+    g_assert(cb);
+
+    VuClient *owner = container_of(vu_dev, VuClient, parent);
+    vu_watch_cb_data *packed = g_new0(vu_watch_cb_data, 1);
+
+    packed->vu_dev = vu_dev;
+    packed->index = (intptr_t) pvt;
+
+    aio_set_fd_handler(owner->blk->ctx, fd, false, (void *) cb,
+                       NULL, NULL, packed);
+}
+
+
+/*
+ * Listener callback for a new connection.
+ *
+ * @sioc is the accepted socket and @opaque the VubDev being served.  A
+ * VuClient is allocated, libvhost-user is initialised on the socket's fd,
+ * the socket is switched to non-blocking mode and the client's dispatch
+ * coroutine is started.  If libvhost-user initialisation fails the client
+ * is discarded and the function returns.
+ */
+void vub_accept(QIONetListener *listener, QIOChannelSocket *sioc,
+                gpointer opaque)
+{
+    VuClient *client;
+    VubDev *vub_device = opaque;
+    client = g_new0(VuClient, 1);
+
+    if (!vu_init_packed_data(&client->parent, VHOST_USER_BLK_MAX_QUEUES,
+                             sioc->fd, vub_panic_cb, set_watch,
+                             remove_watch, &vub_iface)) {
+        fprintf(stderr, "Failed to initialized libvhost-user\n");
+        g_free(client);
+        return;
+    }
+
+    client->blk = vub_device;
+    client->refcount = 1;
+    client->closed = false;
+
+    /*
+     * Take one reference for each pointer stored in the client, so that
+     * sioc is not freed by qio_net_listener_channel_func, which will call
+     * object_unref(OBJECT(sioc)).  A further reference beyond these two has
+     * no owner and would never be dropped.
+     */
+    client->sioc = sioc;
+    object_ref(OBJECT(client->sioc));
+    client->ioc = QIO_CHANNEL(sioc);
+    object_ref(OBJECT(client->ioc));
+
+    qio_channel_set_name(QIO_CHANNEL(sioc), "vhost-user client");
+    qio_channel_set_blocking(QIO_CHANNEL(client->sioc), false, NULL);
+
+    QTAILQ_INSERT_TAIL(&client->blk->clients, client, next);
+    vu_client_start(client);
+}
+
+
+/*
+ * Fill @config with the virtio-blk configuration advertised for @bs:
+ * capacity in sectors derived from the image length, BDRV_SECTOR_SIZE
+ * blocks, one queue, and fixed segment/size limits.  Discard and
+ * write-zeroes limits are set only when the build supports them.
+ *
+ * NOTE(review): the result of bdrv_getlength() is shifted without being
+ * checked for an error value — confirm callers guarantee a valid length.
+ */
+void
+vub_initialize_config(BlockDriverState *bs, struct virtio_blk_config *config)
+{
+    config->capacity = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
+    config->blk_size = BDRV_SECTOR_SIZE;
+    config->size_max = 65536;
+    config->seg_max = 128 - 2;
+    config->min_io_size = 1;
+    config->opt_io_size = 1;
+    config->num_queues = 1;
+    #if defined(__linux__) && defined(BLKDISCARD) && defined(BLKZEROOUT)
+    config->max_discard_sectors = 32768;
+    config->max_discard_seg = 1;
+    /* blk_size expressed in 512-byte sectors */
+    config->discard_sector_alignment = config->blk_size >> 9;
+    config->max_write_zeroes_sectors = 32768;
+    config->max_write_zeroes_seg = 1;
+    #endif
+}
+
+
+/*
+ * Attach @vub_device to the block node or drive called @name.
+ *
+ * A new BlockBackend is created on the node with read permission, plus
+ * write permission when @writable is set and the node is not read-only.
+ * The name, socket path, writable flag and virtio-blk config are stored in
+ * @vub_device.
+ *
+ * Returns @vub_device on success.  On failure returns NULL with @errp set
+ * and @vub_device unchanged.
+ */
+static VubDev *vub_new(VubDev *vub_device, const char *name,
+                       const char *unix_socket, bool writable, Error **errp)
+{
+
+    BlockBackend *blk;
+
+    /*
+     * Don't allow resize while the vhost user server is running,
+     * otherwise we don't care what happens with the node.
+     */
+    uint64_t perm = BLK_PERM_CONSISTENT_READ;
+    int ret;
+
+    AioContext *ctx;
+    BlockDriverState *bs;
+
+    if (!name) {
+        error_setg(errp, "No drive name given."
+                   " Please find the list of names with "
+                   "'info block'");
+        return NULL;
+    }
+
+    /*
+     * Pass NULL instead of errp: the lookup would otherwise set *errp on
+     * failure and the more helpful error_setg() below would then trip over
+     * an already-set error.
+     */
+    bs = bdrv_lookup_bs(name, name, NULL);
+
+    if (!bs) {
+        error_setg(errp,
+                   "No drive with name '%s'."
+                   " Please find the list of names with "
+                   "'info block'", name);
+        return NULL;
+    }
+
+    if (bdrv_is_read_only(bs)) {
+        writable = false;
+    }
+
+    if (writable) {
+        perm |= BLK_PERM_WRITE;
+    }
+
+    ctx = bdrv_get_aio_context(bs);
+    aio_context_acquire(ctx);
+    bdrv_invalidate_cache(bs, NULL);
+    aio_context_release(ctx);
+
+    blk = blk_new(bdrv_get_aio_context(bs), perm,
+                  BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE_UNCHANGED |
+                  BLK_PERM_WRITE | BLK_PERM_GRAPH_MOD);
+    ret = blk_insert_bs(blk, bs, errp);
+
+    if (ret < 0) {
+        goto fail;
+    }
+
+
+    blk_set_enable_write_cache(blk, false);
+
+    blk_set_allow_aio_context_change(blk, true);
+
+
+    /*
+     * The property setters may already have stored these strings, and
+     * @name may even be vub_device->name itself.  Replace a field only when
+     * the caller passed a different pointer, releasing the old value, so
+     * nothing is leaked and the caller's pointer stays valid.
+     */
+    if (vub_device->name != name) {
+        g_free(vub_device->name);
+        vub_device->name = g_strdup(name);
+    }
+    if (vub_device->unix_socket != unix_socket) {
+        g_free(vub_device->unix_socket);
+        vub_device->unix_socket = g_strdup(unix_socket);
+    }
+    vub_device->writable = writable;
+    vub_device->blkcfg.wce = 0;
+    vub_device->backend = blk;
+    vub_device->ctx = ctx;
+    vub_initialize_config(bs, &vub_device->blkcfg);
+    return vub_device;
+
+fail:
+    blk_unref(blk);
+    return NULL;
+}
+
+/*
+ * Stop a vhost-user server: close every client that is still open,
+ * disconnect and release the listener if there is one, then free the
+ * device's resources through vub_free().
+ *
+ * @called_by_QOM is forwarded to vub_free().
+ */
+void vhost_user_server_free(VubDev *vub_device, bool called_by_QOM)
+{
+    VuClient *cur, *following;
+
+    if (vub_device == NULL) {
+        return;
+    }
+
+    QTAILQ_FOREACH_SAFE(cur, &vub_device->clients, next, following) {
+        if (cur->closed) {
+            continue;
+        }
+        close_client(cur);
+    }
+
+    if (vub_device->listener != NULL) {
+        qio_net_listener_disconnect(vub_device->listener);
+        object_unref(OBJECT(vub_device->listener));
+    }
+
+    vub_free(vub_device, called_by_QOM);
+}
+
+
+/*
+ * Look up a started server by drive name.
+ *
+ * Returns the matching VubDev, or NULL when none matches or @name is NULL
+ * (which happens when the "name" property was never set).
+ */
+VubDev *vub_dev_find(const char *name)
+{
+    VubDev *vub_device;
+
+    if (!name) {
+        return NULL;
+    }
+
+    QTAILQ_FOREACH(vub_device, &vub_devs, next) {
+        /* g_strcmp0() tolerates a NULL stored name, unlike strcmp(). */
+        if (g_strcmp0(name, vub_device->name) == 0) {
+            return vub_device;
+        }
+    }
+
+    return NULL;
+}
+
+
+/*
+ * Look up a started server by the path of its listening socket.
+ *
+ * Returns the matching VubDev, or NULL when none matches or @unix_socket is
+ * NULL.
+ */
+static VubDev *vub_dev_find_by_unix_socket(const char *unix_socket)
+{
+    VubDev *vub_device;
+
+    if (!unix_socket) {
+        return NULL;
+    }
+
+    QTAILQ_FOREACH(vub_device, &vub_devs, next) {
+        /* g_strcmp0() tolerates a NULL stored path, unlike strcmp(). */
+        if (g_strcmp0(unix_socket, vub_device->unix_socket) == 0) {
+            return vub_device;
+        }
+    }
+
+    return NULL;
+}
+
+/*
+ * Start serving drive @name on the UNIX socket @unix_socket.
+ *
+ * Fails with @errp set when either argument is missing, when a server with
+ * the same name or socket path is already running, when the drive cannot be
+ * attached, or when the socket cannot be opened.
+ *
+ * On failure @vub_device itself is left alone: it is a QOM object whose
+ * finalizer calls vhost_user_server_free(), so freeing it here would be
+ * followed by a second clean-up of already released memory.
+ */
+static void vhost_user_server_start(VubDev *vub_device, const char *unix_socket,
+                                    const char *name, bool writable,
+                                    Error **errp)
+{
+    if (!name || !unix_socket) {
+        error_setg(errp, "Vhost user server needs both a drive name and "
+                   "a socket path");
+        return;
+    }
+
+    if (vub_dev_find(name) || vub_dev_find_by_unix_socket(unix_socket)) {
+        error_setg(errp, "Vhost user server with name '%s' or "
+                "with socket_path '%s' has already been started",
+                name, unix_socket);
+        return;
+    }
+
+
+    if (!vub_new(vub_device, name, unix_socket, writable, errp)) {
+        return;
+    }
+
+
+    vub_device->listener = qio_net_listener_new();
+
+    qio_net_listener_set_name(vub_device->listener,
+                              "vhost-user-backend-listener");
+
+    /*
+     * The address is needed only for the duration of the open call and
+     * borrows the caller's path string, so keep it on the stack rather than
+     * heap-allocating something that is never freed.
+     */
+    SocketAddress addr = {
+        .type = SOCKET_ADDRESS_TYPE_UNIX,
+        .u.q_unix.path = (char *) unix_socket,
+    };
+    if (qio_net_listener_open_sync(vub_device->listener, &addr, 1, errp) < 0) {
+        goto error;
+    }
+
+
+    QTAILQ_INIT(&vub_device->clients);
+    QTAILQ_INSERT_TAIL(&vub_devs, vub_device, next);
+
+    qio_net_listener_set_client_func(vub_device->listener,
+                                     vub_accept,
+                                     vub_device,
+                                     NULL);
+
+    return;
+
+ error:
+    /* Drop the unused listener; the rest is released by the finalizer. */
+    object_unref(OBJECT(vub_device->listener));
+    vub_device->listener = NULL;
+}
+
+/*
+ * Setter for the "name" property: remember which drive to export.  The
+ * property can be set only once; a second attempt fails with @errp set.
+ */
+static void vu_set_block_name(Object *obj, const char *value,
+                                           Error **errp)
+{
+    VubDev *vus = VHOST_USER_SERVER(obj);
+
+    if (vus->name) {
+        /* The message used to say "evdev", copied from another device. */
+        error_setg(errp, "name property already set");
+        return;
+    }
+
+    vus->name = g_strdup(value);
+}
+
+/* Getter for the "name" property; returns a g_strdup() copy of the name. */
+static char *vu_get_block_name(Object *obj, Error **errp)
+{
+    VubDev *server = VHOST_USER_SERVER(obj);
+    char *copy = g_strdup(server->name);
+
+    return copy;
+}
+
+
+/*
+ * Setter for the "unix_socket" property.  Setting it starts the server, so
+ * the "name" (and, if wanted, "writable") property must have been set
+ * first.  The property can be set only once.
+ *
+ * The path is not stored here: vhost_user_server_start() records its own
+ * copy through vub_new() once the drive has been attached, and storing a
+ * second copy beforehand would leak it.
+ */
+static void vu_set_unix_socket(Object *obj, const char *value,
+                                            Error **errp)
+{
+    VubDev *vus = VHOST_USER_SERVER(obj);
+
+    if (vus->unix_socket) {
+        error_setg(errp, "unix_socket property already set");
+        return;
+    }
+
+    if (!vus->name) {
+        error_setg(errp, "name property must be set before unix_socket");
+        return;
+    }
+
+    vhost_user_server_start(vus, value, vus->name,
+                            vus->writable, errp);
+}
+
+/* Getter for "unix_socket"; returns a g_strdup() copy of the socket path. */
+static char *vu_get_unix_socket(Object *obj, Error **errp)
+{
+    VubDev *server = VHOST_USER_SERVER(obj);
+    char *copy = g_strdup(server->unix_socket);
+
+    return copy;
+}
+
+/* Getter for the "writable" property. */
+static bool vu_get_block_writable(Object *obj, Error **errp)
+{
+    VubDev *server = VHOST_USER_SERVER(obj);
+    bool is_writable = server->writable;
+
+    return is_writable;
+}
+
+/* Setter for the "writable" property; only records the requested value. */
+static void vu_set_block_writable(Object *obj, bool value, Error **errp)
+{
+    VubDev *server = VHOST_USER_SERVER(obj);
+    server->writable = value;
+}
+
+/*
+ * Register the object's properties.
+ *
+ * Setting "unix_socket" starts the server using the current values of
+ * "name" and "writable", so those two have to be set before it.
+ */
+static void vhost_user_server_instance_init(Object *obj)
+{
+
+    object_property_add_bool(obj, "writable",
+                            vu_get_block_writable,
+                            vu_set_block_writable, NULL);
+
+    object_property_add_str(obj, "name",
+                            vu_get_block_name,
+                            vu_set_block_name, NULL);
+
+    object_property_add_str(obj, "unix_socket",
+                            vu_get_unix_socket,
+                            vu_set_unix_socket, NULL);
+
+}
+
+/*
+ * Finalizer: stop the server and release its resources, telling
+ * vhost_user_server_free() that the call comes from QOM.
+ *
+ * NOTE(review): clearing obj->free afterwards means the object's memory is
+ * not released by QOM either — confirm whether this is deliberate (for
+ * example to keep the structure alive for client coroutines that still
+ * point at it) or a leak.
+ */
+static void vhost_user_server_instance_finalize(Object *obj)
+{
+    VubDev *vus = VHOST_USER_SERVER(obj);
+    vhost_user_server_free(vus, true);
+    /* object_del shouldn't free this object struct */
+    obj->free = NULL;
+}
+
+/*
+ * QOM type description for the vhost-user server object.
+ *
+ * instance_size must be the size of the instance structure, VubDev (the
+ * type that VHOST_USER_SERVER() casts to).  It used to be sizeof(VuDev),
+ * the unrelated libvhost-user device structure, so the allocation did not
+ * match the fields the code accesses.
+ */
+static const TypeInfo vhost_user_server_info = {
+    .name = TYPE_VHOST_USER_SERVER,
+    .parent = TYPE_OBJECT,
+    .instance_size = sizeof(VubDev),
+    .instance_init = vhost_user_server_instance_init,
+    .instance_finalize = vhost_user_server_instance_finalize,
+    .interfaces = (InterfaceInfo[]) {
+        {TYPE_USER_CREATABLE},
+        {}
+    },
+};
+
+/* Register the vhost-user server type with QOM; hooked up via type_init(). */
+static void vhost_user_server_register_types(void)
+{
+    type_register_static(&vhost_user_server_info);
+}
+
+type_init(vhost_user_server_register_types)
+
diff --git a/include/block/vhost-user.h b/include/block/vhost-user.h
new file mode 100644
index 0000000000..ef6d695244
--- /dev/null
+++ b/include/block/vhost-user.h
@@ -0,0 +1,46 @@
+#include "io/channel-socket.h"
+#include "io/net-listener.h"
+#include "contrib/libvhost-user/libvhost-user.h"
+#include "standard-headers/linux/virtio_blk.h"
+typedef struct VubDev VubDev;
+typedef struct VuClient VuClient;
+#define TYPE_VHOST_USER_SERVER "vhost-user-server"
+
+#define VHOST_USER_SERVER(obj) \
+   OBJECT_CHECK(VubDev, obj, TYPE_VHOST_USER_SERVER)
+/* vhost user block device */
+struct VubDev {
+    Object parent_obj;
+    char *name;
+    char *unix_socket;
+    bool exit_panic;
+    bool close;
+    BlockBackend *backend;
+    AioContext *ctx;
+    QIONetListener *listener;
+    QIOChannelSocket *sioc;
+    QTAILQ_HEAD(, VuClient) clients;
+    QTAILQ_ENTRY(VubDev) next;
+    struct virtio_blk_config blkcfg;
+    bool writable;
+};
+
+struct VuClient {
+    VuDev parent;
+    int refcount;
+    VubDev *blk;
+    QIOChannelSocket *sioc; /* The underlying data channel */
+    QIOChannel *ioc; /* The current I/O channel */
+    QTAILQ_ENTRY(VuClient) next;
+    bool closed;
+};
+VubDev *vub_dev_find(const char *name);
+
+void vhost_user_server_free(VubDev *vub_device, bool called_by_QOM);
+void vub_accept(QIONetListener *listener, QIOChannelSocket *sioc,
+                gpointer opaque);
+
+void vub_free(VubDev *vub_dev, bool called_by_QOM);
+
+void vub_initialize_config(BlockDriverState *bs,
+                           struct virtio_blk_config *config);
diff --git a/vl.c b/vl.c
index 86474a55c9..72ac506342 100644
--- a/vl.c
+++ b/vl.c
@@ -2553,6 +2553,10 @@ static bool object_create_initial(const char *type, QemuOpts *opts)
     }
 #endif

+    /* Reason: vhost-user-server property "name" */
+    if (g_str_equal(type, "vhost-user-server")) {
+        return false;
+    }
     /*
      * Reason: filter-* property "netdev" etc.
      */
--
2.24.1



^ permalink raw reply related	[flat|nested] 19+ messages in thread

* [PATCH v2 2/5] extend libvhost to support IOThread
  2020-01-14 14:06 [PATCH v2 0/5] vhost-user block device backend implementation Coiby Xu
  2020-01-14 14:06 ` [PATCH v2 1/5] vhost-user block device backend Coiby Xu
@ 2020-01-14 14:06 ` Coiby Xu
  2020-01-14 14:06 ` [PATCH v2 3/5] a standone-alone tool to directly share disk image file via vhost-user protocol Coiby Xu
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 19+ messages in thread
From: Coiby Xu @ 2020-01-14 14:06 UTC (permalink / raw)
  To: qemu-devel; +Cc: kwolf, bharatlkmlkvm, Coiby Xu, stefanha

Previously libvhost-user dispatched events in its own GMainContext. Now a vhost-user client's kick events can be dispatched in the block device drive's AioContext, so IOThread is supported.

Signed-off-by: Coiby Xu <coiby.xu@gmail.com>
---
 contrib/libvhost-user/libvhost-user.c | 64 ++++++++++++++++++++++-----
 contrib/libvhost-user/libvhost-user.h | 36 ++++++++++++++-
 2 files changed, 87 insertions(+), 13 deletions(-)

diff --git a/contrib/libvhost-user/libvhost-user.c b/contrib/libvhost-user/libvhost-user.c
index ec27b78ff1..cd328c1509 100644
--- a/contrib/libvhost-user/libvhost-user.c
+++ b/contrib/libvhost-user/libvhost-user.c
@@ -67,7 +67,6 @@
 /* The version of inflight buffer */
 #define INFLIGHT_VERSION 1

-#define VHOST_USER_HDR_SIZE offsetof(VhostUserMsg, payload.u64)

 /* The version of the protocol we support */
 #define VHOST_USER_VERSION 1
@@ -260,7 +259,7 @@ have_userfault(void)
 }

 static bool
-vu_message_read(VuDev *dev, int conn_fd, VhostUserMsg *vmsg)
+vu_message_read_(VuDev *dev, int conn_fd, VhostUserMsg *vmsg)
 {
     char control[CMSG_SPACE(VHOST_MEMORY_MAX_NREGIONS * sizeof(int))] = { };
     struct iovec iov = {
@@ -286,6 +285,8 @@ vu_message_read(VuDev *dev, int conn_fd, VhostUserMsg *vmsg)
         return false;
     }

+    assert(rc == VHOST_USER_HDR_SIZE || rc == 0);
+
     vmsg->fd_num = 0;
     for (cmsg = CMSG_FIRSTHDR(&msg);
          cmsg != NULL;
@@ -328,6 +329,17 @@ fail:
     return false;
 }

+static bool vu_message_read(VuDev *dev, int conn_fd, VhostUserMsg *vmsg)
+{
+    vu_read_msg_cb read_msg;
+    if (dev->iface->read_msg) {
+        read_msg = dev->iface->read_msg;
+    } else {
+        read_msg = vu_message_read_;
+    }
+    return read_msg(dev, conn_fd, vmsg);
+}
+
 static bool
 vu_message_write(VuDev *dev, int conn_fd, VhostUserMsg *vmsg)
 {
@@ -400,7 +412,6 @@ vu_process_message_reply(VuDev *dev, const VhostUserMsg *vmsg)
     if ((vmsg->flags & VHOST_USER_NEED_REPLY_MASK) == 0) {
         return true;
     }
-
     if (!vu_message_read(dev, dev->slave_fd, &msg_reply)) {
         return false;
     }
@@ -644,7 +655,8 @@ vu_set_mem_table_exec_postcopy(VuDev *dev, VhostUserMsg *vmsg)
                     "%s: Failed to madvise(DONTNEED) region %d: %s\n",
                     __func__, i, strerror(errno));
         }
-        /* Turn off transparent hugepages so we dont get lose wakeups
+        /*
+         * Turn off transparent hugepages so we don't get lose wakeups
          * in neighbouring pages.
          * TODO: Turn this backon later.
          */
@@ -1047,9 +1059,13 @@ vu_set_vring_kick_exec(VuDev *dev, VhostUserMsg *vmsg)
     }

     if (dev->vq[index].kick_fd != -1 && dev->vq[index].handler) {
-        dev->set_watch(dev, dev->vq[index].kick_fd, VU_WATCH_IN,
-                       vu_kick_cb, (void *)(long)index);
-
+        if (dev->set_watch_packed_data) {
+            dev->set_watch_packed_data(dev, dev->vq[index].kick_fd, VU_WATCH_IN,
+                           dev->iface->kick_callback, (void *)(long)index);
+        } else {
+            dev->set_watch(dev, dev->vq[index].kick_fd, VU_WATCH_IN,
+                           vu_kick_cb, (void *)(long)index);
+        }
         DPRINT("Waiting for kicks on fd: %d for vq: %d\n",
                dev->vq[index].kick_fd, index);
     }
@@ -1069,8 +1085,13 @@ void vu_set_queue_handler(VuDev *dev, VuVirtq *vq,
     vq->handler = handler;
     if (vq->kick_fd >= 0) {
         if (handler) {
-            dev->set_watch(dev, vq->kick_fd, VU_WATCH_IN,
-                           vu_kick_cb, (void *)(long)qidx);
+            if (dev->set_watch_packed_data) {
+                dev->set_watch_packed_data(dev, vq->kick_fd, VU_WATCH_IN,
+                        dev->iface->kick_callback, (void *)(long)qidx);
+            } else {
+                dev->set_watch(dev, vq->kick_fd, VU_WATCH_IN,
+                        vu_kick_cb, (void *)(long)qidx);
+            }
         } else {
             dev->remove_watch(dev, vq->kick_fd);
         }
@@ -1596,6 +1617,12 @@ vu_deinit(VuDev *dev)
         }

         if (vq->kick_fd != -1) {
+            /* Remove the watch for kick_fd.
+             * When the client process is running in gdb and the quit command
+             * is run in gdb, QEMU will still dispatch the event, which will
+             * cause a segmentation fault in the callback function.
+             */
+            dev->remove_watch(dev, vq->kick_fd);
             close(vq->kick_fd);
             vq->kick_fd = -1;
         }
@@ -1647,10 +1674,9 @@ vu_init(VuDev *dev,
         const VuDevIface *iface)
 {
     uint16_t i;
-
     assert(max_queues > 0);
     assert(socket >= 0);
-    assert(set_watch);
+    /* assert(set_watch); */
     assert(remove_watch);
     assert(iface);
     assert(panic);
@@ -1682,6 +1708,22 @@ vu_init(VuDev *dev,
     return true;
 }

+bool
+vu_init_packed_data(VuDev *dev,
+        uint16_t max_queues,
+        int socket,
+        vu_panic_cb panic,
+        vu_set_watch_cb_packed_data set_watch_packed_data,
+        vu_remove_watch_cb remove_watch,
+        const VuDevIface *iface)
+{
+    if (vu_init(dev, max_queues, socket, panic, NULL, remove_watch, iface)) {
+        dev->set_watch_packed_data = set_watch_packed_data;
+        return true;
+    }
+    return false;
+}
+
 VuVirtq *
 vu_get_queue(VuDev *dev, int qidx)
 {
diff --git a/contrib/libvhost-user/libvhost-user.h b/contrib/libvhost-user/libvhost-user.h
index 46b600799b..5230d55092 100644
--- a/contrib/libvhost-user/libvhost-user.h
+++ b/contrib/libvhost-user/libvhost-user.h
@@ -34,6 +34,9 @@ typedef enum VhostSetConfigType {
     VHOST_SET_CONFIG_TYPE_MIGRATION = 1,
 } VhostSetConfigType;

+
+#define VHOST_USER_HDR_SIZE offsetof(VhostUserMsg, payload.u64)
+
 /*
  * Maximum size of virtio device config space
  */
@@ -200,6 +203,7 @@ typedef uint64_t (*vu_get_features_cb) (VuDev *dev);
 typedef void (*vu_set_features_cb) (VuDev *dev, uint64_t features);
 typedef int (*vu_process_msg_cb) (VuDev *dev, VhostUserMsg *vmsg,
                                   int *do_reply);
+typedef bool (*vu_read_msg_cb) (VuDev *dev, int sock, VhostUserMsg *vmsg);
 typedef void (*vu_queue_set_started_cb) (VuDev *dev, int qidx, bool started);
 typedef bool (*vu_queue_is_processed_in_order_cb) (VuDev *dev, int qidx);
 typedef int (*vu_get_config_cb) (VuDev *dev, uint8_t *config, uint32_t len);
@@ -207,6 +211,15 @@ typedef int (*vu_set_config_cb) (VuDev *dev, const uint8_t *data,
                                  uint32_t offset, uint32_t size,
                                  uint32_t flags);

+typedef struct vu_watch_cb_data {
+   long index;
+   VuDev *vu_dev;
+} vu_watch_cb_data;
+typedef void (*vu_watch_cb_packed_data) (void *packed_data);
+
+typedef void (*vu_set_watch_cb_packed_data) (VuDev *dev, int fd, int condition,
+                                 vu_watch_cb_packed_data cb, void *data);
+
 typedef struct VuDevIface {
     /* called by VHOST_USER_GET_FEATURES to get the features bitmask */
     vu_get_features_cb get_features;
@@ -220,8 +233,11 @@ typedef struct VuDevIface {
     /* process_msg is called for each vhost-user message received */
     /* skip libvhost-user processing if return value != 0 */
     vu_process_msg_cb process_msg;
+    vu_read_msg_cb read_msg;
+    vu_watch_cb_packed_data kick_callback;
     /* tells when queues can be processed */
     vu_queue_set_started_cb queue_set_started;
+
     /*
      * If the queue is processed in order, in which case it will be
      * resumed to vring.used->idx. This can help to support resuming
@@ -366,7 +382,8 @@ struct VuDev {
     /* @set_watch: add or update the given fd to the watch set,
      * call cb when condition is met */
     vu_set_watch_cb set_watch;
-
+    /* AIO dispatch will only pass one data pointer to the callback function */
+    vu_set_watch_cb_packed_data set_watch_packed_data;
     /* @remove_watch: remove the given fd from the watch set */
     vu_remove_watch_cb remove_watch;

@@ -398,7 +415,7 @@ typedef struct VuVirtqElement {
  * @remove_watch: a remove_watch callback
  * @iface: a VuDevIface structure with vhost-user device callbacks
  *
- * Intializes a VuDev vhost-user context.
+ * Initializes a VuDev vhost-user context.
  *
  * Returns: true on success, false on failure.
  **/
@@ -411,6 +428,21 @@ bool vu_init(VuDev *dev,
              const VuDevIface *iface);


+/**
+ * vu_init_packed_data:
+ * Same as vu_init except for set_watch_packed_data which will pack
+ * two parameters into a struct thus QEMU aio_dispatch can pass the
+ * required data to callback function.
+ *
+ * Returns: true on success, false on failure.
+ **/
+bool vu_init_packed_data(VuDev *dev,
+             uint16_t max_queues,
+             int socket,
+             vu_panic_cb panic,
+             vu_set_watch_cb_packed_data set_watch_packed_data,
+             vu_remove_watch_cb remove_watch,
+             const VuDevIface *iface);
 /**
  * vu_deinit:
  * @dev: a VuDev context
--
2.24.1



^ permalink raw reply related	[flat|nested] 19+ messages in thread

* [PATCH v2 3/5] a standone-alone tool to directly share disk image file via vhost-user protocol
  2020-01-14 14:06 [PATCH v2 0/5] vhost-user block device backend implementation Coiby Xu
  2020-01-14 14:06 ` [PATCH v2 1/5] vhost-user block device backend Coiby Xu
  2020-01-14 14:06 ` [PATCH v2 2/5] extend libvhost to support IOThread Coiby Xu
@ 2020-01-14 14:06 ` Coiby Xu
  2020-01-16 14:04   ` Stefan Hajnoczi
  2020-01-14 14:06 ` [PATCH v2 4/5] new qTest case for the vhost-user-blk device backend Coiby Xu
  2020-01-14 14:06 ` [PATCH v2 5/5] building configuration files changes Coiby Xu
  4 siblings, 1 reply; 19+ messages in thread
From: Coiby Xu @ 2020-01-14 14:06 UTC (permalink / raw)
  To: qemu-devel; +Cc: kwolf, bharatlkmlkvm, Coiby Xu, stefanha

vhost-user-blk could already act as a vhost-user backend, but it only supports raw files and doesn't support the VIRTIO_BLK_T_DISCARD and VIRTIO_BLK_T_WRITE_ZEROES operations on raw files (ioctl(fd, BLKDISCARD) is only valid for real block devices).

Signed-off-by: Coiby Xu <coiby.xu@gmail.com>
---
 qemu-vu.c | 264 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 264 insertions(+)
 create mode 100644 qemu-vu.c

diff --git a/qemu-vu.c b/qemu-vu.c
new file mode 100644
index 0000000000..25c32c2c6d
--- /dev/null
+++ b/qemu-vu.c
@@ -0,0 +1,264 @@
+/*
+ *  Copyright (C) 2020  Coiby Xu <coiby.xu@gmail.com>
+ *
+ *  Vhost-user-blk device backend
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; under version 2 of the License.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include <getopt.h>
+#include <libgen.h>
+#include "block/vhost-user.h"
+#include "qemu-common.h"
+#include "qapi/error.h"
+#include "qemu/cutils.h"
+#include "sysemu/block-backend.h"
+#include "block/block_int.h"
+#include "qemu/main-loop.h"
+#include "qemu/module.h"
+#include "qemu/option.h"
+#include "qemu/error-report.h"
+#include "qapi/qmp/qdict.h"
+#include "qapi/qmp/qstring.h"
+#include "qom/object_interfaces.h"
+#include "io/net-listener.h"
+#include "qemu-version.h"
+
+#define QEMU_VU_OPT_CACHE         256
+
+#define QEMU_VU_OPT_AIO           257
+
+static char *srcpath;
+
+static void usage(const char *name)
+{
+    (printf) (
+"Usage: %s [OPTIONS] FILE\n"
+"\n"
+"QEMU Vhost-user Server Utility\n"
+"\n"
+"  -h, --help                display this help and exit\n"
+"  -V, --version             output version information and exit\n"
+"\n"
+"Connection properties:\n"
+"  -k, --socket=PATH         path to the unix socket\n"
+"\n"
+"General purpose options:\n"
+"  -e, --exit-panic          When the panic callback is called, the program\n"
+"                            will exit. Useful for make check-qtest.\n"
+"\n"
+"Block device options:\n"
+"  -f, --format=FORMAT       set image format (raw, qcow2, ...)\n"
+"  -r, --read-only           export read-only\n"
+"  -n, --nocache             disable host cache\n"
+"      --cache=MODE          set cache mode (none, writeback, ...)\n"
+"      --aio=MODE            set AIO mode (native or threads)\n"
+"\n"
+QEMU_HELP_BOTTOM "\n"
+    , name);
+}
+
+static void version(const char *name)
+{
+    printf(
+"%s " QEMU_FULL_VERSION "\n"
+"Written by Coiby Xu, based on qemu-nbd by Anthony Liguori\n"
+"\n"
+QEMU_COPYRIGHT "\n"
+"This is free software; see the source for copying conditions.  There is NO\n"
+"warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n"
+    , name);
+}
+
+static VubDev *vub_device;
+
+static void vus_shutdown(void)
+{
+    job_cancel_sync_all();
+    bdrv_close_all();
+    vub_free(vub_device, false);
+}
+
+int main(int argc, char **argv)
+{
+    BlockBackend *blk;
+    BlockDriverState *bs;
+    bool readonly = false;
+    char *sockpath = NULL;
+    int64_t fd_size;
+    const char *sopt = "hVrnvek:f:";
+    struct option lopt[] = {
+        { "help", no_argument, NULL, 'h' },
+        { "version", no_argument, NULL, 'V' },
+        { "exit-panic", no_argument, NULL, 'e' },
+        { "socket", required_argument, NULL, 'k' },
+        { "read-only", no_argument, NULL, 'r' },
+        { "nocache", no_argument, NULL, 'n' },
+        { "cache", required_argument, NULL, QEMU_VU_OPT_CACHE },
+        { "aio", required_argument, NULL, QEMU_VU_OPT_AIO },
+        { "format", required_argument, NULL, 'f' },
+        { NULL, 0, NULL, 0 }
+    };
+    int ch;
+    int opt_ind = 0;
+    int flags = BDRV_O_RDWR;
+    bool seen_cache = false;
+    bool seen_aio = false;
+    const char *fmt = NULL;
+    Error *local_err = NULL;
+    QDict *options = NULL;
+    bool writethrough = true;
+    bool exit_panic = false;
+
+    error_init(argv[0]);
+
+    module_call_init(MODULE_INIT_QOM);
+    qemu_init_exec_dir(argv[0]);
+
+    while ((ch = getopt_long(argc, argv, sopt, lopt, &opt_ind)) != -1) {
+        switch (ch) {
+        case 'e':
+            exit_panic = true;
+            break;
+        case 'n':
+            optarg = (char *) "none";
+            /* fallthrough */
+        case QEMU_VU_OPT_CACHE:
+            if (seen_cache) {
+                error_report("-n and --cache can only be specified once");
+                exit(EXIT_FAILURE);
+            }
+            seen_cache = true;
+            if (bdrv_parse_cache_mode(optarg, &flags, &writethrough) == -1) {
+                error_report("Invalid cache mode `%s'", optarg);
+                exit(EXIT_FAILURE);
+            }
+            break;
+        case QEMU_VU_OPT_AIO:
+            if (seen_aio) {
+                error_report("--aio can only be specified once");
+                exit(EXIT_FAILURE);
+            }
+            seen_aio = true;
+            if (!strcmp(optarg, "native")) {
+                flags |= BDRV_O_NATIVE_AIO;
+            } else if (!strcmp(optarg, "threads")) {
+                /* this is the default */
+            } else {
+               error_report("invalid aio mode `%s'", optarg);
+               exit(EXIT_FAILURE);
+            }
+            break;
+        case 'r':
+            readonly = true;
+            flags &= ~BDRV_O_RDWR;
+            break;
+        case 'k':
+            sockpath = optarg;
+            if (sockpath[0] != '/') {
+                error_report("socket path must be absolute");
+                exit(EXIT_FAILURE);
+            }
+            break;
+        case 'f':
+            fmt = optarg;
+            break;
+        case 'V':
+            version(argv[0]);
+            exit(0);
+            break;
+        case 'h':
+            usage(argv[0]);
+            exit(0);
+            break;
+        case '?':
+            error_report("Try `%s --help' for more information.", argv[0]);
+            exit(EXIT_FAILURE);
+        }
+    }
+
+    if ((argc - optind) != 1) {
+        error_report("Invalid number of arguments");
+        error_printf("Try `%s --help' for more information.\n", argv[0]);
+        exit(EXIT_FAILURE);
+    }
+
+    if (qemu_init_main_loop(&local_err)) {
+        error_report_err(local_err);
+        exit(EXIT_FAILURE);
+    }
+    bdrv_init();
+
+    srcpath = argv[optind];
+    if (fmt) {
+        options = qdict_new();
+        qdict_put_str(options, "driver", fmt);
+    }
+    blk = blk_new_open(srcpath, NULL, options, flags, &local_err);
+
+    if (!blk) {
+        error_reportf_err(local_err, "Failed to blk_new_open '%s': ",
+                          argv[optind]);
+        exit(EXIT_FAILURE);
+    }
+    bs = blk_bs(blk);
+
+    blk_set_enable_write_cache(blk, !writethrough);
+
+    fd_size = blk_getlength(blk);
+    if (fd_size < 0) {
+        error_report("Failed to determine the image length: %s",
+                     strerror(-fd_size));
+        exit(EXIT_FAILURE);
+    }
+
+    AioContext *ctx = bdrv_get_aio_context(bs);
+    bdrv_invalidate_cache(bs, NULL);
+
+    vub_device = g_new0(VubDev, 1);
+    vub_device->unix_socket = g_strdup(sockpath);
+    vub_device->writable = !readonly;
+    vub_device->blkcfg.wce = !writethrough;
+    vub_device->backend = blk;
+    vub_device->ctx = ctx;
+    vub_initialize_config(bs, &vub_device->blkcfg);
+    vub_device->listener = qio_net_listener_new();
+    vub_device->exit_panic = exit_panic;
+
+    qio_net_listener_set_name(vub_device->listener,
+                              "vhost-user-backend-listener");
+    SocketAddress *addr = g_new0(SocketAddress, 1);
+    addr->u.q_unix.path = (char *) sockpath;
+    addr->type = SOCKET_ADDRESS_TYPE_UNIX;
+    /* Report why listening failed and exit non-zero instead of silently */
+    if (qio_net_listener_open_sync(vub_device->listener, addr, 1,
+                                   &local_err) < 0) {
+        error_report_err(local_err);
+        vus_shutdown();
+        exit(EXIT_FAILURE);
+    }
+
+    qio_net_listener_set_client_func(vub_device->listener,
+                                     vub_accept,
+                                     vub_device,
+                                     NULL);
+    QTAILQ_INIT(&vub_device->clients);
+
+    do {
+        main_loop_wait(false);
+    } while (!vub_device->exit_panic || !vub_device->close);
+
+    vus_shutdown();
+    exit(EXIT_SUCCESS);
+}
--
2.24.1



^ permalink raw reply related	[flat|nested] 19+ messages in thread

* [PATCH v2 4/5] new qTest case for the vhost-user-blk device backend
  2020-01-14 14:06 [PATCH v2 0/5] vhost-user block device backend implementation Coiby Xu
                   ` (2 preceding siblings ...)
  2020-01-14 14:06 ` [PATCH v2 3/5] a standone-alone tool to directly share disk image file via vhost-user protocol Coiby Xu
@ 2020-01-14 14:06 ` Coiby Xu
  2020-01-14 14:06 ` [PATCH v2 5/5] building configuration files changes Coiby Xu
  4 siblings, 0 replies; 19+ messages in thread
From: Coiby Xu @ 2020-01-14 14:06 UTC (permalink / raw)
  To: qemu-devel; +Cc: kwolf, bharatlkmlkvm, Coiby Xu, stefanha

This test case has the same tests as tests/virtio-blk-test.c, except for
the tests that use block_resize.

Signed-off-by: Coiby Xu <coiby.xu@gmail.com>
---
 tests/libqos/vhost-user-blk.c | 125 ++++++
 tests/libqos/vhost-user-blk.h |  44 +++
 tests/vhost-user-blk-test.c   | 691 ++++++++++++++++++++++++++++++++++
 3 files changed, 860 insertions(+)
 create mode 100644 tests/libqos/vhost-user-blk.c
 create mode 100644 tests/libqos/vhost-user-blk.h
 create mode 100644 tests/vhost-user-blk-test.c

diff --git a/tests/libqos/vhost-user-blk.c b/tests/libqos/vhost-user-blk.c
new file mode 100644
index 0000000000..1f8e6eec7e
--- /dev/null
+++ b/tests/libqos/vhost-user-blk.c
@@ -0,0 +1,125 @@
+/*
+ * libqos driver framework
+ *
+ * Copyright (c) 2018 Emanuele Giuseppe Esposito <e.emanuelegiuseppe@gmail.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2 as published by the Free Software Foundation.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>
+ */
+
+#include "qemu/osdep.h"
+#include "libqtest.h"
+#include "qemu/module.h"
+#include "standard-headers/linux/virtio_blk.h"
+#include "libqos/qgraph.h"
+#include "libqos/vhost-user-blk.h"
+
+#define PCI_SLOT                0x04
+#define PCI_FN                  0x00
+
+/* vhost-user-blk-device */
+static void *qvhost_user_blk_get_driver(QVhostUserBlk *v_blk,
+                                    const char *interface)
+{
+    if (!g_strcmp0(interface, "vhost-user-blk")) {
+        return v_blk;
+    }
+    if (!g_strcmp0(interface, "virtio")) {
+        return v_blk->vdev;
+    }
+
+    fprintf(stderr, "%s not present in vhost-user-blk-device\n", interface);
+    g_assert_not_reached();
+}
+
+static void *qvhost_user_blk_device_get_driver(void *object,
+                                           const char *interface)
+{
+    QVhostUserBlkDevice *v_blk = object;
+    return qvhost_user_blk_get_driver(&v_blk->blk, interface);
+}
+
+static void *vhost_user_blk_device_create(void *virtio_dev,
+                                      QGuestAllocator *t_alloc,
+                                      void *addr)
+{
+    QVhostUserBlkDevice *vhost_user_blk = g_new0(QVhostUserBlkDevice, 1);
+    QVhostUserBlk *interface = &vhost_user_blk->blk;
+
+    interface->vdev = virtio_dev;
+
+    vhost_user_blk->obj.get_driver = qvhost_user_blk_device_get_driver;
+
+    return &vhost_user_blk->obj;
+}
+
+/* vhost-user-blk-pci */
+static void *qvhost_user_blk_pci_get_driver(void *object, const char *interface)
+{
+    QVhostUserBlkPCI *v_blk = object;
+    if (!g_strcmp0(interface, "pci-device")) {
+        return v_blk->pci_vdev.pdev;
+    }
+    return qvhost_user_blk_get_driver(&v_blk->blk, interface);
+}
+
+static void *vhost_user_blk_pci_create(void *pci_bus, QGuestAllocator *t_alloc,
+                                      void *addr)
+{
+    QVhostUserBlkPCI *vhost_user_blk = g_new0(QVhostUserBlkPCI, 1);
+    QVhostUserBlk *interface = &vhost_user_blk->blk;
+    QOSGraphObject *obj = &vhost_user_blk->pci_vdev.obj;
+
+    virtio_pci_init(&vhost_user_blk->pci_vdev, pci_bus, addr);
+    interface->vdev = &vhost_user_blk->pci_vdev.vdev;
+
+    g_assert_cmphex(interface->vdev->device_type, ==, VIRTIO_ID_BLOCK);
+
+    obj->get_driver = qvhost_user_blk_pci_get_driver;
+
+    return obj;
+}
+
+static void vhost_user_blk_register_nodes(void)
+{
+    /* FIXME: every test using these two nodes needs to setup a
+     * -drive,id=drive0 otherwise QEMU is not going to start.
+     * Therefore, we do not include "produces" edge for virtio
+     * and pci-device yet.
+    */
+
+    char *arg = g_strdup_printf("id=drv0,chardev=char1,addr=%x.%x",
+                                PCI_SLOT, PCI_FN);
+
+    QPCIAddress addr = {
+        .devfn = QPCI_DEVFN(PCI_SLOT, PCI_FN),
+    };
+
+    QOSGraphEdgeOptions opts = { };
+
+    /* virtio-blk-device */
+    /** opts.extra_device_opts = "drive=drive0"; */
+    qos_node_create_driver("vhost-user-blk-device", vhost_user_blk_device_create);
+    qos_node_consumes("vhost-user-blk-device", "virtio-bus", &opts);
+    qos_node_produces("vhost-user-blk-device", "vhost-user-blk");
+
+    /* virtio-blk-pci */
+    opts.extra_device_opts = arg;
+    add_qpci_address(&opts, &addr);
+    qos_node_create_driver("vhost-user-blk-pci", vhost_user_blk_pci_create);
+    qos_node_consumes("vhost-user-blk-pci", "pci-bus", &opts);
+    qos_node_produces("vhost-user-blk-pci", "vhost-user-blk");
+
+    g_free(arg);
+}
+
+libqos_init(vhost_user_blk_register_nodes);
diff --git a/tests/libqos/vhost-user-blk.h b/tests/libqos/vhost-user-blk.h
new file mode 100644
index 0000000000..ef4ef09cca
--- /dev/null
+++ b/tests/libqos/vhost-user-blk.h
@@ -0,0 +1,44 @@
+/*
+ * libqos driver framework
+ *
+ * Copyright (c) 2018 Emanuele Giuseppe Esposito <e.emanuelegiuseppe@gmail.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2 as published by the Free Software Foundation.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>
+ */
+
+#ifndef TESTS_LIBQOS_VHOST_USER_BLK_H
+#define TESTS_LIBQOS_VHOST_USER_BLK_H
+
+#include "libqos/qgraph.h"
+#include "libqos/virtio.h"
+#include "libqos/virtio-pci.h"
+
+typedef struct QVhostUserBlk QVhostUserBlk;
+typedef struct QVhostUserBlkPCI QVhostUserBlkPCI;
+typedef struct QVhostUserBlkDevice QVhostUserBlkDevice;
+
+struct QVhostUserBlk {
+    QVirtioDevice *vdev;
+};
+
+struct QVhostUserBlkPCI {
+    QVirtioPCIDevice pci_vdev;
+    QVhostUserBlk blk;
+};
+
+struct QVhostUserBlkDevice {
+    QOSGraphObject obj;
+    QVhostUserBlk blk;
+};
+
+#endif
diff --git a/tests/vhost-user-blk-test.c b/tests/vhost-user-blk-test.c
new file mode 100644
index 0000000000..d54769dd7f
--- /dev/null
+++ b/tests/vhost-user-blk-test.c
@@ -0,0 +1,691 @@
+/*
+ * QTest testcase for the vhost-user-blk device
+ *
+ * Copyright (c) 2014 SUSE LINUX Products GmbH
+ * Copyright (c) 2014 Marc Marí
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "libqtest-single.h"
+#include "qemu/bswap.h"
+#include "qemu/module.h"
+#include "standard-headers/linux/virtio_blk.h"
+#include "standard-headers/linux/virtio_pci.h"
+#include "libqos/qgraph.h"
+#include "libqos/vhost-user-blk.h"
+#include "libqos/libqos-pc.h"
+
+/* TODO actually test the results and get rid of this
+ * NOTE(review): no test in this file calls qmp_discard_response() - confirm
+ * it is still needed.
+ */
+#define qmp_discard_response(...) qobject_unref(qmp(__VA_ARGS__))
+
+#define TEST_IMAGE_SIZE         (64 * 1024 * 1024)  /* bytes; size of the raw image made by drive_create() */
+#define QVIRTIO_BLK_TIMEOUT_US  (30 * 1000 * 1000)  /* timeout passed to the qvirtio wait helpers; microseconds going by the _US suffix */
+#define PCI_SLOT_HP             0x06                /* PCI slot used by the pci_hotplug test */
+
+typedef struct QVirtioBlkReq {
+    uint32_t type;      /* VIRTIO_BLK_T_* request type */
+    uint32_t ioprio;    /* byte-swapped together with type and sector by virtio_blk_fix_request() */
+    uint64_t sector;    /* sector number the request addresses */
+    char *data;         /* host buffer that virtio_blk_request() copies into guest memory after the 16-byte header */
+    uint8_t status;     /* not read by virtio_blk_request(), which writes its own 0xFF status byte */
+} QVirtioBlkReq;
+
+
+#ifdef HOST_WORDS_BIGENDIAN
+static const bool host_is_big_endian = true;
+#else
+static const bool host_is_big_endian; /* false: a file-scope object without an
+                                       * initializer is zero-initialized */
+#endif
+
+/*
+ * Byte-swap the type, ioprio and sector fields of @req in place when the
+ * endianness reported by qvirtio_is_big_endian(@d) differs from the host's.
+ * Nothing is touched when both agree.
+ */
+static inline void virtio_blk_fix_request(QVirtioDevice *d, QVirtioBlkReq *req)
+{
+    if (qvirtio_is_big_endian(d) == host_is_big_endian) {
+        return;
+    }
+
+    req->type = bswap32(req->type);
+    req->ioprio = bswap32(req->ioprio);
+    req->sector = bswap64(req->sector);
+}
+
+
+/*
+ * Byte-swap the sector, num_sectors and flags fields of @dwz_hdr in place
+ * when the endianness reported by qvirtio_is_big_endian(@d) differs from the
+ * host's.  Nothing is touched when both agree.
+ */
+static inline void virtio_blk_fix_dwz_hdr(QVirtioDevice *d,
+    struct virtio_blk_discard_write_zeroes *dwz_hdr)
+{
+    if (qvirtio_is_big_endian(d) == host_is_big_endian) {
+        return;
+    }
+
+    dwz_hdr->sector = bswap64(dwz_hdr->sector);
+    dwz_hdr->num_sectors = bswap32(dwz_hdr->num_sectors);
+    dwz_hdr->flags = bswap32(dwz_hdr->flags);
+}
+
+/*
+ * Place a block request in guest memory and return its guest address.
+ *
+ * Guest layout: the first 16 bytes of @req (type, ioprio, sector), then
+ * @data_size bytes copied from req->data, then one status byte preset to
+ * 0xFF.
+ *
+ * @data_size must be a multiple of 512 for IN/OUT, a multiple of
+ * sizeof(struct virtio_blk_discard_write_zeroes) for DISCARD/WRITE_ZEROES,
+ * and 0 for any other request type.
+ *
+ * Note that @req itself is modified: virtio_blk_fix_request() is applied to
+ * it before it is copied.  The caller releases the returned address with
+ * guest_free().
+ */
+static uint64_t virtio_blk_request(QGuestAllocator *alloc, QVirtioDevice *d,
+                                   QVirtioBlkReq *req, uint64_t data_size)
+{
+    const uint32_t type = req->type;
+    const uint64_t hdr_len = 16;    /* type + ioprio + sector */
+    uint8_t initial_status = 0xFF;
+    uint64_t base;
+    uint64_t payload;
+    uint64_t status_addr;
+
+    if (type == VIRTIO_BLK_T_IN || type == VIRTIO_BLK_T_OUT) {
+        g_assert_cmpuint(data_size % 512, ==, 0);
+    } else if (type == VIRTIO_BLK_T_DISCARD ||
+               type == VIRTIO_BLK_T_WRITE_ZEROES) {
+        g_assert_cmpuint(data_size %
+                         sizeof(struct virtio_blk_discard_write_zeroes), ==, 0);
+    } else {
+        g_assert_cmpuint(data_size, ==, 0);
+    }
+
+    base = guest_alloc(alloc, sizeof(*req) + data_size);
+    payload = base + hdr_len;
+    status_addr = payload + data_size;
+
+    virtio_blk_fix_request(d, req);
+
+    memwrite(base, req, hdr_len);
+    memwrite(payload, req->data, data_size);
+    memwrite(status_addr, &initial_status, sizeof(initial_status));
+
+    return base;
+}
+
+/*
+ * Exercise the basic request paths of the block device behind @dev.
+ *
+ * Negotiates features (without indirect descriptors, event index and SCSI),
+ * checks that the capacity in the config space matches TEST_IMAGE_SIZE, sets
+ * up virtqueue 0 and then issues write/read round trips, plus WRITE_ZEROES,
+ * DISCARD and 2-descriptor requests when the matching feature bit is set.
+ *
+ * Returns the request virtqueue so the caller can perform further tests
+ */
+static QVirtQueue *test_basic(QVirtioDevice *dev, QGuestAllocator *alloc)
+{
+    QVirtioBlkReq req;
+    uint64_t req_addr;
+    uint64_t capacity;
+    uint64_t features;
+    uint32_t free_head;
+    uint8_t status;
+    char *data;
+    QTestState *qts = global_qtest;
+    QVirtQueue *vq;
+
+    features = qvirtio_get_features(dev);
+    features = features & ~(QVIRTIO_F_BAD_FEATURE |
+                    (1u << VIRTIO_RING_F_INDIRECT_DESC) |
+                    (1u << VIRTIO_RING_F_EVENT_IDX) |
+                    (1u << VIRTIO_BLK_F_SCSI));
+    qvirtio_set_features(dev, features);
+
+    capacity = qvirtio_config_readq(dev, 0);   /* compared in 512-byte sectors */
+    g_assert_cmpint(capacity, ==, TEST_IMAGE_SIZE / 512);
+
+    vq = qvirtqueue_setup(dev, alloc, 0);
+
+    qvirtio_set_driver_ok(dev);
+
+    /* Write and read with 3 descriptor layout:
+     * 16-byte header, 512 data bytes, 1 status byte (at offset 528)
+     */
+    /* Write request: store "TEST" in sector 0 */
+    req.type = VIRTIO_BLK_T_OUT;
+    req.ioprio = 1;
+    req.sector = 0;
+    req.data = g_malloc0(512);
+    strcpy(req.data, "TEST");
+
+    req_addr = virtio_blk_request(alloc, dev, &req, 512);
+
+    g_free(req.data);   /* already copied into guest memory */
+
+    free_head = qvirtqueue_add(qts, vq, req_addr, 16, false, true);
+    qvirtqueue_add(qts, vq, req_addr + 16, 512, false, true);
+    qvirtqueue_add(qts, vq, req_addr + 528, 1, true, false);
+
+    qvirtqueue_kick(qts, dev, vq, free_head);
+
+    qvirtio_wait_used_elem(qts, dev, vq, free_head, NULL,
+                           QVIRTIO_BLK_TIMEOUT_US);
+    status = readb(req_addr + 528);   /* preset to 0xFF by virtio_blk_request() */
+    g_assert_cmpint(status, ==, 0);
+
+    guest_free(alloc, req_addr);
+
+    /* Read request: read sector 0 back into a zeroed guest buffer */
+    req.type = VIRTIO_BLK_T_IN;
+    req.ioprio = 1;
+    req.sector = 0;
+    req.data = g_malloc0(512);
+
+    req_addr = virtio_blk_request(alloc, dev, &req, 512);
+
+    g_free(req.data);
+
+    free_head = qvirtqueue_add(qts, vq, req_addr, 16, false, true);
+    qvirtqueue_add(qts, vq, req_addr + 16, 512, true, true);
+    qvirtqueue_add(qts, vq, req_addr + 528, 1, true, false);
+
+    qvirtqueue_kick(qts, dev, vq, free_head);
+
+    qvirtio_wait_used_elem(qts, dev, vq, free_head, NULL,
+                           QVIRTIO_BLK_TIMEOUT_US);
+    status = readb(req_addr + 528);
+    g_assert_cmpint(status, ==, 0);
+
+    data = g_malloc0(512);
+    memread(req_addr + 16, data, 512);
+    g_assert_cmpstr(data, ==, "TEST");
+    g_free(data);
+
+    guest_free(alloc, req_addr);
+
+    if (features & (1u << VIRTIO_BLK_F_WRITE_ZEROES)) {
+        struct virtio_blk_discard_write_zeroes dwz_hdr;
+        void *expected;
+
+        /*
+         * WRITE_ZEROES request on the same sector of previous test where
+         * we wrote "TEST".
+         * req.data points at the stack-allocated dwz_hdr here, so it must
+         * not be passed to g_free().
+         */
+        req.type = VIRTIO_BLK_T_WRITE_ZEROES;
+        req.data = (char *) &dwz_hdr;
+        dwz_hdr.sector = 0;
+        dwz_hdr.num_sectors = 1;
+        dwz_hdr.flags = 0;
+
+        virtio_blk_fix_dwz_hdr(dev, &dwz_hdr);
+
+        req_addr = virtio_blk_request(alloc, dev, &req, sizeof(dwz_hdr));
+
+        free_head = qvirtqueue_add(qts, vq, req_addr, 16, false, true);
+        qvirtqueue_add(qts, vq, req_addr + 16, sizeof(dwz_hdr), false, true);
+        qvirtqueue_add(qts, vq, req_addr + 16 + sizeof(dwz_hdr), 1, true,
+                       false);
+
+        qvirtqueue_kick(qts, dev, vq, free_head);
+
+        qvirtio_wait_used_elem(qts, dev, vq, free_head, NULL,
+                               QVIRTIO_BLK_TIMEOUT_US);
+        status = readb(req_addr + 16 + sizeof(dwz_hdr));
+        g_assert_cmpint(status, ==, 0);
+
+        guest_free(alloc, req_addr);
+
+        /* Read request to check if the sector contains all zeroes */
+        req.type = VIRTIO_BLK_T_IN;
+        req.ioprio = 1;
+        req.sector = 0;
+        req.data = g_malloc0(512);
+
+        req_addr = virtio_blk_request(alloc, dev, &req, 512);
+
+        g_free(req.data);
+
+        free_head = qvirtqueue_add(qts, vq, req_addr, 16, false, true);
+        qvirtqueue_add(qts, vq, req_addr + 16, 512, true, true);
+        qvirtqueue_add(qts, vq, req_addr + 528, 1, true, false);
+
+        qvirtqueue_kick(qts, dev, vq, free_head);
+
+        qvirtio_wait_used_elem(qts, dev, vq, free_head, NULL,
+                               QVIRTIO_BLK_TIMEOUT_US);
+        status = readb(req_addr + 528);
+        g_assert_cmpint(status, ==, 0);
+
+        data = g_malloc(512);
+        expected = g_malloc0(512);   /* all-zero reference sector */
+        memread(req_addr + 16, data, 512);
+        g_assert_cmpmem(data, 512, expected, 512);
+        g_free(expected);
+        g_free(data);
+
+        guest_free(alloc, req_addr);
+    }
+
+    if (features & (1u << VIRTIO_BLK_F_DISCARD)) {   /* discard sector 0; only the status byte is checked */
+        struct virtio_blk_discard_write_zeroes dwz_hdr;
+
+        req.type = VIRTIO_BLK_T_DISCARD;
+        req.data = (char *) &dwz_hdr;
+        dwz_hdr.sector = 0;
+        dwz_hdr.num_sectors = 1;
+        dwz_hdr.flags = 0;
+
+        virtio_blk_fix_dwz_hdr(dev, &dwz_hdr);
+
+        req_addr = virtio_blk_request(alloc, dev, &req, sizeof(dwz_hdr));
+
+        free_head = qvirtqueue_add(qts, vq, req_addr, 16, false, true);
+        qvirtqueue_add(qts, vq, req_addr + 16, sizeof(dwz_hdr), false, true);
+        qvirtqueue_add(qts, vq, req_addr + 16 + sizeof(dwz_hdr), 1, true, false);
+
+        qvirtqueue_kick(qts, dev, vq, free_head);
+
+        qvirtio_wait_used_elem(qts, dev, vq, free_head, NULL,
+                               QVIRTIO_BLK_TIMEOUT_US);
+        status = readb(req_addr + 16 + sizeof(dwz_hdr));
+        g_assert_cmpint(status, ==, 0);
+
+        guest_free(alloc, req_addr);
+    }
+
+    if (features & (1u << VIRTIO_F_ANY_LAYOUT)) {
+        /* Write and read with 2 descriptor layout */
+        /* Write request: header and data share one 528-byte descriptor,
+         * the status byte gets the second one
+         */
+        req.type = VIRTIO_BLK_T_OUT;
+        req.ioprio = 1;
+        req.sector = 1;
+        req.data = g_malloc0(512);
+        strcpy(req.data, "TEST");
+
+        req_addr = virtio_blk_request(alloc, dev, &req, 512);
+
+        g_free(req.data);
+
+        free_head = qvirtqueue_add(qts, vq, req_addr, 528, false, true);
+        qvirtqueue_add(qts, vq, req_addr + 528, 1, true, false);
+        qvirtqueue_kick(qts, dev, vq, free_head);
+
+        qvirtio_wait_used_elem(qts, dev, vq, free_head, NULL,
+                               QVIRTIO_BLK_TIMEOUT_US);
+        status = readb(req_addr + 528);
+        g_assert_cmpint(status, ==, 0);
+
+        guest_free(alloc, req_addr);
+
+        /* Read request: 16-byte header alone, then data and status byte
+         * together in one 513-byte descriptor
+         */
+        req.type = VIRTIO_BLK_T_IN;
+        req.ioprio = 1;
+        req.sector = 1;
+        req.data = g_malloc0(512);
+
+        req_addr = virtio_blk_request(alloc, dev, &req, 512);
+
+        g_free(req.data);
+
+        free_head = qvirtqueue_add(qts, vq, req_addr, 16, false, true);
+        qvirtqueue_add(qts, vq, req_addr + 16, 513, true, false);
+
+        qvirtqueue_kick(qts, dev, vq, free_head);
+
+        qvirtio_wait_used_elem(qts, dev, vq, free_head, NULL,
+                               QVIRTIO_BLK_TIMEOUT_US);
+        status = readb(req_addr + 528);
+        g_assert_cmpint(status, ==, 0);
+
+        data = g_malloc0(512);
+        memread(req_addr + 16, data, 512);
+        g_assert_cmpstr(data, ==, "TEST");
+        g_free(data);
+
+        guest_free(alloc, req_addr);
+    }
+
+    return vq;
+}
+
+/*
+ * "basic" test entry point: run test_basic() on the virtio device of the
+ * QVhostUserBlk passed as @obj, then release the virtqueue it returned.
+ */
+static void basic(void *obj, void *data, QGuestAllocator *t_alloc)
+{
+    QVhostUserBlk *vu_blk = obj;
+    QVirtioDevice *vdev = vu_blk->vdev;
+    QVirtQueue *req_vq = test_basic(vdev, t_alloc);
+
+    qvirtqueue_cleanup(vdev->bus, req_vq, t_alloc);
+}
+
+/*
+ * "indirect" test entry point: write "TEST" to sector 0 and read it back,
+ * submitting each request through a two-entry indirect descriptor table.
+ *
+ * @obj is the QVhostUserBlk under test.  The device must offer
+ * VIRTIO_RING_F_INDIRECT_DESC; event index and SCSI are masked out.
+ */
+static void indirect(void *obj, void *u_data, QGuestAllocator *t_alloc)
+{
+    QVirtQueue *vq;
+    QVhostUserBlk *blk_if = obj;
+    QVirtioDevice *dev = blk_if->vdev;
+    QVirtioBlkReq req;
+    QVRingIndirectDesc *indirect;
+    uint64_t req_addr;
+    uint64_t capacity;
+    uint64_t features;
+    uint32_t free_head;
+    uint8_t status;
+    char *data;
+    QTestState *qts = global_qtest;
+
+    features = qvirtio_get_features(dev);
+    g_assert_cmphex(features & (1u << VIRTIO_RING_F_INDIRECT_DESC), !=, 0);
+    features = features & ~(QVIRTIO_F_BAD_FEATURE |
+                            (1u << VIRTIO_RING_F_EVENT_IDX) |
+                            (1u << VIRTIO_BLK_F_SCSI));
+    qvirtio_set_features(dev, features);
+
+    capacity = qvirtio_config_readq(dev, 0);
+    g_assert_cmpint(capacity, ==, TEST_IMAGE_SIZE / 512);
+
+    vq = qvirtqueue_setup(dev, t_alloc, 0);
+    qvirtio_set_driver_ok(dev);
+
+    /* Write request: header + data (528 bytes), then the status byte */
+    req.type = VIRTIO_BLK_T_OUT;
+    req.ioprio = 1;
+    req.sector = 0;
+    req.data = g_malloc0(512);
+    strcpy(req.data, "TEST");
+
+    req_addr = virtio_blk_request(t_alloc, dev, &req, 512);
+
+    g_free(req.data);
+
+    indirect = qvring_indirect_desc_setup(qts, dev, t_alloc, 2);
+    qvring_indirect_desc_add(dev, qts, indirect, req_addr, 528, false);
+    qvring_indirect_desc_add(dev, qts, indirect, req_addr + 528, 1, true);
+    free_head = qvirtqueue_add_indirect(qts, vq, indirect);
+    qvirtqueue_kick(qts, dev, vq, free_head);
+
+    qvirtio_wait_used_elem(qts, dev, vq, free_head, NULL,
+                           QVIRTIO_BLK_TIMEOUT_US);
+    status = readb(req_addr + 528);
+    g_assert_cmpint(status, ==, 0);
+
+    g_free(indirect);
+    guest_free(t_alloc, req_addr);
+
+    /*
+     * Read request: 16-byte header, then data + status byte (513 bytes).
+     *
+     * The guest buffer is deliberately left zeroed.  If it were pre-filled
+     * with "TEST", the comparison below would succeed even when the device
+     * never wrote the data.
+     */
+    req.type = VIRTIO_BLK_T_IN;
+    req.ioprio = 1;
+    req.sector = 0;
+    req.data = g_malloc0(512);
+
+    req_addr = virtio_blk_request(t_alloc, dev, &req, 512);
+
+    g_free(req.data);
+
+    indirect = qvring_indirect_desc_setup(qts, dev, t_alloc, 2);
+    qvring_indirect_desc_add(dev, qts, indirect, req_addr, 16, false);
+    qvring_indirect_desc_add(dev, qts, indirect, req_addr + 16, 513, true);
+    free_head = qvirtqueue_add_indirect(qts, vq, indirect);
+    qvirtqueue_kick(qts, dev, vq, free_head);
+
+    qvirtio_wait_used_elem(qts, dev, vq, free_head, NULL,
+                           QVIRTIO_BLK_TIMEOUT_US);
+    status = readb(req_addr + 528);
+    g_assert_cmpint(status, ==, 0);
+
+    data = g_malloc0(512);
+    memread(req_addr + 16, data, 512);
+    g_assert_cmpstr(data, ==, "TEST");
+    g_free(data);
+
+    g_free(indirect);
+    guest_free(t_alloc, req_addr);
+    qvirtqueue_cleanup(dev->bus, vq, t_alloc);
+}
+
+
+/*
+ * "idx" test entry point: exercise the used-event index with MSI-X enabled.
+ *
+ * @obj is the QVhostUserBlkPCI under test.  After one ordinary write, a
+ * second write is queued with the used event set to 2, and its completion
+ * is detected by polling the status byte rather than by a notification.
+ * A read of the same sector then completes and both requests are collected.
+ *
+ * The test returns early without running when qpci_check_buggy_msi()
+ * reports a problem.
+ */
+static void idx(void *obj, void *u_data, QGuestAllocator *t_alloc)
+{
+    QVirtQueue *vq;
+    QVhostUserBlkPCI *blk = obj;
+    QVirtioPCIDevice *pdev = &blk->pci_vdev;
+    QVirtioDevice *dev = &pdev->vdev;
+    QVirtioBlkReq req;
+    uint64_t req_addr;
+    uint64_t capacity;
+    uint64_t features;
+    uint32_t free_head;
+    uint32_t write_head;
+    uint32_t desc_idx;
+    uint8_t status;
+    char *data;
+    QOSGraphObject *blk_object = obj;
+    QPCIDevice *pci_dev = blk_object->get_driver(blk_object, "pci-device");
+    QTestState *qts = global_qtest;
+
+    if (qpci_check_buggy_msi(pci_dev)) {
+        return;
+    }
+
+    qpci_msix_enable(pdev->pdev);
+    qvirtio_pci_set_msix_configuration_vector(pdev, t_alloc, 0);
+
+    /* VIRTIO_RING_F_EVENT_IDX is intentionally left enabled here */
+    features = qvirtio_get_features(dev);
+    features = features & ~(QVIRTIO_F_BAD_FEATURE |
+                            (1u << VIRTIO_RING_F_INDIRECT_DESC) |
+                            (1u << VIRTIO_F_NOTIFY_ON_EMPTY) |
+                            (1u << VIRTIO_BLK_F_SCSI));
+    qvirtio_set_features(dev, features);
+
+    capacity = qvirtio_config_readq(dev, 0);
+    g_assert_cmpint(capacity, ==, TEST_IMAGE_SIZE / 512);
+
+    vq = qvirtqueue_setup(dev, t_alloc, 0);
+    qvirtqueue_pci_msix_setup(pdev, (QVirtQueuePCI *)vq, t_alloc, 1);
+
+    qvirtio_set_driver_ok(dev);
+
+    /* Write request */
+    req.type = VIRTIO_BLK_T_OUT;
+    req.ioprio = 1;
+    req.sector = 0;
+    req.data = g_malloc0(512);
+    strcpy(req.data, "TEST");
+
+    req_addr = virtio_blk_request(t_alloc, dev, &req, 512);
+
+    g_free(req.data);
+
+    free_head = qvirtqueue_add(qts, vq, req_addr, 16, false, true);
+    qvirtqueue_add(qts, vq, req_addr + 16, 512, false, true);
+    qvirtqueue_add(qts, vq, req_addr + 528, 1, true, false);
+    qvirtqueue_kick(qts, dev, vq, free_head);
+
+    qvirtio_wait_used_elem(qts, dev, vq, free_head, NULL,
+                           QVIRTIO_BLK_TIMEOUT_US);
+
+    /*
+     * The request is complete: verify it succeeded and release its guest
+     * buffer before req_addr is reused for the next request.
+     */
+    status = readb(req_addr + 528);
+    g_assert_cmpint(status, ==, 0);
+
+    guest_free(t_alloc, req_addr);
+
+    /* Write request */
+    req.type = VIRTIO_BLK_T_OUT;
+    req.ioprio = 1;
+    req.sector = 1;
+    req.data = g_malloc0(512);
+    strcpy(req.data, "TEST");
+
+    req_addr = virtio_blk_request(t_alloc, dev, &req, 512);
+
+    g_free(req.data);
+
+    /* Notify after processing the third request */
+    qvirtqueue_set_used_event(qts, vq, 2);
+    free_head = qvirtqueue_add(qts, vq, req_addr, 16, false, true);
+    qvirtqueue_add(qts, vq, req_addr + 16, 512, false, true);
+    qvirtqueue_add(qts, vq, req_addr + 528, 1, true, false);
+    qvirtqueue_kick(qts, dev, vq, free_head);
+    write_head = free_head;
+
+    /* No notification expected */
+    status = qvirtio_wait_status_byte_no_isr(qts, dev,
+                                             vq, req_addr + 528,
+                                             QVIRTIO_BLK_TIMEOUT_US);
+    g_assert_cmpint(status, ==, 0);
+
+    guest_free(t_alloc, req_addr);
+
+    /* Read request */
+    req.type = VIRTIO_BLK_T_IN;
+    req.ioprio = 1;
+    req.sector = 1;
+    req.data = g_malloc0(512);
+
+    req_addr = virtio_blk_request(t_alloc, dev, &req, 512);
+
+    g_free(req.data);
+
+    free_head = qvirtqueue_add(qts, vq, req_addr, 16, false, true);
+    qvirtqueue_add(qts, vq, req_addr + 16, 512, true, true);
+    qvirtqueue_add(qts, vq, req_addr + 528, 1, true, false);
+
+    qvirtqueue_kick(qts, dev, vq, free_head);
+
+    /* We get just one notification for both requests */
+    qvirtio_wait_used_elem(qts, dev, vq, write_head, NULL,
+                           QVIRTIO_BLK_TIMEOUT_US);
+    g_assert(qvirtqueue_get_buf(qts, vq, &desc_idx, NULL));
+    g_assert_cmpint(desc_idx, ==, free_head);
+
+    status = readb(req_addr + 528);
+    g_assert_cmpint(status, ==, 0);
+
+    data = g_malloc0(512);
+    memread(req_addr + 16, data, 512);
+    g_assert_cmpstr(data, ==, "TEST");
+    g_free(data);
+
+    guest_free(t_alloc, req_addr);
+
+    /* End test */
+    qpci_msix_disable(pdev->pdev);
+
+    qvirtqueue_cleanup(dev->bus, vq, t_alloc);
+}
+
+/*
+ * "hotplug" test entry point: hot-add a second vhost-user-blk-pci device
+ * ("drv1", backed by chardev "char2") at slot PCI_SLOT_HP, check that it
+ * identifies as a virtio block device, then unplug it again.
+ *
+ * @obj is the already present QVirtioPCIDevice; its bus is used to reach the
+ * new device.
+ */
+static void pci_hotplug(void *obj, void *data, QGuestAllocator *t_alloc)
+{
+    QVirtioPCIDevice *dev1 = obj;
+    QTestState *qts = dev1->pdev->bus->qts;
+    QPCIAddress hp_addr = { .devfn = QPCI_DEVFN(PCI_SLOT_HP, 0) };
+    QVirtioPCIDevice *dev;
+
+    /* plug secondary disk */
+    qtest_qmp_device_add(qts, "vhost-user-blk-pci", "drv1",
+                         "{'addr': %s, 'chardev': 'char2'}",
+                         stringify(PCI_SLOT_HP) ".0");
+
+    /* look the new device up on the bus and check its type */
+    dev = virtio_pci_new(dev1->pdev->bus, &hp_addr);
+    g_assert_nonnull(dev);
+    g_assert_cmpint(dev->vdev.device_type, ==, VIRTIO_ID_BLOCK);
+    qvirtio_pci_device_disable(dev);
+    qos_object_destroy((QOSGraphObject *)dev);
+
+    /* unplug secondary disk */
+    qpci_unplug_acpi_device_test(qts, "drv1", PCI_SLOT_HP);
+}
+
+/*
+ * Check that setting the vring addr on a non-existent virtqueue does
+ * not crash.
+ *
+ * The PCI device at devfn (4, 0) is enabled, BAR 0 is mapped, queue 2 is
+ * selected and a queue PFN of 1 is written.  The test has no explicit
+ * assertion beyond finding the device; it passes if nothing crashes.
+ */
+static void test_nonexistent_virtqueue(void *obj, void *data,
+                                       QGuestAllocator *t_alloc)
+{
+    QVhostUserBlkPCI *blk = obj;
+    QPCIDevice *dev = qpci_device_find(blk->pci_vdev.pdev->bus,
+                                       QPCI_DEVFN(4, 0));
+    QPCIBar bar0;
+
+    g_assert(dev != NULL);
+    qpci_device_enable(dev);
+    bar0 = qpci_iomap(dev, 0, NULL);
+
+    /* select the queue first, then program its address */
+    qpci_io_writeb(dev, bar0, VIRTIO_PCI_QUEUE_SEL, 2);
+    qpci_io_writel(dev, bar0, VIRTIO_PCI_QUEUE_PFN, 1);
+
+    g_free(dev);
+}
+
+/*
+ * Return the value of the QTEST_QEMU_VU_BINARY environment variable.
+ *
+ * If the variable is unset, a message is printed to stderr and the process
+ * terminates.
+ * NOTE(review): the exit status is 0, so a missing variable is not reported
+ * as a failure - confirm this is intended.
+ */
+static const char *qtest_qemu_vu_binary(void)
+{
+    const char *path = getenv("QTEST_QEMU_VU_BINARY");
+
+    if (path) {
+        return path;
+    }
+
+    fprintf(stderr, "Environment variable QTEST_QEMU_VU_BINARY required\n");
+    exit(0);
+}
+
+/*
+ * Destroy callback for an image created by drive_create(): remove the file
+ * named by @path, free the path string and invalidate the qos command line.
+ */
+static void drive_destroy(void *path)
+{
+    char *img_path = path;
+
+    unlink(img_path);
+    g_free(img_path);
+    qos_invalidate_command_line();
+}
+
+
+/*
+ * Create a temporary raw image of TEST_IMAGE_SIZE bytes and return its path.
+ *
+ * The path string and the file stay owned by the test framework:
+ * drive_destroy() is queued with g_test_queue_destroy() to remove both, so
+ * the caller must not free the returned pointer.
+ */
+static char *drive_create(void)
+{
+    /*
+     * The template has no directory part, so the file is not placed in
+     * /tmp: vhost-user-blk won't recognize drive located in /tmp
+     */
+    char *t_path = g_strdup("qtest.XXXXXX");
+    int fd = mkstemp(t_path);
+    int ret;
+
+    g_assert_cmpint(fd, >=, 0);
+
+    /* grow the empty file to the image size */
+    ret = ftruncate(fd, TEST_IMAGE_SIZE);
+    g_assert_cmpint(ret, ==, 0);
+    close(fd);
+
+    g_test_queue_destroy(drive_destroy, t_path);
+    return t_path;
+}
+
+
+
+/*
+ * Start the qemu-vu backend in a child process, passing it @img_path and
+ * @sock_path, then wait one second so the socket server can come up.
+ *
+ * The binary is taken from qtest_qemu_vu_binary() and launched through
+ * "/bin/sh -c exec ...".  The child is not waited for.
+ */
+static void start_vhost_user_blk(const char *img_path, const char *sock_path)
+{
+    const char *vhost_user_blk_bin = qtest_qemu_vu_binary();
+    gchar *command;
+    pid_t pid;
+
+    /*
+     * "qemu-vu -e" will exit when the client disconnects thus the launched
+     * qemu-vu process will not block scripts/tap-driver.pl
+     */
+    command = g_strdup_printf("exec %s "
+                              "-e "
+                              "-k %s "
+                              "-f raw "
+                              "%s",
+                              vhost_user_blk_bin,
+                              sock_path, img_path);
+    g_test_message("starting vhost-user backend: %s", command);
+
+    pid = fork();
+    g_assert_cmpint(pid, >=, 0);
+    if (pid == 0) {
+        execlp("/bin/sh", "sh", "-c", command, (char *)NULL);
+        /*
+         * Only reached when exec failed.  Use _exit() so the child does not
+         * run the parent's atexit handlers or flush its stdio buffers.
+         */
+        _exit(1);
+    }
+
+    /* the child has its own copy of the string */
+    g_free(command);
+
+    /*
+     * make sure qemu-vu i.e. socket server is started before tests
+     * otherwise qemu will complain,
+     * "Failed to connect socket ... Connection refused"
+     */
+    g_usleep(G_USEC_PER_SEC);
+}
+
+/*
+ * "before" hook of the tests: create the backing image, start the backend
+ * on a fixed socket path and append the matching options to @cmd_line.
+ *
+ * Returns @arg unchanged.
+ */
+static void *vhost_user_blk_test_setup(GString *cmd_line, void *arg)
+{
+    const char *sock_path = "/tmp/vhost-user-blk_vhost.socket";
+    const char *img_path;
+
+    /* create image file */
+    img_path = drive_create();
+    start_vhost_user_blk(img_path, sock_path);
+
+    /*
+     * Both chardevs point at the same socket.
+     * "-chardev socket,id=char2" is used for pci_hotplug
+     */
+    g_string_append_printf(cmd_line,
+                           " -object memory-backend-memfd,id=mem,size=128M,share=on -numa node,memdev=mem "
+                           "-chardev socket,id=char1,path=%s "
+                           "-chardev socket,id=char2,path=%s",
+                           sock_path, sock_path);
+
+    return arg;
+}
+
+/*
+ * Register the tests for vhost-user-blk and vhost-user-blk-pci.
+ *
+ * The tests are borrowed from tests/virtio-blk-test.c.  Two of the tests
+ * there are left out because they rely on block_resize, which does not work
+ * for vhost-user-blk (the device doesn't have -drive):
+ *  - config
+ *  - resize
+ */
+static void register_vhost_user_blk_test(void)
+{
+    QOSGraphTestOptions opts = { .before = vhost_user_blk_test_setup };
+
+    /* tests registered on the "vhost-user-blk" interface */
+    qos_add_test("basic", "vhost-user-blk", basic, &opts);
+    qos_add_test("indirect", "vhost-user-blk", indirect, &opts);
+
+    /* tests registered on the "vhost-user-blk-pci" device */
+    qos_add_test("idx", "vhost-user-blk-pci", idx, &opts);
+    qos_add_test("nxvirtq", "vhost-user-blk-pci", test_nonexistent_virtqueue,
+                 &opts);
+    qos_add_test("hotplug", "vhost-user-blk-pci", pci_hotplug, &opts);
+}
+
+libqos_init(register_vhost_user_blk_test);
--
2.24.1



^ permalink raw reply related	[flat|nested] 19+ messages in thread

* [PATCH v2 5/5] building configuration files changes
  2020-01-14 14:06 [PATCH v2 0/5] vhost-user block device backend implementation Coiby Xu
                   ` (3 preceding siblings ...)
  2020-01-14 14:06 ` [PATCH v2 4/5] new qTest case for the vhost-user-blk device backend Coiby Xu
@ 2020-01-14 14:06 ` Coiby Xu
  2020-01-16 11:07   ` Kevin Wolf
  4 siblings, 1 reply; 19+ messages in thread
From: Coiby Xu @ 2020-01-14 14:06 UTC (permalink / raw)
  To: qemu-devel; +Cc: kwolf, bharatlkmlkvm, Coiby Xu, stefanha

libvhost-user depends on sys/poll.h, sys/socket.h and eventfd. Although
Windows has the equivalent Winsock, it doesn't have eventfd. So only
enable this feature on Linux.

Signed-off-by: Coiby Xu <coiby.xu@gmail.com>
---
 Makefile               | 1 +
 Makefile.objs          | 2 +-
 Makefile.target        | 1 +
 configure              | 2 +-
 tests/Makefile.include | 5 ++++-
 5 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/Makefile b/Makefile
index 6b5ad1121b..80c514f96b 100644
--- a/Makefile
+++ b/Makefile
@@ -558,6 +558,10 @@ qemu-img.o: qemu-img-cmds.h

 qemu-img$(EXESUF): qemu-img.o $(authz-obj-y) $(block-obj-y) $(crypto-obj-y) $(io-obj-y) $(qom-obj-y) $(COMMON_LDADDS)
 qemu-nbd$(EXESUF): qemu-nbd.o $(authz-obj-y) $(block-obj-y) $(crypto-obj-y) $(io-obj-y) $(qom-obj-y) $(COMMON_LDADDS)
+
+ifdef CONFIG_LINUX
+qemu-vu$(EXESUF): qemu-vu.o blockdev-vu.o $(authz-obj-y) $(block-obj-y) $(crypto-obj-y) $(io-obj-y) $(qom-obj-y) $(COMMON_LDADDS) libvhost-user.a
+endif
 qemu-io$(EXESUF): qemu-io.o $(authz-obj-y) $(block-obj-y) $(crypto-obj-y) $(io-obj-y) $(qom-obj-y) $(COMMON_LDADDS)

 qemu-bridge-helper$(EXESUF): qemu-bridge-helper.o $(COMMON_LDADDS)
diff --git a/Makefile.objs b/Makefile.objs
index 7c1e50f9d6..5404daca00 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -53,6 +53,7 @@ common-obj-$(CONFIG_WIN32) += os-win32.o
 common-obj-$(CONFIG_POSIX) += os-posix.o

 common-obj-$(CONFIG_LINUX) += fsdev/
+common-obj-$(CONFIG_LINUX) += blockdev-vu.o

 common-obj-y += accel/
 common-obj-y += migration/
diff --git a/Makefile.target b/Makefile.target
index 6e61f607b1..d5405976ce 100644
--- a/Makefile.target
+++ b/Makefile.target
@@ -159,6 +159,9 @@ obj-y += monitor/
 obj-y += qapi/
 obj-y += memory.o
 obj-y += memory_mapping.o
+ifdef CONFIG_LINUX
+obj-y += ../contrib/libvhost-user/libvhost-user.o
+endif
 obj-y += migration/ram.o
 LIBS := $(libs_softmmu) $(LIBS)

diff --git a/configure b/configure
index 0ce2c0354a..8ad4da57be 100755
--- a/configure
+++ b/configure
@@ -6169,6 +6169,9 @@ if test "$want_tools" = "yes" ; then
   if [ "$linux" = "yes" -o "$bsd" = "yes" -o "$solaris" = "yes" ] ; then
     tools="qemu-nbd\$(EXESUF) $tools"
   fi
+  if [ "$linux" = "yes" ] ; then
+    tools="qemu-vu\$(EXESUF) $tools"
+  fi
   if [ "$ivshmem" = "yes" ]; then
     tools="ivshmem-client\$(EXESUF) ivshmem-server\$(EXESUF) $tools"
   fi
diff --git a/tests/Makefile.include b/tests/Makefile.include
index 49e3b0d319..33bd48247b 100644
--- a/tests/Makefile.include
+++ b/tests/Makefile.include
@@ -746,6 +746,7 @@ qos-test-obj-y += tests/libqos/virtio.o
 qos-test-obj-$(CONFIG_VIRTFS) += tests/libqos/virtio-9p.o
 qos-test-obj-y += tests/libqos/virtio-balloon.o
 qos-test-obj-y += tests/libqos/virtio-blk.o
+qos-test-obj-y += tests/libqos/vhost-user-blk.o
 qos-test-obj-y += tests/libqos/virtio-mmio.o
 qos-test-obj-y += tests/libqos/virtio-net.o
 qos-test-obj-y += tests/libqos/virtio-pci.o
@@ -788,6 +789,7 @@ qos-test-obj-$(CONFIG_VHOST_NET_USER) += tests/vhost-user-test.o $(chardev-obj-y
 qos-test-obj-y += tests/virtio-test.o
 qos-test-obj-$(CONFIG_VIRTFS) += tests/virtio-9p-test.o
 qos-test-obj-y += tests/virtio-blk-test.o
+qos-test-obj-$(CONFIG_LINUX) += tests/vhost-user-blk-test.o
 qos-test-obj-y += tests/virtio-net-test.o
 qos-test-obj-y += tests/virtio-rng-test.o
 qos-test-obj-y += tests/virtio-scsi-test.o
@@ -935,7 +937,8 @@ endef
 $(patsubst %, check-qtest-%, $(QTEST_TARGETS)): check-qtest-%: %-softmmu/all $(check-qtest-y)
 	$(call do_test_human,$(check-qtest-$*-y) $(check-qtest-generic-y), \
 	  QTEST_QEMU_BINARY=$*-softmmu/qemu-system-$* \
-	  QTEST_QEMU_IMG=qemu-img$(EXESUF))
+	  QTEST_QEMU_IMG=./qemu-img$(EXESUF) \
+	  QTEST_QEMU_VU_BINARY=./qemu-vu$(EXESUF))

 check-unit: $(check-unit-y)
 	$(call do_test_human, $^)
--
2.24.1



^ permalink raw reply related	[flat|nested] 19+ messages in thread

* Re: [PATCH v2 5/5] building configuration files changes
  2020-01-14 14:06 ` [PATCH v2 5/5] building configuration files changes Coiby Xu
@ 2020-01-16 11:07   ` Kevin Wolf
  0 siblings, 0 replies; 19+ messages in thread
From: Kevin Wolf @ 2020-01-16 11:07 UTC (permalink / raw)
  To: Coiby Xu; +Cc: bharatlkmlkvm, qemu-devel, stefanha

Am 14.01.2020 um 15:06 hat Coiby Xu geschrieben:
> libvhost-user depends on sys/poll.h, sys/socket.h and eventfd. Although
> Windows has the equivalent Winsock, it doesn't have eventfd. So only
> enable this feature on Linux.
> 
> Signed-off-by: Coiby Xu <coiby.xu@gmail.com>

This shouldn't be a separate patch at the end of the series, but each
patch should make sure that the files it adds are actually built. This
way you can apply individual patches and they still make sense. It also
helps for bisecting.

While applying the patches locally for review, I noticed that there is a
merge conflict with recent changes that moved qos test cases to a
different subdirectory. I resolved the conflict manually, but for v3 you
should rebase.

Kevin



^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH v2 1/5] vhost-user block device backend
  2020-01-14 14:06 ` [PATCH v2 1/5] vhost-user block device backend Coiby Xu
@ 2020-01-16 13:51   ` Stefan Hajnoczi
  2020-01-16 14:20     ` Kevin Wolf
  2020-02-14  3:07     ` Coiby Xu
  2020-01-16 13:56   ` Kevin Wolf
  1 sibling, 2 replies; 19+ messages in thread
From: Stefan Hajnoczi @ 2020-01-16 13:51 UTC (permalink / raw)
  To: Coiby Xu; +Cc: kwolf, bharatlkmlkvm, qemu-devel

[-- Attachment #1: Type: text/plain, Size: 36346 bytes --]

On Tue, Jan 14, 2020 at 10:06:16PM +0800, Coiby Xu wrote:
> By making use of libvhost, multiple block device drives can be exported and each drive can serve multiple clients simultaneously. Since vhost-user-server needs a block drive to be created first, delay the creation of this object.
> 
> Signed-off-by: Coiby Xu <coiby.xu@gmail.com>
> ---
>  blockdev-vu.c              | 1008 ++++++++++++++++++++++++++++++++++++

This file contains both vhost-user-blk code and generic vhost-user
protocol code.  Please split them into two files:
1. backends/vhost-user-blk-server.c
2. util/vhost-user-server.c

(As I read and understand the code better I may have better suggestions
about where to put these source files and how to name them.)

>  include/block/vhost-user.h |   46 ++
>  vl.c                       |    4 +
>  3 files changed, 1058 insertions(+)
>  create mode 100644 blockdev-vu.c
>  create mode 100644 include/block/vhost-user.h
> 
> diff --git a/blockdev-vu.c b/blockdev-vu.c
> new file mode 100644
> index 0000000000..45f0bb43a7
> --- /dev/null
> +++ b/blockdev-vu.c
> @@ -0,0 +1,1008 @@
> +#include "qemu/osdep.h"
> +#include "block/vhost-user.h"
> +#include "qapi/error.h"
> +#include "qapi/qapi-types-sockets.h"
> +#include "qapi/qapi-commands-block.h"
> +
> +#include "sysemu/block-backend.h"
> +#include "qemu/main-loop.h"
> +
> +#include "qemu/units.h"
> +
> +#include "block/block.h"
> +
> +#include "qom/object_interfaces.h"
> +
> +#include <sys/eventfd.h>
> +
> +#include "hw/qdev-properties.h"
> +enum {
> +    VHOST_USER_BLK_MAX_QUEUES = 8,
> +};

The number of queues is hardcoded to 1 so this constant can be removed
for now.

> +
> +struct virtio_blk_inhdr {
> +    unsigned char status;
> +};
> +
> +
> +static QTAILQ_HEAD(, VubDev) vub_devs = QTAILQ_HEAD_INITIALIZER(vub_devs);

I'm not sure if this list will be necessary.  See my comments about
replacing the "name" property with Object's built-in "id" property.

> +
> +
> +typedef struct VubReq {
> +    VuVirtqElement *elem;
> +    int64_t sector_num;
> +    size_t size;
> +    struct virtio_blk_inhdr *in;
> +    struct virtio_blk_outhdr out;
> +    VuClient *client;
> +    struct VuVirtq *vq;
> +} VubReq;
> +
> +static void
> +remove_watch(VuDev *vu_dev, int fd)
> +{
> +    VuClient *client;
> +
> +    g_assert(vu_dev);
> +    g_assert(fd >= 0);
> +
> +    client = container_of(vu_dev, VuClient, parent);
> +    aio_set_fd_handler(client->blk->ctx, fd, false, NULL, NULL, NULL, NULL);
> +}
> +
> +static void close_client(VuClient *client)
> +{
> +    vu_deinit(&client->parent);
> +    /** g_source_destroy(vub_device->parent.src); */
> +    client->sioc = NULL;
> +    object_unref(OBJECT(client->ioc));
> +    client->closed = true;
> +
> +}
> +
> +static void vub_panic_cb(VuDev *vu_dev, const char *buf)
> +{
> +    if (buf) {
> +        g_warning("vu_panic: %s", buf);
> +    }
> +
> +    VuClient *client = container_of(vu_dev, VuClient, parent);
> +    if (client->blk->exit_panic) {
> +        client->blk->close = true;
> +    }
> +    if (!client->closed) {
> +        close_client(client);
> +    }
> +}
> +
> +
> +static void vub_req_complete(VubReq *req)
> +{
> +    VuDev *vu_dev = &req->client->parent;
> +
> +    /* IO size with 1 extra status byte */
> +    vu_queue_push(vu_dev, req->vq, req->elem,
> +                  req->size + 1);
> +    vu_queue_notify(vu_dev, req->vq);
> +
> +    if (req->elem) {
> +        free(req->elem);
> +    }
> +
> +    g_free(req);
> +}
> +
> +
> +
> +static int
> +vub_discard_write_zeroes(VubReq *req, struct iovec *iov, uint32_t iovcnt,
> +                         uint32_t type)
> +{
> +    struct virtio_blk_discard_write_zeroes *desc;
> +    ssize_t size;
> +    void *buf;
> +
> +    size = iov_size(iov, iovcnt);
> +    if (size != sizeof(*desc)) {
> +        fprintf(stderr, "Invalid size %ld, expect %ld\n", size, sizeof(*desc));
> +        return -1;
> +    }
> +    buf = g_new0(char, size);
> +
> +    iov_to_buf_full(iov, iovcnt, 0, buf, size);

Simpler:

  struct virtio_blk_discard_write_zeroes desc;

  if (unlikely(iov_to_buf(iov, iovcnt, 0, &desc, sizeof(desc)) !=
               sizeof(desc))) {
      /* TODO handle error */
  }

No heap allocation.  One variable (desc) instead of two (desc and buf).

> +
> +
> +    #if defined(__linux__) && defined(BLKDISCARD) && defined(BLKZEROOUT)
> +    VubDev *vdev_blk;
> +    VuClient *client = container_of(dev, VuClient, parent);
> +    vdev_blk = client->blk;
> +    desc = (struct virtio_blk_discard_write_zeroes *)buf;
> +    uint64_t range[2] = { le64toh(desc->sector) << 9,
> +                          le32toh(desc->num_sectors) << 9 };
> +    if (type == VIRTIO_BLK_T_DISCARD) {
> +        if (blk_pdiscard(vdev_blk->blk, range[0], range[1]) == 0) {
> +            g_free(buf);
> +            return 0;
> +        }
> +    } else if (type == VIRTIO_BLK_T_WRITE_ZEROES) {
> +        if (blk_pwrite_zeroes(vdev_blk->blk, range[0], range[1]) == 0) {
> +            g_free(buf);
> +            return 0;
> +        }
> +    }
> +    #endif
> +
> +    g_free(buf);
> +    return -1;
> +}
> +
> +
> +static void
> +vub_flush(VubReq *req)
> +{
> +    VuClient *client = req->client;
> +    blk_co_flush(client->blk->backend);
> +}

Any function that calls a *_co_*() function must be marked coroutine_fn:

  static void coroutine_fn
  vub_flush(VubReq *req)

> +
> +
> +#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
> +typedef struct BlkRwCo {
> +    BlockBackend *blk;
> +    int64_t offset;
> +    void *iobuf;
> +    int ret;
> +    BdrvRequestFlags flags;
> +} BlkRwCo;
> +
> +static void blk_read_entry(void *opaque)
> +{
> +    BlkRwCo *rwco = opaque;
> +    QEMUIOVector *qiov = rwco->iobuf;
> +
> +    rwco->ret = blk_co_preadv(rwco->blk, rwco->offset, qiov->size,
> +                              qiov, rwco->flags);
> +    aio_wait_kick();
> +}
> +
> +
> +static void blk_write_entry(void *opaque)
> +{
> +    BlkRwCo *rwco = opaque;
> +    QEMUIOVector *qiov = rwco->iobuf;
> +
> +    rwco->ret = blk_co_pwritev(rwco->blk, rwco->offset, qiov->size,
> +                              qiov, rwco->flags);
> +    aio_wait_kick();
> +}
> +
> +
> +static int blk_prw(BlockBackend *blk, QEMUIOVector *qiov, int64_t offset,
> +                   CoroutineEntry co_entry, BdrvRequestFlags flags)

Why is block/block-backend.c:blk_prw() duplicated here?

> +{
> +
> +    BlkRwCo rwco = {
> +        .blk    = blk,
> +        .offset = offset,
> +        .iobuf  = qiov,
> +        .flags  = flags,
> +        .ret    = NOT_DONE,
> +    };
> +
> +    if (qemu_in_coroutine()) {
> +        /* Fast-path if already in coroutine context */
> +        co_entry(&rwco);
> +    } else {
> +        Coroutine *co = qemu_coroutine_create(co_entry, &rwco);
> +        bdrv_coroutine_enter(blk_bs(blk), co);
> +        BDRV_POLL_WHILE(blk_bs(blk), rwco.ret == NOT_DONE);
> +    }
> +
> +    return rwco.ret;
> +}
> +
> +
> +static ssize_t
> +vub_rwv(VubReq *req, struct iovec *iov,
> +        uint32_t iovcnt,
> +        CoroutineEntry co_entry)
> +{
> +    VuClient *client = req->client;
> +    ssize_t rc;
> +
> +    if (!iovcnt) {
> +        fprintf(stderr, "Invalid Read/Write IOV count\n");
> +        return -1;
> +    }
> +
> +    int64_t offset = req->sector_num * 512;
> +    QEMUIOVector *qiov = g_new0(QEMUIOVector, 1);
> +    qemu_iovec_init_external(qiov, iov, iovcnt);
> +    rc = blk_prw(client->blk->backend, qiov, offset, co_entry, 0);
> +
> +    req->size = iov_size(iov, iovcnt);
> +    if (rc < 0) {
> +        fprintf(stderr, "%s, Sector %"PRIu64", Size %lu failed with %s\n",
> +                client->blk->name, req->sector_num, req->size,
> +                strerror(errno));
> +        return -1;
> +    }
> +
> +    return rc;
> +}
> +
> +static int vub_virtio_process_req(VuClient *client,
> +                                     VuVirtq *vq)
> +{
> +    VuDev *vu_dev = &client->parent;
> +    VuVirtqElement *elem;
> +    uint32_t type;
> +    VubReq *req;
> +
> +    elem = vu_queue_pop(vu_dev, vq, sizeof(VuVirtqElement) + sizeof(VubReq));
> +    if (!elem) {
> +        return -1;
> +    }
> +
> +    struct iovec *in_iov = elem->in_sg;
> +    struct iovec *out_iov = elem->out_sg;
> +    unsigned in_num = elem->in_num;
> +    unsigned out_num = elem->out_num;
> +    /* refer to hw/block/virtio_blk.c */
> +    if (elem->out_num < 1 || elem->in_num < 1) {
> +        fprintf(stderr, "virtio-blk request missing headers\n");
> +        free(elem);
> +        return -1;
> +    }
> +
> +    req = g_new0(VubReq, 1);
> +    req->client = client;
> +    req->vq = vq;
> +    req->elem = elem;
> +
> +    if (unlikely(iov_to_buf(out_iov, out_num, 0, &req->out,
> +                            sizeof(req->out)) != sizeof(req->out))) {
> +        fprintf(stderr, "virtio-blk request outhdr too short");
> +        goto err;
> +    }
> +
> +    iov_discard_front(&out_iov, &out_num, sizeof(req->out));
> +
> +    if (in_iov[in_num - 1].iov_len < sizeof(struct virtio_blk_inhdr)) {
> +        fprintf(stderr, "virtio-blk request inhdr too short");
> +        goto err;
> +    }
> +
> +    /* We always touch the last byte, so just see how big in_iov is.  */
> +    req->in = (void *)in_iov[in_num - 1].iov_base
> +              + in_iov[in_num - 1].iov_len
> +              - sizeof(struct virtio_blk_inhdr);
> +    iov_discard_back(in_iov, &in_num, sizeof(struct virtio_blk_inhdr));
> +
> +
> +    type = le32toh(req->out.type);
> +    switch (type & ~VIRTIO_BLK_T_BARRIER) {
> +    case VIRTIO_BLK_T_IN:
> +    case VIRTIO_BLK_T_OUT: {
> +        ssize_t ret = 0;
> +        bool is_write = type & VIRTIO_BLK_T_OUT;
> +        req->sector_num = le64toh(req->out.sector);
> +        if (is_write) {
> +            ret = vub_rwv(req, out_iov, out_num, blk_write_entry);
> +        } else {
> +            ret = vub_rwv(req, in_iov, in_num, blk_read_entry);
> +        }
> +        if (ret >= 0) {
> +            req->in->status = VIRTIO_BLK_S_OK;
> +        } else {
> +            req->in->status = VIRTIO_BLK_S_IOERR;
> +        }
> +        vub_req_complete(req);
> +        break;
> +    }
> +    case VIRTIO_BLK_T_FLUSH:
> +        vub_flush(req);
> +        req->in->status = VIRTIO_BLK_S_OK;
> +        vub_req_complete(req);
> +        break;
> +    case VIRTIO_BLK_T_GET_ID: {
> +        size_t size = MIN(iov_size(&elem->in_sg[0], in_num),
> +                          VIRTIO_BLK_ID_BYTES);
> +        snprintf(elem->in_sg[0].iov_base, size, "%s", "vhost_user_blk");
> +        req->in->status = VIRTIO_BLK_S_OK;
> +        req->size = elem->in_sg[0].iov_len;
> +        vub_req_complete(req);
> +        break;
> +    }
> +    case VIRTIO_BLK_T_DISCARD:
> +    case VIRTIO_BLK_T_WRITE_ZEROES: {
> +        int rc;
> +        rc = vub_discard_write_zeroes(req, &elem->out_sg[1], out_num, type);
> +        if (rc == 0) {
> +            req->in->status = VIRTIO_BLK_S_OK;
> +        } else {
> +            req->in->status = VIRTIO_BLK_S_IOERR;
> +        }
> +        vub_req_complete(req);
> +        break;
> +    }
> +    default:
> +        req->in->status = VIRTIO_BLK_S_UNSUPP;
> +        vub_req_complete(req);
> +        break;
> +    }
> +
> +    return 0;
> +
> +err:
> +    free(elem);
> +    g_free(req);
> +    return -1;
> +}
> +
> +
> +static void vub_process_vq(VuDev *vu_dev, int idx)
> +{
> +    VuClient *client;
> +    VuVirtq *vq;
> +    int ret;
> +
> +    client = container_of(vu_dev, VuClient, parent);
> +    assert(client);
> +
> +    vq = vu_get_queue(vu_dev, idx);
> +    assert(vq);
> +
> +    while (1) {
> +        ret = vub_virtio_process_req(client, vq);
> +        if (ret) {
> +            break;
> +        }
> +    }
> +}
> +
> +
> +static void vub_queue_set_started(VuDev *vu_dev, int idx, bool started)
> +{
> +    VuVirtq *vq;
> +
> +    assert(vu_dev);
> +
> +    vq = vu_get_queue(vu_dev, idx);
> +    vu_set_queue_handler(vu_dev, vq, started ? vub_process_vq : NULL);
> +}
> +
> +static uint64_t
> +vub_get_features(VuDev *dev)
> +{
> +    uint64_t features;
> +    VubDev *vdev_blk;
> +
> +    VuClient *client = container_of(dev, VuClient, parent);
> +    vdev_blk = client->blk;
> +
> +    features = 1ull << VIRTIO_BLK_F_SIZE_MAX |
> +               1ull << VIRTIO_BLK_F_SEG_MAX |
> +               1ull << VIRTIO_BLK_F_TOPOLOGY |
> +               1ull << VIRTIO_BLK_F_BLK_SIZE |
> +               1ull << VIRTIO_BLK_F_FLUSH |
> +               #if defined(__linux__) && defined(BLKDISCARD) && defined(BLKZEROOUT)
> +               1ull << VIRTIO_BLK_F_DISCARD |
> +               1ull << VIRTIO_BLK_F_WRITE_ZEROES |
> +               #endif
> +               1ull << VIRTIO_BLK_F_CONFIG_WCE |
> +               1ull << VIRTIO_F_VERSION_1 |
> +               1ull << VIRTIO_RING_F_INDIRECT_DESC |
> +               1ull << VIRTIO_RING_F_EVENT_IDX |
> +               1ull << VHOST_USER_F_PROTOCOL_FEATURES;
> +
> +    if (!vdev_blk->writable) {
> +        features |= 1ull << VIRTIO_BLK_F_RO;
> +    }
> +
> +    return features;
> +}
> +
> +static uint64_t
> +vub_get_protocol_features(VuDev *dev)
> +{
> +    return 1ull << VHOST_USER_PROTOCOL_F_CONFIG |
> +           1ull << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD;
> +}
> +
> +static int
> +vub_get_config(VuDev *vu_dev, uint8_t *config, uint32_t len)
> +{
> +    VubDev *vdev_blk;
> +
> +    VuClient *client = container_of(vu_dev, VuClient, parent);
> +    vdev_blk = client->blk;
> +    memcpy(config, &vdev_blk->blkcfg, len);
> +
> +    return 0;
> +}
> +
> +static int
> +vub_set_config(VuDev *vu_dev, const uint8_t *data,
> +               uint32_t offset, uint32_t size, uint32_t flags)
> +{
> +    VubDev *vdev_blk;
> +
> +    VuClient *client = container_of(vu_dev, VuClient, parent);
> +    vdev_blk = client->blk;
> +    uint8_t wce;
> +
> +    /* don't support live migration */
> +    if (flags != VHOST_SET_CONFIG_TYPE_MASTER) {
> +        return -1;
> +    }
> +
> +
> +    if (offset != offsetof(struct virtio_blk_config, wce) ||
> +        size != 1) {
> +        return -1;
> +    }
> +
> +    wce = *data;
> +    if (wce == vdev_blk->blkcfg.wce) {
> +        /* Do nothing as same with old configuration */
> +        return 0;
> +    }
> +
> +    vdev_blk->blkcfg.wce = wce;
> +    blk_set_enable_write_cache(vdev_blk->backend, true);
> +    return 0;
> +}
> +
> +
> +/*
> + * When the client disconnects, it sends a VHOST_USER_NONE request
> + * and vu_process_message will simply call exit, which causes the VM
> + * to exit abruptly.
> + * To avoid this issue, process the VHOST_USER_NONE request ahead
> + * of vu_process_message.
> + *
> + */
> +static int vub_process_msg(VuDev *dev, VhostUserMsg *vmsg, int *do_reply)
> +{
> +    if (vmsg->request == VHOST_USER_NONE) {
> +        dev->panic(dev, "disconnect");
> +        return true;
> +    }
> +    return false;
> +}
> +
> +static void
> +vmsg_close_fds(VhostUserMsg *vmsg)
> +{
> +    int i;
> +    for (i = 0; i < vmsg->fd_num; i++) {
> +        close(vmsg->fds[i]);
> +    }
> +}
> +
> +static bool
> +vu_message_read_co(VuDev *vu_dev, int conn_fd, VhostUserMsg *vmsg)

Why is this function called *_co()?  It appears to be callable from
outside coroutine context too.

> +{
> +    char control[CMSG_SPACE(VHOST_MEMORY_MAX_NREGIONS * sizeof(int))] = { };
> +    struct iovec iov = {
> +        .iov_base = (char *)vmsg,
> +        .iov_len = VHOST_USER_HDR_SIZE,
> +    };
> +    struct msghdr msg = {
> +        .msg_iov = &iov,
> +        .msg_iovlen = 1,
> +        .msg_control = control,
> +        .msg_controllen = sizeof(control),
> +    };
> +    size_t fd_size;
> +    struct cmsghdr *cmsg;
> +    int rc;
> +    char buffer[100];
> +    VuClient *client = container_of(vu_dev, VuClient, parent);
> +    QIOChannel *ioc = client->ioc;
> +    do {
> +        rc = recvmsg(conn_fd, &msg, 0);
> +        if (rc < 0) {
> +            if (errno == EAGAIN) {
> +                if (qemu_in_coroutine()) {
> +                    qio_channel_yield(ioc, G_IO_IN);
> +                } else {
> +                    qio_channel_wait(ioc, G_IO_IN);
> +                }
> +                continue;
> +            } else if (errno == EINTR) {
> +                continue;
> +            }
> +        }
> +        break;
> +    } while (true);
> +
> +    if (rc < 0) {
> +        sprintf(buffer, "Error while recvmsg: %s", strerror(errno));
> +        vub_panic_cb(vu_dev, buffer);
> +        return false;
> +    }
> +
> +    assert(rc == VHOST_USER_HDR_SIZE || rc == 0);
> +
> +    vmsg->fd_num = 0;
> +    for (cmsg = CMSG_FIRSTHDR(&msg);
> +         cmsg != NULL;
> +         cmsg = CMSG_NXTHDR(&msg, cmsg))
> +    {
> +        if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS) {
> +            fd_size = cmsg->cmsg_len - CMSG_LEN(0);
> +            vmsg->fd_num = fd_size / sizeof(int);
> +            memcpy(vmsg->fds, CMSG_DATA(cmsg), fd_size);
> +            break;
> +        }
> +    }
> +
> +    if (vmsg->size > sizeof(vmsg->payload)) {
> +        sprintf(buffer,
> +                "Error: too big message request: %d, size: vmsg->size: %u, "
> +                "while sizeof(vmsg->payload) = %zu\n",
> +                vmsg->request, vmsg->size, sizeof(vmsg->payload));
> +        vub_panic_cb(vu_dev, buffer);
> +        goto fail;
> +    }
> +
> +    if (vmsg->size) {
> +        do {
> +            rc = read(conn_fd, &vmsg->payload, vmsg->size);
> +            if (rc < 0) {
> +                if (errno == EAGAIN) {
> +                    if (qemu_in_coroutine()) {
> +                        qio_channel_yield(ioc, G_IO_IN);
> +                    } else {
> +                        qio_channel_wait(ioc, G_IO_IN);
> +                    }
> +                    continue;
> +                } else if (errno == EINTR) {
> +                    continue;
> +                }
> +            }
> +            break;
> +        } while (true);
> +
> +        if (rc <= 0) {
> +            sprintf(buffer, "Error while reading: %s", strerror(errno));
> +            vub_panic_cb(vu_dev, buffer);
> +            goto fail;
> +        }
> +
> +        assert(rc == vmsg->size);
> +    }
> +
> +    return true;
> +
> +fail:
> +    vmsg_close_fds(vmsg);
> +
> +    return false;
> +}
> +
> +static void vub_kick_cb(void *opaque)
> +{
> +    vu_watch_cb_data *data = (vu_watch_cb_data *) opaque;
> +    int index = data->index;
> +    VuDev *dev = data->vu_dev;
> +    VuVirtq *vq = &dev->vq[index];
> +    int sock = vq->kick_fd;
> +    eventfd_t kick_data;
> +    ssize_t rc;
> +
> +    rc = eventfd_read(sock, &kick_data);
> +    if (rc == -1) {
> +        char buffer[100];
> +        sprintf(buffer, "kick eventfd_read(): %s", strerror(errno));
> +        vub_panic_cb(dev, buffer);
> +        g_free(data);
> +        dev->remove_watch(dev, dev->vq[index].kick_fd);
> +    } else {
> +        if (vq->handler) {
> +            vq->handler(dev, index);
> +        }
> +    }
> +}

This is a generic vhost-user function that is not specific to
vhost-user-blk.  It should be in a different source file.

> +
> +static const VuDevIface vub_iface = {
> +    .get_features = vub_get_features,
> +    .queue_set_started = vub_queue_set_started,
> +    .get_protocol_features = vub_get_protocol_features,
> +    .get_config = vub_get_config,
> +    .set_config = vub_set_config,
> +    .process_msg = vub_process_msg,
> +    .read_msg = vu_message_read_co,
> +    .kick_callback = vub_kick_cb,
> +};
> +
> +
> +void vub_free(VubDev *vub_dev, bool called_by_QOM)
> +{
> +    if (!vub_dev) {
> +        return;
> +    }
> +
> +    blk_unref(vub_dev->backend);
> +    g_free(vub_dev->name);
> +    g_free(vub_dev->unix_socket);
> +
> +    if (vub_dev->next.tqe_circ.tql_prev) {
> +        /*
> +         * if vub_dev->next.tqe_circ.tql_prev = null,
> +         * vub_dev hasn't been inserted into the queue and
> +         * vub_free is called by obj->instance_finalize.
> +         */
> +        QTAILQ_REMOVE(&vub_devs, vub_dev, next);
> +    }
> +    /*
> +     * Needn't to free vub_dev if called by QOM
> +     * because QOM will do the clean-up work.
> +     */
> +    if (!called_by_QOM) {
> +        g_free(vub_dev);
> +    }
> +}
> +
> +static coroutine_fn void vu_client_trip(void *opaque)
> +{
> +    VuClient *client = opaque;
> +
> +    while (!client->closed) {
> +        vu_dispatch(&client->parent);
> +    }
> +
> +    QTAILQ_REMOVE(&client->blk->clients, client, next);
> +
> +}
> +
> +static void vu_client_start(VuClient *client)
> +{
> +    Coroutine *co = qemu_coroutine_create(vu_client_trip, client);
> +    qemu_coroutine_enter(co);
> +}

What is the coroutine and threading model here?  This launches
vu_client_trip() in the current thread's event loop but vu_dispatch()
performs blocking I/O and will prevent the event loop from continuing.

> +
> +
> +G_STATIC_ASSERT((int)G_IO_IN == (int)VU_WATCH_IN);
> +G_STATIC_ASSERT((int)G_IO_OUT == (int)VU_WATCH_OUT);
> +G_STATIC_ASSERT((int)G_IO_PRI == (int)VU_WATCH_PRI);
> +G_STATIC_ASSERT((int)G_IO_ERR == (int)VU_WATCH_ERR);
> +G_STATIC_ASSERT((int)G_IO_HUP == (int)VU_WATCH_HUP);
> +
> +static void
> +set_watch(VuDev *vu_dev, int fd, int vu_evt,
> +          vu_watch_cb_packed_data cb, void *pvt)
> +{
> +    /*
> +     * since aio_dispatch can only pass one user data pointer to the
> +     * callback function, pack VuDev, pvt into a struct
> +     */
> +    VuClient *client;
> +
> +    g_assert(vu_dev);
> +    g_assert(fd >= 0);
> +    g_assert(cb);
> +    client = container_of(vu_dev, VuClient, parent);
> +    vu_watch_cb_data *cb_data = g_new0(vu_watch_cb_data, 1);
> +    cb_data->index = (intptr_t) pvt;
> +    cb_data->vu_dev = vu_dev;
> +    aio_set_fd_handler(client->blk->ctx, fd, false, (void *) cb,
> +                       NULL, NULL, cb_data);
> +}
> +
> +
> +void vub_accept(QIONetListener *listener, QIOChannelSocket *sioc,
> +                gpointer opaque)
> +{
> +    VuClient *client;
> +    VubDev *vub_device = opaque;
> +    client = g_new0(VuClient, 1);

Is it helpful to have a separate VuClient struct even though only a
single vhost-user client can be connected at a time?  It may be simpler
to keep connection state directly in VubDev.

> +
> +    if (!vu_init_packed_data(&client->parent, VHOST_USER_BLK_MAX_QUEUES,
> +                             sioc->fd, vub_panic_cb, set_watch,
> +                             remove_watch, &vub_iface)) {
> +        fprintf(stderr, "Failed to initialized libvhost-user\n");
> +        g_free(client);
> +        return;
> +    }
> +
> +    client->blk = vub_device;
> +    client->refcount = 1;
> +    client->sioc = sioc;
> +    /*
> +     * increase the object reference, so cioc will not freed by
> +     * qio_net_listener_channel_func which will call object_unref(OBJECT(sioc))
> +     */
> +    object_ref(OBJECT(client->sioc));
> +    qio_channel_set_name(QIO_CHANNEL(sioc), "vhost-user client");
> +    client->ioc = QIO_CHANNEL(sioc);
> +    object_ref(OBJECT(client->ioc));
> +    object_ref(OBJECT(sioc));
> +
> +    qio_channel_set_blocking(QIO_CHANNEL(client->sioc), false, NULL);
> +    client->closed = false;
> +    QTAILQ_INSERT_TAIL(&client->blk->clients, client, next);
> +    vu_client_start(client);
> +}
> +
> +
> +void
> +vub_initialize_config(BlockDriverState *bs, struct virtio_blk_config *config)
> +{
> +    config->capacity = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
> +    config->blk_size = BDRV_SECTOR_SIZE;

At some point this should become a configuration parameter because using
512B sectors on a native 4K sector image file results in poor
performance.

> +    config->size_max = 65536;

Why 65536?  hw/block/virtio-blk.c sets it to 0.

> +    config->seg_max = 128 - 2;

This is okay for now but should be changed in the future.

hw/block/virtio-blk.c was recently modified to calculate seg_max based
on the virtqueue size (which is where the hardcoded 128 originally came
from).  Applications that use large buffer sizes (128+ KB) benefit from
larger virtqueue sizes and seg_max.

> +    config->min_io_size = 1;
> +    config->opt_io_size = 1;
> +    config->num_queues = 1;
> +    #if defined(__linux__) && defined(BLKDISCARD) && defined(BLKZEROOUT)
> +    config->max_discard_sectors = 32768;
> +    config->max_discard_seg = 1;
> +    config->discard_sector_alignment = config->blk_size >> 9;
> +    config->max_write_zeroes_sectors = 32768;
> +    config->max_write_zeroes_seg = 1;
> +    #endif
> +}
> +
> +
> +static VubDev *vub_new(VubDev *vub_device, const char *name,
> +                       const char *unix_socket, bool writable, Error **errp)

This function does not allocate and return a new VubDev, so vub_new() is
not an accurate name for this function.  The name vub_init() is clearer.

Why are name, unix_socket, and writable passed in?  They are already
vub_device fields.

> +{
> +
> +    BlockBackend *blk;
> +
> +    /*
> +     * Don't allow resize while the vhost user server is running,
> +     * otherwise we don't care what happens with the node.
> +     */
> +    uint64_t perm = BLK_PERM_CONSISTENT_READ;
> +    int ret;
> +
> +    AioContext *ctx;
> +
> +    BlockDriverState *bs = bdrv_lookup_bs(name,
> +                                          name,
> +                                          errp);
> +
> +    if (!bs) {
> +        error_setg(errp,
> +                   "No drive with name '%s'."
> +                   " Please find the list of names with "
> +                   "'info block'", name);
> +        return NULL;
> +    }
> +
> +    if (bdrv_is_read_only(bs)) {
> +        writable = false;
> +    }
> +
> +    if (writable) {
> +        perm |= BLK_PERM_WRITE;
> +    }
> +
> +    ctx = bdrv_get_aio_context(bs);
> +    aio_context_acquire(ctx);
> +    bdrv_invalidate_cache(bs, NULL);
> +    aio_context_release(ctx);
> +
> +    blk = blk_new(bdrv_get_aio_context(bs), perm,
> +                  BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE_UNCHANGED |
> +                  BLK_PERM_WRITE | BLK_PERM_GRAPH_MOD);
> +    ret = blk_insert_bs(blk, bs, errp);
> +
> +    if (ret < 0) {
> +        goto fail;
> +    }
> +
> +
> +    blk_set_enable_write_cache(blk, false);
> +
> +    blk_set_allow_aio_context_change(blk, true);
> +
> +
> +    vub_device->name = g_strdup(name);
> +    vub_device->unix_socket = g_strdup(unix_socket);
> +    vub_device->writable = writable;
> +    vub_device->blkcfg.wce = 0;
> +    vub_device->backend = blk;
> +    vub_device->ctx = ctx;
> +    vub_initialize_config(bs, &vub_device->blkcfg);
> +    return vub_device;
> +
> +fail:
> +    blk_unref(blk);
> +    return NULL;
> +}
> +
> +void vhost_user_server_free(VubDev *vub_device, bool called_by_QOM)
> +{
> +    if (!vub_device) {
> +        return;
> +    }
> +
> +    VuClient *client, *next;
> +    QTAILQ_FOREACH_SAFE(client, &vub_device->clients, next, next) {
> +        if (!client->closed) {
> +            close_client(client);
> +        }
> +    }
> +
> +    if (vub_device->listener) {
> +        qio_net_listener_disconnect(vub_device->listener);
> +        object_unref(OBJECT(vub_device->listener));
> +    }
> +    vub_free(vub_device, called_by_QOM);
> +
> +}
> +
> +
> +VubDev *vub_dev_find(const char *name)
> +{
> +    VubDev *vub_device;
> +    QTAILQ_FOREACH(vub_device, &vub_devs, next) {
> +        if (strcmp(name, vub_device->name) == 0) {
> +            return vub_device;
> +        }
> +    }
> +
> +    return NULL;
> +}
> +
> +
> +static VubDev *vub_dev_find_by_unix_socket(const char *unix_socket)
> +{
> +    VubDev *vub_device;
> +    QTAILQ_FOREACH(vub_device, &vub_devs, next) {
> +        if (strcmp(unix_socket, vub_device->unix_socket) == 0) {
> +            return vub_device;
> +        }
> +    }
> +
> +    return NULL;
> +}
> +
> +static void vhost_user_server_start(VubDev *vub_device, const char *unix_socket,
> +                                    const char *name, bool writable,
> +                                    Error **errp)
> +{
> +
> +    if (vub_dev_find(name) || vub_dev_find_by_unix_socket(unix_socket)) {
> +        error_setg(errp, "Vhost user server with name '%s' or "
> +                "with socket_path '%s' has already been started",
> +                name, unix_socket);
> +        return;
> +    }
> +
> +
> +    if (!vub_new(vub_device, name, unix_socket, writable, errp)) {
> +        return;
> +    }
> +
> +
> +    vub_device->listener = qio_net_listener_new();
> +
> +    qio_net_listener_set_name(vub_device->listener,
> +                              "vhost-user-backend-listener");
> +
> +    SocketAddress *addr = g_new0(SocketAddress, 1);

addr is never freed.  This variable could live on the stack instead:

  SocketAddress addr = {};

> +    addr->u.q_unix.path = (char *) unix_socket;
> +    addr->type = SOCKET_ADDRESS_TYPE_UNIX;
> +    if (qio_net_listener_open_sync(vub_device->listener, addr, 1, errp) < 0) {
> +        goto error;
> +    }
> +
> +
> +    QTAILQ_INSERT_TAIL(&vub_devs, vub_device, next);
> +    QTAILQ_INIT(&vub_device->clients);
> +
> +    qio_net_listener_set_client_func(vub_device->listener,
> +                                     vub_accept,
> +                                     vub_device,
> +                                     NULL);
> +
> +    return;
> +
> + error:
> +    vub_free(vub_device, false);
> +}
> +
> +static void vu_set_block_name(Object *obj, const char *value,
> +                                           Error **errp)
> +{
> +    VubDev *vus = VHOST_USER_SERVER(obj);;
> +
> +    if (vus->name) {
> +        error_setg(errp, "evdev property already set");
> +        return;
> +    }
> +
> +    vus->name = g_strdup(value);
> +}
> +
> +static char *vu_get_block_name(Object *obj, Error **errp)
> +{
> +    VubDev *vus = VHOST_USER_SERVER(obj);
> +    return g_strdup(vus->name);
> +}
> +
> +
> +static void vu_set_unix_socket(Object *obj, const char *value,
> +                                            Error **errp)
> +{
> +    VubDev *vus = VHOST_USER_SERVER(obj);;
> +
> +    if (vus->unix_socket) {
> +        error_setg(errp, "unix_socket property already set");
> +        return;
> +    }
> +
> +    vus->unix_socket = g_strdup(value);
> +    vhost_user_server_start(vus, value, vus->name,
> +                            vus->writable, errp);

Property setters should only perform input validation and store the
data.  Actions like creating network connections, opening files, etc
should happen later in a UserCreatableClass->complete() callback.

This is necessary because vus->writable is also a property and may be
set after unix_socket.  The ->complete() callback is called after all
setters so it can access the final values of all properties.

See iothread_class_init() and iothread_complete() for an example.

> +}
> +
> +static char *vu_get_unix_socket(Object *obj, Error **errp)
> +{
> +    VubDev *vus = VHOST_USER_SERVER(obj);;
> +    return g_strdup(vus->unix_socket);
> +}
> +
> +static bool vu_get_block_writable(Object *obj, Error **errp)
> +{
> +    VubDev *vus = VHOST_USER_SERVER(obj);;
> +    return vus->writable;
> +}
> +
> +static void vu_set_block_writable(Object *obj, bool value, Error **errp)
> +{
> +    VubDev *vus = VHOST_USER_SERVER(obj);
> +
> +    vus->writable = value;
> +}
> +
> +static void vhost_user_server_instance_init(Object *obj)
> +{
> +
> +    object_property_add_bool(obj, "writable",
> +                            vu_get_block_writable,
> +                            vu_set_block_writable, NULL);
> +
> +    object_property_add_str(obj, "name",
> +                            vu_get_block_name,
> +                            vu_set_block_name, NULL);
> +
> +    object_property_add_str(obj, "unix_socket",
> +                            vu_get_unix_socket,
> +                            vu_set_unix_socket, NULL);
> +
> +}
> +
> +static void vhost_user_server_instance_finalize(Object *obj)
> +{
> +    VubDev *vus = VHOST_USER_SERVER(obj);
> +    vhost_user_server_free(vus, true);
> +    /* object_del shouldn't free this object struct */
> +    obj->free = NULL;
> +}
> +
> +static const TypeInfo vhost_user_server_info = {
> +    .name = TYPE_VHOST_USER_SERVER,
> +    .parent = TYPE_OBJECT,
> +    .instance_size = sizeof(VuDev),
> +    .instance_init = vhost_user_server_instance_init,
> +    .instance_finalize = vhost_user_server_instance_finalize,
> +    .interfaces = (InterfaceInfo[]) {
> +        {TYPE_USER_CREATABLE},
> +        {}
> +    },
> +};
> +
> +static void vhost_user_server_register_types(void)
> +{
> +    type_register_static(&vhost_user_server_info);
> +}
> +
> +type_init(vhost_user_server_register_types)
> +
> diff --git a/include/block/vhost-user.h b/include/block/vhost-user.h
> new file mode 100644
> index 0000000000..ef6d695244
> --- /dev/null
> +++ b/include/block/vhost-user.h
> @@ -0,0 +1,46 @@
> +#include "io/channel-socket.h"
> +#include "io/net-listener.h"
> +#include "contrib/libvhost-user/libvhost-user.h"
> +#include "standard-headers/linux/virtio_blk.h"
> +typedef struct VubDev VubDev;
> +typedef struct VuClient VuClient;
> +#define TYPE_VHOST_USER_SERVER "vhost-user-server"
> +
> +#define VHOST_USER_SERVER(obj) \
> +   OBJECT_CHECK(VubDev, obj, TYPE_VHOST_USER_SERVER)
> +/* vhost user block device */
> +struct VubDev {
> +    Object parent_obj;
> +    char *name;
> +    char *unix_socket;
> +    bool exit_panic;
> +    bool close;
> +    BlockBackend *backend;
> +    AioContext *ctx;
> +    QIONetListener *listener;
> +    QIOChannelSocket *sioc;
> +    QTAILQ_HEAD(, VuClient) clients;
> +    QTAILQ_ENTRY(VubDev) next;
> +    struct virtio_blk_config blkcfg;
> +    bool writable;
> +};
> +
> +struct VuClient {
> +    VuDev parent;
> +    int refcount;
> +    VubDev *blk;
> +    QIOChannelSocket *sioc; /* The underlying data channel */
> +    QIOChannel *ioc; /* The current I/O channel */
> +    QTAILQ_ENTRY(VuClient) next;
> +    bool closed;
> +};
> +VubDev *vub_dev_find(const char *name);

All -object instances already have an id=ID property.  There is no need
to declare a separate "name" property.  Please look at iothread_by_id()
and iothread_get_id() for examples.

> +
> +void vhost_user_server_free(VubDev *vub_device, bool called_by_QOM);
> +void vub_accept(QIONetListener *listener, QIOChannelSocket *sioc,
> +                gpointer opaque);
> +
> +void vub_free(VubDev *vub_dev, bool called_by_QOM);
> +
> +void vub_initialize_config(BlockDriverState *bs,
> +                           struct virtio_blk_config *config);
> diff --git a/vl.c b/vl.c
> index 86474a55c9..72ac506342 100644
> --- a/vl.c
> +++ b/vl.c
> @@ -2553,6 +2553,10 @@ static bool object_create_initial(const char *type, QemuOpts *opts)
>      }
>  #endif
> 
> +    /* Reason: vhost-user-server property "name" */
> +    if (g_str_equal(type, "vhost-user-server")) {
> +        return false;
> +    }

I don't understand why the "name" property introduces a creation order
dependency.  It's just a string and has no dependency on other
command-line objects.  Can you explain why this change is necessary?

>      /*
>       * Reason: filter-* property "netdev" etc.
>       */
> --
> 2.24.1
> 

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH v2 1/5] vhost-user block device backend
  2020-01-14 14:06 ` [PATCH v2 1/5] vhost-user block device backend Coiby Xu
  2020-01-16 13:51   ` Stefan Hajnoczi
@ 2020-01-16 13:56   ` Kevin Wolf
  2020-02-20  7:04     ` Coiby Xu
  2020-02-20  9:30     ` Coiby Xu
  1 sibling, 2 replies; 19+ messages in thread
From: Kevin Wolf @ 2020-01-16 13:56 UTC (permalink / raw)
  To: Coiby Xu; +Cc: bharatlkmlkvm, qemu-devel, stefanha

Hi,

I'm only doing a quick first review pointing out the more obvious
things while I familiarise myself with your code. I intend to review it
in more detail later (either in a second pass for this series, or when
you post v3).

Am 14.01.2020 um 15:06 hat Coiby Xu geschrieben:
> By making use of libvhost, multiple block device drives can be exported and each drive can serve multiple clients simultaneously. Since vhost-user-server needs a block drive to be created first, delay the creation of this object.
> 
> Signed-off-by: Coiby Xu <coiby.xu@gmail.com>

Please wrap the commit message at 72 characters.

>  blockdev-vu.c              | 1008 ++++++++++++++++++++++++++++++++++++
>  include/block/vhost-user.h |   46 ++
>  vl.c                       |    4 +
>  3 files changed, 1058 insertions(+)
>  create mode 100644 blockdev-vu.c
>  create mode 100644 include/block/vhost-user.h

This adds a single, relatively big source file. I see at least two
parts: The generic vhost-user infrastructure with connection handling
etc. and the implementation of the specific vhost-user-blk device.
Separating these into two files is probably a good idea.

I would also suggest putting the files in a new subdirectory
block/export/ and call them vhost-user.c/vhost-user-blk.c. The new
header file can be in the same directory as it shouldn't be used by
anyone else.

> diff --git a/blockdev-vu.c b/blockdev-vu.c
> new file mode 100644
> index 0000000000..45f0bb43a7
> --- /dev/null
> +++ b/blockdev-vu.c
> @@ -0,0 +1,1008 @@

The LICENSE file clarifies that files without a license header are
GPLv2+, so it's not strictly a problem, but I think it is good style to
include a license header that explicitly tells so.

> +#include "qemu/osdep.h"
> +#include "block/vhost-user.h"
> +#include "qapi/error.h"
> +#include "qapi/qapi-types-sockets.h"
> +#include "qapi/qapi-commands-block.h"
> +
> +#include "sysemu/block-backend.h"
> +#include "qemu/main-loop.h"
> +
> +#include "qemu/units.h"
> +
> +#include "block/block.h"
> +
> +#include "qom/object_interfaces.h"
> +
> +#include <sys/eventfd.h>
> +
> +#include "hw/qdev-properties.h"

Does the order of includes and the empty lines between them signify
anything? If not, I suggest just sorting them alphabetically (and maybe
using empty lines between different subdirectories if you like this
better than a single large block).

According to CODING_STYLE.rst, system headers like <sys/eventfd.h> come
before all QEMU headers (except qemu/osdep.h, which always must come
first).

> +enum {
> +    VHOST_USER_BLK_MAX_QUEUES = 8,
> +};
> +
> +struct virtio_blk_inhdr {
> +    unsigned char status;
> +};
> +
> +
> +static QTAILQ_HEAD(, VubDev) vub_devs = QTAILQ_HEAD_INITIALIZER(vub_devs);
> +
> +
> +typedef struct VubReq {
> +    VuVirtqElement *elem;

Maybe worth a comment that this was allocated with plain malloc(), so
you must use free() rather than g_free() (which would be the default in
QEMU)?

> +    int64_t sector_num;
> +    size_t size;
> +    struct virtio_blk_inhdr *in;
> +    struct virtio_blk_outhdr out;
> +    VuClient *client;
> +    struct VuVirtq *vq;
> +} VubReq;

I'm not completely sure yet, but I think I would prefer VuBlock to Vub
in the type names. Some may even prefer VhostUserBlock, but I can see
that this would be quite lengthy.

> +static void
> +remove_watch(VuDev *vu_dev, int fd)
> +{
> +    VuClient *client;
> +
> +    g_assert(vu_dev);
> +    g_assert(fd >= 0);
> +
> +    client = container_of(vu_dev, VuClient, parent);
> +    aio_set_fd_handler(client->blk->ctx, fd, false, NULL, NULL, NULL, NULL);
> +}
> +
> +static void close_client(VuClient *client)
> +{
> +    vu_deinit(&client->parent);
> +    /** g_source_destroy(vub_device->parent.src); */

Leftover from conversion?

> +    client->sioc = NULL;
> +    object_unref(OBJECT(client->ioc));
> +    client->closed = true;
> +
> +}
> +
> +static void vub_panic_cb(VuDev *vu_dev, const char *buf)

You use a lot of sprintf() before calling this function. Would it be
worth taking a printf-like format parameter instead of buf and using a
variable argument list?

> +{
> +    if (buf) {
> +        g_warning("vu_panic: %s", buf);

I think QEMU proper doesn't use g_warning() anywhere. This could be
error_report() or warn_report(). (Or, if you use a format string,
error_vreport() and warn_vreport().)

> +    }
> +
> +    VuClient *client = container_of(vu_dev, VuClient, parent);
> +    if (client->blk->exit_panic) {
> +        client->blk->close = true;
> +    }
> +    if (!client->closed) {
> +        close_client(client);
> +    }
> +}
> +
> +
> +static void vub_req_complete(VubReq *req)
> +{
> +    VuDev *vu_dev = &req->client->parent;
> +
> +    /* IO size with 1 extra status byte */
> +    vu_queue_push(vu_dev, req->vq, req->elem,
> +                  req->size + 1);

I think this fits in a single line.

> +    vu_queue_notify(vu_dev, req->vq);
> +
> +    if (req->elem) {
> +        free(req->elem);
> +    }
> +
> +    g_free(req);
> +}
> +
> +
> +
> +static int
> +vub_discard_write_zeroes(VubReq *req, struct iovec *iov, uint32_t iovcnt,
> +                         uint32_t type)
> +{
> +    struct virtio_blk_discard_write_zeroes *desc;
> +    ssize_t size;
> +    void *buf;
> +
> +    size = iov_size(iov, iovcnt);
> +    if (size != sizeof(*desc)) {
> +        fprintf(stderr, "Invalid size %ld, expect %ld\n", size, sizeof(*desc));
> +        return -1;

This would be error_report(), too. (More cases below, I'll ignore them
now.)

I would prefer consistent use of -errno instead of -1 for error cases if
you don't mind. I guess this would be -EINVAL here. I won't mention it
for all the other cases; if you want to make the change, you need to
make it everywhere, obviously.

> +    }
> +    buf = g_new0(char, size);
> +
> +    iov_to_buf_full(iov, iovcnt, 0, buf, size);

I think uint8_t describes better than char what we want here: A buffer
of bytes.

The empty line would make more sense to me above the g_new0() line than
after it because it starts a new section that deals with the buffer. In
general, the use of empty lines feels a bit inconsistent in this patch.
You may want to go over them again.

> +
> +    #if defined(__linux__) && defined(BLKDISCARD) && defined(BLKZEROOUT)

Preprocessor directives should be unindented.

However, I don't think any of this code actually depends on Linux,
BLKDISCARD or BLKZEROOUT. You can just call blk_pdiscard() and
blk_pwrite_zeroes() and they will do whatever is necessary to perform
the operation on the backend (which might not be a Linux block device,
but could be a regular file or even using a network protocol like NBD).

> +    VubDev *vdev_blk;
> +    VuClient *client = container_of(dev, VuClient, parent);
> +    vdev_blk = client->blk;
> +    desc = (struct virtio_blk_discard_write_zeroes *)buf;
> +    uint64_t range[2] = { le64toh(desc->sector) << 9,
> +                          le32toh(desc->num_sectors) << 9 };
> +    if (type == VIRTIO_BLK_T_DISCARD) {
> +        if (blk_pdiscard(vdev_blk->blk, range[0], range[1]) == 0) {
> +            g_free(buf);
> +            return 0;
> +        }
> +    } else if (type == VIRTIO_BLK_T_WRITE_ZEROES) {
> +        if (blk_pwrite_zeroes(vdev_blk->blk, range[0], range[1]) == 0) {
> +            g_free(buf);
> +            return 0;
> +        }

blk_pdiscard() and blk_pwrite_zeroes() are synchronous functions. In
other words, the guest will be blocked until the I/O is complete. We
cannot do this.

I think you should let vub_virtio_process_req() run in a coroutine so
that you can call blk_co_pdiscard() and blk_co_pwrite_zeroes() here.

> +    }
> +    #endif
> +
> +    g_free(buf);
> +    return -1;
> +}
> +
> +
> +static void
> +vub_flush(VubReq *req)
> +{
> +    VuClient *client = req->client;
> +    blk_co_flush(client->blk->backend);

You can't call blk_co_flush() from outside coroutine context. This code
will be right after you move vub_virtio_process_req() to a coroutine,
though (which will make this function a coroutine_fn).

> +}
> +
> +
> +#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
> +typedef struct BlkRwCo {
> +    BlockBackend *blk;
> +    int64_t offset;
> +    void *iobuf;
> +    int ret;
> +    BdrvRequestFlags flags;
> +} BlkRwCo;
> +
> +static void blk_read_entry(void *opaque)
> +{
> +    BlkRwCo *rwco = opaque;
> +    QEMUIOVector *qiov = rwco->iobuf;
> +
> +    rwco->ret = blk_co_preadv(rwco->blk, rwco->offset, qiov->size,
> +                              qiov, rwco->flags);
> +    aio_wait_kick();
> +}
> +
> +
> +static void blk_write_entry(void *opaque)
> +{
> +    BlkRwCo *rwco = opaque;
> +    QEMUIOVector *qiov = rwco->iobuf;
> +
> +    rwco->ret = blk_co_pwritev(rwco->blk, rwco->offset, qiov->size,
> +                              qiov, rwco->flags);
> +    aio_wait_kick();
> +}
> +
> +
> +static int blk_prw(BlockBackend *blk, QEMUIOVector *qiov, int64_t offset,
> +                   CoroutineEntry co_entry, BdrvRequestFlags flags)
> +{
> +
> +    BlkRwCo rwco = {
> +        .blk    = blk,
> +        .offset = offset,
> +        .iobuf  = qiov,
> +        .flags  = flags,
> +        .ret    = NOT_DONE,
> +    };
> +
> +    if (qemu_in_coroutine()) {
> +        /* Fast-path if already in coroutine context */
> +        co_entry(&rwco);
> +    } else {
> +        Coroutine *co = qemu_coroutine_create(co_entry, &rwco);
> +        bdrv_coroutine_enter(blk_bs(blk), co);
> +        BDRV_POLL_WHILE(blk_bs(blk), rwco.ret == NOT_DONE);
> +    }
> +
> +    return rwco.ret;
> +}

This is copy&paste from block-backend.c. We should certainly not do
this. I think it will automatically go away when you can use
blk_co_preadv() and blk_co_pwritev() directly.

Note that the BDRV_POLL_WHILE() means that like above, we would be
waiting for the request to complete. This would block the guest and
would also not allow parallel requests, killing the I/O performance of
our vhost-user export.

> +
> +static ssize_t
> +vub_rwv(VubReq *req, struct iovec *iov,
> +        uint32_t iovcnt,
> +        CoroutineEntry co_entry)

I don't understand the line wrapping here. :-)

> +{
> +    VuClient *client = req->client;
> +    ssize_t rc;
> +
> +    if (!iovcnt) {
> +        fprintf(stderr, "Invalid Read/Write IOV count\n");
> +        return -1;
> +    }
> +
> +    int64_t offset = req->sector_num * 512;
> +    QEMUIOVector *qiov = g_new0(QEMUIOVector, 1);
> +    qemu_iovec_init_external(qiov, iov, iovcnt);
> +    rc = blk_prw(client->blk->backend, qiov, offset, co_entry, 0);
> +
> +    req->size = iov_size(iov, iovcnt);

You can use qiov->size instead of duplicating this information into a
separate VubReq field.

> +    if (rc < 0) {
> +        fprintf(stderr, "%s, Sector %"PRIu64", Size %lu failed with %s\n",
> +                client->blk->name, req->sector_num, req->size,
> +                strerror(errno));
> +        return -1;
> +    }
> +
> +    return rc;
> +}
> +
> +static int vub_virtio_process_req(VuClient *client,
> +                                     VuVirtq *vq)

Indentation is off. This could be a single line anyway.

> +{
> +    VuDev *vu_dev = &client->parent;
> +    VuVirtqElement *elem;
> +    uint32_t type;
> +    VubReq *req;
> +
> +    elem = vu_queue_pop(vu_dev, vq, sizeof(VuVirtqElement) + sizeof(VubReq));
> +    if (!elem) {
> +        return -1;
> +    }
> +
> +    struct iovec *in_iov = elem->in_sg;
> +    struct iovec *out_iov = elem->out_sg;
> +    unsigned in_num = elem->in_num;
> +    unsigned out_num = elem->out_num;
> +    /* refer to hw/block/virtio_blk.c */
> +    if (elem->out_num < 1 || elem->in_num < 1) {
> +        fprintf(stderr, "virtio-blk request missing headers\n");
> +        free(elem);
> +        return -1;
> +    }
> +
> +    req = g_new0(VubReq, 1);
> +    req->client = client;
> +    req->vq = vq;
> +    req->elem = elem;
> +
> +    if (unlikely(iov_to_buf(out_iov, out_num, 0, &req->out,
> +                            sizeof(req->out)) != sizeof(req->out))) {
> +        fprintf(stderr, "virtio-blk request outhdr too short");
> +        goto err;
> +    }
> +
> +    iov_discard_front(&out_iov, &out_num, sizeof(req->out));
> +
> +    if (in_iov[in_num - 1].iov_len < sizeof(struct virtio_blk_inhdr)) {
> +        fprintf(stderr, "virtio-blk request inhdr too short");
> +        goto err;
> +    }
> +
> +    /* We always touch the last byte, so just see how big in_iov is.  */
> +    req->in = (void *)in_iov[in_num - 1].iov_base
> +              + in_iov[in_num - 1].iov_len
> +              - sizeof(struct virtio_blk_inhdr);
> +    iov_discard_back(in_iov, &in_num, sizeof(struct virtio_blk_inhdr));
> +
> +
> +    type = le32toh(req->out.type);
> +    switch (type & ~VIRTIO_BLK_T_BARRIER) {
> +    case VIRTIO_BLK_T_IN:
> +    case VIRTIO_BLK_T_OUT: {
> +        ssize_t ret = 0;
> +        bool is_write = type & VIRTIO_BLK_T_OUT;
> +        req->sector_num = le64toh(req->out.sector);
> +        if (is_write) {
> +            ret = vub_rwv(req, out_iov, out_num, blk_write_entry);
> +        } else {
> +            ret = vub_rwv(req, in_iov, in_num, blk_read_entry);
> +        }
> +        if (ret >= 0) {
> +            req->in->status = VIRTIO_BLK_S_OK;
> +        } else {
> +            req->in->status = VIRTIO_BLK_S_IOERR;
> +        }
> +        vub_req_complete(req);
> +        break;
> +    }
> +    case VIRTIO_BLK_T_FLUSH:
> +        vub_flush(req);
> +        req->in->status = VIRTIO_BLK_S_OK;
> +        vub_req_complete(req);
> +        break;
> +    case VIRTIO_BLK_T_GET_ID: {
> +        size_t size = MIN(iov_size(&elem->in_sg[0], in_num),
> +                          VIRTIO_BLK_ID_BYTES);
> +        snprintf(elem->in_sg[0].iov_base, size, "%s", "vhost_user_blk");
> +        req->in->status = VIRTIO_BLK_S_OK;
> +        req->size = elem->in_sg[0].iov_len;
> +        vub_req_complete(req);
> +        break;
> +    }
> +    case VIRTIO_BLK_T_DISCARD:
> +    case VIRTIO_BLK_T_WRITE_ZEROES: {
> +        int rc;
> +        rc = vub_discard_write_zeroes(req, &elem->out_sg[1], out_num, type);
> +        if (rc == 0) {
> +            req->in->status = VIRTIO_BLK_S_OK;
> +        } else {
> +            req->in->status = VIRTIO_BLK_S_IOERR;
> +        }
> +        vub_req_complete(req);
> +        break;
> +    }
> +    default:
> +        req->in->status = VIRTIO_BLK_S_UNSUPP;
> +        vub_req_complete(req);
> +        break;
> +    }
> +
> +    return 0;
> +
> +err:
> +    free(elem);
> +    g_free(req);
> +    return -1;
> +}
> +
> +
> +static void vub_process_vq(VuDev *vu_dev, int idx)
> +{
> +    VuClient *client;
> +    VuVirtq *vq;
> +    int ret;
> +
> +    client = container_of(vu_dev, VuClient, parent);
> +    assert(client);
> +
> +    vq = vu_get_queue(vu_dev, idx);
> +    assert(vq);
> +
> +    while (1) {
> +        ret = vub_virtio_process_req(client, vq);
> +        if (ret) {
> +            break;
> +        }
> +    }
> +}

I mentioned above moving vub_virtio_process_req() into a coroutine. Just
creating a coroutine here and entering it wouldn't work, though.

The best design is probably to get requests from the virtqueue in this
function (what is currently the first half of vub_virtio_process_req())
and then spawn a coroutine per request to actually execute them (roughly
the switch in vub_virtio_process_req()).

This way you'll get parallel requests and won't have to think about
synchronising accesses to the virtqueue from multiple coroutines.

> +
> +static void vub_queue_set_started(VuDev *vu_dev, int idx, bool started)
> +{
> +    VuVirtq *vq;
> +
> +    assert(vu_dev);
> +
> +    vq = vu_get_queue(vu_dev, idx);
> +    vu_set_queue_handler(vu_dev, vq, started ? vub_process_vq : NULL);
> +}
> +
> +static uint64_t
> +vub_get_features(VuDev *dev)
> +{
> +    uint64_t features;
> +    VubDev *vdev_blk;
> +
> +    VuClient *client = container_of(dev, VuClient, parent);
> +    vdev_blk = client->blk;
> +
> +    features = 1ull << VIRTIO_BLK_F_SIZE_MAX |
> +               1ull << VIRTIO_BLK_F_SEG_MAX |
> +               1ull << VIRTIO_BLK_F_TOPOLOGY |
> +               1ull << VIRTIO_BLK_F_BLK_SIZE |
> +               1ull << VIRTIO_BLK_F_FLUSH |
> +               #if defined(__linux__) && defined(BLKDISCARD) && defined(BLKZEROOUT)
> +               1ull << VIRTIO_BLK_F_DISCARD |
> +               1ull << VIRTIO_BLK_F_WRITE_ZEROES |
> +               #endif
> +               1ull << VIRTIO_BLK_F_CONFIG_WCE |
> +               1ull << VIRTIO_F_VERSION_1 |
> +               1ull << VIRTIO_RING_F_INDIRECT_DESC |
> +               1ull << VIRTIO_RING_F_EVENT_IDX |
> +               1ull << VHOST_USER_F_PROTOCOL_FEATURES;
> +
> +    if (!vdev_blk->writable) {
> +        features |= 1ull << VIRTIO_BLK_F_RO;
> +    }
> +
> +    return features;
> +}
> +
> +static uint64_t
> +vub_get_protocol_features(VuDev *dev)
> +{
> +    return 1ull << VHOST_USER_PROTOCOL_F_CONFIG |
> +           1ull << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD;
> +}
> +
> +static int
> +vub_get_config(VuDev *vu_dev, uint8_t *config, uint32_t len)
> +{
> +    VubDev *vdev_blk;
> +
> +    VuClient *client = container_of(vu_dev, VuClient, parent);
> +    vdev_blk = client->blk;
> +    memcpy(config, &vdev_blk->blkcfg, len);
> +
> +    return 0;
> +}
> +
> +static int
> +vub_set_config(VuDev *vu_dev, const uint8_t *data,
> +               uint32_t offset, uint32_t size, uint32_t flags)
> +{
> +    VubDev *vdev_blk;
> +
> +    VuClient *client = container_of(vu_dev, VuClient, parent);
> +    vdev_blk = client->blk;
> +    uint8_t wce;
> +
> +    /* don't support live migration */
> +    if (flags != VHOST_SET_CONFIG_TYPE_MASTER) {
> +        return -1;
> +    }
> +
> +
> +    if (offset != offsetof(struct virtio_blk_config, wce) ||
> +        size != 1) {
> +        return -1;
> +    }
> +
> +    wce = *data;
> +    if (wce == vdev_blk->blkcfg.wce) {
> +        /* Do nothing as same with old configuration */
> +        return 0;
> +    }
> +
> +    vdev_blk->blkcfg.wce = wce;
> +    blk_set_enable_write_cache(vdev_blk->backend, true);
> +    return 0;
> +}
> +
> +
> +/*
> + * When the client disconnects, it send a VHOST_USER_NONE request

s/send/sends/

> + * and vu_process_message will simple call exit which cause the VM
> + * to exit abruptly.
> + * To avoid this issue,  process VHOST_USER_NONE request ahead
> + * of vu_process_message.
> + *
> + */
> +static int vub_process_msg(VuDev *dev, VhostUserMsg *vmsg, int *do_reply)
> +{
> +    if (vmsg->request == VHOST_USER_NONE) {
> +        dev->panic(dev, "disconnect");
> +        return true;
> +    }
> +    return false;
> +}
> +
> +static void
> +vmsg_close_fds(VhostUserMsg *vmsg)
> +{
> +    int i;
> +    for (i = 0; i < vmsg->fd_num; i++) {
> +        close(vmsg->fds[i]);
> +    }
> +}
> +
> +static bool
> +vu_message_read_co(VuDev *vu_dev, int conn_fd, VhostUserMsg *vmsg)
> +{
> +    char control[CMSG_SPACE(VHOST_MEMORY_MAX_NREGIONS * sizeof(int))] = { };
> +    struct iovec iov = {
> +        .iov_base = (char *)vmsg,
> +        .iov_len = VHOST_USER_HDR_SIZE,
> +    };
> +    struct msghdr msg = {
> +        .msg_iov = &iov,
> +        .msg_iovlen = 1,
> +        .msg_control = control,
> +        .msg_controllen = sizeof(control),
> +    };
> +    size_t fd_size;
> +    struct cmsghdr *cmsg;
> +    int rc;
> +    char buffer[100];
> +    VuClient *client = container_of(vu_dev, VuClient, parent);
> +    QIOChannel *ioc = client->ioc;
> +    do {
> +        rc = recvmsg(conn_fd, &msg, 0);

This should certainly use qio_channel_readv_full() rather than working
directly with a socket fd?

> +        if (rc < 0) {
> +            if (errno == EAGAIN) {
> +                if (qemu_in_coroutine()) {
> +                    qio_channel_yield(ioc, G_IO_IN);
> +                } else {
> +                    qio_channel_wait(ioc, G_IO_IN);
> +                }
> +                continue;
> +            } else if (errno == EINTR) {
> +                continue;
> +            }
> +        }
> +        break;
> +    } while (true);
> +
> +    if (rc < 0) {
> +        sprintf(buffer, "Error while recvmsg: %s", strerror(errno));
> +        vub_panic_cb(vu_dev, buffer);
> +        return false;
> +    }
> +
> +    assert(rc == VHOST_USER_HDR_SIZE || rc == 0);

Why do you think that there can't be short reads?

> +    vmsg->fd_num = 0;
> +    for (cmsg = CMSG_FIRSTHDR(&msg);
> +         cmsg != NULL;
> +         cmsg = CMSG_NXTHDR(&msg, cmsg))
> +    {
> +        if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS) {
> +            fd_size = cmsg->cmsg_len - CMSG_LEN(0);
> +            vmsg->fd_num = fd_size / sizeof(int);
> +            memcpy(vmsg->fds, CMSG_DATA(cmsg), fd_size);
> +            break;
> +        }
> +    }

I think the fd passing part becomes easier when you use the proper
qio_channel_readv_full() function. Its implementation is also a bit more
careful than yours. For example, you forgot to check fd_size against
VHOST_MEMORY_MAX_NREGIONS, allowing a buffer overflow in the memcpy(),
and you don't adjust fd flags for the new file descriptors.

> +    if (vmsg->size > sizeof(vmsg->payload)) {
> +        sprintf(buffer,
> +                "Error: too big message request: %d, size: vmsg->size: %u, "
> +                "while sizeof(vmsg->payload) = %zu\n",
> +                vmsg->request, vmsg->size, sizeof(vmsg->payload));
> +        vub_panic_cb(vu_dev, buffer);
> +        goto fail;
> +    }
> +
> +    if (vmsg->size) {
> +        do {
> +            rc = read(conn_fd, &vmsg->payload, vmsg->size);

qio_channel_readv_all_eof() already implements this whole loop and
correctly handles short reads, too.

> +            if (rc < 0) {
> +                if (errno == EAGAIN) {
> +                    if (qemu_in_coroutine()) {
> +                        qio_channel_yield(ioc, G_IO_IN);
> +                    } else {
> +                        qio_channel_wait(ioc, G_IO_IN);
> +                    }
> +                    continue;
> +                } else if (errno == EINTR) {
> +                    continue;
> +                }
> +            }
> +            break;
> +        } while (true);
> +
> +        if (rc <= 0) {
> +            sprintf(buffer, "Error while reading: %s", strerror(errno));
> +            vub_panic_cb(vu_dev, buffer);
> +            goto fail;
> +        }
> +
> +        assert(rc == vmsg->size);
> +    }
> +
> +    return true;
> +
> +fail:
> +    vmsg_close_fds(vmsg);
> +
> +    return false;
> +}
> +
> +static void vub_kick_cb(void *opaque)
> +{
> +    vu_watch_cb_data *data = (vu_watch_cb_data *) opaque;
> +    int index = data->index;
> +    VuDev *dev = data->vu_dev;
> +    VuVirtq *vq = &dev->vq[index];
> +    int sock = vq->kick_fd;
> +    eventfd_t kick_data;
> +    ssize_t rc;
> +
> +    rc = eventfd_read(sock, &kick_data);
> +    if (rc == -1) {
> +        char buffer[100];
> +        sprintf(buffer, "kick eventfd_read(): %s", strerror(errno));
> +        vub_panic_cb(dev, buffer);
> +        g_free(data);
> +        dev->remove_watch(dev, dev->vq[index].kick_fd);
> +    } else {
> +        if (vq->handler) {
> +            vq->handler(dev, index);
> +        }
> +    }
> +}
> +
> +static const VuDevIface vub_iface = {
> +    .get_features = vub_get_features,
> +    .queue_set_started = vub_queue_set_started,
> +    .get_protocol_features = vub_get_protocol_features,
> +    .get_config = vub_get_config,
> +    .set_config = vub_set_config,
> +    .process_msg = vub_process_msg,
> +    .read_msg = vu_message_read_co,
> +    .kick_callback = vub_kick_cb,
> +};

I would prefer the = signs to be aligned to the same column.

> +
> +void vub_free(VubDev *vub_dev, bool called_by_QOM)
> +{
> +    if (!vub_dev) {
> +        return;
> +    }
> +
> +    blk_unref(vub_dev->backend);
> +    g_free(vub_dev->name);
> +    g_free(vub_dev->unix_socket);
> +
> +    if (vub_dev->next.tqe_circ.tql_prev) {
> +        /*
> +         * if vub_dev->next.tqe_circ.tql_prev = null,
> +         * vub_dev hasn't been inserted into the queue and
> +         * vub_free is called by obj->instance_finalize.
> +         */
> +        QTAILQ_REMOVE(&vub_devs, vub_dev, next);
> +    }
> +    /*
> +     * Needn't to free vub_dev if called by QOM
> +     * because QOM will do the clean-up work.
> +     */
> +    if (!called_by_QOM) {
> +        g_free(vub_dev);
> +    }
> +}
> +
> +static coroutine_fn void vu_client_trip(void *opaque)
> +{
> +    VuClient *client = opaque;
> +
> +    while (!client->closed) {
> +        vu_dispatch(&client->parent);
> +    }
> +
> +    QTAILQ_REMOVE(&client->blk->clients, client, next);
> +

Extra empty line.

> +}
> +
> +static void vu_client_start(VuClient *client)
> +{
> +    Coroutine *co = qemu_coroutine_create(vu_client_trip, client);
> +    qemu_coroutine_enter(co);
> +}
> +
> +
> +G_STATIC_ASSERT((int)G_IO_IN == (int)VU_WATCH_IN);
> +G_STATIC_ASSERT((int)G_IO_OUT == (int)VU_WATCH_OUT);
> +G_STATIC_ASSERT((int)G_IO_PRI == (int)VU_WATCH_PRI);
> +G_STATIC_ASSERT((int)G_IO_ERR == (int)VU_WATCH_ERR);
> +G_STATIC_ASSERT((int)G_IO_HUP == (int)VU_WATCH_HUP);
> +
> +static void
> +set_watch(VuDev *vu_dev, int fd, int vu_evt,
> +          vu_watch_cb_packed_data cb, void *pvt)
> +{
> +    /*
> +     * since aio_dispatch can only pass one user data pointer to the
> +     * callback function, pack VuDev, pvt into a struct
> +     */
> +    VuClient *client;
> +
> +    g_assert(vu_dev);
> +    g_assert(fd >= 0);
> +    g_assert(cb);
> +    client = container_of(vu_dev, VuClient, parent);
> +    vu_watch_cb_data *cb_data = g_new0(vu_watch_cb_data, 1);
> +    cb_data->index = (intptr_t) pvt;
> +    cb_data->vu_dev = vu_dev;
> +    aio_set_fd_handler(client->blk->ctx, fd, false, (void *) cb,
> +                       NULL, NULL, cb_data);
> +}
> +
> +
> +void vub_accept(QIONetListener *listener, QIOChannelSocket *sioc,
> +                gpointer opaque)
> +{
> +    VuClient *client;
> +    VubDev *vub_device = opaque;
> +    client = g_new0(VuClient, 1);
> +
> +    if (!vu_init_packed_data(&client->parent, VHOST_USER_BLK_MAX_QUEUES,
> +                             sioc->fd, vub_panic_cb, set_watch,
> +                             remove_watch, &vub_iface)) {
> +        fprintf(stderr, "Failed to initialized libvhost-user\n");
> +        g_free(client);
> +        return;
> +    }
> +
> +    client->blk = vub_device;
> +    client->refcount = 1;
> +    client->sioc = sioc;
> +    /*
> +     * increase the object reference, so cioc will not freed by
> +     * qio_net_listener_channel_func which will call object_unref(OBJECT(sioc))
> +     */
> +    object_ref(OBJECT(client->sioc));
> +    qio_channel_set_name(QIO_CHANNEL(sioc), "vhost-user client");
> +    client->ioc = QIO_CHANNEL(sioc);
> +    object_ref(OBJECT(client->ioc));
> +    object_ref(OBJECT(sioc));
> +
> +    qio_channel_set_blocking(QIO_CHANNEL(client->sioc), false, NULL);
> +    client->closed = false;
> +    QTAILQ_INSERT_TAIL(&client->blk->clients, client, next);
> +    vu_client_start(client);
> +}
> +
> +
> +void
> +vub_initialize_config(BlockDriverState *bs, struct virtio_blk_config *config)
> +{
> +    config->capacity = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
> +    config->blk_size = BDRV_SECTOR_SIZE;
> +    config->size_max = 65536;
> +    config->seg_max = 128 - 2;
> +    config->min_io_size = 1;
> +    config->opt_io_size = 1;
> +    config->num_queues = 1;
> +    #if defined(__linux__) && defined(BLKDISCARD) && defined(BLKZEROOUT)
> +    config->max_discard_sectors = 32768;
> +    config->max_discard_seg = 1;
> +    config->discard_sector_alignment = config->blk_size >> 9;
> +    config->max_write_zeroes_sectors = 32768;
> +    config->max_write_zeroes_seg = 1;
> +    #endif
> +}
> +
> +
> +static VubDev *vub_new(VubDev *vub_device, const char *name,
> +                       const char *unix_socket, bool writable, Error **errp)
> +{
> +
> +    BlockBackend *blk;
> +
> +    /*
> +     * Don't allow resize while the vhost user server is running,
> +     * otherwise we don't care what happens with the node.
> +     */
> +    uint64_t perm = BLK_PERM_CONSISTENT_READ;
> +    int ret;
> +
> +    AioContext *ctx;
> +
> +    BlockDriverState *bs = bdrv_lookup_bs(name,
> +                                          name,
> +                                          errp);

This fits in a single line.

> +
> +    if (!bs) {
> +        error_setg(errp,
> +                   "No drive with name '%s'."
> +                   " Please find the list of names with "
> +                   "'info block'", name);

This can probably be two lines instead of four.

> +        return NULL;
> +    }
> +
> +    if (bdrv_is_read_only(bs)) {
> +        writable = false;
> +    }
> +
> +    if (writable) {
> +        perm |= BLK_PERM_WRITE;
> +    }
> +
> +    ctx = bdrv_get_aio_context(bs);
> +    aio_context_acquire(ctx);
> +    bdrv_invalidate_cache(bs, NULL);
> +    aio_context_release(ctx);
> +
> +    blk = blk_new(bdrv_get_aio_context(bs), perm,
> +                  BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE_UNCHANGED |
> +                  BLK_PERM_WRITE | BLK_PERM_GRAPH_MOD);
> +    ret = blk_insert_bs(blk, bs, errp);
> +
> +    if (ret < 0) {
> +        goto fail;
> +    }
> +
> +
> +    blk_set_enable_write_cache(blk, false);
> +
> +    blk_set_allow_aio_context_change(blk, true);
> +
> +
> +    vub_device->name = g_strdup(name);
> +    vub_device->unix_socket = g_strdup(unix_socket);
> +    vub_device->writable = writable;
> +    vub_device->blkcfg.wce = 0;
> +    vub_device->backend = blk;
> +    vub_device->ctx = ctx;
> +    vub_initialize_config(bs, &vub_device->blkcfg);
> +    return vub_device;
> +
> +fail:
> +    blk_unref(blk);
> +    return NULL;
> +}
> +
> +void vhost_user_server_free(VubDev *vub_device, bool called_by_QOM)
> +{
> +    if (!vub_device) {
> +        return;
> +    }
> +
> +    VuClient *client, *next;
> +    QTAILQ_FOREACH_SAFE(client, &vub_device->clients, next, next) {
> +        if (!client->closed) {
> +            close_client(client);
> +        }
> +    }
> +
> +    if (vub_device->listener) {
> +        qio_net_listener_disconnect(vub_device->listener);
> +        object_unref(OBJECT(vub_device->listener));
> +    }
> +    vub_free(vub_device, called_by_QOM);
> +
> +}
> +
> +
> +VubDev *vub_dev_find(const char *name)
> +{
> +    VubDev *vub_device;
> +    QTAILQ_FOREACH(vub_device, &vub_devs, next) {
> +        if (strcmp(name, vub_device->name) == 0) {
> +            return vub_device;
> +        }
> +    }
> +
> +    return NULL;
> +}
> +
> +
> +static VubDev *vub_dev_find_by_unix_socket(const char *unix_socket)
> +{
> +    VubDev *vub_device;
> +    QTAILQ_FOREACH(vub_device, &vub_devs, next) {
> +        if (strcmp(unix_socket, vub_device->unix_socket) == 0) {
> +            return vub_device;
> +        }
> +    }
> +
> +    return NULL;
> +}
> +
> +static void vhost_user_server_start(VubDev *vub_device, const char *unix_socket,
> +                                    const char *name, bool writable,
> +                                    Error **errp)
> +{
> +
> +    if (vub_dev_find(name) || vub_dev_find_by_unix_socket(unix_socket)) {
> +        error_setg(errp, "Vhost user server with name '%s' or "
> +                "with socket_path '%s' has already been started",
> +                name, unix_socket);
> +        return;
> +    }
> +
> +
> +    if (!vub_new(vub_device, name, unix_socket, writable, errp)) {
> +        return;
> +    }
> +
> +
> +    vub_device->listener = qio_net_listener_new();
> +
> +    qio_net_listener_set_name(vub_device->listener,
> +                              "vhost-user-backend-listener");
> +
> +    SocketAddress *addr = g_new0(SocketAddress, 1);
> +    addr->u.q_unix.path = (char *) unix_socket;
> +    addr->type = SOCKET_ADDRESS_TYPE_UNIX;
> +    if (qio_net_listener_open_sync(vub_device->listener, addr, 1, errp) < 0) {
> +        goto error;
> +    }
> +
> +
> +    QTAILQ_INSERT_TAIL(&vub_devs, vub_device, next);
> +    QTAILQ_INIT(&vub_device->clients);
> +
> +    qio_net_listener_set_client_func(vub_device->listener,
> +                                     vub_accept,
> +                                     vub_device,
> +                                     NULL);
> +
> +    return;
> +
> + error:
> +    vub_free(vub_device, false);
> +}
> +
> +static void vu_set_block_name(Object *obj, const char *value,
> +                                           Error **errp)
> +{
> +    VubDev *vus = VHOST_USER_SERVER(obj);;
> +
> +    if (vus->name) {
> +        error_setg(errp, "evdev property already set");
> +        return;
> +    }
> +
> +    vus->name = g_strdup(value);
> +}
> +
> +static char *vu_get_block_name(Object *obj, Error **errp)
> +{
> +    VubDev *vus = VHOST_USER_SERVER(obj);
> +    return g_strdup(vus->name);
> +}
> +
> +
> +static void vu_set_unix_socket(Object *obj, const char *value,
> +                                            Error **errp)
> +{
> +    VubDev *vus = VHOST_USER_SERVER(obj);;
> +
> +    if (vus->unix_socket) {
> +        error_setg(errp, "unix_socket property already set");
> +        return;
> +    }
> +
> +    vus->unix_socket = g_strdup(value);
> +    vhost_user_server_start(vus, value, vus->name,
> +                            vus->writable, errp);
> +}

This makes the unix-socket property magic in that it starts the server
with the properties specified at this point. This means that this
property must always be specified last.

Maybe it would be better to use a boolean property (similar to qdev's
"realized") that explicitly starts and possibly stops the export.

Writing to other properties should probably result in an error while the
server is already running because these property changes won't take
effect any more then.

> +static char *vu_get_unix_socket(Object *obj, Error **errp)
> +{
> +    VubDev *vus = VHOST_USER_SERVER(obj);;
> +    return g_strdup(vus->unix_socket);
> +}
> +
> +static bool vu_get_block_writable(Object *obj, Error **errp)
> +{
> +    VubDev *vus = VHOST_USER_SERVER(obj);;
> +    return vus->writable;
> +}
> +
> +static void vu_set_block_writable(Object *obj, bool value, Error **errp)
> +{
> +    VubDev *vus = VHOST_USER_SERVER(obj);
> +
> +    vus->writable = value;
> +}
> +
> +static void vhost_user_server_instance_init(Object *obj)
> +{
> +
> +    object_property_add_bool(obj, "writable",
> +                            vu_get_block_writable,
> +                            vu_set_block_writable, NULL);
> +
> +    object_property_add_str(obj, "name",
> +                            vu_get_block_name,
> +                            vu_set_block_name, NULL);
> +
> +    object_property_add_str(obj, "unix_socket",
> +                            vu_get_unix_socket,
> +                            vu_set_unix_socket, NULL);

These should probably be object_class_property_add_*() and be called in
.class_init rather than .instance_init.

"name" suggests that it's the name of the export rather than the block
device to be exported. I would suggest "node-name" (and then actually
only pass it as node-name to bdrv_lookup_bs()).

I expect that in the long run, we'll want to accept a full SocketAddress
rather than just a filename like in "unix_socket".

> +}
> +
> +static void vhost_user_server_instance_finalize(Object *obj)
> +{
> +    VubDev *vus = VHOST_USER_SERVER(obj);
> +    vhost_user_server_free(vus, true);
> +    /* object_del shouldn't free this object struct */
> +    obj->free = NULL;
> +}
> +
> +static const TypeInfo vhost_user_server_info = {
> +    .name = TYPE_VHOST_USER_SERVER,
> +    .parent = TYPE_OBJECT,
> +    .instance_size = sizeof(VuDev),
> +    .instance_init = vhost_user_server_instance_init,
> +    .instance_finalize = vhost_user_server_instance_finalize,
> +    .interfaces = (InterfaceInfo[]) {
> +        {TYPE_USER_CREATABLE},
> +        {}
> +    },
> +};
> +
> +static void vhost_user_server_register_types(void)
> +{
> +    type_register_static(&vhost_user_server_info);
> +}
> +
> +type_init(vhost_user_server_register_types)
> +

Extra empty line at the file end.

In summary, I can see this going in the right direction, though in
detail some more work will be needed.

Kevin



^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH v2 3/5] a standone-alone tool to directly share disk image file via vhost-user protocol
  2020-01-14 14:06 ` [PATCH v2 3/5] a standone-alone tool to directly share disk image file via vhost-user protocol Coiby Xu
@ 2020-01-16 14:04   ` Stefan Hajnoczi
  2020-01-17  8:12     ` Coiby Xu
  0 siblings, 1 reply; 19+ messages in thread
From: Stefan Hajnoczi @ 2020-01-16 14:04 UTC (permalink / raw)
  To: Coiby Xu; +Cc: kwolf, bharatlkmlkvm, qemu-devel

[-- Attachment #1: Type: text/plain, Size: 800 bytes --]

On Tue, Jan 14, 2020 at 10:06:18PM +0800, Coiby Xu wrote:
> vhost-user-blk can have played as vhost-user backend but it only supports raw file and don't support VIRTIO_BLK_T_DISCARD and VIRTIO_BLK_T_WRITE_ZEROES operations on raw file (ioctl(fd, BLKDISCARD) is only valid for real block device).
> 
> Signed-off-by: Coiby Xu <coiby.xu@gmail.com>
> ---
>  qemu-vu.c | 264 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 264 insertions(+)
>  create mode 100644 qemu-vu.c

Kevin has been working on qemu-storage-daemon, a tool for running NBD
exports, block jobs, and other storage features that are not part of a
guest.  I think qemu-storage-daemon would be the appropriate tool for
running vhost-user-blk servers.  A dedicated binary is not necessary.

Stefan

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH v2 1/5] vhost-user block device backend
  2020-01-16 13:51   ` Stefan Hajnoczi
@ 2020-01-16 14:20     ` Kevin Wolf
  2020-02-14  3:07     ` Coiby Xu
  1 sibling, 0 replies; 19+ messages in thread
From: Kevin Wolf @ 2020-01-16 14:20 UTC (permalink / raw)
  To: Stefan Hajnoczi; +Cc: bharatlkmlkvm, qemu-devel, Coiby Xu

[-- Attachment #1: Type: text/plain, Size: 2688 bytes --]

Am 16.01.2020 um 14:51 hat Stefan Hajnoczi geschrieben:
> > +static void vu_set_unix_socket(Object *obj, const char *value,
> > +                                            Error **errp)
> > +{
> > +    VubDev *vus = VHOST_USER_SERVER(obj);;
> > +
> > +    if (vus->unix_socket) {
> > +        error_setg(errp, "unix_socket property already set");
> > +        return;
> > +    }
> > +
> > +    vus->unix_socket = g_strdup(value);
> > +    vhost_user_server_start(vus, value, vus->name,
> > +                            vus->writable, errp);
> 
> Property setters should only perform input validation and store the
> data.  Actions like creating network connections, opening files, etc
> should happen later in a UserCreatableClass->complete() callback.
> 
> This is necessary because vus->writable is also a property and may be
> set after unix_socket.  The ->complete() callback is called after all
> setters so it can access the final values of all properties.
> 
> See iothread_class_init() and iothread_complete() for an example.

Ah, right, this is the correct way. I forgot about the existence of
.complete(), so please ignore what I wrote.

> > diff --git a/vl.c b/vl.c
> > index 86474a55c9..72ac506342 100644
> > --- a/vl.c
> > +++ b/vl.c
> > @@ -2553,6 +2553,10 @@ static bool object_create_initial(const char *type, QemuOpts *opts)
> >      }
> >  #endif
> > 
> > +    /* Reason: vhost-user-server property "name" */
> > +    if (g_str_equal(type, "vhost-user-server")) {
> > +        return false;
> > +    }
> 
> I don't understand why the "name" property introduces a creation order
> dependency.  It's just a string and has no dependency on other
> command-line objects.  Can you explain why this change is necessary?

I was confused at first, too, but it's just a naming problem: "name"
is what points to the block device to be exported. It should really be
"node-name".

> > +struct VuClient {
> > +    VuDev parent;
> > +    int refcount;
> > +    VubDev *blk;
> > +    QIOChannelSocket *sioc; /* The underlying data channel */
> > +    QIOChannel *ioc; /* The current I/O channel */
> > +    QTAILQ_ENTRY(VuClient) next;
> > +    bool closed;
> > +};
> > +VubDev *vub_dev_find(const char *name);
> 
> All -object instances already have an id=ID property.  There is no need
> to declare a separate "name" property.  Please look at iothread_by_id()
> and iothread_get_id() for examples.

It's questionable to me if this function is needed at all. It is only
used in vhost_user_server_start() to make sure that you don't export a
node twice - but what's even the problem with exporting it twice?

Kevin

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 801 bytes --]

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH v2 3/5] a standone-alone tool to directly share disk image file via vhost-user protocol
  2020-01-16 14:04   ` Stefan Hajnoczi
@ 2020-01-17  8:12     ` Coiby Xu
  2020-01-17 10:11       ` Kevin Wolf
  0 siblings, 1 reply; 19+ messages in thread
From: Coiby Xu @ 2020-01-17  8:12 UTC (permalink / raw)
  To: Stefan Hajnoczi; +Cc: kwolf, bharatlkmlkvm, qemu-devel

Excellent! I will add an option (or object property) for the
vhost-user-blk server object which will tell the daemon process to exit
when the client disconnects, thus "make check-qtest" will not get held
up by this daemon process. After that, since Kevin's qemu-storage-daemon
supports the "-object" option
(https://patchew.org/QEMU/20191017130204.16131-1-kwolf@redhat.com/20191017130204.16131-3-kwolf@redhat.com/)
and vhost-user-server is a user-creatable QOM object, it will work out
of the box.

I'm curious when the formal version of qemu-storage-daemon will be
finished so I can take advantage of it? Or should I apply the RFC
PATCHes to my working branch directly and submit them together with
the patches on vhost-user-blk server feature when posting v3?



On Thu, Jan 16, 2020 at 10:04 PM Stefan Hajnoczi <stefanha@redhat.com> wrote:
>
> On Tue, Jan 14, 2020 at 10:06:18PM +0800, Coiby Xu wrote:
> > vhost-user-blk can have played as vhost-user backend but it only supports raw file and don't support VIRTIO_BLK_T_DISCARD and VIRTIO_BLK_T_WRITE_ZEROES operations on raw file (ioctl(fd, BLKDISCARD) is only valid for real block device).
> >
> > Signed-off-by: Coiby Xu <coiby.xu@gmail.com>
> > ---
> >  qemu-vu.c | 264 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
> >  1 file changed, 264 insertions(+)
> >  create mode 100644 qemu-vu.c
>
> Kevin has been working on qemu-storage-daemon, a tool for running NBD
> exports, block jobs, and other storage features that are not part of a
> guest.  I think qemu-storage-daemon would be the appropriate tool for
> running vhost-user-blk servers.  A dedicated binary is not necessary.
>
> Stefan



--
Best regards,
Coiby


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH v2 3/5] a standone-alone tool to directly share disk image file via vhost-user protocol
  2020-01-17  8:12     ` Coiby Xu
@ 2020-01-17 10:11       ` Kevin Wolf
  2020-01-31 16:42         ` Coiby Xu
  0 siblings, 1 reply; 19+ messages in thread
From: Kevin Wolf @ 2020-01-17 10:11 UTC (permalink / raw)
  To: Coiby Xu; +Cc: bharatlkmlkvm, qemu-devel, Stefan Hajnoczi

Am 17.01.2020 um 09:12 hat Coiby Xu geschrieben:
> Excellent! I will add an option (or object property) for
> vhost-user-blk server oject which will tell the daemon process to exit
> when the client disconnects, thus "make check-qtest" will not get held
> by this daemon process. After that since Kevin's qemu-storage-daemon
> support "-object" option
> (https://patchew.org/QEMU/20191017130204.16131-1-kwolf@redhat.com/20191017130204.16131-3-kwolf@redhat.com/)
> and vhost-user-server is a user-creatable QOM object, it will work out
> of the box.

Yes, I think at least for the moment it should work fine this way.
Eventually, I'd like to integrate it with --export (and associated QMP
commands, which are still to be created), too. Maybe at that point we
want to make the QOM object not user creatable any more.

Would it make sense to prefix the object type name with "x-" so we can
later retire it from the external user interface without a deprecation
period?

As for test cases, do you think it would be hard to just modify the
tests to send an explicit 'quit' command to the daemon?

> I'm curious when will be formal version of qemu-storage-daemon
> finished so I can take advantage of it? Or should I apply the RFC
> PATCHes to my working branch directly and submit them together with
> the patches on vhost-user-blk server feature when posting v3?

It's the next thing I'm planning to work on after completing the
coroutine-base QMP handlers (which I hope to get finished very soon).

For the time being I would suggest that you put any patches that depend
on qemu-storage-daemon (if you do need it) at the end of your series so
that we could apply the first part even if the storage daemon isn't in
yet.

The latest version of my patches is at:

    git://repo.or.cz/qemu/kevin.git storage-daemon

But if you just need something for testing your code, I think it would
even make sense if you kept your standalone tool around (even though
we'll never merge it) and we'll deal with integration in the storage
daemon once both parts are ready.

Kevin



^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH v2 3/5] a standone-alone tool to directly share disk image file via vhost-user protocol
  2020-01-17 10:11       ` Kevin Wolf
@ 2020-01-31 16:42         ` Coiby Xu
  2020-02-02  9:33           ` Kevin Wolf
  0 siblings, 1 reply; 19+ messages in thread
From: Coiby Xu @ 2020-01-31 16:42 UTC (permalink / raw)
  To: Kevin Wolf; +Cc: bharatlkmlkvm, qemu-devel, Stefan Hajnoczi

Hi Kevin,

> Yes, I think at least for the moment it should work fine this way.
> Eventually, I'd like to integrate it with --export (and associated QMP
> commands, which are still to be created), too. Maybe at that point we
> want to make the QOM object not user creatable any more.

Does it mean the TYPE_USER_CREATABLE interface in QOM will become
deprecated in the future? I'm curious what the reasons are for making
the QOM object not user creatable, because we may still need to start
the vhost-user block device backend through HMP or QMP instead of
starting it as a standalone daemon.

> As for test cases, do you think it would be hard to just modify the
> tests to send an explicit 'quit' command to the daemon?

According to https://patchew.org/QEMU/20191017130204.16131-1-kwolf@redhat.com/20191017130204.16131-10-kwolf@redhat.com/,

> +static bool exit_requested = false;
> +
> +void qemu_system_killed(int signal, pid_t pid)
> +{
> +    exit_requested = true;
> +}

If exit_requested = true, qemu-storage-daemon will exit the main loop
and then quit. So is calling qemu_system_killed what you mean by "to
send an explicit 'quit' command to the daemon"?

On Fri, Jan 17, 2020 at 6:12 PM Kevin Wolf <kwolf@redhat.com> wrote:
>
> Am 17.01.2020 um 09:12 hat Coiby Xu geschrieben:
> > Excellent! I will add an option (or object property) for
> > vhost-user-blk server oject which will tell the daemon process to exit
> > when the client disconnects, thus "make check-qtest" will not get held
> > by this daemon process. After that since Kevin's qemu-storage-daemon
> > support "-object" option
> > (https://patchew.org/QEMU/20191017130204.16131-1-kwolf@redhat.com/20191017130204.16131-3-kwolf@redhat.com/)
> > and vhost-user-server is a user-creatable QOM object, it will work out
> > of the box.
>
> Yes, I think at least for the moment it should work fine this way.
> Eventually, I'd like to integrate it with --export (and associated QMP
> commands, which are still to be created), too. Maybe at that point we
> want to make the QOM object not user creatable any more.
>
> Would it make sense to prefix the object type name with "x-" so we can
> later retire it from the external user interface without a deprecation
> period?
>
> As for test cases, do you think it would be hard to just modify the
> tests to send an explicit 'quit' command to the daemon?
>
> > I'm curious when will be formal version of qemu-storage-daemon
> > finished so I can take advantage of it? Or should I apply the RFC
> > PATCHes to my working branch directly and submit them together with
> > the patches on vhost-user-blk server feature when posting v3?
>
> It's the next thing I'm planning to work on after completing the
> coroutine-base QMP handlers (which I hope to get finished very soon).
>
> For the time being I would suggest that you put any patches that depend
> on qemu-storage-daemon (if you do need it) at the end of your series so
> that we could apply the first part even if the storage daemon isn't in
> yet.
>
> The latest version of my patches is at:
>
>     git://repo.or.cz/qemu/kevin.git storage-daemon
>
> But if you just need something for testing your code, I think it would
> even make sense if you kept your standalone tool around (even though
> we'll never merge it) and we'll deal with integration in the storage
> daemon once both parts are ready.
>
> Kevin
>


-- 
Best regards,
Coiby


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH v2 3/5] a standone-alone tool to directly share disk image file via vhost-user protocol
  2020-01-31 16:42         ` Coiby Xu
@ 2020-02-02  9:33           ` Kevin Wolf
  2020-02-13  1:00             ` Coiby Xu
  0 siblings, 1 reply; 19+ messages in thread
From: Kevin Wolf @ 2020-02-02  9:33 UTC (permalink / raw)
  To: Coiby Xu; +Cc: bharatlkmlkvm, qemu-devel, Stefan Hajnoczi

Am 31.01.2020 um 17:42 hat Coiby Xu geschrieben:
> > Yes, I think at least for the moment it should work fine this way.
> > Eventually, I'd like to integrate it with --export (and associated QMP
> > commands, which are still to be created), too. Maybe at that point we
> > want to make the QOM object not user creatable any more.
> 
> Does it mean TYPE_USER_CREATABLE interface in QOM will become
> deprecated in the future? I'm curious what are the reasons for making
> QOM object no user creatable? Because we may still need to start
> vhost-user block device backend through HMP or QMP instead of stating
> it as a standalone-alone daemon.

Not in general, but if we have something like a block-export-add QMP
command, the QOM interface would be redundant. We could still leave it
there and have both a low-level and a high-level interface, but whether
we would want to is something we still have to decide.

> > As for test cases, do you think it would be hard to just modify the
> > tests to send an explicit 'quit' command to the daemon?
> 
> Accroding to https://patchew.org/QEMU/20191017130204.16131-1-kwolf@redhat.com/20191017130204.16131-10-kwolf@redhat.com/,
> 
> > +static bool exit_requested = false;
> > +
> > +void qemu_system_killed(int signal, pid_t pid)
> > +{
> > +    exit_requested = true;
> > +}
> 
> if exit_requested = true, qemu-storage-daemon will exit the main loop
> and then quit. So is calling qemu_system_killed by what you means "to
> send an explicit 'quit' command to the daemon"?

qemu_system_killed() is called in the signal handlers for, amongst others,
SIGTERM and SIGINT. This is one way to stop the storage daemon (for
manual use, sending SIGINT with Ctrl-C is probably the easiest way).

What I actually meant is the 'quit' QMP command which will cause
qmp_quit() to be run, which contains the same code. But if sending a
signal is more convenient, that's just as good.

Kevin

> On Fri, Jan 17, 2020 at 6:12 PM Kevin Wolf <kwolf@redhat.com> wrote:
> >
> > Am 17.01.2020 um 09:12 hat Coiby Xu geschrieben:
> > > Excellent! I will add an option (or object property) for
> > > vhost-user-blk server oject which will tell the daemon process to exit
> > > when the client disconnects, thus "make check-qtest" will not get held
> > > by this daemon process. After that since Kevin's qemu-storage-daemon
> > > support "-object" option
> > > (https://patchew.org/QEMU/20191017130204.16131-1-kwolf@redhat.com/20191017130204.16131-3-kwolf@redhat.com/)
> > > and vhost-user-server is a user-creatable QOM object, it will work out
> > > of the box.
> >
> > Yes, I think at least for the moment it should work fine this way.
> > Eventually, I'd like to integrate it with --export (and associated QMP
> > commands, which are still to be created), too. Maybe at that point we
> > want to make the QOM object not user creatable any more.
> >
> > Would it make sense to prefix the object type name with "x-" so we can
> > later retire it from the external user interface without a deprecation
> > period?
> >
> > As for test cases, do you think it would be hard to just modify the
> > tests to send an explicit 'quit' command to the daemon?
> >
> > > I'm curious when will be formal version of qemu-storage-daemon
> > > finished so I can take advantage of it? Or should I apply the RFC
> > > PATCHes to my working branch directly and submit them together with
> > > the patches on vhost-user-blk server feature when posting v3?
> >
> > It's the next thing I'm planning to work on after completing the
> > coroutine-base QMP handlers (which I hope to get finished very soon).
> >
> > For the time being I would suggest that you put any patches that depend
> > on qemu-storage-daemon (if you do need it) at the end of your series so
> > that we could apply the first part even if the storage daemon isn't in
> > yet.
> >
> > The latest version of my patches is at:
> >
> >     git://repo.or.cz/qemu/kevin.git storage-daemon
> >
> > But if you just need something for testing your code, I think it would
> > even make sense if you kept your standalone tool around (even though
> > we'll never merge it) and we'll deal with integration in the storage
> > daemon once both parts are ready.
> >
> > Kevin
> >
> 
> 
> -- 
> Best regards,
> Coiby
> 



^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH v2 3/5] a standone-alone tool to directly share disk image file via vhost-user protocol
  2020-02-02  9:33           ` Kevin Wolf
@ 2020-02-13  1:00             ` Coiby Xu
  0 siblings, 0 replies; 19+ messages in thread
From: Coiby Xu @ 2020-02-13  1:00 UTC (permalink / raw)
  To: Kevin Wolf; +Cc: bharatlkmlkvm, qemu-devel, Stefan Hajnoczi

> > > Yes, I think at least for the moment it should work fine this way.
> > > Eventually, I'd like to integrate it with --export (and associated QMP
> > > commands, which are still to be created), too. Maybe at that point we
> > > want to make the QOM object not user creatable any more.
> >
> > Does it mean TYPE_USER_CREATABLE interface in QOM will become
> > deprecated in the future? I'm curious what are the reasons for making
> > QOM object no user creatable? Because we may still need to start
> > vhost-user block device backend through HMP or QMP instead of stating
> > it as a standalone-alone daemon.

> Not in general, but if we have something like a block-export-add QMP
> command, the QOM interface would be redundant. We could still leave it
> there and have both a low-level and a high-level interface, but whether
> we would want to is something we still have to decide.

I see. So the QOM interface will still be used as a low-level interface.
In the draft version, vhost-user-blk-server was started using the specific
commands vhost_user_server_start/vhost_user_server_stop, which provide
interfaces that make it easier to implement a block-export-add QMP command.
But in later versions, only the object_add/object_del syntax is supported to
start/stop vhost-user-blk-server. I'll keep an eye on how the storage
daemon develops and adapt my code accordingly.


On Sun, Feb 2, 2020 at 5:33 PM Kevin Wolf <kwolf@redhat.com> wrote:
>
> Am 31.01.2020 um 17:42 hat Coiby Xu geschrieben:
> > > Yes, I think at least for the moment it should work fine this way.
> > > Eventually, I'd like to integrate it with --export (and associated QMP
> > > commands, which are still to be created), too. Maybe at that point we
> > > want to make the QOM object not user creatable any more.
> >
> > Does it mean TYPE_USER_CREATABLE interface in QOM will become
> > deprecated in the future? I'm curious what are the reasons for making
> > QOM object no user creatable? Because we may still need to start
> > vhost-user block device backend through HMP or QMP instead of stating
> > it as a standalone-alone daemon.
>
> Not in general, but if we have something like a block-export-add QMP
> command, the QOM interface would be redundant. We could still leave it
> there and have both a low-level and a high-level interface, but whether
> we would want to is something we still have to decide.
>
> > > As for test cases, do you think it would be hard to just modify the
> > > tests to send an explicit 'quit' command to the daemon?
> >
> > Accroding to https://patchew.org/QEMU/20191017130204.16131-1-kwolf@redhat.com/20191017130204.16131-10-kwolf@redhat.com/,
> >
> > > +static bool exit_requested = false;
> > > +
> > > +void qemu_system_killed(int signal, pid_t pid)
> > > +{
> > > +    exit_requested = true;
> > > +}
> >
> > if exit_requested = true, qemu-storage-daemon will exit the main loop
> > and then quit. So is calling qemu_system_killed by what you means "to
> > send an explicit 'quit' command to the daemon"?
>
> qemu_system_killed() is call in the signal handlers for, amongst others,
> SIGTERM and SIGINT. This is one way to stop the storage daemon (for
> manual use, sending SIGINT with Ctrl-C is probably the easiest way).
>
> What I actually meant is the 'quit' QMP command which will cause
> qmp_quit() to be run, which contains the same code. But if sending a
> signal is more convenient, that's just as good.
>
> Kevin
>
> > On Fri, Jan 17, 2020 at 6:12 PM Kevin Wolf <kwolf@redhat.com> wrote:
> > >
> > > Am 17.01.2020 um 09:12 hat Coiby Xu geschrieben:
> > > > Excellent! I will add an option (or object property) for
> > > > vhost-user-blk server oject which will tell the daemon process to exit
> > > > when the client disconnects, thus "make check-qtest" will not get held
> > > > by this daemon process. After that since Kevin's qemu-storage-daemon
> > > > support "-object" option
> > > > (https://patchew.org/QEMU/20191017130204.16131-1-kwolf@redhat.com/20191017130204.16131-3-kwolf@redhat.com/)
> > > > and vhost-user-server is a user-creatable QOM object, it will work out
> > > > of the box.
> > >
> > > Yes, I think at least for the moment it should work fine this way.
> > > Eventually, I'd like to integrate it with --export (and associated QMP
> > > commands, which are still to be created), too. Maybe at that point we
> > > want to make the QOM object not user creatable any more.
> > >
> > > Would it make sense to prefix the object type name with "x-" so we can
> > > later retire it from the external user interface without a deprecation
> > > period?
> > >
> > > As for test cases, do you think it would be hard to just modify the
> > > tests to send an explicit 'quit' command to the daemon?
> > >
> > > > I'm curious when will be formal version of qemu-storage-daemon
> > > > finished so I can take advantage of it? Or should I apply the RFC
> > > > PATCHes to my working branch directly and submit them together with
> > > > the patches on vhost-user-blk server feature when posting v3?
> > >
> > > It's the next thing I'm planning to work on after completing the
> > > coroutine-base QMP handlers (which I hope to get finished very soon).
> > >
> > > For the time being I would suggest that you put any patches that depend
> > > on qemu-storage-daemon (if you do need it) at the end of your series so
> > > that we could apply the first part even if the storage daemon isn't in
> > > yet.
> > >
> > > The latest version of my patches is at:
> > >
> > >     git://repo.or.cz/qemu/kevin.git storage-daemon
> > >
> > > But if you just need something for testing your code, I think it would
> > > even make sense if you kept your standalone tool around (even though
> > > we'll never merge it) and we'll deal with integration in the storage
> > > daemon once both parts are ready.
> > >
> > > Kevin
> > >
> >
> >
> > --
> > Best regards,
> > Coiby
> >
>


--
Best regards,
Coiby


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH v2 1/5] vhost-user block device backend
  2020-01-16 13:51   ` Stefan Hajnoczi
  2020-01-16 14:20     ` Kevin Wolf
@ 2020-02-14  3:07     ` Coiby Xu
  1 sibling, 0 replies; 19+ messages in thread
From: Coiby Xu @ 2020-02-14  3:07 UTC (permalink / raw)
  To: Stefan Hajnoczi; +Cc: kwolf, bharatlkmlkvm, qemu-devel

Thank you for reviewing this patch! I've already posted v3 based on
your feedback.

> > +#include "hw/qdev-properties.h"
> > +enum {
> > +    VHOST_USER_BLK_MAX_QUEUES = 8,
> > +};

> The number of queues is hardcoded to 1 so this constant can be removed
> for now.

> > +
> > +static QTAILQ_HEAD(, VubDev) vub_devs = QTAILQ_HEAD_INITIALIZER(vub_devs);

> I'm not sure if this list will be necessary.  See my comments about
> replacing the "name" property with Object's built-in "id" property.

There is no need to start more than one vhost-user-blk-server with the
same exported drive since a vhost-user-blk-server can serve multiple
clients simultaneously. And we shouldn't start two vhost-user-blk-servers
with the same unix socket path. So we need this list to avoid duplicate
servers. And as pointed out by Kevin, the "name" property actually means
node-name, which is used in v3.


> > +#include "hw/qdev-properties.h"
> > +enum {
> > +    VHOST_USER_BLK_MAX_QUEUES = 8,
> > +};

> The number of queues is hardcoded to 1 so this constant can be removed
> for now.

I've set VHOST_USER_BLK_MAX_QUEUES = 1 in v3 to avoid magic number.

> > +    config->seg_max = 128 - 2;

> This is okay for now but should be changed in the future.

> hw/block/virtio-blk.c was recently modified to calculate seg_max based
> on the virtqueue size (which is where the hardcoded 128 originally came
> from).  Applications that use large buffer sizes (128+ KB) benefit from
> larger virtqueue sizes and seg_max.

I've looked at the implementation of "-device
vhost-user-blk-pci,queue-size=512" regarding seg_max and found out that
vhost-user-blk-server doesn't have the chance to calculate seg_max based
on the virtqueue size and report it back to QEMU as vhost-user client
in time.

QEMU as vhost-user client will create virtqueues with the size of the
queue-size parameter. Later it will get the configuration including
seg_max from vhost-user-blk-server by sending VHOST_USER_SET_CONFIG
message and this seg_max value will be sent to guest OS.  Guest OS
will set the real size of virtqueue which will be sent to
vhost-user-blk-server through VHOST_USER_SET_VRING_NUM message. After
that QEMU as vhost-user client will never send VHOST_USER_SET_CONFIG
again.

There could be three ways to address this issue:
1. Add seg_max_adjust and queue-size object properties for the
vhost-user-blk device in a way similar to the virtio-blk device. And QEMU
as vhost-user client will ignore the seg_max parameter from
vhost-user-blk-server. It will calculate seg_max based on queue-size
and report it to guest OS.
2. Add seg_max_adjust and queue-size object properties for the
vhost-user-blk server and let the vhost-user-blk server calculate seg_max
based on queue-size.
3. Let QEMU as vhost-user client tell vhost-user-blk-server its queue
size by sending VHOST_USER_SET_VRING_NUM message first. When
vhost-user-blk-server receives the VHOST_USER_SET_CONFIG message, it
will calculate seg_max and report it back to QEMU as vhost-user
client.

Which way do you think is the best?


> > +void vub_accept(QIONetListener *listener, QIOChannelSocket *sioc,
> > +                gpointer opaque)
> > +{
> > +    VuClient *client;
> > +    VubDev *vub_device = opaque;
> > +    client = g_new0(VuClient, 1);

> Is it helpful to have a separate VuClient struct even though only a
> single vhost-user client can be connected at a time?  It may be simpler
> to keep connection state directly in VubDev.

Currently, I don't use chardev as an object property of
vhost-user-blk-server. So actually multiple clients can be connected
simultaneously.

All the other suggestions have been adopted in v3. Thank you for your advice!


On Thu, Jan 16, 2020 at 9:51 PM Stefan Hajnoczi <stefanha@redhat.com> wrote:
>
> On Tue, Jan 14, 2020 at 10:06:16PM +0800, Coiby Xu wrote:
> > By making use of libvhost, multiple block device drives can be exported and each drive can serve multiple clients simultaneously. Since vhost-user-server needs a block drive to be created first, delay the creation of this object.
> >
> > Signed-off-by: Coiby Xu <coiby.xu@gmail.com>
> > ---
> >  blockdev-vu.c              | 1008 ++++++++++++++++++++++++++++++++++++
>
> This file contains both vhost-user-blk code and generic vhost-user
> protocol code.  Please split them into two files:
> 1. backends/vhost-user-blk-server.c
> 2. util/vhost-user-server.c
>
> (As I read and understand the code better I may have better suggestions
> about where to put these source files and how to name them.)
>
> >  include/block/vhost-user.h |   46 ++
> >  vl.c                       |    4 +
> >  3 files changed, 1058 insertions(+)
> >  create mode 100644 blockdev-vu.c
> >  create mode 100644 include/block/vhost-user.h
> >
> > diff --git a/blockdev-vu.c b/blockdev-vu.c
> > new file mode 100644
> > index 0000000000..45f0bb43a7
> > --- /dev/null
> > +++ b/blockdev-vu.c
> > @@ -0,0 +1,1008 @@
> > +#include "qemu/osdep.h"
> > +#include "block/vhost-user.h"
> > +#include "qapi/error.h"
> > +#include "qapi/qapi-types-sockets.h"
> > +#include "qapi/qapi-commands-block.h"
> > +
> > +#include "sysemu/block-backend.h"
> > +#include "qemu/main-loop.h"
> > +
> > +#include "qemu/units.h"
> > +
> > +#include "block/block.h"
> > +
> > +#include "qom/object_interfaces.h"
> > +
> > +#include <sys/eventfd.h>
> > +
> > +#include "hw/qdev-properties.h"
> > +enum {
> > +    VHOST_USER_BLK_MAX_QUEUES = 8,
> > +};
>
> The number of queues is hardcoded to 1 so this constant can be removed
> for now.
>
> > +
> > +struct virtio_blk_inhdr {
> > +    unsigned char status;
> > +};
> > +
> > +
> > +static QTAILQ_HEAD(, VubDev) vub_devs = QTAILQ_HEAD_INITIALIZER(vub_devs);
>
> I'm not sure if this list will be necessary.  See my comments about
> replacing the "name" property with Object's built-in "id" property.
>
> > +
> > +
> > +typedef struct VubReq {
> > +    VuVirtqElement *elem;
> > +    int64_t sector_num;
> > +    size_t size;
> > +    struct virtio_blk_inhdr *in;
> > +    struct virtio_blk_outhdr out;
> > +    VuClient *client;
> > +    struct VuVirtq *vq;
> > +} VubReq;
> > +
> > +static void
> > +remove_watch(VuDev *vu_dev, int fd)
> > +{
> > +    VuClient *client;
> > +
> > +    g_assert(vu_dev);
> > +    g_assert(fd >= 0);
> > +
> > +    client = container_of(vu_dev, VuClient, parent);
> > +    aio_set_fd_handler(client->blk->ctx, fd, false, NULL, NULL, NULL, NULL);
> > +}
> > +
> > +static void close_client(VuClient *client)
> > +{
> > +    vu_deinit(&client->parent);
> > +    /** g_source_destroy(vub_device->parent.src); */
> > +    client->sioc = NULL;
> > +    object_unref(OBJECT(client->ioc));
> > +    client->closed = true;
> > +
> > +}
> > +
> > +static void vub_panic_cb(VuDev *vu_dev, const char *buf)
> > +{
> > +    if (buf) {
> > +        g_warning("vu_panic: %s", buf);
> > +    }
> > +
> > +    VuClient *client = container_of(vu_dev, VuClient, parent);
> > +    if (client->blk->exit_panic) {
> > +        client->blk->close = true;
> > +    }
> > +    if (!client->closed) {
> > +        close_client(client);
> > +    }
> > +}
> > +
> > +
> > +static void vub_req_complete(VubReq *req)
> > +{
> > +    VuDev *vu_dev = &req->client->parent;
> > +
> > +    /* IO size with 1 extra status byte */
> > +    vu_queue_push(vu_dev, req->vq, req->elem,
> > +                  req->size + 1);
> > +    vu_queue_notify(vu_dev, req->vq);
> > +
> > +    if (req->elem) {
> > +        free(req->elem);
> > +    }
> > +
> > +    g_free(req);
> > +}
> > +
> > +
> > +
> > +static int
> > +vub_discard_write_zeroes(VubReq *req, struct iovec *iov, uint32_t iovcnt,
> > +                         uint32_t type)
> > +{
> > +    struct virtio_blk_discard_write_zeroes *desc;
> > +    ssize_t size;
> > +    void *buf;
> > +
> > +    size = iov_size(iov, iovcnt);
> > +    if (size != sizeof(*desc)) {
> > +        fprintf(stderr, "Invalid size %ld, expect %ld\n", size, sizeof(*desc));
> > +        return -1;
> > +    }
> > +    buf = g_new0(char, size);
> > +
> > +    iov_to_buf_full(iov, iovcnt, 0, buf, size);
>
> Simpler:
>
>   struct virtio_blk_discard_write_zeroes desc;
>
>   if (unlikely(iov_to_buf(iov, iovcnt, 0, &desc, sizeof(desc)) !=
>                sizeof(desc)) {
>       /* TODO handle error */
>   }
>
> No heap allocation.  One variable (desc) instead of two (desc and buf).
>
> > +
> > +
> > +    #if defined(__linux__) && defined(BLKDISCARD) && defined(BLKZEROOUT)
> > +    VubDev *vdev_blk;
> > +    VuClient *client = container_of(dev, VuClient, parent);
> > +    vdev_blk = client->blk;
> > +    desc = (struct virtio_blk_discard_write_zeroes *)buf;
> > +    uint64_t range[2] = { le64toh(desc->sector) << 9,
> > +                          le32toh(desc->num_sectors) << 9 };
> > +    if (type == VIRTIO_BLK_T_DISCARD) {
> > +        if (blk_pdiscard(vdev_blk->blk, range[0], range[1]) == 0) {
> > +            g_free(buf);
> > +            return 0;
> > +        }
> > +    } else if (type == VIRTIO_BLK_T_WRITE_ZEROES) {
> > +        if (blk_pwrite_zeroes(vdev_blk->blk, range[0], range[1]) == 0) {
> > +            g_free(buf);
> > +            return 0;
> > +        }
> > +    }
> > +    #endif
> > +
> > +    g_free(buf);
> > +    return -1;
> > +}
> > +
> > +
> > +static void
> > +vub_flush(VubReq *req)
> > +{
> > +    VuClient *client = req->client;
> > +    blk_co_flush(client->blk->backend);
> > +}
>
> Any function that calls a *_co_*() function must be marked coroutine_fn:
>
>   static void coroutine_fn
>   vub_flush(VubReq *req)
>
> > +
> > +
> > +#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
> > +typedef struct BlkRwCo {
> > +    BlockBackend *blk;
> > +    int64_t offset;
> > +    void *iobuf;
> > +    int ret;
> > +    BdrvRequestFlags flags;
> > +} BlkRwCo;
> > +
> > +static void blk_read_entry(void *opaque)
> > +{
> > +    BlkRwCo *rwco = opaque;
> > +    QEMUIOVector *qiov = rwco->iobuf;
> > +
> > +    rwco->ret = blk_co_preadv(rwco->blk, rwco->offset, qiov->size,
> > +                              qiov, rwco->flags);
> > +    aio_wait_kick();
> > +}
> > +
> > +
> > +static void blk_write_entry(void *opaque)
> > +{
> > +    BlkRwCo *rwco = opaque;
> > +    QEMUIOVector *qiov = rwco->iobuf;
> > +
> > +    rwco->ret = blk_co_pwritev(rwco->blk, rwco->offset, qiov->size,
> > +                              qiov, rwco->flags);
> > +    aio_wait_kick();
> > +}
> > +
> > +
> > +static int blk_prw(BlockBackend *blk, QEMUIOVector *qiov, int64_t offset,
> > +                   CoroutineEntry co_entry, BdrvRequestFlags flags)
>
> Why is block/block-backend.c:blk_prw() duplicated here?
>
> > +{
> > +
> > +    BlkRwCo rwco = {
> > +        .blk    = blk,
> > +        .offset = offset,
> > +        .iobuf  = qiov,
> > +        .flags  = flags,
> > +        .ret    = NOT_DONE,
> > +    };
> > +
> > +    if (qemu_in_coroutine()) {
> > +        /* Fast-path if already in coroutine context */
> > +        co_entry(&rwco);
> > +    } else {
> > +        Coroutine *co = qemu_coroutine_create(co_entry, &rwco);
> > +        bdrv_coroutine_enter(blk_bs(blk), co);
> > +        BDRV_POLL_WHILE(blk_bs(blk), rwco.ret == NOT_DONE);
> > +    }
> > +
> > +    return rwco.ret;
> > +}
> > +
> > +
> > +static ssize_t
> > +vub_rwv(VubReq *req, struct iovec *iov,
> > +        uint32_t iovcnt,
> > +        CoroutineEntry co_entry)
> > +{
> > +    VuClient *client = req->client;
> > +    ssize_t rc;
> > +
> > +    if (!iovcnt) {
> > +        fprintf(stderr, "Invalid Read/Write IOV count\n");
> > +        return -1;
> > +    }
> > +
> > +    int64_t offset = req->sector_num * 512;
> > +    QEMUIOVector *qiov = g_new0(QEMUIOVector, 1);
> > +    qemu_iovec_init_external(qiov, iov, iovcnt);
> > +    rc = blk_prw(client->blk->backend, qiov, offset, co_entry, 0);
> > +
> > +    req->size = iov_size(iov, iovcnt);
> > +    if (rc < 0) {
> > +        fprintf(stderr, "%s, Sector %"PRIu64", Size %lu failed with %s\n",
> > +                client->blk->name, req->sector_num, req->size,
> > +                strerror(errno));
> > +        return -1;
> > +    }
> > +
> > +    return rc;
> > +}
> > +
> > +static int vub_virtio_process_req(VuClient *client,
> > +                                     VuVirtq *vq)
> > +{
> > +    VuDev *vu_dev = &client->parent;
> > +    VuVirtqElement *elem;
> > +    uint32_t type;
> > +    VubReq *req;
> > +
> > +    elem = vu_queue_pop(vu_dev, vq, sizeof(VuVirtqElement) + sizeof(VubReq));
> > +    if (!elem) {
> > +        return -1;
> > +    }
> > +
> > +    struct iovec *in_iov = elem->in_sg;
> > +    struct iovec *out_iov = elem->out_sg;
> > +    unsigned in_num = elem->in_num;
> > +    unsigned out_num = elem->out_num;
> > +    /* refer to hw/block/virtio_blk.c */
> > +    if (elem->out_num < 1 || elem->in_num < 1) {
> > +        fprintf(stderr, "virtio-blk request missing headers\n");
> > +        free(elem);
> > +        return -1;
> > +    }
> > +
> > +    req = g_new0(VubReq, 1);
> > +    req->client = client;
> > +    req->vq = vq;
> > +    req->elem = elem;
> > +
> > +    if (unlikely(iov_to_buf(out_iov, out_num, 0, &req->out,
> > +                            sizeof(req->out)) != sizeof(req->out))) {
> > +        fprintf(stderr, "virtio-blk request outhdr too short");
> > +        goto err;
> > +    }
> > +
> > +    iov_discard_front(&out_iov, &out_num, sizeof(req->out));
> > +
> > +    if (in_iov[in_num - 1].iov_len < sizeof(struct virtio_blk_inhdr)) {
> > +        fprintf(stderr, "virtio-blk request inhdr too short");
> > +        goto err;
> > +    }
> > +
> > +    /* We always touch the last byte, so just see how big in_iov is.  */
> > +    req->in = (void *)in_iov[in_num - 1].iov_base
> > +              + in_iov[in_num - 1].iov_len
> > +              - sizeof(struct virtio_blk_inhdr);
> > +    iov_discard_back(in_iov, &in_num, sizeof(struct virtio_blk_inhdr));
> > +
> > +
> > +    type = le32toh(req->out.type);
> > +    switch (type & ~VIRTIO_BLK_T_BARRIER) {
> > +    case VIRTIO_BLK_T_IN:
> > +    case VIRTIO_BLK_T_OUT: {
> > +        ssize_t ret = 0;
> > +        bool is_write = type & VIRTIO_BLK_T_OUT;
> > +        req->sector_num = le64toh(req->out.sector);
> > +        if (is_write) {
> > +            ret = vub_rwv(req, out_iov, out_num, blk_write_entry);
> > +        } else {
> > +            ret = vub_rwv(req, in_iov, in_num, blk_read_entry);
> > +        }
> > +        if (ret >= 0) {
> > +            req->in->status = VIRTIO_BLK_S_OK;
> > +        } else {
> > +            req->in->status = VIRTIO_BLK_S_IOERR;
> > +        }
> > +        vub_req_complete(req);
> > +        break;
> > +    }
> > +    case VIRTIO_BLK_T_FLUSH:
> > +        vub_flush(req);
> > +        req->in->status = VIRTIO_BLK_S_OK;
> > +        vub_req_complete(req);
> > +        break;
> > +    case VIRTIO_BLK_T_GET_ID: {
> > +        size_t size = MIN(iov_size(&elem->in_sg[0], in_num),
> > +                          VIRTIO_BLK_ID_BYTES);
> > +        snprintf(elem->in_sg[0].iov_base, size, "%s", "vhost_user_blk");
> > +        req->in->status = VIRTIO_BLK_S_OK;
> > +        req->size = elem->in_sg[0].iov_len;
> > +        vub_req_complete(req);
> > +        break;
> > +    }
> > +    case VIRTIO_BLK_T_DISCARD:
> > +    case VIRTIO_BLK_T_WRITE_ZEROES: {
> > +        int rc;
> > +        rc = vub_discard_write_zeroes(req, &elem->out_sg[1], out_num, type);
> > +        if (rc == 0) {
> > +            req->in->status = VIRTIO_BLK_S_OK;
> > +        } else {
> > +            req->in->status = VIRTIO_BLK_S_IOERR;
> > +        }
> > +        vub_req_complete(req);
> > +        break;
> > +    }
> > +    default:
> > +        req->in->status = VIRTIO_BLK_S_UNSUPP;
> > +        vub_req_complete(req);
> > +        break;
> > +    }
> > +
> > +    return 0;
> > +
> > +err:
> > +    free(elem);
> > +    g_free(req);
> > +    return -1;
> > +}
> > +
> > +
> > +static void vub_process_vq(VuDev *vu_dev, int idx)
> > +{
> > +    VuClient *client;
> > +    VuVirtq *vq;
> > +    int ret;
> > +
> > +    client = container_of(vu_dev, VuClient, parent);
> > +    assert(client);
> > +
> > +    vq = vu_get_queue(vu_dev, idx);
> > +    assert(vq);
> > +
> > +    while (1) {
> > +        ret = vub_virtio_process_req(client, vq);
> > +        if (ret) {
> > +            break;
> > +        }
> > +    }
> > +}
> > +
> > +
> > +static void vub_queue_set_started(VuDev *vu_dev, int idx, bool started)
> > +{
> > +    VuVirtq *vq;
> > +
> > +    assert(vu_dev);
> > +
> > +    vq = vu_get_queue(vu_dev, idx);
> > +    vu_set_queue_handler(vu_dev, vq, started ? vub_process_vq : NULL);
> > +}
> > +
> > +static uint64_t
> > +vub_get_features(VuDev *dev)
> > +{
> > +    uint64_t features;
> > +    VubDev *vdev_blk;
> > +
> > +    VuClient *client = container_of(dev, VuClient, parent);
> > +    vdev_blk = client->blk;
> > +
> > +    features = 1ull << VIRTIO_BLK_F_SIZE_MAX |
> > +               1ull << VIRTIO_BLK_F_SEG_MAX |
> > +               1ull << VIRTIO_BLK_F_TOPOLOGY |
> > +               1ull << VIRTIO_BLK_F_BLK_SIZE |
> > +               1ull << VIRTIO_BLK_F_FLUSH |
> > +               #if defined(__linux__) && defined(BLKDISCARD) && defined(BLKZEROOUT)
> > +               1ull << VIRTIO_BLK_F_DISCARD |
> > +               1ull << VIRTIO_BLK_F_WRITE_ZEROES |
> > +               #endif
> > +               1ull << VIRTIO_BLK_F_CONFIG_WCE |
> > +               1ull << VIRTIO_F_VERSION_1 |
> > +               1ull << VIRTIO_RING_F_INDIRECT_DESC |
> > +               1ull << VIRTIO_RING_F_EVENT_IDX |
> > +               1ull << VHOST_USER_F_PROTOCOL_FEATURES;
> > +
> > +    if (!vdev_blk->writable) {
> > +        features |= 1ull << VIRTIO_BLK_F_RO;
> > +    }
> > +
> > +    return features;
> > +}
> > +
> > +static uint64_t
> > +vub_get_protocol_features(VuDev *dev)
> > +{
> > +    return 1ull << VHOST_USER_PROTOCOL_F_CONFIG |
> > +           1ull << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD;
> > +}
> > +
> > +static int
> > +vub_get_config(VuDev *vu_dev, uint8_t *config, uint32_t len)
> > +{
> > +    VubDev *vdev_blk;
> > +
> > +    VuClient *client = container_of(vu_dev, VuClient, parent);
> > +    vdev_blk = client->blk;
> > +    memcpy(config, &vdev_blk->blkcfg, len);
> > +
> > +    return 0;
> > +}
> > +
> > +static int
> > +vub_set_config(VuDev *vu_dev, const uint8_t *data,
> > +               uint32_t offset, uint32_t size, uint32_t flags)
> > +{
> > +    VubDev *vdev_blk;
> > +
> > +    VuClient *client = container_of(vu_dev, VuClient, parent);
> > +    vdev_blk = client->blk;
> > +    uint8_t wce;
> > +
> > +    /* don't support live migration */
> > +    if (flags != VHOST_SET_CONFIG_TYPE_MASTER) {
> > +        return -1;
> > +    }
> > +
> > +
> > +    if (offset != offsetof(struct virtio_blk_config, wce) ||
> > +        size != 1) {
> > +        return -1;
> > +    }
> > +
> > +    wce = *data;
> > +    if (wce == vdev_blk->blkcfg.wce) {
> > +        /* Do nothing as same with old configuration */
> > +        return 0;
> > +    }
> > +
> > +    vdev_blk->blkcfg.wce = wce;
> > +    blk_set_enable_write_cache(vdev_blk->backend, true);
> > +    return 0;
> > +}
> > +
> > +
> > +/*
> > + * When the client disconnects, it send a VHOST_USER_NONE request
> > + * and vu_process_message will simple call exit which cause the VM
> > + * to exit abruptly.
> > + * To avoid this issue,  process VHOST_USER_NONE request ahead
> > + * of vu_process_message.
> > + *
> > + */
> > +static int vub_process_msg(VuDev *dev, VhostUserMsg *vmsg, int *do_reply)
> > +{
> > +    if (vmsg->request == VHOST_USER_NONE) {
> > +        dev->panic(dev, "disconnect");
> > +        return true;
> > +    }
> > +    return false;
> > +}
> > +
> > +static void
> > +vmsg_close_fds(VhostUserMsg *vmsg)
> > +{
> > +    int i;
> > +    for (i = 0; i < vmsg->fd_num; i++) {
> > +        close(vmsg->fds[i]);
> > +    }
> > +}
> > +
> > +static bool
> > +vu_message_read_co(VuDev *vu_dev, int conn_fd, VhostUserMsg *vmsg)
>
> Why is this function called *_co()?  It appears to be callable from
> outside coroutine context too.
>
> > +{
> > +    char control[CMSG_SPACE(VHOST_MEMORY_MAX_NREGIONS * sizeof(int))] = { };
> > +    struct iovec iov = {
> > +        .iov_base = (char *)vmsg,
> > +        .iov_len = VHOST_USER_HDR_SIZE,
> > +    };
> > +    struct msghdr msg = {
> > +        .msg_iov = &iov,
> > +        .msg_iovlen = 1,
> > +        .msg_control = control,
> > +        .msg_controllen = sizeof(control),
> > +    };
> > +    size_t fd_size;
> > +    struct cmsghdr *cmsg;
> > +    int rc;
> > +    char buffer[100];
> > +    VuClient *client = container_of(vu_dev, VuClient, parent);
> > +    QIOChannel *ioc = client->ioc;
> > +    do {
> > +        rc = recvmsg(conn_fd, &msg, 0);
> > +        if (rc < 0) {
> > +            if (errno == EAGAIN) {
> > +                if (qemu_in_coroutine()) {
> > +                    qio_channel_yield(ioc, G_IO_IN);
> > +                } else {
> > +                    qio_channel_wait(ioc, G_IO_IN);
> > +                }
> > +                continue;
> > +            } else if (errno == EINTR) {
> > +                continue;
> > +            }
> > +        }
> > +        break;
> > +    } while (true);
> > +
> > +    if (rc < 0) {
> > +        sprintf(buffer, "Error while recvmsg: %s", strerror(errno));
> > +        vub_panic_cb(vu_dev, buffer);
> > +        return false;
> > +    }
> > +
> > +    assert(rc == VHOST_USER_HDR_SIZE || rc == 0);
> > +
> > +    vmsg->fd_num = 0;
> > +    for (cmsg = CMSG_FIRSTHDR(&msg);
> > +         cmsg != NULL;
> > +         cmsg = CMSG_NXTHDR(&msg, cmsg))
> > +    {
> > +        if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS) {
> > +            fd_size = cmsg->cmsg_len - CMSG_LEN(0);
> > +            vmsg->fd_num = fd_size / sizeof(int);
> > +            memcpy(vmsg->fds, CMSG_DATA(cmsg), fd_size);
> > +            break;
> > +        }
> > +    }
> > +
> > +    if (vmsg->size > sizeof(vmsg->payload)) {
> > +        sprintf(buffer,
> > +                "Error: too big message request: %d, size: vmsg->size: %u, "
> > +                "while sizeof(vmsg->payload) = %zu\n",
> > +                vmsg->request, vmsg->size, sizeof(vmsg->payload));
> > +        vub_panic_cb(vu_dev, buffer);
> > +        goto fail;
> > +    }
> > +
> > +    if (vmsg->size) {
> > +        do {
> > +            rc = read(conn_fd, &vmsg->payload, vmsg->size);
> > +            if (rc < 0) {
> > +                if (errno == EAGAIN) {
> > +                    if (qemu_in_coroutine()) {
> > +                        qio_channel_yield(ioc, G_IO_IN);
> > +                    } else {
> > +                        qio_channel_wait(ioc, G_IO_IN);
> > +                    }
> > +                    continue;
> > +                } else if (errno == EINTR) {
> > +                    continue;
> > +                }
> > +            }
> > +            break;
> > +        } while (true);
> > +
> > +        if (rc <= 0) {
> > +            sprintf(buffer, "Error while reading: %s", strerror(errno));
> > +            vub_panic_cb(vu_dev, buffer);
> > +            goto fail;
> > +        }
> > +
> > +        assert(rc == vmsg->size);
> > +    }
> > +
> > +    return true;
> > +
> > +fail:
> > +    vmsg_close_fds(vmsg);
> > +
> > +    return false;
> > +}
> > +
> > +static void vub_kick_cb(void *opaque)
> > +{
> > +    vu_watch_cb_data *data = (vu_watch_cb_data *) opaque;
> > +    int index = data->index;
> > +    VuDev *dev = data->vu_dev;
> > +    VuVirtq *vq = &dev->vq[index];
> > +    int sock = vq->kick_fd;
> > +    eventfd_t kick_data;
> > +    ssize_t rc;
> > +
> > +    rc = eventfd_read(sock, &kick_data);
> > +    if (rc == -1) {
> > +        char buffer[100];
> > +        sprintf(buffer, "kick eventfd_read(): %s", strerror(errno));
> > +        vub_panic_cb(dev, buffer);
> > +        g_free(data);
> > +        dev->remove_watch(dev, dev->vq[index].kick_fd);
> > +    } else {
> > +        if (vq->handler) {
> > +            vq->handler(dev, index);
> > +        }
> > +    }
> > +}
>
> This is a generic vhost-user function that is not specific to
> vhost-user-blk.  It should be in a different source file.
>
> > +
> > +static const VuDevIface vub_iface = {
> > +    .get_features = vub_get_features,
> > +    .queue_set_started = vub_queue_set_started,
> > +    .get_protocol_features = vub_get_protocol_features,
> > +    .get_config = vub_get_config,
> > +    .set_config = vub_set_config,
> > +    .process_msg = vub_process_msg,
> > +    .read_msg = vu_message_read_co,
> > +    .kick_callback = vub_kick_cb,
> > +};
> > +
> > +
> > +void vub_free(VubDev *vub_dev, bool called_by_QOM)
> > +{
> > +    if (!vub_dev) {
> > +        return;
> > +    }
> > +
> > +    blk_unref(vub_dev->backend);
> > +    g_free(vub_dev->name);
> > +    g_free(vub_dev->unix_socket);
> > +
> > +    if (vub_dev->next.tqe_circ.tql_prev) {
> > +        /*
> > +         * if vub_dev->next.tqe_circ.tql_prev = null,
> > +         * vub_dev hasn't been inserted into the queue and
> > +         * vub_free is called by obj->instance_finalize.
> > +         */
> > +        QTAILQ_REMOVE(&vub_devs, vub_dev, next);
> > +    }
> > +    /*
> > +     * Needn't to free vub_dev if called by QOM
> > +     * because QOM will do the clean-up work.
> > +     */
> > +    if (!called_by_QOM) {
> > +        g_free(vub_dev);
> > +    }
> > +}
> > +
> > +static coroutine_fn void vu_client_trip(void *opaque)
> > +{
> > +    VuClient *client = opaque;
> > +
> > +    while (!client->closed) {
> > +        vu_dispatch(&client->parent);
> > +    }
> > +
> > +    QTAILQ_REMOVE(&client->blk->clients, client, next);
> > +
> > +}
> > +
> > +static void vu_client_start(VuClient *client)
> > +{
> > +    Coroutine *co = qemu_coroutine_create(vu_client_trip, client);
> > +    qemu_coroutine_enter(co);
> > +}
>
> What is the coroutine and threading model here?  This launches
> vu_client_trip() in the current thread's event loop but vu_dispatch()
> performs blocking I/O and will prevent the event loop from continuing.
>
> > +
> > +
> > +G_STATIC_ASSERT((int)G_IO_IN == (int)VU_WATCH_IN);
> > +G_STATIC_ASSERT((int)G_IO_OUT == (int)VU_WATCH_OUT);
> > +G_STATIC_ASSERT((int)G_IO_PRI == (int)VU_WATCH_PRI);
> > +G_STATIC_ASSERT((int)G_IO_ERR == (int)VU_WATCH_ERR);
> > +G_STATIC_ASSERT((int)G_IO_HUP == (int)VU_WATCH_HUP);
> > +
> > +static void
> > +set_watch(VuDev *vu_dev, int fd, int vu_evt,
> > +          vu_watch_cb_packed_data cb, void *pvt)
> > +{
> > +    /*
> > +     * since aio_dispatch can only pass one user data pointer to the
> > +     * callback function, pack VuDev, pvt into a struct
> > +     */
> > +    VuClient *client;
> > +
> > +    g_assert(vu_dev);
> > +    g_assert(fd >= 0);
> > +    g_assert(cb);
> > +    client = container_of(vu_dev, VuClient, parent);
> > +    vu_watch_cb_data *cb_data = g_new0(vu_watch_cb_data, 1);
> > +    cb_data->index = (intptr_t) pvt;
> > +    cb_data->vu_dev = vu_dev;
> > +    aio_set_fd_handler(client->blk->ctx, fd, false, (void *) cb,
> > +                       NULL, NULL, cb_data);
> > +}
> > +
> > +
> > +void vub_accept(QIONetListener *listener, QIOChannelSocket *sioc,
> > +                gpointer opaque)
> > +{
> > +    VuClient *client;
> > +    VubDev *vub_device = opaque;
> > +    client = g_new0(VuClient, 1);
>
> Is it helpful to have a separate VuClient struct even though only a
> single vhost-user client can be connected at a time?  It may be simpler
> to keep connection state directly in VubDev.
>
> > +
> > +    if (!vu_init_packed_data(&client->parent, VHOST_USER_BLK_MAX_QUEUES,
> > +                             sioc->fd, vub_panic_cb, set_watch,
> > +                             remove_watch, &vub_iface)) {
> > +        fprintf(stderr, "Failed to initialized libvhost-user\n");
> > +        g_free(client);
> > +        return;
> > +    }
> > +
> > +    client->blk = vub_device;
> > +    client->refcount = 1;
> > +    client->sioc = sioc;
> > +    /*
> > +     * increase the object reference, so cioc will not freed by
> > +     * qio_net_listener_channel_func which will call object_unref(OBJECT(sioc))
> > +     */
> > +    object_ref(OBJECT(client->sioc));
> > +    qio_channel_set_name(QIO_CHANNEL(sioc), "vhost-user client");
> > +    client->ioc = QIO_CHANNEL(sioc);
> > +    object_ref(OBJECT(client->ioc));
> > +    object_ref(OBJECT(sioc));
> > +
> > +    qio_channel_set_blocking(QIO_CHANNEL(client->sioc), false, NULL);
> > +    client->closed = false;
> > +    QTAILQ_INSERT_TAIL(&client->blk->clients, client, next);
> > +    vu_client_start(client);
> > +}
> > +
> > +
> > +void
> > +vub_initialize_config(BlockDriverState *bs, struct virtio_blk_config *config)
> > +{
> > +    config->capacity = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
> > +    config->blk_size = BDRV_SECTOR_SIZE;
>
> At some point this should become a configuration parameter because using
> 512B sectors on a native 4K sector image file results in poor
> performance.
>
> > +    config->size_max = 65536;
>
> Why 65536?  hw/block/virtio-blk.c sets it to 0.
>
> > +    config->seg_max = 128 - 2;
>
> This is okay for now but should be changed in the future.
>
> hw/block/virtio-blk.c was recently modified to calculate seg_max based
> on the virtqueue size (which is where the hardcoded 128 originally came
> from).  Applications that use large buffer sizes (128+ KB) benefit from
> larger virtqueue sizes and seg_max.
>
> > +    config->min_io_size = 1;
> > +    config->opt_io_size = 1;
> > +    config->num_queues = 1;
> > +    #if defined(__linux__) && defined(BLKDISCARD) && defined(BLKZEROOUT)
> > +    config->max_discard_sectors = 32768;
> > +    config->max_discard_seg = 1;
> > +    config->discard_sector_alignment = config->blk_size >> 9;
> > +    config->max_write_zeroes_sectors = 32768;
> > +    config->max_write_zeroes_seg = 1;
> > +    #endif
> > +}
> > +
> > +
> > +static VubDev *vub_new(VubDev *vub_device, const char *name,
> > +                       const char *unix_socket, bool writable, Error **errp)
>
> This function does not allocate and return a new VubDev, so vub_new() is
> not an accurate name for this function.  The name vub_init() is clearer.
>
> Why are name, unix_socket, and writable passed in?  They are already
> vub_device fields.
>
> > +{
> > +
> > +    BlockBackend *blk;
> > +
> > +    /*
> > +     * Don't allow resize while the vhost user server is running,
> > +     * otherwise we don't care what happens with the node.
> > +     */
> > +    uint64_t perm = BLK_PERM_CONSISTENT_READ;
> > +    int ret;
> > +
> > +    AioContext *ctx;
> > +
> > +    BlockDriverState *bs = bdrv_lookup_bs(name,
> > +                                          name,
> > +                                          errp);
> > +
> > +    if (!bs) {
> > +        error_setg(errp,
> > +                   "No drive with name '%s'."
> > +                   " Please find the list of names with "
> > +                   "'info block'", name);
> > +        return NULL;
> > +    }
> > +
> > +    if (bdrv_is_read_only(bs)) {
> > +        writable = false;
> > +    }
> > +
> > +    if (writable) {
> > +        perm |= BLK_PERM_WRITE;
> > +    }
> > +
> > +    ctx = bdrv_get_aio_context(bs);
> > +    aio_context_acquire(ctx);
> > +    bdrv_invalidate_cache(bs, NULL);
> > +    aio_context_release(ctx);
> > +
> > +    blk = blk_new(bdrv_get_aio_context(bs), perm,
> > +                  BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE_UNCHANGED |
> > +                  BLK_PERM_WRITE | BLK_PERM_GRAPH_MOD);
> > +    ret = blk_insert_bs(blk, bs, errp);
> > +
> > +    if (ret < 0) {
> > +        goto fail;
> > +    }
> > +
> > +
> > +    blk_set_enable_write_cache(blk, false);
> > +
> > +    blk_set_allow_aio_context_change(blk, true);
> > +
> > +
> > +    vub_device->name = g_strdup(name);
> > +    vub_device->unix_socket = g_strdup(unix_socket);
> > +    vub_device->writable = writable;
> > +    vub_device->blkcfg.wce = 0;
> > +    vub_device->backend = blk;
> > +    vub_device->ctx = ctx;
> > +    vub_initialize_config(bs, &vub_device->blkcfg);
> > +    return vub_device;
> > +
> > +fail:
> > +    blk_unref(blk);
> > +    return NULL;
> > +}
> > +
> > +void vhost_user_server_free(VubDev *vub_device, bool called_by_QOM)
> > +{
> > +    if (!vub_device) {
> > +        return;
> > +    }
> > +
> > +    VuClient *client, *next;
> > +    QTAILQ_FOREACH_SAFE(client, &vub_device->clients, next, next) {
> > +        if (!client->closed) {
> > +            close_client(client);
> > +        }
> > +    }
> > +
> > +    if (vub_device->listener) {
> > +        qio_net_listener_disconnect(vub_device->listener);
> > +        object_unref(OBJECT(vub_device->listener));
> > +    }
> > +    vub_free(vub_device, called_by_QOM);
> > +
> > +}
> > +
> > +
> > +VubDev *vub_dev_find(const char *name)
> > +{
> > +    VubDev *vub_device;
> > +    QTAILQ_FOREACH(vub_device, &vub_devs, next) {
> > +        if (strcmp(name, vub_device->name) == 0) {
> > +            return vub_device;
> > +        }
> > +    }
> > +
> > +    return NULL;
> > +}
> > +
> > +
> > +static VubDev *vub_dev_find_by_unix_socket(const char *unix_socket)
> > +{
> > +    VubDev *vub_device;
> > +    QTAILQ_FOREACH(vub_device, &vub_devs, next) {
> > +        if (strcmp(unix_socket, vub_device->unix_socket) == 0) {
> > +            return vub_device;
> > +        }
> > +    }
> > +
> > +    return NULL;
> > +}
> > +
> > +static void vhost_user_server_start(VubDev *vub_device, const char *unix_socket,
> > +                                    const char *name, bool writable,
> > +                                    Error **errp)
> > +{
> > +
> > +    if (vub_dev_find(name) || vub_dev_find_by_unix_socket(unix_socket)) {
> > +        error_setg(errp, "Vhost user server with name '%s' or "
> > +                "with socket_path '%s' has already been started",
> > +                name, unix_socket);
> > +        return;
> > +    }
> > +
> > +
> > +    if (!vub_new(vub_device, name, unix_socket, writable, errp)) {
> > +        return;
> > +    }
> > +
> > +
> > +    vub_device->listener = qio_net_listener_new();
> > +
> > +    qio_net_listener_set_name(vub_device->listener,
> > +                              "vhost-user-backend-listener");
> > +
> > +    SocketAddress *addr = g_new0(SocketAddress, 1);
>
> addr is never freed.  This variable could live on the stack instead:
>
>   SocketAddress addr = {};
>
> > +    addr->u.q_unix.path = (char *) unix_socket;
> > +    addr->type = SOCKET_ADDRESS_TYPE_UNIX;
> > +    if (qio_net_listener_open_sync(vub_device->listener, addr, 1, errp) < 0) {
> > +        goto error;
> > +    }
> > +
> > +
> > +    QTAILQ_INSERT_TAIL(&vub_devs, vub_device, next);
> > +    QTAILQ_INIT(&vub_device->clients);
> > +
> > +    qio_net_listener_set_client_func(vub_device->listener,
> > +                                     vub_accept,
> > +                                     vub_device,
> > +                                     NULL);
> > +
> > +    return;
> > +
> > + error:
> > +    vub_free(vub_device, false);
> > +}
> > +
> > +static void vu_set_block_name(Object *obj, const char *value,
> > +                                           Error **errp)
> > +{
> > +    VubDev *vus = VHOST_USER_SERVER(obj);;
> > +
> > +    if (vus->name) {
> > +        error_setg(errp, "evdev property already set");
> > +        return;
> > +    }
> > +
> > +    vus->name = g_strdup(value);
> > +}
> > +
> > +static char *vu_get_block_name(Object *obj, Error **errp)
> > +{
> > +    VubDev *vus = VHOST_USER_SERVER(obj);
> > +    return g_strdup(vus->name);
> > +}
> > +
> > +
> > +static void vu_set_unix_socket(Object *obj, const char *value,
> > +                                            Error **errp)
> > +{
> > +    VubDev *vus = VHOST_USER_SERVER(obj);;
> > +
> > +    if (vus->unix_socket) {
> > +        error_setg(errp, "unix_socket property already set");
> > +        return;
> > +    }
> > +
> > +    vus->unix_socket = g_strdup(value);
> > +    vhost_user_server_start(vus, value, vus->name,
> > +                            vus->writable, errp);
>
> Property setters should only perform input validation and store the
> data.  Actions like creating network connections, opening files, etc
> should happen later in a UserCreatableClass->complete() callback.
>
> This is necessary because vus->writable is also a property and may be
> set after unix_socket.  The ->complete() callback is called after all
> setters so it can access the final values of all properties.
>
> See iothread_class_init() and iothread_complete() for an example.
>
> > +}
> > +
> > +static char *vu_get_unix_socket(Object *obj, Error **errp)
> > +{
> > +    VubDev *vus = VHOST_USER_SERVER(obj);;
> > +    return g_strdup(vus->unix_socket);
> > +}
> > +
> > +static bool vu_get_block_writable(Object *obj, Error **errp)
> > +{
> > +    VubDev *vus = VHOST_USER_SERVER(obj);;
> > +    return vus->writable;
> > +}
> > +
> > +static void vu_set_block_writable(Object *obj, bool value, Error **errp)
> > +{
> > +    VubDev *vus = VHOST_USER_SERVER(obj);
> > +
> > +    vus->writable = value;
> > +}
> > +
> > +static void vhost_user_server_instance_init(Object *obj)
> > +{
> > +
> > +    object_property_add_bool(obj, "writable",
> > +                            vu_get_block_writable,
> > +                            vu_set_block_writable, NULL);
> > +
> > +    object_property_add_str(obj, "name",
> > +                            vu_get_block_name,
> > +                            vu_set_block_name, NULL);
> > +
> > +    object_property_add_str(obj, "unix_socket",
> > +                            vu_get_unix_socket,
> > +                            vu_set_unix_socket, NULL);
> > +
> > +}
> > +
> > +static void vhost_user_server_instance_finalize(Object *obj)
> > +{
> > +    VubDev *vus = VHOST_USER_SERVER(obj);
> > +    vhost_user_server_free(vus, true);
> > +    /* object_del shouldn't free this object struct */
> > +    obj->free = NULL;
> > +}
> > +
> > +static const TypeInfo vhost_user_server_info = {
> > +    .name = TYPE_VHOST_USER_SERVER,
> > +    .parent = TYPE_OBJECT,
> > +    .instance_size = sizeof(VuDev),
> > +    .instance_init = vhost_user_server_instance_init,
> > +    .instance_finalize = vhost_user_server_instance_finalize,
> > +    .interfaces = (InterfaceInfo[]) {
> > +        {TYPE_USER_CREATABLE},
> > +        {}
> > +    },
> > +};
> > +
> > +static void vhost_user_server_register_types(void)
> > +{
> > +    type_register_static(&vhost_user_server_info);
> > +}
> > +
> > +type_init(vhost_user_server_register_types)
> > +
> > diff --git a/include/block/vhost-user.h b/include/block/vhost-user.h
> > new file mode 100644
> > index 0000000000..ef6d695244
> > --- /dev/null
> > +++ b/include/block/vhost-user.h
> > @@ -0,0 +1,46 @@
> > +#include "io/channel-socket.h"
> > +#include "io/net-listener.h"
> > +#include "contrib/libvhost-user/libvhost-user.h"
> > +#include "standard-headers/linux/virtio_blk.h"
> > +typedef struct VubDev VubDev;
> > +typedef struct VuClient VuClient;
> > +#define TYPE_VHOST_USER_SERVER "vhost-user-server"
> > +
> > +#define VHOST_USER_SERVER(obj) \
> > +   OBJECT_CHECK(VubDev, obj, TYPE_VHOST_USER_SERVER)
> > +/* vhost user block device */
> > +struct VubDev {
> > +    Object parent_obj;
> > +    char *name;
> > +    char *unix_socket;
> > +    bool exit_panic;
> > +    bool close;
> > +    BlockBackend *backend;
> > +    AioContext *ctx;
> > +    QIONetListener *listener;
> > +    QIOChannelSocket *sioc;
> > +    QTAILQ_HEAD(, VuClient) clients;
> > +    QTAILQ_ENTRY(VubDev) next;
> > +    struct virtio_blk_config blkcfg;
> > +    bool writable;
> > +};
> > +
> > +struct VuClient {
> > +    VuDev parent;
> > +    int refcount;
> > +    VubDev *blk;
> > +    QIOChannelSocket *sioc; /* The underlying data channel */
> > +    QIOChannel *ioc; /* The current I/O channel */
> > +    QTAILQ_ENTRY(VuClient) next;
> > +    bool closed;
> > +};
> > +VubDev *vub_dev_find(const char *name);
>
> All -object instances already have an id=ID property.  There is no need
> to declare a separate "name" property.  Please look at iothread_by_id()
> and iothread_get_id() for examples.
>
> > +
> > +void vhost_user_server_free(VubDev *vub_device, bool called_by_QOM);
> > +void vub_accept(QIONetListener *listener, QIOChannelSocket *sioc,
> > +                gpointer opaque);
> > +
> > +void vub_free(VubDev *vub_dev, bool called_by_QOM);
> > +
> > +void vub_initialize_config(BlockDriverState *bs,
> > +                           struct virtio_blk_config *config);
> > diff --git a/vl.c b/vl.c
> > index 86474a55c9..72ac506342 100644
> > --- a/vl.c
> > +++ b/vl.c
> > @@ -2553,6 +2553,10 @@ static bool object_create_initial(const char *type, QemuOpts *opts)
> >      }
> >  #endif
> >
> > +    /* Reason: vhost-user-server property "name" */
> > +    if (g_str_equal(type, "vhost-user-server")) {
> > +        return false;
> > +    }
>
> I don't understand why the "name" property introduces a creation order
> dependency.  It's just a string and has no dependency on other
> command-line objects.  Can you explain why this change is necessary?
>
> >      /*
> >       * Reason: filter-* property "netdev" etc.
> >       */
> > --
> > 2.24.1
> >



--
Best regards,
Coiby


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH v2 1/5] vhost-user block device backend
  2020-01-16 13:56   ` Kevin Wolf
@ 2020-02-20  7:04     ` Coiby Xu
  2020-02-20  9:30     ` Coiby Xu
  1 sibling, 0 replies; 19+ messages in thread
From: Coiby Xu @ 2020-02-20  7:04 UTC (permalink / raw)
  To: Kevin Wolf; +Cc: bharatlkmlkvm, qemu-devel, stefanha

Hi Kevin,

Thank you for reviewing my work in a rather detailed way.

> >  blockdev-vu.c              | 1008 ++++++++++++++++++++++++++++++++++++
> >  include/block/vhost-user.h |   46 ++
> >  vl.c                       |    4 +
> >  3 files changed, 1058 insertions(+)
> >  create mode 100644 blockdev-vu.c
> >  create mode 100644 include/block/vhost-user.h

> This adds a single, relatively big source file. I see at least two
> parts: The generic vhost-user infrastructure with connection handling
> etc. and the implementation of the specific vhost-user-blk device.
> Separating these into two files is probably a good idea.

> I would also suggest to put the files in a new subdirectory
> block/export/ and call them vhost-user.c/vhost-user-blk.c. The new
> header file can be in the same directory as it shouldn't be used by
> anyone else.

I've split blockdev-vu.c into two separate files, but placed them in
different subdirectories than the one suggested:
 - backends/vhost-user-blk-server.c
 - util/vhost-user-server.c

> > +static QTAILQ_HEAD(, VubDev) vub_devs = QTAILQ_HEAD_INITIALIZER(vub_devs);
> > +
> > +
> > +typedef struct VubReq {
> > +    VuVirtqElement *elem;

> Maybe worth a comment that this was allocated with plain malloc(), so
> you must use free() rather than g_free() (which would be the default in
> QEMU)?

Although VuVirtqElement is created using malloc, VubReq is created using g_new0.


I missed several suggestions in v3, but all of them have been applied
in v4. Thank you!

On Thu, Jan 16, 2020 at 9:56 PM Kevin Wolf <kwolf@redhat.com> wrote:
>
> Hi,
>
> I'm only doing a quick first review pointing out the more obvious
> things while I familiarise myself with your code. I intend to review it
> in more detail later (either in a second pass for this series, or when
> you post v3).
>
> Am 14.01.2020 um 15:06 hat Coiby Xu geschrieben:
> > By making use of libvhost, multiple block device drives can be exported and each drive can serve multiple clients simultaneously. Since vhost-user-server needs a block drive to be created first, delay the creation of this object.
> >
> > Signed-off-by: Coiby Xu <coiby.xu@gmail.com>
>
> Please wrap the commit message at 72 characters.
>
> >  blockdev-vu.c              | 1008 ++++++++++++++++++++++++++++++++++++
> >  include/block/vhost-user.h |   46 ++
> >  vl.c                       |    4 +
> >  3 files changed, 1058 insertions(+)
> >  create mode 100644 blockdev-vu.c
> >  create mode 100644 include/block/vhost-user.h
>
> This adds a single, relatively big source file. I see at least two
> parts: The generic vhost-user infrastructure with connection handling
> etc. and the implementation of the specific vhost-user-blk device.
> Separating these into two files is probably a good idea.
>
> I would also suggest to put the files in a new subdirectory
> block/export/ and call them vhost-user.c/vhost-user-blk.c. The new
> header file can be in the same directory as it shouldn't be used by
> anyone else.
>
> > diff --git a/blockdev-vu.c b/blockdev-vu.c
> > new file mode 100644
> > index 0000000000..45f0bb43a7
> > --- /dev/null
> > +++ b/blockdev-vu.c
> > @@ -0,0 +1,1008 @@
>
> The LICENSE file clarifies that files without a license header are
> GPLv2+, so it's not strictly a problem, but I think it is good style to
> include a license header that explicitly tells so.
>
> > +#include "qemu/osdep.h"
> > +#include "block/vhost-user.h"
> > +#include "qapi/error.h"
> > +#include "qapi/qapi-types-sockets.h"
> > +#include "qapi/qapi-commands-block.h"
> > +
> > +#include "sysemu/block-backend.h"
> > +#include "qemu/main-loop.h"
> > +
> > +#include "qemu/units.h"
> > +
> > +#include "block/block.h"
> > +
> > +#include "qom/object_interfaces.h"
> > +
> > +#include <sys/eventfd.h>
> > +
> > +#include "hw/qdev-properties.h"
>
> Does the order of includes and the empty lines between them signify
> anything? If not, I suggest just sorting them alphabetically (and maybe
> using empty lines between different subdirectories if you like this
> better than a single large block).
>
> According to CODING_STYLE.rst, system headers like <sys/eventfd.h> come
> before all QEMU headers (except qemu/osdep.h, which always must come
> first).
>
> > +enum {
> > +    VHOST_USER_BLK_MAX_QUEUES = 8,
> > +};
> > +
> > +struct virtio_blk_inhdr {
> > +    unsigned char status;
> > +};
> > +
> > +
> > +static QTAILQ_HEAD(, VubDev) vub_devs = QTAILQ_HEAD_INITIALIZER(vub_devs);
> > +
> > +
> > +typedef struct VubReq {
> > +    VuVirtqElement *elem;
>
> Maybe worth a comment that this was allocated with plain malloc(), so
> you must use free() rather than g_free() (which would be the default in
> QEMU)?
>
> > +    int64_t sector_num;
> > +    size_t size;
> > +    struct virtio_blk_inhdr *in;
> > +    struct virtio_blk_outhdr out;
> > +    VuClient *client;
> > +    struct VuVirtq *vq;
> > +} VubReq;
>
> I'm not completely sure yet, but I think I would prefer VuBlock to Vub
> in the type names. Some may even prefer VhostUserBlock, but I can see
> that this would be quite lengthy.
>
> > +static void
> > +remove_watch(VuDev *vu_dev, int fd)
> > +{
> > +    VuClient *client;
> > +
> > +    g_assert(vu_dev);
> > +    g_assert(fd >= 0);
> > +
> > +    client = container_of(vu_dev, VuClient, parent);
> > +    aio_set_fd_handler(client->blk->ctx, fd, false, NULL, NULL, NULL, NULL);
> > +}
> > +
> > +static void close_client(VuClient *client)
> > +{
> > +    vu_deinit(&client->parent);
> > +    /** g_source_destroy(vub_device->parent.src); */
>
> Leftover from conversion?
>
> > +    client->sioc = NULL;
> > +    object_unref(OBJECT(client->ioc));
> > +    client->closed = true;
> > +
> > +}
> > +
> > +static void vub_panic_cb(VuDev *vu_dev, const char *buf)
>
> You use a lot of sprintf() before calling this function. Would it be
> worth taking a printf-like format parameter instead of buf and using a
> variable argument list?
>
> > +{
> > +    if (buf) {
> > +        g_warning("vu_panic: %s", buf);
>
> I think QEMU proper doesn't use g_warning() anywhere. This could be
> error_report() or warn_report(). (Or if you use a format string
> error_vreport() and warn_vreport().)
>
> > +    }
> > +
> > +    VuClient *client = container_of(vu_dev, VuClient, parent);
> > +    if (client->blk->exit_panic) {
> > +        client->blk->close = true;
> > +    }
> > +    if (!client->closed) {
> > +        close_client(client);
> > +    }
> > +}
> > +
> > +
> > +static void vub_req_complete(VubReq *req)
> > +{
> > +    VuDev *vu_dev = &req->client->parent;
> > +
> > +    /* IO size with 1 extra status byte */
> > +    vu_queue_push(vu_dev, req->vq, req->elem,
> > +                  req->size + 1);
>
> I think this fits in a single line.
>
> > +    vu_queue_notify(vu_dev, req->vq);
> > +
> > +    if (req->elem) {
> > +        free(req->elem);
> > +    }
> > +
> > +    g_free(req);
> > +}
> > +
> > +
> > +
> > +static int
> > +vub_discard_write_zeroes(VubReq *req, struct iovec *iov, uint32_t iovcnt,
> > +                         uint32_t type)
> > +{
> > +    struct virtio_blk_discard_write_zeroes *desc;
> > +    ssize_t size;
> > +    void *buf;
> > +
> > +    size = iov_size(iov, iovcnt);
> > +    if (size != sizeof(*desc)) {
> > +        fprintf(stderr, "Invalid size %ld, expect %ld\n", size, sizeof(*desc));
> > +        return -1;
>
> This would be error_report(), too. (More cases below, I'll ignore them
> now.)
>
> I would prefer consistent use of -errno instead of -1 for error cases if
> you don't mind. I guess this would be -EINVAL here. I won't mention it
> for all the other cases; if you want to make the change, you need to
> make it everywhere, obviously.
>
> > +    }
> > +    buf = g_new0(char, size);
> > +
> > +    iov_to_buf_full(iov, iovcnt, 0, buf, size);
>
> I think uint8_t describes better than char what we want here: A buffer
> of bytes.
>
> The empty line would make more sense to me above the g_new0() line than
> after it because it starts a new section that deals with the buffer. In
> general, the use of empty lines feels a bit inconsistent in this patch.
> You may want to go over them again.
>
> > +
> > +    #if defined(__linux__) && defined(BLKDISCARD) && defined(BLKZEROOUT)
>
> Preprocessor directives should be unindented.
>
> However, I don't think any of this code actually depends on Linux,
> BLKDISCARD or BLKZEROOUT. You can just call blk_pdiscard() and
> blk_pwrite_zeroes() and they will do whatever is necessary to perform
> the operation on the backend (which might not be a Linux block device,
> but could be a regular file or even using a network protocol like NBD).
>
> > +    VubDev *vdev_blk;
> > +    VuClient *client = container_of(dev, VuClient, parent);
> > +    vdev_blk = client->blk;
> > +    desc = (struct virtio_blk_discard_write_zeroes *)buf;
> > +    uint64_t range[2] = { le64toh(desc->sector) << 9,
> > +                          le32toh(desc->num_sectors) << 9 };
> > +    if (type == VIRTIO_BLK_T_DISCARD) {
> > +        if (blk_pdiscard(vdev_blk->blk, range[0], range[1]) == 0) {
> > +            g_free(buf);
> > +            return 0;
> > +        }
> > +    } else if (type == VIRTIO_BLK_T_WRITE_ZEROES) {
> > +        if (blk_pwrite_zeroes(vdev_blk->blk, range[0], range[1]) == 0) {
> > +            g_free(buf);
> > +            return 0;
> > +        }
>
> blk_pdiscard() and blk_pwrite_zeroes() are synchronous functions. In
> other words, the guest will be blocked until the I/O is complete. We
> cannot do this.
>
> I think you should let vub_virtio_process_req() run in a coroutine so
> that you can call blk_co_pdiscard() and blk_co_pwrite_zeroes() here.
>
> > +    }
> > +    #endif
> > +
> > +    g_free(buf);
> > +    return -1;
> > +}
> > +
> > +
> > +static void
> > +vub_flush(VubReq *req)
> > +{
> > +    VuClient *client = req->client;
> > +    blk_co_flush(client->blk->backend);
>
> You can't call blk_co_flush() from outside coroutine context. This code
> will be right after you move vub_virtio_process_req() to a coroutine,
> though (which will make this function a coroutine_fn).
>
> > +}
> > +
> > +
> > +#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
> > +typedef struct BlkRwCo {
> > +    BlockBackend *blk;
> > +    int64_t offset;
> > +    void *iobuf;
> > +    int ret;
> > +    BdrvRequestFlags flags;
> > +} BlkRwCo;
> > +
> > +static void blk_read_entry(void *opaque)
> > +{
> > +    BlkRwCo *rwco = opaque;
> > +    QEMUIOVector *qiov = rwco->iobuf;
> > +
> > +    rwco->ret = blk_co_preadv(rwco->blk, rwco->offset, qiov->size,
> > +                              qiov, rwco->flags);
> > +    aio_wait_kick();
> > +}
> > +
> > +
> > +static void blk_write_entry(void *opaque)
> > +{
> > +    BlkRwCo *rwco = opaque;
> > +    QEMUIOVector *qiov = rwco->iobuf;
> > +
> > +    rwco->ret = blk_co_pwritev(rwco->blk, rwco->offset, qiov->size,
> > +                              qiov, rwco->flags);
> > +    aio_wait_kick();
> > +}
> > +
> > +
> > +static int blk_prw(BlockBackend *blk, QEMUIOVector *qiov, int64_t offset,
> > +                   CoroutineEntry co_entry, BdrvRequestFlags flags)
> > +{
> > +
> > +    BlkRwCo rwco = {
> > +        .blk    = blk,
> > +        .offset = offset,
> > +        .iobuf  = qiov,
> > +        .flags  = flags,
> > +        .ret    = NOT_DONE,
> > +    };
> > +
> > +    if (qemu_in_coroutine()) {
> > +        /* Fast-path if already in coroutine context */
> > +        co_entry(&rwco);
> > +    } else {
> > +        Coroutine *co = qemu_coroutine_create(co_entry, &rwco);
> > +        bdrv_coroutine_enter(blk_bs(blk), co);
> > +        BDRV_POLL_WHILE(blk_bs(blk), rwco.ret == NOT_DONE);
> > +    }
> > +
> > +    return rwco.ret;
> > +}
>
> This is copy&paste from block-backend.c. We should certainly not do
> this. I think it will automatically go away when you can use
> blk_co_preadv() and blk_co_pwritev() directly.
>
> Note that the BDRV_POLL_WHILE() means that like above, we would be
> waiting for the request to complete. This would block the guest and
> would also not allow parallel requests, killing the I/O performance of
> our vhost-user export.
>
> > +
> > +static ssize_t
> > +vub_rwv(VubReq *req, struct iovec *iov,
> > +        uint32_t iovcnt,
> > +        CoroutineEntry co_entry)
>
> I don't understand the line wrapping here. :-)
>
> > +{
> > +    VuClient *client = req->client;
> > +    ssize_t rc;
> > +
> > +    if (!iovcnt) {
> > +        fprintf(stderr, "Invalid Read/Write IOV count\n");
> > +        return -1;
> > +    }
> > +
> > +    int64_t offset = req->sector_num * 512;
> > +    QEMUIOVector *qiov = g_new0(QEMUIOVector, 1);
> > +    qemu_iovec_init_external(qiov, iov, iovcnt);
> > +    rc = blk_prw(client->blk->backend, qiov, offset, co_entry, 0);
> > +
> > +    req->size = iov_size(iov, iovcnt);
>
> You can use qiov->size instead of duplicating this information into a
> separate VubReq field.
>
> > +    if (rc < 0) {
> > +        fprintf(stderr, "%s, Sector %"PRIu64", Size %lu failed with %s\n",
> > +                client->blk->name, req->sector_num, req->size,
> > +                strerror(errno));
> > +        return -1;
> > +    }
> > +
> > +    return rc;
> > +}
> > +
> > +static int vub_virtio_process_req(VuClient *client,
> > +                                     VuVirtq *vq)
>
> Indentation is off. This could be a single line anyway.
>
> > +{
> > +    VuDev *vu_dev = &client->parent;
> > +    VuVirtqElement *elem;
> > +    uint32_t type;
> > +    VubReq *req;
> > +
> > +    elem = vu_queue_pop(vu_dev, vq, sizeof(VuVirtqElement) + sizeof(VubReq));
> > +    if (!elem) {
> > +        return -1;
> > +    }
> > +
> > +    struct iovec *in_iov = elem->in_sg;
> > +    struct iovec *out_iov = elem->out_sg;
> > +    unsigned in_num = elem->in_num;
> > +    unsigned out_num = elem->out_num;
> > +    /* refer to hw/block/virtio_blk.c */
> > +    if (elem->out_num < 1 || elem->in_num < 1) {
> > +        fprintf(stderr, "virtio-blk request missing headers\n");
> > +        free(elem);
> > +        return -1;
> > +    }
> > +
> > +    req = g_new0(VubReq, 1);
> > +    req->client = client;
> > +    req->vq = vq;
> > +    req->elem = elem;
> > +
> > +    if (unlikely(iov_to_buf(out_iov, out_num, 0, &req->out,
> > +                            sizeof(req->out)) != sizeof(req->out))) {
> > +        fprintf(stderr, "virtio-blk request outhdr too short");
> > +        goto err;
> > +    }
> > +
> > +    iov_discard_front(&out_iov, &out_num, sizeof(req->out));
> > +
> > +    if (in_iov[in_num - 1].iov_len < sizeof(struct virtio_blk_inhdr)) {
> > +        fprintf(stderr, "virtio-blk request inhdr too short");
> > +        goto err;
> > +    }
> > +
> > +    /* We always touch the last byte, so just see how big in_iov is.  */
> > +    req->in = (void *)in_iov[in_num - 1].iov_base
> > +              + in_iov[in_num - 1].iov_len
> > +              - sizeof(struct virtio_blk_inhdr);
> > +    iov_discard_back(in_iov, &in_num, sizeof(struct virtio_blk_inhdr));
> > +
> > +
> > +    type = le32toh(req->out.type);
> > +    switch (type & ~VIRTIO_BLK_T_BARRIER) {
> > +    case VIRTIO_BLK_T_IN:
> > +    case VIRTIO_BLK_T_OUT: {
> > +        ssize_t ret = 0;
> > +        bool is_write = type & VIRTIO_BLK_T_OUT;
> > +        req->sector_num = le64toh(req->out.sector);
> > +        if (is_write) {
> > +            ret = vub_rwv(req, out_iov, out_num, blk_write_entry);
> > +        } else {
> > +            ret = vub_rwv(req, in_iov, in_num, blk_read_entry);
> > +        }
> > +        if (ret >= 0) {
> > +            req->in->status = VIRTIO_BLK_S_OK;
> > +        } else {
> > +            req->in->status = VIRTIO_BLK_S_IOERR;
> > +        }
> > +        vub_req_complete(req);
> > +        break;
> > +    }
> > +    case VIRTIO_BLK_T_FLUSH:
> > +        vub_flush(req);
> > +        req->in->status = VIRTIO_BLK_S_OK;
> > +        vub_req_complete(req);
> > +        break;
> > +    case VIRTIO_BLK_T_GET_ID: {
> > +        size_t size = MIN(iov_size(&elem->in_sg[0], in_num),
> > +                          VIRTIO_BLK_ID_BYTES);
> > +        snprintf(elem->in_sg[0].iov_base, size, "%s", "vhost_user_blk");
> > +        req->in->status = VIRTIO_BLK_S_OK;
> > +        req->size = elem->in_sg[0].iov_len;
> > +        vub_req_complete(req);
> > +        break;
> > +    }
> > +    case VIRTIO_BLK_T_DISCARD:
> > +    case VIRTIO_BLK_T_WRITE_ZEROES: {
> > +        int rc;
> > +        rc = vub_discard_write_zeroes(req, &elem->out_sg[1], out_num, type);
> > +        if (rc == 0) {
> > +            req->in->status = VIRTIO_BLK_S_OK;
> > +        } else {
> > +            req->in->status = VIRTIO_BLK_S_IOERR;
> > +        }
> > +        vub_req_complete(req);
> > +        break;
> > +    }
> > +    default:
> > +        req->in->status = VIRTIO_BLK_S_UNSUPP;
> > +        vub_req_complete(req);
> > +        break;
> > +    }
> > +
> > +    return 0;
> > +
> > +err:
> > +    free(elem);
> > +    g_free(req);
> > +    return -1;
> > +}
> > +
> > +
> > +static void vub_process_vq(VuDev *vu_dev, int idx)
> > +{
> > +    VuClient *client;
> > +    VuVirtq *vq;
> > +    int ret;
> > +
> > +    client = container_of(vu_dev, VuClient, parent);
> > +    assert(client);
> > +
> > +    vq = vu_get_queue(vu_dev, idx);
> > +    assert(vq);
> > +
> > +    while (1) {
> > +        ret = vub_virtio_process_req(client, vq);
> > +        if (ret) {
> > +            break;
> > +        }
> > +    }
> > +}
>
> I mentioned above moving vub_virtio_process_req() into a coroutine. Just
> creating a coroutine here and entering it wouldn't work, though.
>
> The best design is probably to get requests from the virtqueue in this
> function (what is currently the first half of vub_virtio_process_req())
> and then spawn a coroutine per request to actually execute them (roughly
> the switch in vub_virtio_process_req()).
>
> This way you'll get parallel requests and won't have to think about
> synchronising accesses to the virtqueue from multiple coroutines.
>
> > +
> > +static void vub_queue_set_started(VuDev *vu_dev, int idx, bool started)
> > +{
> > +    VuVirtq *vq;
> > +
> > +    assert(vu_dev);
> > +
> > +    vq = vu_get_queue(vu_dev, idx);
> > +    vu_set_queue_handler(vu_dev, vq, started ? vub_process_vq : NULL);
> > +}
> > +
> > +static uint64_t
> > +vub_get_features(VuDev *dev)
> > +{
> > +    uint64_t features;
> > +    VubDev *vdev_blk;
> > +
> > +    VuClient *client = container_of(dev, VuClient, parent);
> > +    vdev_blk = client->blk;
> > +
> > +    features = 1ull << VIRTIO_BLK_F_SIZE_MAX |
> > +               1ull << VIRTIO_BLK_F_SEG_MAX |
> > +               1ull << VIRTIO_BLK_F_TOPOLOGY |
> > +               1ull << VIRTIO_BLK_F_BLK_SIZE |
> > +               1ull << VIRTIO_BLK_F_FLUSH |
> > +               #if defined(__linux__) && defined(BLKDISCARD) && defined(BLKZEROOUT)
> > +               1ull << VIRTIO_BLK_F_DISCARD |
> > +               1ull << VIRTIO_BLK_F_WRITE_ZEROES |
> > +               #endif
> > +               1ull << VIRTIO_BLK_F_CONFIG_WCE |
> > +               1ull << VIRTIO_F_VERSION_1 |
> > +               1ull << VIRTIO_RING_F_INDIRECT_DESC |
> > +               1ull << VIRTIO_RING_F_EVENT_IDX |
> > +               1ull << VHOST_USER_F_PROTOCOL_FEATURES;
> > +
> > +    if (!vdev_blk->writable) {
> > +        features |= 1ull << VIRTIO_BLK_F_RO;
> > +    }
> > +
> > +    return features;
> > +}
> > +
> > +static uint64_t
> > +vub_get_protocol_features(VuDev *dev)
> > +{
> > +    return 1ull << VHOST_USER_PROTOCOL_F_CONFIG |
> > +           1ull << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD;
> > +}
> > +
> > +static int
> > +vub_get_config(VuDev *vu_dev, uint8_t *config, uint32_t len)
> > +{
> > +    VubDev *vdev_blk;
> > +
> > +    VuClient *client = container_of(vu_dev, VuClient, parent);
> > +    vdev_blk = client->blk;
> > +    memcpy(config, &vdev_blk->blkcfg, len);
> > +
> > +    return 0;
> > +}
> > +
> > +static int
> > +vub_set_config(VuDev *vu_dev, const uint8_t *data,
> > +               uint32_t offset, uint32_t size, uint32_t flags)
> > +{
> > +    VubDev *vdev_blk;
> > +
> > +    VuClient *client = container_of(vu_dev, VuClient, parent);
> > +    vdev_blk = client->blk;
> > +    uint8_t wce;
> > +
> > +    /* don't support live migration */
> > +    if (flags != VHOST_SET_CONFIG_TYPE_MASTER) {
> > +        return -1;
> > +    }
> > +
> > +
> > +    if (offset != offsetof(struct virtio_blk_config, wce) ||
> > +        size != 1) {
> > +        return -1;
> > +    }
> > +
> > +    wce = *data;
> > +    if (wce == vdev_blk->blkcfg.wce) {
> > +        /* Do nothing as same with old configuration */
> > +        return 0;
> > +    }
> > +
> > +    vdev_blk->blkcfg.wce = wce;
> > +    blk_set_enable_write_cache(vdev_blk->backend, true);
> > +    return 0;
> > +}
> > +
> > +
> > +/*
> > + * When the client disconnects, it send a VHOST_USER_NONE request
>
> s/send/sends/
>
> > + * and vu_process_message will simple call exit which cause the VM
> > + * to exit abruptly.
> > + * To avoid this issue,  process VHOST_USER_NONE request ahead
> > + * of vu_process_message.
> > + *
> > + */
> > +static int vub_process_msg(VuDev *dev, VhostUserMsg *vmsg, int *do_reply)
> > +{
> > +    if (vmsg->request == VHOST_USER_NONE) {
> > +        dev->panic(dev, "disconnect");
> > +        return true;
> > +    }
> > +    return false;
> > +}
> > +
> > +static void
> > +vmsg_close_fds(VhostUserMsg *vmsg)
> > +{
> > +    int i;
> > +    for (i = 0; i < vmsg->fd_num; i++) {
> > +        close(vmsg->fds[i]);
> > +    }
> > +}
> > +
> > +static bool
> > +vu_message_read_co(VuDev *vu_dev, int conn_fd, VhostUserMsg *vmsg)
> > +{
> > +    char control[CMSG_SPACE(VHOST_MEMORY_MAX_NREGIONS * sizeof(int))] = { };
> > +    struct iovec iov = {
> > +        .iov_base = (char *)vmsg,
> > +        .iov_len = VHOST_USER_HDR_SIZE,
> > +    };
> > +    struct msghdr msg = {
> > +        .msg_iov = &iov,
> > +        .msg_iovlen = 1,
> > +        .msg_control = control,
> > +        .msg_controllen = sizeof(control),
> > +    };
> > +    size_t fd_size;
> > +    struct cmsghdr *cmsg;
> > +    int rc;
> > +    char buffer[100];
> > +    VuClient *client = container_of(vu_dev, VuClient, parent);
> > +    QIOChannel *ioc = client->ioc;
> > +    do {
> > +        rc = recvmsg(conn_fd, &msg, 0);
>
> This should certainly use qio_channel_readv_full() rather than working
> directly with a socket fd?
>
> > +        if (rc < 0) {
> > +            if (errno == EAGAIN) {
> > +                if (qemu_in_coroutine()) {
> > +                    qio_channel_yield(ioc, G_IO_IN);
> > +                } else {
> > +                    qio_channel_wait(ioc, G_IO_IN);
> > +                }
> > +                continue;
> > +            } else if (errno == EINTR) {
> > +                continue;
> > +            }
> > +        }
> > +        break;
> > +    } while (true);
> > +
> > +    if (rc < 0) {
> > +        sprintf(buffer, "Error while recvmsg: %s", strerror(errno));
> > +        vub_panic_cb(vu_dev, buffer);
> > +        return false;
> > +    }
> > +
> > +    assert(rc == VHOST_USER_HDR_SIZE || rc == 0);
>
> Why do you think that there can't be short reads?
>
> > +    vmsg->fd_num = 0;
> > +    for (cmsg = CMSG_FIRSTHDR(&msg);
> > +         cmsg != NULL;
> > +         cmsg = CMSG_NXTHDR(&msg, cmsg))
> > +    {
> > +        if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS) {
> > +            fd_size = cmsg->cmsg_len - CMSG_LEN(0);
> > +            vmsg->fd_num = fd_size / sizeof(int);
> > +            memcpy(vmsg->fds, CMSG_DATA(cmsg), fd_size);
> > +            break;
> > +        }
> > +    }
>
> I think the fd passing part becomes easier when you use the proper
> qio_channel_readv_full() function. Its implementation is also a bit more
> careful than yours. For example, you forgot checking fd_size against
> VHOST_MEMORY_MAX_NREGIONS, allowing a buffer overflow in the memcpy(),
> and you don't adjust fd flags for the new file descriptors.
>
> > +    if (vmsg->size > sizeof(vmsg->payload)) {
> > +        sprintf(buffer,
> > +                "Error: too big message request: %d, size: vmsg->size: %u, "
> > +                "while sizeof(vmsg->payload) = %zu\n",
> > +                vmsg->request, vmsg->size, sizeof(vmsg->payload));
> > +        vub_panic_cb(vu_dev, buffer);
> > +        goto fail;
> > +    }
> > +
> > +    if (vmsg->size) {
> > +        do {
> > +            rc = read(conn_fd, &vmsg->payload, vmsg->size);
>
> qio_channel_readv_all_eof() already implements this whole loop and
> correctly handles short reads, too.
>
> > +            if (rc < 0) {
> > +                if (errno == EAGAIN) {
> > +                    if (qemu_in_coroutine()) {
> > +                        qio_channel_yield(ioc, G_IO_IN);
> > +                    } else {
> > +                        qio_channel_wait(ioc, G_IO_IN);
> > +                    }
> > +                    continue;
> > +                } else if (errno == EINTR) {
> > +                    continue;
> > +                }
> > +            }
> > +            break;
> > +        } while (true);
> > +
> > +        if (rc <= 0) {
> > +            sprintf(buffer, "Error while reading: %s", strerror(errno));
> > +            vub_panic_cb(vu_dev, buffer);
> > +            goto fail;
> > +        }
> > +
> > +        assert(rc == vmsg->size);
> > +    }
> > +
> > +    return true;
> > +
> > +fail:
> > +    vmsg_close_fds(vmsg);
> > +
> > +    return false;
> > +}
> > +
> > +static void vub_kick_cb(void *opaque)
> > +{
> > +    vu_watch_cb_data *data = (vu_watch_cb_data *) opaque;
> > +    int index = data->index;
> > +    VuDev *dev = data->vu_dev;
> > +    VuVirtq *vq = &dev->vq[index];
> > +    int sock = vq->kick_fd;
> > +    eventfd_t kick_data;
> > +    ssize_t rc;
> > +
> > +    rc = eventfd_read(sock, &kick_data);
> > +    if (rc == -1) {
> > +        char buffer[100];
> > +        sprintf(buffer, "kick eventfd_read(): %s", strerror(errno));
> > +        vub_panic_cb(dev, buffer);
> > +        g_free(data);
> > +        dev->remove_watch(dev, dev->vq[index].kick_fd);
> > +    } else {
> > +        if (vq->handler) {
> > +            vq->handler(dev, index);
> > +        }
> > +    }
> > +}
> > +
> > +static const VuDevIface vub_iface = {
> > +    .get_features = vub_get_features,
> > +    .queue_set_started = vub_queue_set_started,
> > +    .get_protocol_features = vub_get_protocol_features,
> > +    .get_config = vub_get_config,
> > +    .set_config = vub_set_config,
> > +    .process_msg = vub_process_msg,
> > +    .read_msg = vu_message_read_co,
> > +    .kick_callback = vub_kick_cb,
> > +};
>
> I would prefer the = signs to be aligned to the same column.
>
> > +
> > +void vub_free(VubDev *vub_dev, bool called_by_QOM)
> > +{
> > +    if (!vub_dev) {
> > +        return;
> > +    }
> > +
> > +    blk_unref(vub_dev->backend);
> > +    g_free(vub_dev->name);
> > +    g_free(vub_dev->unix_socket);
> > +
> > +    if (vub_dev->next.tqe_circ.tql_prev) {
> > +        /*
> > +         * if vub_dev->next.tqe_circ.tql_prev = null,
> > +         * vub_dev hasn't been inserted into the queue and
> > +         * vub_free is called by obj->instance_finalize.
> > +         */
> > +        QTAILQ_REMOVE(&vub_devs, vub_dev, next);
> > +    }
> > +    /*
> > +     * Needn't to free vub_dev if called by QOM
> > +     * because QOM will do the clean-up work.
> > +     */
> > +    if (!called_by_QOM) {
> > +        g_free(vub_dev);
> > +    }
> > +}
> > +
> > +static coroutine_fn void vu_client_trip(void *opaque)
> > +{
> > +    VuClient *client = opaque;
> > +
> > +    while (!client->closed) {
> > +        vu_dispatch(&client->parent);
> > +    }
> > +
> > +    QTAILQ_REMOVE(&client->blk->clients, client, next);
> > +
>
> Extra empty line.
>
> > +}
> > +
> > +static void vu_client_start(VuClient *client)
> > +{
> > +    Coroutine *co = qemu_coroutine_create(vu_client_trip, client);
> > +    qemu_coroutine_enter(co);
> > +}
> > +
> > +
> > +G_STATIC_ASSERT((int)G_IO_IN == (int)VU_WATCH_IN);
> > +G_STATIC_ASSERT((int)G_IO_OUT == (int)VU_WATCH_OUT);
> > +G_STATIC_ASSERT((int)G_IO_PRI == (int)VU_WATCH_PRI);
> > +G_STATIC_ASSERT((int)G_IO_ERR == (int)VU_WATCH_ERR);
> > +G_STATIC_ASSERT((int)G_IO_HUP == (int)VU_WATCH_HUP);
> > +
> > +static void
> > +set_watch(VuDev *vu_dev, int fd, int vu_evt,
> > +          vu_watch_cb_packed_data cb, void *pvt)
> > +{
> > +    /*
> > +     * since aio_dispatch can only pass one user data pointer to the
> > +     * callback function, pack VuDev, pvt into a struct
> > +     */
> > +    VuClient *client;
> > +
> > +    g_assert(vu_dev);
> > +    g_assert(fd >= 0);
> > +    g_assert(cb);
> > +    client = container_of(vu_dev, VuClient, parent);
> > +    vu_watch_cb_data *cb_data = g_new0(vu_watch_cb_data, 1);
> > +    cb_data->index = (intptr_t) pvt;
> > +    cb_data->vu_dev = vu_dev;
> > +    aio_set_fd_handler(client->blk->ctx, fd, false, (void *) cb,
> > +                       NULL, NULL, cb_data);
> > +}
> > +
> > +
> > +void vub_accept(QIONetListener *listener, QIOChannelSocket *sioc,
> > +                gpointer opaque)
> > +{
> > +    VuClient *client;
> > +    VubDev *vub_device = opaque;
> > +    client = g_new0(VuClient, 1);
> > +
> > +    if (!vu_init_packed_data(&client->parent, VHOST_USER_BLK_MAX_QUEUES,
> > +                             sioc->fd, vub_panic_cb, set_watch,
> > +                             remove_watch, &vub_iface)) {
> > +        fprintf(stderr, "Failed to initialized libvhost-user\n");
> > +        g_free(client);
> > +        return;
> > +    }
> > +
> > +    client->blk = vub_device;
> > +    client->refcount = 1;
> > +    client->sioc = sioc;
> > +    /*
> > +     * increase the object reference, so cioc will not freed by
> > +     * qio_net_listener_channel_func which will call object_unref(OBJECT(sioc))
> > +     */
> > +    object_ref(OBJECT(client->sioc));
> > +    qio_channel_set_name(QIO_CHANNEL(sioc), "vhost-user client");
> > +    client->ioc = QIO_CHANNEL(sioc);
> > +    object_ref(OBJECT(client->ioc));
> > +    object_ref(OBJECT(sioc));
> > +
> > +    qio_channel_set_blocking(QIO_CHANNEL(client->sioc), false, NULL);
> > +    client->closed = false;
> > +    QTAILQ_INSERT_TAIL(&client->blk->clients, client, next);
> > +    vu_client_start(client);
> > +}
> > +
> > +
> > +void
> > +vub_initialize_config(BlockDriverState *bs, struct virtio_blk_config *config)
> > +{
> > +    config->capacity = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
> > +    config->blk_size = BDRV_SECTOR_SIZE;
> > +    config->size_max = 65536;
> > +    config->seg_max = 128 - 2;
> > +    config->min_io_size = 1;
> > +    config->opt_io_size = 1;
> > +    config->num_queues = 1;
> > +    #if defined(__linux__) && defined(BLKDISCARD) && defined(BLKZEROOUT)
> > +    config->max_discard_sectors = 32768;
> > +    config->max_discard_seg = 1;
> > +    config->discard_sector_alignment = config->blk_size >> 9;
> > +    config->max_write_zeroes_sectors = 32768;
> > +    config->max_write_zeroes_seg = 1;
> > +    #endif
> > +}
> > +
> > +
> > +static VubDev *vub_new(VubDev *vub_device, const char *name,
> > +                       const char *unix_socket, bool writable, Error **errp)
> > +{
> > +
> > +    BlockBackend *blk;
> > +
> > +    /*
> > +     * Don't allow resize while the vhost user server is running,
> > +     * otherwise we don't care what happens with the node.
> > +     */
> > +    uint64_t perm = BLK_PERM_CONSISTENT_READ;
> > +    int ret;
> > +
> > +    AioContext *ctx;
> > +
> > +    BlockDriverState *bs = bdrv_lookup_bs(name,
> > +                                          name,
> > +                                          errp);
>
> This fits in a single line.
>
> > +
> > +    if (!bs) {
> > +        error_setg(errp,
> > +                   "No drive with name '%s'."
> > +                   " Please find the list of names with "
> > +                   "'info block'", name);
>
> This can probably be two lines instead of four.
>
> > +        return NULL;
> > +    }
> > +
> > +    if (bdrv_is_read_only(bs)) {
> > +        writable = false;
> > +    }
> > +
> > +    if (writable) {
> > +        perm |= BLK_PERM_WRITE;
> > +    }
> > +
> > +    ctx = bdrv_get_aio_context(bs);
> > +    aio_context_acquire(ctx);
> > +    bdrv_invalidate_cache(bs, NULL);
> > +    aio_context_release(ctx);
> > +
> > +    blk = blk_new(bdrv_get_aio_context(bs), perm,
> > +                  BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE_UNCHANGED |
> > +                  BLK_PERM_WRITE | BLK_PERM_GRAPH_MOD);
> > +    ret = blk_insert_bs(blk, bs, errp);
> > +
> > +    if (ret < 0) {
> > +        goto fail;
> > +    }
> > +
> > +
> > +    blk_set_enable_write_cache(blk, false);
> > +
> > +    blk_set_allow_aio_context_change(blk, true);
> > +
> > +
> > +    vub_device->name = g_strdup(name);
> > +    vub_device->unix_socket = g_strdup(unix_socket);
> > +    vub_device->writable = writable;
> > +    vub_device->blkcfg.wce = 0;
> > +    vub_device->backend = blk;
> > +    vub_device->ctx = ctx;
> > +    vub_initialize_config(bs, &vub_device->blkcfg);
> > +    return vub_device;
> > +
> > +fail:
> > +    blk_unref(blk);
> > +    return NULL;
> > +}
> > +
> > +void vhost_user_server_free(VubDev *vub_device, bool called_by_QOM)
> > +{
> > +    if (!vub_device) {
> > +        return;
> > +    }
> > +
> > +    VuClient *client, *next;
> > +    QTAILQ_FOREACH_SAFE(client, &vub_device->clients, next, next) {
> > +        if (!client->closed) {
> > +            close_client(client);
> > +        }
> > +    }
> > +
> > +    if (vub_device->listener) {
> > +        qio_net_listener_disconnect(vub_device->listener);
> > +        object_unref(OBJECT(vub_device->listener));
> > +    }
> > +    vub_free(vub_device, called_by_QOM);
> > +
> > +}
> > +
> > +
> > +VubDev *vub_dev_find(const char *name)
> > +{
> > +    VubDev *vub_device;
> > +    QTAILQ_FOREACH(vub_device, &vub_devs, next) {
> > +        if (strcmp(name, vub_device->name) == 0) {
> > +            return vub_device;
> > +        }
> > +    }
> > +
> > +    return NULL;
> > +}
> > +
> > +
> > +static VubDev *vub_dev_find_by_unix_socket(const char *unix_socket)
> > +{
> > +    VubDev *vub_device;
> > +    QTAILQ_FOREACH(vub_device, &vub_devs, next) {
> > +        if (strcmp(unix_socket, vub_device->unix_socket) == 0) {
> > +            return vub_device;
> > +        }
> > +    }
> > +
> > +    return NULL;
> > +}
> > +
> > +static void vhost_user_server_start(VubDev *vub_device, const char *unix_socket,
> > +                                    const char *name, bool writable,
> > +                                    Error **errp)
> > +{
> > +
> > +    if (vub_dev_find(name) || vub_dev_find_by_unix_socket(unix_socket)) {
> > +        error_setg(errp, "Vhost user server with name '%s' or "
> > +                "with socket_path '%s' has already been started",
> > +                name, unix_socket);
> > +        return;
> > +    }
> > +
> > +
> > +    if (!vub_new(vub_device, name, unix_socket, writable, errp)) {
> > +        return;
> > +    }
> > +
> > +
> > +    vub_device->listener = qio_net_listener_new();
> > +
> > +    qio_net_listener_set_name(vub_device->listener,
> > +                              "vhost-user-backend-listener");
> > +
> > +    SocketAddress *addr = g_new0(SocketAddress, 1);
> > +    addr->u.q_unix.path = (char *) unix_socket;
> > +    addr->type = SOCKET_ADDRESS_TYPE_UNIX;
> > +    if (qio_net_listener_open_sync(vub_device->listener, addr, 1, errp) < 0) {
> > +        goto error;
> > +    }
> > +
> > +
> > +    QTAILQ_INSERT_TAIL(&vub_devs, vub_device, next);
> > +    QTAILQ_INIT(&vub_device->clients);
> > +
> > +    qio_net_listener_set_client_func(vub_device->listener,
> > +                                     vub_accept,
> > +                                     vub_device,
> > +                                     NULL);
> > +
> > +    return;
> > +
> > + error:
> > +    vub_free(vub_device, false);
> > +}
> > +
> > +static void vu_set_block_name(Object *obj, const char *value,
> > +                                           Error **errp)
> > +{
> > +    VubDev *vus = VHOST_USER_SERVER(obj);;
> > +
> > +    if (vus->name) {
> > +        error_setg(errp, "evdev property already set");
> > +        return;
> > +    }
> > +
> > +    vus->name = g_strdup(value);
> > +}
> > +
> > +static char *vu_get_block_name(Object *obj, Error **errp)
> > +{
> > +    VubDev *vus = VHOST_USER_SERVER(obj);
> > +    return g_strdup(vus->name);
> > +}
> > +
> > +
> > +static void vu_set_unix_socket(Object *obj, const char *value,
> > +                                            Error **errp)
> > +{
> > +    VubDev *vus = VHOST_USER_SERVER(obj);;
> > +
> > +    if (vus->unix_socket) {
> > +        error_setg(errp, "unix_socket property already set");
> > +        return;
> > +    }
> > +
> > +    vus->unix_socket = g_strdup(value);
> > +    vhost_user_server_start(vus, value, vus->name,
> > +                            vus->writable, errp);
> > +}
>
> This makes the unix-socket property magic in that it starts the server
> with the properties specified at this point. This means that this
> property must always be specified last.
>
> Maybe it would be better to use a boolean property (similar to qdev's
> "realized") that explicitly starts and possibly stops the export.
>
> Writing to other properties should probably result in an error while the
> server is already running because these property changes won't take
> effect any more then.
>
> > +static char *vu_get_unix_socket(Object *obj, Error **errp)
> > +{
> > +    VubDev *vus = VHOST_USER_SERVER(obj);;
> > +    return g_strdup(vus->unix_socket);
> > +}
> > +
> > +static bool vu_get_block_writable(Object *obj, Error **errp)
> > +{
> > +    VubDev *vus = VHOST_USER_SERVER(obj);;
> > +    return vus->writable;
> > +}
> > +
> > +static void vu_set_block_writable(Object *obj, bool value, Error **errp)
> > +{
> > +    VubDev *vus = VHOST_USER_SERVER(obj);
> > +
> > +    vus->writable = value;
> > +}
> > +
> > +static void vhost_user_server_instance_init(Object *obj)
> > +{
> > +
> > +    object_property_add_bool(obj, "writable",
> > +                            vu_get_block_writable,
> > +                            vu_set_block_writable, NULL);
> > +
> > +    object_property_add_str(obj, "name",
> > +                            vu_get_block_name,
> > +                            vu_set_block_name, NULL);
> > +
> > +    object_property_add_str(obj, "unix_socket",
> > +                            vu_get_unix_socket,
> > +                            vu_set_unix_socket, NULL);
>
> These should probably be object_class_property_add_*() and be called in
> .class_init rather than .instance_init.
>
> "name" suggests that it's the name of the export rather than the block
> device to be exported. I would suggest "node-name" (and then actually
> only pass it as node-name to bdrv_lookup_bs()).
>
> I expect that in the long run, we'll want to accept a full SocketAddress
> rather than just a filename like in "unix_socket".
>
> > +}
> > +
> > +static void vhost_user_server_instance_finalize(Object *obj)
> > +{
> > +    VubDev *vus = VHOST_USER_SERVER(obj);
> > +    vhost_user_server_free(vus, true);
> > +    /* object_del shouldn't free this object struct */
> > +    obj->free = NULL;
> > +}
> > +
> > +static const TypeInfo vhost_user_server_info = {
> > +    .name = TYPE_VHOST_USER_SERVER,
> > +    .parent = TYPE_OBJECT,
> > +    .instance_size = sizeof(VuDev),
> > +    .instance_init = vhost_user_server_instance_init,
> > +    .instance_finalize = vhost_user_server_instance_finalize,
> > +    .interfaces = (InterfaceInfo[]) {
> > +        {TYPE_USER_CREATABLE},
> > +        {}
> > +    },
> > +};
> > +
> > +static void vhost_user_server_register_types(void)
> > +{
> > +    type_register_static(&vhost_user_server_info);
> > +}
> > +
> > +type_init(vhost_user_server_register_types)
> > +
>
> Extra empty line at the file end.
>
> In summary, I can see this going in the right direction, though in
> detail some more work will be needed.
>
> Kevin
>


--
Best regards,
Coiby


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH v2 1/5] vhost-user block device backend
  2020-01-16 13:56   ` Kevin Wolf
  2020-02-20  7:04     ` Coiby Xu
@ 2020-02-20  9:30     ` Coiby Xu
  1 sibling, 0 replies; 19+ messages in thread
From: Coiby Xu @ 2020-02-20  9:30 UTC (permalink / raw)
  To: Kevin Wolf; +Cc: bharatlkmlkvm, qemu-devel, stefanha

[-- Attachment #1: Type: text/plain, Size: 39721 bytes --]

> > +    vmsg->fd_num = 0;
> > +    for (cmsg = CMSG_FIRSTHDR(&msg);
> > +         cmsg != NULL;
> > +         cmsg = CMSG_NXTHDR(&msg, cmsg))
> > +    {
> > +        if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type ==
SCM_RIGHTS) {
> > +            fd_size = cmsg->cmsg_len - CMSG_LEN(0);
> > +            vmsg->fd_num = fd_size / sizeof(int);
> > +            memcpy(vmsg->fds, CMSG_DATA(cmsg), fd_size);
> > +            break;
> > +        }
> > +    }

> I think the fd passing part becomes easier when you use the proper
> qio_channel_readv_full() function. Its implementation is also a bit more
> careful than yours. For example, you forgot checking fd_size against
> VHOST_MEMORY_MAX_NREGIONS, allowing a buffer overflow in the memcpy(),
> and you don't adjust fd flags for the new file descriptors.

Oh, I used qio_channel_readv_full in v3&v4. But I still forgot to check
fd_size against VHOST_MEMORY_MAX_NREGIONS. I'll fix this buffer overflow
issue in v5.

On Thu, Jan 16, 2020 at 9:56 PM Kevin Wolf <kwolf@redhat.com> wrote:

> Hi,
>
> I'm only doing a quick first review pointing out the more obvious
> things while I familiarise myself with your code. I intend to review it
> in more detail later (either in a second pass for this series, or when
> you post v3).
>
> Am 14.01.2020 um 15:06 hat Coiby Xu geschrieben:
> > By making use of libvhost, multiple block device drives can be exported
> and each drive can serve multiple clients simultaneously. Since
> vhost-user-server needs a block drive to be created first, delay the
> creation of this object.
> >
> > Signed-off-by: Coiby Xu <coiby.xu@gmail.com>
>
> Please wrap the commit message at 72 characters.
>
> >  blockdev-vu.c              | 1008 ++++++++++++++++++++++++++++++++++++
> >  include/block/vhost-user.h |   46 ++
> >  vl.c                       |    4 +
> >  3 files changed, 1058 insertions(+)
> >  create mode 100644 blockdev-vu.c
> >  create mode 100644 include/block/vhost-user.h
>
> This adds a single, relatively big source file. I see at least two
> parts: The generic vhost-user infrastructure with connection handling
> etc. and the implementation of the specific vhost-user-blk device.
> Separating these into two files is probably a good idea.
>
> I would also suggest putting the files in a new subdirectory
> block/export/ and calling them vhost-user.c/vhost-user-blk.c. The new
> header file can be in the same directory as it shouldn't be used by
> anyone else.
>
> > diff --git a/blockdev-vu.c b/blockdev-vu.c
> > new file mode 100644
> > index 0000000000..45f0bb43a7
> > --- /dev/null
> > +++ b/blockdev-vu.c
> > @@ -0,0 +1,1008 @@
>
> The LICENSE file clarifies that files without a license header are
> GPLv2+, so it's not strictly a problem, but I think it is good style to
> include a license header that explicitly tells so.
>
> > +#include "qemu/osdep.h"
> > +#include "block/vhost-user.h"
> > +#include "qapi/error.h"
> > +#include "qapi/qapi-types-sockets.h"
> > +#include "qapi/qapi-commands-block.h"
> > +
> > +#include "sysemu/block-backend.h"
> > +#include "qemu/main-loop.h"
> > +
> > +#include "qemu/units.h"
> > +
> > +#include "block/block.h"
> > +
> > +#include "qom/object_interfaces.h"
> > +
> > +#include <sys/eventfd.h>
> > +
> > +#include "hw/qdev-properties.h"
>
> Does the order of includes and the empty lines between them signify
> anything? If not, I suggest just sorting them alphabetically (and maybe
> using empty lines between different subdirectories if you like this
> better than a single large block).
>
> According to CODING_STYLE.rst, system headers like <sys/eventfd.h> come
> before all QEMU headers (except qemu/osdep.h, which always must come
> first).
>
> > +enum {
> > +    VHOST_USER_BLK_MAX_QUEUES = 8,
> > +};
> > +
> > +struct virtio_blk_inhdr {
> > +    unsigned char status;
> > +};
> > +
> > +
> > +static QTAILQ_HEAD(, VubDev) vub_devs =
> QTAILQ_HEAD_INITIALIZER(vub_devs);
> > +
> > +
> > +typedef struct VubReq {
> > +    VuVirtqElement *elem;
>
> Maybe worth a comment that this was allocated with plain malloc(), so
> you must use free() rather than g_free() (which would be the default in
> QEMU)?
>
> > +    int64_t sector_num;
> > +    size_t size;
> > +    struct virtio_blk_inhdr *in;
> > +    struct virtio_blk_outhdr out;
> > +    VuClient *client;
> > +    struct VuVirtq *vq;
> > +} VubReq;
>
> I'm not completely sure yet, but I think I would prefer VuBlock to Vub
> in the type names. Some may even prefer VhostUserBlock, but I can see
> that this would be quite lengthy.
>
> > +static void
> > +remove_watch(VuDev *vu_dev, int fd)
> > +{
> > +    VuClient *client;
> > +
> > +    g_assert(vu_dev);
> > +    g_assert(fd >= 0);
> > +
> > +    client = container_of(vu_dev, VuClient, parent);
> > +    aio_set_fd_handler(client->blk->ctx, fd, false, NULL, NULL, NULL,
> NULL);
> > +}
> > +
> > +static void close_client(VuClient *client)
> > +{
> > +    vu_deinit(&client->parent);
> > +    /** g_source_destroy(vub_device->parent.src); */
>
> Leftover from conversion?
>
> > +    client->sioc = NULL;
> > +    object_unref(OBJECT(client->ioc));
> > +    client->closed = true;
> > +
> > +}
> > +
> > +static void vub_panic_cb(VuDev *vu_dev, const char *buf)
>
> You use a lot of sprintf() before calling this function. Would it be
> worth taking a printf-like format parameter instead of buf and using a
> variable argument list?
>
> > +{
> > +    if (buf) {
> > +        g_warning("vu_panic: %s", buf);
>
> I think QEMU proper doesn't use g_warning() anywhere. This could be
> error_report() or warn_report(). (Or if you use a format string
> error_vreport() and warn_vreport().)
>
> > +    }
> > +
> > +    VuClient *client = container_of(vu_dev, VuClient, parent);
> > +    if (client->blk->exit_panic) {
> > +        client->blk->close = true;
> > +    }
> > +    if (!client->closed) {
> > +        close_client(client);
> > +    }
> > +}
> > +
> > +
> > +static void vub_req_complete(VubReq *req)
> > +{
> > +    VuDev *vu_dev = &req->client->parent;
> > +
> > +    /* IO size with 1 extra status byte */
> > +    vu_queue_push(vu_dev, req->vq, req->elem,
> > +                  req->size + 1);
>
> I think this fits in a single line.
>
> > +    vu_queue_notify(vu_dev, req->vq);
> > +
> > +    if (req->elem) {
> > +        free(req->elem);
> > +    }
> > +
> > +    g_free(req);
> > +}
> > +
> > +
> > +
> > +static int
> > +vub_discard_write_zeroes(VubReq *req, struct iovec *iov, uint32_t
> iovcnt,
> > +                         uint32_t type)
> > +{
> > +    struct virtio_blk_discard_write_zeroes *desc;
> > +    ssize_t size;
> > +    void *buf;
> > +
> > +    size = iov_size(iov, iovcnt);
> > +    if (size != sizeof(*desc)) {
> > +        fprintf(stderr, "Invalid size %ld, expect %ld\n", size,
> sizeof(*desc));
> > +        return -1;
>
> This would be error_report(), too. (More cases below, I'll ignore them
> now.)
>
> I would prefer consistent use of -errno instead of -1 for error cases if
> you don't mind. I guess this would be -EINVAL here. I won't mention it
> for all the other cases; if you want to make the change, you need to
> make it everywhere, obviously.
>
> > +    }
> > +    buf = g_new0(char, size);
> > +
> > +    iov_to_buf_full(iov, iovcnt, 0, buf, size);
>
> I think uint8_t describes better than char what we want here: A buffer
> of bytes.
>
> The empty line would make more sense to me above the g_new0() line than
> after it because it starts a new section that deals with the buffer. In
> general, the use of empty lines feels a bit inconsistent in this patch.
> You may want to go over them again.
>
> > +
> > +    #if defined(__linux__) && defined(BLKDISCARD) && defined(BLKZEROOUT)
>
> Preprocessor directives should be unindented.
>
> However, I don't think any of this code actually depends on Linux,
> BLKDISCARD or BLKZEROOUT. You can just call blk_pdiscard() and
> blk_pwrite_zeroes() and they will do whatever is necessary to perform
> the operation on the backend (which might not be a Linux block device,
> but could be a regular file or even using a network protocol like NBD).
>
> > +    VubDev *vdev_blk;
> > +    VuClient *client = container_of(dev, VuClient, parent);
> > +    vdev_blk = client->blk;
> > +    desc = (struct virtio_blk_discard_write_zeroes *)buf;
> > +    uint64_t range[2] = { le64toh(desc->sector) << 9,
> > +                          le32toh(desc->num_sectors) << 9 };
> > +    if (type == VIRTIO_BLK_T_DISCARD) {
> > +        if (blk_pdiscard(vdev_blk->blk, range[0], range[1]) == 0) {
> > +            g_free(buf);
> > +            return 0;
> > +        }
> > +    } else if (type == VIRTIO_BLK_T_WRITE_ZEROES) {
> > +        if (blk_pwrite_zeroes(vdev_blk->blk, range[0], range[1]) == 0) {
> > +            g_free(buf);
> > +            return 0;
> > +        }
>
> blk_pdiscard() and blk_pwrite_zeroes() are synchronous functions. In
> other words, the guest will be blocked until the I/O is complete. We
> cannot do this.
>
> I think you should let vub_virtio_process_req() run in a coroutine so
> that you can call blk_co_pdiscard() and blk_co_pwrite_zeroes() here.
>
> > +    }
> > +    #endif
> > +
> > +    g_free(buf);
> > +    return -1;
> > +}
> > +
> > +
> > +static void
> > +vub_flush(VubReq *req)
> > +{
> > +    VuClient *client = req->client;
> > +    blk_co_flush(client->blk->backend);
>
> You can't call blk_co_flush() from outside coroutine context. This code
> will be right after you move vub_virtio_process_req() to a coroutine,
> though (which will make this function a coroutine_fn).
>
> > +}
> > +
> > +
> > +#define NOT_DONE 0x7fffffff /* used while emulated sync operation in
> progress */
> > +typedef struct BlkRwCo {
> > +    BlockBackend *blk;
> > +    int64_t offset;
> > +    void *iobuf;
> > +    int ret;
> > +    BdrvRequestFlags flags;
> > +} BlkRwCo;
> > +
> > +static void blk_read_entry(void *opaque)
> > +{
> > +    BlkRwCo *rwco = opaque;
> > +    QEMUIOVector *qiov = rwco->iobuf;
> > +
> > +    rwco->ret = blk_co_preadv(rwco->blk, rwco->offset, qiov->size,
> > +                              qiov, rwco->flags);
> > +    aio_wait_kick();
> > +}
> > +
> > +
> > +static void blk_write_entry(void *opaque)
> > +{
> > +    BlkRwCo *rwco = opaque;
> > +    QEMUIOVector *qiov = rwco->iobuf;
> > +
> > +    rwco->ret = blk_co_pwritev(rwco->blk, rwco->offset, qiov->size,
> > +                              qiov, rwco->flags);
> > +    aio_wait_kick();
> > +}
> > +
> > +
> > +static int blk_prw(BlockBackend *blk, QEMUIOVector *qiov, int64_t
> offset,
> > +                   CoroutineEntry co_entry, BdrvRequestFlags flags)
> > +{
> > +
> > +    BlkRwCo rwco = {
> > +        .blk    = blk,
> > +        .offset = offset,
> > +        .iobuf  = qiov,
> > +        .flags  = flags,
> > +        .ret    = NOT_DONE,
> > +    };
> > +
> > +    if (qemu_in_coroutine()) {
> > +        /* Fast-path if already in coroutine context */
> > +        co_entry(&rwco);
> > +    } else {
> > +        Coroutine *co = qemu_coroutine_create(co_entry, &rwco);
> > +        bdrv_coroutine_enter(blk_bs(blk), co);
> > +        BDRV_POLL_WHILE(blk_bs(blk), rwco.ret == NOT_DONE);
> > +    }
> > +
> > +    return rwco.ret;
> > +}
>
> This is copy&paste from block-backend.c. We should certainly not do
> this. I think it will automatically go away when you can use
> blk_co_preadv() and blk_co_pwritev() directly.
>
> Note that the BDRV_POLL_WHILE() means that like above, we would be
> waiting for the request to complete. This would block the guest and
> would also not allow parallel requests, killing the I/O performance of
> our vhost-user export.
>
> > +
> > +static ssize_t
> > +vub_rwv(VubReq *req, struct iovec *iov,
> > +        uint32_t iovcnt,
> > +        CoroutineEntry co_entry)
>
> I don't understand the line wrapping here. :-)
>
> > +{
> > +    VuClient *client = req->client;
> > +    ssize_t rc;
> > +
> > +    if (!iovcnt) {
> > +        fprintf(stderr, "Invalid Read/Write IOV count\n");
> > +        return -1;
> > +    }
> > +
> > +    int64_t offset = req->sector_num * 512;
> > +    QEMUIOVector *qiov = g_new0(QEMUIOVector, 1);
> > +    qemu_iovec_init_external(qiov, iov, iovcnt);
> > +    rc = blk_prw(client->blk->backend, qiov, offset, co_entry, 0);
> > +
> > +    req->size = iov_size(iov, iovcnt);
>
> You can use qiov->size instead of duplicating this information into a
> separate VubReq field.
>
> > +    if (rc < 0) {
> > +        fprintf(stderr, "%s, Sector %"PRIu64", Size %lu failed with
> %s\n",
> > +                client->blk->name, req->sector_num, req->size,
> > +                strerror(errno));
> > +        return -1;
> > +    }
> > +
> > +    return rc;
> > +}
> > +
> > +static int vub_virtio_process_req(VuClient *client,
> > +                                     VuVirtq *vq)
>
> Indentation is off. This could be a single line anyway.
>
> > +{
> > +    VuDev *vu_dev = &client->parent;
> > +    VuVirtqElement *elem;
> > +    uint32_t type;
> > +    VubReq *req;
> > +
> > +    elem = vu_queue_pop(vu_dev, vq, sizeof(VuVirtqElement) +
> sizeof(VubReq));
> > +    if (!elem) {
> > +        return -1;
> > +    }
> > +
> > +    struct iovec *in_iov = elem->in_sg;
> > +    struct iovec *out_iov = elem->out_sg;
> > +    unsigned in_num = elem->in_num;
> > +    unsigned out_num = elem->out_num;
> > +    /* refer to hw/block/virtio_blk.c */
> > +    if (elem->out_num < 1 || elem->in_num < 1) {
> > +        fprintf(stderr, "virtio-blk request missing headers\n");
> > +        free(elem);
> > +        return -1;
> > +    }
> > +
> > +    req = g_new0(VubReq, 1);
> > +    req->client = client;
> > +    req->vq = vq;
> > +    req->elem = elem;
> > +
> > +    if (unlikely(iov_to_buf(out_iov, out_num, 0, &req->out,
> > +                            sizeof(req->out)) != sizeof(req->out))) {
> > +        fprintf(stderr, "virtio-blk request outhdr too short");
> > +        goto err;
> > +    }
> > +
> > +    iov_discard_front(&out_iov, &out_num, sizeof(req->out));
> > +
> > +    if (in_iov[in_num - 1].iov_len < sizeof(struct virtio_blk_inhdr)) {
> > +        fprintf(stderr, "virtio-blk request inhdr too short");
> > +        goto err;
> > +    }
> > +
> > +    /* We always touch the last byte, so just see how big in_iov is.  */
> > +    req->in = (void *)in_iov[in_num - 1].iov_base
> > +              + in_iov[in_num - 1].iov_len
> > +              - sizeof(struct virtio_blk_inhdr);
> > +    iov_discard_back(in_iov, &in_num, sizeof(struct virtio_blk_inhdr));
> > +
> > +
> > +    type = le32toh(req->out.type);
> > +    switch (type & ~VIRTIO_BLK_T_BARRIER) {
> > +    case VIRTIO_BLK_T_IN:
> > +    case VIRTIO_BLK_T_OUT: {
> > +        ssize_t ret = 0;
> > +        bool is_write = type & VIRTIO_BLK_T_OUT;
> > +        req->sector_num = le64toh(req->out.sector);
> > +        if (is_write) {
> > +            ret = vub_rwv(req, out_iov, out_num, blk_write_entry);
> > +        } else {
> > +            ret = vub_rwv(req, in_iov, in_num, blk_read_entry);
> > +        }
> > +        if (ret >= 0) {
> > +            req->in->status = VIRTIO_BLK_S_OK;
> > +        } else {
> > +            req->in->status = VIRTIO_BLK_S_IOERR;
> > +        }
> > +        vub_req_complete(req);
> > +        break;
> > +    }
> > +    case VIRTIO_BLK_T_FLUSH:
> > +        vub_flush(req);
> > +        req->in->status = VIRTIO_BLK_S_OK;
> > +        vub_req_complete(req);
> > +        break;
> > +    case VIRTIO_BLK_T_GET_ID: {
> > +        size_t size = MIN(iov_size(&elem->in_sg[0], in_num),
> > +                          VIRTIO_BLK_ID_BYTES);
> > +        snprintf(elem->in_sg[0].iov_base, size, "%s", "vhost_user_blk");
> > +        req->in->status = VIRTIO_BLK_S_OK;
> > +        req->size = elem->in_sg[0].iov_len;
> > +        vub_req_complete(req);
> > +        break;
> > +    }
> > +    case VIRTIO_BLK_T_DISCARD:
> > +    case VIRTIO_BLK_T_WRITE_ZEROES: {
> > +        int rc;
> > +        rc = vub_discard_write_zeroes(req, &elem->out_sg[1], out_num,
> type);
> > +        if (rc == 0) {
> > +            req->in->status = VIRTIO_BLK_S_OK;
> > +        } else {
> > +            req->in->status = VIRTIO_BLK_S_IOERR;
> > +        }
> > +        vub_req_complete(req);
> > +        break;
> > +    }
> > +    default:
> > +        req->in->status = VIRTIO_BLK_S_UNSUPP;
> > +        vub_req_complete(req);
> > +        break;
> > +    }
> > +
> > +    return 0;
> > +
> > +err:
> > +    free(elem);
> > +    g_free(req);
> > +    return -1;
> > +}
> > +
> > +
> > +static void vub_process_vq(VuDev *vu_dev, int idx)
> > +{
> > +    VuClient *client;
> > +    VuVirtq *vq;
> > +    int ret;
> > +
> > +    client = container_of(vu_dev, VuClient, parent);
> > +    assert(client);
> > +
> > +    vq = vu_get_queue(vu_dev, idx);
> > +    assert(vq);
> > +
> > +    while (1) {
> > +        ret = vub_virtio_process_req(client, vq);
> > +        if (ret) {
> > +            break;
> > +        }
> > +    }
> > +}
>
> I mentioned above moving vub_virtio_process_req() into a coroutine. Just
> creating a coroutine here and entering it wouldn't work, though.
>
> The best design is probably to get requests from the virtqueue in this
> function (what is currently the first half of vub_virtio_process_req())
> and then spawn a coroutine per request to actually execute them (roughly
> the switch in vub_virtio_process_req()).
>
> This way you'll get parallel requests and won't have to think about
> synchronising accesses to the virtqueue from multiple coroutines.
>
> > +
> > +static void vub_queue_set_started(VuDev *vu_dev, int idx, bool started)
> > +{
> > +    VuVirtq *vq;
> > +
> > +    assert(vu_dev);
> > +
> > +    vq = vu_get_queue(vu_dev, idx);
> > +    vu_set_queue_handler(vu_dev, vq, started ? vub_process_vq : NULL);
> > +}
> > +
> > +static uint64_t
> > +vub_get_features(VuDev *dev)
> > +{
> > +    uint64_t features;
> > +    VubDev *vdev_blk;
> > +
> > +    VuClient *client = container_of(dev, VuClient, parent);
> > +    vdev_blk = client->blk;
> > +
> > +    features = 1ull << VIRTIO_BLK_F_SIZE_MAX |
> > +               1ull << VIRTIO_BLK_F_SEG_MAX |
> > +               1ull << VIRTIO_BLK_F_TOPOLOGY |
> > +               1ull << VIRTIO_BLK_F_BLK_SIZE |
> > +               1ull << VIRTIO_BLK_F_FLUSH |
> > +               #if defined(__linux__) && defined(BLKDISCARD) &&
> defined(BLKZEROOUT)
> > +               1ull << VIRTIO_BLK_F_DISCARD |
> > +               1ull << VIRTIO_BLK_F_WRITE_ZEROES |
> > +               #endif
> > +               1ull << VIRTIO_BLK_F_CONFIG_WCE |
> > +               1ull << VIRTIO_F_VERSION_1 |
> > +               1ull << VIRTIO_RING_F_INDIRECT_DESC |
> > +               1ull << VIRTIO_RING_F_EVENT_IDX |
> > +               1ull << VHOST_USER_F_PROTOCOL_FEATURES;
> > +
> > +    if (!vdev_blk->writable) {
> > +        features |= 1ull << VIRTIO_BLK_F_RO;
> > +    }
> > +
> > +    return features;
> > +}
> > +
> > +static uint64_t
> > +vub_get_protocol_features(VuDev *dev)
> > +{
> > +    return 1ull << VHOST_USER_PROTOCOL_F_CONFIG |
> > +           1ull << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD;
> > +}
> > +
> > +static int
> > +vub_get_config(VuDev *vu_dev, uint8_t *config, uint32_t len)
> > +{
> > +    VubDev *vdev_blk;
> > +
> > +    VuClient *client = container_of(vu_dev, VuClient, parent);
> > +    vdev_blk = client->blk;
> > +    memcpy(config, &vdev_blk->blkcfg, len);
> > +
> > +    return 0;
> > +}
> > +
> > +static int
> > +vub_set_config(VuDev *vu_dev, const uint8_t *data,
> > +               uint32_t offset, uint32_t size, uint32_t flags)
> > +{
> > +    VubDev *vdev_blk;
> > +
> > +    VuClient *client = container_of(vu_dev, VuClient, parent);
> > +    vdev_blk = client->blk;
> > +    uint8_t wce;
> > +
> > +    /* don't support live migration */
> > +    if (flags != VHOST_SET_CONFIG_TYPE_MASTER) {
> > +        return -1;
> > +    }
> > +
> > +
> > +    if (offset != offsetof(struct virtio_blk_config, wce) ||
> > +        size != 1) {
> > +        return -1;
> > +    }
> > +
> > +    wce = *data;
> > +    if (wce == vdev_blk->blkcfg.wce) {
> > +        /* Do nothing as same with old configuration */
> > +        return 0;
> > +    }
> > +
> > +    vdev_blk->blkcfg.wce = wce;
> > +    blk_set_enable_write_cache(vdev_blk->backend, true);
> > +    return 0;
> > +}
> > +
> > +
> > +/*
> > + * When the client disconnects, it send a VHOST_USER_NONE request
>
> s/send/sends/
>
> > + * and vu_process_message will simple call exit which cause the VM
> > + * to exit abruptly.
> > + * To avoid this issue,  process VHOST_USER_NONE request ahead
> > + * of vu_process_message.
> > + *
> > + */
> > +static int vub_process_msg(VuDev *dev, VhostUserMsg *vmsg, int
> *do_reply)
> > +{
> > +    if (vmsg->request == VHOST_USER_NONE) {
> > +        dev->panic(dev, "disconnect");
> > +        return true;
> > +    }
> > +    return false;
> > +}
> > +
> > +static void
> > +vmsg_close_fds(VhostUserMsg *vmsg)
> > +{
> > +    int i;
> > +    for (i = 0; i < vmsg->fd_num; i++) {
> > +        close(vmsg->fds[i]);
> > +    }
> > +}
> > +
> > +static bool
> > +vu_message_read_co(VuDev *vu_dev, int conn_fd, VhostUserMsg *vmsg)
> > +{
> > +    char control[CMSG_SPACE(VHOST_MEMORY_MAX_NREGIONS * sizeof(int))] =
> { };
> > +    struct iovec iov = {
> > +        .iov_base = (char *)vmsg,
> > +        .iov_len = VHOST_USER_HDR_SIZE,
> > +    };
> > +    struct msghdr msg = {
> > +        .msg_iov = &iov,
> > +        .msg_iovlen = 1,
> > +        .msg_control = control,
> > +        .msg_controllen = sizeof(control),
> > +    };
> > +    size_t fd_size;
> > +    struct cmsghdr *cmsg;
> > +    int rc;
> > +    char buffer[100];
> > +    VuClient *client = container_of(vu_dev, VuClient, parent);
> > +    QIOChannel *ioc = client->ioc;
> > +    do {
> > +        rc = recvmsg(conn_fd, &msg, 0);
>
> This should certainly use qio_channel_readv_full() rather than working
> directly with a socket fd?
>
> > +        if (rc < 0) {
> > +            if (errno == EAGAIN) {
> > +                if (qemu_in_coroutine()) {
> > +                    qio_channel_yield(ioc, G_IO_IN);
> > +                } else {
> > +                    qio_channel_wait(ioc, G_IO_IN);
> > +                }
> > +                continue;
> > +            } else if (errno == EINTR) {
> > +                continue;
> > +            }
> > +        }
> > +        break;
> > +    } while (true);
> > +
> > +    if (rc < 0) {
> > +        sprintf(buffer, "Error while recvmsg: %s", strerror(errno));
> > +        vub_panic_cb(vu_dev, buffer);
> > +        return false;
> > +    }
> > +
> > +    assert(rc == VHOST_USER_HDR_SIZE || rc == 0);
>
> Why do you think that there can't be short reads?
>
> > +    vmsg->fd_num = 0;
> > +    for (cmsg = CMSG_FIRSTHDR(&msg);
> > +         cmsg != NULL;
> > +         cmsg = CMSG_NXTHDR(&msg, cmsg))
> > +    {
> > +        if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type ==
> SCM_RIGHTS) {
> > +            fd_size = cmsg->cmsg_len - CMSG_LEN(0);
> > +            vmsg->fd_num = fd_size / sizeof(int);
> > +            memcpy(vmsg->fds, CMSG_DATA(cmsg), fd_size);
> > +            break;
> > +        }
> > +    }
>
> I think the fd passing part becomes easier when you use the proper
> qio_channel_readv_full() function. Its implementation is also a bit more
> careful than yours. For example, you forgot to check fd_size against
> VHOST_MEMORY_MAX_NREGIONS, allowing a buffer overflow in the memcpy(),
> and you don't adjust fd flags for the new file descriptors.
>
> > +    if (vmsg->size > sizeof(vmsg->payload)) {
> > +        sprintf(buffer,
> > +                "Error: too big message request: %d, size: vmsg->size:
> %u, "
> > +                "while sizeof(vmsg->payload) = %zu\n",
> > +                vmsg->request, vmsg->size, sizeof(vmsg->payload));
> > +        vub_panic_cb(vu_dev, buffer);
> > +        goto fail;
> > +    }
> > +
> > +    if (vmsg->size) {
> > +        do {
> > +            rc = read(conn_fd, &vmsg->payload, vmsg->size);
>
> qio_channel_readv_all_eof() already implements this whole loop and
> correctly handles short reads, too.
>
> > +            if (rc < 0) {
> > +                if (errno == EAGAIN) {
> > +                    if (qemu_in_coroutine()) {
> > +                        qio_channel_yield(ioc, G_IO_IN);
> > +                    } else {
> > +                        qio_channel_wait(ioc, G_IO_IN);
> > +                    }
> > +                    continue;
> > +                } else if (errno == EINTR) {
> > +                    continue;
> > +                }
> > +            }
> > +            break;
> > +        } while (true);
> > +
> > +        if (rc <= 0) {
> > +            sprintf(buffer, "Error while reading: %s", strerror(errno));
> > +            vub_panic_cb(vu_dev, buffer);
> > +            goto fail;
> > +        }
> > +
> > +        assert(rc == vmsg->size);
> > +    }
> > +
> > +    return true;
> > +
> > +fail:
> > +    vmsg_close_fds(vmsg);
> > +
> > +    return false;
> > +}
> > +
> > +static void vub_kick_cb(void *opaque)
> > +{
> > +    vu_watch_cb_data *data = (vu_watch_cb_data *) opaque;
> > +    int index = data->index;
> > +    VuDev *dev = data->vu_dev;
> > +    VuVirtq *vq = &dev->vq[index];
> > +    int sock = vq->kick_fd;
> > +    eventfd_t kick_data;
> > +    ssize_t rc;
> > +
> > +    rc = eventfd_read(sock, &kick_data);
> > +    if (rc == -1) {
> > +        char buffer[100];
> > +        sprintf(buffer, "kick eventfd_read(): %s", strerror(errno));
> > +        vub_panic_cb(dev, buffer);
> > +        g_free(data);
> > +        dev->remove_watch(dev, dev->vq[index].kick_fd);
> > +    } else {
> > +        if (vq->handler) {
> > +            vq->handler(dev, index);
> > +        }
> > +    }
> > +}
> > +
> > +static const VuDevIface vub_iface = {
> > +    .get_features = vub_get_features,
> > +    .queue_set_started = vub_queue_set_started,
> > +    .get_protocol_features = vub_get_protocol_features,
> > +    .get_config = vub_get_config,
> > +    .set_config = vub_set_config,
> > +    .process_msg = vub_process_msg,
> > +    .read_msg = vu_message_read_co,
> > +    .kick_callback = vub_kick_cb,
> > +};
>
> I would prefer the = signs to be aligned to the same column.
>
> > +
> > +void vub_free(VubDev *vub_dev, bool called_by_QOM)
> > +{
> > +    if (!vub_dev) {
> > +        return;
> > +    }
> > +
> > +    blk_unref(vub_dev->backend);
> > +    g_free(vub_dev->name);
> > +    g_free(vub_dev->unix_socket);
> > +
> > +    if (vub_dev->next.tqe_circ.tql_prev) {
> > +        /*
> > +         * if vub_dev->next.tqe_circ.tql_prev = null,
> > +         * vub_dev hasn't been inserted into the queue and
> > +         * vub_free is called by obj->instance_finalize.
> > +         */
> > +        QTAILQ_REMOVE(&vub_devs, vub_dev, next);
> > +    }
> > +    /*
> > +     * Needn't to free vub_dev if called by QOM
> > +     * because QOM will do the clean-up work.
> > +     */
> > +    if (!called_by_QOM) {
> > +        g_free(vub_dev);
> > +    }
> > +}
> > +
> > +static coroutine_fn void vu_client_trip(void *opaque)
> > +{
> > +    VuClient *client = opaque;
> > +
> > +    while (!client->closed) {
> > +        vu_dispatch(&client->parent);
> > +    }
> > +
> > +    QTAILQ_REMOVE(&client->blk->clients, client, next);
> > +
>
> Extra empty line.
>
> > +}
> > +
> > +static void vu_client_start(VuClient *client)
> > +{
> > +    Coroutine *co = qemu_coroutine_create(vu_client_trip, client);
> > +    qemu_coroutine_enter(co);
> > +}
> > +
> > +
> > +G_STATIC_ASSERT((int)G_IO_IN == (int)VU_WATCH_IN);
> > +G_STATIC_ASSERT((int)G_IO_OUT == (int)VU_WATCH_OUT);
> > +G_STATIC_ASSERT((int)G_IO_PRI == (int)VU_WATCH_PRI);
> > +G_STATIC_ASSERT((int)G_IO_ERR == (int)VU_WATCH_ERR);
> > +G_STATIC_ASSERT((int)G_IO_HUP == (int)VU_WATCH_HUP);
> > +
> > +static void
> > +set_watch(VuDev *vu_dev, int fd, int vu_evt,
> > +          vu_watch_cb_packed_data cb, void *pvt)
> > +{
> > +    /*
> > +     * since aio_dispatch can only pass one user data pointer to the
> > +     * callback function, pack VuDev, pvt into a struct
> > +     */
> > +    VuClient *client;
> > +
> > +    g_assert(vu_dev);
> > +    g_assert(fd >= 0);
> > +    g_assert(cb);
> > +    client = container_of(vu_dev, VuClient, parent);
> > +    vu_watch_cb_data *cb_data = g_new0(vu_watch_cb_data, 1);
> > +    cb_data->index = (intptr_t) pvt;
> > +    cb_data->vu_dev = vu_dev;
> > +    aio_set_fd_handler(client->blk->ctx, fd, false, (void *) cb,
> > +                       NULL, NULL, cb_data);
> > +}
> > +
> > +
> > +void vub_accept(QIONetListener *listener, QIOChannelSocket *sioc,
> > +                gpointer opaque)
> > +{
> > +    VuClient *client;
> > +    VubDev *vub_device = opaque;
> > +    client = g_new0(VuClient, 1);
> > +
> > +    if (!vu_init_packed_data(&client->parent, VHOST_USER_BLK_MAX_QUEUES,
> > +                             sioc->fd, vub_panic_cb, set_watch,
> > +                             remove_watch, &vub_iface)) {
> > +        fprintf(stderr, "Failed to initialized libvhost-user\n");
> > +        g_free(client);
> > +        return;
> > +    }
> > +
> > +    client->blk = vub_device;
> > +    client->refcount = 1;
> > +    client->sioc = sioc;
> > +    /*
> > +     * increase the object reference, so cioc will not freed by
> > +     * qio_net_listener_channel_func which will call
> object_unref(OBJECT(sioc))
> > +     */
> > +    object_ref(OBJECT(client->sioc));
> > +    qio_channel_set_name(QIO_CHANNEL(sioc), "vhost-user client");
> > +    client->ioc = QIO_CHANNEL(sioc);
> > +    object_ref(OBJECT(client->ioc));
> > +    object_ref(OBJECT(sioc));
> > +
> > +    qio_channel_set_blocking(QIO_CHANNEL(client->sioc), false, NULL);
> > +    client->closed = false;
> > +    QTAILQ_INSERT_TAIL(&client->blk->clients, client, next);
> > +    vu_client_start(client);
> > +}
> > +
> > +
> > +void
> > +vub_initialize_config(BlockDriverState *bs, struct virtio_blk_config
> *config)
> > +{
> > +    config->capacity = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
> > +    config->blk_size = BDRV_SECTOR_SIZE;
> > +    config->size_max = 65536;
> > +    config->seg_max = 128 - 2;
> > +    config->min_io_size = 1;
> > +    config->opt_io_size = 1;
> > +    config->num_queues = 1;
> > +    #if defined(__linux__) && defined(BLKDISCARD) && defined(BLKZEROOUT)
> > +    config->max_discard_sectors = 32768;
> > +    config->max_discard_seg = 1;
> > +    config->discard_sector_alignment = config->blk_size >> 9;
> > +    config->max_write_zeroes_sectors = 32768;
> > +    config->max_write_zeroes_seg = 1;
> > +    #endif
> > +}
> > +
> > +
> > +static VubDev *vub_new(VubDev *vub_device, const char *name,
> > +                       const char *unix_socket, bool writable, Error
> **errp)
> > +{
> > +
> > +    BlockBackend *blk;
> > +
> > +    /*
> > +     * Don't allow resize while the vhost user server is running,
> > +     * otherwise we don't care what happens with the node.
> > +     */
> > +    uint64_t perm = BLK_PERM_CONSISTENT_READ;
> > +    int ret;
> > +
> > +    AioContext *ctx;
> > +
> > +    BlockDriverState *bs = bdrv_lookup_bs(name,
> > +                                          name,
> > +                                          errp);
>
> This fits in a single line.
>
> > +
> > +    if (!bs) {
> > +        error_setg(errp,
> > +                   "No drive with name '%s'."
> > +                   " Please find the list of names with "
> > +                   "'info block'", name);
>
> This can probably be two lines instead of four.
>
> > +        return NULL;
> > +    }
> > +
> > +    if (bdrv_is_read_only(bs)) {
> > +        writable = false;
> > +    }
> > +
> > +    if (writable) {
> > +        perm |= BLK_PERM_WRITE;
> > +    }
> > +
> > +    ctx = bdrv_get_aio_context(bs);
> > +    aio_context_acquire(ctx);
> > +    bdrv_invalidate_cache(bs, NULL);
> > +    aio_context_release(ctx);
> > +
> > +    blk = blk_new(bdrv_get_aio_context(bs), perm,
> > +                  BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE_UNCHANGED |
> > +                  BLK_PERM_WRITE | BLK_PERM_GRAPH_MOD);
> > +    ret = blk_insert_bs(blk, bs, errp);
> > +
> > +    if (ret < 0) {
> > +        goto fail;
> > +    }
> > +
> > +
> > +    blk_set_enable_write_cache(blk, false);
> > +
> > +    blk_set_allow_aio_context_change(blk, true);
> > +
> > +
> > +    vub_device->name = g_strdup(name);
> > +    vub_device->unix_socket = g_strdup(unix_socket);
> > +    vub_device->writable = writable;
> > +    vub_device->blkcfg.wce = 0;
> > +    vub_device->backend = blk;
> > +    vub_device->ctx = ctx;
> > +    vub_initialize_config(bs, &vub_device->blkcfg);
> > +    return vub_device;
> > +
> > +fail:
> > +    blk_unref(blk);
> > +    return NULL;
> > +}
> > +
> > +void vhost_user_server_free(VubDev *vub_device, bool called_by_QOM)
> > +{
> > +    if (!vub_device) {
> > +        return;
> > +    }
> > +
> > +    VuClient *client, *next;
> > +    QTAILQ_FOREACH_SAFE(client, &vub_device->clients, next, next) {
> > +        if (!client->closed) {
> > +            close_client(client);
> > +        }
> > +    }
> > +
> > +    if (vub_device->listener) {
> > +        qio_net_listener_disconnect(vub_device->listener);
> > +        object_unref(OBJECT(vub_device->listener));
> > +    }
> > +    vub_free(vub_device, called_by_QOM);
> > +
> > +}
> > +
> > +
> > +VubDev *vub_dev_find(const char *name)
> > +{
> > +    VubDev *vub_device;
> > +    QTAILQ_FOREACH(vub_device, &vub_devs, next) {
> > +        if (strcmp(name, vub_device->name) == 0) {
> > +            return vub_device;
> > +        }
> > +    }
> > +
> > +    return NULL;
> > +}
> > +
> > +
> > +static VubDev *vub_dev_find_by_unix_socket(const char *unix_socket)
> > +{
> > +    VubDev *vub_device;
> > +    QTAILQ_FOREACH(vub_device, &vub_devs, next) {
> > +        if (strcmp(unix_socket, vub_device->unix_socket) == 0) {
> > +            return vub_device;
> > +        }
> > +    }
> > +
> > +    return NULL;
> > +}
> > +
> > +static void vhost_user_server_start(VubDev *vub_device, const char
> *unix_socket,
> > +                                    const char *name, bool writable,
> > +                                    Error **errp)
> > +{
> > +
> > +    if (vub_dev_find(name) || vub_dev_find_by_unix_socket(unix_socket))
> {
> > +        error_setg(errp, "Vhost user server with name '%s' or "
> > +                "with socket_path '%s' has already been started",
> > +                name, unix_socket);
> > +        return;
> > +    }
> > +
> > +
> > +    if (!vub_new(vub_device, name, unix_socket, writable, errp)) {
> > +        return;
> > +    }
> > +
> > +
> > +    vub_device->listener = qio_net_listener_new();
> > +
> > +    qio_net_listener_set_name(vub_device->listener,
> > +                              "vhost-user-backend-listener");
> > +
> > +    SocketAddress *addr = g_new0(SocketAddress, 1);
> > +    addr->u.q_unix.path = (char *) unix_socket;
> > +    addr->type = SOCKET_ADDRESS_TYPE_UNIX;
> > +    if (qio_net_listener_open_sync(vub_device->listener, addr, 1, errp)
> < 0) {
> > +        goto error;
> > +    }
> > +
> > +
> > +    QTAILQ_INSERT_TAIL(&vub_devs, vub_device, next);
> > +    QTAILQ_INIT(&vub_device->clients);
> > +
> > +    qio_net_listener_set_client_func(vub_device->listener,
> > +                                     vub_accept,
> > +                                     vub_device,
> > +                                     NULL);
> > +
> > +    return;
> > +
> > + error:
> > +    vub_free(vub_device, false);
> > +}
> > +
> > +static void vu_set_block_name(Object *obj, const char *value,
> > +                                           Error **errp)
> > +{
> > +    VubDev *vus = VHOST_USER_SERVER(obj);;
> > +
> > +    if (vus->name) {
> > +        error_setg(errp, "evdev property already set");
> > +        return;
> > +    }
> > +
> > +    vus->name = g_strdup(value);
> > +}
> > +
> > +static char *vu_get_block_name(Object *obj, Error **errp)
> > +{
> > +    VubDev *vus = VHOST_USER_SERVER(obj);
> > +    return g_strdup(vus->name);
> > +}
> > +
> > +
> > +static void vu_set_unix_socket(Object *obj, const char *value,
> > +                                            Error **errp)
> > +{
> > +    VubDev *vus = VHOST_USER_SERVER(obj);;
> > +
> > +    if (vus->unix_socket) {
> > +        error_setg(errp, "unix_socket property already set");
> > +        return;
> > +    }
> > +
> > +    vus->unix_socket = g_strdup(value);
> > +    vhost_user_server_start(vus, value, vus->name,
> > +                            vus->writable, errp);
> > +}
>
> This makes the unix-socket property magic in that it starts the server
> with the properties specified at this point. This means that this
> property must always be specified last.
>
> Maybe it would be better to use a boolean property (similar to qdev's
> "realized") that explicitly starts and possibly stops the export.
>
> Writing to other properties should probably result in an error while the
> server is already running because these property changes won't take
> effect any more then.
>
> > +static char *vu_get_unix_socket(Object *obj, Error **errp)
> > +{
> > +    VubDev *vus = VHOST_USER_SERVER(obj);;
> > +    return g_strdup(vus->unix_socket);
> > +}
> > +
> > +static bool vu_get_block_writable(Object *obj, Error **errp)
> > +{
> > +    VubDev *vus = VHOST_USER_SERVER(obj);;
> > +    return vus->writable;
> > +}
> > +
> > +static void vu_set_block_writable(Object *obj, bool value, Error **errp)
> > +{
> > +    VubDev *vus = VHOST_USER_SERVER(obj);
> > +
> > +    vus->writable = value;
> > +}
> > +
> > +static void vhost_user_server_instance_init(Object *obj)
> > +{
> > +
> > +    object_property_add_bool(obj, "writable",
> > +                            vu_get_block_writable,
> > +                            vu_set_block_writable, NULL);
> > +
> > +    object_property_add_str(obj, "name",
> > +                            vu_get_block_name,
> > +                            vu_set_block_name, NULL);
> > +
> > +    object_property_add_str(obj, "unix_socket",
> > +                            vu_get_unix_socket,
> > +                            vu_set_unix_socket, NULL);
>
> These should probably be object_class_property_add_*() and be called in
> .class_init rather than .instance_init.
>
> "name" suggests that it's the name of the export rather than the block
> device to be exported. I would suggest "node-name" (and then actually
> only pass it as node-name to bdrv_lookup_bs()).
>
> I expect that in the long run, we'll want to accept a full SocketAddress
> rather than just a filename like in "unix_socket".
>
> > +}
> > +
> > +static void vhost_user_server_instance_finalize(Object *obj)
> > +{
> > +    VubDev *vus = VHOST_USER_SERVER(obj);
> > +    vhost_user_server_free(vus, true);
> > +    /* object_del shouldn't free this object struct */
> > +    obj->free = NULL;
> > +}
> > +
> > +static const TypeInfo vhost_user_server_info = {
> > +    .name = TYPE_VHOST_USER_SERVER,
> > +    .parent = TYPE_OBJECT,
> > +    .instance_size = sizeof(VuDev),
> > +    .instance_init = vhost_user_server_instance_init,
> > +    .instance_finalize = vhost_user_server_instance_finalize,
> > +    .interfaces = (InterfaceInfo[]) {
> > +        {TYPE_USER_CREATABLE},
> > +        {}
> > +    },
> > +};
> > +
> > +static void vhost_user_server_register_types(void)
> > +{
> > +    type_register_static(&vhost_user_server_info);
> > +}
> > +
> > +type_init(vhost_user_server_register_types)
> > +
>
> Extra empty line at the file end.
>
> In summary, I can see this going in the right direction, though in
> detail some more work will be needed.
>
> Kevin
>
>

-- 
*Best regards,*
*Coiby*

[-- Attachment #2: Type: text/html, Size: 51243 bytes --]

^ permalink raw reply	[flat|nested] 19+ messages in thread

end of thread, other threads:[~2020-02-20  9:32 UTC | newest]

Thread overview: 19+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-01-14 14:06 [PATCH v2 0/5] vhost-user block device backend implementation Coiby Xu
2020-01-14 14:06 ` [PATCH v2 1/5] vhost-user block device backend Coiby Xu
2020-01-16 13:51   ` Stefan Hajnoczi
2020-01-16 14:20     ` Kevin Wolf
2020-02-14  3:07     ` Coiby Xu
2020-01-16 13:56   ` Kevin Wolf
2020-02-20  7:04     ` Coiby Xu
2020-02-20  9:30     ` Coiby Xu
2020-01-14 14:06 ` [PATCH v2 2/5] extend libvhost to support IOThread Coiby Xu
2020-01-14 14:06 ` [PATCH v2 3/5] a standone-alone tool to directly share disk image file via vhost-user protocol Coiby Xu
2020-01-16 14:04   ` Stefan Hajnoczi
2020-01-17  8:12     ` Coiby Xu
2020-01-17 10:11       ` Kevin Wolf
2020-01-31 16:42         ` Coiby Xu
2020-02-02  9:33           ` Kevin Wolf
2020-02-13  1:00             ` Coiby Xu
2020-01-14 14:06 ` [PATCH v2 4/5] new qTest case for the vhost-user-blk device backend Coiby Xu
2020-01-14 14:06 ` [PATCH v2 5/5] building configuration files changes Coiby Xu
2020-01-16 11:07   ` Kevin Wolf

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).