All of lore.kernel.org
 help / color / mirror / Atom feed
From: Yongji Xie <elohimes@gmail.com>
To: Jason Wang <jasowang@redhat.com>
Cc: "Michael S. Tsirkin" <mst@redhat.com>,
	"Marc-André Lureau" <marcandre.lureau@redhat.com>,
	"Daniel P. Berrangé" <berrange@redhat.com>,
	"Coquelin, Maxime" <maxime.coquelin@redhat.com>,
	"Yury Kotov" <yury-kotov@yandex-team.ru>,
	"Евгений Яковлев" <wrfsh@yandex-team.ru>,
	qemu-devel <qemu-devel@nongnu.org>,
	zhangyu31@baidu.com, chaiwen@baidu.com, nixun@baidu.com,
	lilin24@baidu.com, "Xie Yongji" <xieyongji@baidu.com>
Subject: Re: [Qemu-devel] [PATCH v4 for-4.0 4/7] libvhost-user: Support tracking inflight I/O in shared memory
Date: Fri, 11 Jan 2019 14:10:46 +0800	[thread overview]
Message-ID: <CAONzpcY6pdSz65Tzj_Ws0M4vskUOXMuaxQ-uYz6NnCvdMgkFOA@mail.gmail.com> (raw)
In-Reply-To: <ce11d062-1d64-3d35-b617-5c135f1506eb@redhat.com>

On Fri, 11 Jan 2019 at 11:56, Jason Wang <jasowang@redhat.com> wrote:
>
>
> On 2019/1/9 下午7:27, elohimes@gmail.com wrote:
> > From: Xie Yongji <xieyongji@baidu.com>
> >
> > This patch adds support for VHOST_USER_GET_INFLIGHT_FD and
> > VHOST_USER_SET_INFLIGHT_FD message to set/get shared memory
> > to/from qemu. Then we maintain a "bitmap" of all descriptors in
> > the shared memory for each queue to track inflight I/O.
> >
> > Signed-off-by: Xie Yongji <xieyongji@baidu.com>
> > Signed-off-by: Zhang Yu <zhangyu31@baidu.com>
> > ---
> >   Makefile                              |   2 +-
> >   contrib/libvhost-user/libvhost-user.c | 258 ++++++++++++++++++++++++--
> >   contrib/libvhost-user/libvhost-user.h |  29 +++
> >   3 files changed, 268 insertions(+), 21 deletions(-)
> >
> > diff --git a/Makefile b/Makefile
> > index dd53965f77..b5c9092605 100644
> > --- a/Makefile
> > +++ b/Makefile
> > @@ -473,7 +473,7 @@ Makefile: $(version-obj-y)
> >   # Build libraries
> >
> >   libqemuutil.a: $(util-obj-y) $(trace-obj-y) $(stub-obj-y)
> > -libvhost-user.a: $(libvhost-user-obj-y)
> > +libvhost-user.a: $(libvhost-user-obj-y) $(util-obj-y) $(stub-obj-y)
> >
> >   ######################################################################
> >
> > diff --git a/contrib/libvhost-user/libvhost-user.c b/contrib/libvhost-user/libvhost-user.c
> > index 23bd52264c..e73ce04619 100644
> > --- a/contrib/libvhost-user/libvhost-user.c
> > +++ b/contrib/libvhost-user/libvhost-user.c
> > @@ -41,6 +41,8 @@
> >   #endif
> >
> >   #include "qemu/atomic.h"
> > +#include "qemu/osdep.h"
> > +#include "qemu/memfd.h"
> >
> >   #include "libvhost-user.h"
> >
> > @@ -53,6 +55,18 @@
> >               _min1 < _min2 ? _min1 : _min2; })
> >   #endif
> >
> > +/* Round number down to multiple */
> > +#define ALIGN_DOWN(n, m) ((n) / (m) * (m))
> > +
> > +/* Round number up to multiple */
> > +#define ALIGN_UP(n, m) ALIGN_DOWN((n) + (m) - 1, (m))
> > +
> > +/* Align each region to cache line size in inflight buffer */
> > +#define INFLIGHT_ALIGNMENT 64
> > +
> > +/* The version of inflight buffer */
> > +#define INFLIGHT_VERSION 1
> > +
> >   #define VHOST_USER_HDR_SIZE offsetof(VhostUserMsg, payload.u64)
> >
> >   /* The version of the protocol we support */
> > @@ -66,6 +80,20 @@
> >           }                                       \
> >       } while (0)
> >
> > +static inline
> > +bool has_feature(uint64_t features, unsigned int fbit)
> > +{
> > +    assert(fbit < 64);
> > +    return !!(features & (1ULL << fbit));
> > +}
> > +
> > +static inline
> > +bool vu_has_feature(VuDev *dev,
> > +                    unsigned int fbit)
> > +{
> > +    return has_feature(dev->features, fbit);
> > +}
> > +
> >   static const char *
> >   vu_request_to_string(unsigned int req)
> >   {
> > @@ -100,6 +128,8 @@ vu_request_to_string(unsigned int req)
> >           REQ(VHOST_USER_POSTCOPY_ADVISE),
> >           REQ(VHOST_USER_POSTCOPY_LISTEN),
> >           REQ(VHOST_USER_POSTCOPY_END),
> > +        REQ(VHOST_USER_GET_INFLIGHT_FD),
> > +        REQ(VHOST_USER_SET_INFLIGHT_FD),
> >           REQ(VHOST_USER_MAX),
> >       };
> >   #undef REQ
> > @@ -890,6 +920,41 @@ vu_check_queue_msg_file(VuDev *dev, VhostUserMsg *vmsg)
> >       return true;
> >   }
> >
> > +static int
> > +vu_check_queue_inflights(VuDev *dev, VuVirtq *vq)
> > +{
> > +    int i = 0;
> > +
> > +    if (!has_feature(dev->protocol_features,
> > +        VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)) {
> > +        return 0;
> > +    }
> > +
> > +    if (unlikely(!vq->inflight)) {
> > +        return -1;
> > +    }
> > +
> > +    vq->used_idx = vq->vring.used->idx;
> > +    vq->inflight_num = 0;
> > +    for (i = 0; i < vq->vring.num; i++) {
> > +        if (vq->inflight->desc[i] == 0) {
> > +            continue;
> > +        }
> > +
> > +        vq->inflight_desc[vq->inflight_num++] = i;
> > +        vq->inuse++;
> > +    }
> > +    vq->shadow_avail_idx = vq->last_avail_idx = vq->inuse + vq->used_idx;
> > +
> > +    /* in case of I/O hang after reconnecting */
> > +    if (eventfd_write(vq->kick_fd, 1) ||
> > +        eventfd_write(vq->call_fd, 1)) {
> > +        return -1;
> > +    }
> > +
> > +    return 0;
> > +}
> > +
> >   static bool
> >   vu_set_vring_kick_exec(VuDev *dev, VhostUserMsg *vmsg)
> >   {
> > @@ -925,6 +990,10 @@ vu_set_vring_kick_exec(VuDev *dev, VhostUserMsg *vmsg)
> >                  dev->vq[index].kick_fd, index);
> >       }
> >
> > +    if (vu_check_queue_inflights(dev, &dev->vq[index])) {
> > +        vu_panic(dev, "Failed to check inflights for vq: %d\n", index);
> > +    }
> > +
> >       return false;
> >   }
> >
> > @@ -1215,6 +1284,117 @@ vu_set_postcopy_end(VuDev *dev, VhostUserMsg *vmsg)
> >       return true;
> >   }
> >
> > +static bool
> > +vu_get_inflight_fd(VuDev *dev, VhostUserMsg *vmsg)
> > +{
> > +    int fd;
> > +    void *addr;
> > +    uint64_t mmap_size;
> > +
> > +    if (vmsg->size != sizeof(vmsg->payload.inflight)) {
> > +        vu_panic(dev, "Invalid get_inflight_fd message:%d", vmsg->size);
> > +        vmsg->payload.inflight.mmap_size = 0;
> > +        return true;
> > +    }
> > +
> > +    DPRINT("set_inflight_fd num_queues: %"PRId16"\n",
> > +           vmsg->payload.inflight.num_queues);
> > +
> > +    mmap_size = vmsg->payload.inflight.num_queues *
> > +                ALIGN_UP(sizeof(VuVirtqInflight), INFLIGHT_ALIGNMENT);
> > +
> > +    addr = qemu_memfd_alloc("vhost-inflight", mmap_size,
> > +                            F_SEAL_GROW | F_SEAL_SHRINK | F_SEAL_SEAL,
> > +                            &fd, NULL);
> > +
> > +    if (!addr) {
> > +        vu_panic(dev, "Failed to alloc vhost inflight area");
> > +        vmsg->payload.inflight.mmap_size = 0;
> > +        return true;
> > +    }
> > +
> > +    dev->inflight_info.addr = addr;
> > +    dev->inflight_info.size = vmsg->payload.inflight.mmap_size = mmap_size;
> > +    vmsg->payload.inflight.mmap_offset = 0;
> > +    vmsg->payload.inflight.align = INFLIGHT_ALIGNMENT;
> > +    vmsg->payload.inflight.version = INFLIGHT_VERSION;
> > +    vmsg->fd_num = 1;
> > +    dev->inflight_info.fd = vmsg->fds[0] = fd;
> > +
> > +    DPRINT("send inflight mmap_size: %"PRId64"\n",
> > +           vmsg->payload.inflight.mmap_size);
> > +    DPRINT("send inflight mmap offset: %"PRId64"\n",
> > +           vmsg->payload.inflight.mmap_offset);
> > +    DPRINT("send inflight align: %"PRId32"\n",
> > +           vmsg->payload.inflight.align);
> > +    DPRINT("send inflight version: %"PRId16"\n",
> > +           vmsg->payload.inflight.version);
> > +
> > +    return true;
> > +}
> > +
> > +static bool
> > +vu_set_inflight_fd(VuDev *dev, VhostUserMsg *vmsg)
> > +{
> > +    int fd, i;
> > +    uint64_t mmap_size, mmap_offset;
> > +    uint32_t align;
> > +    uint16_t num_queues, version;
> > +    void *rc;
> > +
> > +    if (vmsg->fd_num != 1 ||
> > +        vmsg->size != sizeof(vmsg->payload.inflight)) {
> > +        vu_panic(dev, "Invalid set_inflight_fd message size:%d fds:%d",
> > +                 vmsg->size, vmsg->fd_num);
> > +        return false;
> > +    }
> > +
> > +    fd = vmsg->fds[0];
> > +    mmap_size = vmsg->payload.inflight.mmap_size;
> > +    mmap_offset = vmsg->payload.inflight.mmap_offset;
> > +    align = vmsg->payload.inflight.align;
> > +    num_queues = vmsg->payload.inflight.num_queues;
> > +    version = vmsg->payload.inflight.version;
> > +
> > +    DPRINT("set_inflight_fd mmap_size: %"PRId64"\n", mmap_size);
> > +    DPRINT("set_inflight_fd mmap_offset: %"PRId64"\n", mmap_offset);
> > +    DPRINT("set_inflight_fd align: %"PRId32"\n", align);
> > +    DPRINT("set_inflight_fd num_queues: %"PRId16"\n", num_queues);
> > +    DPRINT("set_inflight_fd version: %"PRId16"\n", version);
> > +
> > +    rc = mmap(0, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED,
> > +              fd, mmap_offset);
> > +
> > +    if (rc == MAP_FAILED) {
> > +        vu_panic(dev, "set_inflight_fd mmap error: %s", strerror(errno));
> > +        return false;
> > +    }
> > +
> > +    if (version != INFLIGHT_VERSION) {
> > +        vu_panic(dev, "Invalid set_inflight_fd version: %d", version);
> > +        return false;
> > +    }
> > +
> > +    if (dev->inflight_info.fd) {
> > +        close(dev->inflight_info.fd);
> > +    }
> > +
> > +    if (dev->inflight_info.addr) {
> > +        munmap(dev->inflight_info.addr, dev->inflight_info.size);
> > +    }
> > +
> > +    dev->inflight_info.fd = fd;
> > +    dev->inflight_info.addr = rc;
> > +    dev->inflight_info.size = mmap_size;
> > +
> > +    for (i = 0; i < num_queues; i++) {
> > +        dev->vq[i].inflight = (VuVirtqInflight *)rc;
> > +        rc = (void *)((char *)rc + ALIGN_UP(sizeof(VuVirtqInflight), align));
> > +    }
> > +
> > +    return false;
> > +}
> > +
> >   static bool
> >   vu_process_message(VuDev *dev, VhostUserMsg *vmsg)
> >   {
> > @@ -1292,6 +1472,10 @@ vu_process_message(VuDev *dev, VhostUserMsg *vmsg)
> >           return vu_set_postcopy_listen(dev, vmsg);
> >       case VHOST_USER_POSTCOPY_END:
> >           return vu_set_postcopy_end(dev, vmsg);
> > +    case VHOST_USER_GET_INFLIGHT_FD:
> > +        return vu_get_inflight_fd(dev, vmsg);
> > +    case VHOST_USER_SET_INFLIGHT_FD:
> > +        return vu_set_inflight_fd(dev, vmsg);
> >       default:
> >           vmsg_close_fds(vmsg);
> >           vu_panic(dev, "Unhandled request: %d", vmsg->request);
> > @@ -1359,8 +1543,18 @@ vu_deinit(VuDev *dev)
> >               close(vq->err_fd);
> >               vq->err_fd = -1;
> >           }
> > +        vq->inflight = NULL;
> >       }
> >
> > +    if (dev->inflight_info.addr) {
> > +        munmap(dev->inflight_info.addr, dev->inflight_info.size);
> > +        dev->inflight_info.addr = NULL;
> > +    }
> > +
> > +    if (dev->inflight_info.fd > 0) {
> > +        close(dev->inflight_info.fd);
> > +        dev->inflight_info.fd = -1;
> > +    }
> >
> >       vu_close_log(dev);
> >       if (dev->slave_fd != -1) {
> > @@ -1687,20 +1881,6 @@ vu_queue_empty(VuDev *dev, VuVirtq *vq)
> >       return vring_avail_idx(vq) == vq->last_avail_idx;
> >   }
> >
> > -static inline
> > -bool has_feature(uint64_t features, unsigned int fbit)
> > -{
> > -    assert(fbit < 64);
> > -    return !!(features & (1ULL << fbit));
> > -}
> > -
> > -static inline
> > -bool vu_has_feature(VuDev *dev,
> > -                    unsigned int fbit)
> > -{
> > -    return has_feature(dev->features, fbit);
> > -}
> > -
> >   static bool
> >   vring_notify(VuDev *dev, VuVirtq *vq)
> >   {
> > @@ -1829,12 +2009,6 @@ virtqueue_map_desc(VuDev *dev,
> >       *p_num_sg = num_sg;
> >   }
> >
> > -/* Round number down to multiple */
> > -#define ALIGN_DOWN(n, m) ((n) / (m) * (m))
> > -
> > -/* Round number up to multiple */
> > -#define ALIGN_UP(n, m) ALIGN_DOWN((n) + (m) - 1, (m))
> > -
> >   static void *
> >   virtqueue_alloc_element(size_t sz,
> >                                        unsigned out_num, unsigned in_num)
> > @@ -1935,9 +2109,44 @@ vu_queue_map_desc(VuDev *dev, VuVirtq *vq, unsigned int idx, size_t sz)
> >       return elem;
> >   }
> >
> > +static int
> > +vu_queue_inflight_get(VuDev *dev, VuVirtq *vq, int desc_idx)
> > +{
> > +    if (!has_feature(dev->protocol_features,
> > +        VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)) {
> > +        return 0;
> > +    }
> > +
> > +    if (unlikely(!vq->inflight)) {
> > +        return -1;
> > +    }
> > +
>
>
> Just wonder what happens if backend get killed at this point?
>

We will re-caculate last_avail_idx like: last_avail_idx = inuse + used_idx

At this point, backend could consume this entry correctly after reconnect.

> You want to survive from the backend crash but you still depend on
> backend to get and put inflight descriptors which seems somehow conflict.
>

But if backend get killed in vu_queue_inflight_put(), I think you are
right, there is something conflict. One descriptor is consumed by
guest but still marked as inused in inflight buffer. Then we will
re-send this old descriptor after restart.

Maybe we can add something like that to fix this issue:

void vu_queue_push()
{
    vq->inflight->elem_idx = elem->idx;
    vu_queue_fill();
    vu_queue_flush();
    vq->inflight->desc[elem->idx] = 0;
    vq->inflight->used_idx = vq->vring.used->idx;
}

static int vu_check_queue_inflights()
{
    ....
    if (vq->inflight->used_idx != vq->vring.used->idx) {
        /* Crash in vu_queue_push() */
        vq->inflight->desc[vq->inflight->elem_idx] = 0;
    }
    ....
}

Thanks,
Yongji

  reply	other threads:[~2019-01-11  6:11 UTC|newest]

Thread overview: 54+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-01-09 11:27 [Qemu-devel] [PATCH v4 for-4.0 0/7] vhost-user-blk: Add support for backend reconnecting elohimes
2019-01-09 11:27 ` [Qemu-devel] [PATCH v4 for-4.0 1/7] char-socket: Enable "nowait" option on client sockets elohimes
2019-01-10 12:49   ` Daniel P. Berrangé
2019-01-10 13:19     ` Yongji Xie
2019-01-10 13:24       ` Daniel P. Berrangé
2019-01-10 14:08         ` Yongji Xie
2019-01-10 14:11           ` Daniel P. Berrangé
2019-01-10 14:29             ` Yongji Xie
2019-01-10 16:41               ` Daniel P. Berrangé
2019-01-11  7:50                 ` Yongji Xie
2019-01-11  8:32                   ` Daniel P. Berrangé
2019-01-11  8:36                     ` Yongji Xie
2019-01-15 15:39                       ` Daniel P. Berrangé
2019-01-15 16:53                         ` Yury Kotov
2019-01-15 17:15                           ` Daniel P. Berrangé
2019-01-16  5:39                         ` Yongji Xie
2019-01-09 11:27 ` [Qemu-devel] [PATCH v4 for-4.0 2/7] vhost-user: Support transferring inflight buffer between qemu and backend elohimes
2019-01-14 22:25   ` Michael S. Tsirkin
2019-01-15  6:46     ` Yongji Xie
2019-01-15 12:54       ` Michael S. Tsirkin
2019-01-15 14:18         ` Yongji Xie
2019-01-18  2:45           ` Yongji Xie
2019-01-09 11:27 ` [Qemu-devel] [PATCH v4 for-4.0 3/7] libvhost-user: Introduce vu_queue_map_desc() elohimes
2019-01-09 11:27 ` [Qemu-devel] [PATCH v4 for-4.0 4/7] libvhost-user: Support tracking inflight I/O in shared memory elohimes
2019-01-11  3:56   ` Jason Wang
2019-01-11  6:10     ` Yongji Xie [this message]
2019-01-15  7:52       ` Jason Wang
2019-01-15 14:51         ` Yongji Xie
2019-01-17  9:57           ` Jason Wang
2019-01-17 14:59             ` Michael S. Tsirkin
2019-01-18  3:57               ` Jason Wang
2019-01-18  3:32             ` Yongji Xie
2019-01-18  3:56               ` Michael S. Tsirkin
2019-01-18  3:59               ` Jason Wang
2019-01-18  4:03                 ` Michael S. Tsirkin
2019-01-18  7:01                 ` Yongji Xie
2019-01-18  9:26                   ` Jason Wang
2019-01-19 12:19                     ` Yongji Xie
2019-01-15 15:58         ` Michael S. Tsirkin
2019-01-17 10:01           ` Jason Wang
2019-01-17 15:04             ` Michael S. Tsirkin
2019-01-09 11:27 ` [Qemu-devel] [PATCH v4 for-4.0 5/7] vhost-user-blk: Add support to get/set inflight buffer elohimes
2019-01-09 11:27 ` [Qemu-devel] [PATCH v4 for-4.0 6/7] vhost-user-blk: Add support to reconnect backend elohimes
2019-01-09 11:27 ` [Qemu-devel] [PATCH v4 for-4.0 7/7] contrib/vhost-user-blk: enable inflight I/O tracking elohimes
2019-01-10 10:25 ` [Qemu-devel] [PATCH v4 for-4.0 0/7] vhost-user-blk: Add support for backend reconnecting Stefan Hajnoczi
2019-01-10 10:59   ` Yongji Xie
2019-01-11 15:53     ` Stefan Hajnoczi
2019-01-11 17:24       ` Michael S. Tsirkin
2019-01-12  4:50       ` Yongji Xie
2019-01-14 10:22         ` Stefan Hajnoczi
2019-01-14 10:55           ` Yongji Xie
2019-01-16 14:28             ` Stefan Hajnoczi
2019-01-10 10:39 ` Marc-André Lureau
2019-01-10 11:09   ` Yongji Xie

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=CAONzpcY6pdSz65Tzj_Ws0M4vskUOXMuaxQ-uYz6NnCvdMgkFOA@mail.gmail.com \
    --to=elohimes@gmail.com \
    --cc=berrange@redhat.com \
    --cc=chaiwen@baidu.com \
    --cc=jasowang@redhat.com \
    --cc=lilin24@baidu.com \
    --cc=marcandre.lureau@redhat.com \
    --cc=maxime.coquelin@redhat.com \
    --cc=mst@redhat.com \
    --cc=nixun@baidu.com \
    --cc=qemu-devel@nongnu.org \
    --cc=wrfsh@yandex-team.ru \
    --cc=xieyongji@baidu.com \
    --cc=yury-kotov@yandex-team.ru \
    --cc=zhangyu31@baidu.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.