All of lore.kernel.org
 help / color / mirror / Atom feed
From: "Michael S. Tsirkin" <mst@redhat.com>
To: Victor Kaplansky <victork@redhat.com>
Cc: Marc-Andre Lureau <mlureau@redhat.com>,
	qemu-devel@nongnu.org, peterx@redhat.com
Subject: Re: [Qemu-devel] [PATCH v2] tests/vhost-user-bridge: implement logging of dirty pages
Date: Thu, 12 Nov 2015 16:38:51 +0200	[thread overview]
Message-ID: <20151112162743-mutt-send-email-mst@redhat.com> (raw)
In-Reply-To: <1447331679-6270-1-git-send-email-victork@redhat.com>

On Thu, Nov 12, 2015 at 02:34:56PM +0200, Victor Kaplansky wrote:
> During migration devices continue writing to the guest's memory.
> The writes has to be reported to QEMU. This change implements
> minimal support in vhost-user-bridge required for successful
> migration of a guest with virtio-net device.
> 
> Signed-off-by: Victor Kaplansky <victork@redhat.com>
> ---
> v2:
>    - use log_guest_addr for used ring reported by qemu instead of
>      translating.
>    - use mmap_size and mmap_offset defined in new
>      VHOST_USER_SET_LOG_BASE interface. See the patch
>      "vhost-user: modify SET_LOG_BASE to pass mmap size and
>      offset".
>    - start logging dirty pages only after the appropriate feature
>      is set by a VHOST_USER_GET_PROTOCOL_FEATURES request.
>    - updated TODO list.
> 
>  tests/vhost-user-bridge.c | 169 ++++++++++++++++++++++++++++++++++++++++++----
>  1 file changed, 155 insertions(+), 14 deletions(-)
> 
> diff --git a/tests/vhost-user-bridge.c b/tests/vhost-user-bridge.c
> index fa18ad5..8c1c997 100644
> --- a/tests/vhost-user-bridge.c
> +++ b/tests/vhost-user-bridge.c
> @@ -13,16 +13,22 @@
>  /*
>   * TODO:
>   *     - main should get parameters from the command line.
> - *     - implement all request handlers.
> + *     - implement all request handlers. Still not implemented:
> + *          vubr_set_protocol_features_exec()
> + *          vubr_get_queue_num_exec()
> + *          vubr_set_vring_enable_exec()
> + *          vubr_send_rarp_exec()
> + *          vubr_set_log_fd_exec()
>   *     - test for broken requests and virtqueue.
>   *     - implement features defined by Virtio 1.0 spec.
>   *     - support mergeable buffers and indirect descriptors.
> - *     - implement RESET_DEVICE request.
>   *     - implement clean shutdown.
>   *     - implement non-blocking writes to UDP backend.
>   *     - implement polling strategy.
>   */
>  
> +#define _FILE_OFFSET_BITS 64
> +
>  #include <stddef.h>
>  #include <assert.h>
>  #include <stdio.h>
> @@ -166,6 +172,7 @@ typedef struct VubrVirtq {
>      struct vring_desc *desc;
>      struct vring_avail *avail;
>      struct vring_used *used;
> +    uint64_t log_guest_addr;
>  } VubrVirtq;
>  
>  /* Based on qemu/hw/virtio/vhost-user.c */
> @@ -173,6 +180,9 @@ typedef struct VubrVirtq {
>  #define VHOST_MEMORY_MAX_NREGIONS    8
>  #define VHOST_USER_F_PROTOCOL_FEATURES 30
>  
> +typedef uint8_t vhost_log_chunk_t;

Most code just uses uint8_t directly - I think you should
just drop this typedef.

> +#define VHOST_LOG_PAGE 4096
> +
>  enum VhostUserProtocolFeature {
>      VHOST_USER_PROTOCOL_F_MQ = 0,
>      VHOST_USER_PROTOCOL_F_LOG_SHMFD = 1,
> @@ -220,6 +230,11 @@ typedef struct VhostUserMemory {
>      VhostUserMemoryRegion regions[VHOST_MEMORY_MAX_NREGIONS];
>  } VhostUserMemory;
>  
> +typedef struct VhostUserLog {
> +    uint64_t mmap_size;
> +    uint64_t mmap_offset;
> +} VhostUserLog;
> +
>  typedef struct VhostUserMsg {
>      VhostUserRequest request;
>  
> @@ -234,6 +249,7 @@ typedef struct VhostUserMsg {
>          struct vhost_vring_state state;
>          struct vhost_vring_addr addr;
>          VhostUserMemory memory;
> +        VhostUserLog log;
>      } payload;
>      int fds[VHOST_MEMORY_MAX_NREGIONS];
>      int fd_num;
> @@ -265,8 +281,13 @@ typedef struct VubrDev {
>      uint32_t nregions;
>      VubrDevRegion regions[VHOST_MEMORY_MAX_NREGIONS];
>      VubrVirtq vq[MAX_NR_VIRTQUEUE];
> +    int log_call_fd;

This doesn't seem to be used. Pls add code to signal this
after logging.

> +    uint64_t log_size;
> +    vhost_log_chunk_t *log_table;
>      int backend_udp_sock;
>      struct sockaddr_in backend_udp_dest;
> +    int ready;
> +    uint64_t features;
>  } VubrDev;
>  
>  static const char *vubr_request_str[] = {
> @@ -368,7 +389,12 @@ vubr_message_read(int conn_fd, VhostUserMsg *vmsg)
>  
>      rc = recvmsg(conn_fd, &msg, 0);
>  
> -    if (rc <= 0) {
> +    if (rc == 0) {
> +        vubr_die("recvmsg");
> +        fprintf(stderr, "Peer disconnected.\n");
> +        exit(1);
> +    }
> +    if (rc < 0) {
>          vubr_die("recvmsg");
>      }
>  
> @@ -395,7 +421,12 @@ vubr_message_read(int conn_fd, VhostUserMsg *vmsg)
>  
>      if (vmsg->size) {
>          rc = read(conn_fd, &vmsg->payload, vmsg->size);
> -        if (rc <= 0) {
> +        if (rc == 0) {
> +            vubr_die("recvmsg");
> +            fprintf(stderr, "Peer disconnected.\n");
> +            exit(1);
> +        }
> +        if (rc < 0) {
>              vubr_die("recvmsg");
>          }
>  
> @@ -465,12 +496,39 @@ vubr_virtqueue_kick(VubrVirtq *vq)
>      }
>  }
>  
> +
> +static void
> +vubr_log_page(uint8_t *log_table, uint64_t page)
> +{
> +    DPRINT("Logged dirty guest page: %"PRId64"\n", page);
> +    log_table[page / 8] |= 1 << (page % 8);
> +}
> +

This is only safe if there's a single writer.
Please add a comment that says as much.
Or set this atomically, that's also not hard to do.

> +static void
> +vubr_log_write(VubrDev *dev, uint64_t address, uint64_t length)
> +{
> +    uint64_t page;
> +
> +    if (!(dev->features & VHOST_F_LOG_ALL) || !dev->log_table || !length) {
> +        return;
> +    }
> +
> +    assert(dev->log_size >= ((address + length) / VHOST_LOG_PAGE / 8));

I think there's an off by 1 here.
Imagine size == 0, test should always fail.
But imagine address == 0 and length == 1.
You get >= and test passes, seems wrong.

> +
> +    page = address / VHOST_LOG_PAGE;
> +    while (page * VHOST_LOG_PAGE < address + length) {
> +        vubr_log_page(dev->log_table, page);
> +        page += VHOST_LOG_PAGE;
> +    }
> +}
> +
>  static void
>  vubr_post_buffer(VubrDev *dev, VubrVirtq *vq, uint8_t *buf, int32_t len)
>  {
>      struct vring_desc *desc   = vq->desc;
>      struct vring_avail *avail = vq->avail;
>      struct vring_used *used   = vq->used;
> +    uint64_t log_guest_addr = vq->log_guest_addr;
>  
>      unsigned int size = vq->size;
>  
> @@ -510,6 +568,7 @@ vubr_post_buffer(VubrDev *dev, VubrVirtq *vq, uint8_t *buf, int32_t len)
>  
>      if (len <= chunk_len) {
>          memcpy(chunk_start, buf, len);
> +        vubr_log_write(dev, desc[i].addr, len);
>      } else {
>          fprintf(stderr,
>                  "Received too long packet from the backend. Dropping...\n");
> @@ -519,11 +578,17 @@ vubr_post_buffer(VubrDev *dev, VubrVirtq *vq, uint8_t *buf, int32_t len)
>      /* Add descriptor to the used ring. */
>      used->ring[u_index].id = d_index;
>      used->ring[u_index].len = len;
> +    vubr_log_write(dev,
> +                   log_guest_addr + offsetof(struct vring_used, ring[u_index]),
> +                   sizeof(used->ring[u_index]));
>  
>      vq->last_avail_index++;
>      vq->last_used_index++;
>  
>      atomic_mb_set(&used->idx, vq->last_used_index);
> +    vubr_log_write(dev,
> +                   log_guest_addr + offsetof(struct vring_used, idx),
> +                   sizeof(used->idx));
>  
>      /* Kick the guest if necessary. */
>      vubr_virtqueue_kick(vq);
> @@ -535,6 +600,7 @@ vubr_process_desc(VubrDev *dev, VubrVirtq *vq)
>      struct vring_desc *desc   = vq->desc;
>      struct vring_avail *avail = vq->avail;
>      struct vring_used *used   = vq->used;
> +    uint64_t log_guest_addr = vq->log_guest_addr;
>  
>      unsigned int size = vq->size;
>  
> @@ -552,6 +618,8 @@ vubr_process_desc(VubrDev *dev, VubrVirtq *vq)
>          void *chunk_start = (void *)gpa_to_va(dev, desc[i].addr);
>          uint32_t chunk_len = desc[i].len;
>  
> +        assert(!(desc[i].flags & VRING_DESC_F_WRITE));
> +
>          if (len + chunk_len < buf_size) {
>              memcpy(buf + len, chunk_start, chunk_len);
>              DPRINT("%d ", chunk_len);
> @@ -577,6 +645,9 @@ vubr_process_desc(VubrDev *dev, VubrVirtq *vq)
>      /* Add descriptor to the used ring. */
>      used->ring[u_index].id = d_index;
>      used->ring[u_index].len = len;
> +    vubr_log_write(dev,
> +                   log_guest_addr + offsetof(struct vring_used, ring[u_index]),
> +                   sizeof(used->ring[u_index]));
>  
>      vubr_consume_raw_packet(dev, buf, len);
>  
> @@ -588,6 +659,7 @@ vubr_process_avail(VubrDev *dev, VubrVirtq *vq)
>  {
>      struct vring_avail *avail = vq->avail;
>      struct vring_used *used = vq->used;
> +    uint64_t log_guest_addr = vq->log_guest_addr;
>  
>      while (vq->last_avail_index != atomic_mb_read(&avail->idx)) {
>          vubr_process_desc(dev, vq);
> @@ -596,6 +668,9 @@ vubr_process_avail(VubrDev *dev, VubrVirtq *vq)
>      }
>  
>      atomic_mb_set(&used->idx, vq->last_used_index);
> +    vubr_log_write(dev,
> +                   log_guest_addr + offsetof(struct vring_used, idx),
> +                   sizeof(used->idx));
>  }
>  
>  static void
> @@ -609,6 +684,10 @@ vubr_backend_recv_cb(int sock, void *ctx)
>      int buflen = sizeof(buf);
>      int len;
>  
> +    if (!dev->ready) {
> +        return;
> +    }
> +
>      DPRINT("\n\n   ***   IN UDP RECEIVE CALLBACK    ***\n\n");
>  
>      uint16_t avail_index = atomic_mb_read(&rx_vq->avail->idx);
> @@ -656,9 +735,9 @@ vubr_get_features_exec(VubrDev *dev, VhostUserMsg *vmsg)
>  {
>      vmsg->payload.u64 =
>              ((1ULL << VIRTIO_NET_F_MRG_RXBUF) |
> -             (1ULL << VIRTIO_NET_F_CTRL_VQ) |
> -             (1ULL << VIRTIO_NET_F_CTRL_RX) |
> -             (1ULL << VHOST_F_LOG_ALL));
> +             (1ULL << VHOST_F_LOG_ALL) |
> +             (1ULL << VHOST_USER_F_PROTOCOL_FEATURES));
> +
>      vmsg->size = sizeof(vmsg->payload.u64);
>  
>      DPRINT("Sending back to guest u64: 0x%016"PRIx64"\n", vmsg->payload.u64);
> @@ -671,6 +750,7 @@ static int
>  vubr_set_features_exec(VubrDev *dev, VhostUserMsg *vmsg)
>  {
>      DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64);
> +    dev->features = vmsg->payload.u64;
>      return 0;
>  }
>  
> @@ -680,10 +760,28 @@ vubr_set_owner_exec(VubrDev *dev, VhostUserMsg *vmsg)
>      return 0;
>  }
>  
> +static void
> +vubr_close_log(VubrDev *dev)
> +{
> +    if (dev->log_table) {
> +        if (munmap(dev->log_table, dev->log_size) != 0) {
> +            vubr_die("munmap()");
> +        }
> +
> +        dev->log_table = 0;
> +    }
> +    if (dev->log_call_fd != -1) {
> +        close(dev->log_call_fd);
> +        dev->log_call_fd = -1;
> +    }
> +}
> +
>  static int
>  vubr_reset_device_exec(VubrDev *dev, VhostUserMsg *vmsg)
>  {
> -    DPRINT("Function %s() not implemented yet.\n", __func__);
> +    vubr_close_log(dev);
> +    dev->ready = 0;
> +    dev->features = 0;
>      return 0;
>  }
>  
> @@ -736,8 +834,30 @@ vubr_set_mem_table_exec(VubrDev *dev, VhostUserMsg *vmsg)
>  static int
>  vubr_set_log_base_exec(VubrDev *dev, VhostUserMsg *vmsg)
>  {
> -    DPRINT("Function %s() not implemented yet.\n", __func__);
> -    return 0;
> +    int fd;
> +    uint64_t log_mmap_size, log_mmap_offset;
> +    void *rc;
> +
> +    assert(vmsg->fd_num == 1);
> +    fd = vmsg->fds[0];
> +
> +    assert(vmsg->size == sizeof(vmsg->payload.log));
> +    log_mmap_offset = vmsg->payload.log.mmap_offset;
> +    log_mmap_size   = vmsg->payload.log.mmap_size;
> +    DPRINT("Log mmap_offset: %"PRId64"\n", log_mmap_offset);
> +    DPRINT("Log mmap_size:   %"PRId64"\n", log_mmap_size);
> +
> +    rc = mmap(0, log_mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd,
> +              log_mmap_offset);
> +    if (rc == MAP_FAILED) {
> +        vubr_die("mmap");
> +    }
> +    dev->log_table = (vhost_log_chunk_t *) rc;

Cast is not needed here.

> +    dev->log_size = log_mmap_size;
> +
> +    vmsg->size = sizeof(vmsg->payload.u64);
> +    /* Reply */
> +    return 1;
>  }
>  
>  static int
> @@ -777,6 +897,7 @@ vubr_set_vring_addr_exec(VubrDev *dev, VhostUserMsg *vmsg)
>      vq->desc = (struct vring_desc *)qva_to_va(dev, vra->desc_user_addr);
>      vq->used = (struct vring_used *)qva_to_va(dev, vra->used_user_addr);
>      vq->avail = (struct vring_avail *)qva_to_va(dev, vra->avail_user_addr);
> +    vq->log_guest_addr = vra->log_guest_addr;
>  
>      DPRINT("Setting virtq addresses:\n");
>      DPRINT("    vring_desc  at %p\n", vq->desc);
> @@ -803,8 +924,14 @@ vubr_set_vring_base_exec(VubrDev *dev, VhostUserMsg *vmsg)
>  static int
>  vubr_get_vring_base_exec(VubrDev *dev, VhostUserMsg *vmsg)
>  {
> -    DPRINT("Function %s() not implemented yet.\n", __func__);
> -    return 0;
> +    unsigned int index = vmsg->payload.state.index;
> +
> +    DPRINT("State.index: %d\n", index);
> +    vmsg->payload.state.num = dev->vq[index].last_avail_index;
> +    vmsg->size = sizeof(vmsg->payload.state);
> +
> +    /* reply */

But Reply above. Pls keep it consistent.

> +    return 1;
>  }
>  
>  static int
> @@ -829,6 +956,10 @@ vubr_set_vring_kick_exec(VubrDev *dev, VhostUserMsg *vmsg)
>          DPRINT("Waiting for kicks on fd: %d for vq: %d\n",
>                 dev->vq[index].kick_fd, index);
>      }
> +    if (dev->vq[0].kick_fd != -1 &&
> +        dev->vq[1].kick_fd != -1) {
> +        dev->ready = 1;

I'm not sure what this does. Related to logging somehow?
Anyway, processing a VQ should happen after either
- you received a kick
or
- you received VRING_ENABLE

> +    }
>      return 0;
>  }
>  
> @@ -858,9 +989,12 @@ vubr_set_vring_err_exec(VubrDev *dev, VhostUserMsg *vmsg)
>  static int
>  vubr_get_protocol_features_exec(VubrDev *dev, VhostUserMsg *vmsg)
>  {
> -    /* FIXME: unimplented */
> +    vmsg->payload.u64 = 1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD;
>      DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64);
> -    return 0;
> +    vmsg->size = sizeof(vmsg->payload.u64);
> +
> +    /* Reply */
> +    return 1;
>  }
>  
>  static int
> @@ -1012,6 +1146,13 @@ vubr_new(const char *path)
>          };
>      }
>  
> +    /* Init log */
> +    dev->log_call_fd = -1;
> +    dev->log_size    = 0;
> +    dev->log_table   = 0;

Pls just put a single space there. Don't align things,
it's hard to keep consistency when one does.

> +    dev->ready = 0;
> +    dev->features = 0;
> +
>      /* Get a UNIX socket. */
>      dev->sock = socket(AF_UNIX, SOCK_STREAM, 0);
>      if (dev->sock == -1) {
> -- 
> --Victor

  reply	other threads:[~2015-11-12 14:39 UTC|newest]

Thread overview: 3+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2015-11-12 12:34 [Qemu-devel] [PATCH v2] tests/vhost-user-bridge: implement logging of dirty pages Victor Kaplansky
2015-11-12 14:38 ` Michael S. Tsirkin [this message]
2015-11-12 17:39   ` Victor Kaplansky

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20151112162743-mutt-send-email-mst@redhat.com \
    --to=mst@redhat.com \
    --cc=mlureau@redhat.com \
    --cc=peterx@redhat.com \
    --cc=qemu-devel@nongnu.org \
    --cc=victork@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.