From: Jason Wang <jasowang@redhat.com>
To: "Eugenio Pérez" <eperezma@redhat.com>, qemu-devel@nongnu.org
Cc: "Michael S. Tsirkin" <mst@redhat.com>,
	virtualization@lists.linux-foundation.org,
	Eli Cohen <eli@mellanox.com>, Eric Blake <eblake@redhat.com>,
	Parav Pandit <parav@mellanox.com>, Cindy Lu <lulu@redhat.com>,
	"Fangyi \(Eric\)" <eric.fangyi@huawei.com>,
	Markus Armbruster <armbru@redhat.com>,
	yebiaoxiang@huawei.com, Liuxiangdong <liuxiangdong5@huawei.com>,
	Laurent Vivier <lvivier@redhat.com>,
	Eduardo Habkost <ehabkost@redhat.com>,
	Richard Henderson <richard.henderson@linaro.org>,
	Gautam Dawar <gdawar@xilinx.com>,
	Xiao W Wang <xiao.w.wang@intel.com>,
	Stefan Hajnoczi <stefanha@redhat.com>,
	Harpreet Singh Anand <hanand@xilinx.com>,
	Paolo Bonzini <pbonzini@redhat.com>,
	Lingshan <lingshan.zhu@intel.com>
Subject: Re: [PATCH v2 10/14] vdpa: Add custom IOTLB translations to SVQ
Date: Mon, 28 Feb 2022 15:36:58 +0800
Message-ID: <af6d14b3-6bdf-8717-90b5-bc685896cc8a@redhat.com>
In-Reply-To: <20220227134111.3254066-11-eperezma@redhat.com>


On 2/27/2022 9:41 PM, Eugenio Pérez wrote:
> Use translations added in VhostIOVATree in SVQ.
>
> Only introduce usage here, not allocation and deallocation. As with
> previous patches, we use the dead code paths of shadow_vqs_enabled to
> avoid committing too many changes at once. These are impossible to take
> at the moment.
>
> Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
> ---
>   hw/virtio/vhost-shadow-virtqueue.h |   6 +-
>   include/hw/virtio/vhost-vdpa.h     |   3 +
>   hw/virtio/vhost-shadow-virtqueue.c |  76 ++++++++++++++++-
>   hw/virtio/vhost-vdpa.c             | 128 ++++++++++++++++++++++++-----
>   4 files changed, 187 insertions(+), 26 deletions(-)
>
> diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
> index 04c67685fd..b2f722d101 100644
> --- a/hw/virtio/vhost-shadow-virtqueue.h
> +++ b/hw/virtio/vhost-shadow-virtqueue.h
> @@ -13,6 +13,7 @@
>   #include "qemu/event_notifier.h"
>   #include "hw/virtio/virtio.h"
>   #include "standard-headers/linux/vhost_types.h"
> +#include "hw/virtio/vhost-iova-tree.h"
>   
>   /* Shadow virtqueue to relay notifications */
>   typedef struct VhostShadowVirtqueue {
> @@ -43,6 +44,9 @@ typedef struct VhostShadowVirtqueue {
>       /* Virtio device */
>       VirtIODevice *vdev;
>   
> +    /* IOVA mapping */
> +    VhostIOVATree *iova_tree;
> +
>       /* Map for use the guest's descriptors */
>       VirtQueueElement **ring_id_maps;
>   
> @@ -78,7 +82,7 @@ void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev,
>                        VirtQueue *vq);
>   void vhost_svq_stop(VhostShadowVirtqueue *svq);
>   
> -VhostShadowVirtqueue *vhost_svq_new(void);
> +VhostShadowVirtqueue *vhost_svq_new(VhostIOVATree *iova_tree);
>   
>   void vhost_svq_free(gpointer vq);
>   G_DEFINE_AUTOPTR_CLEANUP_FUNC(VhostShadowVirtqueue, vhost_svq_free);
> diff --git a/include/hw/virtio/vhost-vdpa.h b/include/hw/virtio/vhost-vdpa.h
> index 009a9f3b6b..ee8e939ad0 100644
> --- a/include/hw/virtio/vhost-vdpa.h
> +++ b/include/hw/virtio/vhost-vdpa.h
> @@ -14,6 +14,7 @@
>   
>   #include <gmodule.h>
>   
> +#include "hw/virtio/vhost-iova-tree.h"
>   #include "hw/virtio/virtio.h"
>   #include "standard-headers/linux/vhost_types.h"
>   
> @@ -30,6 +31,8 @@ typedef struct vhost_vdpa {
>       MemoryListener listener;
>       struct vhost_vdpa_iova_range iova_range;
>       bool shadow_vqs_enabled;
> +    /* IOVA mapping used by the Shadow Virtqueue */
> +    VhostIOVATree *iova_tree;
>       GPtrArray *shadow_vqs;
>       struct vhost_dev *dev;
>       VhostVDPAHostNotifier notifier[VIRTIO_QUEUE_MAX];
> diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
> index a38d313755..7e073773d1 100644
> --- a/hw/virtio/vhost-shadow-virtqueue.c
> +++ b/hw/virtio/vhost-shadow-virtqueue.c
> @@ -11,6 +11,7 @@
>   #include "hw/virtio/vhost-shadow-virtqueue.h"
>   
>   #include "qemu/error-report.h"
> +#include "qemu/log.h"
>   #include "qemu/main-loop.h"
>   #include "qemu/log.h"
>   #include "linux-headers/linux/vhost.h"
> @@ -84,7 +85,58 @@ static void vhost_svq_set_notification(VhostShadowVirtqueue *svq, bool enable)
>       }
>   }
>   
> +/**
> + * Translate addresses between the qemu's virtual address and the SVQ IOVA
> + *
> + * @svq    Shadow VirtQueue
> + * @vaddr  Translated IOVA addresses
> + * @iovec  Source qemu's VA addresses
> + * @num    Length of iovec and minimum length of vaddr
> + */
> +static bool vhost_svq_translate_addr(const VhostShadowVirtqueue *svq,
> +                                     void **addrs, const struct iovec *iovec,
> +                                     size_t num)
> +{
> +    if (num == 0) {
> +        return true;
> +    }
> +
> +    for (size_t i = 0; i < num; ++i) {
> +        DMAMap needle = {
> +            .translated_addr = (hwaddr)iovec[i].iov_base,
> +            .size = iovec[i].iov_len,
> +        };
> +        size_t off;
> +
> +        const DMAMap *map = vhost_iova_tree_find_iova(svq->iova_tree, &needle);
> +        /*
> +         * Map cannot be NULL since iova map contains all guest space and
> +         * qemu already has a physical address mapped
> +         */
> +        if (unlikely(!map)) {
> +            qemu_log_mask(LOG_GUEST_ERROR,
> +                          "Invalid address 0x%"HWADDR_PRIx" given by guest",
> +                          needle.translated_addr);
> +            return false;
> +        }
> +
> +        off = needle.translated_addr - map->translated_addr;
> +        addrs[i] = (void *)(map->iova + off);
> +
> +        if (unlikely(int128_gt(int128_add(needle.translated_addr,
> +                                          iovec[i].iov_len),
> +                               map->translated_addr + map->size))) {
> +            qemu_log_mask(LOG_GUEST_ERROR,
> +                          "Guest buffer expands over iova range");
> +            return false;
> +        }
> +    }
> +
> +    return true;
> +}
> +
>   static void vhost_vring_write_descs(VhostShadowVirtqueue *svq,
> +                                    void * const *vaddr_sg,


Nit: it looks to me that we are passing IOVAs here rather than vaddrs, so
it might be better to name the parameter "sg"?
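
Something like the following (untested, just to illustrate the naming; the
body stays the same):

    static void vhost_vring_write_descs(VhostShadowVirtqueue *svq,
                                        void * const *sg, /* SVQ IOVAs */
                                        const struct iovec *iovec,
                                        size_t num, bool more_descs,
                                        bool write)

and correspondingly:

        /* sg[n] is the IOVA the device sees, not a host vaddr */
        descs[i].addr = cpu_to_le64((hwaddr)sg[n]);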


>                                       const struct iovec *iovec,
>                                       size_t num, bool more_descs, bool write)
>   {
> @@ -103,7 +155,7 @@ static void vhost_vring_write_descs(VhostShadowVirtqueue *svq,
>           } else {
>               descs[i].flags = flags;
>           }
> -        descs[i].addr = cpu_to_le64((hwaddr)iovec[n].iov_base);
> +        descs[i].addr = cpu_to_le64((hwaddr)vaddr_sg[n]);
>           descs[i].len = cpu_to_le32(iovec[n].iov_len);
>   
>           last = i;
> @@ -119,6 +171,8 @@ static bool vhost_svq_add_split(VhostShadowVirtqueue *svq,
>   {
>       unsigned avail_idx;
>       vring_avail_t *avail = svq->vring.avail;
> +    bool ok;
> +    g_autofree void **sgs = g_new(void *, MAX(elem->out_num, elem->in_num));
>   
>       *head = svq->free_head;
>   
> @@ -129,9 +183,20 @@ static bool vhost_svq_add_split(VhostShadowVirtqueue *svq,
>           return false;
>       }
>   
> -    vhost_vring_write_descs(svq, elem->out_sg, elem->out_num,
> +    ok = vhost_svq_translate_addr(svq, sgs, elem->out_sg, elem->out_num);
> +    if (unlikely(!ok)) {
> +        return false;
> +    }
> +    vhost_vring_write_descs(svq, sgs, elem->out_sg, elem->out_num,
>                               elem->in_num > 0, false);
> -    vhost_vring_write_descs(svq, elem->in_sg, elem->in_num, false, true);
> +
> +
> +    ok = vhost_svq_translate_addr(svq, sgs, elem->in_sg, elem->in_num);
> +    if (unlikely(!ok)) {
> +        return false;
> +    }
> +
> +    vhost_vring_write_descs(svq, sgs, elem->in_sg, elem->in_num, false, true);
>   
>       /*
>        * Put the entry in the available array (but don't update avail->idx until
> @@ -514,11 +579,13 @@ void vhost_svq_stop(VhostShadowVirtqueue *svq)
>    * Creates vhost shadow virtqueue, and instructs the vhost device to use the
>    * shadow methods and file descriptors.
>    *
> + * @iova_tree Tree to perform descriptors translations
> + *
>    * Returns the new virtqueue or NULL.
>    *
>    * In case of error, reason is reported through error_report.
>    */
> -VhostShadowVirtqueue *vhost_svq_new(void)
> +VhostShadowVirtqueue *vhost_svq_new(VhostIOVATree *iova_tree)
>   {
>       g_autofree VhostShadowVirtqueue *svq = g_new0(VhostShadowVirtqueue, 1);
>       int r;
> @@ -539,6 +606,7 @@ VhostShadowVirtqueue *vhost_svq_new(void)
>   
>       event_notifier_init_fd(&svq->svq_kick, VHOST_FILE_UNBIND);
>       event_notifier_set_handler(&svq->hdev_call, vhost_svq_handle_call);
> +    svq->iova_tree = iova_tree;
>       return g_steal_pointer(&svq);
>   
>   err_init_hdev_call:
> diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
> index 435b9c2e9e..56f9f125cd 100644
> --- a/hw/virtio/vhost-vdpa.c
> +++ b/hw/virtio/vhost-vdpa.c
> @@ -209,6 +209,21 @@ static void vhost_vdpa_listener_region_add(MemoryListener *listener,
>                                            vaddr, section->readonly);
>   
>       llsize = int128_sub(llend, int128_make64(iova));
> +    if (v->shadow_vqs_enabled) {
> +        DMAMap mem_region = {
> +            .translated_addr = (hwaddr)vaddr,
> +            .size = int128_get64(llsize) - 1,
> +            .perm = IOMMU_ACCESS_FLAG(true, section->readonly),
> +        };
> +
> +        int r = vhost_iova_tree_map_alloc(v->iova_tree, &mem_region);
> +        if (unlikely(r != IOVA_OK)) {
> +            error_report("Can't allocate a mapping (%d)", r);
> +            goto fail;
> +        }
> +
> +        iova = mem_region.iova;
> +    }
>   
>       vhost_vdpa_iotlb_batch_begin_once(v);
>       ret = vhost_vdpa_dma_map(v, iova, int128_get64(llsize),
> @@ -261,6 +276,20 @@ static void vhost_vdpa_listener_region_del(MemoryListener *listener,
>   
>       llsize = int128_sub(llend, int128_make64(iova));
>   
> +    if (v->shadow_vqs_enabled) {
> +        const DMAMap *result;
> +        const void *vaddr = memory_region_get_ram_ptr(section->mr) +
> +            section->offset_within_region +
> +            (iova - section->offset_within_address_space);
> +        DMAMap mem_region = {
> +            .translated_addr = (hwaddr)vaddr,
> +            .size = int128_get64(llsize) - 1,
> +        };
> +
> +        result = vhost_iova_tree_find_iova(v->iova_tree, &mem_region);
> +        iova = result->iova;
> +        vhost_iova_tree_remove(v->iova_tree, &mem_region);
> +    }
>       vhost_vdpa_iotlb_batch_begin_once(v);
>       ret = vhost_vdpa_dma_unmap(v, iova, int128_get64(llsize));
>       if (ret) {
> @@ -383,7 +412,7 @@ static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v,
>   
>       shadow_vqs = g_ptr_array_new_full(hdev->nvqs, vhost_svq_free);
>       for (unsigned n = 0; n < hdev->nvqs; ++n) {
> -        g_autoptr(VhostShadowVirtqueue) svq = vhost_svq_new();
> +        g_autoptr(VhostShadowVirtqueue) svq = vhost_svq_new(v->iova_tree);
>   
>           if (unlikely(!svq)) {
>               error_setg(errp, "Cannot create svq %u", n);
> @@ -834,37 +863,78 @@ static int vhost_vdpa_svq_set_fds(struct vhost_dev *dev,
>   /**
>    * Unmap a SVQ area in the device
>    */
> -static bool vhost_vdpa_svq_unmap_ring(struct vhost_vdpa *v, hwaddr iova,
> -                                      hwaddr size)
> +static bool vhost_vdpa_svq_unmap_ring(struct vhost_vdpa *v,
> +                                      const DMAMap *needle)
>   {
> +    const DMAMap *result = vhost_iova_tree_find_iova(v->iova_tree, needle);
> +    hwaddr size;
>       int r;
>   
> -    size = ROUND_UP(size, qemu_real_host_page_size);
> -    r = vhost_vdpa_dma_unmap(v, iova, size);
> +    if (unlikely(!result)) {
> +        error_report("Unable to find SVQ address to unmap");
> +        return false;
> +    }
> +
> +    size = ROUND_UP(result->size, qemu_real_host_page_size);
> +    r = vhost_vdpa_dma_unmap(v, result->iova, size);
>       return r == 0;
>   }
>   
>   static bool vhost_vdpa_svq_unmap_rings(struct vhost_dev *dev,
>                                          const VhostShadowVirtqueue *svq)
>   {
> +    DMAMap needle;
>       struct vhost_vdpa *v = dev->opaque;
>       struct vhost_vring_addr svq_addr;
> -    size_t device_size = vhost_svq_device_area_size(svq);
> -    size_t driver_size = vhost_svq_driver_area_size(svq);
>       bool ok;
>   
>       vhost_svq_get_vring_addr(svq, &svq_addr);
>   
> -    ok = vhost_vdpa_svq_unmap_ring(v, svq_addr.desc_user_addr, driver_size);
> +    needle = (DMAMap) {
> +        .translated_addr = svq_addr.desc_user_addr,
> +    };


Let's simply zero-initialize needle at the start of this function; then
we can just assign needle.translated_addr = XXX here.
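
I.e. something like this (untested sketch):

    static bool vhost_vdpa_svq_unmap_rings(struct vhost_dev *dev,
                                           const VhostShadowVirtqueue *svq)
    {
        DMAMap needle = {};
        struct vhost_vdpa *v = dev->opaque;
        struct vhost_vring_addr svq_addr;
        bool ok;

        vhost_svq_get_vring_addr(svq, &svq_addr);

        needle.translated_addr = svq_addr.desc_user_addr;
        ok = vhost_vdpa_svq_unmap_ring(v, &needle);
        if (unlikely(!ok)) {
            return false;
        }

        needle.translated_addr = svq_addr.used_user_addr;
        return vhost_vdpa_svq_unmap_ring(v, &needle);
    }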


> +    ok = vhost_vdpa_svq_unmap_ring(v, &needle);
>       if (unlikely(!ok)) {
>           return false;
>       }
>   
> -    return vhost_vdpa_svq_unmap_ring(v, svq_addr.used_user_addr, device_size);
> +    needle = (DMAMap) {
> +        .translated_addr = svq_addr.used_user_addr,
> +    };
> +    return vhost_vdpa_svq_unmap_ring(v, &needle);
> +}
> +
> +/**
> + * Map the SVQ area in the device
> + *
> + * @v          Vhost-vdpa device
> + * @needle     The area to search iova
> + * @errorp     Error pointer
> + */
> +static bool vhost_vdpa_svq_map_ring(struct vhost_vdpa *v, DMAMap *needle,
> +                                    Error **errp)
> +{
> +    int r;
> +
> +    r = vhost_iova_tree_map_alloc(v->iova_tree, needle);
> +    if (unlikely(r != IOVA_OK)) {
> +        error_setg(errp, "Cannot allocate iova (%d)", r);
> +        return false;
> +    }
> +
> +    r = vhost_vdpa_dma_map(v, needle->iova, needle->size,
> +                           (void *)needle->translated_addr,
> +                           !(needle->perm & IOMMU_ACCESS_FLAG(0, 1)));


Let's simply use needle->perm == IOMMU_RO here?
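
I.e. (untested):

    r = vhost_vdpa_dma_map(v, needle->iova, needle->size,
                           (void *)needle->translated_addr,
                           needle->perm == IOMMU_RO);

That reads much more directly as "map read-only".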


> +    if (unlikely(r != 0)) {
> +        error_setg_errno(errp, -r, "Cannot map region to device");
> +        vhost_iova_tree_remove(v->iova_tree, needle);
> +    }
> +
> +    return r == 0;
>   }
>   
>   /**
> - * Map shadow virtqueue rings in device
> + * Map the shadow virtqueue rings in the device
>    *
>    * @dev   The vhost device
>    * @svq   The shadow virtqueue
> @@ -876,28 +946,44 @@ static bool vhost_vdpa_svq_map_rings(struct vhost_dev *dev,
>                                        struct vhost_vring_addr *addr,
>                                        Error **errp)
>   {
> +    DMAMap device_region, driver_region;
> +    struct vhost_vring_addr svq_addr;
>       struct vhost_vdpa *v = dev->opaque;
>       size_t device_size = vhost_svq_device_area_size(svq);
>       size_t driver_size = vhost_svq_driver_area_size(svq);
> -    int r;
> +    size_t avail_offset;
> +    bool ok;
>   
>       ERRP_GUARD();
> -    vhost_svq_get_vring_addr(svq, addr);
> +    vhost_svq_get_vring_addr(svq, &svq_addr);
>   
> -    r = vhost_vdpa_dma_map(v, addr->desc_user_addr, driver_size,
> -                           (void *)addr->desc_user_addr, true);
> -    if (unlikely(r != 0)) {
> -        error_setg_errno(errp, -r, "Cannot create vq driver region: ");
> +    driver_region = (DMAMap) {
> +        .translated_addr = svq_addr.desc_user_addr,
> +        .size = driver_size - 1,


Any reason for the "-1" here? I see several places doing things like
this; it's probably a hint of a wrong API somewhere.
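
My guess is that DMAMap.size is being treated as inclusive (the last valid
offset rather than the length). If so, that convention deserves at least a
comment, or better a small helper so callers don't have to remember the
"-1". Untested sketch of such a hypothetical helper:

    /* Hypothetical helper: build a DMAMap from a length in bytes */
    static inline DMAMap dma_map_from_len(hwaddr translated_addr, hwaddr len,
                                          IOMMUAccessFlags perm)
    {
        const DMAMap map = {
            .translated_addr = translated_addr,
            .size = len - 1, /* assuming DMAMap.size is inclusive */
            .perm = perm,
        };
        return map;
    }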

Thanks


> +        .perm = IOMMU_RO,
> +    };
> +    ok = vhost_vdpa_svq_map_ring(v, &driver_region, errp);
> +    if (unlikely(!ok)) {
> +        error_prepend(errp, "Cannot create vq driver region: ");
>           return false;
>       }
> +    addr->desc_user_addr = driver_region.iova;
> +    avail_offset = svq_addr.avail_user_addr - svq_addr.desc_user_addr;
> +    addr->avail_user_addr = driver_region.iova + avail_offset;
>   
> -    r = vhost_vdpa_dma_map(v, addr->used_user_addr, device_size,
> -                           (void *)addr->used_user_addr, false);
> -    if (unlikely(r != 0)) {
> -        error_setg_errno(errp, -r, "Cannot create vq device region: ");
> +    device_region = (DMAMap) {
> +        .translated_addr = svq_addr.used_user_addr,
> +        .size = device_size - 1,
> +        .perm = IOMMU_RW,
> +    };
> +    ok = vhost_vdpa_svq_map_ring(v, &device_region, errp);
> +    if (unlikely(!ok)) {
> +        error_prepend(errp, "Cannot create vq device region: ");
> +        vhost_vdpa_svq_unmap_ring(v, &driver_region);
>       }
> +    addr->used_user_addr = device_region.iova;
>   
> -    return r == 0;
> +    return ok;
>   }
>   
>   static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
