* [Qemu-devel] [RFC 0/3] Supporting programming IOMMU in QEMU (vDPA/vhost-user)
@ 2018-07-23  4:59 Tiwei Bie
  2018-07-23  4:59 ` [Qemu-devel] [RFC 1/3] vfio: split vfio_get_group() into small functions Tiwei Bie
                   ` (2 more replies)
  0 siblings, 3 replies; 23+ messages in thread
From: Tiwei Bie @ 2018-07-23  4:59 UTC (permalink / raw)
  To: mst, alex.williamson, jasowang, qemu-devel; +Cc: tiwei.bie

This patch set introduces a slave message in vhost-user to
allow the slave to share its VFIO group fd with the master,
so that the IOMMU programming for this group can be done in
QEMU based on the virtio device's DMA address space.

Vhost-user backends which support vDPA can leverage this
message to ask the master to do the IOMMU programming in
QEMU for the vDPA device in the backend. This is helpful
for supporting vIOMMU with vDPA.
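
For illustration, a minimal slave-side sketch of how a backend could send
this new message with its VFIO group fd attached as SCM_RIGHTS ancillary
data. The header layout (request/flags/size) follows docs/interop/vhost-user.txt;
the socket fd, struct name and error handling below are assumptions, not
part of this series:

#include <stdint.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>

#define VHOST_USER_SLAVE_VFIO_GROUP_MSG 4      /* from patch 3/3 */
#define VHOST_USER_VERSION              0x1

struct vhost_user_hdr {                        /* assumed layout, see the spec */
    uint32_t request;
    uint32_t flags;
    uint32_t size;                             /* 0: no payload, only the fd */
};

static int send_vfio_group_fd(int slave_sock, int group_fd)
{
    struct vhost_user_hdr hdr = {
        .request = VHOST_USER_SLAVE_VFIO_GROUP_MSG,
        .flags   = VHOST_USER_VERSION,
        .size    = 0,
    };
    union {
        char buf[CMSG_SPACE(sizeof(int))];
        struct cmsghdr align;
    } ctrl;
    struct iovec iov = { .iov_base = &hdr, .iov_len = sizeof(hdr) };
    struct msghdr msg = {
        .msg_iov        = &iov,
        .msg_iovlen     = 1,
        .msg_control    = ctrl.buf,
        .msg_controllen = sizeof(ctrl.buf),
    };
    struct cmsghdr *cmsg;

    memset(&ctrl, 0, sizeof(ctrl));
    cmsg = CMSG_FIRSTHDR(&msg);
    cmsg->cmsg_level = SOL_SOCKET;
    cmsg->cmsg_type  = SCM_RIGHTS;             /* pass the VFIO group fd */
    cmsg->cmsg_len   = CMSG_LEN(sizeof(int));
    memcpy(CMSG_DATA(cmsg), &group_fd, sizeof(int));

    return sendmsg(slave_sock, &msg, 0) < 0 ? -1 : 0;
}

On the QEMU side the fd then shows up in slave_read() and is handed to
vfio_get_group_from_fd(), as patch 3/3 does.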

Tiwei Bie (3):
  vfio: split vfio_get_group() into small functions
  vfio: support getting VFIOGroup from groupfd
  vhost-user: support programming VFIO group in master

 docs/interop/vhost-user.txt    |  16 +++++
 hw/vfio/common.c               | 127 +++++++++++++++++++++++++--------
 hw/virtio/vhost-user.c         |  40 +++++++++++
 include/hw/vfio/vfio-common.h  |   1 +
 include/hw/virtio/vhost-user.h |   2 +
 5 files changed, 158 insertions(+), 28 deletions(-)

-- 
2.18.0


* [Qemu-devel] [RFC 1/3] vfio: split vfio_get_group() into small functions
  2018-07-23  4:59 [Qemu-devel] [RFC 0/3] Supporting programming IOMMU in QEMU (vDPA/vhost-user) Tiwei Bie
@ 2018-07-23  4:59 ` Tiwei Bie
  2018-07-23  4:59 ` [Qemu-devel] [RFC 2/3] vfio: support getting VFIOGroup from groupfd Tiwei Bie
  2018-07-23  4:59 ` [Qemu-devel] [RFC 3/3] vhost-user: support programming VFIO group in master Tiwei Bie
  2 siblings, 0 replies; 23+ messages in thread
From: Tiwei Bie @ 2018-07-23  4:59 UTC (permalink / raw)
  To: mst, alex.williamson, jasowang, qemu-devel; +Cc: tiwei.bie

This patch splits vfio_get_group() into smaller functions,
which will make it easier to implement other vfio_get_group*()
variants in the future.

Signed-off-by: Tiwei Bie <tiwei.bie@intel.com>
---
 hw/vfio/common.c | 83 ++++++++++++++++++++++++++++++++----------------
 1 file changed, 55 insertions(+), 28 deletions(-)

diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index fb396cf00a..52a05532cd 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -1279,37 +1279,20 @@ static void vfio_disconnect_container(VFIOGroup *group)
     }
 }
 
-VFIOGroup *vfio_get_group(int groupid, AddressSpace *as, Error **errp)
+static VFIOGroup *vfio_init_group(int groupfd, int groupid, AddressSpace *as,
+                                  Error **errp)
 {
     VFIOGroup *group;
-    char path[32];
     struct vfio_group_status status = { .argsz = sizeof(status) };
 
-    QLIST_FOREACH(group, &vfio_group_list, next) {
-        if (group->groupid == groupid) {
-            /* Found it.  Now is it already in the right context? */
-            if (group->container->space->as == as) {
-                return group;
-            } else {
-                error_setg(errp, "group %d used in multiple address spaces",
-                           group->groupid);
-                return NULL;
-            }
-        }
-    }
-
     group = g_malloc0(sizeof(*group));
 
-    snprintf(path, sizeof(path), "/dev/vfio/%d", groupid);
-    group->fd = qemu_open(path, O_RDWR);
-    if (group->fd < 0) {
-        error_setg_errno(errp, errno, "failed to open %s", path);
-        goto free_group_exit;
-    }
+    group->fd = groupfd;
+    group->groupid = groupid;
 
     if (ioctl(group->fd, VFIO_GROUP_GET_STATUS, &status)) {
         error_setg_errno(errp, errno, "failed to get group %d status", groupid);
-        goto close_fd_exit;
+        goto free_group_exit;
     }
 
     if (!(status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
@@ -1317,16 +1300,15 @@ VFIOGroup *vfio_get_group(int groupid, AddressSpace *as, Error **errp)
         error_append_hint(errp,
                           "Please ensure all devices within the iommu_group "
                           "are bound to their vfio bus driver.\n");
-        goto close_fd_exit;
+        goto free_group_exit;
     }
 
-    group->groupid = groupid;
     QLIST_INIT(&group->device_list);
 
     if (vfio_connect_container(group, as, errp)) {
         error_prepend(errp, "failed to setup container for group %d: ",
                       groupid);
-        goto close_fd_exit;
+        goto free_group_exit;
     }
 
     if (QLIST_EMPTY(&vfio_group_list)) {
@@ -1337,15 +1319,60 @@ VFIOGroup *vfio_get_group(int groupid, AddressSpace *as, Error **errp)
 
     return group;
 
-close_fd_exit:
-    close(group->fd);
-
 free_group_exit:
     g_free(group);
 
     return NULL;
 }
 
+static VFIOGroup *vfio_find_group(int groupid, AddressSpace *as,
+                                  Error **errp)
+{
+    VFIOGroup *group;
+
+    QLIST_FOREACH(group, &vfio_group_list, next) {
+        if (group->groupid == groupid) {
+            /* Found it.  Now is it already in the right context? */
+            if (group->container->space->as == as) {
+                return group;
+            } else {
+                error_setg(errp, "group %d used in multiple address spaces",
+                           group->groupid);
+                return NULL;
+            }
+        }
+    }
+
+    return NULL;
+}
+
+VFIOGroup *vfio_get_group(int groupid, AddressSpace *as, Error **errp)
+{
+    VFIOGroup *group;
+    char path[32];
+    int groupfd;
+
+    group = vfio_find_group(groupid, as, errp);
+    if (group) {
+        return group;
+    }
+
+    snprintf(path, sizeof(path), "/dev/vfio/%d", groupid);
+    groupfd = qemu_open(path, O_RDWR);
+    if (groupfd < 0) {
+        error_setg_errno(errp, errno, "failed to open %s", path);
+        return NULL;
+    }
+
+    group = vfio_init_group(groupfd, groupid, as, errp);
+    if (!group) {
+        close(groupfd);
+        return NULL;
+    }
+
+    return group;
+}
+
 void vfio_put_group(VFIOGroup *group)
 {
     if (!group || !QLIST_EMPTY(&group->device_list)) {
-- 
2.18.0


* [Qemu-devel] [RFC 2/3] vfio: support getting VFIOGroup from groupfd
  2018-07-23  4:59 [Qemu-devel] [RFC 0/3] Supporting programming IOMMU in QEMU (vDPA/vhost-user) Tiwei Bie
  2018-07-23  4:59 ` [Qemu-devel] [RFC 1/3] vfio: split vfio_get_group() into small functions Tiwei Bie
@ 2018-07-23  4:59 ` Tiwei Bie
  2018-07-23  4:59 ` [Qemu-devel] [RFC 3/3] vhost-user: support programming VFIO group in master Tiwei Bie
  2 siblings, 0 replies; 23+ messages in thread
From: Tiwei Bie @ 2018-07-23  4:59 UTC (permalink / raw)
  To: mst, alex.williamson, jasowang, qemu-devel; +Cc: tiwei.bie

This patch introduces an API for getting a VFIOGroup
from a group fd. This is useful when the group fd has
been opened by another process and shared with QEMU
via a UNIX socket.

Signed-off-by: Tiwei Bie <tiwei.bie@intel.com>
---
 hw/vfio/common.c              | 44 +++++++++++++++++++++++++++++++++++
 include/hw/vfio/vfio-common.h |  1 +
 2 files changed, 45 insertions(+)

diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 52a05532cd..4c19a33dd5 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -1279,6 +1279,30 @@ static void vfio_disconnect_container(VFIOGroup *group)
     }
 }
 
+static int vfio_groupfd_to_groupid(int groupfd)
+{
+    char *tmp, group_path[PATH_MAX], *group_name;
+    int groupid, len;
+
+    tmp = g_strdup_printf("/proc/self/fd/%d", groupfd);
+    len = readlink(tmp, group_path, sizeof(group_path));
+    g_free(tmp);
+
+    if (len <= 0 || len >= sizeof(group_path)) {
+        return -1;
+    }
+
+    group_path[len] = '\0';
+
+    group_name = g_path_get_basename(group_path);
+    if (sscanf(group_name, "%d", &groupid) != 1) {
+        groupid = -1;
+    }
+    g_free(group_name);
+
+    return groupid;
+}
+
 static VFIOGroup *vfio_init_group(int groupfd, int groupid, AddressSpace *as,
                                   Error **errp)
 {
@@ -1373,6 +1397,26 @@ VFIOGroup *vfio_get_group(int groupid, AddressSpace *as, Error **errp)
     return group;
 }
 
+VFIOGroup *vfio_get_group_from_fd(int groupfd, AddressSpace *as, Error **errp)
+{
+    VFIOGroup *group;
+    int groupid;
+
+    groupid = vfio_groupfd_to_groupid(groupfd);
+    if (groupid < 0) {
+        error_setg(errp, "failed to get group id from group fd %d",
+                   groupfd);
+        return NULL;
+    }
+
+    group = vfio_find_group(groupid, as, errp);
+    if (group) {
+        return group;
+    }
+
+    return vfio_init_group(groupfd, groupid, as, errp);
+}
+
 void vfio_put_group(VFIOGroup *group)
 {
     if (!group || !QLIST_EMPTY(&group->device_list)) {
diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index a9036929b2..9bb1068a36 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -170,6 +170,7 @@ void vfio_region_exit(VFIORegion *region);
 void vfio_region_finalize(VFIORegion *region);
 void vfio_reset_handler(void *opaque);
 VFIOGroup *vfio_get_group(int groupid, AddressSpace *as, Error **errp);
+VFIOGroup *vfio_get_group_from_fd(int groupfd, AddressSpace *as, Error **errp);
 void vfio_put_group(VFIOGroup *group);
 int vfio_get_device(VFIOGroup *group, const char *name,
                     VFIODevice *vbasedev, Error **errp);
-- 
2.18.0


* [Qemu-devel] [RFC 3/3] vhost-user: support programming VFIO group in master
  2018-07-23  4:59 [Qemu-devel] [RFC 0/3] Supporting programming IOMMU in QEMU (vDPA/vhost-user) Tiwei Bie
  2018-07-23  4:59 ` [Qemu-devel] [RFC 1/3] vfio: split vfio_get_group() into small functions Tiwei Bie
  2018-07-23  4:59 ` [Qemu-devel] [RFC 2/3] vfio: support getting VFIOGroup from groupfd Tiwei Bie
@ 2018-07-23  4:59 ` Tiwei Bie
  2018-07-23  9:19   ` Michael S. Tsirkin
                     ` (2 more replies)
  2 siblings, 3 replies; 23+ messages in thread
From: Tiwei Bie @ 2018-07-23  4:59 UTC (permalink / raw)
  To: mst, alex.williamson, jasowang, qemu-devel; +Cc: tiwei.bie

Introduce a slave message to allow slave to share its
VFIO group fd to master and do the IOMMU programming
based on virtio device's DMA address space for this
group in QEMU.

For the vhost backends which support vDPA, they could
leverage this message to ask master to do the IOMMU
programming in QEMU for the vDPA device in backend.

Signed-off-by: Tiwei Bie <tiwei.bie@intel.com>
---
 docs/interop/vhost-user.txt    | 16 ++++++++++++++
 hw/virtio/vhost-user.c         | 40 ++++++++++++++++++++++++++++++++++
 include/hw/virtio/vhost-user.h |  2 ++
 3 files changed, 58 insertions(+)

diff --git a/docs/interop/vhost-user.txt b/docs/interop/vhost-user.txt
index f59667f498..a57a8f9451 100644
--- a/docs/interop/vhost-user.txt
+++ b/docs/interop/vhost-user.txt
@@ -397,6 +397,7 @@ Protocol features
 #define VHOST_USER_PROTOCOL_F_CONFIG         9
 #define VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD  10
 #define VHOST_USER_PROTOCOL_F_HOST_NOTIFIER  11
+#define VHOST_USER_PROTOCOL_F_VFIO_GROUP     12
 
 Master message types
 --------------------
@@ -815,6 +816,21 @@ Slave message types
       This request should be sent only when VHOST_USER_PROTOCOL_F_HOST_NOTIFIER
       protocol feature has been successfully negotiated.
 
+ * VHOST_USER_SLAVE_VFIO_GROUP_MSG
+
+      Id: 4
+      Equivalent ioctl: N/A
+      Slave payload: N/A
+      Master payload: N/A
+
+      When VHOST_USER_PROTOCOL_F_VFIO_GROUP is negotiated, vhost-user slave
+      could send this request to share its VFIO group fd via ancillary data
+      to master. After receiving this request from slave, master will close
+      the existing VFIO group if any and do the DMA programming based on the
+      virtio device's DMA address space for the new group if the request is
+      sent with a file descriptor.
+
+
 VHOST_USER_PROTOCOL_F_REPLY_ACK:
 -------------------------------
 The original vhost-user specification only demands replies for certain
diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c
index b041343632..db958e24c7 100644
--- a/hw/virtio/vhost-user.c
+++ b/hw/virtio/vhost-user.c
@@ -52,6 +52,7 @@ enum VhostUserProtocolFeature {
     VHOST_USER_PROTOCOL_F_CONFIG = 9,
     VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD = 10,
     VHOST_USER_PROTOCOL_F_HOST_NOTIFIER = 11,
+    VHOST_USER_PROTOCOL_F_VFIO_GROUP = 12,
     VHOST_USER_PROTOCOL_F_MAX
 };
 
@@ -97,6 +98,7 @@ typedef enum VhostUserSlaveRequest {
     VHOST_USER_SLAVE_IOTLB_MSG = 1,
     VHOST_USER_SLAVE_CONFIG_CHANGE_MSG = 2,
     VHOST_USER_SLAVE_VRING_HOST_NOTIFIER_MSG = 3,
+    VHOST_USER_SLAVE_VFIO_GROUP_MSG = 4,
     VHOST_USER_SLAVE_MAX
 }  VhostUserSlaveRequest;
 
@@ -949,6 +951,41 @@ static int vhost_user_slave_handle_vring_host_notifier(struct vhost_dev *dev,
     return 0;
 }
 
+static int vhost_user_slave_handle_vfio_group(struct vhost_dev *dev,
+                                              int *fd)
+{
+    struct vhost_user *u = dev->opaque;
+    VhostUserState *user = u->user;
+    VirtIODevice *vdev = dev->vdev;
+    int groupfd = fd[0];
+    VFIOGroup *group;
+
+    if (!virtio_has_feature(dev->protocol_features,
+                            VHOST_USER_PROTOCOL_F_VFIO_GROUP) ||
+        vdev == NULL) {
+        return -1;
+    }
+
+    if (user->vfio_group) {
+        vfio_put_group(user->vfio_group);
+        user->vfio_group = NULL;
+    }
+
+    group = vfio_get_group_from_fd(groupfd, vdev->dma_as, NULL);
+    if (group == NULL) {
+        return -1;
+    }
+
+    if (group->fd != groupfd) {
+        close(groupfd);
+    }
+
+    user->vfio_group = group;
+    fd[0] = -1;
+
+    return 0;
+}
+
 static void slave_read(void *opaque)
 {
     struct vhost_dev *dev = opaque;
@@ -1021,6 +1058,9 @@ static void slave_read(void *opaque)
         ret = vhost_user_slave_handle_vring_host_notifier(dev, &payload.area,
                                                           fd[0]);
         break;
+    case VHOST_USER_SLAVE_VFIO_GROUP_MSG:
+        ret = vhost_user_slave_handle_vfio_group(dev, fd);
+        break;
     default:
         error_report("Received unexpected msg type.");
         ret = -EINVAL;
diff --git a/include/hw/virtio/vhost-user.h b/include/hw/virtio/vhost-user.h
index fd660393a0..9e11473274 100644
--- a/include/hw/virtio/vhost-user.h
+++ b/include/hw/virtio/vhost-user.h
@@ -10,6 +10,7 @@
 
 #include "chardev/char-fe.h"
 #include "hw/virtio/virtio.h"
+#include "hw/vfio/vfio-common.h"
 
 typedef struct VhostUserHostNotifier {
     MemoryRegion mr;
@@ -20,6 +21,7 @@ typedef struct VhostUserHostNotifier {
 typedef struct VhostUserState {
     CharBackend *chr;
     VhostUserHostNotifier notifier[VIRTIO_QUEUE_MAX];
+    VFIOGroup *vfio_group;
 } VhostUserState;
 
 VhostUserState *vhost_user_init(void);
-- 
2.18.0


* Re: [Qemu-devel] [RFC 3/3] vhost-user: support programming VFIO group in master
  2018-07-23  4:59 ` [Qemu-devel] [RFC 3/3] vhost-user: support programming VFIO group in master Tiwei Bie
@ 2018-07-23  9:19   ` Michael S. Tsirkin
  2018-07-23 11:59     ` Tiwei Bie
  2018-07-23  9:20   ` Michael S. Tsirkin
  2018-07-26 20:45   ` Alex Williamson
  2 siblings, 1 reply; 23+ messages in thread
From: Michael S. Tsirkin @ 2018-07-23  9:19 UTC (permalink / raw)
  To: Tiwei Bie; +Cc: alex.williamson, jasowang, qemu-devel

On Mon, Jul 23, 2018 at 12:59:56PM +0800, Tiwei Bie wrote:
> Introduce a slave message to allow slave to share its
> VFIO group fd to master and do the IOMMU programming
> based on virtio device's DMA address space for this
> group in QEMU.
> 
> For the vhost backends which support vDPA, they could
> leverage this message to ask master to do the IOMMU
> programming in QEMU for the vDPA device in backend.
> 
> Signed-off-by: Tiwei Bie <tiwei.bie@intel.com>
> ---
>  docs/interop/vhost-user.txt    | 16 ++++++++++++++
>  hw/virtio/vhost-user.c         | 40 ++++++++++++++++++++++++++++++++++
>  include/hw/virtio/vhost-user.h |  2 ++
>  3 files changed, 58 insertions(+)
> 
> diff --git a/docs/interop/vhost-user.txt b/docs/interop/vhost-user.txt
> index f59667f498..a57a8f9451 100644
> --- a/docs/interop/vhost-user.txt
> +++ b/docs/interop/vhost-user.txt
> @@ -397,6 +397,7 @@ Protocol features
>  #define VHOST_USER_PROTOCOL_F_CONFIG         9
>  #define VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD  10
>  #define VHOST_USER_PROTOCOL_F_HOST_NOTIFIER  11
> +#define VHOST_USER_PROTOCOL_F_VFIO_GROUP     12
>  
>  Master message types
>  --------------------
> @@ -815,6 +816,21 @@ Slave message types
>        This request should be sent only when VHOST_USER_PROTOCOL_F_HOST_NOTIFIER
>        protocol feature has been successfully negotiated.
>  
> + * VHOST_USER_SLAVE_VFIO_GROUP_MSG
> +
> +      Id: 4
> +      Equivalent ioctl: N/A
> +      Slave payload: N/A
> +      Master payload: N/A
> +
> +      When VHOST_USER_PROTOCOL_F_VFIO_GROUP is negotiated, vhost-user slave
> +      could send this request to share its VFIO group fd via ancillary data
> +      to master. After receiving this request from slave, master will close
> +      the existing VFIO group if any and do the DMA programming based on the
> +      virtio device's DMA address space for the new group if the request is
> +      sent with a file descriptor.
> +

Should it also clear out any mappings that were set on the old fd
before closing it?

Also should we limit when this message can be received?
If not you need to re-program mappings again whenever
we get a new fd.

> +
>  VHOST_USER_PROTOCOL_F_REPLY_ACK:
>  -------------------------------
>  The original vhost-user specification only demands replies for certain
> diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c
> index b041343632..db958e24c7 100644
> --- a/hw/virtio/vhost-user.c
> +++ b/hw/virtio/vhost-user.c
> @@ -52,6 +52,7 @@ enum VhostUserProtocolFeature {
>      VHOST_USER_PROTOCOL_F_CONFIG = 9,
>      VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD = 10,
>      VHOST_USER_PROTOCOL_F_HOST_NOTIFIER = 11,
> +    VHOST_USER_PROTOCOL_F_VFIO_GROUP = 12,
>      VHOST_USER_PROTOCOL_F_MAX
>  };
>  
> @@ -97,6 +98,7 @@ typedef enum VhostUserSlaveRequest {
>      VHOST_USER_SLAVE_IOTLB_MSG = 1,
>      VHOST_USER_SLAVE_CONFIG_CHANGE_MSG = 2,
>      VHOST_USER_SLAVE_VRING_HOST_NOTIFIER_MSG = 3,
> +    VHOST_USER_SLAVE_VFIO_GROUP_MSG = 4,
>      VHOST_USER_SLAVE_MAX
>  }  VhostUserSlaveRequest;
>  
> @@ -949,6 +951,41 @@ static int vhost_user_slave_handle_vring_host_notifier(struct vhost_dev *dev,
>      return 0;
>  }
>  
> +static int vhost_user_slave_handle_vfio_group(struct vhost_dev *dev,
> +                                              int *fd)
> +{
> +    struct vhost_user *u = dev->opaque;
> +    VhostUserState *user = u->user;
> +    VirtIODevice *vdev = dev->vdev;
> +    int groupfd = fd[0];
> +    VFIOGroup *group;
> +
> +    if (!virtio_has_feature(dev->protocol_features,
> +                            VHOST_USER_PROTOCOL_F_VFIO_GROUP) ||
> +        vdev == NULL) {
> +        return -1;
> +    }
> +
> +    if (user->vfio_group) {
> +        vfio_put_group(user->vfio_group);
> +        user->vfio_group = NULL;
> +    }
> +
> +    group = vfio_get_group_from_fd(groupfd, vdev->dma_as, NULL);
> +    if (group == NULL) {
> +        return -1;
> +    }
> +
> +    if (group->fd != groupfd) {
> +        close(groupfd);
> +    }
> +
> +    user->vfio_group = group;
> +    fd[0] = -1;
> +
> +    return 0;
> +}
> +

What will cause propagating groups to this vfio fd?


>  static void slave_read(void *opaque)
>  {
>      struct vhost_dev *dev = opaque;
> @@ -1021,6 +1058,9 @@ static void slave_read(void *opaque)
>          ret = vhost_user_slave_handle_vring_host_notifier(dev, &payload.area,
>                                                            fd[0]);
>          break;
> +    case VHOST_USER_SLAVE_VFIO_GROUP_MSG:
> +        ret = vhost_user_slave_handle_vfio_group(dev, fd);
> +        break;
>      default:
>          error_report("Received unexpected msg type.");
>          ret = -EINVAL;
> diff --git a/include/hw/virtio/vhost-user.h b/include/hw/virtio/vhost-user.h
> index fd660393a0..9e11473274 100644
> --- a/include/hw/virtio/vhost-user.h
> +++ b/include/hw/virtio/vhost-user.h
> @@ -10,6 +10,7 @@
>  
>  #include "chardev/char-fe.h"
>  #include "hw/virtio/virtio.h"
> +#include "hw/vfio/vfio-common.h"
>  
>  typedef struct VhostUserHostNotifier {
>      MemoryRegion mr;
> @@ -20,6 +21,7 @@ typedef struct VhostUserHostNotifier {
>  typedef struct VhostUserState {
>      CharBackend *chr;
>      VhostUserHostNotifier notifier[VIRTIO_QUEUE_MAX];
> +    VFIOGroup *vfio_group;
>  } VhostUserState;
>  
>  VhostUserState *vhost_user_init(void);
> -- 
> 2.18.0


* Re: [Qemu-devel] [RFC 3/3] vhost-user: support programming VFIO group in master
  2018-07-23  4:59 ` [Qemu-devel] [RFC 3/3] vhost-user: support programming VFIO group in master Tiwei Bie
  2018-07-23  9:19   ` Michael S. Tsirkin
@ 2018-07-23  9:20   ` Michael S. Tsirkin
  2018-07-23 12:04     ` Tiwei Bie
  2018-07-26 20:45   ` Alex Williamson
  2 siblings, 1 reply; 23+ messages in thread
From: Michael S. Tsirkin @ 2018-07-23  9:20 UTC (permalink / raw)
  To: Tiwei Bie; +Cc: alex.williamson, jasowang, qemu-devel

On Mon, Jul 23, 2018 at 12:59:56PM +0800, Tiwei Bie wrote:
> Introduce a slave message to allow slave to share its
> VFIO group fd to master and do the IOMMU programming
> based on virtio device's DMA address space for this
> group in QEMU.
> 
> For the vhost backends which support vDPA, they could
> leverage this message to ask master to do the IOMMU
> programming in QEMU for the vDPA device in backend.
> 
> Signed-off-by: Tiwei Bie <tiwei.bie@intel.com>
> ---
>  docs/interop/vhost-user.txt    | 16 ++++++++++++++
>  hw/virtio/vhost-user.c         | 40 ++++++++++++++++++++++++++++++++++
>  include/hw/virtio/vhost-user.h |  2 ++
>  3 files changed, 58 insertions(+)
> 
> diff --git a/docs/interop/vhost-user.txt b/docs/interop/vhost-user.txt
> index f59667f498..a57a8f9451 100644
> --- a/docs/interop/vhost-user.txt
> +++ b/docs/interop/vhost-user.txt
> @@ -397,6 +397,7 @@ Protocol features
>  #define VHOST_USER_PROTOCOL_F_CONFIG         9
>  #define VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD  10
>  #define VHOST_USER_PROTOCOL_F_HOST_NOTIFIER  11
> +#define VHOST_USER_PROTOCOL_F_VFIO_GROUP     12
>  
>  Master message types
>  --------------------
> @@ -815,6 +816,21 @@ Slave message types
>        This request should be sent only when VHOST_USER_PROTOCOL_F_HOST_NOTIFIER
>        protocol feature has been successfully negotiated.
>  
> + * VHOST_USER_SLAVE_VFIO_GROUP_MSG
> +
> +      Id: 4
> +      Equivalent ioctl: N/A
> +      Slave payload: N/A
> +      Master payload: N/A
> +
> +      When VHOST_USER_PROTOCOL_F_VFIO_GROUP is negotiated, vhost-user slave
> +      could send this request to share its VFIO group fd via ancillary data
> +      to master. After receiving this request from slave, master will close
> +      the existing VFIO group if any and do the DMA programming based on the
> +      virtio device's DMA address space for the new group if the request is
> +      sent with a file descriptor.
> +
> +
>  VHOST_USER_PROTOCOL_F_REPLY_ACK:
>  -------------------------------
>  The original vhost-user specification only demands replies for certain
> diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c
> index b041343632..db958e24c7 100644
> --- a/hw/virtio/vhost-user.c
> +++ b/hw/virtio/vhost-user.c
> @@ -52,6 +52,7 @@ enum VhostUserProtocolFeature {
>      VHOST_USER_PROTOCOL_F_CONFIG = 9,
>      VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD = 10,
>      VHOST_USER_PROTOCOL_F_HOST_NOTIFIER = 11,
> +    VHOST_USER_PROTOCOL_F_VFIO_GROUP = 12,
>      VHOST_USER_PROTOCOL_F_MAX
>  };
>  
> @@ -97,6 +98,7 @@ typedef enum VhostUserSlaveRequest {
>      VHOST_USER_SLAVE_IOTLB_MSG = 1,
>      VHOST_USER_SLAVE_CONFIG_CHANGE_MSG = 2,
>      VHOST_USER_SLAVE_VRING_HOST_NOTIFIER_MSG = 3,
> +    VHOST_USER_SLAVE_VFIO_GROUP_MSG = 4,
>      VHOST_USER_SLAVE_MAX
>  }  VhostUserSlaveRequest;
>  
> @@ -949,6 +951,41 @@ static int vhost_user_slave_handle_vring_host_notifier(struct vhost_dev *dev,
>      return 0;
>  }
>  
> +static int vhost_user_slave_handle_vfio_group(struct vhost_dev *dev,
> +                                              int *fd)
> +{
> +    struct vhost_user *u = dev->opaque;
> +    VhostUserState *user = u->user;
> +    VirtIODevice *vdev = dev->vdev;
> +    int groupfd = fd[0];
> +    VFIOGroup *group;
> +
> +    if (!virtio_has_feature(dev->protocol_features,
> +                            VHOST_USER_PROTOCOL_F_VFIO_GROUP) ||
> +        vdev == NULL) {
> +        return -1;
> +    }
> +
> +    if (user->vfio_group) {
> +        vfio_put_group(user->vfio_group);
> +        user->vfio_group = NULL;

Seems to create a window where mappings are invalid
even if the same fd is re-sent. Is that OK?

> +    }
> +
> +    group = vfio_get_group_from_fd(groupfd, vdev->dma_as, NULL);
> +    if (group == NULL) {
> +        return -1;
> +    }
> +
> +    if (group->fd != groupfd) {
> +        close(groupfd);
> +    }
> +
> +    user->vfio_group = group;
> +    fd[0] = -1;
> +
> +    return 0;
> +}
> +
>  static void slave_read(void *opaque)
>  {
>      struct vhost_dev *dev = opaque;
> @@ -1021,6 +1058,9 @@ static void slave_read(void *opaque)
>          ret = vhost_user_slave_handle_vring_host_notifier(dev, &payload.area,
>                                                            fd[0]);
>          break;
> +    case VHOST_USER_SLAVE_VFIO_GROUP_MSG:
> +        ret = vhost_user_slave_handle_vfio_group(dev, fd);
> +        break;
>      default:
>          error_report("Received unexpected msg type.");
>          ret = -EINVAL;
> diff --git a/include/hw/virtio/vhost-user.h b/include/hw/virtio/vhost-user.h
> index fd660393a0..9e11473274 100644
> --- a/include/hw/virtio/vhost-user.h
> +++ b/include/hw/virtio/vhost-user.h
> @@ -10,6 +10,7 @@
>  
>  #include "chardev/char-fe.h"
>  #include "hw/virtio/virtio.h"
> +#include "hw/vfio/vfio-common.h"
>  
>  typedef struct VhostUserHostNotifier {
>      MemoryRegion mr;
> @@ -20,6 +21,7 @@ typedef struct VhostUserHostNotifier {
>  typedef struct VhostUserState {
>      CharBackend *chr;
>      VhostUserHostNotifier notifier[VIRTIO_QUEUE_MAX];
> +    VFIOGroup *vfio_group;
>  } VhostUserState;
>  
>  VhostUserState *vhost_user_init(void);
> -- 
> 2.18.0


* Re: [Qemu-devel] [RFC 3/3] vhost-user: support programming VFIO group in master
  2018-07-23  9:19   ` Michael S. Tsirkin
@ 2018-07-23 11:59     ` Tiwei Bie
  0 siblings, 0 replies; 23+ messages in thread
From: Tiwei Bie @ 2018-07-23 11:59 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: alex.williamson, jasowang, qemu-devel

On Mon, Jul 23, 2018 at 12:19:04PM +0300, Michael S. Tsirkin wrote:
> On Mon, Jul 23, 2018 at 12:59:56PM +0800, Tiwei Bie wrote:
[...]
> > @@ -815,6 +816,21 @@ Slave message types
> >        This request should be sent only when VHOST_USER_PROTOCOL_F_HOST_NOTIFIER
> >        protocol feature has been successfully negotiated.
> >  
> > + * VHOST_USER_SLAVE_VFIO_GROUP_MSG
> > +
> > +      Id: 4
> > +      Equivalent ioctl: N/A
> > +      Slave payload: N/A
> > +      Master payload: N/A
> > +
> > +      When VHOST_USER_PROTOCOL_F_VFIO_GROUP is negotiated, vhost-user slave
> > +      could send this request to share its VFIO group fd via ancillary data
> > +      to master. After receiving this request from slave, master will close
> > +      the existing VFIO group if any and do the DMA programming based on the
> > +      virtio device's DMA address space for the new group if the request is
> > +      sent with a file descriptor.
> > +
> 
> Should it also clear out any mappings that were set on the old fd
> before closing it?

Yes, more precisely it will do the following (a rough
sketch in terms of the raw ioctls is included below):

1. Delete the VFIO group from the KVM vfio device fd;
2. Unset the container fd for this group fd;
3. Close this VFIO group fd.

Should I include these details in the doc?
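
For reference, those three steps boil down to the kernel interfaces below.
This is only an illustration in terms of raw ioctls; in QEMU they are done
by the existing vfio_put_group() path, and kvm_vfio_dev_fd (the fd of the
KVM "vfio" pseudo device) and the helper name are assumptions:

#include <stdint.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>
#include <linux/vfio.h>

static void drop_vfio_group(int kvm_vfio_dev_fd, int group_fd)
{
    struct kvm_device_attr attr = {
        .group = KVM_DEV_VFIO_GROUP,
        .attr  = KVM_DEV_VFIO_GROUP_DEL,
        .addr  = (uint64_t)(uintptr_t)&group_fd,
    };

    /* 1. Remove the group from the KVM vfio device */
    ioctl(kvm_vfio_dev_fd, KVM_SET_DEVICE_ATTR, &attr);

    /* 2. Detach the group from its VFIO container */
    ioctl(group_fd, VFIO_GROUP_UNSET_CONTAINER);

    /* 3. Close the group fd */
    close(group_fd);
}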

> 
> Also should we limit when this message can be received?
> If not you need to re-program mappings again whenever
> we get a new fd.

To keep things simple, this proposal requires the slave
to assume the mappings are invalid before receiving the
REPLY from master when the slave sends this message to
master, and master will destroy the existing VFIO group
if any and do the setup for the (new) VFIO group if the
message carries a fd.

So if a VFIO group fd has been sent and the device has
been started, before sending a VFIO group fd (could be
the same fd that has been sent), the slave should stop
the device first and shouldn't assume the mappings are
valid before receiving the REPLY.

> 
> > +
[...]
> >  
> > +static int vhost_user_slave_handle_vfio_group(struct vhost_dev *dev,
> > +                                              int *fd)
> > +{
> > +    struct vhost_user *u = dev->opaque;
> > +    VhostUserState *user = u->user;
> > +    VirtIODevice *vdev = dev->vdev;
> > +    int groupfd = fd[0];
> > +    VFIOGroup *group;
> > +
> > +    if (!virtio_has_feature(dev->protocol_features,
> > +                            VHOST_USER_PROTOCOL_F_VFIO_GROUP) ||
> > +        vdev == NULL) {
> > +        return -1;
> > +    }
> > +
> > +    if (user->vfio_group) {
> > +        vfio_put_group(user->vfio_group);
> > +        user->vfio_group = NULL;
> > +    }
> > +
> > +    group = vfio_get_group_from_fd(groupfd, vdev->dma_as, NULL);
> > +    if (group == NULL) {
> > +        return -1;
> > +    }
> > +
> > +    if (group->fd != groupfd) {
> > +        close(groupfd);
> > +    }
> > +
> > +    user->vfio_group = group;
> > +    fd[0] = -1;
> > +
> > +    return 0;
> > +}
> > +
> 
> What will cause propagating groups to this vfio fd?

Do you mean when a VFIOGroup will be created for this
VFIO group fd?

A VFIOGroup will be created if there is no existing
VFIOGroup that references the same group id.

Best regards,
Tiwei Bie

> 
> 
[...]


* Re: [Qemu-devel] [RFC 3/3] vhost-user: support programming VFIO group in master
  2018-07-23  9:20   ` Michael S. Tsirkin
@ 2018-07-23 12:04     ` Tiwei Bie
  0 siblings, 0 replies; 23+ messages in thread
From: Tiwei Bie @ 2018-07-23 12:04 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: alex.williamson, jasowang, qemu-devel

On Mon, Jul 23, 2018 at 12:20:12PM +0300, Michael S. Tsirkin wrote:
> On Mon, Jul 23, 2018 at 12:59:56PM +0800, Tiwei Bie wrote:
[...]
> >  
> > +static int vhost_user_slave_handle_vfio_group(struct vhost_dev *dev,
> > +                                              int *fd)
> > +{
> > +    struct vhost_user *u = dev->opaque;
> > +    VhostUserState *user = u->user;
> > +    VirtIODevice *vdev = dev->vdev;
> > +    int groupfd = fd[0];
> > +    VFIOGroup *group;
> > +
> > +    if (!virtio_has_feature(dev->protocol_features,
> > +                            VHOST_USER_PROTOCOL_F_VFIO_GROUP) ||
> > +        vdev == NULL) {
> > +        return -1;
> > +    }
> > +
> > +    if (user->vfio_group) {
> > +        vfio_put_group(user->vfio_group);
> > +        user->vfio_group = NULL;
> 
> Seems to create a window where mappings are invalid
> even if the same fd is re-sent. Is that OK?

Yeah, there will be a window in which the mappings are
invalid when the same fd is re-sent. Based on the proposal [1]
for this patch, it should be OK.

[1] http://lists.gnu.org/archive/html/qemu-devel/2018-07/msg04335.html
"""
To keep things simple, this proposal requires the slave
to assume the mappings are invalid before receiving the
REPLY from master when the slave sends this message to
master, and master will destroy the existing VFIO group
if any and do the setup for the (new) VFIO group if the
message carries a fd.

So if a VFIO group fd has been sent and the device has
been started, before sending a VFIO group fd (could be
the same fd that has been sent), the slave should stop
the device first and shouldn't assume the mappings are
valid before receiving the REPLY.
"""

Best regards,
Tiwei Bie

> 
> > +    }
[...]


* Re: [Qemu-devel] [RFC 3/3] vhost-user: support programming VFIO group in master
  2018-07-23  4:59 ` [Qemu-devel] [RFC 3/3] vhost-user: support programming VFIO group in master Tiwei Bie
  2018-07-23  9:19   ` Michael S. Tsirkin
  2018-07-23  9:20   ` Michael S. Tsirkin
@ 2018-07-26 20:45   ` Alex Williamson
  2018-07-27  1:58     ` Tiwei Bie
  2 siblings, 1 reply; 23+ messages in thread
From: Alex Williamson @ 2018-07-26 20:45 UTC (permalink / raw)
  To: Tiwei Bie; +Cc: mst, jasowang, qemu-devel

On Mon, 23 Jul 2018 12:59:56 +0800
Tiwei Bie <tiwei.bie@intel.com> wrote:

> Introduce a slave message to allow slave to share its
> VFIO group fd to master and do the IOMMU programming
> based on virtio device's DMA address space for this
> group in QEMU.
> 
> For the vhost backends which support vDPA, they could
> leverage this message to ask master to do the IOMMU
> programming in QEMU for the vDPA device in backend.
> 
> Signed-off-by: Tiwei Bie <tiwei.bie@intel.com>
> ---
>  docs/interop/vhost-user.txt    | 16 ++++++++++++++
>  hw/virtio/vhost-user.c         | 40 ++++++++++++++++++++++++++++++++++
>  include/hw/virtio/vhost-user.h |  2 ++
>  3 files changed, 58 insertions(+)
> 
> diff --git a/docs/interop/vhost-user.txt b/docs/interop/vhost-user.txt
> index f59667f498..a57a8f9451 100644
> --- a/docs/interop/vhost-user.txt
> +++ b/docs/interop/vhost-user.txt
> @@ -397,6 +397,7 @@ Protocol features
>  #define VHOST_USER_PROTOCOL_F_CONFIG         9
>  #define VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD  10
>  #define VHOST_USER_PROTOCOL_F_HOST_NOTIFIER  11
> +#define VHOST_USER_PROTOCOL_F_VFIO_GROUP     12
>  
>  Master message types
>  --------------------
> @@ -815,6 +816,21 @@ Slave message types
>        This request should be sent only when VHOST_USER_PROTOCOL_F_HOST_NOTIFIER
>        protocol feature has been successfully negotiated.
>  
> + * VHOST_USER_SLAVE_VFIO_GROUP_MSG
> +
> +      Id: 4
> +      Equivalent ioctl: N/A
> +      Slave payload: N/A
> +      Master payload: N/A
> +
> +      When VHOST_USER_PROTOCOL_F_VFIO_GROUP is negotiated, vhost-user slave
> +      could send this request to share its VFIO group fd via ancillary data
> +      to master. After receiving this request from slave, master will close
> +      the existing VFIO group if any and do the DMA programming based on the
> +      virtio device's DMA address space for the new group if the request is
> +      sent with a file descriptor.
> +
> +
>  VHOST_USER_PROTOCOL_F_REPLY_ACK:
>  -------------------------------
>  The original vhost-user specification only demands replies for certain
> diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c
> index b041343632..db958e24c7 100644
> --- a/hw/virtio/vhost-user.c
> +++ b/hw/virtio/vhost-user.c
> @@ -52,6 +52,7 @@ enum VhostUserProtocolFeature {
>      VHOST_USER_PROTOCOL_F_CONFIG = 9,
>      VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD = 10,
>      VHOST_USER_PROTOCOL_F_HOST_NOTIFIER = 11,
> +    VHOST_USER_PROTOCOL_F_VFIO_GROUP = 12,
>      VHOST_USER_PROTOCOL_F_MAX
>  };
>  
> @@ -97,6 +98,7 @@ typedef enum VhostUserSlaveRequest {
>      VHOST_USER_SLAVE_IOTLB_MSG = 1,
>      VHOST_USER_SLAVE_CONFIG_CHANGE_MSG = 2,
>      VHOST_USER_SLAVE_VRING_HOST_NOTIFIER_MSG = 3,
> +    VHOST_USER_SLAVE_VFIO_GROUP_MSG = 4,
>      VHOST_USER_SLAVE_MAX
>  }  VhostUserSlaveRequest;
>  
> @@ -949,6 +951,41 @@ static int vhost_user_slave_handle_vring_host_notifier(struct vhost_dev *dev,
>      return 0;
>  }
>  
> +static int vhost_user_slave_handle_vfio_group(struct vhost_dev *dev,
> +                                              int *fd)
> +{
> +    struct vhost_user *u = dev->opaque;
> +    VhostUserState *user = u->user;
> +    VirtIODevice *vdev = dev->vdev;
> +    int groupfd = fd[0];
> +    VFIOGroup *group;
> +
> +    if (!virtio_has_feature(dev->protocol_features,
> +                            VHOST_USER_PROTOCOL_F_VFIO_GROUP) ||
> +        vdev == NULL) {
> +        return -1;
> +    }
> +
> +    if (user->vfio_group) {
> +        vfio_put_group(user->vfio_group);
> +        user->vfio_group = NULL;
> +    }
> +
> +    group = vfio_get_group_from_fd(groupfd, vdev->dma_as, NULL);
> +    if (group == NULL) {
> +        return -1;
> +    }
> +
> +    if (group->fd != groupfd) {
> +        close(groupfd);
> +    }
> +
> +    user->vfio_group = group;
> +    fd[0] = -1;
> +
> +    return 0;
> +}

This all looks very sketchy, we're reaching into vfio internal state
and arbitrarily releasing data structures, reusing existing ones, and
maybe creating new ones.  We know that a vfio group only allows a
single open, right?  So if you have a groupfd, vfio will never have
that same group opened already.  Is that the goal?  It's not obvious in
the code.  I don't really understand what vhost goes on to do with this
group, but I'm pretty sure the existing state machine in vfio isn't
designed for it.  Thanks,

Alex

> +
>  static void slave_read(void *opaque)
>  {
>      struct vhost_dev *dev = opaque;
> @@ -1021,6 +1058,9 @@ static void slave_read(void *opaque)
>          ret = vhost_user_slave_handle_vring_host_notifier(dev, &payload.area,
>                                                            fd[0]);
>          break;
> +    case VHOST_USER_SLAVE_VFIO_GROUP_MSG:
> +        ret = vhost_user_slave_handle_vfio_group(dev, fd);
> +        break;
>      default:
>          error_report("Received unexpected msg type.");
>          ret = -EINVAL;
> diff --git a/include/hw/virtio/vhost-user.h b/include/hw/virtio/vhost-user.h
> index fd660393a0..9e11473274 100644
> --- a/include/hw/virtio/vhost-user.h
> +++ b/include/hw/virtio/vhost-user.h
> @@ -10,6 +10,7 @@
>  
>  #include "chardev/char-fe.h"
>  #include "hw/virtio/virtio.h"
> +#include "hw/vfio/vfio-common.h"
>  
>  typedef struct VhostUserHostNotifier {
>      MemoryRegion mr;
> @@ -20,6 +21,7 @@ typedef struct VhostUserHostNotifier {
>  typedef struct VhostUserState {
>      CharBackend *chr;
>      VhostUserHostNotifier notifier[VIRTIO_QUEUE_MAX];
> +    VFIOGroup *vfio_group;
>  } VhostUserState;
>  
>  VhostUserState *vhost_user_init(void);


* Re: [Qemu-devel] [RFC 3/3] vhost-user: support programming VFIO group in master
  2018-07-26 20:45   ` Alex Williamson
@ 2018-07-27  1:58     ` Tiwei Bie
  2018-07-27 20:03       ` Alex Williamson
  0 siblings, 1 reply; 23+ messages in thread
From: Tiwei Bie @ 2018-07-27  1:58 UTC (permalink / raw)
  To: Alex Williamson; +Cc: mst, jasowang, qemu-devel

On Thu, Jul 26, 2018 at 02:45:39PM -0600, Alex Williamson wrote:
> On Mon, 23 Jul 2018 12:59:56 +0800
> Tiwei Bie <tiwei.bie@intel.com> wrote:
> 
[...]
> >  
> > +static int vhost_user_slave_handle_vfio_group(struct vhost_dev *dev,
> > +                                              int *fd)
> > +{
> > +    struct vhost_user *u = dev->opaque;
> > +    VhostUserState *user = u->user;
> > +    VirtIODevice *vdev = dev->vdev;
> > +    int groupfd = fd[0];
> > +    VFIOGroup *group;
> > +
> > +    if (!virtio_has_feature(dev->protocol_features,
> > +                            VHOST_USER_PROTOCOL_F_VFIO_GROUP) ||
> > +        vdev == NULL) {
> > +        return -1;
> > +    }
> > +
> > +    if (user->vfio_group) {
> > +        vfio_put_group(user->vfio_group);
> > +        user->vfio_group = NULL;
> > +    }
> > +

There should be a below check here (I missed it in this
patch, sorry..):

    if (groupfd < 0) {
        return 0;
    }

> > +    group = vfio_get_group_from_fd(groupfd, vdev->dma_as, NULL);
> > +    if (group == NULL) {
> > +        return -1;
> > +    }
> > +
> > +    if (group->fd != groupfd) {
> > +        close(groupfd);
> > +    }
> > +
> > +    user->vfio_group = group;
> > +    fd[0] = -1;
> > +
> > +    return 0;
> > +}
> 
> This all looks very sketchy, we're reaching into vfio internal state
> and arbitrarily releasing data structures, reusing existing ones, and
> maybe creating new ones.  We know that a vfio group only allows a
> single open, right?

Yeah, exactly!

When the vhost-user backend process has opened a VFIO group fd,
QEMU won't be able to open this group file via open() any more.
So vhost-user backend process will share the VFIO group fd to
QEMU via the UNIX socket. And that's why we're introducing a
new API:

	vfio_get_group_from_fd(groupfd, ...);

for vfio/common.c to get (setup) a VFIOGroup structure. (Because
in this case, vfio/common.c can't open this group file via open(),
and what we have is a VFIO group fd shared by another process via
UNIX socket). And ...

> So if you have a groupfd, vfio will never have
> that same group opened already.

... if the same vhost-user backend process shares the same VFIO
group fd more than once via different vhost-user slave channels
to different vhost-user frontends in the same QEMU process, the
same VFIOGroup could exist already.

This case could happen when e.g. there are two vhost accelerator
devices in the same VFIO group, and they are managed by the same
vhost-user backend process, and used to accelerate two different
virtio devices in QEMU. In this case, the same VFIO group fd could
be sent twice.

If we need to support this case, more changes in vfio/common.c
will be needed, e.g. 1) add a refcnt in VFIOGroup to support
getting and putting a VFIOGroup structure multiple times,
2) add a lock to make vfio_get_group*() and vfio_put_group()
thread-safe.
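
As a rough illustration of 1) and 2), assuming glib locking and invented
names (not part of this series), the get/put paths could share something
like:

#include <glib.h>
#include <stdbool.h>

/* Hypothetical per-group usage counter; real code would hang this off
 * VFIOGroup and keep the existing device_list check in vfio_put_group(). */
typedef struct GroupUsers {
    GMutex lock;        /* assumed to be g_mutex_init()'ed with the group */
    int users;          /* vhost-user frontends referencing the group */
} GroupUsers;

static void group_users_get(GroupUsers *gu)
{
    g_mutex_lock(&gu->lock);
    gu->users++;
    g_mutex_unlock(&gu->lock);
}

/* Returns true when the last user is gone and the group can be torn down */
static bool group_users_put(GroupUsers *gu)
{
    bool last;

    g_mutex_lock(&gu->lock);
    last = (--gu->users == 0);
    g_mutex_unlock(&gu->lock);

    return last;
}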

> Is that the goal?  It's not obvious in
> the code.  I don't really understand what vhost goes on to do with this
> group,

The goal is that, we have a VFIO group opened by an external
vhost-user backend process, this process will manage the VFIO
device, but want QEMU to manage the VFIO group, including:

1 - program the DMA mappings for this VFIO group based on
    the front-end device's dma_as in QEMU.

2 - add this VFIO group to KVM (KVM_DEV_VFIO_GROUP_ADD).

Vhost-user in QEMU as the vhost-user frontend just wants
hw/vfio to do above things after receiving a VFIO group fd
from the vhost-user backend process.

Vhost-user will just ask hw/vfio to get (setup) a VFIOGroup
by calling vfio_get_group_from_fd() or put this VFIOGroup by
calling vfio_put_group() when it's not needed anymore, and
won't do anything else.

Vhost-user will only deal with the VFIOGroups allocated by
itself, and won't influence any other VFIOGroups used by the
VFIO passthru devices (-device vfio-pci). Because the same
VFIO group file can only be opened by one process via open().
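
For reference, point 1 above ultimately means a VFIO_IOMMU_MAP_DMA ioctl on
the container fd for each guest memory region; in QEMU this is driven by the
MemoryListener in hw/vfio/common.c rather than open-coded. A minimal sketch
(helper name and parameters are assumptions):

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

static int map_guest_region(int container_fd, uint64_t iova,
                            uint64_t size, void *vaddr)
{
    struct vfio_iommu_type1_dma_map map = {
        .argsz = sizeof(map),
        .flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
        .vaddr = (uint64_t)(uintptr_t)vaddr,
        .iova  = iova,      /* guest address in the device's DMA AS */
        .size  = size,
    };

    return ioctl(container_fd, VFIO_IOMMU_MAP_DMA, &map);
}

Point 2 mirrors the teardown sketch earlier in the thread, with
KVM_DEV_VFIO_GROUP_ADD instead of KVM_DEV_VFIO_GROUP_DEL.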

> but I'm pretty sure the existing state machine in vfio isn't
> designed for it.  Thanks,

Yeah, more changes (e.g. refcnt, ..) in hw/vfio are needed to
make it work robustly. Hmm.. Is above idea (e.g. how vhost-user
is going to use VFIOGroup and how we are going to extend hw/vfio)
acceptable to you? Thanks!

Best regards,
Tiwei Bie


* Re: [Qemu-devel] [RFC 3/3] vhost-user: support programming VFIO group in master
  2018-07-27  1:58     ` Tiwei Bie
@ 2018-07-27 20:03       ` Alex Williamson
  2018-07-30  8:10         ` Tiwei Bie
  0 siblings, 1 reply; 23+ messages in thread
From: Alex Williamson @ 2018-07-27 20:03 UTC (permalink / raw)
  To: Tiwei Bie; +Cc: mst, jasowang, qemu-devel

On Fri, 27 Jul 2018 09:58:05 +0800
Tiwei Bie <tiwei.bie@intel.com> wrote:

> On Thu, Jul 26, 2018 at 02:45:39PM -0600, Alex Williamson wrote:
> > On Mon, 23 Jul 2018 12:59:56 +0800
> > Tiwei Bie <tiwei.bie@intel.com> wrote:
> >   
> [...]
> > >  
> > > +static int vhost_user_slave_handle_vfio_group(struct vhost_dev *dev,
> > > +                                              int *fd)
> > > +{
> > > +    struct vhost_user *u = dev->opaque;
> > > +    VhostUserState *user = u->user;
> > > +    VirtIODevice *vdev = dev->vdev;
> > > +    int groupfd = fd[0];
> > > +    VFIOGroup *group;
> > > +
> > > +    if (!virtio_has_feature(dev->protocol_features,
> > > +                            VHOST_USER_PROTOCOL_F_VFIO_GROUP) ||
> > > +        vdev == NULL) {
> > > +        return -1;
> > > +    }
> > > +
> > > +    if (user->vfio_group) {
> > > +        vfio_put_group(user->vfio_group);
> > > +        user->vfio_group = NULL;
> > > +    }
> > > +  
> 
> There should be a below check here (I missed it in this
> patch, sorry..):
> 
>     if (groupfd < 0) {
>         return 0;
>     }
> 
> > > +    group = vfio_get_group_from_fd(groupfd, vdev->dma_as, NULL);
> > > +    if (group == NULL) {
> > > +        return -1;
> > > +    }
> > > +
> > > +    if (group->fd != groupfd) {
> > > +        close(groupfd);
> > > +    }
> > > +
> > > +    user->vfio_group = group;
> > > +    fd[0] = -1;
> > > +
> > > +    return 0;
> > > +}  
> > 
> > This all looks very sketchy, we're reaching into vfio internal state
> > and arbitrarily releasing data structures, reusing existing ones, and
> > maybe creating new ones.  We know that a vfio group only allows a
> > single open, right?  
> 
> Yeah, exactly!
> 
> When the vhost-user backend process has opened a VFIO group fd,
> QEMU won't be able to open this group file via open() any more.
> So vhost-user backend process will share the VFIO group fd to
> QEMU via the UNIX socket. And that's why we're introducing a
> new API:
> 
> 	vfio_get_group_from_fd(groupfd, ...);
> 
> for vfio/common.c to get (setup) a VFIOGroup structure. (Because
> in this case, vfio/common.c can't open this group file via open(),
> and what we have is a VFIO group fd shared by another process via
> UNIX socket). And ...
> 
> > So if you have a groupfd, vfio will never have
> > that same group opened already.  
> 
> ... if the same vhost-user backend process shares the same VFIO
> > group fd more than once via different vhost-user slave channels
> to different vhost-user frontends in the same QEMU process, the
> same VFIOGroup could exist already.
> 
> This case could happen when e.g. there are two vhost accelerator
> devices in the same VFIO group, and they are managed by the same
> vhost-user backend process, and used to accelerate two different
> virtio devices in QEMU. In this case, the same VFIO group fd could
> be sent twice.
> 
> If we need to support this case, more changes in vfio/common.c
> will be needed, e.g. 1) add a refcnt in VFIOGroup to support
> getting and putting a VFIOGroup structure multiple times,
> 2) add a lock to make vfio_get_group*() and vfio_put_group()
> thread-safe.

This is the sort of case where it seems like we're just grabbing
internal interfaces and using them from other modules.  VFIOGroups have
a device_list and currently handles multiple puts by testing whether
that device list is empty (ie. we already have a refcnt effectively).
Now we have a group user that has no devices, which is not something
that it seems like we've designed or audited the code for handling.
IOW, while the header file lives in a common location, it's not really
intended to be an API within QEMU and it makes me a bit uncomfortable
to use it as such.

> > Is that the goal?  It's not obvious in
> > the code.  I don't really understand what vhost goes on to do with this
> > group,  
> 
> The goal is that, we have a VFIO group opened by an external
> vhost-user backend process, this process will manage the VFIO
> device, but want QEMU to manage the VFIO group, including:
> 
> 1 - program the DMA mappings for this VFIO group based on
>     the front-end device's dma_as in QEMU.
> 
> 2 - add this VFIO group to KVM (KVM_DEV_VFIO_GROUP_ADD).
> 
> Vhost-user in QEMU as the vhost-user frontend just wants
> hw/vfio to do above things after receiving a VFIO group fd
> from the vhost-user backend process.
> 
> Vhost-user will just ask hw/vfio to get (setup) a VFIOGroup
> by calling vfio_get_group_from_fd() or put this VFIOGroup by
> calling vfio_put_group() when it's not needed anymore, and
> won't do anything else.
> 
> Vhost-user will only deal with the VFIOGroups allocated by
> itself, and won't influence any other VFIOGroups used by the
> VFIO passthru devices (-device vfio-pci). Because the same
> VFIO group file can only be opened by one process via open().

But there's nothing here that guarantees the reverse.  vhost cannot
open the group of an existing vfio-pci device, but vfio-pci can re-use
the group that vhost has already opened.  This is again where it seems
like we're trying to cherry pick from an internal API and leaving some
loose ends dangling.

> > but I'm pretty sure the existing state machine in vfio isn't
> > designed for it.  Thanks,  
> 
> Yeah, more changes (e.g. refcnt, ..) in hw/vfio are needed to
> make it work robustly. Hmm.. Is above idea (e.g. how vhost-user
> is going to use VFIOGroup and how we are going to extend hw/vfio)
> acceptable to you? Thanks!

I certainly would not want to duplicate managing the group, container,
and MemoryListener, but I think we need a more formal interface with at
least some notion of reference counting beyond the device list or
explicit knowledge of an external user.  I'm also curious if it really
makes sense that the vhost backend is opening the group rather than
letting QEMU do it.  It seems that vhost wants to deal with the device
and we can never have symmetry in the scenario above, where a group
might contain multiple devices and order matters because of where the
group is opened.  If we still had a notion of a VFIODevice for an
external user, then the device_list refcnt would just work.  Thanks,

Alex
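
One way to read the VFIODevice-for-an-external-user idea above, purely as a
sketch (the stub type and helpers are invented here; the VFIODevice fields
are the ones from include/hw/vfio/vfio-common.h): register a placeholder
device on the group's device_list so the existing QLIST_EMPTY() check in
vfio_put_group() acts as the reference count.

#include "qemu/osdep.h"
#include "hw/vfio/vfio-common.h"

/* Hypothetical stub, not part of this series */
typedef struct VFIOExternalUser {
    VFIODevice vbasedev;
} VFIOExternalUser;

static void vfio_register_external_user(VFIOGroup *group,
                                        VFIOExternalUser *user)
{
    user->vbasedev.name = g_strdup("vhost-user external user");
    user->vbasedev.group = group;
    /* Sits on device_list purely to keep the group referenced */
    QLIST_INSERT_HEAD(&group->device_list, &user->vbasedev, next);
}

static void vfio_unregister_external_user(VFIOExternalUser *user)
{
    QLIST_REMOVE(&user->vbasedev, next);
    g_free(user->vbasedev.name);
}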


* Re: [Qemu-devel] [RFC 3/3] vhost-user: support programming VFIO group in master
  2018-07-27 20:03       ` Alex Williamson
@ 2018-07-30  8:10         ` Tiwei Bie
  2018-07-30  9:30           ` Michael S. Tsirkin
  0 siblings, 1 reply; 23+ messages in thread
From: Tiwei Bie @ 2018-07-30  8:10 UTC (permalink / raw)
  To: Alex Williamson; +Cc: mst, jasowang, qemu-devel

On Fri, Jul 27, 2018 at 02:03:00PM -0600, Alex Williamson wrote:
> On Fri, 27 Jul 2018 09:58:05 +0800
> Tiwei Bie <tiwei.bie@intel.com> wrote:
> 
> > On Thu, Jul 26, 2018 at 02:45:39PM -0600, Alex Williamson wrote:
> > > On Mon, 23 Jul 2018 12:59:56 +0800
> > > Tiwei Bie <tiwei.bie@intel.com> wrote:
> > >   
> > [...]
> > > >  
> > > > +static int vhost_user_slave_handle_vfio_group(struct vhost_dev *dev,
> > > > +                                              int *fd)
> > > > +{
> > > > +    struct vhost_user *u = dev->opaque;
> > > > +    VhostUserState *user = u->user;
> > > > +    VirtIODevice *vdev = dev->vdev;
> > > > +    int groupfd = fd[0];
> > > > +    VFIOGroup *group;
> > > > +
> > > > +    if (!virtio_has_feature(dev->protocol_features,
> > > > +                            VHOST_USER_PROTOCOL_F_VFIO_GROUP) ||
> > > > +        vdev == NULL) {
> > > > +        return -1;
> > > > +    }
> > > > +
> > > > +    if (user->vfio_group) {
> > > > +        vfio_put_group(user->vfio_group);
> > > > +        user->vfio_group = NULL;
> > > > +    }
> > > > +  
> > 
> > There should be a below check here (I missed it in this
> > patch, sorry..):
> > 
> >     if (groupfd < 0) {
> >         return 0;
> >     }
> > 
> > > > +    group = vfio_get_group_from_fd(groupfd, vdev->dma_as, NULL);
> > > > +    if (group == NULL) {
> > > > +        return -1;
> > > > +    }
> > > > +
> > > > +    if (group->fd != groupfd) {
> > > > +        close(groupfd);
> > > > +    }
> > > > +
> > > > +    user->vfio_group = group;
> > > > +    fd[0] = -1;
> > > > +
> > > > +    return 0;
> > > > +}  
> > > 
> > > This all looks very sketchy, we're reaching into vfio internal state
> > > and arbitrarily releasing data structures, reusing existing ones, and
> > > maybe creating new ones.  We know that a vfio group only allows a
> > > single open, right?  
> > 
> > Yeah, exactly!
> > 
> > When the vhost-user backend process has opened a VFIO group fd,
> > QEMU won't be able to open this group file via open() any more.
> > So vhost-user backend process will share the VFIO group fd to
> > QEMU via the UNIX socket. And that's why we're introducing a
> > new API:
> > 
> > 	vfio_get_group_from_fd(groupfd, ...);
> > 
> > for vfio/common.c to get (setup) a VFIOGroup structure. (Because
> > in this case, vfio/common.c can't open this group file via open(),
> > and what we have is a VFIO group fd shared by another process via
> > UNIX socket). And ...
> > 
> > > So if you have a groupfd, vfio will never have
> > > that same group opened already.  
> > 
> > ... if the same vhost-user backend process shares the same VFIO
> > group fd more than once via different vhost-user slave channels
> > to different vhost-user frontends in the same QEMU process, the
> > same VFIOGroup could exist already.
> > 
> > This case could happen when e.g. there are two vhost accelerator
> > devices in the same VFIO group, and they are managed by the same
> > vhost-user backend process, and used to accelerate two different
> > virtio devices in QEMU. In this case, the same VFIO group fd could
> > be sent twice.
> > 
> > If we need to support this case, more changes in vfio/common.c
> > will be needed, e.g. 1) add a refcnt in VFIOGroup to support
> > getting and putting a VFIOGroup structure multiple times,
> > 2) add a lock to make vfio_get_group*() and vfio_put_group()
> > thread-safe.
> 
> This is the sort of case where it seems like we're just grabbing
> internal interfaces and using them from other modules.  VFIOGroups have
> a device_list and currently handles multiple puts by testing whether
> that device list is empty (ie. we already have a refcnt effectively).
> Now we have a group user that has no devices, which is not something
> that it seems like we've designed or audited the code for handling.
> IOW, while the header file lives in a common location, it's not really
> intended to be an API within QEMU and it makes me a bit uncomfortable
> to use it as such.
> 
> > > Is that the goal?  It's not obvious in
> > > the code.  I don't really understand what vhost goes on to do with this
> > > group,  
> > 
> > The goal is that, we have a VFIO group opened by an external
> > vhost-user backend process, this process will manage the VFIO
> > device, but want QEMU to manage the VFIO group, including:
> > 
> > 1 - program the DMA mappings for this VFIO group based on
> >     the front-end device's dma_as in QEMU.
> > 
> > 2 - add this VFIO group to KVM (KVM_DEV_VFIO_GROUP_ADD).
> > 
> > Vhost-user in QEMU as the vhost-user frontend just wants
> > hw/vfio to do above things after receiving a VFIO group fd
> > from the vhost-user backend process.
> > 
> > Vhost-user will just ask hw/vfio to get (setup) a VFIOGroup
> > by calling vfio_get_group_from_fd() or put this VFIOGroup by
> > calling vfio_put_group() when it's not needed anymore, and
> > won't do anything else.
> > 
> > Vhost-user will only deal with the VFIOGroups allocated by
> > itself, and won't influence any other VFIOGroups used by the
> > VFIO passthru devices (-device vfio-pci). Because the same
> > VFIO group file can only be opened by one process via open().
> 
> But there's nothing here that guarantees the reverse.  vhost cannot
> open the group of an existing vfio-pci device, but vfio-pci can re-use
> the group that vhost has already opened.  This is again where it seems
> like we're trying to cherry pick from an internal API and leaving some
> loose ends dangling.
> 
> > > but I'm pretty sure the existing state machine in vfio isn't
> > > designed for it.  Thanks,  
> > 
> > Yeah, more changes (e.g. refcnt, ..) in hw/vfio are needed to
> > make it work robustly. Hmm.. Is above idea (e.g. how vhost-user
> > is going to use VFIOGroup and how we are going to extend hw/vfio)
> > acceptable to you? Thanks!
> 
> I certainly would not want to duplicate managing the group, container,
> and MemoryListener, but I think we need a more formal interface with at
> least some notion of reference counting beyond the device list or
> explicit knowledge of an external user.

Got it! I'll try to address this. Thanks!

> I'm also curious if it really
> makes sense that the vhost backend is opening the group rather than
> letting QEMU do it.  It seems that vhost wants to deal with the device
> and we can never have symmetry in the scenario above, where a group
> might contain multiple devices and order matters because of where the
> group is opened.  If we still had a notion of a VFIODevice for an
> external user, then the device_list refcnt would just work.  Thanks,

Vhost-user backend will but QEMU won't open the VFIO
fds, because QEMU just has a vhost-user UNIX socket
path that it should connect to or listen on. So QEMU
doesn't know which group and device it should open.
The vhost accelerator devices are probed and managed
in the vhost-user backend process. And the vhost-user
backend process will bind the vhost-user instances
and vhost accelerator devices based on some sort of
user's input.
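
For illustration only, the QEMU side can be started with
nothing but the socket path (the ids, sizes and paths below
are made up, and the guest memory must be shareable so the
backend can map it):

    qemu-system-x86_64 ... \
        -object memory-backend-file,id=mem0,size=4G,mem-path=/dev/hugepages,share=on \
        -numa node,memdev=mem0 \
        -chardev socket,id=chr0,path=/tmp/vhost-user0.sock \
        -netdev vhost-user,id=net0,chardev=chr0 \
        -device virtio-net-pci,netdev=net0

Which VFIO group or device (if any) sits behind this socket
is known only to the backend process.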

Best regards,
Tiwei Bie

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [Qemu-devel] [RFC 3/3] vhost-user: support programming VFIO group in master
  2018-07-30  8:10         ` Tiwei Bie
@ 2018-07-30  9:30           ` Michael S. Tsirkin
  2018-07-30 16:20             ` Alex Williamson
                               ` (2 more replies)
  0 siblings, 3 replies; 23+ messages in thread
From: Michael S. Tsirkin @ 2018-07-30  9:30 UTC (permalink / raw)
  To: Tiwei Bie; +Cc: Alex Williamson, jasowang, qemu-devel

On Mon, Jul 30, 2018 at 04:10:04PM +0800, Tiwei Bie wrote:
> On Fri, Jul 27, 2018 at 02:03:00PM -0600, Alex Williamson wrote:
> > On Fri, 27 Jul 2018 09:58:05 +0800
> > Tiwei Bie <tiwei.bie@intel.com> wrote:
> > 
> > > On Thu, Jul 26, 2018 at 02:45:39PM -0600, Alex Williamson wrote:
> > > > On Mon, 23 Jul 2018 12:59:56 +0800
> > > > Tiwei Bie <tiwei.bie@intel.com> wrote:
> > > >   
> > > [...]
> > > > >  
> > > > > +static int vhost_user_slave_handle_vfio_group(struct vhost_dev *dev,
> > > > > +                                              int *fd)
> > > > > +{
> > > > > +    struct vhost_user *u = dev->opaque;
> > > > > +    VhostUserState *user = u->user;
> > > > > +    VirtIODevice *vdev = dev->vdev;
> > > > > +    int groupfd = fd[0];
> > > > > +    VFIOGroup *group;
> > > > > +
> > > > > +    if (!virtio_has_feature(dev->protocol_features,
> > > > > +                            VHOST_USER_PROTOCOL_F_VFIO_GROUP) ||
> > > > > +        vdev == NULL) {
> > > > > +        return -1;
> > > > > +    }
> > > > > +
> > > > > +    if (user->vfio_group) {
> > > > > +        vfio_put_group(user->vfio_group);
> > > > > +        user->vfio_group = NULL;
> > > > > +    }
> > > > > +  
> > > 
> > > There should be a below check here (I missed it in this
> > > patch, sorry..):
> > > 
> > >     if (groupfd < 0) {
> > >         return 0;
> > >     }
> > > 
> > > > > +    group = vfio_get_group_from_fd(groupfd, vdev->dma_as, NULL);
> > > > > +    if (group == NULL) {
> > > > > +        return -1;
> > > > > +    }
> > > > > +
> > > > > +    if (group->fd != groupfd) {
> > > > > +        close(groupfd);
> > > > > +    }
> > > > > +
> > > > > +    user->vfio_group = group;
> > > > > +    fd[0] = -1;
> > > > > +
> > > > > +    return 0;
> > > > > +}  
> > > > 
> > > > This all looks very sketchy, we're reaching into vfio internal state
> > > > and arbitrarily releasing data structures, reusing existing ones, and
> > > > maybe creating new ones.  We know that a vfio group only allows a
> > > > single open, right?  
> > > 
> > > Yeah, exactly!
> > > 
> > > When the vhost-user backend process has opened a VFIO group fd,
> > > QEMU won't be able to open this group file via open() any more.
> > > So vhost-user backend process will share the VFIO group fd to
> > > QEMU via the UNIX socket. And that's why we're introducing a
> > > new API:
> > > 
> > > 	vfio_get_group_from_fd(groupfd, ...);
> > > 
> > > for vfio/common.c to get (setup) a VFIOGroup structure. (Because
> > > in this case, vfio/common.c can't open this group file via open(),
> > > and what we have is a VFIO group fd shared by another process via
> > > UNIX socket). And ...
> > > 
> > > > So if you have a groupfd, vfio will never have
> > > > that same group opened already.  
> > > 
> > > ... if the same vhost-user backend process shares the same VFIO
> > > group fd more than one times via different vhost-user slave channels
> > > to different vhost-user frontends in the same QEMU process, the
> > > same VFIOGroup could exist already.
> > > 
> > > This case could happen when e.g. there are two vhost accelerator
> > > devices in the same VFIO group, and they are managed by the same
> > > vhost-user backend process, and used to accelerate two different
> > > virtio devices in QEMU. In this case, the same VFIO group fd could
> > > be sent twice.
> > > 
> > > If we need to support this case, more changes in vfio/common.c
> > > will be needed, e.g. 1) add a refcnt in VFIOGroup to support
> > > getting and putting a VFIOGroup structure multiple times,
> > > 2) add a lock to make vfio_get_group*() and vfio_put_group()
> > > thread-safe.
> > 
> > This is the sort of case where it seems like we're just grabbing
> > internal interfaces and using them from other modules.  VFIOGroups have
> > a device_list and currently handles multiple puts by testing whether
> > that device list is empty (ie. we already have a refcnt effectively).
> > Now we have a group user that has no devices, which is not something
> > that it seems like we've designed or audited the code for handling.
> > IOW, while the header file lives in a common location, it's not really
> > intended to be an API within QEMU and it makes me a bit uncomfortable
> > to use it as such.
> > 
> > > > Is that the goal?  It's not obvious in
> > > > the code.  I don't really understand what vhost goes on to do with this
> > > > group,  
> > > 
> > > The goal is that, we have a VFIO group opened by an external
> > > vhost-user backend process, this process will manage the VFIO
> > > device, but want QEMU to manage the VFIO group, including:
> > > 
> > > 1 - program the DMA mappings for this VFIO group based on
> > >     the front-end device's dma_as in QEMU.
> > > 
> > > 2 - add this VFIO group to KVM (KVM_DEV_VFIO_GROUP_ADD).
> > > 
> > > Vhost-user in QEMU as the vhost-user frontend just wants
> > > hw/vfio to do above things after receiving a VFIO group fd
> > > from the vhost-user backend process.
> > > 
> > > Vhost-user will just ask hw/vfio to get (setup) a VFIOGroup
> > > by calling vfio_get_group_from_fd() or put this VFIOGroup by
> > > calling vfio_put_group() when it's not needed anymore, and
> > > won't do anything else.
> > > 
> > > Vhost-user will only deal with the VFIOGroups allocated by
> > > itself, and won't influence any other VFIOGroups used by the
> > > VFIO passthru devices (-device vfio-pci). Because the same
> > > VFIO group file can only be opened by one process via open().
> > 
> > But there's nothing here that guarantees the reverse.  vhost cannot
> > open the group of an existing vfio-pci device, but vfio-pci can re-use
> > the group that vhost has already opened.  This is again where it seems
> > like we're trying to cherry pick from an internal API and leaving some
> > loose ends dangling.
> > 
> > > > but I'm pretty sure the existing state machine in vfio isn't
> > > > designed for it.  Thanks,  
> > > 
> > > Yeah, more changes (e.g. refcnt, ..) in hw/vfio are needed to
> > > make it work robustly. Hmm.. Is above idea (e.g. how vhost-user
> > > is going to use VFIOGroup and how we are going to extend hw/vfio)
> > > acceptable to you? Thanks!
> > 
> > I certainly would not want to duplicate managing the group, container,
> > and MemoryListener, but I think we need a more formal interface with at
> > least some notion of reference counting beyond the device list or
> > explicit knowledge of an external user.
> 
> Got it! I'll try to address this. Thanks!
> 
> > I'm also curious if it really
> > makes sense that the vhost backend is opening the group rather than
> > letting QEMU do it.  It seems that vhost wants to deal with the device
> > and we can never have symmetry in the scenario above, where a group
> > might contain multiple devices and order matters because of where the
> > group is opened.  If we still had a notion of a VFIODevice for an
> > external user, then the device_list refcnt would just work.  Thanks,
> 
> Vhost-user backend will but QEMU won't open the VFIO
> fds, because QEMU just has a vhost-user UNIX socket
> path that it should connect to or listen on. So QEMU
> doesn't know which group and device it should open.
> The vhost accelerator devices are probed and managed
> in the vhost-user backend process. And the vhost-user
> backend process will bind the vhost-user instances
> and vhost accelerator devices based on some sort of
> user's input.
> 
> Best regards,
> Tiwei Bie

Is the reason it's done like this because the
backend is sharing the device with multiple QEMUs?

I generally wonder how are restarts of the backend handled
with this approach: closing the VFIO device tends to reset
the whole device.

-- 
MST

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [Qemu-devel] [RFC 3/3] vhost-user: support programming VFIO group in master
  2018-07-30  9:30           ` Michael S. Tsirkin
@ 2018-07-30 16:20             ` Alex Williamson
  2018-07-31  7:47             ` Tiwei Bie
  2018-09-12  8:04             ` Tiwei Bie
  2 siblings, 0 replies; 23+ messages in thread
From: Alex Williamson @ 2018-07-30 16:20 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Tiwei Bie, jasowang, qemu-devel

On Mon, 30 Jul 2018 12:30:58 +0300
"Michael S. Tsirkin" <mst@redhat.com> wrote:

> On Mon, Jul 30, 2018 at 04:10:04PM +0800, Tiwei Bie wrote:
> > On Fri, Jul 27, 2018 at 02:03:00PM -0600, Alex Williamson wrote:  
> > > On Fri, 27 Jul 2018 09:58:05 +0800
> > > Tiwei Bie <tiwei.bie@intel.com> wrote:
> > >   
> > > > On Thu, Jul 26, 2018 at 02:45:39PM -0600, Alex Williamson wrote:  
> > > > > On Mon, 23 Jul 2018 12:59:56 +0800
> > > > > Tiwei Bie <tiwei.bie@intel.com> wrote:
> > > > >     
> > > > [...]  
> > > > > >  
> > > > > > +static int vhost_user_slave_handle_vfio_group(struct vhost_dev *dev,
> > > > > > +                                              int *fd)
> > > > > > +{
> > > > > > +    struct vhost_user *u = dev->opaque;
> > > > > > +    VhostUserState *user = u->user;
> > > > > > +    VirtIODevice *vdev = dev->vdev;
> > > > > > +    int groupfd = fd[0];
> > > > > > +    VFIOGroup *group;
> > > > > > +
> > > > > > +    if (!virtio_has_feature(dev->protocol_features,
> > > > > > +                            VHOST_USER_PROTOCOL_F_VFIO_GROUP) ||
> > > > > > +        vdev == NULL) {
> > > > > > +        return -1;
> > > > > > +    }
> > > > > > +
> > > > > > +    if (user->vfio_group) {
> > > > > > +        vfio_put_group(user->vfio_group);
> > > > > > +        user->vfio_group = NULL;
> > > > > > +    }
> > > > > > +    
> > > > 
> > > > There should be a below check here (I missed it in this
> > > > patch, sorry..):
> > > > 
> > > >     if (groupfd < 0) {
> > > >         return 0;
> > > >     }
> > > >   
> > > > > > +    group = vfio_get_group_from_fd(groupfd, vdev->dma_as, NULL);
> > > > > > +    if (group == NULL) {
> > > > > > +        return -1;
> > > > > > +    }
> > > > > > +
> > > > > > +    if (group->fd != groupfd) {
> > > > > > +        close(groupfd);
> > > > > > +    }
> > > > > > +
> > > > > > +    user->vfio_group = group;
> > > > > > +    fd[0] = -1;
> > > > > > +
> > > > > > +    return 0;
> > > > > > +}    
> > > > > 
> > > > > This all looks very sketchy, we're reaching into vfio internal state
> > > > > and arbitrarily releasing data structures, reusing existing ones, and
> > > > > maybe creating new ones.  We know that a vfio group only allows a
> > > > > single open, right?    
> > > > 
> > > > Yeah, exactly!
> > > > 
> > > > When the vhost-user backend process has opened a VFIO group fd,
> > > > QEMU won't be able to open this group file via open() any more.
> > > > So vhost-user backend process will share the VFIO group fd to
> > > > QEMU via the UNIX socket. And that's why we're introducing a
> > > > new API:
> > > > 
> > > > 	vfio_get_group_from_fd(groupfd, ...);
> > > > 
> > > > for vfio/common.c to get (setup) a VFIOGroup structure. (Because
> > > > in this case, vfio/common.c can't open this group file via open(),
> > > > and what we have is a VFIO group fd shared by another process via
> > > > UNIX socket). And ...
> > > >   
> > > > > So if you have a groupfd, vfio will never have
> > > > > that same group opened already.    
> > > > 
> > > > ... if the same vhost-user backend process shares the same VFIO
> > > > group fd more than one times via different vhost-user slave channels
> > > > to different vhost-user frontends in the same QEMU process, the
> > > > same VFIOGroup could exist already.
> > > > 
> > > > This case could happen when e.g. there are two vhost accelerator
> > > > devices in the same VFIO group, and they are managed by the same
> > > > vhost-user backend process, and used to accelerate two different
> > > > virtio devices in QEMU. In this case, the same VFIO group fd could
> > > > be sent twice.
> > > > 
> > > > If we need to support this case, more changes in vfio/common.c
> > > > will be needed, e.g. 1) add a refcnt in VFIOGroup to support
> > > > getting and putting a VFIOGroup structure multiple times,
> > > > 2) add a lock to make vfio_get_group*() and vfio_put_group()
> > > > thread-safe.  
> > > 
> > > This is the sort of case where it seems like we're just grabbing
> > > internal interfaces and using them from other modules.  VFIOGroups have
> > > a device_list and currently handles multiple puts by testing whether
> > > that device list is empty (ie. we already have a refcnt effectively).
> > > Now we have a group user that has no devices, which is not something
> > > that it seems like we've designed or audited the code for handling.
> > > IOW, while the header file lives in a common location, it's not really
> > > intended to be an API within QEMU and it makes me a bit uncomfortable
> > > to use it as such.
> > >   
> > > > > Is that the goal?  It's not obvious in
> > > > > the code.  I don't really understand what vhost goes on to do with this
> > > > > group,    
> > > > 
> > > > The goal is that, we have a VFIO group opened by an external
> > > > vhost-user backend process, this process will manage the VFIO
> > > > device, but want QEMU to manage the VFIO group, including:
> > > > 
> > > > 1 - program the DMA mappings for this VFIO group based on
> > > >     the front-end device's dma_as in QEMU.
> > > > 
> > > > 2 - add this VFIO group to KVM (KVM_DEV_VFIO_GROUP_ADD).
> > > > 
> > > > Vhost-user in QEMU as the vhost-user frontend just wants
> > > > hw/vfio to do above things after receiving a VFIO group fd
> > > > from the vhost-user backend process.
> > > > 
> > > > Vhost-user will just ask hw/vfio to get (setup) a VFIOGroup
> > > > by calling vfio_get_group_from_fd() or put this VFIOGroup by
> > > > calling vfio_put_group() when it's not needed anymore, and
> > > > won't do anything else.
> > > > 
> > > > Vhost-user will only deal with the VFIOGroups allocated by
> > > > itself, and won't influence any other VFIOGroups used by the
> > > > VFIO passthru devices (-device vfio-pci). Because the same
> > > > VFIO group file can only be opened by one process via open().  
> > > 
> > > But there's nothing here that guarantees the reverse.  vhost cannot
> > > open the group of an existing vfio-pci device, but vfio-pci can re-use
> > > the group that vhost has already opened.  This is again where it seems
> > > like we're trying to cherry pick from an internal API and leaving some
> > > loose ends dangling.
> > >   
> > > > > but I'm pretty sure the existing state machine in vfio isn't
> > > > > designed for it.  Thanks,    
> > > > 
> > > > Yeah, more changes (e.g. refcnt, ..) in hw/vfio are needed to
> > > > make it work robustly. Hmm.. Is above idea (e.g. how vhost-user
> > > > is going to use VFIOGroup and how we are going to extend hw/vfio)
> > > > acceptable to you? Thanks!  
> > > 
> > > I certainly would not want to duplicate managing the group, container,
> > > and MemoryListener, but I think we need a more formal interface with at
> > > least some notion of reference counting beyond the device list or
> > > explicit knowledge of an external user.  
> > 
> > Got it! I'll try to address this. Thanks!
> >   
> > > I'm also curious if it really
> > > makes sense that the vhost backend is opening the group rather than
> > > letting QEMU do it.  It seems that vhost wants to deal with the device
> > > and we can never have symmetry in the scenario above, where a group
> > > might contain multiple devices and order matters because of where the
> > > group is opened.  If we still had a notion of a VFIODevice for an
> > > external user, then the device_list refcnt would just work.  Thanks,  
> > 
> > Vhost-user backend will but QEMU won't open the VFIO
> > fds, because QEMU just has a vhost-user UNIX socket
> > path that it should connect to or listen on. So QEMU
> > doesn't know which group and device it should open.
> > The vhost accelerator devices are probed and managed
> > in the vhost-user backend process. And the vhost-user
> > backend process will bind the vhost-user instances
> > and vhost accelerator devices based on some sort of
> > user's input.
> > 
> > Best regards,
> > Tiwei Bie  
> 
> Is the reason it's done like this because the
> backend is sharing the device with multiple QEMUs?

Each QEMU instance that gets passed a vfio group would attempt to add
it to a vfio container (ie. IOMMU domain), so this mechanism cannot
support multiple QEMU instances attached to the same group.  Beyond
that, the address space of each device would need to be unique within
the instance as we only have a single IOVA space for the group.  Thanks,

Alex
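
To make that concrete, here is a minimal sketch (not the actual
QEMU code) of the attach that each receiving instance would
attempt; a VFIO group can be bound to at most one container at a
time, so only the first attempt can succeed. The IOMMU type is
just an example:

    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <linux/vfio.h>

    static int attach_group(int groupfd)
    {
        /* Each consumer would open its own container... */
        int container = open("/dev/vfio/vfio", O_RDWR);

        if (container < 0) {
            return -1;
        }
        /* ...but this fails for every consumer after the first,
         * because the group is already attached elsewhere. */
        if (ioctl(groupfd, VFIO_GROUP_SET_CONTAINER, &container) < 0) {
            close(container);
            return -1;
        }
        if (ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1v2_IOMMU) < 0) {
            close(container);
            return -1;
        }
        return container;
    }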

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [Qemu-devel] [RFC 3/3] vhost-user: support programming VFIO group in master
  2018-07-30  9:30           ` Michael S. Tsirkin
  2018-07-30 16:20             ` Alex Williamson
@ 2018-07-31  7:47             ` Tiwei Bie
  2018-09-12  8:04             ` Tiwei Bie
  2 siblings, 0 replies; 23+ messages in thread
From: Tiwei Bie @ 2018-07-31  7:47 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Alex Williamson, jasowang, qemu-devel

On Mon, Jul 30, 2018 at 12:30:58PM +0300, Michael S. Tsirkin wrote:
> On Mon, Jul 30, 2018 at 04:10:04PM +0800, Tiwei Bie wrote:
> > On Fri, Jul 27, 2018 at 02:03:00PM -0600, Alex Williamson wrote:
> > > On Fri, 27 Jul 2018 09:58:05 +0800
> > > Tiwei Bie <tiwei.bie@intel.com> wrote:
> > > 
> > > > On Thu, Jul 26, 2018 at 02:45:39PM -0600, Alex Williamson wrote:
> > > > > On Mon, 23 Jul 2018 12:59:56 +0800
> > > > > Tiwei Bie <tiwei.bie@intel.com> wrote:
> > > > >   
> > > > [...]
> > > > > >  
> > > > > > +static int vhost_user_slave_handle_vfio_group(struct vhost_dev *dev,
> > > > > > +                                              int *fd)
> > > > > > +{
> > > > > > +    struct vhost_user *u = dev->opaque;
> > > > > > +    VhostUserState *user = u->user;
> > > > > > +    VirtIODevice *vdev = dev->vdev;
> > > > > > +    int groupfd = fd[0];
> > > > > > +    VFIOGroup *group;
> > > > > > +
> > > > > > +    if (!virtio_has_feature(dev->protocol_features,
> > > > > > +                            VHOST_USER_PROTOCOL_F_VFIO_GROUP) ||
> > > > > > +        vdev == NULL) {
> > > > > > +        return -1;
> > > > > > +    }
> > > > > > +
> > > > > > +    if (user->vfio_group) {
> > > > > > +        vfio_put_group(user->vfio_group);
> > > > > > +        user->vfio_group = NULL;
> > > > > > +    }
> > > > > > +  
> > > > 
> > > > There should be a below check here (I missed it in this
> > > > patch, sorry..):
> > > > 
> > > >     if (groupfd < 0) {
> > > >         return 0;
> > > >     }
> > > > 
> > > > > > +    group = vfio_get_group_from_fd(groupfd, vdev->dma_as, NULL);
> > > > > > +    if (group == NULL) {
> > > > > > +        return -1;
> > > > > > +    }
> > > > > > +
> > > > > > +    if (group->fd != groupfd) {
> > > > > > +        close(groupfd);
> > > > > > +    }
> > > > > > +
> > > > > > +    user->vfio_group = group;
> > > > > > +    fd[0] = -1;
> > > > > > +
> > > > > > +    return 0;
> > > > > > +}  
> > > > > 
> > > > > This all looks very sketchy, we're reaching into vfio internal state
> > > > > and arbitrarily releasing data structures, reusing existing ones, and
> > > > > maybe creating new ones.  We know that a vfio group only allows a
> > > > > single open, right?  
> > > > 
> > > > Yeah, exactly!
> > > > 
> > > > When the vhost-user backend process has opened a VFIO group fd,
> > > > QEMU won't be able to open this group file via open() any more.
> > > > So vhost-user backend process will share the VFIO group fd to
> > > > QEMU via the UNIX socket. And that's why we're introducing a
> > > > new API:
> > > > 
> > > > 	vfio_get_group_from_fd(groupfd, ...);
> > > > 
> > > > for vfio/common.c to get (setup) a VFIOGroup structure. (Because
> > > > in this case, vfio/common.c can't open this group file via open(),
> > > > and what we have is a VFIO group fd shared by another process via
> > > > UNIX socket). And ...
> > > > 
> > > > > So if you have a groupfd, vfio will never have
> > > > > that same group opened already.  
> > > > 
> > > > ... if the same vhost-user backend process shares the same VFIO
> > > > group fd more than one times via different vhost-user slave channels
> > > > to different vhost-user frontends in the same QEMU process, the
> > > > same VFIOGroup could exist already.
> > > > 
> > > > This case could happen when e.g. there are two vhost accelerator
> > > > devices in the same VFIO group, and they are managed by the same
> > > > vhost-user backend process, and used to accelerate two different
> > > > virtio devices in QEMU. In this case, the same VFIO group fd could
> > > > be sent twice.
> > > > 
> > > > If we need to support this case, more changes in vfio/common.c
> > > > will be needed, e.g. 1) add a refcnt in VFIOGroup to support
> > > > getting and putting a VFIOGroup structure multiple times,
> > > > 2) add a lock to make vfio_get_group*() and vfio_put_group()
> > > > thread-safe.
> > > 
> > > This is the sort of case where it seems like we're just grabbing
> > > internal interfaces and using them from other modules.  VFIOGroups have
> > > a device_list and currently handles multiple puts by testing whether
> > > that device list is empty (ie. we already have a refcnt effectively).
> > > Now we have a group user that has no devices, which is not something
> > > that it seems like we've designed or audited the code for handling.
> > > IOW, while the header file lives in a common location, it's not really
> > > intended to be an API within QEMU and it makes me a bit uncomfortable
> > > to use it as such.
> > > 
> > > > > Is that the goal?  It's not obvious in
> > > > > the code.  I don't really understand what vhost goes on to do with this
> > > > > group,  
> > > > 
> > > > The goal is that, we have a VFIO group opened by an external
> > > > vhost-user backend process, this process will manage the VFIO
> > > > device, but want QEMU to manage the VFIO group, including:
> > > > 
> > > > 1 - program the DMA mappings for this VFIO group based on
> > > >     the front-end device's dma_as in QEMU.
> > > > 
> > > > 2 - add this VFIO group to KVM (KVM_DEV_VFIO_GROUP_ADD).
> > > > 
> > > > Vhost-user in QEMU as the vhost-user frontend just wants
> > > > hw/vfio to do above things after receiving a VFIO group fd
> > > > from the vhost-user backend process.
> > > > 
> > > > Vhost-user will just ask hw/vfio to get (setup) a VFIOGroup
> > > > by calling vfio_get_group_from_fd() or put this VFIOGroup by
> > > > calling vfio_put_group() when it's not needed anymore, and
> > > > won't do anything else.
> > > > 
> > > > Vhost-user will only deal with the VFIOGroups allocated by
> > > > itself, and won't influence any other VFIOGroups used by the
> > > > VFIO passthru devices (-device vfio-pci). Because the same
> > > > VFIO group file can only be opened by one process via open().
> > > 
> > > But there's nothing here that guarantees the reverse.  vhost cannot
> > > open the group of an existing vfio-pci device, but vfio-pci can re-use
> > > the group that vhost has already opened.  This is again where it seems
> > > like we're trying to cherry pick from an internal API and leaving some
> > > loose ends dangling.
> > > 
> > > > > but I'm pretty sure the existing state machine in vfio isn't
> > > > > designed for it.  Thanks,  
> > > > 
> > > > Yeah, more changes (e.g. refcnt, ..) in hw/vfio are needed to
> > > > make it work robustly. Hmm.. Is above idea (e.g. how vhost-user
> > > > is going to use VFIOGroup and how we are going to extend hw/vfio)
> > > > acceptable to you? Thanks!
> > > 
> > > I certainly would not want to duplicate managing the group, container,
> > > and MemoryListener, but I think we need a more formal interface with at
> > > least some notion of reference counting beyond the device list or
> > > explicit knowledge of an external user.
> > 
> > Got it! I'll try to address this. Thanks!
> > 
> > > I'm also curious if it really
> > > makes sense that the vhost backend is opening the group rather than
> > > letting QEMU do it.  It seems that vhost wants to deal with the device
> > > and we can never have symmetry in the scenario above, where a group
> > > might contain multiple devices and order matters because of where the
> > > group is opened.  If we still had a notion of a VFIODevice for an
> > > external user, then the device_list refcnt would just work.  Thanks,
> > 
> > Vhost-user backend will but QEMU won't open the VFIO
> > fds, because QEMU just has a vhost-user UNIX socket
> > path that it should connect to or listen on. So QEMU
> > doesn't know which group and device it should open.
> > The vhost accelerator devices are probed and managed
> > in the vhost-user backend process. And the vhost-user
> > backend process will bind the vhost-user instances
> > and vhost accelerator devices based on some sort of
> > user's input.
> > 
> > Best regards,
> > Tiwei Bie
> 
> Is the reason it's done like this because the
> backend is sharing the device with multiple QEMUs?

IMO, in the vhost-user based approach, it's natural
for QEMU users to just specify a vhost-user UNIX socket
path when launching QEMU. The vhost-user backend behind
this socket path can be implemented purely in software
or accelerated by hardware, and that is transparent to
QEMU users.

If we want an approach where QEMU users need to specify
the vhost accelerator device (similar to how users
specify the vhost-user socket path) when launching QEMU,
we may not want to do it over vhost-user.

Best regards,
Tiwei Bie

> 
> I generally wonder how are restarts of the backend handled
> with this approach: closing the VFIO device tends to reset
> the whole device.
> 
> -- 
> MST

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [Qemu-devel] [RFC 3/3] vhost-user: support programming VFIO group in master
  2018-07-30  9:30           ` Michael S. Tsirkin
  2018-07-30 16:20             ` Alex Williamson
  2018-07-31  7:47             ` Tiwei Bie
@ 2018-09-12  8:04             ` Tiwei Bie
  2018-09-12 16:14               ` Michael S. Tsirkin
  2 siblings, 1 reply; 23+ messages in thread
From: Tiwei Bie @ 2018-09-12  8:04 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Alex Williamson, jasowang, qemu-devel

On Mon, Jul 30, 2018 at 12:30:58PM +0300, Michael S. Tsirkin wrote:
[...]
> 
> I generally wonder how are restarts of the backend handled
> with this approach: closing the VFIO device tends to reset
> the whole device.

Hi Michael,

I missed this comment previously.. This is a good point!
In this RFC, before sending the VFIO group fd to QEMU,
backend needs to close the VFIO device and unset the VFIO
container first. Otherwise, QEMU won't be able to set the
VFIO container for the VFIO group.
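
Roughly, the hand-off on the backend side would look like the
sketch below (a simplified sketch, not the actual backend code;
error handling omitted, the payload byte is a placeholder rather
than a real vhost-user message, and the socket is assumed to be
the vhost-user slave channel):

    #include <string.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <sys/socket.h>
    #include <linux/vfio.h>

    static int hand_off_group(int sock, int device_fd, int group_fd)
    {
        struct msghdr msg = { 0 };
        struct iovec iov;
        char payload = 0;                /* placeholder message byte */
        char ctrl[CMSG_SPACE(sizeof(int))] = { 0 };
        struct cmsghdr *cmsg;

        close(device_fd);                            /* resets the device */
        ioctl(group_fd, VFIO_GROUP_UNSET_CONTAINER); /* detach from backend */

        /* Pass the group fd to QEMU via SCM_RIGHTS. */
        iov.iov_base = &payload;
        iov.iov_len = sizeof(payload);
        msg.msg_iov = &iov;
        msg.msg_iovlen = 1;
        msg.msg_control = ctrl;
        msg.msg_controllen = sizeof(ctrl);

        cmsg = CMSG_FIRSTHDR(&msg);
        cmsg->cmsg_level = SOL_SOCKET;
        cmsg->cmsg_type = SCM_RIGHTS;
        cmsg->cmsg_len = CMSG_LEN(sizeof(int));
        memcpy(CMSG_DATA(cmsg), &group_fd, sizeof(int));

        return sendmsg(sock, &msg, 0) < 0 ? -1 : 0;
    }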

Another option is to share the container fd instead of
the group fd to QEMU. In this case, backend won't need
to close any fd. But there is one problem that, it's
hard to unmap the old mappings, especially when QEMU
crashes. Do you have any suggestions? Thanks!

Best regards,
Tiwei Bie

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [Qemu-devel] [RFC 3/3] vhost-user: support programming VFIO group in master
  2018-09-12  8:04             ` Tiwei Bie
@ 2018-09-12 16:14               ` Michael S. Tsirkin
  2018-09-12 16:34                 ` Alex Williamson
  0 siblings, 1 reply; 23+ messages in thread
From: Michael S. Tsirkin @ 2018-09-12 16:14 UTC (permalink / raw)
  To: Tiwei Bie; +Cc: Alex Williamson, jasowang, qemu-devel

On Wed, Sep 12, 2018 at 04:04:00PM +0800, Tiwei Bie wrote:
> On Mon, Jul 30, 2018 at 12:30:58PM +0300, Michael S. Tsirkin wrote:
> [...]
> > 
> > I generally wonder how are restarts of the backend handled
> > with this approach: closing the VFIO device tends to reset
> > the whole device.
> 
> Hi Michael,
> 
> I missed this comment previously.. This is a good point!
> In this RFC, before sending the VFIO group fd to QEMU,
> backend needs to close the VFIO device and unset the VFIO
> container first. Otherwise, QEMU won't be able to set the
> VFIO container for the VFIO group.
> 
> Another option is to share the container fd instead of
> the group fd to QEMU. In this case, backend won't need
> to close any fd. But there is one problem that, it's
> hard to unmap the old mappings, especially when QEMU
> crashes.

What are these old mappings and who creates them?
If you want to just reset everything the way it was
on open, surely it would be easy to add such a reset ioctl.

> Do you have any suggestions? Thanks!
> 
> Best regards,
> Tiwei Bie

Dunno. Alex, any thoughts? Which approach would you prefer?

-- 
MST

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [Qemu-devel] [RFC 3/3] vhost-user: support programming VFIO group in master
  2018-09-12 16:14               ` Michael S. Tsirkin
@ 2018-09-12 16:34                 ` Alex Williamson
  2018-09-12 16:44                   ` Michael S. Tsirkin
  0 siblings, 1 reply; 23+ messages in thread
From: Alex Williamson @ 2018-09-12 16:34 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Tiwei Bie, jasowang, qemu-devel

On Wed, 12 Sep 2018 12:14:44 -0400
"Michael S. Tsirkin" <mst@redhat.com> wrote:

> On Wed, Sep 12, 2018 at 04:04:00PM +0800, Tiwei Bie wrote:
> > On Mon, Jul 30, 2018 at 12:30:58PM +0300, Michael S. Tsirkin wrote:
> > [...]  
> > > 
> > > I generally wonder how are restarts of the backend handled
> > > with this approach: closing the VFIO device tends to reset
> > > the whole device.  
> > 
> > Hi Michael,
> > 
> > I missed this comment previously.. This is a good point!
> > In this RFC, before sending the VFIO group fd to QEMU,
> > backend needs to close the VFIO device and unset the VFIO
> > container first. Otherwise, QEMU won't be able to set the
> > VFIO container for the VFIO group.
> > 
> > Another option is to share the container fd instead of
> > the group fd to QEMU. In this case, backend won't need
> > to close any fd. But there is one problem that, it's
> > hard to unmap the old mappings, especially when QEMU
> > crashes.  
> 
> What are these old mappings and who creates them?
> If you want to just reset everything the way it was
> on open, surely it would be easy to add such a reset ioctl.
> 
> > Do you have any suggestions? Thanks!
> > 
> > Best regards,
> > Tiwei Bie  
> 
> Dunno. Alex, any thoughts? Which approach would you prefer?

The existing UNMAP_DMA ioctl for the vfio type1 IOMMU only requires
that an unmap does not bisect previous mappings, ie. a previous mapping
cannot be partially unmapped.  Therefore you can already dump the
entire IOVA space for a container with one UNMAP_DMA call, iova = 0,
size = (u64)-1.  Thanks,

Alex
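
For reference, a minimal sketch of that single call (assuming an
already set-up type1 container fd and host-page-size IOMMU
granularity; error handling omitted):

    #include <stdint.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <linux/vfio.h>

    static int unmap_everything(int container_fd)
    {
        struct vfio_iommu_type1_dma_unmap unmap;

        memset(&unmap, 0, sizeof(unmap));
        unmap.argsz = sizeof(unmap);
        unmap.iova = 0;
        /* Type1 wants a page-aligned size, so round (u64)-1 down to
         * the (assumed) host page size; this leaves only the very
         * last page of the 64-bit space untouched. */
        unmap.size = UINT64_MAX & ~((uint64_t)getpagesize() - 1);

        /* On success the kernel updates unmap.size to the number of
         * bytes actually unmapped. */
        return ioctl(container_fd, VFIO_IOMMU_UNMAP_DMA, &unmap);
    }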

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [Qemu-devel] [RFC 3/3] vhost-user: support programming VFIO group in master
  2018-09-12 16:34                 ` Alex Williamson
@ 2018-09-12 16:44                   ` Michael S. Tsirkin
  2018-09-12 17:15                     ` Alex Williamson
  0 siblings, 1 reply; 23+ messages in thread
From: Michael S. Tsirkin @ 2018-09-12 16:44 UTC (permalink / raw)
  To: Alex Williamson; +Cc: Tiwei Bie, jasowang, qemu-devel

On Wed, Sep 12, 2018 at 10:34:43AM -0600, Alex Williamson wrote:
> On Wed, 12 Sep 2018 12:14:44 -0400
> "Michael S. Tsirkin" <mst@redhat.com> wrote:
> 
> > On Wed, Sep 12, 2018 at 04:04:00PM +0800, Tiwei Bie wrote:
> > > On Mon, Jul 30, 2018 at 12:30:58PM +0300, Michael S. Tsirkin wrote:
> > > [...]  
> > > > 
> > > > I generally wonder how are restarts of the backend handled
> > > > with this approach: closing the VFIO device tends to reset
> > > > the whole device.  
> > > 
> > > Hi Michael,
> > > 
> > > I missed this comment previously.. This is a good point!
> > > In this RFC, before sending the VFIO group fd to QEMU,
> > > backend needs to close the VFIO device and unset the VFIO
> > > container first. Otherwise, QEMU won't be able to set the
> > > VFIO container for the VFIO group.
> > > 
> > > Another option is to share the container fd instead of
> > > the group fd to QEMU. In this case, backend won't need
> > > to close any fd. But there is one problem that, it's
> > > hard to unmap the old mappings, especially when QEMU
> > > crashes.  
> > 
> > What are these old mappings and who creates them?
> > If you want to just reset everything the way it was
> > on open, surely it would be easy to add such a reset ioctl.
> > 
> > > Do you have any suggestions? Thanks!
> > > 
> > > Best regards,
> > > Tiwei Bie  
> > 
> > Dunno. Alex, any thoughts? Which approach would you prefer?  
> 
> The existing UNMAP_DMA ioctl for the vfio type1 IOMMU only requires
> that an unmap does not bisect previous mappings, ie. a previous mapping
> cannot be partially unmapped.  Therefore you can already dump the
> entire IOVA space for a container with one UNMAP_DMA call, iova = 0,
> size = (u64)-1.  Thanks,
> 
> Alex

Hmm this would exclude the last byte (address (u64)-1).
VTD does not support such iova values for now but something
to keep in mind e.g. for virtio-iommu with nested virt
which does.

-- 
MST

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [Qemu-devel] [RFC 3/3] vhost-user: support programming VFIO group in master
  2018-09-12 16:44                   ` Michael S. Tsirkin
@ 2018-09-12 17:15                     ` Alex Williamson
  2018-09-12 17:29                       ` Michael S. Tsirkin
  0 siblings, 1 reply; 23+ messages in thread
From: Alex Williamson @ 2018-09-12 17:15 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Tiwei Bie, jasowang, qemu-devel

On Wed, 12 Sep 2018 12:44:15 -0400
"Michael S. Tsirkin" <mst@redhat.com> wrote:

> On Wed, Sep 12, 2018 at 10:34:43AM -0600, Alex Williamson wrote:
> > On Wed, 12 Sep 2018 12:14:44 -0400
> > "Michael S. Tsirkin" <mst@redhat.com> wrote:
> >   
> > > On Wed, Sep 12, 2018 at 04:04:00PM +0800, Tiwei Bie wrote:  
> > > > On Mon, Jul 30, 2018 at 12:30:58PM +0300, Michael S. Tsirkin wrote:
> > > > [...]    
> > > > > 
> > > > > I generally wonder how are restarts of the backend handled
> > > > > with this approach: closing the VFIO device tends to reset
> > > > > the whole device.    
> > > > 
> > > > Hi Michael,
> > > > 
> > > > I missed this comment previously.. This is a good point!
> > > > In this RFC, before sending the VFIO group fd to QEMU,
> > > > backend needs to close the VFIO device and unset the VFIO
> > > > container first. Otherwise, QEMU won't be able to set the
> > > > VFIO container for the VFIO group.
> > > > 
> > > > Another option is to share the container fd instead of
> > > > the group fd to QEMU. In this case, backend won't need
> > > > to close any fd. But there is one problem that, it's
> > > > hard to unmap the old mappings, especially when QEMU
> > > > crashes.    
> > > 
> > > What are these old mappings and who creates them?
> > > If you want to just reset everything the way it was
> > > on open, surely it would be easy to add such a reset ioctl.
> > >   
> > > > Do you have any suggestions? Thanks!
> > > > 
> > > > Best regards,
> > > > Tiwei Bie    
> > > 
> > > Dunno. Alex, any thoughts? Which approach would you prefer?  
> > 
> > The existing UNMAP_DMA ioctl for the vfio type1 IOMMU only requires
> > that an unmap does not bisect previous mappings, ie. a previous mapping
> > cannot be partially unmapped.  Therefore you can already dump the
> > entire IOVA space for a container with one UNMAP_DMA call, iova = 0,
> > size = (u64)-1.  Thanks,
> > 
> > Alex  
> 
> Hmm this would exclude the last byte (address (u64)-1).
> VTD does not support such iova values for now but something
> to keep in mind e.g. for virtio-iommu with nested virt
> which does.

Yes, technically you'd need another ioctl to get the last byte, the
ioctls should have used start and end rather than start and size, but
it's too late to change.  Thanks,

Alex

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [Qemu-devel] [RFC 3/3] vhost-user: support programming VFIO group in master
  2018-09-12 17:15                     ` Alex Williamson
@ 2018-09-12 17:29                       ` Michael S. Tsirkin
  2018-09-12 18:09                         ` Alex Williamson
  0 siblings, 1 reply; 23+ messages in thread
From: Michael S. Tsirkin @ 2018-09-12 17:29 UTC (permalink / raw)
  To: Alex Williamson; +Cc: Tiwei Bie, jasowang, qemu-devel

On Wed, Sep 12, 2018 at 11:15:32AM -0600, Alex Williamson wrote:
> On Wed, 12 Sep 2018 12:44:15 -0400
> "Michael S. Tsirkin" <mst@redhat.com> wrote:
> 
> > On Wed, Sep 12, 2018 at 10:34:43AM -0600, Alex Williamson wrote:
> > > On Wed, 12 Sep 2018 12:14:44 -0400
> > > "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > >   
> > > > On Wed, Sep 12, 2018 at 04:04:00PM +0800, Tiwei Bie wrote:  
> > > > > On Mon, Jul 30, 2018 at 12:30:58PM +0300, Michael S. Tsirkin wrote:
> > > > > [...]    
> > > > > > 
> > > > > > I generally wonder how are restarts of the backend handled
> > > > > > with this approach: closing the VFIO device tends to reset
> > > > > > the whole device.    
> > > > > 
> > > > > Hi Michael,
> > > > > 
> > > > > I missed this comment previously.. This is a good point!
> > > > > In this RFC, before sending the VFIO group fd to QEMU,
> > > > > backend needs to close the VFIO device and unset the VFIO
> > > > > container first. Otherwise, QEMU won't be able to set the
> > > > > VFIO container for the VFIO group.
> > > > > 
> > > > > Another option is to share the container fd instead of
> > > > > the group fd to QEMU. In this case, backend won't need
> > > > > to close any fd. But there is one problem that, it's
> > > > > hard to unmap the old mappings, especially when QEMU
> > > > > crashes.    
> > > > 
> > > > What are these old mappings and who creates them?
> > > > If you want to just reset everything the way it was
> > > > on open, surely it would be easy to add such a reset ioctl.
> > > >   
> > > > > Do you have any suggestions? Thanks!
> > > > > 
> > > > > Best regards,
> > > > > Tiwei Bie    
> > > > 
> > > > Dunno. Alex, any thoughts? Which approach would you prefer?  
> > > 
> > > The existing UNMAP_DMA ioctl for the vfio type1 IOMMU only requires
> > > that an unmap does not bisect previous mappings, ie. a previous mapping
> > > cannot be partially unmapped.  Therefore you can already dump the
> > > entire IOVA space for a container with one UNMAP_DMA call, iova = 0,
> > > size = (u64)-1.  Thanks,
> > > 
> > > Alex  
> > 
> > Hmm this would exclude the last byte (address (u64)-1).
> > VTD does not support such iova values for now but something
> > to keep in mind e.g. for virtio-iommu with nested virt
> > which does.
> 
> Yes, technically you'd need another ioctl to get the last byte,

It will violate the requirement of not unmapping part of a region,
won't it? So it won't work.

> the
> ioctls should have used start and end rather than start and size, but
> it's too late to change.  Thanks,
> 
> Alex

At some point we'll have to fix above issue in some way. Maybe when we
add support for atomic remaps.

-- 
MST

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [Qemu-devel] [RFC 3/3] vhost-user: support programming VFIO group in master
  2018-09-12 17:29                       ` Michael S. Tsirkin
@ 2018-09-12 18:09                         ` Alex Williamson
  2018-09-13  5:26                           ` Tian, Kevin
  0 siblings, 1 reply; 23+ messages in thread
From: Alex Williamson @ 2018-09-12 18:09 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Tiwei Bie, jasowang, qemu-devel

On Wed, 12 Sep 2018 13:29:33 -0400
"Michael S. Tsirkin" <mst@redhat.com> wrote:

> On Wed, Sep 12, 2018 at 11:15:32AM -0600, Alex Williamson wrote:
> > On Wed, 12 Sep 2018 12:44:15 -0400
> > "Michael S. Tsirkin" <mst@redhat.com> wrote:
> >   
> > > On Wed, Sep 12, 2018 at 10:34:43AM -0600, Alex Williamson wrote:  
> > > > On Wed, 12 Sep 2018 12:14:44 -0400
> > > > "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > > >     
> > > > > On Wed, Sep 12, 2018 at 04:04:00PM +0800, Tiwei Bie wrote:    
> > > > > > On Mon, Jul 30, 2018 at 12:30:58PM +0300, Michael S. Tsirkin wrote:
> > > > > > [...]      
> > > > > > > 
> > > > > > > I generally wonder how are restarts of the backend handled
> > > > > > > with this approach: closing the VFIO device tends to reset
> > > > > > > the whole device.      
> > > > > > 
> > > > > > Hi Michael,
> > > > > > 
> > > > > > I missed this comment previously.. This is a good point!
> > > > > > In this RFC, before sending the VFIO group fd to QEMU,
> > > > > > backend needs to close the VFIO device and unset the VFIO
> > > > > > container first. Otherwise, QEMU won't be able to set the
> > > > > > VFIO container for the VFIO group.
> > > > > > 
> > > > > > Another option is to share the container fd instead of
> > > > > > the group fd to QEMU. In this case, backend won't need
> > > > > > to close any fd. But there is one problem that, it's
> > > > > > hard to unmap the old mappings, especially when QEMU
> > > > > > crashes.      
> > > > > 
> > > > > What are these old mappings and who creates them?
> > > > > If you want to just reset everything the way it was
> > > > > on open, surely it would be easy to add such a reset ioctl.
> > > > >     
> > > > > > Do you have any suggestions? Thanks!
> > > > > > 
> > > > > > Best regards,
> > > > > > Tiwei Bie      
> > > > > 
> > > > > Dunno. Alex, any thoughts? Which approach would you prefer?    
> > > > 
> > > > The existing UNMAP_DMA ioctl for the vfio type1 IOMMU only requires
> > > > that an unmap does not bisect previous mappings, ie. a previous mapping
> > > > cannot be partially unmapped.  Therefore you can already dump the
> > > > entire IOVA space for a container with one UNMAP_DMA call, iova = 0,
> > > > size = (u64)-1.  Thanks,
> > > > 
> > > > Alex    
> > > 
> > > Hmm this would exclude the last byte (address (u64)-1).
> > > VTD does not support such iova values for now but something
> > > to keep in mind e.g. for virtio-iommu with nested virt
> > > which does.  
> > 
> > Yes, technically you'd need another ioctl to get the last byte,  
> 
> It will violate the requirement of not unmapping part of a region,
> won't it? So it won't work.

Yes, in theory it's more complicated, but in practice it's not because
Intel only supports up to 48-bits of IOVA space (modulo round down by
a page, not a byte).  It would also be deterministic if we could ever
get past the RMRR issues with IGD to implement an IOVA list as every
IOMMU (AFAIK) has some reserved ranges that cannot be mapped and would
bisect the IOVA space somewhere.
 
> > the
> > ioctls should have used start and end rather than start and size, but
> > it's too late to change.  Thanks,
> > 
> > Alex  
> 
> At some point we'll have to fix above issue in some way. Maybe when we
> add support for atomic remaps.

"Patches welcome".  Thanks,

Alex

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [Qemu-devel] [RFC 3/3] vhost-user: support programming VFIO group in master
  2018-09-12 18:09                         ` Alex Williamson
@ 2018-09-13  5:26                           ` Tian, Kevin
  0 siblings, 0 replies; 23+ messages in thread
From: Tian, Kevin @ 2018-09-13  5:26 UTC (permalink / raw)
  To: Alex Williamson, Michael S. Tsirkin; +Cc: jasowang, qemu-devel, Bie, Tiwei

> From: Alex Williamson
> Sent: Thursday, September 13, 2018 2:10 AM
> 
> On Wed, 12 Sep 2018 13:29:33 -0400
> "Michael S. Tsirkin" <mst@redhat.com> wrote:
> 
> > On Wed, Sep 12, 2018 at 11:15:32AM -0600, Alex Williamson wrote:
> > > On Wed, 12 Sep 2018 12:44:15 -0400
> > > "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > >
> > > > On Wed, Sep 12, 2018 at 10:34:43AM -0600, Alex Williamson wrote:
> > > > > On Wed, 12 Sep 2018 12:14:44 -0400
> > > > > "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > > > >
> > > > > > On Wed, Sep 12, 2018 at 04:04:00PM +0800, Tiwei Bie wrote:
> > > > > > > On Mon, Jul 30, 2018 at 12:30:58PM +0300, Michael S. Tsirkin
> wrote:
> > > > > > > [...]
> > > > > > > >
> > > > > > > > I generally wonder how are restarts of the backend handled
> > > > > > > > with this approach: closing the VFIO device tends to reset
> > > > > > > > the whole device.
> > > > > > >
> > > > > > > Hi Michael,
> > > > > > >
> > > > > > > I missed this comment previously.. This is a good point!
> > > > > > > In this RFC, before sending the VFIO group fd to QEMU,
> > > > > > > backend needs to close the VFIO device and unset the VFIO
> > > > > > > container first. Otherwise, QEMU won't be able to set the
> > > > > > > VFIO container for the VFIO group.
> > > > > > >
> > > > > > > Another option is to share the container fd instead of
> > > > > > > the group fd to QEMU. In this case, backend won't need
> > > > > > > to close any fd. But there is one problem that, it's
> > > > > > > hard to unmap the old mappings, especially when QEMU
> > > > > > > crashes.
> > > > > >
> > > > > > What are these old mappings and who creates them?
> > > > > > If you want to just reset everything the way it was
> > > > > > on open, surely it would be easy to add such a reset ioctl.
> > > > > >
> > > > > > > Do you have any suggestions? Thanks!
> > > > > > >
> > > > > > > Best regards,
> > > > > > > Tiwei Bie
> > > > > >
> > > > > > Dunno. Alex, any thoughts? Which approach would you prefer?
> > > > >
> > > > > The existing UNMAP_DMA ioctl for the vfio type1 IOMMU only
> requires
> > > > > that an unmap does not bisect previous mappings, ie. a previous
> mapping
> > > > > cannot be partially unmapped.  Therefore you can already dump the
> > > > > entire IOVA space for a container with one UNMAP_DMA call, iova =
> 0,
> > > > > size = (u64)-1.  Thanks,
> > > > >
> > > > > Alex
> > > >
> > > > Hmm this would exclude the last byte (address (u64)-1).
> > > > VTD does not support such iova values for now but something
> > > > to keep in mind e.g. for virtio-iommu with nested virt
> > > > which does.
> > >
> > > Yes, technically you'd need another ioctl to get the last byte,
> >
> > It will violate the requirement of not unmapping part of a region,
> > won't it? So it won't work.
> 
> Yes, in theory it's more complicated, but in practice it's not because
> Intel only supports up to 48-bits of IOVA space (modulo round down by
> a page, not a byte).  It would also be deterministic if we could ever
> get past the RMRR issues with IGD to implement an IOVA list as every
> IOMMU (AFAIK) has some reserved ranges that cannot be mapped and
> would
> bisect the IOVA space somewhere.
> 

One correction - Intel VT-d supports 5-level paging now, which gives
a 57-bit IOVA space, though this comment doesn't affect your
discussion... :-)

Thanks
Kevin

^ permalink raw reply	[flat|nested] 23+ messages in thread

end of thread, other threads:[~2018-09-13  5:26 UTC | newest]

Thread overview: 23+ messages
2018-07-23  4:59 [Qemu-devel] [RFC 0/3] Supporting programming IOMMU in QEMU (vDPA/vhost-user) Tiwei Bie
2018-07-23  4:59 ` [Qemu-devel] [RFC 1/3] vfio: split vfio_get_group() into small functions Tiwei Bie
2018-07-23  4:59 ` [Qemu-devel] [RFC 2/3] vfio: support getting VFIOGroup from groupfd Tiwei Bie
2018-07-23  4:59 ` [Qemu-devel] [RFC 3/3] vhost-user: support programming VFIO group in master Tiwei Bie
2018-07-23  9:19   ` Michael S. Tsirkin
2018-07-23 11:59     ` Tiwei Bie
2018-07-23  9:20   ` Michael S. Tsirkin
2018-07-23 12:04     ` Tiwei Bie
2018-07-26 20:45   ` Alex Williamson
2018-07-27  1:58     ` Tiwei Bie
2018-07-27 20:03       ` Alex Williamson
2018-07-30  8:10         ` Tiwei Bie
2018-07-30  9:30           ` Michael S. Tsirkin
2018-07-30 16:20             ` Alex Williamson
2018-07-31  7:47             ` Tiwei Bie
2018-09-12  8:04             ` Tiwei Bie
2018-09-12 16:14               ` Michael S. Tsirkin
2018-09-12 16:34                 ` Alex Williamson
2018-09-12 16:44                   ` Michael S. Tsirkin
2018-09-12 17:15                     ` Alex Williamson
2018-09-12 17:29                       ` Michael S. Tsirkin
2018-09-12 18:09                         ` Alex Williamson
2018-09-13  5:26                           ` Tian, Kevin
