* [Qemu-devel] [RFC 5/5] vifo: introduce new VFIO ioctl VFIO_DEVICE_PCI_GET_DIRTY_BITMAP
@ 2017-06-26  8:53 Yulei Zhang
  2017-06-26 20:19 ` Alex Williamson
  0 siblings, 1 reply; 13+ messages in thread
From: Yulei Zhang @ 2017-06-26  8:53 UTC (permalink / raw)
  To: qemu-devel
  Cc: zhenyuw, zhi.a.wang, joonas.lahtinen, kevin.tian, xiao.zheng,
	Yulei Zhang

The new VFIO ioctl VFIO_DEVICE_PCI_GET_DIRTY_BITMAP is used to sync the
PCI device's dirty pages during migration.

Signed-off-by: Yulei Zhang <yulei.zhang@intel.com>
---
 hw/vfio/pci.c              | 32 ++++++++++++++++++++++++++++++++
 hw/vfio/pci.h              |  2 ++
 linux-headers/linux/vfio.h | 14 ++++++++++++++
 3 files changed, 48 insertions(+)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 833cd90..64c851f 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -32,6 +32,7 @@
 #include "pci.h"
 #include "trace.h"
 #include "qapi/error.h"
+#include "exec/ram_addr.h"
 
 #define MSIX_CAP_LENGTH 12
 
@@ -39,6 +40,7 @@ static void vfio_disable_interrupts(VFIOPCIDevice *vdev);
 static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled);
 static VMStateDescription vfio_pci_vmstate;
 static void vfio_vm_change_state_handler(void *pv, int running, RunState state);
+static void vfio_log_sync(MemoryListener *listener, MemoryRegionSection *section);
 
 /*
  * Disabling BAR mmaping can be slow, but toggling it around INTx can
@@ -2869,6 +2871,11 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
     vfio_setup_resetfn_quirk(vdev);
     qemu_add_vm_change_state_handler(vfio_vm_change_state_handler, vdev);
 
+    vdev->vfio_memory_listener = (MemoryListener) {
+	    .log_sync = vfio_log_sync,
+    };
+    memory_listener_register(&vdev->vfio_memory_listener, &address_space_memory);
+
     return;
 
 out_teardown:
@@ -2964,6 +2971,7 @@ static void vfio_vm_change_state_handler(void *pv, int running, RunState state)
     if (ioctl(vdev->vbasedev.fd, VFIO_DEVICE_PCI_STATUS_SET, vfio_status)) {
         error_report("vfio: Failed to %s device\n", running ? "start" : "stop");
     }
+    vdev->device_stop = running ? false : true;
     g_free(vfio_status);
 }
 
@@ -3079,6 +3087,30 @@ static int vfio_device_get(QEMUFile *f, void *pv, size_t size, VMStateField *fie
     return 0;
 }
 
+static void vfio_log_sync(MemoryListener *listener, MemoryRegionSection *section)
+{
+    VFIOPCIDevice *vdev = container_of(listener, struct VFIOPCIDevice, vfio_memory_listener);
+
+    if (vdev->device_stop) {
+        struct vfio_pci_get_dirty_bitmap *d;
+        ram_addr_t size = int128_get64(section->size);
+        unsigned long page_nr = size >> TARGET_PAGE_BITS;
+        unsigned long bitmap_size = (BITS_TO_LONGS(page_nr) + 1) * sizeof(unsigned long);
+        d = g_malloc0(sizeof(*d) + bitmap_size);
+        d->start_addr = section->offset_within_address_space;
+        d->page_nr = page_nr;
+
+        if (ioctl(vdev->vbasedev.fd, VFIO_DEVICE_PCI_GET_DIRTY_BITMAP, d)) {
+            error_report("vfio: Failed to fetch dirty pages for migration\n");
+            goto exit;
+        }
+        cpu_physical_memory_set_dirty_lebitmap((unsigned long*)&d->dirty_bitmap, d->start_addr, d->page_nr);
+
+exit:
+        g_free(d);
+    }
+}
+
 static void vfio_instance_init(Object *obj)
 {
     PCIDevice *pci_dev = PCI_DEVICE(obj);
diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
index bd98618..984391d 100644
--- a/hw/vfio/pci.h
+++ b/hw/vfio/pci.h
@@ -144,6 +144,8 @@ typedef struct VFIOPCIDevice {
     bool no_kvm_intx;
     bool no_kvm_msi;
     bool no_kvm_msix;
+    bool device_stop;
+    MemoryListener vfio_memory_listener;
 } VFIOPCIDevice;
 
 uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len);
diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h
index fa17848..aa73ee1 100644
--- a/linux-headers/linux/vfio.h
+++ b/linux-headers/linux/vfio.h
@@ -502,6 +502,20 @@ struct vfio_pci_status_set{
 
 #define VFIO_DEVICE_PCI_STATUS_SET	_IO(VFIO_TYPE, VFIO_BASE + 14)
 
+/**
+ * VFIO_DEVICE_PCI_GET_DIRTY_BITMAP - _IOW(VFIO_TYPE, VFIO_BASE + 15,
+ *				    struct vfio_pci_get_dirty_bitmap)
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+struct vfio_pci_get_dirty_bitmap{
+	__u64	       start_addr;
+	__u64	       page_nr;
+	__u8           dirty_bitmap[];
+};
+
+#define VFIO_DEVICE_PCI_GET_DIRTY_BITMAP _IO(VFIO_TYPE, VFIO_BASE + 15)
+
 /* -------- API for Type1 VFIO IOMMU -------- */
 
 /**
-- 
2.7.4
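
For reference, a minimal userspace sketch of how the proposed ioctl would
be driven, mirroring what vfio_log_sync() above does. It compiles only
against the patched linux/vfio.h from this series; the already-opened
device fd, the 4K page size and the little-endian bit order are
assumptions made for illustration:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>		/* needs the patched header from this series */

/* Query the proposed dirty bitmap for one RAM section and walk it.
 * 'device_fd' is an already-opened VFIO device fd; 'start' and 'size'
 * describe the section in guest physical address space. */
static int query_dirty_bitmap(int device_fd, uint64_t start, uint64_t size)
{
    uint64_t page_nr = size >> 12;		/* assume 4K pages */
    size_t bitmap_bytes = (page_nr + 7) / 8;
    struct vfio_pci_get_dirty_bitmap *d;
    uint64_t i;

    d = calloc(1, sizeof(*d) + bitmap_bytes);
    if (!d)
        return -1;

    d->start_addr = start;
    d->page_nr = page_nr;

    if (ioctl(device_fd, VFIO_DEVICE_PCI_GET_DIRTY_BITMAP, d)) {
        perror("VFIO_DEVICE_PCI_GET_DIRTY_BITMAP");
        free(d);
        return -1;
    }

    /* Each set bit marks a guest page the device may have dirtied. */
    for (i = 0; i < page_nr; i++) {
        if (d->dirty_bitmap[i / 8] & (1 << (i % 8))) {
            printf("dirty page at 0x%llx\n",
                   (unsigned long long)(start + (i << 12)));
        }
    }

    free(d);
    return 0;
}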


* Re: [Qemu-devel] [RFC 5/5] vifo: introduce new VFIO ioctl VFIO_DEVICE_PCI_GET_DIRTY_BITMAP
  2017-06-26  8:53 [Qemu-devel] [RFC 5/5] vifo: introduce new VFIO ioctl VFIO_DEVICE_PCI_GET_DIRTY_BITMAP Yulei Zhang
@ 2017-06-26 20:19 ` Alex Williamson
  2017-06-27  8:56   ` Zhang, Yulei
  0 siblings, 1 reply; 13+ messages in thread
From: Alex Williamson @ 2017-06-26 20:19 UTC (permalink / raw)
  To: Yulei Zhang
  Cc: qemu-devel, kevin.tian, joonas.lahtinen, zhenyuw, xiao.zheng, zhi.a.wang

On Tue,  4 Apr 2017 18:28:04 +0800
Yulei Zhang <yulei.zhang@intel.com> wrote:

> New VFIO ioctl VFIO_DEVICE_PCI_GET_DIRTY_BITMAP is used to sync the
> pci device dirty pages during the migration.

If this needs to exist, it needs a lot more documentation.  Why is this
a PCI specific device ioctl?  Couldn't any vfio device need this?  

> Signed-off-by: Yulei Zhang <yulei.zhang@intel.com>
> ---
>  hw/vfio/pci.c              | 32 ++++++++++++++++++++++++++++++++
>  hw/vfio/pci.h              |  2 ++
>  linux-headers/linux/vfio.h | 14 ++++++++++++++
>  3 files changed, 48 insertions(+)
> 
> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index 833cd90..64c851f 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -32,6 +32,7 @@
>  #include "pci.h"
>  #include "trace.h"
>  #include "qapi/error.h"
> +#include "exec/ram_addr.h"
>  
>  #define MSIX_CAP_LENGTH 12
>  
> @@ -39,6 +40,7 @@ static void vfio_disable_interrupts(VFIOPCIDevice *vdev);
>  static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled);
>  static VMStateDescription vfio_pci_vmstate;
>  static void vfio_vm_change_state_handler(void *pv, int running, RunState state);
> +static void vfio_log_sync(MemoryListener *listener, MemoryRegionSection *section);
>  
>  /*
>   * Disabling BAR mmaping can be slow, but toggling it around INTx can
> @@ -2869,6 +2871,11 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
>      vfio_setup_resetfn_quirk(vdev);
>      qemu_add_vm_change_state_handler(vfio_vm_change_state_handler, vdev);
>  
> +    vdev->vfio_memory_listener = (MemoryListener) {
> +	    .log_sync = vfio_log_sync,
> +    };
> +    memory_listener_register(&vdev->vfio_memory_listener, &address_space_memory);
> +
>      return;
>  
>  out_teardown:
> @@ -2964,6 +2971,7 @@ static void vfio_vm_change_state_handler(void *pv, int running, RunState state)
>      if (ioctl(vdev->vbasedev.fd, VFIO_DEVICE_PCI_STATUS_SET, vfio_status)) {
>          error_report("vfio: Failed to %s device\n", running ? "start" : "stop");
>      }
> +    vdev->device_stop = running ? false : true;
>      g_free(vfio_status);
>  }
>  
> @@ -3079,6 +3087,30 @@ static int vfio_device_get(QEMUFile *f, void *pv, size_t size, VMStateField *fie
>      return 0;
>  }
>  
> +static void vfio_log_sync(MemoryListener *listener, MemoryRegionSection *section)
> +{
> +    VFIOPCIDevice *vdev = container_of(listener, struct VFIOPCIDevice, vfio_memory_listener);
> +
> +    if (vdev->device_stop) {
> +        struct vfio_pci_get_dirty_bitmap *d;
> +        ram_addr_t size = int128_get64(section->size);
> +        unsigned long page_nr = size >> TARGET_PAGE_BITS;
> +        unsigned long bitmap_size = (BITS_TO_LONGS(page_nr) + 1) * sizeof(unsigned long);
> +        d = g_malloc0(sizeof(*d) + bitmap_size);
> +        d->start_addr = section->offset_within_address_space;
> +        d->page_nr = page_nr;
> +
> +        if (ioctl(vdev->vbasedev.fd, VFIO_DEVICE_PCI_GET_DIRTY_BITMAP, d)) {
> +            error_report("vfio: Failed to fetch dirty pages for migration\n");
> +            goto exit;
> +        }
> +        cpu_physical_memory_set_dirty_lebitmap((unsigned long*)&d->dirty_bitmap, d->start_addr, d->page_nr);
> +
> +exit:
> +        g_free(d);
> +    }
> +}
> +
>  static void vfio_instance_init(Object *obj)
>  {
>      PCIDevice *pci_dev = PCI_DEVICE(obj);
> diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
> index bd98618..984391d 100644
> --- a/hw/vfio/pci.h
> +++ b/hw/vfio/pci.h
> @@ -144,6 +144,8 @@ typedef struct VFIOPCIDevice {
>      bool no_kvm_intx;
>      bool no_kvm_msi;
>      bool no_kvm_msix;
> +    bool device_stop;
> +    MemoryListener vfio_memory_listener;
>  } VFIOPCIDevice;
>  
>  uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len);
> diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h
> index fa17848..aa73ee1 100644
> --- a/linux-headers/linux/vfio.h
> +++ b/linux-headers/linux/vfio.h
> @@ -502,6 +502,20 @@ struct vfio_pci_status_set{
>  
>  #define VFIO_DEVICE_PCI_STATUS_SET	_IO(VFIO_TYPE, VFIO_BASE + 14)
>  
> +/**
> + * VFIO_DEVICE_PCI_GET_DIRTY_BITMAP - _IOW(VFIO_TYPE, VFIO_BASE + 15,
> + *				    struct vfio_pci_get_dirty_bitmap)
> + *
> + * Return: 0 on success, -errno on failure.
> + */
> +struct vfio_pci_get_dirty_bitmap{
> +	__u64	       start_addr;
> +	__u64	       page_nr;
> +	__u8           dirty_bitmap[];
> +};
> +
> +#define VFIO_DEVICE_PCI_GET_DIRTY_BITMAP _IO(VFIO_TYPE, VFIO_BASE + 15)
> +

Dirty since when?  Since the last time we asked?  Since the device was
stopped?  Why is anything dirtied after the device is stopped?  Is this
any pages the device has ever touched?  Thanks,

Alex

>  /* -------- API for Type1 VFIO IOMMU -------- */
>  
>  /**


* Re: [Qemu-devel] [RFC 5/5] vifo: introduce new VFIO ioctl VFIO_DEVICE_PCI_GET_DIRTY_BITMAP
  2017-06-26 20:19 ` Alex Williamson
@ 2017-06-27  8:56   ` Zhang, Yulei
  2017-06-27 19:44     ` Alex Williamson
  0 siblings, 1 reply; 13+ messages in thread
From: Zhang, Yulei @ 2017-06-27  8:56 UTC (permalink / raw)
  To: Alex Williamson
  Cc: Tian, Kevin, joonas.lahtinen, qemu-devel, zhenyuw, Zheng, Xiao,
	Wang, Zhi A



> -----Original Message-----
> From: Qemu-devel [mailto:qemu-devel-
> bounces+yulei.zhang=intel.com@nongnu.org] On Behalf Of Alex Williamson
> Sent: Tuesday, June 27, 2017 4:19 AM
> To: Zhang, Yulei <yulei.zhang@intel.com>
> Cc: Tian, Kevin <kevin.tian@intel.com>; joonas.lahtinen@linux.intel.com;
> qemu-devel@nongnu.org; zhenyuw@linux.intel.com; Zheng, Xiao
> <xiao.zheng@intel.com>; Wang, Zhi A <zhi.a.wang@intel.com>
> Subject: Re: [Qemu-devel] [RFC 5/5] vifo: introduce new VFIO ioctl
> VFIO_DEVICE_PCI_GET_DIRTY_BITMAP
> 
> On Tue,  4 Apr 2017 18:28:04 +0800
> Yulei Zhang <yulei.zhang@intel.com> wrote:
> 
> > New VFIO ioctl VFIO_DEVICE_PCI_GET_DIRTY_BITMAP is used to sync the
> > pci device dirty pages during the migration.
> 
> If this needs to exist, it needs a lot more documentation.  Why is this
> a PCI specific device ioctl?  Couldn't any vfio device need this?
>
> > Signed-off-by: Yulei Zhang <yulei.zhang@intel.com>
> > ---
> >  hw/vfio/pci.c              | 32 ++++++++++++++++++++++++++++++++
> >  hw/vfio/pci.h              |  2 ++
> >  linux-headers/linux/vfio.h | 14 ++++++++++++++
> >  3 files changed, 48 insertions(+)
> >
> > diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> > index 833cd90..64c851f 100644
> > --- a/hw/vfio/pci.c
> > +++ b/hw/vfio/pci.c
> > @@ -32,6 +32,7 @@
> >  #include "pci.h"
> >  #include "trace.h"
> >  #include "qapi/error.h"
> > +#include "exec/ram_addr.h"
> >
> >  #define MSIX_CAP_LENGTH 12
> >
> > @@ -39,6 +40,7 @@ static void vfio_disable_interrupts(VFIOPCIDevice
> *vdev);
> >  static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled);
> >  static VMStateDescription vfio_pci_vmstate;
> >  static void vfio_vm_change_state_handler(void *pv, int running, RunState
> state);
> > +static void vfio_log_sync(MemoryListener *listener,
> MemoryRegionSection *section);
> >
> >  /*
> >   * Disabling BAR mmaping can be slow, but toggling it around INTx can
> > @@ -2869,6 +2871,11 @@ static void vfio_realize(PCIDevice *pdev, Error
> **errp)
> >      vfio_setup_resetfn_quirk(vdev);
> >      qemu_add_vm_change_state_handler(vfio_vm_change_state_handler,
> vdev);
> >
> > +    vdev->vfio_memory_listener = (MemoryListener) {
> > +	    .log_sync = vfio_log_sync,
> > +    };
> > +    memory_listener_register(&vdev->vfio_memory_listener,
> &address_space_memory);
> > +
> >      return;
> >
> >  out_teardown:
> > @@ -2964,6 +2971,7 @@ static void vfio_vm_change_state_handler(void
> *pv, int running, RunState state)
> >      if (ioctl(vdev->vbasedev.fd, VFIO_DEVICE_PCI_STATUS_SET, vfio_status))
> {
> >          error_report("vfio: Failed to %s device\n", running ? "start" : "stop");
> >      }
> > +    vdev->device_stop = running ? false : true;
> >      g_free(vfio_status);
> >  }
> >
> > @@ -3079,6 +3087,30 @@ static int vfio_device_get(QEMUFile *f, void *pv,
> size_t size, VMStateField *fie
> >      return 0;
> >  }
> >
> > +static void vfio_log_sync(MemoryListener *listener,
> MemoryRegionSection *section)
> > +{
> > +    VFIOPCIDevice *vdev = container_of(listener, struct VFIOPCIDevice,
> vfio_memory_listener);
> > +
> > +    if (vdev->device_stop) {
> > +        struct vfio_pci_get_dirty_bitmap *d;
> > +        ram_addr_t size = int128_get64(section->size);
> > +        unsigned long page_nr = size >> TARGET_PAGE_BITS;
> > +        unsigned long bitmap_size = (BITS_TO_LONGS(page_nr) + 1) *
> sizeof(unsigned long);
> > +        d = g_malloc0(sizeof(*d) + bitmap_size);
> > +        d->start_addr = section->offset_within_address_space;
> > +        d->page_nr = page_nr;
> > +
> > +        if (ioctl(vdev->vbasedev.fd, VFIO_DEVICE_PCI_GET_DIRTY_BITMAP, d))
> {
> > +            error_report("vfio: Failed to fetch dirty pages for migration\n");
> > +            goto exit;
> > +        }
> > +        cpu_physical_memory_set_dirty_lebitmap((unsigned long*)&d-
> >dirty_bitmap, d->start_addr, d->page_nr);
> > +
> > +exit:
> > +        g_free(d);
> > +    }
> > +}
> > +
> >  static void vfio_instance_init(Object *obj)
> >  {
> >      PCIDevice *pci_dev = PCI_DEVICE(obj);
> > diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
> > index bd98618..984391d 100644
> > --- a/hw/vfio/pci.h
> > +++ b/hw/vfio/pci.h
> > @@ -144,6 +144,8 @@ typedef struct VFIOPCIDevice {
> >      bool no_kvm_intx;
> >      bool no_kvm_msi;
> >      bool no_kvm_msix;
> > +    bool device_stop;
> > +    MemoryListener vfio_memory_listener;
> >  } VFIOPCIDevice;
> >
> >  uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len);
> > diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h
> > index fa17848..aa73ee1 100644
> > --- a/linux-headers/linux/vfio.h
> > +++ b/linux-headers/linux/vfio.h
> > @@ -502,6 +502,20 @@ struct vfio_pci_status_set{
> >
> >  #define VFIO_DEVICE_PCI_STATUS_SET	_IO(VFIO_TYPE, VFIO_BASE + 14)
> >
> > +/**
> > + * VFIO_DEVICE_PCI_GET_DIRTY_BITMAP - _IOW(VFIO_TYPE, VFIO_BASE +
> 15,
> > + *				    struct vfio_pci_get_dirty_bitmap)
> > + *
> > + * Return: 0 on success, -errno on failure.
> > + */
> > +struct vfio_pci_get_dirty_bitmap{
> > +	__u64	       start_addr;
> > +	__u64	       page_nr;
> > +	__u8           dirty_bitmap[];
> > +};
> > +
> > +#define VFIO_DEVICE_PCI_GET_DIRTY_BITMAP _IO(VFIO_TYPE, VFIO_BASE
> + 15)
> > +
> 
> Dirty since when?  Since the last time we asked?  Since the device was
> stopped?  Why is anything dirtied after the device is stopped?  Is this
> any pages the device has ever touched?  Thanks,
> 
> Alex
Dirty since the device was started and before it was stopped. We track
all the guest pages that the device was using before it was stopped, and
leverage this dirty bitmap for page sync during migration.

> 
> >  /* -------- API for Type1 VFIO IOMMU -------- */
> >
> >  /**
> 


* Re: [Qemu-devel] [RFC 5/5] vifo: introduce new VFIO ioctl VFIO_DEVICE_PCI_GET_DIRTY_BITMAP
  2017-06-27  8:56   ` Zhang, Yulei
@ 2017-06-27 19:44     ` Alex Williamson
  2017-06-28  6:04       ` Tian, Kevin
  0 siblings, 1 reply; 13+ messages in thread
From: Alex Williamson @ 2017-06-27 19:44 UTC (permalink / raw)
  To: Zhang, Yulei
  Cc: Tian, Kevin, joonas.lahtinen, qemu-devel, zhenyuw, Zheng, Xiao,
	Wang, Zhi A

On Tue, 27 Jun 2017 08:56:01 +0000
"Zhang, Yulei" <yulei.zhang@intel.com> wrote:

> > -----Original Message-----
> > From: Qemu-devel [mailto:qemu-devel-
> > bounces+yulei.zhang=intel.com@nongnu.org] On Behalf Of Alex Williamson
> > Sent: Tuesday, June 27, 2017 4:19 AM
> > To: Zhang, Yulei <yulei.zhang@intel.com>
> > Cc: Tian, Kevin <kevin.tian@intel.com>; joonas.lahtinen@linux.intel.com;
> > qemu-devel@nongnu.org; zhenyuw@linux.intel.com; Zheng, Xiao
> > <xiao.zheng@intel.com>; Wang, Zhi A <zhi.a.wang@intel.com>
> > Subject: Re: [Qemu-devel] [RFC 5/5] vifo: introduce new VFIO ioctl
> > VFIO_DEVICE_PCI_GET_DIRTY_BITMAP
> > 
> > On Tue,  4 Apr 2017 18:28:04 +0800
> > Yulei Zhang <yulei.zhang@intel.com> wrote:
> >   
> > > New VFIO ioctl VFIO_DEVICE_PCI_GET_DIRTY_BITMAP is used to sync the
> > > pci device dirty pages during the migration.  
> > 
> > If this needs to exist, it needs a lot more documentation.  Why is this
> > a PCI specific device ioctl?  Couldn't any vfio device need this?
> >  
> > > Signed-off-by: Yulei Zhang <yulei.zhang@intel.com>
> > > ---
> > >  hw/vfio/pci.c              | 32 ++++++++++++++++++++++++++++++++
> > >  hw/vfio/pci.h              |  2 ++
> > >  linux-headers/linux/vfio.h | 14 ++++++++++++++
> > >  3 files changed, 48 insertions(+)
> > >
> > > diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> > > index 833cd90..64c851f 100644
> > > --- a/hw/vfio/pci.c
> > > +++ b/hw/vfio/pci.c
> > > @@ -32,6 +32,7 @@
> > >  #include "pci.h"
> > >  #include "trace.h"
> > >  #include "qapi/error.h"
> > > +#include "exec/ram_addr.h"
> > >
> > >  #define MSIX_CAP_LENGTH 12
> > >
> > > @@ -39,6 +40,7 @@ static void vfio_disable_interrupts(VFIOPCIDevice  
> > *vdev);  
> > >  static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled);
> > >  static VMStateDescription vfio_pci_vmstate;
> > >  static void vfio_vm_change_state_handler(void *pv, int running, RunState  
> > state);  
> > > +static void vfio_log_sync(MemoryListener *listener,  
> > MemoryRegionSection *section);  
> > >
> > >  /*
> > >   * Disabling BAR mmaping can be slow, but toggling it around INTx can
> > > @@ -2869,6 +2871,11 @@ static void vfio_realize(PCIDevice *pdev, Error  
> > **errp)  
> > >      vfio_setup_resetfn_quirk(vdev);
> > >      qemu_add_vm_change_state_handler(vfio_vm_change_state_handler,  
> > vdev);  
> > >
> > > +    vdev->vfio_memory_listener = (MemoryListener) {
> > > +	    .log_sync = vfio_log_sync,
> > > +    };
> > > +    memory_listener_register(&vdev->vfio_memory_listener,  
> > &address_space_memory);  
> > > +
> > >      return;
> > >
> > >  out_teardown:
> > > @@ -2964,6 +2971,7 @@ static void vfio_vm_change_state_handler(void  
> > *pv, int running, RunState state)  
> > >      if (ioctl(vdev->vbasedev.fd, VFIO_DEVICE_PCI_STATUS_SET, vfio_status))  
> > {  
> > >          error_report("vfio: Failed to %s device\n", running ? "start" : "stop");
> > >      }
> > > +    vdev->device_stop = running ? false : true;
> > >      g_free(vfio_status);
> > >  }
> > >
> > > @@ -3079,6 +3087,30 @@ static int vfio_device_get(QEMUFile *f, void *pv,  
> > size_t size, VMStateField *fie  
> > >      return 0;
> > >  }
> > >
> > > +static void vfio_log_sync(MemoryListener *listener,  
> > MemoryRegionSection *section)  
> > > +{
> > > +    VFIOPCIDevice *vdev = container_of(listener, struct VFIOPCIDevice,  
> > vfio_memory_listener);  
> > > +
> > > +    if (vdev->device_stop) {
> > > +        struct vfio_pci_get_dirty_bitmap *d;
> > > +        ram_addr_t size = int128_get64(section->size);
> > > +        unsigned long page_nr = size >> TARGET_PAGE_BITS;
> > > +        unsigned long bitmap_size = (BITS_TO_LONGS(page_nr) + 1) *  
> > sizeof(unsigned long);  
> > > +        d = g_malloc0(sizeof(*d) + bitmap_size);
> > > +        d->start_addr = section->offset_within_address_space;
> > > +        d->page_nr = page_nr;
> > > +
> > > +        if (ioctl(vdev->vbasedev.fd, VFIO_DEVICE_PCI_GET_DIRTY_BITMAP, d))  
> > {  
> > > +            error_report("vfio: Failed to fetch dirty pages for migration\n");
> > > +            goto exit;
> > > +        }
> > > +        cpu_physical_memory_set_dirty_lebitmap((unsigned long*)&d-
> > >dirty_bitmap, d->start_addr, d->page_nr);
> > > +
> > > +exit:
> > > +        g_free(d);
> > > +    }
> > > +}
> > > +
> > >  static void vfio_instance_init(Object *obj)
> > >  {
> > >      PCIDevice *pci_dev = PCI_DEVICE(obj);
> > > diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
> > > index bd98618..984391d 100644
> > > --- a/hw/vfio/pci.h
> > > +++ b/hw/vfio/pci.h
> > > @@ -144,6 +144,8 @@ typedef struct VFIOPCIDevice {
> > >      bool no_kvm_intx;
> > >      bool no_kvm_msi;
> > >      bool no_kvm_msix;
> > > +    bool device_stop;
> > > +    MemoryListener vfio_memory_listener;
> > >  } VFIOPCIDevice;
> > >
> > >  uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len);
> > > diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h
> > > index fa17848..aa73ee1 100644
> > > --- a/linux-headers/linux/vfio.h
> > > +++ b/linux-headers/linux/vfio.h
> > > @@ -502,6 +502,20 @@ struct vfio_pci_status_set{
> > >
> > >  #define VFIO_DEVICE_PCI_STATUS_SET	_IO(VFIO_TYPE, VFIO_BASE + 14)
> > >
> > > +/**
> > > + * VFIO_DEVICE_PCI_GET_DIRTY_BITMAP - _IOW(VFIO_TYPE, VFIO_BASE +  
> > 15,  
> > > + *				    struct vfio_pci_get_dirty_bitmap)
> > > + *
> > > + * Return: 0 on success, -errno on failure.
> > > + */
> > > +struct vfio_pci_get_dirty_bitmap{
> > > +	__u64	       start_addr;
> > > +	__u64	       page_nr;
> > > +	__u8           dirty_bitmap[];
> > > +};
> > > +
> > > +#define VFIO_DEVICE_PCI_GET_DIRTY_BITMAP _IO(VFIO_TYPE, VFIO_BASE  
> > + 15)  
> > > +  
> > 
> > Dirty since when?  Since the last time we asked?  Since the device was
> > stopped?  Why is anything dirtied after the device is stopped?  Is this
> > any pages the device has ever touched?  Thanks,
> > 
> > Alex  
> Dirty since the device start operation and before it was stopped. We track
> down all the guest pages that device was using before it was stopped, and
> leverage this dirty bitmap for page sync during migration.

I don't understand how this is useful or efficient.  This implies that
the device is always tracking dirtied pages even when we don't care
about migration.  Don't we want to enable dirty logging at some point
and track dirty pages since then?  Otherwise we can just assume the
device dirties all pages and get rid of this ioctl.  Thanks,

Alex
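
For illustration, one possible shape of the "enable logging, then query
what was dirtied since" interface suggested above. The structure, flags
and ioctl number below are hypothetical, not part of any existing or
proposed VFIO API:

/* Hypothetical sketch only -- not in linux/vfio.h or in this series. */
struct vfio_device_dirty_log {
	__u32	argsz;
	__u32	flags;
#define VFIO_DIRTY_LOG_START	(1 << 0)	/* begin tracking writes */
#define VFIO_DIRTY_LOG_STOP	(1 << 1)	/* stop tracking */
#define VFIO_DIRTY_LOG_QUERY	(1 << 2)	/* fetch and clear the bitmap */
	__u64	start_addr;	/* guest physical start of the range */
	__u64	page_nr;	/* pages covered by dirty_bitmap[] */
	__u8	dirty_bitmap[];	/* valid only with QUERY: pages dirtied
				 * since START or the previous QUERY */
};

#define VFIO_DEVICE_DIRTY_LOG	_IO(VFIO_TYPE, VFIO_BASE + 16)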


* Re: [Qemu-devel] [RFC 5/5] vifo: introduce new VFIO ioctl VFIO_DEVICE_PCI_GET_DIRTY_BITMAP
  2017-06-27 19:44     ` Alex Williamson
@ 2017-06-28  6:04       ` Tian, Kevin
  2017-06-28 15:59         ` Alex Williamson
  0 siblings, 1 reply; 13+ messages in thread
From: Tian, Kevin @ 2017-06-28  6:04 UTC (permalink / raw)
  To: Alex Williamson, Zhang, Yulei
  Cc: joonas.lahtinen, qemu-devel, zhenyuw, Zheng, Xiao, Wang, Zhi A

> From: Alex Williamson [mailto:alex.williamson@redhat.com]
> Sent: Wednesday, June 28, 2017 3:45 AM
> 
> On Tue, 27 Jun 2017 08:56:01 +0000
> "Zhang, Yulei" <yulei.zhang@intel.com> wrote:
> 
> > > -----Original Message-----
> > > From: Qemu-devel [mailto:qemu-devel-
> > > bounces+yulei.zhang=intel.com@nongnu.org] On Behalf Of Alex
> Williamson
> > > Sent: Tuesday, June 27, 2017 4:19 AM
> > > To: Zhang, Yulei <yulei.zhang@intel.com>
> > > Cc: Tian, Kevin <kevin.tian@intel.com>; joonas.lahtinen@linux.intel.com;
> > > qemu-devel@nongnu.org; zhenyuw@linux.intel.com; Zheng, Xiao
> > > <xiao.zheng@intel.com>; Wang, Zhi A <zhi.a.wang@intel.com>
> > > Subject: Re: [Qemu-devel] [RFC 5/5] vifo: introduce new VFIO ioctl
> > > VFIO_DEVICE_PCI_GET_DIRTY_BITMAP
> > >
> > > On Tue,  4 Apr 2017 18:28:04 +0800
> > > Yulei Zhang <yulei.zhang@intel.com> wrote:
> > >
> > > > New VFIO ioctl VFIO_DEVICE_PCI_GET_DIRTY_BITMAP is used to sync
> the
> > > > pci device dirty pages during the migration.
> > >
> > > If this needs to exist, it needs a lot more documentation.  Why is this
> > > a PCI specific device ioctl?  Couldn't any vfio device need this?
> > >
> > > > Signed-off-by: Yulei Zhang <yulei.zhang@intel.com>
> > > > ---
> > > >  hw/vfio/pci.c              | 32 ++++++++++++++++++++++++++++++++
> > > >  hw/vfio/pci.h              |  2 ++
> > > >  linux-headers/linux/vfio.h | 14 ++++++++++++++
> > > >  3 files changed, 48 insertions(+)
> > > >
> > > > diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> > > > index 833cd90..64c851f 100644
> > > > --- a/hw/vfio/pci.c
> > > > +++ b/hw/vfio/pci.c
> > > > @@ -32,6 +32,7 @@
> > > >  #include "pci.h"
> > > >  #include "trace.h"
> > > >  #include "qapi/error.h"
> > > > +#include "exec/ram_addr.h"
> > > >
> > > >  #define MSIX_CAP_LENGTH 12
> > > >
> > > > @@ -39,6 +40,7 @@ static void vfio_disable_interrupts(VFIOPCIDevice
> > > *vdev);
> > > >  static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool
> enabled);
> > > >  static VMStateDescription vfio_pci_vmstate;
> > > >  static void vfio_vm_change_state_handler(void *pv, int running,
> RunState
> > > state);
> > > > +static void vfio_log_sync(MemoryListener *listener,
> > > MemoryRegionSection *section);
> > > >
> > > >  /*
> > > >   * Disabling BAR mmaping can be slow, but toggling it around INTx can
> > > > @@ -2869,6 +2871,11 @@ static void vfio_realize(PCIDevice *pdev,
> Error
> > > **errp)
> > > >      vfio_setup_resetfn_quirk(vdev);
> > > >
> qemu_add_vm_change_state_handler(vfio_vm_change_state_handler,
> > > vdev);
> > > >
> > > > +    vdev->vfio_memory_listener = (MemoryListener) {
> > > > +	    .log_sync = vfio_log_sync,
> > > > +    };
> > > > +    memory_listener_register(&vdev->vfio_memory_listener,
> > > &address_space_memory);
> > > > +
> > > >      return;
> > > >
> > > >  out_teardown:
> > > > @@ -2964,6 +2971,7 @@ static void
> vfio_vm_change_state_handler(void
> > > *pv, int running, RunState state)
> > > >      if (ioctl(vdev->vbasedev.fd, VFIO_DEVICE_PCI_STATUS_SET,
> vfio_status))
> > > {
> > > >          error_report("vfio: Failed to %s device\n", running ? "start" :
> "stop");
> > > >      }
> > > > +    vdev->device_stop = running ? false : true;
> > > >      g_free(vfio_status);
> > > >  }
> > > >
> > > > @@ -3079,6 +3087,30 @@ static int vfio_device_get(QEMUFile *f, void
> *pv,
> > > size_t size, VMStateField *fie
> > > >      return 0;
> > > >  }
> > > >
> > > > +static void vfio_log_sync(MemoryListener *listener,
> > > MemoryRegionSection *section)
> > > > +{
> > > > +    VFIOPCIDevice *vdev = container_of(listener, struct VFIOPCIDevice,
> > > vfio_memory_listener);
> > > > +
> > > > +    if (vdev->device_stop) {
> > > > +        struct vfio_pci_get_dirty_bitmap *d;
> > > > +        ram_addr_t size = int128_get64(section->size);
> > > > +        unsigned long page_nr = size >> TARGET_PAGE_BITS;
> > > > +        unsigned long bitmap_size = (BITS_TO_LONGS(page_nr) + 1) *
> > > sizeof(unsigned long);
> > > > +        d = g_malloc0(sizeof(*d) + bitmap_size);
> > > > +        d->start_addr = section->offset_within_address_space;
> > > > +        d->page_nr = page_nr;
> > > > +
> > > > +        if (ioctl(vdev->vbasedev.fd,
> VFIO_DEVICE_PCI_GET_DIRTY_BITMAP, d))
> > > {
> > > > +            error_report("vfio: Failed to fetch dirty pages for migration\n");
> > > > +            goto exit;
> > > > +        }
> > > > +        cpu_physical_memory_set_dirty_lebitmap((unsigned long*)&d-
> > > >dirty_bitmap, d->start_addr, d->page_nr);
> > > > +
> > > > +exit:
> > > > +        g_free(d);
> > > > +    }
> > > > +}
> > > > +
> > > >  static void vfio_instance_init(Object *obj)
> > > >  {
> > > >      PCIDevice *pci_dev = PCI_DEVICE(obj);
> > > > diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
> > > > index bd98618..984391d 100644
> > > > --- a/hw/vfio/pci.h
> > > > +++ b/hw/vfio/pci.h
> > > > @@ -144,6 +144,8 @@ typedef struct VFIOPCIDevice {
> > > >      bool no_kvm_intx;
> > > >      bool no_kvm_msi;
> > > >      bool no_kvm_msix;
> > > > +    bool device_stop;
> > > > +    MemoryListener vfio_memory_listener;
> > > >  } VFIOPCIDevice;
> > > >
> > > >  uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len);
> > > > diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h
> > > > index fa17848..aa73ee1 100644
> > > > --- a/linux-headers/linux/vfio.h
> > > > +++ b/linux-headers/linux/vfio.h
> > > > @@ -502,6 +502,20 @@ struct vfio_pci_status_set{
> > > >
> > > >  #define VFIO_DEVICE_PCI_STATUS_SET	_IO(VFIO_TYPE, VFIO_BASE +
> 14)
> > > >
> > > > +/**
> > > > + * VFIO_DEVICE_PCI_GET_DIRTY_BITMAP - _IOW(VFIO_TYPE,
> VFIO_BASE +
> > > 15,
> > > > + *				    struct vfio_pci_get_dirty_bitmap)
> > > > + *
> > > > + * Return: 0 on success, -errno on failure.
> > > > + */
> > > > +struct vfio_pci_get_dirty_bitmap{
> > > > +	__u64	       start_addr;
> > > > +	__u64	       page_nr;
> > > > +	__u8           dirty_bitmap[];
> > > > +};
> > > > +
> > > > +#define VFIO_DEVICE_PCI_GET_DIRTY_BITMAP _IO(VFIO_TYPE,
> VFIO_BASE
> > > + 15)
> > > > +
> > >
> > > Dirty since when?  Since the last time we asked?  Since the device was
> > > stopped?  Why is anything dirtied after the device is stopped?  Is this
> > > any pages the device has ever touched?  Thanks,
> > >
> > > Alex
> > Dirty since the device start operation and before it was stopped. We track
> > down all the guest pages that device was using before it was stopped, and
> > leverage this dirty bitmap for page sync during migration.
> 
> I don't understand how this is useful or efficient.  This implies that
> the device is always tracking dirtied pages even when we don't care
> about migration.  Don't we want to enable dirty logging at some point
> and track dirty pages since then?  Otherwise we can just assume the
> device dirties all pages and get rid of this ioctl.  Thanks,
> 

Agree. Regarding the interface definition, we'd better follow the general
dirty logging scheme as Alex pointed out, possibly through another
ioctl cmd to enable/disable logging. However, a vendor-specific
implementation may choose to ignore the cmd while always tracking
dirty pages, as on Intel Processor Graphics. Below is some background.

CPU dirty logging is done through either CPU page faults or HW dirty
bit logging (e.g. Intel PML). However, there is a gap on the DMA side
today. DMA page faulting requires both IOMMU and device support (through
PCI ATS/PRS), which is not widely available today and is mostly for
special types of workloads (e.g. Shared Virtual Memory). Regarding
the dirty bit, at least VT-d doesn't support it today.

So the alternative option is to rely on the mediation layer to track the
dirty pages, since workload submissions on a vGPU are mediated. That's
feasible for simple devices such as a NIC, which has a clear definition
of descriptors, so it's easy to scan them and capture which pages will
be dirtied. However, doing the same thing for a complex GPU (meaning
scanning all GPU commands, shader instructions, indirect structures,
etc.) is way too complex and insufficient. Today we only scan
privileged commands for security purposes, which is only a very
small portion of the possible cmd set.

Then in reality we choose a simplified approach: instead of tracking
incrementally dirtied pages since the last query, we treat all pages
which are currently mapped in the GPU page tables as dirtied. To avoid
the overhead of walking the global page table (GGTT) and all active
per-process page tables (PPGTTs) upon query, we always maintain a
bitmap which is updated when mediating guest updates to those GTT
entries, as sketched below. This adds negligible overhead at run-time
since those operations are already mediated.
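
A rough sketch of that bookkeeping, with hypothetical names (the real
GVT-g code is more involved); the point is only that the bitmap update
rides on the write mediation that already happens:

/* Hypothetical hook, for illustration only.  Called when a trapped
 * guest write installs a GGTT/PPGTT entry mapping guest page frame
 * 'gfn' for this vGPU. */
static void vgpu_track_gtt_write(struct intel_vgpu *vgpu, unsigned long gfn)
{
	/* The page is now reachable by the GPU, so report it as
	 * (potentially) dirty on the next bitmap query. */
	set_bit(gfn, vgpu->dirty_bitmap);
}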

Every time Qemu queries the dirty map, it will likely get a similarly
large dirty bitmap (not exactly the same, since the GPU page tables are
being changed), and then it will exit the iterative memory copy very
soon, which ends up like below:

1st round: Qemu copies all the memory (say 4GB) to the other machine
2nd round: Qemu queries the vGPU dirty map (usually several hundreds
of MBs) and combines it with the CPU dirty map to copy
3rd round: Qemu gets a similar amount of dirty pages and then exits the
pre-copy phase since the dirty set doesn't converge

Although it's not that efficient, not having to stop service for the
whole 4GB memory copy still saves a lot. In our measurements the service
shutdown time is ~300ms over a 10Gb link when running 3D benchmarks
(e.g. 3DMark, Heaven, etc.) and media transcoding workloads, while
copying the whole system memory may easily take seconds, enough to
trigger a TDR. Though the service shutdown time is bigger than in the
usual server-based scenario, it's acceptable for interactive usages
(e.g. VDI) or offline transcoding usages. You may take a look at our
demo at:

https://www.youtube.com/watch?v=y2SkU5JODIY

In a nutshell, our current dirty logging implementation is a bit
awkward due to arch limitations, but it does work well for some
scenarios. Most importantly, I agree we should design the interface
in a more general way to enable/disable dirty logging, as stated
earlier.

Hope the above makes the whole background clearer. :-)

Thanks
Kevin


* Re: [Qemu-devel] [RFC 5/5] vifo: introduce new VFIO ioctl VFIO_DEVICE_PCI_GET_DIRTY_BITMAP
  2017-06-28  6:04       ` Tian, Kevin
@ 2017-06-28 15:59         ` Alex Williamson
  2017-06-29  0:10           ` Tian, Kevin
  0 siblings, 1 reply; 13+ messages in thread
From: Alex Williamson @ 2017-06-28 15:59 UTC (permalink / raw)
  To: Tian, Kevin
  Cc: Zhang, Yulei, joonas.lahtinen, qemu-devel, zhenyuw, Zheng, Xiao,
	Wang, Zhi A

On Wed, 28 Jun 2017 06:04:10 +0000
"Tian, Kevin" <kevin.tian@intel.com> wrote:

> > From: Alex Williamson [mailto:alex.williamson@redhat.com]
> > Sent: Wednesday, June 28, 2017 3:45 AM
> > 
> > On Tue, 27 Jun 2017 08:56:01 +0000
> > "Zhang, Yulei" <yulei.zhang@intel.com> wrote:
> > > > > diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h
> > > > > index fa17848..aa73ee1 100644
> > > > > --- a/linux-headers/linux/vfio.h
> > > > > +++ b/linux-headers/linux/vfio.h
> > > > > @@ -502,6 +502,20 @@ struct vfio_pci_status_set{
> > > > >
> > > > >  #define VFIO_DEVICE_PCI_STATUS_SET	_IO(VFIO_TYPE, VFIO_BASE +  
> > 14)  
> > > > >
> > > > > +/**
> > > > > + * VFIO_DEVICE_PCI_GET_DIRTY_BITMAP - _IOW(VFIO_TYPE,  
> > VFIO_BASE +  
> > > > 15,  
> > > > > + *				    struct vfio_pci_get_dirty_bitmap)
> > > > > + *
> > > > > + * Return: 0 on success, -errno on failure.
> > > > > + */
> > > > > +struct vfio_pci_get_dirty_bitmap{
> > > > > +	__u64	       start_addr;
> > > > > +	__u64	       page_nr;
> > > > > +	__u8           dirty_bitmap[];
> > > > > +};
> > > > > +
> > > > > +#define VFIO_DEVICE_PCI_GET_DIRTY_BITMAP _IO(VFIO_TYPE,  
> > VFIO_BASE  
> > > > + 15)  
> > > > > +  
> > > >
> > > > Dirty since when?  Since the last time we asked?  Since the device was
> > > > stopped?  Why is anything dirtied after the device is stopped?  Is this
> > > > any pages the device has ever touched?  Thanks,
> > > >
> > > > Alex  
> > > Dirty since the device start operation and before it was stopped. We track
> > > down all the guest pages that device was using before it was stopped, and
> > > leverage this dirty bitmap for page sync during migration.  
> > 
> > I don't understand how this is useful or efficient.  This implies that
> > the device is always tracking dirtied pages even when we don't care
> > about migration.  Don't we want to enable dirty logging at some point
> > and track dirty pages since then?  Otherwise we can just assume the
> > device dirties all pages and get rid of this ioctl.  Thanks,
> >   
> 
> Agree. Regarding to interface definition we'd better follow general
> dirty logging scheme as Alex pointed out, possibly through another
> ioctl cmd to enable/disable logging. However vendor specific 
> implementation may choose to ignore the cmd while always tracking 
> dirty pages, as on Intel Processor Graphics. Below is some background.
> 
> CPU dirty logging is done through either CPU page fault or HW dirty
> bit logging (e.g. Intel PML). However there is a gap in DMA side today. 
> DMA page fault requires both IOMMU and device support (through 
> PCI ATS/PRS), which is not widely available today and mostly for 
> special types of workloads (e.g. Shared Virtual Memory). Regarding
> to dirty bit, at least VTd doesn't support it today.
> 
> So the alternative option is to rely on mediation layer to track the
> dirty pages, since workload submissions on vGPU are mediated. It's
> feasible for simple devices such as NIC, which has a clear definition
> of descriptors so it's easy to scan and capture which pages will be
> dirtied. However doing same thing for complex GPU (meaning
> to scan all GPU commands, shader instructions, indirect structures, 
> etc.) is weigh too complex and insufficient. Today we only scan
> privileged commands for security purpose which is only a very 
> small portion of all possible cmd set.
> 
> Then in reality we choose a simplified approach. instead of tracking
> incrementally dirtied pages since last query, we treat all pages which
> are currently mapped in GPU page tables as dirtied. To avoid overhead
> of walking global page table (GGTT) and all active per-process page
> tables (PPGTTs) upon query, we choose to always maintain a bitmap 
> which is updated when mediating guest updates to those GTT entries. 
> It adds negligible overhead at run-time since those operations are 
> already mediated.
> 
> Every time when Qemu tries to query dirty map, it likely will get
> a similar large dirty bitmap (not exactly same since GPU page tables
> are being changed) then will exit iterative memory copy very soon,
> which will ends up like below:
> 
> 1st round: Qemu copies all the memory (say 4GB) to another machine
> 2nd round: Qemu queries vGPU dirty map (usually in several hundreds
> of MBs) and combine with CPU dirty map to copy
> 3rd round: Qemu will get similar size of dirty pages then exit the 
> pre-copy phase since dirty set doesn't converge
> 
> Although it's not that efficient, no need to stop service for whole 4GB
> memory copy still saves a lot. In our measurement the service 
> shutdown time is ~300ms over 10Gb link when running 3D benchmarks
> (e.g. 3DMark, Heaven, etc.) and media transcoding workloads, while
> copying whole system memory may easily take seconds to trigger TDR.
> though service shutdown time is bigger than usual server-based scenario,
> it's somehow OK for interactive usages (e.g. VDI) or offline transcoding 
> usages. You may take a look at our demo at:
> 
> https://www.youtube.com/watch?v=y2SkU5JODIY
> 
> In a nutshell, our current dirty logging implementation is a bit
> awkward due to arch limitation, but it does work well for some 
> scenarios. Most importantly I agree we should design interface 
> in a more general way to enable/disable dirty logging as stated
> earlier.
> 
> Hope above makes the whole background clearer. :-)

Thanks Kevin.  So really it's not really a dirty bitmap, it's just a
bitmap of pages that the device has access to and may have dirtied.
Don't we have this more generally in the vfio type1 IOMMU backend?  For
a mediated device, we know all the pages that the vendor driver has
asked to be pinned.  Should we perhaps make this interface on the vfio
container rather than the device?  Any mediated device can provide this
level of detail without specific vendor support.  If we had DMA page
faulting, this would be the natural place to put it as well, so maybe
we should design the interface there to support everything similarly.
Thanks,

Alex
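
One hypothetical shape for such a container-level query, served by the
type1 backend from the pages the vendor driver has asked to pin (the
name, layout and ioctl number are purely illustrative):

/* Hypothetical sketch only -- not an existing VFIO interface.  Issued
 * on the container fd; type1 sets a bit for every page in the range
 * that is currently pinned on behalf of a mediated device. */
struct vfio_iommu_get_dirty_bitmap {
	__u32	argsz;
	__u32	flags;
	__u64	start_addr;	/* IOVA of the range to report */
	__u64	page_nr;
	__u8	dirty_bitmap[];	/* bit set => pinned, may have been dirtied */
};

#define VFIO_IOMMU_GET_DIRTY_BITMAP	_IO(VFIO_TYPE, VFIO_BASE + 17)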


* Re: [Qemu-devel] [RFC 5/5] vifo: introduce new VFIO ioctl VFIO_DEVICE_PCI_GET_DIRTY_BITMAP
  2017-06-28 15:59         ` Alex Williamson
@ 2017-06-29  0:10           ` Tian, Kevin
  2017-06-29 20:57             ` Alex Williamson
  0 siblings, 1 reply; 13+ messages in thread
From: Tian, Kevin @ 2017-06-29  0:10 UTC (permalink / raw)
  To: Alex Williamson
  Cc: Zhang, Yulei, joonas.lahtinen, qemu-devel, zhenyuw, Zheng, Xiao,
	Wang, Zhi A

> From: Alex Williamson [mailto:alex.williamson@redhat.com]
> Sent: Thursday, June 29, 2017 12:00 AM
> 
> On Wed, 28 Jun 2017 06:04:10 +0000
> "Tian, Kevin" <kevin.tian@intel.com> wrote:
> 
> > > From: Alex Williamson [mailto:alex.williamson@redhat.com]
> > > Sent: Wednesday, June 28, 2017 3:45 AM
> > >
> > > On Tue, 27 Jun 2017 08:56:01 +0000
> > > "Zhang, Yulei" <yulei.zhang@intel.com> wrote:
> > > > > > diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h
> > > > > > index fa17848..aa73ee1 100644
> > > > > > --- a/linux-headers/linux/vfio.h
> > > > > > +++ b/linux-headers/linux/vfio.h
> > > > > > @@ -502,6 +502,20 @@ struct vfio_pci_status_set{
> > > > > >
> > > > > >  #define VFIO_DEVICE_PCI_STATUS_SET	_IO(VFIO_TYPE,
> VFIO_BASE +
> > > 14)
> > > > > >
> > > > > > +/**
> > > > > > + * VFIO_DEVICE_PCI_GET_DIRTY_BITMAP - _IOW(VFIO_TYPE,
> > > VFIO_BASE +
> > > > > 15,
> > > > > > + *				    struct vfio_pci_get_dirty_bitmap)
> > > > > > + *
> > > > > > + * Return: 0 on success, -errno on failure.
> > > > > > + */
> > > > > > +struct vfio_pci_get_dirty_bitmap{
> > > > > > +	__u64	       start_addr;
> > > > > > +	__u64	       page_nr;
> > > > > > +	__u8           dirty_bitmap[];
> > > > > > +};
> > > > > > +
> > > > > > +#define VFIO_DEVICE_PCI_GET_DIRTY_BITMAP _IO(VFIO_TYPE,
> > > VFIO_BASE
> > > > > + 15)
> > > > > > +
> > > > >
> > > > > Dirty since when?  Since the last time we asked?  Since the device was
> > > > > stopped?  Why is anything dirtied after the device is stopped?  Is this
> > > > > any pages the device has ever touched?  Thanks,
> > > > >
> > > > > Alex
> > > > Dirty since the device start operation and before it was stopped. We
> track
> > > > down all the guest pages that device was using before it was stopped,
> and
> > > > leverage this dirty bitmap for page sync during migration.
> > >
> > > I don't understand how this is useful or efficient.  This implies that
> > > the device is always tracking dirtied pages even when we don't care
> > > about migration.  Don't we want to enable dirty logging at some point
> > > and track dirty pages since then?  Otherwise we can just assume the
> > > device dirties all pages and get rid of this ioctl.  Thanks,
> > >
> >
> > Agree. Regarding to interface definition we'd better follow general
> > dirty logging scheme as Alex pointed out, possibly through another
> > ioctl cmd to enable/disable logging. However vendor specific
> > implementation may choose to ignore the cmd while always tracking
> > dirty pages, as on Intel Processor Graphics. Below is some background.
> >
> > CPU dirty logging is done through either CPU page fault or HW dirty
> > bit logging (e.g. Intel PML). However there is a gap in DMA side today.
> > DMA page fault requires both IOMMU and device support (through
> > PCI ATS/PRS), which is not widely available today and mostly for
> > special types of workloads (e.g. Shared Virtual Memory). Regarding
> > to dirty bit, at least VTd doesn't support it today.
> >
> > So the alternative option is to rely on mediation layer to track the
> > dirty pages, since workload submissions on vGPU are mediated. It's
> > feasible for simple devices such as NIC, which has a clear definition
> > of descriptors so it's easy to scan and capture which pages will be
> > dirtied. However doing same thing for complex GPU (meaning
> > to scan all GPU commands, shader instructions, indirect structures,
> > etc.) is weigh too complex and insufficient. Today we only scan
> > privileged commands for security purpose which is only a very
> > small portion of all possible cmd set.
> >
> > Then in reality we choose a simplified approach. instead of tracking
> > incrementally dirtied pages since last query, we treat all pages which
> > are currently mapped in GPU page tables as dirtied. To avoid overhead
> > of walking global page table (GGTT) and all active per-process page
> > tables (PPGTTs) upon query, we choose to always maintain a bitmap
> > which is updated when mediating guest updates to those GTT entries.
> > It adds negligible overhead at run-time since those operations are
> > already mediated.
> >
> > Every time when Qemu tries to query dirty map, it likely will get
> > a similar large dirty bitmap (not exactly same since GPU page tables
> > are being changed) then will exit iterative memory copy very soon,
> > which will ends up like below:
> >
> > 1st round: Qemu copies all the memory (say 4GB) to another machine
> > 2nd round: Qemu queries vGPU dirty map (usually in several hundreds
> > of MBs) and combine with CPU dirty map to copy
> > 3rd round: Qemu will get similar size of dirty pages then exit the
> > pre-copy phase since dirty set doesn't converge
> >
> > Although it's not that efficient, no need to stop service for whole 4GB
> > memory copy still saves a lot. In our measurement the service
> > shutdown time is ~300ms over 10Gb link when running 3D benchmarks
> > (e.g. 3DMark, Heaven, etc.) and media transcoding workloads, while
> > copying whole system memory may easily take seconds to trigger TDR.
> > though service shutdown time is bigger than usual server-based scenario,
> > it's somehow OK for interactive usages (e.g. VDI) or offline transcoding
> > usages. You may take a look at our demo at:
> >
> > https://www.youtube.com/watch?v=y2SkU5JODIY
> >
> > In a nutshell, our current dirty logging implementation is a bit
> > awkward due to arch limitation, but it does work well for some
> > scenarios. Most importantly I agree we should design interface
> > in a more general way to enable/disable dirty logging as stated
> > earlier.
> >
> > Hope above makes the whole background clearer. :-)
> 
> Thanks Kevin.  So really it's not really a dirty bitmap, it's just a
> bitmap of pages that the device has access to and may have dirtied.
> Don't we have this more generally in the vfio type1 IOMMU backend?  For
> a mediated device, we know all the pages that the vendor driver has
> asked to be pinned.  Should we perhaps make this interface on the vfio
> container rather than the device?  Any mediated device can provide this
> level of detail without specific vendor support.  If we had DMA page
> faulting, this would be the natural place to put it as well, so maybe
> we should design the interface there to support everything similarly.
> Thanks,
> 

That's a nice idea. Just two comments:

1) If some mediated device has its own way to construct a true dirty
bitmap (not through DMA page faulting), the interface is better designed
to allow that flexibility. Maybe an optional callback: if it is not
registered, use the common type1 IOMMU logic, otherwise prefer the
vendor-specific callback (a rough sketch of that follows this message).

2) If there could be multiple mediated devices from different vendors
in the same container, while not all of them support live migration,
would a container-level interface impose some limitation?

Thanks
Kevin
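
A rough sketch of the optional-callback idea in comment 1), with
entirely hypothetical ops and helpers (nothing like this exists in the
mdev/vfio code today):

/* A vendor driver may optionally supply its own dirty tracking;
 * otherwise the core falls back to the type1 pinned-page set. */
struct vfio_dirty_track_ops {
	int (*get_dirty_bitmap)(struct mdev_device *mdev, u64 start,
				u64 page_nr, unsigned long *bitmap);
};

static int vfio_fill_dirty_bitmap(struct mdev_device *mdev,
				  const struct vfio_dirty_track_ops *ops,
				  u64 start, u64 page_nr,
				  unsigned long *bitmap)
{
	if (ops && ops->get_dirty_bitmap)	/* vendor-specific tracking */
		return ops->get_dirty_bitmap(mdev, start, page_nr, bitmap);

	/* Fallback: report every page type1 currently has pinned for
	 * this container (hypothetical helper). */
	return vfio_iommu_type1_pinned_bitmap(mdev, start, page_nr, bitmap);
}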


* Re: [Qemu-devel] [RFC 5/5] vifo: introduce new VFIO ioctl VFIO_DEVICE_PCI_GET_DIRTY_BITMAP
  2017-06-29  0:10           ` Tian, Kevin
@ 2017-06-29 20:57             ` Alex Williamson
  2017-06-30  5:14               ` Tian, Kevin
  0 siblings, 1 reply; 13+ messages in thread
From: Alex Williamson @ 2017-06-29 20:57 UTC (permalink / raw)
  To: Tian, Kevin
  Cc: Zhang, Yulei, joonas.lahtinen, qemu-devel, zhenyuw, Zheng, Xiao,
	Wang, Zhi A

On Thu, 29 Jun 2017 00:10:59 +0000
"Tian, Kevin" <kevin.tian@intel.com> wrote:

> > From: Alex Williamson [mailto:alex.williamson@redhat.com]
> > Sent: Thursday, June 29, 2017 12:00 AM 
> > Thanks Kevin.  So really it's not really a dirty bitmap, it's just a
> > bitmap of pages that the device has access to and may have dirtied.
> > Don't we have this more generally in the vfio type1 IOMMU backend?  For
> > a mediated device, we know all the pages that the vendor driver has
> > asked to be pinned.  Should we perhaps make this interface on the vfio
> > container rather than the device?  Any mediated device can provide this
> > level of detail without specific vendor support.  If we had DMA page
> > faulting, this would be the natural place to put it as well, so maybe
> > we should design the interface there to support everything similarly.
> > Thanks,
> >   
> 
> That's a nice idea. Just two comments:
> 
> 1) If some mediated device has its own way to construct true dirty
> bitmap (not thru DMA page faulting), the interface is better designed
> to allow that flexibility. Maybe an optional callback if not registered
> then use common type1 IOMMU logic otherwise prefers to vendor
> specific callback

I'm not sure what that looks like, but I agree with the idea.  Could
the pages that type1 knows about ever be anything other than a
superset of the dirty pages?  Perhaps a device ioctl to flush unused
mappings would be sufficient.

> 2) If there could be multiple mediated devices from different vendors
> in same container while not all mediated devices support live migration,
> would container-level interface impose some limitation?

Dirty page logging is only one small part of migration, each
migrate-able device would still need to provide a device-level
interface to save/restore state.  The migration would fail when we get
to the device(s) that don't provide that.  Thanks,

Alex


* Re: [Qemu-devel] [RFC 5/5] vifo: introduce new VFIO ioctl VFIO_DEVICE_PCI_GET_DIRTY_BITMAP
  2017-06-29 20:57             ` Alex Williamson
@ 2017-06-30  5:14               ` Tian, Kevin
  2017-06-30 16:59                 ` Alex Williamson
  0 siblings, 1 reply; 13+ messages in thread
From: Tian, Kevin @ 2017-06-30  5:14 UTC (permalink / raw)
  To: Alex Williamson
  Cc: Zhang, Yulei, joonas.lahtinen, qemu-devel, zhenyuw, Zheng, Xiao,
	Wang, Zhi A

> From: Alex Williamson [mailto:alex.williamson@redhat.com]
> Sent: Friday, June 30, 2017 4:57 AM
> 
> On Thu, 29 Jun 2017 00:10:59 +0000
> "Tian, Kevin" <kevin.tian@intel.com> wrote:
> 
> > > From: Alex Williamson [mailto:alex.williamson@redhat.com]
> > > Sent: Thursday, June 29, 2017 12:00 AM
> > > Thanks Kevin.  So really it's not really a dirty bitmap, it's just a
> > > bitmap of pages that the device has access to and may have dirtied.
> > > Don't we have this more generally in the vfio type1 IOMMU backend?  For
> > > a mediated device, we know all the pages that the vendor driver has
> > > asked to be pinned.  Should we perhaps make this interface on the vfio
> > > container rather than the device?  Any mediated device can provide this
> > > level of detail without specific vendor support.  If we had DMA page
> > > faulting, this would be the natural place to put it as well, so maybe
> > > we should design the interface there to support everything similarly.
> > > Thanks,
> > >
> >
> > That's a nice idea. Just two comments:
> >
> > 1) If some mediated device has its own way to construct true dirty
> > bitmap (not thru DMA page faulting), the interface is better designed
> > to allow that flexibility. Maybe an optional callback if not registered
> > then use common type1 IOMMU logic otherwise prefers to vendor
> > specific callback
> 
> I'm not sure what that looks like, but I agree with the idea.  Could
> the pages that type1 knows about every be anything other than a
> superset of the dirty pages?  Perhaps a device ioctl to flush unused
> mappings would be sufficient.

Sorry, I didn't quite get your idea here. My understanding is that
type1 is OK as an alternative in case a mediated device has no way
to track dirtied pages (as for the Intel GPU), so we can use the type1
pinned pages as an indirect way to indicate dirtied pages. But if a
mediated device has its own way (e.g. a device-private MMU) to track
dirty pages, then we should allow that device to provide the dirty
bitmap instead of using type1.

> 
> > 2) If there could be multiple mediated devices from different vendors
> > in same container while not all mediated devices support live migration,
> > would container-level interface impose some limitation?
> 
> Dirty page logging is only one small part of migration, each
> migrate-able device would still need to provide a device-level
> interface to save/restore state.  The migration would fail when we get
> to the device(s) that don't provide that.  Thanks,
> 

Agree here. Yulei, can you investigate this direction and report back
on whether it's feasible or whether anything was overlooked?

Thanks
Kevin


* Re: [Qemu-devel] [RFC 5/5] vifo: introduce new VFIO ioctl VFIO_DEVICE_PCI_GET_DIRTY_BITMAP
  2017-06-30  5:14               ` Tian, Kevin
@ 2017-06-30 16:59                 ` Alex Williamson
  2017-07-07  6:40                   ` Tian, Kevin
  0 siblings, 1 reply; 13+ messages in thread
From: Alex Williamson @ 2017-06-30 16:59 UTC (permalink / raw)
  To: Tian, Kevin
  Cc: Zhang, Yulei, joonas.lahtinen, qemu-devel, zhenyuw, Zheng, Xiao,
	Wang, Zhi A

On Fri, 30 Jun 2017 05:14:40 +0000
"Tian, Kevin" <kevin.tian@intel.com> wrote:

> > From: Alex Williamson [mailto:alex.williamson@redhat.com]
> > Sent: Friday, June 30, 2017 4:57 AM
> > 
> > On Thu, 29 Jun 2017 00:10:59 +0000
> > "Tian, Kevin" <kevin.tian@intel.com> wrote:
> >   
> > > > From: Alex Williamson [mailto:alex.williamson@redhat.com]
> > > > Sent: Thursday, June 29, 2017 12:00 AM
> > > > Thanks Kevin.  So really it's not really a dirty bitmap, it's just a
> > > > bitmap of pages that the device has access to and may have dirtied.
> > > > Don't we have this more generally in the vfio type1 IOMMU backend?  For
> > > > a mediated device, we know all the pages that the vendor driver has
> > > > asked to be pinned.  Should we perhaps make this interface on the vfio
> > > > container rather than the device?  Any mediated device can provide this
> > > > level of detail without specific vendor support.  If we had DMA page
> > > > faulting, this would be the natural place to put it as well, so maybe
> > > > we should design the interface there to support everything similarly.
> > > > Thanks,
> > > >  
> > >
> > > That's a nice idea. Just two comments:
> > >
> > > 1) If some mediated device has its own way to construct true dirty
> > > bitmap (not thru DMA page faulting), the interface is better designed
> > > to allow that flexibility. Maybe an optional callback if not registered
> > > then use common type1 IOMMU logic otherwise prefers to vendor
> > > specific callback  
> > 
> > I'm not sure what that looks like, but I agree with the idea.  Could
> > the pages that type1 knows about every be anything other than a
> > superset of the dirty pages?  Perhaps a device ioctl to flush unused
> > mappings would be sufficient.  
> 
> sorry I didn't quite get your idea here. My understanding is that
> type1 is OK as an alternative in case mediated device has no way
> to track dirtied pages (as for Intel GPU), so we can use type1 pinned
> pages as an indirect way to indicate dirtied pages. But if mediated
> device has its own way (e.g. a device private MMU) to track dirty
> pages, then we should allow that device to provide dirty bitmap
> instead of using type1.

My thought was that our current mdev iommu interface allows the vendor
driver to pin specific pages.  In order for the mdev device to dirty a
page, we need it to be pinned.  Therefore at worst, the set of
pages pinned in type1 is a superset of all pages that can potentially
be dirtied by the device.  In the worst case, this devolves to all
pages mapped through the iommu in the case of directly assigned devices.
My assertion is therefore that a device-specific dirty page bitmap can
only be a subset of the type1 pinned pages.  Therefore, if the mdev
vendor driver can flush any stale pinnings, then the type1 view of
pinned pages should match the device's view of the current working set.
Then we wouldn't need a device-specific dirty bitmap, we'd only need a
mechanism to trigger a flush of stale mappings on the device (a sketch
of such an ioctl follows at the end of this message).

Otherwise I'm not sure how we cleanly create an interface where the
dirty bitmap can either come from the device or the container... but
I'd welcome suggestions.  Thanks,

Alex
 
> > > 2) If there could be multiple mediated devices from different vendors
> > > in same container while not all mediated devices support live migration,
> > > would container-level interface impose some limitation?  
> > 
> > Dirty page logging is only one small part of migration, each
> > migrate-able device would still need to provide a device-level
> > interface to save/restore state.  The migration would fail when we get
> > to the device(s) that don't provide that.  Thanks,
> >   
> 
> Agree here. Yulei, can you investigate this direction and report back
> whether it's feasible or anything overlooked?
> 
> Thanks
> Kevin

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [Qemu-devel] [RFC 5/5] vifo: introduce new VFIO ioctl VFIO_DEVICE_PCI_GET_DIRTY_BITMAP
  2017-06-30 16:59                 ` Alex Williamson
@ 2017-07-07  6:40                   ` Tian, Kevin
  2017-07-10 19:47                     ` Alex Williamson
  0 siblings, 1 reply; 13+ messages in thread
From: Tian, Kevin @ 2017-07-07  6:40 UTC (permalink / raw)
  To: Alex Williamson
  Cc: Zhang, Yulei, joonas.lahtinen, qemu-devel, zhenyuw, Zheng, Xiao,
	Wang, Zhi A

> From: Alex Williamson [mailto:alex.williamson@redhat.com]
> Sent: Saturday, July 1, 2017 1:00 AM
> 
> On Fri, 30 Jun 2017 05:14:40 +0000
> "Tian, Kevin" <kevin.tian@intel.com> wrote:
> 
> > > From: Alex Williamson [mailto:alex.williamson@redhat.com]
> > > Sent: Friday, June 30, 2017 4:57 AM
> > >
> > > On Thu, 29 Jun 2017 00:10:59 +0000
> > > "Tian, Kevin" <kevin.tian@intel.com> wrote:
> > >
> > > > > From: Alex Williamson [mailto:alex.williamson@redhat.com]
> > > > > Sent: Thursday, June 29, 2017 12:00 AM
> > > > > Thanks Kevin.  So really it's not really a dirty bitmap, it's just a
> > > > > bitmap of pages that the device has access to and may have dirtied.
> > > > > Don't we have this more generally in the vfio type1 IOMMU backend?
> For
> > > > > a mediated device, we know all the pages that the vendor driver has
> > > > > asked to be pinned.  Should we perhaps make this interface on the
> vfio
> > > > > container rather than the device?  Any mediated device can provide
> this
> > > > > level of detail without specific vendor support.  If we had DMA page
> > > > > faulting, this would be the natural place to put it as well, so maybe
> > > > > we should design the interface there to support everything similarly.
> > > > > Thanks,
> > > > >
> > > >
> > > > That's a nice idea. Just two comments:
> > > >
> > > > 1) If some mediated device has its own way to construct true dirty
> > > > bitmap (not thru DMA page faulting), the interface is better designed
> > > > to allow that flexibility. Maybe an optional callback if not registered
> > > > then use common type1 IOMMU logic otherwise prefers to vendor
> > > > specific callback
> > >
> > > I'm not sure what that looks like, but I agree with the idea.  Could
> > > the pages that type1 knows about every be anything other than a
> > > superset of the dirty pages?  Perhaps a device ioctl to flush unused
> > > mappings would be sufficient.
> >
> > sorry I didn't quite get your idea here. My understanding is that
> > type1 is OK as an alternative in case mediated device has no way
> > to track dirtied pages (as for Intel GPU), so we can use type1 pinned
> > pages as an indirect way to indicate dirtied pages. But if mediated
> > device has its own way (e.g. a device private MMU) to track dirty
> > pages, then we should allow that device to provide dirty bitmap
> > instead of using type1.
> 
> My thought was that our current mdev iommu interface allows the vendor
> driver to pin specific pages.  In order for the mdev device to dirty a
> page, we need for it to be pinned.  Therefore at worst, the set of
> pages pinned in type1 is the superset of all pages that can potentially
> be dirtied by the device.  In the worst case, this devolves to all
> pages mapped through the iommu in the case of direct assigned devices.
> My assertion is therefore that a device specific dirty page bitmap can
> only be a subset of the type1 pinned pages.  Therefore if the mdev

this assertion is correct.

> vendor driver can flush any stale pinnings, then the type1 view off
> pinned pages should match the devices view of the current working set.
> Then we wouldn't need a device specific dirty bitmap, we'd only need a
> mechanism to trigger a flush of stale mappings on the device.

What's your definition of 'stale pinnings'? Take Intel vGPU for example:
we pin pages when they are mapped into the GPU page table and then unpin
them when they are unmapped from the GPU page table. With this policy the
type1 view of pinned pages should just match the device view.  There is
no 'stale' page example that I can think of, since all currently-pinned
pages must be pinned for some read/write purpose.
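
As a toy model of that policy (not the real GVT-g code; every name
below is invented), the pinned set simply mirrors what the guest has
mapped in its GPU page table, so nothing can become stale:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Toy stand-ins for the mdev pin/unpin requests (names invented). */
static long pinned_pages;            /* what type1 would see as pinned */

static int pin_guest_page(uint64_t gfn)
{
    (void)gfn;                       /* a real driver pins gfn via vfio */
    pinned_pages++;
    return 0;
}

static void unpin_guest_page(uint64_t gfn)
{
    (void)gfn;
    pinned_pages--;
}

/*
 * Called for every guest GPU page-table entry update: pin on map,
 * unpin on unmap, so the pinned set tracked by type1 always equals
 * the set of pages currently mapped by the device.
 */
static int on_gpu_pte_write(uint64_t gfn, bool present)
{
    if (present) {
        return pin_guest_page(gfn);
    }
    unpin_guest_page(gfn);
    return 0;
}

int main(void)
{
    on_gpu_pte_write(0x1234, true);   /* guest maps a page  */
    on_gpu_pte_write(0x1234, false);  /* guest unmaps it    */
    printf("pinned pages left: %ld\n", pinned_pages);  /* prints 0 */
    return 0;
}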

But I do realize one limitation of this option now. Given that both a
physical device and an mdev are assigned to the same guest, we have to
pin all pages for the physically assigned device. In that case we may
need a way to differentiate which device a given pinned page is for.
I'm not sure whether that is easy or requires lots of change.
Interface-wise we don't need an additional one, since there is still an
unpin request from the mdev path; just the internal structure needs to
be extended to maintain that ownership info.
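
For illustration only (the struct and field names are invented), that
extension could be as small as tagging each pinned-page record with
its owner, so building a bitmap can skip pages pinned solely for a
directly assigned device:

#include <stdbool.h>
#include <stdint.h>

/* Hypothetical extension of type1's per-pinned-page bookkeeping. */
enum pin_owner {
    PIN_OWNER_MDEV   = 1 << 0,  /* pinned on request of an mdev driver   */
    PIN_OWNER_DIRECT = 1 << 1,  /* pinned for a directly assigned device */
};

struct pinned_page {
    uint64_t     iova;          /* guest IOVA of the page              */
    uint64_t     pfn;           /* host page frame it is pinned to     */
    unsigned int owners;        /* bitmask of enum pin_owner           */
    unsigned int mdev_refs;     /* pin count coming from the mdev path */
};

/*
 * When building a dirty bitmap for migration, only pages with an mdev
 * owner are interesting; pages pinned solely for a directly assigned
 * device imply the "treat all memory as dirty" fallback anyway.
 */
static bool page_may_be_dirtied_by_mdev(const struct pinned_page *p)
{
    return (p->owners & PIN_OWNER_MDEV) != 0;
}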

> 
> Otherwise I'm not sure how we cleanly create an interface where the
> dirty bitmap can either come from the device or the container... but
> I'd welcome suggestions.  Thanks,
> 

My original thought is simple:

- by default, type1 provides the dirty bitmap
- when an mdev is created, the mdev driver has a chance to claim
whether it would like to track its own dirty bitmap
- then, when QEMU requests the dirty bitmap of an mdev, vfio checks
the flag to decide whether to return the type1 bitmap or call into
the mdev driver
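
Roughly, and again with invented names, the dispatch could look like
the sketch below; the type1 helper is a stub standing in for walking
the container's pinned pages:

#include <stddef.h>
#include <stdint.h>

struct dirty_request {
    uint64_t  iova;      /* start of the range QEMU asks about     */
    uint64_t  size;      /* length of the range in bytes           */
    uint64_t *bitmap;    /* one bit per page, filled by the callee */
};

struct mdev_migration_ops {
    /* optional: NULL means the mdev has no private dirty tracking */
    int (*get_dirty_bitmap)(void *mdev_state, struct dirty_request *req);
};

/* Stub: a real type1 path would walk the pinned-page list and set the
 * corresponding bits in req->bitmap. */
static int type1_fill_pinned_bitmap(void *container, struct dirty_request *req)
{
    (void)container;
    (void)req;
    return 0;
}

/* The flag check: prefer the vendor callback if one was registered at
 * mdev create time, otherwise fall back to the type1 pinned-page view. */
static int get_dirty_bitmap_for_mdev(void *container, void *mdev_state,
                                     const struct mdev_migration_ops *ops,
                                     struct dirty_request *req)
{
    if (ops && ops->get_dirty_bitmap) {
        return ops->get_dirty_bitmap(mdev_state, req);  /* vendor path  */
    }
    return type1_fill_pinned_bitmap(container, req);    /* default path */
}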

Thanks
Kevin

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [Qemu-devel] [RFC 5/5] vifo: introduce new VFIO ioctl VFIO_DEVICE_PCI_GET_DIRTY_BITMAP
  2017-07-07  6:40                   ` Tian, Kevin
@ 2017-07-10 19:47                     ` Alex Williamson
  2017-07-12  8:24                       ` Tian, Kevin
  0 siblings, 1 reply; 13+ messages in thread
From: Alex Williamson @ 2017-07-10 19:47 UTC (permalink / raw)
  To: Tian, Kevin
  Cc: Zhang, Yulei, joonas.lahtinen, qemu-devel, zhenyuw, Zheng, Xiao,
	Wang, Zhi A

On Fri, 7 Jul 2017 06:40:58 +0000
"Tian, Kevin" <kevin.tian@intel.com> wrote:

> > From: Alex Williamson [mailto:alex.williamson@redhat.com]
> > Sent: Saturday, July 1, 2017 1:00 AM
> > 
> > On Fri, 30 Jun 2017 05:14:40 +0000
> > "Tian, Kevin" <kevin.tian@intel.com> wrote:
> >   
> > > > From: Alex Williamson [mailto:alex.williamson@redhat.com]
> > > > Sent: Friday, June 30, 2017 4:57 AM
> > > >
> > > > On Thu, 29 Jun 2017 00:10:59 +0000
> > > > "Tian, Kevin" <kevin.tian@intel.com> wrote:
> > > >  
> > > > > > From: Alex Williamson [mailto:alex.williamson@redhat.com]
> > > > > > Sent: Thursday, June 29, 2017 12:00 AM
> > > > > > Thanks Kevin.  So really it's not really a dirty bitmap, it's just a
> > > > > > bitmap of pages that the device has access to and may have dirtied.
> > > > > > Don't we have this more generally in the vfio type1 IOMMU backend?  
> > For  
> > > > > > a mediated device, we know all the pages that the vendor driver has
> > > > > > asked to be pinned.  Should we perhaps make this interface on the  
> > vfio  
> > > > > > container rather than the device?  Any mediated device can provide  
> > this  
> > > > > > level of detail without specific vendor support.  If we had DMA page
> > > > > > faulting, this would be the natural place to put it as well, so maybe
> > > > > > we should design the interface there to support everything similarly.
> > > > > > Thanks,
> > > > > >  
> > > > >
> > > > > That's a nice idea. Just two comments:
> > > > >
> > > > > 1) If some mediated device has its own way to construct true dirty
> > > > > bitmap (not thru DMA page faulting), the interface is better designed
> > > > > to allow that flexibility. Maybe an optional callback if not registered
> > > > > then use common type1 IOMMU logic otherwise prefers to vendor
> > > > > specific callback  
> > > >
> > > > I'm not sure what that looks like, but I agree with the idea.  Could
> > > > the pages that type1 knows about every be anything other than a
> > > > superset of the dirty pages?  Perhaps a device ioctl to flush unused
> > > > mappings would be sufficient.  
> > >
> > > sorry I didn't quite get your idea here. My understanding is that
> > > type1 is OK as an alternative in case mediated device has no way
> > > to track dirtied pages (as for Intel GPU), so we can use type1 pinned
> > > pages as an indirect way to indicate dirtied pages. But if mediated
> > > device has its own way (e.g. a device private MMU) to track dirty
> > > pages, then we should allow that device to provide dirty bitmap
> > > instead of using type1.  
> > 
> > My thought was that our current mdev iommu interface allows the vendor
> > driver to pin specific pages.  In order for the mdev device to dirty a
> > page, we need for it to be pinned.  Therefore at worst, the set of
> > pages pinned in type1 is the superset of all pages that can potentially
> > be dirtied by the device.  In the worst case, this devolves to all
> > pages mapped through the iommu in the case of direct assigned devices.
> > My assertion is therefore that a device specific dirty page bitmap can
> > only be a subset of the type1 pinned pages.  Therefore if the mdev  
> 
> this assertion is correct.
> 
> > vendor driver can flush any stale pinnings, then the type1 view off
> > pinned pages should match the devices view of the current working set.
> > Then we wouldn't need a device specific dirty bitmap, we'd only need a
> > mechanism to trigger a flush of stale mappings on the device.  
> 
> what's your definition of 'stale pinnings"? take Intel vGPU for example,
> we pin pages when they are mapped to GPU page table and then unpin
> them when unmapped from GPU page table. Based on this policy type1
> view of pinned pages should just match device view.  There is no 'stale' 
> page example which I can think of since all currently-pinned pages must
> be pinned for various read/write purpose. 

Then it seems that the mdev support in the IOMMU already tracks the
minimum page set for Intel vGPUs, so a vendor-specific hook can be a
no-op in that case.  Are we missing some ability to identify pages
through vendor-specific hooks, or were those hooks never really needed?
 
> but I do realize one limitation of this option now. Given there are
> both physical device and mdev assigned to the same guest, we
> have to pin all pages for physically assigned device. In that case we
> may need away to differentiate which device a given pinned page
> is for. I'm not sure whether it's an easy thing or require lots of change.
> interface wise we don't need additional one since there is still
> unpin request from mdev path. Just internal structure needs to 
> be extended to maintain that ownership info.

We can't migrate with a regular assigned device, and this model fails
in the right way, assuming that all memory is dirty, which is correct.
Currently we'd need to detach any directly assigned devices, leaving
only the mdev device.  If we had PRI support, then a way to flush
stale mappings and fault in new ones would be a way to reduce the
working set during migration.  I don't really see that per-device
tracking helps us; we need to consider a page dirty regardless of
who dirtied it.
 
> > 
> > Otherwise I'm not sure how we cleanly create an interface where the
> > dirty bitmap can either come from the device or the container... but
> > I'd welcome suggestions.  Thanks,
> >   
> 
> My original thought is simple:
> 
> - by default type1 can give dirty bitmap
> - when mdev is created, mdev driver has chance to claim whether
> it would like to track its own dirty bitmap
> - then when Qemu requests dirty bitmap of mdev, vfio check the
> flag to decide whether return type1 or call into mdev driver

What would the device do differently?  The container is the logical
component tracking memory and therefore seems the logical place to
track dirty memory.  Potentially there's even some efficiency in
handling the dirty memory for multiple devices via a single container
interface.  It also means that we don't have per-vendor-driver
implementations (and bugs) for dirty bitmaps.  Thanks,

Alex

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [Qemu-devel] [RFC 5/5] vifo: introduce new VFIO ioctl VFIO_DEVICE_PCI_GET_DIRTY_BITMAP
  2017-07-10 19:47                     ` Alex Williamson
@ 2017-07-12  8:24                       ` Tian, Kevin
  0 siblings, 0 replies; 13+ messages in thread
From: Tian, Kevin @ 2017-07-12  8:24 UTC (permalink / raw)
  To: Alex Williamson
  Cc: Zhang, Yulei, joonas.lahtinen, qemu-devel, zhenyuw, Zheng, Xiao,
	Wang, Zhi A

> From: Alex Williamson [mailto:alex.williamson@redhat.com]
> Sent: Tuesday, July 11, 2017 3:47 AM
> 
> On Fri, 7 Jul 2017 06:40:58 +0000
> "Tian, Kevin" <kevin.tian@intel.com> wrote:
> 
> > > From: Alex Williamson [mailto:alex.williamson@redhat.com]
> > > Sent: Saturday, July 1, 2017 1:00 AM
> > >
> > > On Fri, 30 Jun 2017 05:14:40 +0000
> > > "Tian, Kevin" <kevin.tian@intel.com> wrote:
> > >
> > > > > From: Alex Williamson [mailto:alex.williamson@redhat.com]
> > > > > Sent: Friday, June 30, 2017 4:57 AM
> > > > >
> > > > > On Thu, 29 Jun 2017 00:10:59 +0000
> > > > > "Tian, Kevin" <kevin.tian@intel.com> wrote:
> > > > >
> > > > > > > From: Alex Williamson [mailto:alex.williamson@redhat.com]
> > > > > > > Sent: Thursday, June 29, 2017 12:00 AM
> > > > > > > Thanks Kevin.  So really it's not really a dirty bitmap, it's just a
> > > > > > > bitmap of pages that the device has access to and may have dirtied.
> > > > > > > Don't we have this more generally in the vfio type1 IOMMU
> backend?
> > > For
> > > > > > > a mediated device, we know all the pages that the vendor driver
> has
> > > > > > > asked to be pinned.  Should we perhaps make this interface on the
> > > vfio
> > > > > > > container rather than the device?  Any mediated device can
> provide
> > > this
> > > > > > > level of detail without specific vendor support.  If we had DMA
> page
> > > > > > > faulting, this would be the natural place to put it as well, so maybe
> > > > > > > we should design the interface there to support everything
> similarly.
> > > > > > > Thanks,
> > > > > > >
> > > > > >
> > > > > > That's a nice idea. Just two comments:
> > > > > >
> > > > > > 1) If some mediated device has its own way to construct true dirty
> > > > > > bitmap (not thru DMA page faulting), the interface is better
> designed
> > > > > > to allow that flexibility. Maybe an optional callback if not registered
> > > > > > then use common type1 IOMMU logic otherwise prefers to vendor
> > > > > > specific callback
> > > > >
> > > > > I'm not sure what that looks like, but I agree with the idea.  Could
> > > > > the pages that type1 knows about every be anything other than a
> > > > > superset of the dirty pages?  Perhaps a device ioctl to flush unused
> > > > > mappings would be sufficient.
> > > >
> > > > sorry I didn't quite get your idea here. My understanding is that
> > > > type1 is OK as an alternative in case mediated device has no way
> > > > to track dirtied pages (as for Intel GPU), so we can use type1 pinned
> > > > pages as an indirect way to indicate dirtied pages. But if mediated
> > > > device has its own way (e.g. a device private MMU) to track dirty
> > > > pages, then we should allow that device to provide dirty bitmap
> > > > instead of using type1.
> > >
> > > My thought was that our current mdev iommu interface allows the
> vendor
> > > driver to pin specific pages.  In order for the mdev device to dirty a
> > > page, we need for it to be pinned.  Therefore at worst, the set of
> > > pages pinned in type1 is the superset of all pages that can potentially
> > > be dirtied by the device.  In the worst case, this devolves to all
> > > pages mapped through the iommu in the case of direct assigned devices.
> > > My assertion is therefore that a device specific dirty page bitmap can
> > > only be a subset of the type1 pinned pages.  Therefore if the mdev
> >
> > this assertion is correct.
> >
> > > vendor driver can flush any stale pinnings, then the type1 view off
> > > pinned pages should match the devices view of the current working set.
> > > Then we wouldn't need a device specific dirty bitmap, we'd only need a
> > > mechanism to trigger a flush of stale mappings on the device.
> >
> > what's your definition of 'stale pinnings"? take Intel vGPU for example,
> > we pin pages when they are mapped to GPU page table and then unpin
> > them when unmapped from GPU page table. Based on this policy type1
> > view of pinned pages should just match device view.  There is no 'stale'
> > page example which I can think of since all currently-pinned pages must
> > be pinned for various read/write purpose.
> 
> Then it seems that the mdev support in the IOMMU already tracks the
> minimum page set for Intel vGPUs, it can be a noop in that case.  Are
> we missing some ability to identify pages through vendor specific
> hooks, or were those hooks never really needed?

No vendor-specific hook is needed on the Intel side. In the future,
once the IOMMU provides standard dirty-bit tracking, no vendor-specific
hook will be required either. I was just thinking about such a
possibility in the design. :-)

> 
> > but I do realize one limitation of this option now. Given there are
> > both physical device and mdev assigned to the same guest, we
> > have to pin all pages for physically assigned device. In that case we
> > may need away to differentiate which device a given pinned page
> > is for. I'm not sure whether it's an easy thing or require lots of change.
> > interface wise we don't need additional one since there is still
> > unpin request from mdev path. Just internal structure needs to
> > be extended to maintain that ownership info.
> 
> We can't migrate with regular assigned device and this model fails in
> the right way, assuming that all memory is dirty, which is correct.
> Currently we'd need to detach any directly assigned devices, leaving
> only the mdev device.  If we had PRI support, then a way to flush
> stale mappings and fault in new ones would be a way to reduce the
> working set during migration.  I don't really see that per device
> tracking helps us, we need to consider the page dirty regardless of
> who dirtied it.
> 
> > >
> > > Otherwise I'm not sure how we cleanly create an interface where the
> > > dirty bitmap can either come from the device or the container... but
> > > I'd welcome suggestions.  Thanks,
> > >
> >
> > My original thought is simple:
> >
> > - by default type1 can give dirty bitmap
> > - when mdev is created, mdev driver has chance to claim whether
> > it would like to track its own dirty bitmap
> > - then when Qemu requests dirty bitmap of mdev, vfio check the
> > flag to decide whether return type1 or call into mdev driver
> 
> What would the device do differently?  The container is the logical
> component tracking memory and therefore seems the logical component to
> track dirty memory.  Potentially there's even some efficiency that we
> can handle the dirty memory for multiple devices via a container
> interface.  It also means that we don't have per vendor driver
> implementations (and bugs) for dirty bitmaps.  Thanks,
> 

I'm OK with this since I don't have a real example for that case.
Let's go with this proposal: track the dirty bitmap in VFIO through
the type1 interface, with no vendor-specific hook.

Thanks
Kevin

^ permalink raw reply	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2017-07-12  8:24 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
2017-06-26  8:53 [Qemu-devel] [RFC 5/5] vifo: introduce new VFIO ioctl VFIO_DEVICE_PCI_GET_DIRTY_BITMAP Yulei Zhang
2017-06-26 20:19 ` Alex Williamson
2017-06-27  8:56   ` Zhang, Yulei
2017-06-27 19:44     ` Alex Williamson
2017-06-28  6:04       ` Tian, Kevin
2017-06-28 15:59         ` Alex Williamson
2017-06-29  0:10           ` Tian, Kevin
2017-06-29 20:57             ` Alex Williamson
2017-06-30  5:14               ` Tian, Kevin
2017-06-30 16:59                 ` Alex Williamson
2017-07-07  6:40                   ` Tian, Kevin
2017-07-10 19:47                     ` Alex Williamson
2017-07-12  8:24                       ` Tian, Kevin
