* [PATCH 1/3] sample: vfio mdev display - host device
       [not found] <20180409103513.8020-1-kraxel@redhat.com>
@ 2018-04-09 10:35 ` Gerd Hoffmann
  2018-04-24  2:41   ` Alex Williamson
  2018-04-09 10:35 ` [PATCH 2/3] sample: vfio mdev display - guest driver Gerd Hoffmann
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 41+ messages in thread
From: Gerd Hoffmann @ 2018-04-09 10:35 UTC (permalink / raw)
  To: kvm; +Cc: alex.williamson, kwankhede, Gerd Hoffmann, open list

Simple framebuffer display, demo-ing the vfio region display interface
(VFIO_GFX_PLANE_TYPE_REGION).
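
For reference, a minimal userspace sketch (hypothetical: assumes "fd" is
an already-opened vfio device file descriptor for a mdpy instance, all
error handling omitted) of how a display client picks up the
framebuffer:

  struct vfio_device_gfx_plane_info plane = {
          .argsz          = sizeof(plane),
          .flags          = VFIO_GFX_PLANE_TYPE_REGION,
          .drm_plane_type = DRM_PLANE_TYPE_PRIMARY,
  };
  struct vfio_region_info region = { .argsz = sizeof(region) };
  void *fb;

  /* fetch format, size and the region index of the framebuffer */
  ioctl(fd, VFIO_DEVICE_QUERY_GFX_PLANE, &plane);

  /* translate the region index into an offset, then map it */
  region.index = plane.region_index;
  ioctl(fd, VFIO_DEVICE_GET_REGION_INFO, &region);
  fb = mmap(NULL, region.size, PROT_READ, MAP_SHARED, fd, region.offset);

Instances are created through the usual mdev sysfs interface, i.e. by
writing a uuid to mdev_supported_types/<type>/create.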

Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
---
 samples/vfio-mdev/mdpy-defs.h |  19 +
 samples/vfio-mdev/mdpy.c      | 791 ++++++++++++++++++++++++++++++++++++++++++
 samples/Kconfig               |   8 +
 samples/vfio-mdev/Makefile    |   1 +
 4 files changed, 819 insertions(+)
 create mode 100644 samples/vfio-mdev/mdpy-defs.h
 create mode 100644 samples/vfio-mdev/mdpy.c

diff --git a/samples/vfio-mdev/mdpy-defs.h b/samples/vfio-mdev/mdpy-defs.h
new file mode 100644
index 0000000000..79f0795e11
--- /dev/null
+++ b/samples/vfio-mdev/mdpy-defs.h
@@ -0,0 +1,19 @@
+/*
+ * Simple pci display device.
+ *
+ * Framebuffer memory is pci bar 0.
+ * Configuration (read-only) is in pci config space.
+ * Format field uses drm fourcc codes.
+ * At the moment only DRM_FORMAT_XRGB8888 is supported.
+ */
+
+/* pci ids */
+#define MDPY_PCI_VENDOR_ID	0x1b36 /* redhat */
+#define MDPY_PCI_DEVICE_ID	0x00f0
+#define MDPY_PCI_SUBVENDOR_ID	PCI_SUBVENDOR_ID_REDHAT_QUMRANET
+#define MDPY_PCI_SUBDEVICE_ID	PCI_SUBDEVICE_ID_QEMU
+
+/* pci cfg space offsets for fb config (dword) */
+#define MDPY_FORMAT_OFFSET	0x40
+#define MDPY_WIDTH_OFFSET	0x44
+#define MDPY_HEIGHT_OFFSET	0x48
diff --git a/samples/vfio-mdev/mdpy.c b/samples/vfio-mdev/mdpy.c
new file mode 100644
index 0000000000..25882c7b37
--- /dev/null
+++ b/samples/vfio-mdev/mdpy.c
@@ -0,0 +1,791 @@
+/*
+ * Mediated virtual PCI display host device driver
+ *
+ * See mdpy-defs.h for device specs
+ *
+ *   (c) Gerd Hoffmann <kraxel@redhat.com>
+ *
+ * based on mtty driver which is:
+ *   Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
+ *	 Author: Neo Jia <cjia@nvidia.com>
+ *		 Kirti Wankhede <kwankhede@nvidia.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/cdev.h>
+#include <linux/vfio.h>
+#include <linux/iommu.h>
+#include <linux/sysfs.h>
+#include <linux/mdev.h>
+#include <linux/pci.h>
+#include <drm/drm_fourcc.h>
+#include "mdpy-defs.h"
+
+#define MDPY_NAME		"mdpy"
+#define MDPY_CLASS_NAME		"mdpy"
+
+#define MDPY_CONFIG_SPACE_SIZE	0xff
+#define MDPY_MEMORY_BAR_OFFSET	PAGE_SIZE
+#define MDPY_DISPLAY_REGION	16
+
+#define STORE_LE16(addr, val)	(*(u16 *)(addr) = (val))
+#define STORE_LE32(addr, val)	(*(u32 *)(addr) = (val))
+
+
+MODULE_LICENSE("GPL v2");
+
+static int max_devices = 4;
+module_param_named(count, max_devices, int, 0444);
+MODULE_PARM_DESC(count, "number of " MDPY_NAME " devices");
+
+
+#define MDPY_TYPE_1 "vga"
+#define MDPY_TYPE_2 "xga"
+#define MDPY_TYPE_3 "hd"
+
+static const struct mdpy_type {
+	const char *name;
+	u32 format;
+	u32 bytepp;
+	u32 width;
+	u32 height;
+} mdpy_types[] = {
+	{
+		.name	= MDPY_CLASS_NAME "-" MDPY_TYPE_1,
+		.format = DRM_FORMAT_XRGB8888,
+		.bytepp = 4,
+		.width	= 640,
+		.height = 480,
+	},{
+		.name	= MDPY_CLASS_NAME "-" MDPY_TYPE_2,
+		.format = DRM_FORMAT_XRGB8888,
+		.bytepp = 4,
+		.width	= 1024,
+		.height = 768,
+	},{
+		.name	= MDPY_CLASS_NAME "-" MDPY_TYPE_3,
+		.format = DRM_FORMAT_XRGB8888,
+		.bytepp = 4,
+		.width	= 1920,
+		.height = 1080,
+	},
+};
+
+static dev_t		mdpy_devt;
+static struct class	*mdpy_class;
+static struct cdev	mdpy_cdev;
+static struct device	mdpy_dev;
+static u32		mdpy_count;
+
+/* State of each mdev device */
+struct mdev_state {
+	u8 *vconfig;
+	u32 bar_mask;
+	struct mutex ops_lock;
+	struct mdev_device *mdev;
+	struct vfio_device_info dev_info;
+
+	const struct mdpy_type *type;
+	u32 memsize;
+	void *memblk;
+};
+
+static const struct mdpy_type *mdpy_find_type(struct kobject *kobj)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(mdpy_types); i++)
+		if (strcmp(mdpy_types[i].name, kobj->name) == 0)
+			return mdpy_types + i;
+	return NULL;
+}
+
+static void mdpy_create_config_space(struct mdev_state *mdev_state)
+{
+	STORE_LE16((u16 *) &mdev_state->vconfig[PCI_VENDOR_ID],
+		   MDPY_PCI_VENDOR_ID);
+	STORE_LE16((u16 *) &mdev_state->vconfig[PCI_DEVICE_ID],
+		   MDPY_PCI_DEVICE_ID);
+	STORE_LE16((u16 *) &mdev_state->vconfig[PCI_SUBSYSTEM_VENDOR_ID],
+		   MDPY_PCI_SUBVENDOR_ID);
+	STORE_LE16((u16 *) &mdev_state->vconfig[PCI_SUBSYSTEM_ID],
+		   MDPY_PCI_SUBDEVICE_ID);
+
+	STORE_LE16((u16 *) &mdev_state->vconfig[PCI_COMMAND],
+		   PCI_COMMAND_IO | PCI_COMMAND_MEMORY);
+	STORE_LE16((u16 *) &mdev_state->vconfig[PCI_CLASS_DEVICE],
+		   PCI_CLASS_DISPLAY_OTHER);
+	mdev_state->vconfig[PCI_CLASS_REVISION] =  0x01;
+
+	STORE_LE32((u32 *) &mdev_state->vconfig[PCI_BASE_ADDRESS_0],
+		   PCI_BASE_ADDRESS_SPACE_MEMORY |
+		   PCI_BASE_ADDRESS_MEM_TYPE_32	 |
+		   PCI_BASE_ADDRESS_MEM_PREFETCH);
+	mdev_state->bar_mask = ~(mdev_state->memsize) + 1;
+
+	/* Vendor specific data */
+	STORE_LE32((u32 *) &mdev_state->vconfig[MDPY_FORMAT_OFFSET],
+		   mdev_state->type->format);
+	STORE_LE32((u32 *) &mdev_state->vconfig[MDPY_WIDTH_OFFSET],
+		   mdev_state->type->width);
+	STORE_LE32((u32 *) &mdev_state->vconfig[MDPY_HEIGHT_OFFSET],
+		   mdev_state->type->height);
+}
+
+static void handle_pci_cfg_write(struct mdev_state *mdev_state, u16 offset,
+				 char *buf, u32 count)
+{
+	struct device *dev = mdev_dev(mdev_state->mdev);
+	u32 cfg_addr;
+
+	switch (offset) {
+	case PCI_BASE_ADDRESS_0:
+		cfg_addr = *(u32 *)buf;
+
+		if (cfg_addr == 0xffffffff) {
+			cfg_addr = (cfg_addr & mdev_state->bar_mask);
+		} else {
+			cfg_addr &= PCI_BASE_ADDRESS_MEM_MASK;
+			if (cfg_addr)
+				dev_info(dev, "BAR0 @ 0x%x\n", cfg_addr);
+		}
+
+		cfg_addr |= (mdev_state->vconfig[offset] &
+			     ~PCI_BASE_ADDRESS_MEM_MASK);
+		STORE_LE32(&mdev_state->vconfig[offset], cfg_addr);
+		break;
+	}
+}
+
+static ssize_t mdev_access(struct mdev_device *mdev, char *buf, size_t count,
+			   loff_t pos, bool is_write)
+{
+	struct mdev_state *mdev_state = mdev_get_drvdata(mdev);
+	struct device *dev = mdev_dev(mdev);
+	int ret = 0;
+
+	mutex_lock(&mdev_state->ops_lock);
+
+	if (pos < MDPY_CONFIG_SPACE_SIZE) {
+		if (is_write) {
+			handle_pci_cfg_write(mdev_state, pos, buf, count);
+		} else {
+			memcpy(buf, (mdev_state->vconfig + pos), count);
+		}
+
+	} else {
+		dev_info(dev, "%s: %s @0x%llx (unhandled)\n",
+			 __func__, is_write ? "WR" : "RD", pos);
+		ret = -1;
+		goto accessfailed;
+	}
+
+	ret = count;
+
+
+accessfailed:
+	mutex_unlock(&mdev_state->ops_lock);
+
+	return ret;
+}
+
+int mdpy_reset(struct mdev_device *mdev)
+{
+	struct mdev_state *mdev_state = mdev_get_drvdata(mdev);
+	u32 stride, i;
+
+	/* initialize with gray gradient */
+	stride = mdev_state->type->width * mdev_state->type->bytepp;
+	for (i = 0; i < mdev_state->type->height; i++)
+		memset(mdev_state->memblk + i * stride,
+		       i * 255 / mdev_state->type->height,
+		       stride);
+	return 0;
+}
+
+int mdpy_create(struct kobject *kobj, struct mdev_device *mdev)
+{
+	const struct mdpy_type *type = mdpy_find_type(kobj);
+	struct device *dev = mdev_dev(mdev);
+	struct mdev_state *mdev_state;
+	u32 fbsize;
+
+	if (mdpy_count >= max_devices)
+		return -ENOMEM;
+
+	mdev_state = kzalloc(sizeof(struct mdev_state), GFP_KERNEL);
+	if (mdev_state == NULL)
+		return -ENOMEM;
+
+	mdev_state->vconfig = kzalloc(MDPY_CONFIG_SPACE_SIZE, GFP_KERNEL);
+	if (mdev_state->vconfig == NULL) {
+		kfree(mdev_state);
+		return -ENOMEM;
+	}
+
+	if (!type)
+		type = &mdpy_types[0];
+	fbsize = roundup_pow_of_two(type->width * type->height * type->bytepp);
+
+	mdev_state->memblk = vmalloc_user(fbsize);
+	if (!mdev_state->memblk) {
+		kfree(mdev_state->vconfig);
+		kfree(mdev_state);
+		return -ENOMEM;
+	}
+	dev_info(dev, "%s: %s (%dx%d)\n",
+		 __func__, kobj->name, type->width, type->height);
+
+	mutex_init(&mdev_state->ops_lock);
+	mdev_state->mdev = mdev;
+	mdev_set_drvdata(mdev, mdev_state);
+
+	mdev_state->type    = type;
+	mdev_state->memsize = fbsize;
+	mdpy_create_config_space(mdev_state);
+	mdpy_reset(mdev);
+
+	mdpy_count++;
+	return 0;
+}
+
+int mdpy_remove(struct mdev_device *mdev)
+{
+	struct mdev_state *mdev_state = mdev_get_drvdata(mdev);
+	struct device *dev = mdev_dev(mdev);
+
+	dev_info(dev, "%s\n", __func__);
+
+	mdev_set_drvdata(mdev, NULL);
+	vfree(mdev_state->memblk);
+	kfree(mdev_state->vconfig);
+	kfree(mdev_state);
+
+	mdpy_count--;
+	return 0;
+}
+
+ssize_t mdpy_read(struct mdev_device *mdev, char __user *buf, size_t count,
+		  loff_t *ppos)
+{
+	unsigned int done = 0;
+	int ret;
+
+	while (count) {
+		size_t filled;
+
+		if (count >= 4 && !(*ppos % 4)) {
+			u32 val;
+
+			ret =  mdev_access(mdev, (char *)&val, sizeof(val),
+					   *ppos, false);
+			if (ret <= 0)
+				goto read_err;
+
+			if (copy_to_user(buf, &val, sizeof(val)))
+				goto read_err;
+
+			filled = 4;
+		} else if (count >= 2 && !(*ppos % 2)) {
+			u16 val;
+
+			ret = mdev_access(mdev, (char *)&val, sizeof(val),
+					  *ppos, false);
+			if (ret <= 0)
+				goto read_err;
+
+			if (copy_to_user(buf, &val, sizeof(val)))
+				goto read_err;
+
+			filled = 2;
+		} else {
+			u8 val;
+
+			ret = mdev_access(mdev, (char *)&val, sizeof(val),
+					  *ppos, false);
+			if (ret <= 0)
+				goto read_err;
+
+			if (copy_to_user(buf, &val, sizeof(val)))
+				goto read_err;
+
+			filled = 1;
+		}
+
+		count -= filled;
+		done += filled;
+		*ppos += filled;
+		buf += filled;
+	}
+
+	return done;
+
+read_err:
+	return -EFAULT;
+}
+
+ssize_t mdpy_write(struct mdev_device *mdev, const char __user *buf,
+		   size_t count, loff_t *ppos)
+{
+	unsigned int done = 0;
+	int ret;
+
+	while (count) {
+		size_t filled;
+
+		if (count >= 4 && !(*ppos % 4)) {
+			u32 val;
+
+			if (copy_from_user(&val, buf, sizeof(val)))
+				goto write_err;
+
+			ret = mdev_access(mdev, (char *)&val, sizeof(val),
+					  *ppos, true);
+			if (ret <= 0)
+				goto write_err;
+
+			filled = 4;
+		} else if (count >= 2 && !(*ppos % 2)) {
+			u16 val;
+
+			if (copy_from_user(&val, buf, sizeof(val)))
+				goto write_err;
+
+			ret = mdev_access(mdev, (char *)&val, sizeof(val),
+					  *ppos, true);
+			if (ret <= 0)
+				goto write_err;
+
+			filled = 2;
+		} else {
+			u8 val;
+
+			if (copy_from_user(&val, buf, sizeof(val)))
+				goto write_err;
+
+			ret = mdev_access(mdev, (char *)&val, sizeof(val),
+					  *ppos, true);
+			if (ret <= 0)
+				goto write_err;
+
+			filled = 1;
+		}
+		count -= filled;
+		done += filled;
+		*ppos += filled;
+		buf += filled;
+	}
+
+	return done;
+write_err:
+	return -EFAULT;
+}
+
+int mdpy_mmap(struct mdev_device *mdev, struct vm_area_struct *vma)
+{
+	struct mdev_state *mdev_state = mdev_get_drvdata(mdev);
+
+	if (vma->vm_pgoff != MDPY_MEMORY_BAR_OFFSET >> PAGE_SHIFT)
+		return -EINVAL;
+	if (vma->vm_end < vma->vm_start)
+		return -EINVAL;
+	if (vma->vm_end - vma->vm_start > mdev_state->memsize)
+		return -EINVAL;
+	if ((vma->vm_flags & VM_SHARED) == 0)
+		return -EINVAL;
+
+	return remap_vmalloc_range_partial(vma, vma->vm_start,
+					   mdev_state->memblk,
+					   vma->vm_end - vma->vm_start);
+}
+
+int mdpy_get_region_info(struct mdev_device *mdev,
+			 struct vfio_region_info *region_info,
+			 u16 *cap_type_id, void **cap_type)
+{
+	struct mdev_state *mdev_state;
+
+	mdev_state = mdev_get_drvdata(mdev);
+	if (!mdev_state)
+		return -EINVAL;
+
+	if (region_info->index >= VFIO_PCI_NUM_REGIONS &&
+	    region_info->index != MDPY_DISPLAY_REGION)
+		return -EINVAL;
+
+	switch (region_info->index) {
+	case VFIO_PCI_CONFIG_REGION_INDEX:
+		region_info->offset = 0;
+		region_info->size   = MDPY_CONFIG_SPACE_SIZE;
+		region_info->flags  = (VFIO_REGION_INFO_FLAG_READ |
+				       VFIO_REGION_INFO_FLAG_WRITE);
+		break;
+	case VFIO_PCI_BAR0_REGION_INDEX:
+	case MDPY_DISPLAY_REGION:
+		region_info->offset = MDPY_MEMORY_BAR_OFFSET;
+		region_info->size   = mdev_state->memsize;
+		region_info->flags  = (VFIO_REGION_INFO_FLAG_READ  |
+				       VFIO_REGION_INFO_FLAG_WRITE |
+				       VFIO_REGION_INFO_FLAG_MMAP);
+		break;
+	default:
+		region_info->size   = 0;
+		region_info->offset = 0;
+		region_info->flags  = 0;
+	}
+
+	return 0;
+}
+
+int mdpy_get_irq_info(struct mdev_device *mdev, struct vfio_irq_info *irq_info)
+{
+	irq_info->count = 0;
+	return 0;
+}
+
+int mdpy_get_device_info(struct mdev_device *mdev,
+			 struct vfio_device_info *dev_info)
+{
+	dev_info->flags = VFIO_DEVICE_FLAGS_PCI;
+	dev_info->num_regions = VFIO_PCI_NUM_REGIONS;
+	dev_info->num_irqs = VFIO_PCI_NUM_IRQS;
+	return 0;
+}
+
+int mdpy_query_gfx_plane(struct mdev_device *mdev,
+			 struct vfio_device_gfx_plane_info *plane)
+{
+	struct mdev_state *mdev_state = mdev_get_drvdata(mdev);
+
+	if (plane->flags & VFIO_GFX_PLANE_TYPE_PROBE) {
+		if (plane->flags == (VFIO_GFX_PLANE_TYPE_PROBE |
+				     VFIO_GFX_PLANE_TYPE_REGION))
+			return 0;
+		return -EINVAL;
+	}
+
+	if (plane->flags != VFIO_GFX_PLANE_TYPE_REGION)
+		return -EINVAL;
+
+	plane->drm_format     = mdev_state->type->format;
+	plane->width	      = mdev_state->type->width;
+	plane->height	      = mdev_state->type->height;
+	plane->stride	      = (mdev_state->type->width *
+				 mdev_state->type->bytepp);
+	plane->size	      = mdev_state->memsize;
+	plane->region_index   = MDPY_DISPLAY_REGION;
+
+	/* unused */
+	plane->drm_format_mod = 0;
+	plane->x_pos	      = 0;
+	plane->y_pos	      = 0;
+	plane->x_hot	      = 0;
+	plane->y_hot	      = 0;
+
+	return 0;
+}
+
+static long mdpy_ioctl(struct mdev_device *mdev, unsigned int cmd,
+			unsigned long arg)
+{
+	int ret = 0;
+	unsigned long minsz;
+	struct mdev_state *mdev_state;
+
+	mdev_state = mdev_get_drvdata(mdev);
+
+	switch (cmd) {
+	case VFIO_DEVICE_GET_INFO:
+	{
+		struct vfio_device_info info;
+
+		minsz = offsetofend(struct vfio_device_info, num_irqs);
+
+		if (copy_from_user(&info, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (info.argsz < minsz)
+			return -EINVAL;
+
+		ret = mdpy_get_device_info(mdev, &info);
+		if (ret)
+			return ret;
+
+		memcpy(&mdev_state->dev_info, &info, sizeof(info));
+
+		if (copy_to_user((void __user *)arg, &info, minsz))
+			return -EFAULT;
+
+		return 0;
+	}
+	case VFIO_DEVICE_GET_REGION_INFO:
+	{
+		struct vfio_region_info info;
+		u16 cap_type_id = 0;
+		void *cap_type = NULL;
+
+		minsz = offsetofend(struct vfio_region_info, offset);
+
+		if (copy_from_user(&info, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (info.argsz < minsz)
+			return -EINVAL;
+
+		ret = mdpy_get_region_info(mdev, &info, &cap_type_id,
+					   &cap_type);
+		if (ret)
+			return ret;
+
+		if (copy_to_user((void __user *)arg, &info, minsz))
+			return -EFAULT;
+
+		return 0;
+	}
+
+	case VFIO_DEVICE_GET_IRQ_INFO:
+	{
+		struct vfio_irq_info info;
+
+		minsz = offsetofend(struct vfio_irq_info, count);
+
+		if (copy_from_user(&info, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if ((info.argsz < minsz) ||
+		    (info.index >= mdev_state->dev_info.num_irqs))
+			return -EINVAL;
+
+		ret = mdpy_get_irq_info(mdev, &info);
+		if (ret)
+			return ret;
+
+		if (copy_to_user((void __user *)arg, &info, minsz))
+			return -EFAULT;
+
+		return 0;
+	}
+
+	case VFIO_DEVICE_QUERY_GFX_PLANE:
+	{
+		struct vfio_device_gfx_plane_info plane;
+
+		minsz = offsetofend(struct vfio_device_gfx_plane_info,
+				    region_index);
+
+		if (copy_from_user(&plane, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (plane.argsz < minsz)
+			return -EINVAL;
+
+		ret = mdpy_query_gfx_plane(mdev, &plane);
+		if (ret)
+			return ret;
+
+		if (copy_to_user((void __user *)arg, &plane, minsz))
+			return -EFAULT;
+
+		return 0;
+	}
+
+	case VFIO_DEVICE_SET_IRQS:
+		return -EINVAL;
+
+	case VFIO_DEVICE_RESET:
+		return mdpy_reset(mdev);
+	}
+	return -ENOTTY;
+}
+
+int mdpy_open(struct mdev_device *mdev)
+{
+	if (!try_module_get(THIS_MODULE))
+		return -ENODEV;
+
+	return 0;
+}
+
+void mdpy_close(struct mdev_device *mdev)
+{
+	module_put(THIS_MODULE);
+}
+
+static ssize_t
+resolution_show(struct device *dev, struct device_attribute *attr,
+		char *buf)
+{
+	struct mdev_device *mdev = mdev_from_dev(dev);
+	struct mdev_state *mdev_state = mdev_get_drvdata(mdev);
+
+	return sprintf(buf, "%dx%d\n",
+		       mdev_state->type->width,
+		       mdev_state->type->height);
+}
+static DEVICE_ATTR_RO(resolution);
+
+static struct attribute *mdev_dev_attrs[] = {
+	&dev_attr_resolution.attr,
+	NULL,
+};
+
+static const struct attribute_group mdev_dev_group = {
+	.name  = "vendor",
+	.attrs = mdev_dev_attrs,
+};
+
+const struct attribute_group *mdev_dev_groups[] = {
+	&mdev_dev_group,
+	NULL,
+};
+
+static ssize_t
+name_show(struct kobject *kobj, struct device *dev, char *buf)
+{
+	return sprintf(buf, "%s\n", kobj->name);
+}
+MDEV_TYPE_ATTR_RO(name);
+
+static ssize_t
+description_show(struct kobject *kobj, struct device *dev, char *buf)
+{
+	const struct mdpy_type *type = mdpy_find_type(kobj);
+
+	return sprintf(buf, "virtual display, %dx%d framebuffer\n",
+		       type ? type->width  : 0,
+		       type ? type->height : 0);
+}
+MDEV_TYPE_ATTR_RO(description);
+
+static ssize_t
+available_instances_show(struct kobject *kobj, struct device *dev, char *buf)
+{
+	return sprintf(buf, "%d\n", max_devices - mdpy_count);
+}
+MDEV_TYPE_ATTR_RO(available_instances);
+
+static ssize_t device_api_show(struct kobject *kobj, struct device *dev,
+			       char *buf)
+{
+	return sprintf(buf, "%s\n", VFIO_DEVICE_API_PCI_STRING);
+}
+MDEV_TYPE_ATTR_RO(device_api);
+
+static struct attribute *mdev_types_attrs[] = {
+	&mdev_type_attr_name.attr,
+	&mdev_type_attr_description.attr,
+	&mdev_type_attr_device_api.attr,
+	&mdev_type_attr_available_instances.attr,
+	NULL,
+};
+
+static struct attribute_group mdev_type_group1 = {
+	.name  = MDPY_TYPE_1,
+	.attrs = mdev_types_attrs,
+};
+
+static struct attribute_group mdev_type_group2 = {
+	.name  = MDPY_TYPE_2,
+	.attrs = mdev_types_attrs,
+};
+
+static struct attribute_group mdev_type_group3 = {
+	.name  = MDPY_TYPE_3,
+	.attrs = mdev_types_attrs,
+};
+
+static struct attribute_group *mdev_type_groups[] = {
+	&mdev_type_group1,
+	&mdev_type_group2,
+	&mdev_type_group3,
+	NULL,
+};
+
+static const struct mdev_parent_ops mdev_fops = {
+	.owner			= THIS_MODULE,
+	.mdev_attr_groups	= mdev_dev_groups,
+	.supported_type_groups	= mdev_type_groups,
+	.create			= mdpy_create,
+	.remove			= mdpy_remove,
+	.open			= mdpy_open,
+	.release		= mdpy_close,
+	.read			= mdpy_read,
+	.write			= mdpy_write,
+	.ioctl			= mdpy_ioctl,
+	.mmap			= mdpy_mmap,
+};
+
+static const struct file_operations vd_fops = {
+	.owner		= THIS_MODULE,
+};
+
+static void mdpy_device_release(struct device *dev)
+{
+	/* nothing */
+}
+
+static int __init mdpy_dev_init(void)
+{
+	int ret = 0;
+
+	ret = alloc_chrdev_region(&mdpy_devt, 0, MINORMASK, MDPY_NAME);
+	if (ret < 0) {
+		pr_err("Error: failed to register mdpy_dev, err: %d\n", ret);
+		return ret;
+	}
+	cdev_init(&mdpy_cdev, &vd_fops);
+	cdev_add(&mdpy_cdev, mdpy_devt, MINORMASK);
+	pr_info("%s: major %d\n", __func__, MAJOR(mdpy_devt));
+
+	mdpy_class = class_create(THIS_MODULE, MDPY_CLASS_NAME);
+	if (IS_ERR(mdpy_class)) {
+		pr_err("Error: failed to register mdpy_dev class\n");
+		ret = PTR_ERR(mdpy_class);
+		goto failed1;
+	}
+	mdpy_dev.class = mdpy_class;
+	mdpy_dev.release = mdpy_device_release;
+	dev_set_name(&mdpy_dev, "%s", MDPY_NAME);
+
+	ret = device_register(&mdpy_dev);
+	if (ret)
+		goto failed2;
+
+	ret = mdev_register_device(&mdpy_dev, &mdev_fops);
+	if (ret)
+		goto failed3;
+
+	return 0;
+
+failed3:
+	device_unregister(&mdpy_dev);
+failed2:
+	class_destroy(mdpy_class);
+failed1:
+	cdev_del(&mdpy_cdev);
+	unregister_chrdev_region(mdpy_devt, MINORMASK);
+	return ret;
+}
+
+static void __exit mdpy_dev_exit(void)
+{
+	mdpy_dev.bus = NULL;
+	mdev_unregister_device(&mdpy_dev);
+
+	device_unregister(&mdpy_dev);
+	cdev_del(&mdpy_cdev);
+	unregister_chrdev_region(mdpy_devt, MINORMASK);
+	class_destroy(mdpy_class);
+	mdpy_class = NULL;
+}
+
+module_init(mdpy_dev_init)
+module_exit(mdpy_dev_exit)
diff --git a/samples/Kconfig b/samples/Kconfig
index c332a3b9de..a0c104adda 100644
--- a/samples/Kconfig
+++ b/samples/Kconfig
@@ -111,6 +111,14 @@ config SAMPLE_VFIO_MDEV_MTTY
 	  Build a virtual tty sample driver for use as a VFIO
 	  mediated device
 
+config SAMPLE_VFIO_MDEV_MDPY
+	tristate "Build VFIO mdpy example mediated device sample code -- loadable modules only"
+	depends on VFIO_MDEV_DEVICE && m
+	help
+	  Build a virtual display sample driver for use as a VFIO
+	  mediated device.  It is a simple framebuffer and supports
+	  the region display interface (VFIO_GFX_PLANE_TYPE_REGION).
+
 config SAMPLE_STATX
 	bool "Build example extended-stat using code"
 	depends on BROKEN
diff --git a/samples/vfio-mdev/Makefile b/samples/vfio-mdev/Makefile
index cbbd868a50..031d6b88e9 100644
--- a/samples/vfio-mdev/Makefile
+++ b/samples/vfio-mdev/Makefile
@@ -1 +1,2 @@
 obj-$(CONFIG_SAMPLE_VFIO_MDEV_MTTY) += mtty.o
+obj-$(CONFIG_SAMPLE_VFIO_MDEV_MDPY) += mdpy.o
-- 
2.9.3


* [PATCH 2/3] sample: vfio mdev display - guest driver
       [not found] <20180409103513.8020-1-kraxel@redhat.com>
  2018-04-09 10:35 ` [PATCH 1/3] sample: vfio mdev display - host device Gerd Hoffmann
@ 2018-04-09 10:35 ` Gerd Hoffmann
  2018-04-11 20:39   ` Bjorn Helgaas
                     ` (2 more replies)
  2018-04-09 10:35 ` [PATCH 3/3] sample: vfio bochs vbe display (host device for bochs-drm) Gerd Hoffmann
  2018-04-18 18:31 ` [libvirt] [PATCH 0/3] sample: vfio mdev display devices Alex Williamson
  3 siblings, 3 replies; 41+ messages in thread
From: Gerd Hoffmann @ 2018-04-09 10:35 UTC (permalink / raw)
  To: kvm; +Cc: alex.williamson, kwankhede, Gerd Hoffmann, open list

Guest fbdev driver for CONFIG_SAMPLE_VFIO_MDEV_MDPY.
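
Once bound, the device shows up as a standard fbdev node in the guest.
As an illustration only (hypothetical: assumes the device ends up as
/dev/fb0 and the fixed XRGB8888 format from mdpy-defs.h), a pixel can be
set from guest userspace like this:

  int fd = open("/dev/fb0", O_RDWR);
  struct fb_var_screeninfo var;
  uint32_t *pixels;

  ioctl(fd, FBIOGET_VSCREENINFO, &var);
  pixels = mmap(NULL, var.xres * var.yres * 4,
                PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
  pixels[10 * var.xres + 10] = 0x00ff0000;	/* red dot at (10,10) */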

Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
---
 samples/vfio-mdev/mdpy-fb.c | 232 ++++++++++++++++++++++++++++++++++++++++++++
 samples/Kconfig             |   9 ++
 samples/vfio-mdev/Makefile  |   1 +
 3 files changed, 242 insertions(+)
 create mode 100644 samples/vfio-mdev/mdpy-fb.c

diff --git a/samples/vfio-mdev/mdpy-fb.c b/samples/vfio-mdev/mdpy-fb.c
new file mode 100644
index 0000000000..0ebd8feead
--- /dev/null
+++ b/samples/vfio-mdev/mdpy-fb.c
@@ -0,0 +1,232 @@
+/*
+ * Framebuffer driver for mdpy (mediated virtual pci display device).
+ *
+ * See mdpy-defs.h for device specs
+ *
+ *   (c) Gerd Hoffmann <kraxel@redhat.com>
+ *
+ * Using some code snippets from simplefb and cirrusfb.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+#include <linux/errno.h>
+#include <linux/fb.h>
+#include <linux/io.h>
+#include <linux/pci.h>
+#include <linux/module.h>
+#include <drm/drm_fourcc.h>
+#include "mdpy-defs.h"
+
+static const struct fb_fix_screeninfo mdpy_fb_fix = {
+	.id		= "mdpy-fb",
+	.type		= FB_TYPE_PACKED_PIXELS,
+	.visual		= FB_VISUAL_TRUECOLOR,
+	.accel		= FB_ACCEL_NONE,
+};
+
+static const struct fb_var_screeninfo mdpy_fb_var = {
+	.height		= -1,
+	.width		= -1,
+	.activate	= FB_ACTIVATE_NOW,
+	.vmode		= FB_VMODE_NONINTERLACED,
+
+	.bits_per_pixel = 32,
+	.transp.offset	= 24,
+	.red.offset	= 16,
+	.green.offset	= 8,
+	.blue.offset	= 0,
+	.transp.length	= 8,
+	.red.length	= 8,
+	.green.length	= 8,
+	.blue.length	= 8,
+};
+
+#define PSEUDO_PALETTE_SIZE 16
+
+struct mdpy_fb_par {
+	u32 palette[PSEUDO_PALETTE_SIZE];
+};
+
+static int mdpy_fb_setcolreg(u_int regno, u_int red, u_int green, u_int blue,
+			      u_int transp, struct fb_info *info)
+{
+	u32 *pal = info->pseudo_palette;
+	u32 cr = red >> (16 - info->var.red.length);
+	u32 cg = green >> (16 - info->var.green.length);
+	u32 cb = blue >> (16 - info->var.blue.length);
+	u32 value;
+
+	if (regno >= PSEUDO_PALETTE_SIZE)
+		return -EINVAL;
+
+	value = (cr << info->var.red.offset) |
+		(cg << info->var.green.offset) |
+		(cb << info->var.blue.offset);
+	if (info->var.transp.length > 0) {
+		u32 mask = (1 << info->var.transp.length) - 1;
+		mask <<= info->var.transp.offset;
+		value |= mask;
+	}
+	pal[regno] = value;
+
+	return 0;
+}
+
+static void mdpy_fb_destroy(struct fb_info *info)
+{
+	if (info->screen_base)
+		iounmap(info->screen_base);
+}
+
+static struct fb_ops mdpy_fb_ops = {
+	.owner		= THIS_MODULE,
+	.fb_destroy	= mdpy_fb_destroy,
+	.fb_setcolreg	= mdpy_fb_setcolreg,
+	.fb_fillrect	= cfb_fillrect,
+	.fb_copyarea	= cfb_copyarea,
+	.fb_imageblit	= cfb_imageblit,
+};
+
+static int mdpy_fb_probe(struct pci_dev *pdev,
+			 const struct pci_device_id *ent)
+{
+	struct fb_info *info;
+	struct mdpy_fb_par *par;
+	u32 format, width, height;
+	int ret;
+
+	ret = pci_enable_device(pdev);
+	if (ret < 0)
+		return ret;
+
+	ret = pci_request_regions(pdev, "mdpy-fb");
+	if (ret < 0)
+		return ret;
+
+	pci_read_config_dword(pdev, MDPY_FORMAT_OFFSET, &format);
+	pci_read_config_dword(pdev, MDPY_WIDTH_OFFSET,	&width);
+	pci_read_config_dword(pdev, MDPY_HEIGHT_OFFSET, &height);
+	if (format != DRM_FORMAT_XRGB8888) {
+		dev_err(&pdev->dev, "format mismatch (0x%x != 0x%x)\n",
+			format, DRM_FORMAT_XRGB8888);
+		ret = -EINVAL;
+		goto err_release_regions;
+	}
+	if (width < 100 || width > 10000) {
+		dev_err(&pdev->dev, "width (%d) out of range\n", width);
+		ret = -EINVAL;
+		goto err_release_regions;
+	}
+	if (height < 100 || height > 10000) {
+		dev_err(&pdev->dev, "height (%d) out of range\n", height);
+		ret = -EINVAL;
+		goto err_release_regions;
+	}
+	dev_info(&pdev->dev, "mdpy found: %dx%d framebuffer\n",
+		 width, height);
+
+	info = framebuffer_alloc(sizeof(struct mdpy_fb_par), &pdev->dev);
+	if (!info) {
+		ret = -ENOMEM;
+		goto err_release_regions;
+	}
+	pci_set_drvdata(pdev, info);
+	par = info->par;
+
+	info->fix = mdpy_fb_fix;
+	info->fix.smem_start = pci_resource_start(pdev, 0);
+	info->fix.smem_len = pci_resource_len(pdev, 0);
+	info->fix.line_length = width * 4;
+
+	info->var = mdpy_fb_var;
+	info->var.xres = width;
+	info->var.yres = height;
+	info->var.xres_virtual = width;
+	info->var.yres_virtual = height;
+
+	info->screen_size = info->fix.smem_len;
+	info->screen_base = ioremap(info->fix.smem_start,
+				    info->screen_size);
+	if (!info->screen_base) {
+		dev_err(&pdev->dev, "ioremap(pcibar) failed\n");
+		ret = -EIO;
+		goto err_release_fb;
+	}
+
+	info->apertures = alloc_apertures(1);
+	if (!info->apertures) {
+		ret = -ENOMEM;
+		goto err_unmap;
+	}
+	info->apertures->ranges[0].base = info->fix.smem_start;
+	info->apertures->ranges[0].size = info->fix.smem_len;
+
+	info->fbops = &mdpy_fb_ops;
+	info->flags = FBINFO_DEFAULT;
+	info->pseudo_palette = par->palette;
+
+	ret = register_framebuffer(info);
+	if (ret < 0) {
+		dev_err(&pdev->dev,
+			"mdpy-fb device register failed: %d\n", ret);
+		goto err_unmap;
+	}
+
+	dev_info(&pdev->dev, "fb%d registered\n", info->node);
+	return 0;
+
+err_unmap:
+	iounmap(info->screen_base);
+
+err_release_fb:
+	framebuffer_release(info);
+
+err_release_regions:
+	pci_release_regions(pdev);
+
+	return ret;
+}
+
+static void mdpy_fb_remove(struct pci_dev *pdev)
+{
+	struct fb_info *info = pci_get_drvdata(pdev);
+
+	unregister_framebuffer(info);
+	framebuffer_release(info);
+}
+
+static struct pci_device_id mdpy_fb_pci_table[] = {
+	{
+		.vendor	   = MDPY_PCI_VENDOR_ID,
+		.device	   = MDPY_PCI_DEVICE_ID,
+		.subvendor = MDPY_PCI_SUBVENDOR_ID,
+		.subdevice = MDPY_PCI_SUBDEVICE_ID,
+	},{
+		/* end of list */
+	}
+};
+
+static struct pci_driver mdpy_fb_pci_driver = {
+	.name		= "mdpy-fb",
+	.id_table	= mdpy_fb_pci_table,
+	.probe		= mdpy_fb_probe,
+	.remove		= mdpy_fb_remove,
+};
+
+static int __init mdpy_fb_init(void)
+{
+	return pci_register_driver(&mdpy_fb_pci_driver);
+}
+
+static void __exit mdpy_fb_exit(void)
+{
+	pci_unregister_driver(&mdpy_fb_pci_driver);
+}
+
+module_init(mdpy_fb_init);
+module_exit(mdpy_fb_exit);
+
+MODULE_DEVICE_TABLE(pci, mdpy_fb_pci_table);
+MODULE_LICENSE("GPL v2");
diff --git a/samples/Kconfig b/samples/Kconfig
index a0c104adda..755430c788 100644
--- a/samples/Kconfig
+++ b/samples/Kconfig
@@ -119,6 +119,15 @@ config SAMPLE_VFIO_MDEV_MDPY
 	  mediated device.  It is a simple framebuffer and supports
 	  the region display interface (VFIO_GFX_PLANE_TYPE_REGION).
 
+config SAMPLE_VFIO_MDEV_MDPY_FB
+	tristate "Build VFIO mdpy example guest fbdev driver -- loadable module only"
+	depends on FB && m
+	select FB_CFB_FILLRECT
+	select FB_CFB_COPYAREA
+	select FB_CFB_IMAGEBLIT
+	help
+	  Guest fbdev driver for the virtual display sample driver.
+
 config SAMPLE_STATX
 	bool "Build example extended-stat using code"
 	depends on BROKEN
diff --git a/samples/vfio-mdev/Makefile b/samples/vfio-mdev/Makefile
index 031d6b88e9..7a5790aaec 100644
--- a/samples/vfio-mdev/Makefile
+++ b/samples/vfio-mdev/Makefile
@@ -1,2 +1,3 @@
 obj-$(CONFIG_SAMPLE_VFIO_MDEV_MTTY) += mtty.o
 obj-$(CONFIG_SAMPLE_VFIO_MDEV_MDPY) += mdpy.o
+obj-$(CONFIG_SAMPLE_VFIO_MDEV_MDPY_FB) += mdpy-fb.o
-- 
2.9.3


* [PATCH 3/3] sample: vfio bochs vbe display (host device for bochs-drm)
       [not found] <20180409103513.8020-1-kraxel@redhat.com>
  2018-04-09 10:35 ` [PATCH 1/3] sample: vfio mdev display - host device Gerd Hoffmann
  2018-04-09 10:35 ` [PATCH 2/3] sample: vfio mdev display - guest driver Gerd Hoffmann
@ 2018-04-09 10:35 ` Gerd Hoffmann
  2018-04-24  3:05   ` Alex Williamson
  2018-04-18 18:31 ` [libvirt] [PATCH 0/3] sample: vfio mdev display devices Alex Williamson
  3 siblings, 1 reply; 41+ messages in thread
From: Gerd Hoffmann @ 2018-04-09 10:35 UTC (permalink / raw)
  To: kvm; +Cc: alex.williamson, kwankhede, Gerd Hoffmann, open list

Display device, demo-ing the vfio dmabuf display interface
(VFIO_GFX_PLANE_TYPE_DMABUF).  Compatible enough with the qemu stdvga
that bochs-drm.ko can be used as the guest driver.
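
For reference, a minimal userspace sketch (hypothetical: assumes "fd" is
an already-opened vfio device file descriptor for a mbochs instance with
an enabled display, all error handling omitted):

  struct vfio_device_gfx_plane_info plane = {
          .argsz          = sizeof(plane),
          .flags          = VFIO_GFX_PLANE_TYPE_DMABUF,
          .drm_plane_type = DRM_PLANE_TYPE_PRIMARY,
  };
  int dmabuf_fd;

  /* fetch mode and the id of the current framebuffer dma-buf */
  ioctl(fd, VFIO_DEVICE_QUERY_GFX_PLANE, &plane);

  /* export the dma-buf as file descriptor, ready for egl import */
  dmabuf_fd = ioctl(fd, VFIO_DEVICE_GET_GFX_DMABUF, &plane.dmabuf_id);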

Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
---
 samples/vfio-mdev/mbochs.c | 1379 ++++++++++++++++++++++++++++++++++++++++++++
 samples/Kconfig            |   13 +
 samples/vfio-mdev/Makefile |    1 +
 3 files changed, 1393 insertions(+)
 create mode 100644 samples/vfio-mdev/mbochs.c

diff --git a/samples/vfio-mdev/mbochs.c b/samples/vfio-mdev/mbochs.c
new file mode 100644
index 0000000000..fc91523190
--- /dev/null
+++ b/samples/vfio-mdev/mbochs.c
@@ -0,0 +1,1379 @@
+/*
+ * Mediated virtual PCI display host device driver
+ *
+ * Emulate enough of qemu stdvga to make bochs-drm.ko happy.  That is
+ * basically the vram memory bar and the bochs dispi interface vbe
+ * registers in the mmio register bar.	Specifically it does *not*
+ * include any legacy vga stuff.  Device looks a lot like "qemu -device
+ * secondary-vga".
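+ *
+ * A guest driver sets a mode by writing the bochs dispi registers in
+ * the mmio bar.  Illustrative (hypothetical) guest-side sketch, using
+ * the register layout handled by handle_mmio_write() below:
+ *
+ *   writew(1024, mmio + 0x500 + VBE_DISPI_INDEX_XRES * 2);
+ *   writew(768,  mmio + 0x500 + VBE_DISPI_INDEX_YRES * 2);
+ *   writew(32,   mmio + 0x500 + VBE_DISPI_INDEX_BPP * 2);
+ *   writew(VBE_DISPI_ENABLED | VBE_DISPI_LFB_ENABLED,
+ *          mmio + 0x500 + VBE_DISPI_INDEX_ENABLE * 2);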
+ *
+ *   (c) Gerd Hoffmann <kraxel@redhat.com>
+ *
+ * based on mtty driver which is:
+ *   Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
+ *	 Author: Neo Jia <cjia@nvidia.com>
+ *		 Kirti Wankhede <kwankhede@nvidia.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/cdev.h>
+#include <linux/vfio.h>
+#include <linux/iommu.h>
+#include <linux/sysfs.h>
+#include <linux/mdev.h>
+#include <linux/pci.h>
+#include <linux/dma-buf.h>
+#include <linux/highmem.h>
+#include <drm/drm_fourcc.h>
+#include <drm/drm_rect.h>
+#include <drm/drm_modeset_lock.h>
+#include <drm/drm_plane.h>
+
+
+#define VBE_DISPI_INDEX_ID		0x0
+#define VBE_DISPI_INDEX_XRES		0x1
+#define VBE_DISPI_INDEX_YRES		0x2
+#define VBE_DISPI_INDEX_BPP		0x3
+#define VBE_DISPI_INDEX_ENABLE		0x4
+#define VBE_DISPI_INDEX_BANK		0x5
+#define VBE_DISPI_INDEX_VIRT_WIDTH	0x6
+#define VBE_DISPI_INDEX_VIRT_HEIGHT	0x7
+#define VBE_DISPI_INDEX_X_OFFSET	0x8
+#define VBE_DISPI_INDEX_Y_OFFSET	0x9
+#define VBE_DISPI_INDEX_VIDEO_MEMORY_64K 0xa
+#define VBE_DISPI_INDEX_COUNT		0xb
+
+#define VBE_DISPI_ID0			0xB0C0
+#define VBE_DISPI_ID1			0xB0C1
+#define VBE_DISPI_ID2			0xB0C2
+#define VBE_DISPI_ID3			0xB0C3
+#define VBE_DISPI_ID4			0xB0C4
+#define VBE_DISPI_ID5			0xB0C5
+
+#define VBE_DISPI_DISABLED		0x00
+#define VBE_DISPI_ENABLED		0x01
+#define VBE_DISPI_GETCAPS		0x02
+#define VBE_DISPI_8BIT_DAC		0x20
+#define VBE_DISPI_LFB_ENABLED		0x40
+#define VBE_DISPI_NOCLEARMEM		0x80
+
+
+#define MBOCHS_NAME		  "mbochs"
+#define MBOCHS_CLASS_NAME	  "mbochs"
+
+#define MBOCHS_CONFIG_SPACE_SIZE  0xff
+#define MBOCHS_MMIO_BAR_OFFSET	  PAGE_SIZE
+#define MBOCHS_MMIO_BAR_SIZE	  PAGE_SIZE
+#define MBOCHS_MEMORY_BAR_OFFSET  (MBOCHS_MMIO_BAR_OFFSET + MBOCHS_MMIO_BAR_SIZE)
+
+#define STORE_LE16(addr, val)	(*(u16 *)(addr) = (val))
+#define STORE_LE32(addr, val)	(*(u32 *)(addr) = (val))
+
+
+MODULE_LICENSE("GPL v2");
+
+static int max_mbytes = 256;
+module_param_named(mem, max_mbytes, int, 0444);
+MODULE_PARM_DESC(mem, "megabytes available to " MBOCHS_NAME " devices");
+
+
+#define MBOCHS_TYPE_1 "small"
+#define MBOCHS_TYPE_2 "medium"
+#define MBOCHS_TYPE_3 "large"
+
+static const struct mbochs_type {
+	const char *name;
+	u32 mbytes;
+} mbochs_types[] = {
+	{
+		.name	= MBOCHS_CLASS_NAME "-" MBOCHS_TYPE_1,
+		.mbytes = 4,
+	},{
+		.name	= MBOCHS_CLASS_NAME "-" MBOCHS_TYPE_2,
+		.mbytes = 16,
+	},{
+		.name	= MBOCHS_CLASS_NAME "-" MBOCHS_TYPE_3,
+		.mbytes = 64,
+	},
+};
+
+
+static dev_t		mbochs_devt;
+static struct class	*mbochs_class;
+static struct cdev	mbochs_cdev;
+static struct device	mbochs_dev;
+static int		mbochs_used_mbytes;
+
+struct mbochs_mode {
+	u32 drm_format;
+	u32 bytepp;
+	u32 width;
+	u32 height;
+	u32 stride;
+	u32 __pad;
+	u64 offset;
+	u64 size;
+};
+
+struct mbochs_dmabuf {
+	struct mbochs_mode mode;
+	u32 id;
+	struct page **pages;
+	pgoff_t pagecount;
+	struct dma_buf *buf;
+	struct mdev_state *mdev_state;
+	struct list_head next;
+	bool unlinked;
+};
+
+/* State of each mdev device */
+struct mdev_state {
+	u8 *vconfig;
+	u64 bar_mask[3];
+	u32 memory_bar_mask;
+	struct mutex ops_lock;
+	struct mdev_device *mdev;
+	struct vfio_device_info dev_info;
+
+	const struct mbochs_type *type;
+	u16 vbe[VBE_DISPI_INDEX_COUNT];
+	u64 memsize;
+	struct page **pages;
+	pgoff_t pagecount;
+
+	struct list_head dmabufs;
+	u32 active_id;
+	u32 next_id;
+};
+
+static const char *vbe_name_list[VBE_DISPI_INDEX_COUNT] = {
+	[ VBE_DISPI_INDEX_ID		   ] = "id",
+	[ VBE_DISPI_INDEX_XRES		   ] = "xres",
+	[ VBE_DISPI_INDEX_YRES		   ] = "yres",
+	[ VBE_DISPI_INDEX_BPP		   ] = "bpp",
+	[ VBE_DISPI_INDEX_ENABLE	   ] = "enable",
+	[ VBE_DISPI_INDEX_BANK		   ] = "bank",
+	[ VBE_DISPI_INDEX_VIRT_WIDTH	   ] = "virt-width",
+	[ VBE_DISPI_INDEX_VIRT_HEIGHT	   ] = "virt-height",
+	[ VBE_DISPI_INDEX_X_OFFSET	   ] = "x-offset",
+	[ VBE_DISPI_INDEX_Y_OFFSET	   ] = "y-offset",
+	[ VBE_DISPI_INDEX_VIDEO_MEMORY_64K ] = "video-mem",
+};
+
+static const char *vbe_name(u32 index)
+{
+	if (index < ARRAY_SIZE(vbe_name_list))
+		return vbe_name_list[index];
+	return "(invalid)";
+}
+
+static const struct mbochs_type *mbochs_find_type(struct kobject *kobj)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(mbochs_types); i++)
+		if (strcmp(mbochs_types[i].name, kobj->name) == 0)
+			return mbochs_types + i;
+	return NULL;
+}
+
+static void mbochs_create_config_space(struct mdev_state *mdev_state)
+{
+	STORE_LE16((u16 *) &mdev_state->vconfig[PCI_VENDOR_ID],
+		   0x1234);
+	STORE_LE16((u16 *) &mdev_state->vconfig[PCI_DEVICE_ID],
+		   0x1111);
+	STORE_LE16((u16 *) &mdev_state->vconfig[PCI_SUBSYSTEM_VENDOR_ID],
+		   PCI_SUBVENDOR_ID_REDHAT_QUMRANET);
+	STORE_LE16((u16 *) &mdev_state->vconfig[PCI_SUBSYSTEM_ID],
+		   PCI_SUBDEVICE_ID_QEMU);
+
+	STORE_LE16((u16 *) &mdev_state->vconfig[PCI_COMMAND],
+		   PCI_COMMAND_IO | PCI_COMMAND_MEMORY);
+	STORE_LE16((u16 *) &mdev_state->vconfig[PCI_CLASS_DEVICE],
+		   PCI_CLASS_DISPLAY_OTHER);
+	mdev_state->vconfig[PCI_CLASS_REVISION] =  0x01;
+
+	STORE_LE32((u32 *) &mdev_state->vconfig[PCI_BASE_ADDRESS_0],
+		   PCI_BASE_ADDRESS_SPACE_MEMORY |
+		   PCI_BASE_ADDRESS_MEM_TYPE_32	 |
+		   PCI_BASE_ADDRESS_MEM_PREFETCH);
+	mdev_state->bar_mask[0] = ~(mdev_state->memsize) + 1;
+
+	STORE_LE32((u32 *) &mdev_state->vconfig[PCI_BASE_ADDRESS_2],
+		   PCI_BASE_ADDRESS_SPACE_MEMORY |
+		   PCI_BASE_ADDRESS_MEM_TYPE_32);
+	mdev_state->bar_mask[2] = ~(MBOCHS_MMIO_BAR_SIZE) + 1;
+}
+
+static int mbochs_check_framebuffer(struct mdev_state *mdev_state,
+				    struct mbochs_mode *mode)
+{
+	struct device *dev = mdev_dev(mdev_state->mdev);
+	u16 *vbe = mdev_state->vbe;
+	u32 virt_width;
+
+	WARN_ON(!mutex_is_locked(&mdev_state->ops_lock));
+
+	if (!(vbe[VBE_DISPI_INDEX_ENABLE] & VBE_DISPI_ENABLED))
+		goto nofb;
+
+	memset(mode, 0, sizeof(*mode));
+	switch (vbe[VBE_DISPI_INDEX_BPP]) {
+	case 32:
+		mode->drm_format = DRM_FORMAT_XRGB8888;
+		mode->bytepp = 4;
+		break;
+	default:
+		dev_info_ratelimited(dev, "%s: bpp %d not supported\n", __func__,
+				     vbe[VBE_DISPI_INDEX_BPP]);
+		goto nofb;
+	}
+
+	mode->width  = vbe[VBE_DISPI_INDEX_XRES];
+	mode->height = vbe[VBE_DISPI_INDEX_YRES];
+	virt_width  = vbe[VBE_DISPI_INDEX_VIRT_WIDTH];
+	if (virt_width < mode->width)
+		virt_width = mode->width;
+	mode->stride = virt_width * mode->bytepp;
+	mode->size   = (u64)mode->stride * mode->height;
+	mode->offset = ((u64)vbe[VBE_DISPI_INDEX_X_OFFSET] * mode->bytepp +
+		       (u64)vbe[VBE_DISPI_INDEX_Y_OFFSET] * mode->stride);
+
+	if (mode->width < 64 || mode->height < 64) {
+		dev_info_ratelimited(dev, "%s: invalid resolution %dx%d\n",
+				     __func__, mode->width, mode->height);
+		goto nofb;
+	}
+	if (mode->offset + mode->size > mdev_state->memsize) {
+		dev_info_ratelimited(dev, "%s: framebuffer memory overflow\n",
+				     __func__);
+		goto nofb;
+	}
+
+	return 0;
+
+nofb:
+	memset(mode, 0, sizeof(*mode));
+	return -EINVAL;
+}
+
+static bool mbochs_modes_equal(struct mbochs_mode *mode1,
+			       struct mbochs_mode *mode2)
+{
+	return memcmp(mode1, mode2, sizeof(struct mbochs_mode)) == 0;
+}
+
+static void handle_pci_cfg_write(struct mdev_state *mdev_state, u16 offset,
+				 char *buf, u32 count)
+{
+	struct device *dev = mdev_dev(mdev_state->mdev);
+	int index = (offset - PCI_BASE_ADDRESS_0) / 0x04;
+	u32 cfg_addr;
+
+	switch (offset) {
+	case PCI_BASE_ADDRESS_0:
+	case PCI_BASE_ADDRESS_2:
+		cfg_addr = *(u32 *)buf;
+
+		if (cfg_addr == 0xffffffff) {
+			cfg_addr = (cfg_addr & mdev_state->bar_mask[index]);
+		} else {
+			cfg_addr &= PCI_BASE_ADDRESS_MEM_MASK;
+			if (cfg_addr)
+				dev_info(dev, "BAR #%d @ 0x%x\n",
+					 index, cfg_addr);
+		}
+
+		cfg_addr |= (mdev_state->vconfig[offset] &
+			     ~PCI_BASE_ADDRESS_MEM_MASK);
+		STORE_LE32(&mdev_state->vconfig[offset], cfg_addr);
+		break;
+	}
+}
+
+static void handle_mmio_write(struct mdev_state *mdev_state, u16 offset,
+			      char *buf, u32 count)
+{
+	struct device *dev = mdev_dev(mdev_state->mdev);
+	int index;
+	u16 reg16;
+
+	switch (offset) {
+	case 0x400 ... 0x41f: /* vga ioports remapped */
+		goto unhandled;
+	case 0x500 ... 0x515: /* bochs dispi interface */
+		if (count != 2)
+			goto unhandled;
+		index = (offset - 0x500) / 2;
+		reg16 = *(u16 *)buf;
+		if (index < ARRAY_SIZE(mdev_state->vbe))
+			mdev_state->vbe[index] = reg16;
+		dev_dbg(dev, "%s: vbe write %d = %d (%s)\n",
+			__func__, index, reg16, vbe_name(index));
+		break;
+	case 0x600 ... 0x607: /* qemu extended regs */
+		goto unhandled;
+	default:
+	unhandled:
+		dev_dbg(dev, "%s: @0x%03x, count %d (unhandled)\n",
+			__func__, offset, count);
+		break;
+	}
+}
+
+static void handle_mmio_read(struct mdev_state *mdev_state, u16 offset,
+			     char *buf, u32 count)
+{
+	struct device *dev = mdev_dev(mdev_state->mdev);
+	u16 reg16 = 0;
+	int index;
+
+	switch (offset) {
+	case 0x500 ... 0x515: /* bochs dispi interface */
+		if (count != 2)
+			goto unhandled;
+		index = (offset - 0x500) / 2;
+		if (index < ARRAY_SIZE(mdev_state->vbe))
+			reg16 = mdev_state->vbe[index];
+		dev_dbg(dev, "%s: vbe read %d = %d (%s)\n",
+			__func__, index, reg16, vbe_name(index));
+		*(u16 *)buf = reg16;
+		break;
+	default:
+	unhandled:
+		dev_dbg(dev, "%s: @0x%03x, count %d (unhandled)\n",
+			__func__, offset, count);
+		memset(buf, 0, count);
+		break;
+	}
+}
+
+static ssize_t mdev_access(struct mdev_device *mdev, char *buf, size_t count,
+			   loff_t pos, bool is_write)
+{
+	struct mdev_state *mdev_state = mdev_get_drvdata(mdev);
+	struct device *dev = mdev_dev(mdev);
+	int ret = 0;
+
+	mutex_lock(&mdev_state->ops_lock);
+
+	if (pos < MBOCHS_CONFIG_SPACE_SIZE) {
+		if (is_write) {
+			handle_pci_cfg_write(mdev_state, pos, buf, count);
+		} else {
+			memcpy(buf, (mdev_state->vconfig + pos), count);
+		}
+
+	} else if (pos >= MBOCHS_MMIO_BAR_OFFSET &&
+		   pos + count <  MBOCHS_MEMORY_BAR_OFFSET) {
+		pos -= MBOCHS_MMIO_BAR_OFFSET;
+		if (is_write) {
+			handle_mmio_write(mdev_state, pos, buf, count);
+		} else {
+			handle_mmio_read(mdev_state, pos, buf, count);
+		}
+
+	} else {
+		dev_dbg(dev, "%s: %s @0x%llx (unhandled)\n",
+			__func__, is_write ? "WR" : "RD", pos);
+		ret = -1;
+		goto accessfailed;
+	}
+
+	ret = count;
+
+
+accessfailed:
+	mutex_unlock(&mdev_state->ops_lock);
+
+	return ret;
+}
+
+int mbochs_reset(struct mdev_device *mdev)
+{
+	struct mdev_state *mdev_state = mdev_get_drvdata(mdev);
+	u32 size64k = mdev_state->memsize / (64 * 1024);
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(mdev_state->vbe); i++)
+		mdev_state->vbe[i] = 0;
+	mdev_state->vbe[VBE_DISPI_INDEX_ID] = VBE_DISPI_ID5;
+	mdev_state->vbe[VBE_DISPI_INDEX_VIDEO_MEMORY_64K] = size64k;
+	return 0;
+}
+
+int mbochs_create(struct kobject *kobj, struct mdev_device *mdev)
+{
+	const struct mbochs_type *type = mbochs_find_type(kobj);
+	struct device *dev = mdev_dev(mdev);
+	struct mdev_state *mdev_state;
+
+	if (!type)
+		type = &mbochs_types[0];
+	if (type->mbytes + mbochs_used_mbytes > max_mbytes)
+		return -ENOMEM;
+
+	mdev_state = kzalloc(sizeof(struct mdev_state), GFP_KERNEL);
+	if (mdev_state == NULL)
+		return -ENOMEM;
+
+	mdev_state->vconfig = kzalloc(MBOCHS_CONFIG_SPACE_SIZE, GFP_KERNEL);
+	if (mdev_state->vconfig == NULL)
+		goto err_mem;
+
+	mdev_state->memsize = type->mbytes * 1024 * 1024;
+	mdev_state->pagecount = mdev_state->memsize >> PAGE_SHIFT;
+	mdev_state->pages = kcalloc(mdev_state->pagecount,
+				    sizeof(struct page *), GFP_KERNEL);
+	if (!mdev_state->pages)
+		goto err_mem;
+
+	dev_info(dev, "%s: %s, %d MB, %ld pages\n", __func__,
+		 kobj->name, type->mbytes, mdev_state->pagecount);
+
+	mutex_init(&mdev_state->ops_lock);
+	mdev_state->mdev = mdev;
+	mdev_set_drvdata(mdev, mdev_state);
+	INIT_LIST_HEAD(&mdev_state->dmabufs);
+	mdev_state->next_id = 1;
+
+	mdev_state->type = type;
+	mbochs_create_config_space(mdev_state);
+	mbochs_reset(mdev);
+
+	mbochs_used_mbytes += type->mbytes;
+	return 0;
+
+err_mem:
+	kfree(mdev_state->vconfig);
+	kfree(mdev_state);
+	return -ENOMEM;
+}
+
+int mbochs_remove(struct mdev_device *mdev)
+{
+	struct mdev_state *mdev_state = mdev_get_drvdata(mdev);
+
+	mbochs_used_mbytes -= mdev_state->type->mbytes;
+	mdev_set_drvdata(mdev, NULL);
+	kfree(mdev_state->pages);
+	kfree(mdev_state->vconfig);
+	kfree(mdev_state);
+	return 0;
+}
+
+ssize_t mbochs_read(struct mdev_device *mdev, char __user *buf,
+		    size_t count, loff_t *ppos)
+{
+	unsigned int done = 0;
+	int ret;
+
+	while (count) {
+		size_t filled;
+
+		if (count >= 4 && !(*ppos % 4)) {
+			u32 val;
+
+			ret =  mdev_access(mdev, (char *)&val, sizeof(val),
+					   *ppos, false);
+			if (ret <= 0)
+				goto read_err;
+
+			if (copy_to_user(buf, &val, sizeof(val)))
+				goto read_err;
+
+			filled = 4;
+		} else if (count >= 2 && !(*ppos % 2)) {
+			u16 val;
+
+			ret = mdev_access(mdev, (char *)&val, sizeof(val),
+					  *ppos, false);
+			if (ret <= 0)
+				goto read_err;
+
+			if (copy_to_user(buf, &val, sizeof(val)))
+				goto read_err;
+
+			filled = 2;
+		} else {
+			u8 val;
+
+			ret = mdev_access(mdev, (char *)&val, sizeof(val),
+					  *ppos, false);
+			if (ret <= 0)
+				goto read_err;
+
+			if (copy_to_user(buf, &val, sizeof(val)))
+				goto read_err;
+
+			filled = 1;
+		}
+
+		count -= filled;
+		done += filled;
+		*ppos += filled;
+		buf += filled;
+	}
+
+	return done;
+
+read_err:
+	return -EFAULT;
+}
+
+ssize_t mbochs_write(struct mdev_device *mdev, const char __user *buf,
+		     size_t count, loff_t *ppos)
+{
+	unsigned int done = 0;
+	int ret;
+
+	while (count) {
+		size_t filled;
+
+		if (count >= 4 && !(*ppos % 4)) {
+			u32 val;
+
+			if (copy_from_user(&val, buf, sizeof(val)))
+				goto write_err;
+
+			ret = mdev_access(mdev, (char *)&val, sizeof(val),
+					  *ppos, true);
+			if (ret <= 0)
+				goto write_err;
+
+			filled = 4;
+		} else if (count >= 2 && !(*ppos % 2)) {
+			u16 val;
+
+			if (copy_from_user(&val, buf, sizeof(val)))
+				goto write_err;
+
+			ret = mdev_access(mdev, (char *)&val, sizeof(val),
+					  *ppos, true);
+			if (ret <= 0)
+				goto write_err;
+
+			filled = 2;
+		} else {
+			u8 val;
+
+			if (copy_from_user(&val, buf, sizeof(val)))
+				goto write_err;
+
+			ret = mdev_access(mdev, (char *)&val, sizeof(val),
+					  *ppos, true);
+			if (ret <= 0)
+				goto write_err;
+
+			filled = 1;
+		}
+		count -= filled;
+		done += filled;
+		*ppos += filled;
+		buf += filled;
+	}
+
+	return done;
+write_err:
+	return -EFAULT;
+}
+
+struct page *__mbochs_get_page(struct mdev_state *mdev_state, pgoff_t pgoff)
+{
+	WARN_ON(!mutex_is_locked(&mdev_state->ops_lock));
+
+	if (!mdev_state->pages[pgoff]) {
+		mdev_state->pages[pgoff] =
+			alloc_pages(GFP_HIGHUSER | __GFP_ZERO, 0);
+		if (!mdev_state->pages[pgoff])
+			return NULL;
+	}
+
+	get_page(mdev_state->pages[pgoff]);
+	return mdev_state->pages[pgoff];
+}
+
+struct page *mbochs_get_page(struct mdev_state *mdev_state, pgoff_t pgoff)
+{
+	struct page *page;
+
+	if (WARN_ON(pgoff >= mdev_state->pagecount))
+		return NULL;
+
+	mutex_lock(&mdev_state->ops_lock);
+	page = __mbochs_get_page(mdev_state, pgoff);
+	mutex_unlock(&mdev_state->ops_lock);
+
+	return page;
+}
+
+void mbochs_put_pages(struct mdev_state *mdev_state)
+{
+	struct device *dev = mdev_dev(mdev_state->mdev);
+	int i, count = 0;
+
+	WARN_ON(!mutex_is_locked(&mdev_state->ops_lock));
+
+	for (i = 0; i < mdev_state->pagecount; i++) {
+		if (!mdev_state->pages[i])
+			continue;
+		put_page(mdev_state->pages[i]);
+		mdev_state->pages[i] = NULL;
+		count++;
+	}
+	dev_dbg(dev, "%s: %d pages released\n", __func__, count);
+}
+
+static int mbochs_region_vm_fault(struct vm_fault *vmf)
+{
+	struct vm_area_struct *vma = vmf->vma;
+	struct mdev_state *mdev_state = vma->vm_private_data;
+	pgoff_t page_offset = (vmf->address - vma->vm_start) >> PAGE_SHIFT;
+
+	if (page_offset >= mdev_state->pagecount)
+		return VM_FAULT_SIGBUS;
+
+	vmf->page = mbochs_get_page(mdev_state, page_offset);
+	if (!vmf->page)
+		return VM_FAULT_SIGBUS;
+
+	return 0;
+}
+
+static const struct vm_operations_struct mbochs_region_vm_ops = {
+	.fault = mbochs_region_vm_fault,
+};
+
+int mbochs_mmap(struct mdev_device *mdev, struct vm_area_struct *vma)
+{
+	struct mdev_state *mdev_state = mdev_get_drvdata(mdev);
+
+	if (vma->vm_pgoff != MBOCHS_MEMORY_BAR_OFFSET >> PAGE_SHIFT)
+		return -EINVAL;
+	if (vma->vm_end < vma->vm_start)
+		return -EINVAL;
+	if (vma->vm_end - vma->vm_start > mdev_state->memsize)
+		return -EINVAL;
+	if ((vma->vm_flags & VM_SHARED) == 0)
+		return -EINVAL;
+
+	vma->vm_ops = &mbochs_region_vm_ops;
+	vma->vm_private_data = mdev_state;
+	return 0;
+}
+
+static int mbochs_dmabuf_vm_fault(struct vm_fault *vmf)
+{
+	struct vm_area_struct *vma = vmf->vma;
+	struct mbochs_dmabuf *dmabuf = vma->vm_private_data;
+
+	if (WARN_ON(vmf->pgoff >= dmabuf->pagecount))
+		return VM_FAULT_SIGBUS;
+
+	vmf->page = dmabuf->pages[vmf->pgoff];
+	get_page(vmf->page);
+	return 0;
+}
+
+static const struct vm_operations_struct mbochs_dmabuf_vm_ops = {
+	.fault = mbochs_dmabuf_vm_fault,
+};
+
+static int mbochs_mmap_dmabuf(struct dma_buf *buf, struct vm_area_struct *vma)
+{
+	struct mbochs_dmabuf *dmabuf = buf->priv;
+	struct device *dev = mdev_dev(dmabuf->mdev_state->mdev);
+
+	dev_dbg(dev, "%s: %d\n", __func__, dmabuf->id);
+
+	if ((vma->vm_flags & VM_SHARED) == 0)
+		return -EINVAL;
+
+	vma->vm_ops = &mbochs_dmabuf_vm_ops;
+	vma->vm_private_data = dmabuf;
+	return 0;
+}
+
+static void mbochs_print_dmabuf(struct mbochs_dmabuf *dmabuf,
+				const char *prefix)
+{
+	struct device *dev = mdev_dev(dmabuf->mdev_state->mdev);
+	u32 fourcc = dmabuf->mode.drm_format;
+
+	dev_dbg(dev, "%s/%d: %c%c%c%c, %dx%d, stride %d, off 0x%llx, size 0x%llx, pages %ld\n",
+		prefix, dmabuf->id,
+		fourcc ? ((fourcc >>  0) & 0xff) : '-',
+		fourcc ? ((fourcc >>  8) & 0xff) : '-',
+		fourcc ? ((fourcc >> 16) & 0xff) : '-',
+		fourcc ? ((fourcc >> 24) & 0xff) : '-',
+		dmabuf->mode.width, dmabuf->mode.height, dmabuf->mode.stride,
+		dmabuf->mode.offset, dmabuf->mode.size, dmabuf->pagecount);
+}
+
+static struct sg_table *mbochs_map_dmabuf(struct dma_buf_attachment *at,
+					  enum dma_data_direction direction)
+{
+	struct mbochs_dmabuf *dmabuf = at->dmabuf->priv;
+	struct device *dev = mdev_dev(dmabuf->mdev_state->mdev);
+	struct sg_table *sg;
+
+	dev_dbg(dev, "%s: %d\n", __func__, dmabuf->id);
+
+	sg = kzalloc(sizeof(*sg), GFP_KERNEL);
+	if (!sg)
+		goto err1;
+	if (sg_alloc_table_from_pages(sg, dmabuf->pages, dmabuf->pagecount,
+				      0, dmabuf->mode.size, GFP_KERNEL) < 0)
+		goto err2;
+	if (!dma_map_sg(at->dev, sg->sgl, sg->nents, direction))
+		goto err3;
+
+	return sg;
+
+err3:
+	sg_free_table(sg);
+err2:
+	kfree(sg);
+err1:
+	return ERR_PTR(-ENOMEM);
+}
+
+static void mbochs_unmap_dmabuf(struct dma_buf_attachment *at,
+				struct sg_table *sg,
+				enum dma_data_direction direction)
+{
+	struct mbochs_dmabuf *dmabuf = at->dmabuf->priv;
+	struct device *dev = mdev_dev(dmabuf->mdev_state->mdev);
+
+	dev_dbg(dev, "%s: %d\n", __func__, dmabuf->id);
+
+	sg_free_table(sg);
+	kfree(sg);
+}
+
+static void mbochs_release_dmabuf(struct dma_buf *buf)
+{
+	struct mbochs_dmabuf *dmabuf = buf->priv;
+	struct mdev_state *mdev_state = dmabuf->mdev_state;
+	struct device *dev = mdev_dev(mdev_state->mdev);
+	pgoff_t pg;
+
+	dev_dbg(dev, "%s: %d\n", __func__, dmabuf->id);
+
+	for (pg = 0; pg < dmabuf->pagecount; pg++)
+		put_page(dmabuf->pages[pg]);
+
+	mutex_lock(&mdev_state->ops_lock);
+	dmabuf->buf = NULL;
+	if (dmabuf->unlinked)
+		kfree(dmabuf);
+	mutex_unlock(&mdev_state->ops_lock);
+}
+
+static void *mbochs_kmap_atomic_dmabuf(struct dma_buf *buf, unsigned long page_num)
+{
+	struct mbochs_dmabuf *dmabuf = buf->priv;
+	struct page *page = dmabuf->pages[page_num];
+
+	return kmap_atomic(page);
+}
+
+static void *mbochs_kmap_dmabuf(struct dma_buf *buf, unsigned long page_num)
+{
+	struct mbochs_dmabuf *dmabuf = buf->priv;
+	struct page *page = dmabuf->pages[page_num];
+
+	return kmap(page);
+}
+
+static struct dma_buf_ops mbochs_dmabuf_ops = {
+	.map_dma_buf	  = mbochs_map_dmabuf,
+	.unmap_dma_buf	  = mbochs_unmap_dmabuf,
+	.release	  = mbochs_release_dmabuf,
+	.map_atomic	  = mbochs_kmap_atomic_dmabuf,
+	.map		  = mbochs_kmap_dmabuf,
+	.mmap		  = mbochs_mmap_dmabuf,
+};
+
+static struct mbochs_dmabuf *mbochs_dmabuf_alloc(struct mdev_state *mdev_state,
+						 struct mbochs_mode *mode)
+{
+	struct mbochs_dmabuf *dmabuf;
+	pgoff_t page_offset, pg;
+
+	WARN_ON(!mutex_is_locked(&mdev_state->ops_lock));
+
+	dmabuf = kzalloc(sizeof(struct mbochs_dmabuf), GFP_KERNEL);
+	if (!dmabuf)
+		return NULL;
+
+	dmabuf->mode = *mode;
+	dmabuf->id = mdev_state->next_id++;
+	dmabuf->pagecount = DIV_ROUND_UP(mode->size, PAGE_SIZE);
+	dmabuf->pages = kcalloc(dmabuf->pagecount, sizeof(struct page *),
+				GFP_KERNEL);
+	if (!dmabuf->pages)
+		goto err_free_dmabuf;
+
+	page_offset = dmabuf->mode.offset >> PAGE_SHIFT;
+	for (pg = 0; pg < dmabuf->pagecount; pg++) {
+		dmabuf->pages[pg] = __mbochs_get_page(mdev_state, page_offset + pg);
+		if (!dmabuf->pages[pg])
+			goto err_free_pages;
+	}
+
+	dmabuf->mdev_state = mdev_state;
+	list_add(&dmabuf->next, &mdev_state->dmabufs);
+
+	mbochs_print_dmabuf(dmabuf, __func__);
+	return dmabuf;
+
+err_free_pages:
+	while (pg > 0)
+		put_page(dmabuf->pages[--pg]);
+	kfree(dmabuf->pages);
+err_free_dmabuf:
+	kfree(dmabuf);
+	return NULL;
+}
+
+static struct mbochs_dmabuf *
+mbochs_dmabuf_find_by_mode(struct mdev_state *mdev_state,
+			   struct mbochs_mode *mode)
+{
+	struct mbochs_dmabuf *dmabuf;
+
+	WARN_ON(!mutex_is_locked(&mdev_state->ops_lock));
+
+	list_for_each_entry(dmabuf, &mdev_state->dmabufs, next)
+		if (mbochs_modes_equal(&dmabuf->mode, mode))
+			return dmabuf;
+
+	return NULL;
+}
+
+static struct mbochs_dmabuf *
+mbochs_dmabuf_find_by_id(struct mdev_state *mdev_state, u32 id)
+{
+	struct mbochs_dmabuf *dmabuf;
+
+	WARN_ON(!mutex_is_locked(&mdev_state->ops_lock));
+
+	list_for_each_entry(dmabuf, &mdev_state->dmabufs, next)
+		if (dmabuf->id == id)
+			return dmabuf;
+
+	return NULL;
+}
+
+static int mbochs_dmabuf_export(struct mbochs_dmabuf *dmabuf)
+{
+	struct mdev_state *mdev_state = dmabuf->mdev_state;
+	struct device *dev = mdev_dev(mdev_state->mdev);
+	DEFINE_DMA_BUF_EXPORT_INFO(exp_info);
+	struct dma_buf *buf;
+
+	WARN_ON(!mutex_is_locked(&mdev_state->ops_lock));
+
+	if (!IS_ALIGNED(dmabuf->mode.offset, PAGE_SIZE)) {
+		dev_info_ratelimited(dev, "%s: framebuffer not page-aligned\n",
+				     __func__);
+		return -EINVAL;
+	}
+
+	exp_info.ops = &mbochs_dmabuf_ops;
+	exp_info.size = dmabuf->mode.size;
+	exp_info.priv = dmabuf;
+
+	buf = dma_buf_export(&exp_info);
+	if (IS_ERR(buf)) {
+		dev_info_ratelimited(dev, "%s: dma_buf_export failed: %ld\n",
+				     __func__, PTR_ERR(buf));
+		return PTR_ERR(buf);
+	}
+
+	dmabuf->buf = buf;
+	dev_dbg(dev, "%s: %d\n", __func__, dmabuf->id);
+	return 0;
+}
+
+int mbochs_get_region_info(struct mdev_device *mdev,
+			   struct vfio_region_info *region_info,
+			   u16 *cap_type_id, void **cap_type)
+{
+	struct mdev_state *mdev_state;
+
+	mdev_state = mdev_get_drvdata(mdev);
+	if (!mdev_state)
+		return -EINVAL;
+
+	if (region_info->index >= VFIO_PCI_NUM_REGIONS)
+		return -EINVAL;
+
+	switch (region_info->index) {
+	case VFIO_PCI_CONFIG_REGION_INDEX:
+		region_info->offset = 0;
+		region_info->size   = MBOCHS_CONFIG_SPACE_SIZE;
+		region_info->flags  = (VFIO_REGION_INFO_FLAG_READ |
+				       VFIO_REGION_INFO_FLAG_WRITE);
+		break;
+	case VFIO_PCI_BAR0_REGION_INDEX:
+		region_info->offset = MBOCHS_MEMORY_BAR_OFFSET;
+		region_info->size   = mdev_state->memsize;
+		region_info->flags  = (VFIO_REGION_INFO_FLAG_READ  |
+				       VFIO_REGION_INFO_FLAG_WRITE |
+				       VFIO_REGION_INFO_FLAG_MMAP);
+		break;
+	case VFIO_PCI_BAR2_REGION_INDEX:
+		region_info->offset = MBOCHS_MMIO_BAR_OFFSET;
+		region_info->size   = MBOCHS_MMIO_BAR_SIZE;
+		region_info->flags  = (VFIO_REGION_INFO_FLAG_READ  |
+				       VFIO_REGION_INFO_FLAG_WRITE);
+		break;
+	default:
+		region_info->size   = 0;
+		region_info->offset = 0;
+		region_info->flags  = 0;
+	}
+
+	return 0;
+}
+
+int mbochs_get_irq_info(struct mdev_device *mdev, struct vfio_irq_info *irq_info)
+{
+	irq_info->count = 0;
+	return 0;
+}
+
+int mbochs_get_device_info(struct mdev_device *mdev,
+			 struct vfio_device_info *dev_info)
+{
+	dev_info->flags = VFIO_DEVICE_FLAGS_PCI;
+	dev_info->num_regions = VFIO_PCI_NUM_REGIONS;
+	dev_info->num_irqs = VFIO_PCI_NUM_IRQS;
+	return 0;
+}
+
+int mbochs_query_gfx_plane(struct mdev_device *mdev,
+			   struct vfio_device_gfx_plane_info *plane)
+{
+	struct mdev_state *mdev_state = mdev_get_drvdata(mdev);
+	struct device *dev = mdev_dev(mdev);
+	struct mbochs_dmabuf *dmabuf;
+	struct mbochs_mode mode;
+	int ret;
+
+	if (plane->flags & VFIO_GFX_PLANE_TYPE_PROBE) {
+		if (plane->flags == (VFIO_GFX_PLANE_TYPE_PROBE |
+				     VFIO_GFX_PLANE_TYPE_DMABUF))
+			return 0;
+		return -EINVAL;
+	}
+
+	if (plane->flags != VFIO_GFX_PLANE_TYPE_DMABUF)
+		return -EINVAL;
+
+	plane->drm_format_mod = 0;
+	plane->x_pos	      = 0;
+	plane->y_pos	      = 0;
+	plane->x_hot	      = 0;
+	plane->y_hot	      = 0;
+
+	mutex_lock(&mdev_state->ops_lock);
+
+	ret = -EINVAL;
+	if (plane->drm_plane_type == DRM_PLANE_TYPE_PRIMARY)
+		ret = mbochs_check_framebuffer(mdev_state, &mode);
+	if (ret < 0) {
+		plane->drm_format     = 0;
+		plane->width	      = 0;
+		plane->height	      = 0;
+		plane->stride	      = 0;
+		plane->size	      = 0;
+		plane->dmabuf_id      = 0;
+		goto done;
+	}
+
+	dmabuf = mbochs_dmabuf_find_by_mode(mdev_state, &mode);
+	if (!dmabuf)
+		dmabuf = mbochs_dmabuf_alloc(mdev_state, &mode);
+	if (!dmabuf) {
+		mutex_unlock(&mdev_state->ops_lock);
+		return -ENOMEM;
+	}
+
+	plane->drm_format     = dmabuf->mode.drm_format;
+	plane->width	      = dmabuf->mode.width;
+	plane->height	      = dmabuf->mode.height;
+	plane->stride	      = dmabuf->mode.stride;
+	plane->size	      = dmabuf->mode.size;
+	plane->dmabuf_id      = dmabuf->id;
+
+done:
+	if (plane->drm_plane_type == DRM_PLANE_TYPE_PRIMARY &&
+	    mdev_state->active_id != plane->dmabuf_id) {
+		dev_dbg(dev, "%s: primary: %d => %d\n", __func__,
+			mdev_state->active_id, plane->dmabuf_id);
+		mdev_state->active_id = plane->dmabuf_id;
+	}
+	mutex_unlock(&mdev_state->ops_lock);
+	return 0;
+}
+
+int mbochs_get_gfx_dmabuf(struct mdev_device *mdev,
+			  u32 id)
+{
+	struct mdev_state *mdev_state = mdev_get_drvdata(mdev);
+	struct mbochs_dmabuf *dmabuf;
+
+	mutex_lock(&mdev_state->ops_lock);
+
+	dmabuf = mbochs_dmabuf_find_by_id(mdev_state, id);
+	if (!dmabuf) {
+		mutex_unlock(&mdev_state->ops_lock);
+		return -ENOENT;
+	}
+
+	if (!dmabuf->buf)
+		mbochs_dmabuf_export(dmabuf);
+
+	mutex_unlock(&mdev_state->ops_lock);
+
+	if (!dmabuf->buf)
+		return -EINVAL;
+
+	return dma_buf_fd(dmabuf->buf, 0);
+}
+
+static long mbochs_ioctl(struct mdev_device *mdev, unsigned int cmd,
+			unsigned long arg)
+{
+	int ret = 0;
+	unsigned long minsz;
+	struct mdev_state *mdev_state;
+
+	mdev_state = mdev_get_drvdata(mdev);
+
+	switch (cmd) {
+	case VFIO_DEVICE_GET_INFO:
+	{
+		struct vfio_device_info info;
+
+		minsz = offsetofend(struct vfio_device_info, num_irqs);
+
+		if (copy_from_user(&info, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (info.argsz < minsz)
+			return -EINVAL;
+
+		ret = mbochs_get_device_info(mdev, &info);
+		if (ret)
+			return ret;
+
+		memcpy(&mdev_state->dev_info, &info, sizeof(info));
+
+		if (copy_to_user((void __user *)arg, &info, minsz))
+			return -EFAULT;
+
+		return 0;
+	}
+	case VFIO_DEVICE_GET_REGION_INFO:
+	{
+		struct vfio_region_info info;
+		u16 cap_type_id = 0;
+		void *cap_type = NULL;
+
+		minsz = offsetofend(struct vfio_region_info, offset);
+
+		if (copy_from_user(&info, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (info.argsz < minsz)
+			return -EINVAL;
+
+		ret = mbochs_get_region_info(mdev, &info, &cap_type_id,
+					   &cap_type);
+		if (ret)
+			return ret;
+
+		if (copy_to_user((void __user *)arg, &info, minsz))
+			return -EFAULT;
+
+		return 0;
+	}
+
+	case VFIO_DEVICE_GET_IRQ_INFO:
+	{
+		struct vfio_irq_info info;
+
+		minsz = offsetofend(struct vfio_irq_info, count);
+
+		if (copy_from_user(&info, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if ((info.argsz < minsz) ||
+		    (info.index >= mdev_state->dev_info.num_irqs))
+			return -EINVAL;
+
+		ret = mbochs_get_irq_info(mdev, &info);
+		if (ret)
+			return ret;
+
+		if (copy_to_user((void __user *)arg, &info, minsz))
+			return -EFAULT;
+
+		return 0;
+	}
+
+	case VFIO_DEVICE_QUERY_GFX_PLANE:
+	{
+		struct vfio_device_gfx_plane_info plane;
+
+		minsz = offsetofend(struct vfio_device_gfx_plane_info,
+				    region_index);
+
+		if (copy_from_user(&plane, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (plane.argsz < minsz)
+			return -EINVAL;
+
+		ret = mbochs_query_gfx_plane(mdev, &plane);
+		if (ret)
+			return ret;
+
+		if (copy_to_user((void __user *)arg, &plane, minsz))
+			return -EFAULT;
+
+		return 0;
+	}
+
+	case VFIO_DEVICE_GET_GFX_DMABUF:
+	{
+		u32 dmabuf_id;
+
+		if (get_user(dmabuf_id, (__u32 __user *)arg))
+			return -EFAULT;
+
+		return mbochs_get_gfx_dmabuf(mdev, dmabuf_id);
+	}
+
+	case VFIO_DEVICE_SET_IRQS:
+		return -EINVAL;
+
+	case VFIO_DEVICE_RESET:
+		return mbochs_reset(mdev);
+	}
+	return -ENOTTY;
+}
+
+int mbochs_open(struct mdev_device *mdev)
+{
+	if (!try_module_get(THIS_MODULE))
+		return -ENODEV;
+
+	return 0;
+}
+
+void mbochs_close(struct mdev_device *mdev)
+{
+	struct mdev_state *mdev_state = mdev_get_drvdata(mdev);
+	struct mbochs_dmabuf *dmabuf, *tmp;
+
+	mutex_lock(&mdev_state->ops_lock);
+
+	list_for_each_entry_safe(dmabuf, tmp, &mdev_state->dmabufs, next) {
+		list_del(&dmabuf->next);
+		if (dmabuf->buf) {
+			/* free in mbochs_release_dmabuf() */
+			dmabuf->unlinked = true;
+		} else {
+			kfree(dmabuf);
+		}
+	}
+	mbochs_put_pages(mdev_state);
+
+	mutex_unlock(&mdev_state->ops_lock);
+	module_put(THIS_MODULE);
+}
+
+static ssize_t
+memory_show(struct device *dev, struct device_attribute *attr,
+	    char *buf)
+{
+	struct mdev_device *mdev = mdev_from_dev(dev);
+	struct mdev_state *mdev_state = mdev_get_drvdata(mdev);
+
+	return sprintf(buf, "%d MB\n", mdev_state->type->mbytes);
+}
+static DEVICE_ATTR_RO(memory);
+
+static struct attribute *mdev_dev_attrs[] = {
+	&dev_attr_memory.attr,
+	NULL,
+};
+
+static const struct attribute_group mdev_dev_group = {
+	.name  = "vendor",
+	.attrs = mdev_dev_attrs,
+};
+
+const struct attribute_group *mdev_dev_groups[] = {
+	&mdev_dev_group,
+	NULL,
+};
+
+static ssize_t
+name_show(struct kobject *kobj, struct device *dev, char *buf)
+{
+	return sprintf(buf, "%s\n", kobj->name);
+}
+MDEV_TYPE_ATTR_RO(name);
+
+static ssize_t
+description_show(struct kobject *kobj, struct device *dev, char *buf)
+{
+	const struct mbochs_type *type = mbochs_find_type(kobj);
+
+	return sprintf(buf, "virtual display, %d MB video memory\n",
+		       type ? type->mbytes  : 0);
+}
+MDEV_TYPE_ATTR_RO(description);
+
+static ssize_t
+available_instances_show(struct kobject *kobj, struct device *dev, char *buf)
+{
+	const struct mbochs_type *type = mbochs_find_type(kobj);
+	int count = (max_mbytes - mbochs_used_mbytes) / type->mbytes;
+
+	return sprintf(buf, "%d\n", count);
+}
+MDEV_TYPE_ATTR_RO(available_instances);
+
+static ssize_t device_api_show(struct kobject *kobj, struct device *dev,
+			       char *buf)
+{
+	return sprintf(buf, "%s\n", VFIO_DEVICE_API_PCI_STRING);
+}
+MDEV_TYPE_ATTR_RO(device_api);
+
+static struct attribute *mdev_types_attrs[] = {
+	&mdev_type_attr_name.attr,
+	&mdev_type_attr_description.attr,
+	&mdev_type_attr_device_api.attr,
+	&mdev_type_attr_available_instances.attr,
+	NULL,
+};
+
+static struct attribute_group mdev_type_group1 = {
+	.name  = MBOCHS_TYPE_1,
+	.attrs = mdev_types_attrs,
+};
+
+static struct attribute_group mdev_type_group2 = {
+	.name  = MBOCHS_TYPE_2,
+	.attrs = mdev_types_attrs,
+};
+
+static struct attribute_group mdev_type_group3 = {
+	.name  = MBOCHS_TYPE_3,
+	.attrs = mdev_types_attrs,
+};
+
+static struct attribute_group *mdev_type_groups[] = {
+	&mdev_type_group1,
+	&mdev_type_group2,
+	&mdev_type_group3,
+	NULL,
+};
+
+static const struct mdev_parent_ops mdev_fops = {
+	.owner			= THIS_MODULE,
+	.mdev_attr_groups	= mdev_dev_groups,
+	.supported_type_groups	= mdev_type_groups,
+	.create			= mbochs_create,
+	.remove			= mbochs_remove,
+	.open			= mbochs_open,
+	.release		= mbochs_close,
+	.read			= mbochs_read,
+	.write			= mbochs_write,
+	.ioctl			= mbochs_ioctl,
+	.mmap			= mbochs_mmap,
+};
+
+static const struct file_operations vd_fops = {
+	.owner		= THIS_MODULE,
+};
+
+static void mbochs_device_release(struct device *dev)
+{
+	/* nothing */
+}
+
+static int __init mbochs_dev_init(void)
+{
+	int ret = 0;
+
+	ret = alloc_chrdev_region(&mbochs_devt, 0, MINORMASK, MBOCHS_NAME);
+	if (ret < 0) {
+		pr_err("Error: failed to register mbochs_dev, err: %d\n", ret);
+		return ret;
+	}
+	cdev_init(&mbochs_cdev, &vd_fops);
+	cdev_add(&mbochs_cdev, mbochs_devt, MINORMASK);
+	pr_info("%s: major %d\n", __func__, MAJOR(mbochs_devt));
+
+	mbochs_class = class_create(THIS_MODULE, MBOCHS_CLASS_NAME);
+	if (IS_ERR(mbochs_class)) {
+		pr_err("Error: failed to register mbochs_dev class\n");
+		ret = PTR_ERR(mbochs_class);
+		goto failed1;
+	}
+	mbochs_dev.class = mbochs_class;
+	mbochs_dev.release = mbochs_device_release;
+	dev_set_name(&mbochs_dev, "%s", MBOCHS_NAME);
+
+	ret = device_register(&mbochs_dev);
+	if (ret)
+		goto failed2;
+
+	ret = mdev_register_device(&mbochs_dev, &mdev_fops);
+	if (ret)
+		goto failed3;
+
+	return 0;
+
+failed3:
+	device_unregister(&mbochs_dev);
+failed2:
+	class_destroy(mbochs_class);
+failed1:
+	cdev_del(&mbochs_cdev);
+	unregister_chrdev_region(mbochs_devt, MINORMASK);
+	return ret;
+}
+
+static void __exit mbochs_dev_exit(void)
+{
+	mbochs_dev.bus = NULL;
+	mdev_unregister_device(&mbochs_dev);
+
+	device_unregister(&mbochs_dev);
+	cdev_del(&mbochs_cdev);
+	unregister_chrdev_region(mbochs_devt, MINORMASK);
+	class_destroy(mbochs_class);
+	mbochs_class = NULL;
+}
+
+module_init(mbochs_dev_init)
+module_exit(mbochs_dev_exit)
diff --git a/samples/Kconfig b/samples/Kconfig
index 755430c788..5de0674cdd 100644
--- a/samples/Kconfig
+++ b/samples/Kconfig
@@ -128,6 +128,19 @@ config SAMPLE_VFIO_MDEV_MDPY_FB
 	help
 	  Guest fbdev driver for the virtual display sample driver.
 
+config SAMPLE_VFIO_MDEV_MBOCHS
+	tristate "Build VFIO mbochs example mediated device sample code -- loadable modules only"
+	depends on VFIO_MDEV_DEVICE && m
+	help
+	  Build a virtual display sample driver for use as a VFIO
+	  mediated device.  It supports the dmabuf display interface
+	  (VFIO_GFX_PLANE_TYPE_DMABUF).  It emulates enough of the
+	  qemu stdvga to make bochs-drm.ko happy, i.e. the vram
+	  memory bar and the bochs dispi interface vbe registers in
+	  the mmio register bar.  Specifically it does *not* include
+	  any legacy vga stuff.  The device looks a lot like
+	  "qemu -device secondary-vga".
+
 config SAMPLE_STATX
 	bool "Build example extended-stat using code"
 	depends on BROKEN
diff --git a/samples/vfio-mdev/Makefile b/samples/vfio-mdev/Makefile
index 7a5790aaec..7db889ca13 100644
--- a/samples/vfio-mdev/Makefile
+++ b/samples/vfio-mdev/Makefile
@@ -1,3 +1,4 @@
 obj-$(CONFIG_SAMPLE_VFIO_MDEV_MTTY) += mtty.o
 obj-$(CONFIG_SAMPLE_VFIO_MDEV_MDPY) += mdpy.o
 obj-$(CONFIG_SAMPLE_VFIO_MDEV_MDPY_FB) += mdpy-fb.o
+obj-$(CONFIG_SAMPLE_VFIO_MDEV_MBOCHS) += mbochs.o
-- 
2.9.3

^ permalink raw reply related	[flat|nested] 41+ messages in thread

* Re: [PATCH 2/3] sample: vfio mdev display - guest driver
  2018-04-09 10:35 ` [PATCH 2/3] sample: vfio mdev display - guest driver Gerd Hoffmann
@ 2018-04-11 20:39   ` Bjorn Helgaas
  2018-04-24  2:51   ` Alex Williamson
  2018-04-25 21:03   ` Konrad Rzeszutek Wilk
  2 siblings, 0 replies; 41+ messages in thread
From: Bjorn Helgaas @ 2018-04-11 20:39 UTC (permalink / raw)
  To: Gerd Hoffmann; +Cc: kvm, alex.williamson, kwankhede, open list

On Mon, Apr 09, 2018 at 12:35:12PM +0200, Gerd Hoffmann wrote:
> Guest fbdev driver for CONFIG_SAMPLE_VFIO_MDEV_MDPY.

> +	dev_info(&pdev->dev, "mdpy found: %dx%d framebuffer\n",
> +		 width, height);

You can now use

  pci_info(pdev, "mdpy found ...")

if it seems worthwhile to you.

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [libvirt] [PATCH 0/3] sample: vfio mdev display devices.
       [not found] <20180409103513.8020-1-kraxel@redhat.com>
                   ` (2 preceding siblings ...)
  2018-04-09 10:35 ` [PATCH 3/3] sample: vfio bochs vbe display (host device for bochs-drm) Gerd Hoffmann
@ 2018-04-18 18:31 ` Alex Williamson
  2018-04-19  8:40   ` Gerd Hoffmann
                     ` (2 more replies)
  3 siblings, 3 replies; 41+ messages in thread
From: Alex Williamson @ 2018-04-18 18:31 UTC (permalink / raw)
  To: Gerd Hoffmann
  Cc: kvm, Erik Skultety, libvirt, Tina Zhang, kwankhede, intel-gvt-dev

On Mon,  9 Apr 2018 12:35:10 +0200
Gerd Hoffmann <kraxel@redhat.com> wrote:

> This little series adds three drivers, for demo-ing and testing vfio
> display interface code.  There is one mdev device for each interface
> type (mdpy.ko for region and mbochs.ko for dmabuf).

Erik Skultety brought up a good question today regarding how libvirt is
meant to handle these different flavors of display interfaces and
knowing whether a given mdev device has display support at all.  It
seems that we cannot simply use the default display=auto because
libvirt needs to specifically configure gl support for a dmabuf type
interface versus not having such a requirement for a region interface,
perhaps even removing the emulated graphics in some cases (though I
don't think we have boot graphics through either solution yet).
Additionally, GVT-g seems to need the x-igd-opregion support
enabled(?), which is a non-starter for libvirt as it's an experimental
option!

Currently the only way to determine display support is through the
VFIO_DEVICE_QUERY_GFX_PLANE ioctl, but for libvirt to probe that on
their own they'd need to get to the point where they could open the
vfio device and perform the ioctl.  That means opening a vfio
container, adding the group, setting the iommu type, and getting the
device.  I was initially a bit appalled at asking libvirt to do that,
but the alternative is to put this information in sysfs, and doing
that risks needing to describe every nuance of the mdev device through
sysfs, turning it into a dumping ground for every possible feature an
mdev device might have.
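
For reference, that open-the-device-and-probe sequence looks roughly
like this in C (a sketch; status checks and error handling omitted,
and the group number and mdev UUID are placeholders):

	/* needs <fcntl.h>, <sys/ioctl.h>, <linux/vfio.h> */
	int container = open("/dev/vfio/vfio", O_RDWR);
	int group = open("/dev/vfio/42", O_RDWR);

	ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
	ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);

	int device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD,
			   "11111111-2222-3333-4444-555555555555");

	struct vfio_device_gfx_plane_info probe = {
		.argsz = sizeof(probe),
		.flags = VFIO_GFX_PLANE_TYPE_PROBE |
			 VFIO_GFX_PLANE_TYPE_DMABUF,
	};
	/* returns 0 if the dmabuf display interface is implemented */
	int has_dmabuf = ioctl(device, VFIO_DEVICE_QUERY_GFX_PLANE,
			       &probe) == 0;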

So I was ready to return and suggest that maybe libvirt should probe
the device to know about these ancillary configuration details, but
then I remembered that both mdev vGPU vendors had external dependencies
to even allow probing the device.  KVMGT will fail to open the device
if it's not associated with an instance of KVM and NVIDIA vGPU, I
believe, will fail if the vGPU manager process cannot find the QEMU
instance to extract the VM UUID.  (Both of these were bad ideas)

Therefore, how can libvirt know if a given mdev device supports a
display and which type of display it supports, and potentially which
vendor specific options might be required to further enable that
display (if they weren't experimental)?  A terrible solution would be
that libvirt hard codes that NVIDIA works with regions and Intel works
with dmabufs, but even then there's a backwards and forwards
compatibility problem: libvirt needs to support older kernels and
drivers where display support is not present, and newer drivers where
perhaps Intel is now doing regions and NVIDIA is supporting dmabuf, so
it cannot simply be assumed based on the vendor.  The only solution I
see down that path would be identifying specific {vendor,type} pairs
that support a predefined display type, but it's absurd to think that
vendors would rev their mdev types to expose this and that libvirt
would keep a database mapping types to features.  We also have the name
and description attributes, but these are currently free form, so
libvirt rightfully ignores them entirely.  I don't know if we could
create a defined feature string within those free form strings.

Otherwise, it seems we have no choice but to dive into the pool of
exposing such features via sysfs, and we'll need to be vigilant about
feature creep and vendor specific features (e.g. we're not adding a
feature to indicate an opregion requirement).  How should we do this?
Perhaps a bar we can set is that if a feature cannot be discovered
through a standard vfio API, then it is not suitable for this sysfs
API.  Such things can be described via our existing mdev vendor
specific attribute interface.

We currently have this sysfs interface:

mdev_supported_types/
|-- $VENDOR_TYPE
|   |-- available_instances
|   |-- create
|   |-- description
|   |-- device_api
|   |-- devices
|   `-- name

ioctls for vfio devices which only provide information include:

VFIO_DEVICE_GET_INFO
VFIO_DEVICE_GET_REGION_INFO
VFIO_DEVICE_GET_IRQ_INFO
VFIO_DEVICE_GET_PCI_HOT_RESET_INFO
VFIO_DEVICE_QUERY_GFX_PLANE

We don't need to support all of these initially, but here's a starting
idea for what this may look like in sysfs:

$VENDOR_TYPE/
|-- available_instances
|-- create
|-- description
|-- device_api
|-- devices
|-- name
`-- vfio-pci
    `-- device
        |-- gfx_plane
        |   |-- dmabuf
        |   `-- region
        |-- irqs
        |   |-- 0
        |   |   |-- count
        |   |   `-- flags
        |   `-- 1
        |       |-- count
        |       `-- flags
        `-- regions
            |-- 0
            |   |-- flags
            |   |-- offset
            |   `-- size
            `-- 3
                |-- flags
                |-- offset
                `-- size

The existing device_api file reports "vfio-pci", so we base the device
API info in a directory named vfio-pci.  We're specifically exposing
device information, so we have a device directory.  We have a GFX_PLANE
query ioctl, so we have a gfx_plane sub-directory.  I imagine the
dmabuf and region files here expose either Y/N or 1/0.  I continue on
the example with how we might expose irqs and regions, but even with
regions we can dig down into how sparse mmap is exposed, how device
specific regions are described, etc.  Filling this in to completion
without a specific userspace need to expose the information is just an
exercise in bloating the kernel.
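
For example, libvirt could then decide between gl and non-gl display
setup simply by checking whether gfx_plane/dmabuf or gfx_plane/region
under the type's vfio-pci/device/ directory reads as 1, without ever
opening the device.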

That almost begins to look reasonable, but then we can only expose this
for mdev devices.  What if we were to hack a back door into a directly
assigned GPU that tracks the location of the active display in the
framebuffer and implement the GFX_PLANE interface for that?  We have no
sysfs representation for either the template or the actual device for
anything other than mdev.  This inconsistency with physically assigned
devices has been one of my arguments against enhancing mdev sysfs.

Thanks to anyone still reading this.  Ideas how we might help libvirt
fill this information void so that they can actually configure a VM
with a display device?  Thanks,

Alex

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [libvirt] [PATCH 0/3] sample: vfio mdev display devices.
  2018-04-18 18:31 ` [libvirt] [PATCH 0/3] sample: vfio mdev display devices Alex Williamson
@ 2018-04-19  8:40   ` Gerd Hoffmann
  2018-04-19 10:03     ` Zhenyu Wang
                       ` (2 more replies)
  2018-04-23 21:40   ` Alex Williamson
  2018-04-26  3:44   ` Tian, Kevin
  2 siblings, 3 replies; 41+ messages in thread
From: Gerd Hoffmann @ 2018-04-19  8:40 UTC (permalink / raw)
  To: Alex Williamson
  Cc: kvm, Erik Skultety, libvirt, Tina Zhang, kwankhede, intel-gvt-dev

  Hi,

> Erik Skultety brought up a good question today regarding how libvirt is
> meant to handle these different flavors of display interfaces and
> knowing whether a given mdev device has display support at all.  It
> seems that we cannot simply use the default display=auto because
> libvirt needs to specifically configure gl support for a dmabuf type
> interface versus not having such a requirement for a region interface,
> perhaps even removing the emulated graphics in some cases (though I
> don't think we have boot graphics through either solution yet).

Correct, no boot graphics yet.  The option to disable emulated graphics
should be added nevertheless.  It's an option, after all; you don't
have to use it.

But after install, things usually work just fine; it just takes a
little longer for the guest display to show up.  There is also the
option to add a serial console to the guest for boot loader access.

> Additionally, GVT-g seems to need the x-igd-opregion support
> enabled(?), which is a non-starter for libvirt as it's an experimental
> option!

Windows guests need it, yes.  And it seems we still have to add igd
opregion support to ovmf, as only bios guests are working.  Or hack up
an efi rom doing that.  But patching ovmf is probably a lot easier
because it already has support code for fw_cfg access.

Linux i915.ko is happy without opregion.

> So I was ready to return and suggest that maybe libvirt should probe
> the device to know about these ancillary configuration details, but
> then I remembered that both mdev vGPU vendors had external dependencies
> to even allow probing the device.  KVMGT will fail to open the device
> if it's not associated with an instance of KVM and NVIDIA vGPU, I
> believe, will fail if the vGPU manager process cannot find the QEMU
> instance to extract the VM UUID.  (Both of these were bad ideas)

Oops.  I've run into the kvm issue too.  Wondering what the reason
is; shouldn't this work with tcg too?

But, yes, that indeed pretty much kills the "just let libvirt use the
probe ioctl" idea.

> The existing device_api file reports "vfio-pci", so we base the device
> API info in a directory named vfio-pci.  We're specifically exposing
> device information, so we have a device directory.  We have a GFX_PLANE
> query ioctl, so we have a gfx_plane sub-directory.  I imagine the
> dmabuf and region files here expose either Y/N or 1/0.

Do we want to tie this to vfio-pci?  All existing devices are actually pci,
and the qemu code only works for vfio-pci devices too.  But at vfio api
level there is no vfio-pci dependency I'm aware of, and I think we
shouldn't add one without a good reason.

Should we maybe just add a gfx_plane_api file?  It would be a
comma-separated list of interfaces, listed in order of preference in
case multiple are supported.
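
A device implementing both, and preferring dmabuf, would report
"dmabuf,region" for example.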

> anything other than mdev.  This inconsistency with physically assigned
> devices has been one of my arguments against enhancing mdev sysfs.
> 
> Thanks to anyone still reading this.  Ideas how we might help libvirt
> fill this information void so that they can actually configure a VM
> with a display device?  Thanks,

Well, no good idea for the physically assigned device case.

cheers,
  Gerd

PS: Any comment on the sample driver patches?  Or should I take the lack
    of comments as "no news is good news, they are queued up already"?

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [libvirt] [PATCH 0/3] sample: vfio mdev display devices.
  2018-04-19  8:40   ` Gerd Hoffmann
@ 2018-04-19 10:03     ` Zhenyu Wang
  2018-04-19 14:20     ` Alex Williamson
  2018-04-19 14:54     ` Paolo Bonzini
  2 siblings, 0 replies; 41+ messages in thread
From: Zhenyu Wang @ 2018-04-19 10:03 UTC (permalink / raw)
  To: Gerd Hoffmann
  Cc: kvm, Erik Skultety, libvirt, kwankhede, Tina Zhang, intel-gvt-dev


On 2018.04.19 10:40:18 +0200, Gerd Hoffmann wrote:
>   Hi,
> 
> > Erik Skultety brought up a good question today regarding how libvirt is
> > meant to handle these different flavors of display interfaces and
> > knowing whether a given mdev device has display support at all.  It
> > seems that we cannot simply use the default display=auto because
> > libvirt needs to specifically configure gl support for a dmabuf type
> > interface versus not having such a requirement for a region interface,
> > perhaps even removing the emulated graphics in some cases (though I
> > don't think we have boot graphics through either solution yet).
> 
> Correct, no boot graphics yet.  The option to disable emulated graphics
> should be added nevertheless.  It's an option, after all; you don't
> have to use it.
> 
> But after install, things usually work just fine; it just takes a
> little longer for the guest display to show up.  There is also the
> option to add a serial console to the guest for boot loader access.
> 
> > Additionally, GVT-g seems to need the x-igd-opregion support
> > enabled(?), which is a non-starter for libvirt as it's an experimental
> > option!
> 
> Windows guests need it, yes.  And it seems we still have to add igd
> opregion support to ovmf, as only bios guests are working.  Or hack up
> an efi rom doing that.  But patching ovmf is probably a lot easier
> because it already has support code for fw_cfg access.
> 
> Linux i915.ko is happy without opregion.
>

yeah, that's true.

> > So I was ready to return and suggest that maybe libvirt should probe
> > the device to know about these ancillary configuration details, but
> > then I remembered that both mdev vGPU vendors had external dependencies
> > to even allow probing the device.  KVMGT will fail to open the device
> > if it's not associated with an instance of KVM and NVIDIA vGPU, I
> > believe, will fail if the vGPU manager process cannot find the QEMU
> > instance to extract the VM UUID.  (Both of these were bad ideas)
> 
> Oops.  I've run into the kvm issue too.  Wondering what the reason
> is; shouldn't this work with tcg too?
> 
> But, yes, that indeed pretty much kills the "just let libvirt use the
> probe ioctl" idea.

I also don't like that strict link; although KVM is currently the only
upstream hypervisor GVT supports, we shouldn't require a live instance
just for device info access.

> 
> > The existing device_api file reports "vfio-pci", so we base the device
> > API info in a directory named vfio-pci.  We're specifically exposing
> > device information, so we have a device directory.  We have a GFX_PLANE
> > query ioctl, so we have a gfx_plane sub-directory.  I imagine the
> > dmabuf and region files here expose either Y/N or 1/0.
> 
> Do we want to tie this to vfio-pci?  All existing devices are actually pci,
> and the qemu code only works for vfio-pci devices too.  But at vfio api
> level there is no vfio-pci dependency I'm aware of, and I think we
> shouldn't add one without a good reason.
> 
> Should we maybe just add a gfx_plane_api file?  It would be a
> comma-separated list of interfaces, listed in order of preference in
> case multiple are supported.

Or a 'feature' file with a defined string list for those capabilities?
That might be easier to extend in the future.

> 
> > anything other than mdev.  This inconsistency with physically assigned
> > devices has been one of my arguments against enhancing mdev sysfs.
> > 
> > Thanks to anyone still reading this.  Ideas how we might help libvirt
> > fill this information void so that they can actually configure a VM
> > with a display device?  Thanks,
> 
> Well, no good idea for the physically assigned device case.
> 
> cheers,
>   Gerd
> 
> PS: Any comment on the sample driver patches?  Or should I take the lack
>     of comments as "no news is good news, they are queued up already"?
> _______________________________________________
> intel-gvt-dev mailing list
> intel-gvt-dev@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gvt-dev

-- 
Open Source Technology Center, Intel ltd.

$gpg --keyserver wwwkeys.pgp.net --recv-keys 4D781827


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [libvirt] [PATCH 0/3] sample: vfio mdev display devices.
  2018-04-19  8:40   ` Gerd Hoffmann
  2018-04-19 10:03     ` Zhenyu Wang
@ 2018-04-19 14:20     ` Alex Williamson
  2018-04-19 14:54     ` Paolo Bonzini
  2 siblings, 0 replies; 41+ messages in thread
From: Alex Williamson @ 2018-04-19 14:20 UTC (permalink / raw)
  To: Gerd Hoffmann
  Cc: kvm, Erik Skultety, libvirt, Tina Zhang, kwankhede, intel-gvt-dev

On Thu, 19 Apr 2018 10:40:18 +0200
Gerd Hoffmann <kraxel@redhat.com> wrote:
> > So I was ready to return and suggest that maybe libvirt should probe
> > the device to know about these ancillary configuration details, but
> > then I remembered that both mdev vGPU vendors had external dependencies
> > to even allow probing the device.  KVMGT will fail to open the device
> > if it's not associated with an instance of KVM and NVIDIA vGPU, I
> > believe, will fail if the vGPU manager process cannot find the QEMU
> > instance to extract the VM UUID.  (Both of these were bad ideas)  
> 
> Oops.  I've run into the kvm issue too.  Wondering what the reason
> is; shouldn't this work with tcg too?

It's used for some sort of page tracking backdoor.  Yes, I think vfio
devices, including mdev, should work with tcg.  Keeping device
assignment from being integrally tied to kvm is something I've strived
for with vfio.

> But, yes, that indeed pretty much kills the "just let libvirt use the
> probe ioctl" idea.
> 
> > The existing device_api file reports "vfio-pci", so we base the device
> > API info in a directory named vfio-pci.  We're specifically exposing
> > device information, so we have a device directory.  We have a GFX_PLANE
> > query ioctl, so we have a gfx_plane sub-directory.  I imagine the
> > dmabuf and region files here expose either Y/N or 1/0.  
> 
> Do we want to tie this to vfio-pci?  All existing devices are actually pci,
> and the qemu code only works for vfio-pci devices too.  But at vfio api
> level there is no vfio-pci dependency I'm aware of, and I think we
> shouldn't add one without a good reason.

The intention was to tie it to 'device_api' which reports 'vfio-pci',
so the user would read the device_api, learn that it uses vfio-pci,
then look for attributes in a vfio-pci sub-directory.  If device_api
reported vfio-ccw, they'd look for a vfio-ccw directory.

> Should we maybe just add a gfx_plane_api file?  It would be a
> comma-separated list of interfaces, listed in order of preference in
> case multiple are supported.

I'm afraid that as soon as we get away from a strict representation of
the vfio API, we're going to see feature creep with such a solution.
E.g. which hw encoders are supported, frame rate limiters, number of
heads, etc.

> > anything other than mdev.  This inconsistency with physically assigned
> > devices has been one of my arguments against enhancing mdev sysfs.
> > 
> > Thanks to anyone still reading this.  Ideas how we might help libvirt
> > fill this information void so that they can actually configure a VM
> > with a display device?  Thanks,  
> 
> Well, no good idea for the physically assigned device case.

Minimally, I think anything we decide needs to be placed into the
instantiated device sysfs hierarchy rather than the template directory
for a given mdev type; otherwise we have no hope of supporting it with
physical devices.

> PS: Any comment on the sample driver patches?  Or should I take the lack
>     of comments as "no news is good news, they are queued up already"?

I do not have them queued yet; I'll take a closer look at them shortly
and let you know if I find any issues.  Thanks for doing these!  I think
they'll be very helpful, especially for the task above to provide
reference implementations for whatever API exposure we design.  Thanks,

Alex

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [libvirt] [PATCH 0/3] sample: vfio mdev display devices.
  2018-04-19  8:40   ` Gerd Hoffmann
  2018-04-19 10:03     ` Zhenyu Wang
  2018-04-19 14:20     ` Alex Williamson
@ 2018-04-19 14:54     ` Paolo Bonzini
  2 siblings, 0 replies; 41+ messages in thread
From: Paolo Bonzini @ 2018-04-19 14:54 UTC (permalink / raw)
  To: Gerd Hoffmann, Alex Williamson
  Cc: kvm, Erik Skultety, libvirt, Tina Zhang, kwankhede, intel-gvt-dev

On 19/04/2018 10:40, Gerd Hoffmann wrote:
>> So I was ready to return and suggest that maybe libvirt should probe
>> the device to know about these ancillary configuration details, but
>> then I remembered that both mdev vGPU vendors had external dependencies
>> to even allow probing the device.  KVMGT will fail to open the device
>> if it's not associated with an instance of KVM and NVIDIA vGPU, I
>> believe, will fail if the vGPU manager process cannot find the QEMU
>> instance to extract the VM UUID.  (Both of these were bad ideas)
> Oops.  I've run into the kvm issue too.  Wondering what the reason
> is; shouldn't this work with tcg too?

As far as I understand, KVMGT requires KVM support in order to track
writes to guest memory.  It's a kernel API provided by the kvm.ko
module, so no TCG support.
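
For reference, that's the kvm_page_track notifier API; a consumer
hooks it roughly like this (heavily simplified, names made up):

	#include <asm/kvm_page_track.h>

	static void gvt_track_write(struct kvm_vcpu *vcpu, gpa_t gpa,
				    const u8 *val, int len,
				    struct kvm_page_track_notifier_node *node)
	{
		/* re-shadow the guest page table page touched by this write */
	}

	static struct kvm_page_track_notifier_node gvt_node = {
		.track_write = gvt_track_write,
	};

	static void gvt_start_tracking(struct kvm *kvm,
				       struct kvm_memory_slot *slot,
				       gfn_t gfn)
	{
		kvm_page_track_register_notifier(kvm, &gvt_node);
		/* write-protect the gfn, get notified on guest writes */
		kvm_slot_page_track_add_page(kvm, slot, gfn,
					     KVM_PAGE_TRACK_WRITE);
	}

Everything hangs off a struct kvm instance, which is exactly the
dependency being discussed.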

Paolo

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [libvirt] [PATCH 0/3] sample: vfio mdev display devices.
  2018-04-18 18:31 ` [libvirt] [PATCH 0/3] sample: vfio mdev display devices Alex Williamson
  2018-04-19  8:40   ` Gerd Hoffmann
@ 2018-04-23 21:40   ` Alex Williamson
  2018-04-24  7:17     ` Gerd Hoffmann
  2018-04-24 19:50     ` Kirti Wankhede
  2018-04-26  3:44   ` Tian, Kevin
  2 siblings, 2 replies; 41+ messages in thread
From: Alex Williamson @ 2018-04-23 21:40 UTC (permalink / raw)
  To: Gerd Hoffmann
  Cc: kvm, Erik Skultety, libvirt, Tina Zhang, kwankhede, intel-gvt-dev

On Wed, 18 Apr 2018 12:31:53 -0600
Alex Williamson <alex.williamson@redhat.com> wrote:

> On Mon,  9 Apr 2018 12:35:10 +0200
> Gerd Hoffmann <kraxel@redhat.com> wrote:
> 
> > This little series adds three drivers, for demo-ing and testing vfio
> > display interface code.  There is one mdev device for each interface
> > type (mdpy.ko for region and mbochs.ko for dmabuf).  
> 
> Erik Skultety brought up a good question today regarding how libvirt is
> meant to handle these different flavors of display interfaces and
> knowing whether a given mdev device has display support at all.  It
> seems that we cannot simply use the default display=auto because
> libvirt needs to specifically configure gl support for a dmabuf type
> interface versus not having such a requirement for a region interface,
> perhaps even removing the emulated graphics in some cases (though I
> don't think we have boot graphics through either solution yet).
> Additionally, GVT-g seems to need the x-igd-opregion support
> enabled(?), which is a non-starter for libvirt as it's an experimental
> option!
> 
> Currently the only way to determine display support is through the
> VFIO_DEVICE_QUERY_GFX_PLANE ioctl, but for libvirt to probe that on
> their own they'd need to get to the point where they could open the
> vfio device and perform the ioctl.  That means opening a vfio
> container, adding the group, setting the iommu type, and getting the
> device.  I was initially a bit appalled at asking libvirt to do that,
> but the alternative is to put this information in sysfs, and doing
> that risks needing to describe every nuance of the mdev device through
> sysfs, turning it into a dumping ground for every possible feature an
> mdev device might have.
> 
> So I was ready to return and suggest that maybe libvirt should probe
> the device to know about these ancillary configuration details, but
> then I remembered that both mdev vGPU vendors had external dependencies
> to even allow probing the device.  KVMGT will fail to open the device
> if it's not associated with an instance of KVM and NVIDIA vGPU, I
> believe, will fail if the vGPU manager process cannot find the QEMU
> instance to extract the VM UUID.  (Both of these were bad ideas)

Here's another proposal that's really growing on me:

 * Fix the vendor drivers!  Allow devices to be opened and probed
   without these external dependencies.
 * Libvirt uses the existing vfio API to open the device and probe the
   necessary ioctls, if it can't probe the device, the feature is
   unavailable, ie. display=off, no migration.

I'm really having a hard time getting behind inventing a secondary API
just to work around arbitrary requirements from mdev vendor drivers.
vfio was never intended to be locked to QEMU or KVM; these two vendor
drivers are the only examples of such requirements, and we're only
encouraging this behavior if we add a redundant API for device
probing.  Any solution on the table currently would require changes to
the mdev vendor drivers, so why not this change?  Please defend why
each driver needs these external dependencies and why the device open
callback is the best, or only, place in the stack to enforce that
dependency.  Let's see what we're really dealing with here.  Thanks,

Alex

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 1/3] sample: vfio mdev display - host device
  2018-04-09 10:35 ` [PATCH 1/3] sample: vfio mdev display - host device Gerd Hoffmann
@ 2018-04-24  2:41   ` Alex Williamson
  2018-04-24  6:29     ` Gerd Hoffmann
  0 siblings, 1 reply; 41+ messages in thread
From: Alex Williamson @ 2018-04-24  2:41 UTC (permalink / raw)
  To: Gerd Hoffmann; +Cc: kvm, kwankhede, open list

On Mon,  9 Apr 2018 12:35:11 +0200
Gerd Hoffmann <kraxel@redhat.com> wrote:

> Simple framebuffer display, demo-ing the vfio region display interface
> (VFIO_GFX_PLANE_TYPE_REGION).
> 
> Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
> ---
>  samples/vfio-mdev/mdpy-defs.h |  19 +
>  samples/vfio-mdev/mdpy.c      | 791 ++++++++++++++++++++++++++++++++++++++++++
>  samples/Kconfig               |   8 +
>  samples/vfio-mdev/Makefile    |   1 +
>  4 files changed, 819 insertions(+)
>  create mode 100644 samples/vfio-mdev/mdpy-defs.h
>  create mode 100644 samples/vfio-mdev/mdpy.c
> 
> diff --git a/samples/vfio-mdev/mdpy-defs.h b/samples/vfio-mdev/mdpy-defs.h
> new file mode 100644
> index 0000000000..79f0795e11
> --- /dev/null
> +++ b/samples/vfio-mdev/mdpy-defs.h
> @@ -0,0 +1,19 @@
> +/*
> + * Simple pci display device.

checkpatch wants an SPDX license identifier here now and on mdpy.c
below.
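
I.e. a first line along the lines of:

	/* SPDX-License-Identifier: GPL-2.0 */

(or the // comment form for the .c file).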

> + *
> + * Framebuffer memory is pci bar 0.
> + * Configuration (read-only) is in pci config space.
> + * Format field uses drm fourcc codes.
> + * ATM only DRM_FORMAT_XRGB8888 is supported.
> + */
> +
> +/* pci ids */
> +#define MDPY_PCI_VENDOR_ID	0x1b36 /* redhat */
> +#define MDPY_PCI_DEVICE_ID	0x00f0

I don't see this on pci-ids, so I assume we're just squatting on an
ID.  How do we do that without risking interference with some future
user?  Are we relying on this being a non-default sample device?
Should we just ask for an allocation?

> +#define MDPY_PCI_SUBVENDOR_ID	PCI_SUBVENDOR_ID_REDHAT_QUMRANET
> +#define MDPY_PCI_SUBDEVICE_ID	PCI_SUBDEVICE_ID_QEMU
> +
> +/* pci cfg space offsets for fb config (dword) */
> +#define MDPY_FORMAT_OFFSET	0x40
> +#define MDPY_WIDTH_OFFSET	0x44
> +#define MDPY_HEIGHT_OFFSET	0x48

As I understand it, these are just registers in PCI config space
outside of any capabilities.  Wouldn't it be more correct to put these
within a vendor defined capability?  The only imposed structure of a
vendor capability is that the byte after the next field is a length
field.  So you'd have 34h point to a capability starting address, 40h
if you like; the first byte there would be 09h to identify a vendor
capability, the next byte would be 00h to terminate the capability
chain, and the next byte is required to be the length.  You're using 3
dwords for 12 bytes, plus 4 bytes for the alignment and header, so
10h.  The format, width, and height are then the next three dwords,
and only their offsets within the vendor capability are hard coded.
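
Concretely, something like this would do (untested sketch; it reuses
mdpy's STORE_LE32 helper and squats on 40h as suggested above):

	#define MDPY_VENDOR_CAP_OFFSET	0x40
	#define MDPY_VENDOR_CAP_SIZE	0x10	/* header + pad + 3 dwords */

	static void mdpy_setup_vendor_cap(struct mdev_state *mdev_state)
	{
		u8 *cfg = mdev_state->vconfig;

		cfg[PCI_STATUS] |= PCI_STATUS_CAP_LIST;
		cfg[PCI_CAPABILITY_LIST] = MDPY_VENDOR_CAP_OFFSET;

		cfg[MDPY_VENDOR_CAP_OFFSET + 0] = PCI_CAP_ID_VNDR; /* 09h */
		cfg[MDPY_VENDOR_CAP_OFFSET + 1] = 0; /* end of chain */
		cfg[MDPY_VENDOR_CAP_OFFSET + 2] = MDPY_VENDOR_CAP_SIZE;

		STORE_LE32((u32 *) &cfg[MDPY_VENDOR_CAP_OFFSET + 0x4],
			   mdev_state->type->format);
		STORE_LE32((u32 *) &cfg[MDPY_VENDOR_CAP_OFFSET + 0x8],
			   mdev_state->type->width);
		STORE_LE32((u32 *) &cfg[MDPY_VENDOR_CAP_OFFSET + 0xc],
			   mdev_state->type->height);
	}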

If you're feeling motivated, all PCI devices technically also require a
PM capability, but I'm not going to be a stickler for a sample driver.

> diff --git a/samples/vfio-mdev/mdpy.c b/samples/vfio-mdev/mdpy.c
> new file mode 100644
> index 0000000000..25882c7b37
> --- /dev/null
> +++ b/samples/vfio-mdev/mdpy.c
> @@ -0,0 +1,791 @@
> +/*
> + * Mediated virtual PCI display host device driver
> + *
> + * See mdpy-defs.h for device specs
> + *
> + *   (c) Gerd Hoffmann <kraxel@redhat.com>
> + *
> + * based on mtty driver which is:
> + *   Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
> + *	 Author: Neo Jia <cjia@nvidia.com>
> + *		 Kirti Wankhede <kwankhede@nvidia.com>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + */
> +#include <linux/init.h>
> +#include <linux/module.h>
> +#include <linux/device.h>
> +#include <linux/kernel.h>
> +#include <linux/slab.h>
> +#include <linux/vmalloc.h>
> +#include <linux/cdev.h>
> +#include <linux/vfio.h>
> +#include <linux/iommu.h>
> +#include <linux/sysfs.h>
> +#include <linux/mdev.h>
> +#include <linux/pci.h>
> +#include <drm/drm_fourcc.h>
> +#include "mdpy-defs.h"
> +
> +#define MDPY_NAME		"mdpy"
> +#define MDPY_CLASS_NAME		"mdpy"
> +
> +#define MDPY_CONFIG_SPACE_SIZE	0xff
> +#define MDPY_MEMORY_BAR_OFFSET	PAGE_SIZE
> +#define MDPY_DISPLAY_REGION	16
> +
> +#define STORE_LE16(addr, val)	(*(u16 *)addr = val)
> +#define STORE_LE32(addr, val)	(*(u32 *)addr = val)
> +
> +
> +MODULE_LICENSE("GPL v2");
> +
> +static int max_devices = 4;
> +module_param_named(count, max_devices, int, 0444);
> +MODULE_PARM_DESC(count, "number of " MDPY_NAME " devices");
> +
> +
> +#define MDPY_TYPE_1 "vga"
> +#define MDPY_TYPE_2 "xga"
> +#define MDPY_TYPE_3 "hd"
> +
> +static const struct mdpy_type {
> +	const char *name;
> +	u32 format;
> +	u32 bytepp;
> +	u32 width;
> +	u32 height;
> +} mdpy_types[] = {
> +	{
> +		.name	= MDPY_CLASS_NAME "-" MDPY_TYPE_1,
> +		.format = DRM_FORMAT_XRGB8888,
> +		.bytepp = 4,
> +		.width	= 640,
> +		.height = 480,
> +	},{

Here and below checkpatch throws an error for no space after the
comma.  Seems more of a nit in this context, but easy enough to make it
happy.

> +		.name	= MDPY_CLASS_NAME "-" MDPY_TYPE_2,
> +		.format = DRM_FORMAT_XRGB8888,
> +		.bytepp = 4,
> +		.width	= 1024,
> +		.height = 768,
> +	},{
> +		.name	= MDPY_CLASS_NAME "-" MDPY_TYPE_3,
> +		.format = DRM_FORMAT_XRGB8888,
> +		.bytepp = 4,
> +		.width	= 1920,
> +		.height = 1080,
> +	},
> +};
> +
> +static dev_t		mdpy_devt;
> +static struct class	*mdpy_class;
> +static struct cdev	mdpy_cdev;
> +static struct device	mdpy_dev;
> +static u32		mdpy_count;
> +
> +/* State of each mdev device */
> +struct mdev_state {
> +	u8 *vconfig;
> +	u32 bar_mask;
> +	struct mutex ops_lock;
> +	struct mdev_device *mdev;
> +	struct vfio_device_info dev_info;
> +
> +	const struct mdpy_type *type;
> +	u32 memsize;
> +	void *memblk;
> +};
> +
> +static const struct mdpy_type *mdpy_find_type(struct kobject *kobj)
> +{
> +	int i;
> +
> +	for (i = 0; i < ARRAY_SIZE(mdpy_types); i++)
> +		if (strcmp(mdpy_types[i].name, kobj->name) == 0)
> +			return mdpy_types + i;
> +	return NULL;
> +}
> +
> +static void mdpy_create_config_space(struct mdev_state *mdev_state)
> +{
> +	STORE_LE16((u16 *) &mdev_state->vconfig[PCI_VENDOR_ID],
> +		   MDPY_PCI_VENDOR_ID);
> +	STORE_LE16((u16 *) &mdev_state->vconfig[PCI_DEVICE_ID],
> +		   MDPY_PCI_DEVICE_ID);
> +	STORE_LE16((u16 *) &mdev_state->vconfig[PCI_SUBSYSTEM_VENDOR_ID],
> +		   MDPY_PCI_SUBVENDOR_ID);
> +	STORE_LE16((u16 *) &mdev_state->vconfig[PCI_SUBSYSTEM_ID],
> +		   MDPY_PCI_SUBDEVICE_ID);
> +
> +	STORE_LE16((u16 *) &mdev_state->vconfig[PCI_COMMAND],
> +		   PCI_COMMAND_IO | PCI_COMMAND_MEMORY);
> +	STORE_LE16((u16 *) &mdev_state->vconfig[PCI_CLASS_DEVICE],
> +		   PCI_CLASS_DISPLAY_OTHER);
> +	mdev_state->vconfig[PCI_CLASS_REVISION] =  0x01;
> +
> +	STORE_LE32((u32 *) &mdev_state->vconfig[PCI_BASE_ADDRESS_0],
> +		   PCI_BASE_ADDRESS_SPACE_MEMORY |
> +		   PCI_BASE_ADDRESS_MEM_TYPE_32	 |
> +		   PCI_BASE_ADDRESS_MEM_PREFETCH);
> +	mdev_state->bar_mask = ~(mdev_state->memsize) + 1;
> +
> +	/* Vendor specific data */
> +	STORE_LE32((u32 *) &mdev_state->vconfig[MDPY_FORMAT_OFFSET],
> +		   mdev_state->type->format);
> +	STORE_LE32((u32 *) &mdev_state->vconfig[MDPY_WIDTH_OFFSET],
> +		   mdev_state->type->width);
> +	STORE_LE32((u32 *) &mdev_state->vconfig[MDPY_HEIGHT_OFFSET],
> +		   mdev_state->type->height);
> +}
> +
> +static void handle_pci_cfg_write(struct mdev_state *mdev_state, u16 offset,
> +				 char *buf, u32 count)
> +{
> +	struct device *dev = mdev_dev(mdev_state->mdev);
> +	u32 cfg_addr;
> +
> +	switch (offset) {
> +	case PCI_BASE_ADDRESS_0:
> +		cfg_addr = *(u32 *)buf;
> +
> +		if (cfg_addr == 0xffffffff) {
> +			cfg_addr = (cfg_addr & mdev_state->bar_mask);
> +		} else {
> +			cfg_addr &= PCI_BASE_ADDRESS_MEM_MASK;
> +			if (cfg_addr)
> +				dev_info(dev, "BAR0 @ 0x%x\n", cfg_addr);
> +		}
> +
> +		cfg_addr |= (mdev_state->vconfig[offset] &
> +			     ~PCI_BASE_ADDRESS_MEM_MASK);
> +		STORE_LE32(&mdev_state->vconfig[offset], cfg_addr);
> +		break;
> +	}
> +}
> +
> +static ssize_t mdev_access(struct mdev_device *mdev, char *buf, size_t count,
> +			   loff_t pos, bool is_write)
> +{
> +	struct mdev_state *mdev_state = mdev_get_drvdata(mdev);
> +	struct device *dev = mdev_dev(mdev);
> +	int ret = 0;
> +
> +	mutex_lock(&mdev_state->ops_lock);
> +
> +	if (pos < MDPY_CONFIG_SPACE_SIZE) {
> +		if (is_write) {
> +			handle_pci_cfg_write(mdev_state, pos, buf, count);
> +		} else {
> +			memcpy(buf, (mdev_state->vconfig + pos), count);
> +		}
> +
> +	} else {
> +		dev_info(dev, "%s: %s @0x%llx (unhandled)\n",
> +			 __func__, is_write ? "WR" : "RD", pos);
> +		ret = -1;
> +		goto accessfailed;
> +	}
> +
> +	ret = count;
> +
> +
> +accessfailed:
> +	mutex_unlock(&mdev_state->ops_lock);
> +
> +	return ret;
> +}
> +
> +int mdpy_reset(struct mdev_device *mdev)
> +{
> +	struct mdev_state *mdev_state = mdev_get_drvdata(mdev);
> +	u32 stride, i;
> +
> +	/* initialize with gray gradient */
> +	stride = mdev_state->type->width * mdev_state->type->bytepp;
> +	for (i = 0; i < mdev_state->type->height; i++)
> +		memset(mdev_state->memblk + i * stride,
> +		       i * 255 / mdev_state->type->height,
> +		       stride);
> +	return 0;
> +}
> +
> +int mdpy_create(struct kobject *kobj, struct mdev_device *mdev)
> +{
> +	const struct mdpy_type *type = mdpy_find_type(kobj);
> +	struct device *dev = mdev_dev(mdev);
> +	struct mdev_state *mdev_state;
> +	u32 fbsize;
> +
> +	if (mdpy_count >= max_devices)
> +		return -ENOMEM;
> +
> +	mdev_state = kzalloc(sizeof(struct mdev_state), GFP_KERNEL);
> +	if (mdev_state == NULL)
> +		return -ENOMEM;
> +
> +	mdev_state->vconfig = kzalloc(MDPY_CONFIG_SPACE_SIZE, GFP_KERNEL);
> +	if (mdev_state->vconfig == NULL) {
> +		kfree(mdev_state);
> +		return -ENOMEM;
> +	}
> +
> +	if (!type)
> +		type = &mdpy_types[0];
> +	fbsize = roundup_pow_of_two(type->width * type->height * type->bytepp);
> +
> +	mdev_state->memblk = vmalloc_user(fbsize);
> +	if (!mdev_state->memblk) {
> +		kfree(mdev_state->vconfig);
> +		kfree(mdev_state);
> +		return -ENOMEM;
> +	}
> +	dev_info(dev, "%s: %s (%dx%d)\n",
> +		 __func__, kobj->name, type->width, type->height);
> +
> +	mutex_init(&mdev_state->ops_lock);
> +	mdev_state->mdev = mdev;
> +	mdev_set_drvdata(mdev, mdev_state);
> +
> +	mdev_state->type    = type;
> +	mdev_state->memsize = fbsize;
> +	mdpy_create_config_space(mdev_state);
> +	mdpy_reset(mdev);
> +
> +	mdpy_count++;
> +	return 0;
> +}
> +
> +int mdpy_remove(struct mdev_device *mdev)
> +{
> +	struct mdev_state *mdev_state = mdev_get_drvdata(mdev);
> +	struct device *dev = mdev_dev(mdev);
> +
> +	dev_info(dev, "%s\n", __func__);
> +
> +	mdev_set_drvdata(mdev, NULL);
> +	vfree(mdev_state->memblk);
> +	kfree(mdev_state->vconfig);
> +	kfree(mdev_state);
> +
> +	mdpy_count--;
> +	return 0;
> +}
> +
> +ssize_t mdpy_read(struct mdev_device *mdev, char __user *buf, size_t count,
> +		  loff_t *ppos)
> +{
> +	unsigned int done = 0;
> +	int ret;
> +
> +	while (count) {
> +		size_t filled;
> +
> +		if (count >= 4 && !(*ppos % 4)) {
> +			u32 val;
> +
> +			ret =  mdev_access(mdev, (char *)&val, sizeof(val),
> +					   *ppos, false);
> +			if (ret <= 0)
> +				goto read_err;
> +
> +			if (copy_to_user(buf, &val, sizeof(val)))
> +				goto read_err;
> +
> +			filled = 4;
> +		} else if (count >= 2 && !(*ppos % 2)) {
> +			u16 val;
> +
> +			ret = mdev_access(mdev, (char *)&val, sizeof(val),
> +					  *ppos, false);
> +			if (ret <= 0)
> +				goto read_err;
> +
> +			if (copy_to_user(buf, &val, sizeof(val)))
> +				goto read_err;
> +
> +			filled = 2;
> +		} else {
> +			u8 val;
> +
> +			ret = mdev_access(mdev, (char *)&val, sizeof(val),
> +					  *ppos, false);
> +			if (ret <= 0)
> +				goto read_err;
> +
> +			if (copy_to_user(buf, &val, sizeof(val)))
> +				goto read_err;
> +
> +			filled = 1;
> +		}
> +
> +		count -= filled;
> +		done += filled;
> +		*ppos += filled;
> +		buf += filled;
> +	}
> +
> +	return done;
> +
> +read_err:
> +	return -EFAULT;
> +}
> +
> +ssize_t mdpy_write(struct mdev_device *mdev, const char __user *buf,
> +		   size_t count, loff_t *ppos)
> +{
> +	unsigned int done = 0;
> +	int ret;
> +
> +	while (count) {
> +		size_t filled;
> +
> +		if (count >= 4 && !(*ppos % 4)) {
> +			u32 val;
> +
> +			if (copy_from_user(&val, buf, sizeof(val)))
> +				goto write_err;
> +
> +			ret = mdev_access(mdev, (char *)&val, sizeof(val),
> +					  *ppos, true);
> +			if (ret <= 0)
> +				goto write_err;
> +
> +			filled = 4;
> +		} else if (count >= 2 && !(*ppos % 2)) {
> +			u16 val;
> +
> +			if (copy_from_user(&val, buf, sizeof(val)))
> +				goto write_err;
> +
> +			ret = mdev_access(mdev, (char *)&val, sizeof(val),
> +					  *ppos, true);
> +			if (ret <= 0)
> +				goto write_err;
> +
> +			filled = 2;
> +		} else {
> +			u8 val;
> +
> +			if (copy_from_user(&val, buf, sizeof(val)))
> +				goto write_err;
> +
> +			ret = mdev_access(mdev, (char *)&val, sizeof(val),
> +					  *ppos, true);
> +			if (ret <= 0)
> +				goto write_err;
> +
> +			filled = 1;
> +		}
> +		count -= filled;
> +		done += filled;
> +		*ppos += filled;
> +		buf += filled;
> +	}
> +
> +	return done;
> +write_err:
> +	return -EFAULT;
> +}
> +
> +int mdpy_mmap(struct mdev_device *mdev, struct vm_area_struct *vma)
> +{
> +	struct mdev_state *mdev_state = mdev_get_drvdata(mdev);
> +
> +	if (vma->vm_pgoff != MDPY_MEMORY_BAR_OFFSET >> PAGE_SHIFT)
> +		return -EINVAL;
> +	if (vma->vm_end < vma->vm_start)
> +		return -EINVAL;
> +	if (vma->vm_end - vma->vm_start > mdev_state->memsize)
> +		return -EINVAL;
> +	if ((vma->vm_flags & VM_SHARED) == 0)
> +		return -EINVAL;
> +
> +	return remap_vmalloc_range_partial(vma, vma->vm_start,
> +					   mdev_state->memblk,
> +					   vma->vm_end - vma->vm_start);
> +}
> +
> +int mdpy_get_region_info(struct mdev_device *mdev,
> +			 struct vfio_region_info *region_info,
> +			 u16 *cap_type_id, void **cap_type)
> +{
> +	struct mdev_state *mdev_state;
> +
> +	mdev_state = mdev_get_drvdata(mdev);
> +	if (!mdev_state)
> +		return -EINVAL;
> +
> +	if (region_info->index >= VFIO_PCI_NUM_REGIONS &&
> +	    region_info->index != MDPY_DISPLAY_REGION)
> +		return -EINVAL;
> +
> +	switch (region_info->index) {
> +	case VFIO_PCI_CONFIG_REGION_INDEX:
> +		region_info->offset = 0;
> +		region_info->size   = MDPY_CONFIG_SPACE_SIZE;
> +		region_info->flags  = (VFIO_REGION_INFO_FLAG_READ |
> +				       VFIO_REGION_INFO_FLAG_WRITE);
> +		break;
> +	case VFIO_PCI_BAR0_REGION_INDEX:
> +	case MDPY_DISPLAY_REGION:
> +		region_info->offset = MDPY_MEMORY_BAR_OFFSET;

That's pretty clever; I'd never considered that two regions could point
to the same device file offset.

> +		region_info->size   = mdev_state->memsize;
> +		region_info->flags  = (VFIO_REGION_INFO_FLAG_READ  |
> +				       VFIO_REGION_INFO_FLAG_WRITE |
> +				       VFIO_REGION_INFO_FLAG_MMAP);

This doesn't appear to be true: the read and write functions call the
access function, which only handles the config space region.  Are these
really mmap-only regions?  read/write access support is often useful
for tracing and debugging, and QEMU will break if x-no-mmap=on is used.
Unfortunately I didn't really consider mmap-only regions for the flags,
so there's no way to specify the access permissions for such a region
other than also indicating that it supports read(2), write(2).
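
Supporting read(2)/write(2) there would only take an extra branch in
mdev_access() routing bar accesses into the vmalloc'd framebuffer,
along these lines (untested):

	} else if (pos >= MDPY_MEMORY_BAR_OFFSET &&
		   pos + count <=
		   MDPY_MEMORY_BAR_OFFSET + mdev_state->memsize) {
		pos -= MDPY_MEMORY_BAR_OFFSET;
		if (is_write)
			memcpy(mdev_state->memblk + pos, buf, count);
		else
			memcpy(buf, mdev_state->memblk + pos, count);
	}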

Thanks,
Alex


> +		break;
> +	default:
> +		region_info->size   = 0;
> +		region_info->offset = 0;
> +		region_info->flags  = 0;
> +	}
> +
> +	return 0;
> +}
> +
> +int mdpy_get_irq_info(struct mdev_device *mdev, struct vfio_irq_info *irq_info)
> +{
> +	irq_info->count = 0;
> +	return 0;
> +}
> +
> +int mdpy_get_device_info(struct mdev_device *mdev,
> +			 struct vfio_device_info *dev_info)
> +{
> +	dev_info->flags = VFIO_DEVICE_FLAGS_PCI;
> +	dev_info->num_regions = VFIO_PCI_NUM_REGIONS;
> +	dev_info->num_irqs = VFIO_PCI_NUM_IRQS;
> +	return 0;
> +}
> +
> +int mdpy_query_gfx_plane(struct mdev_device *mdev,
> +			 struct vfio_device_gfx_plane_info *plane)
> +{
> +	struct mdev_state *mdev_state = mdev_get_drvdata(mdev);
> +
> +	if (plane->flags & VFIO_GFX_PLANE_TYPE_PROBE) {
> +		if (plane->flags == (VFIO_GFX_PLANE_TYPE_PROBE |
> +				     VFIO_GFX_PLANE_TYPE_REGION))
> +			return 0;
> +		return -EINVAL;
> +	}
> +
> +	if (plane->flags != VFIO_GFX_PLANE_TYPE_REGION)
> +		return -EINVAL;
> +
> +	plane->drm_format     = mdev_state->type->format;
> +	plane->width	      = mdev_state->type->width;
> +	plane->height	      = mdev_state->type->height;
> +	plane->stride	      = (mdev_state->type->width *
> +				 mdev_state->type->bytepp);
> +	plane->size	      = mdev_state->memsize;
> +	plane->region_index   = MDPY_DISPLAY_REGION;
> +
> +	/* unused */
> +	plane->drm_format_mod = 0;
> +	plane->x_pos	      = 0;
> +	plane->y_pos	      = 0;
> +	plane->x_hot	      = 0;
> +	plane->y_hot	      = 0;
> +
> +	return 0;
> +}
> +
> +static long mdpy_ioctl(struct mdev_device *mdev, unsigned int cmd,
> +			unsigned long arg)
> +{
> +	int ret = 0;
> +	unsigned long minsz;
> +	struct mdev_state *mdev_state;
> +
> +	mdev_state = mdev_get_drvdata(mdev);
> +
> +	switch (cmd) {
> +	case VFIO_DEVICE_GET_INFO:
> +	{
> +		struct vfio_device_info info;
> +
> +		minsz = offsetofend(struct vfio_device_info, num_irqs);
> +
> +		if (copy_from_user(&info, (void __user *)arg, minsz))
> +			return -EFAULT;
> +
> +		if (info.argsz < minsz)
> +			return -EINVAL;
> +
> +		ret = mdpy_get_device_info(mdev, &info);
> +		if (ret)
> +			return ret;
> +
> +		memcpy(&mdev_state->dev_info, &info, sizeof(info));
> +
> +		if (copy_to_user((void __user *)arg, &info, minsz))
> +			return -EFAULT;
> +
> +		return 0;
> +	}
> +	case VFIO_DEVICE_GET_REGION_INFO:
> +	{
> +		struct vfio_region_info info;
> +		u16 cap_type_id = 0;
> +		void *cap_type = NULL;
> +
> +		minsz = offsetofend(struct vfio_region_info, offset);
> +
> +		if (copy_from_user(&info, (void __user *)arg, minsz))
> +			return -EFAULT;
> +
> +		if (info.argsz < minsz)
> +			return -EINVAL;
> +
> +		ret = mdpy_get_region_info(mdev, &info, &cap_type_id,
> +					   &cap_type);
> +		if (ret)
> +			return ret;
> +
> +		if (copy_to_user((void __user *)arg, &info, minsz))
> +			return -EFAULT;
> +
> +		return 0;
> +	}
> +
> +	case VFIO_DEVICE_GET_IRQ_INFO:
> +	{
> +		struct vfio_irq_info info;
> +
> +		minsz = offsetofend(struct vfio_irq_info, count);
> +
> +		if (copy_from_user(&info, (void __user *)arg, minsz))
> +			return -EFAULT;
> +
> +		if ((info.argsz < minsz) ||
> +		    (info.index >= mdev_state->dev_info.num_irqs))
> +			return -EINVAL;
> +
> +		ret = mdpy_get_irq_info(mdev, &info);
> +		if (ret)
> +			return ret;
> +
> +		if (copy_to_user((void __user *)arg, &info, minsz))
> +			return -EFAULT;
> +
> +		return 0;
> +	}
> +
> +	case VFIO_DEVICE_QUERY_GFX_PLANE:
> +	{
> +		struct vfio_device_gfx_plane_info plane;
> +
> +		minsz = offsetofend(struct vfio_device_gfx_plane_info,
> +				    region_index);
> +
> +		if (copy_from_user(&plane, (void __user *)arg, minsz))
> +			return -EFAULT;
> +
> +		if (plane.argsz < minsz)
> +			return -EINVAL;
> +
> +		ret = mdpy_query_gfx_plane(mdev, &plane);
> +		if (ret)
> +			return ret;
> +
> +		if (copy_to_user((void __user *)arg, &plane, minsz))
> +			return -EFAULT;
> +
> +		return 0;
> +	}
> +
> +	case VFIO_DEVICE_SET_IRQS:
> +		return -EINVAL;
> +
> +	case VFIO_DEVICE_RESET:
> +		return mdpy_reset(mdev);
> +	}
> +	return -ENOTTY;
> +}
> +
> +int mdpy_open(struct mdev_device *mdev)
> +{
> +	if (!try_module_get(THIS_MODULE))
> +		return -ENODEV;
> +
> +	return 0;
> +}
> +
> +void mdpy_close(struct mdev_device *mdev)
> +{
> +	module_put(THIS_MODULE);
> +}
> +
> +static ssize_t
> +resolution_show(struct device *dev, struct device_attribute *attr,
> +		char *buf)
> +{
> +	struct mdev_device *mdev = mdev_from_dev(dev);
> +	struct mdev_state *mdev_state = mdev_get_drvdata(mdev);
> +
> +	return sprintf(buf, "%dx%d\n",
> +		       mdev_state->type->width,
> +		       mdev_state->type->height);
> +}
> +static DEVICE_ATTR_RO(resolution);
> +
> +static struct attribute *mdev_dev_attrs[] = {
> +	&dev_attr_resolution.attr,
> +	NULL,
> +};
> +
> +static const struct attribute_group mdev_dev_group = {
> +	.name  = "vendor",
> +	.attrs = mdev_dev_attrs,
> +};
> +
> +const struct attribute_group *mdev_dev_groups[] = {
> +	&mdev_dev_group,
> +	NULL,
> +};
> +
> +static ssize_t
> +name_show(struct kobject *kobj, struct device *dev, char *buf)
> +{
> +	return sprintf(buf, "%s\n", kobj->name);
> +}
> +MDEV_TYPE_ATTR_RO(name);
> +
> +static ssize_t
> +description_show(struct kobject *kobj, struct device *dev, char *buf)
> +{
> +	const struct mdpy_type *type = mdpy_find_type(kobj);
> +
> +	return sprintf(buf, "virtual display, %dx%d framebuffer\n",
> +		       type ? type->width  : 0,
> +		       type ? type->height : 0);
> +}
> +MDEV_TYPE_ATTR_RO(description);
> +
> +static ssize_t
> +available_instances_show(struct kobject *kobj, struct device *dev, char *buf)
> +{
> +	return sprintf(buf, "%d\n", max_devices - mdpy_count);
> +}
> +MDEV_TYPE_ATTR_RO(available_instances);
> +
> +static ssize_t device_api_show(struct kobject *kobj, struct device *dev,
> +			       char *buf)
> +{
> +	return sprintf(buf, "%s\n", VFIO_DEVICE_API_PCI_STRING);
> +}
> +MDEV_TYPE_ATTR_RO(device_api);
> +
> +static struct attribute *mdev_types_attrs[] = {
> +	&mdev_type_attr_name.attr,
> +	&mdev_type_attr_description.attr,
> +	&mdev_type_attr_device_api.attr,
> +	&mdev_type_attr_available_instances.attr,
> +	NULL,
> +};
> +
> +static struct attribute_group mdev_type_group1 = {
> +	.name  = MDPY_TYPE_1,
> +	.attrs = mdev_types_attrs,
> +};
> +
> +static struct attribute_group mdev_type_group2 = {
> +	.name  = MDPY_TYPE_2,
> +	.attrs = mdev_types_attrs,
> +};
> +
> +static struct attribute_group mdev_type_group3 = {
> +	.name  = MDPY_TYPE_3,
> +	.attrs = mdev_types_attrs,
> +};
> +
> +static struct attribute_group *mdev_type_groups[] = {
> +	&mdev_type_group1,
> +	&mdev_type_group2,
> +	&mdev_type_group3,
> +	NULL,
> +};
> +
> +static const struct mdev_parent_ops mdev_fops = {
> +	.owner			= THIS_MODULE,
> +	.mdev_attr_groups	= mdev_dev_groups,
> +	.supported_type_groups	= mdev_type_groups,
> +	.create			= mdpy_create,
> +	.remove			= mdpy_remove,
> +	.open			= mdpy_open,
> +	.release		= mdpy_close,
> +	.read			= mdpy_read,
> +	.write			= mdpy_write,
> +	.ioctl			= mdpy_ioctl,
> +	.mmap			= mdpy_mmap,
> +};
> +
> +static const struct file_operations vd_fops = {
> +	.owner		= THIS_MODULE,
> +};
> +
> +static void mdpy_device_release(struct device *dev)
> +{
> +	/* nothing */
> +}
> +
> +static int __init mdpy_dev_init(void)
> +{
> +	int ret = 0;
> +
> +	ret = alloc_chrdev_region(&mdpy_devt, 0, MINORMASK, MDPY_NAME);
> +	if (ret < 0) {
> +		pr_err("Error: failed to register mdpy_dev, err: %d\n", ret);
> +		return ret;
> +	}
> +	cdev_init(&mdpy_cdev, &vd_fops);
> +	cdev_add(&mdpy_cdev, mdpy_devt, MINORMASK);
> +	pr_info("%s: major %d\n", __func__, MAJOR(mdpy_devt));
> +
> +	mdpy_class = class_create(THIS_MODULE, MDPY_CLASS_NAME);
> +	if (IS_ERR(mdpy_class)) {
> +		pr_err("Error: failed to register mdpy_dev class\n");
> +		ret = PTR_ERR(mdpy_class);
> +		goto failed1;
> +	}
> +	mdpy_dev.class = mdpy_class;
> +	mdpy_dev.release = mdpy_device_release;
> +	dev_set_name(&mdpy_dev, "%s", MDPY_NAME);
> +
> +	ret = device_register(&mdpy_dev);
> +	if (ret)
> +		goto failed2;
> +
> +	ret = mdev_register_device(&mdpy_dev, &mdev_fops);
> +	if (ret)
> +		goto failed3;
> +
> +	return 0;
> +
> +failed3:
> +	device_unregister(&mdpy_dev);
> +failed2:
> +	class_destroy(mdpy_class);
> +failed1:
> +	cdev_del(&mdpy_cdev);
> +	unregister_chrdev_region(mdpy_devt, MINORMASK);
> +	return ret;
> +}
> +
> +static void __exit mdpy_dev_exit(void)
> +{
> +	mdpy_dev.bus = NULL;
> +	mdev_unregister_device(&mdpy_dev);
> +
> +	device_unregister(&mdpy_dev);
> +	cdev_del(&mdpy_cdev);
> +	unregister_chrdev_region(mdpy_devt, MINORMASK);
> +	class_destroy(mdpy_class);
> +	mdpy_class = NULL;
> +}
> +
> +module_init(mdpy_dev_init)
> +module_exit(mdpy_dev_exit)
> diff --git a/samples/Kconfig b/samples/Kconfig
> index c332a3b9de..a0c104adda 100644
> --- a/samples/Kconfig
> +++ b/samples/Kconfig
> @@ -111,6 +111,14 @@ config SAMPLE_VFIO_MDEV_MTTY
>  	  Build a virtual tty sample driver for use as a VFIO
>  	  mediated device
>  
> +config SAMPLE_VFIO_MDEV_MDPY
> +	tristate "Build VFIO mdpy example mediated device sample code -- loadable modules only"
> +	depends on VFIO_MDEV_DEVICE && m
> +	help
> +	  Build a virtual display sample driver for use as a VFIO
> +	  mediated device.  It is a simple framebuffer and supports
> +	  the region display interface (VFIO_GFX_PLANE_TYPE_REGION).
> +
>  config SAMPLE_STATX
>  	bool "Build example extended-stat using code"
>  	depends on BROKEN
> diff --git a/samples/vfio-mdev/Makefile b/samples/vfio-mdev/Makefile
> index cbbd868a50..031d6b88e9 100644
> --- a/samples/vfio-mdev/Makefile
> +++ b/samples/vfio-mdev/Makefile
> @@ -1 +1,2 @@
>  obj-$(CONFIG_SAMPLE_VFIO_MDEV_MTTY) += mtty.o
> +obj-$(CONFIG_SAMPLE_VFIO_MDEV_MDPY) += mdpy.o

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 2/3] sample: vfio mdev display - guest driver
  2018-04-09 10:35 ` [PATCH 2/3] sample: vfio mdev display - guest driver Gerd Hoffmann
  2018-04-11 20:39   ` Bjorn Helgaas
@ 2018-04-24  2:51   ` Alex Williamson
  2018-04-25 21:03   ` Konrad Rzeszutek Wilk
  2 siblings, 0 replies; 41+ messages in thread
From: Alex Williamson @ 2018-04-24  2:51 UTC (permalink / raw)
  To: Gerd Hoffmann; +Cc: kvm, kwankhede, open list

On Mon,  9 Apr 2018 12:35:12 +0200
Gerd Hoffmann <kraxel@redhat.com> wrote:

> Guest fbdev driver for CONFIG_SAMPLE_VFIO_MDEV_MDPY.
> 
> Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
> ---
>  samples/vfio-mdev/mdpy-fb.c | 232 ++++++++++++++++++++++++++++++++++++++++++++
>  samples/Kconfig             |   9 ++
>  samples/vfio-mdev/Makefile  |   1 +
>  3 files changed, 242 insertions(+)
>  create mode 100644 samples/vfio-mdev/mdpy-fb.c

Looks good to me, just some trivial checkpatch issues below.
 
> diff --git a/samples/vfio-mdev/mdpy-fb.c b/samples/vfio-mdev/mdpy-fb.c
> new file mode 100644
> index 0000000000..0ebd8feead
> --- /dev/null
> +++ b/samples/vfio-mdev/mdpy-fb.c
> @@ -0,0 +1,232 @@
> +/*

SPDX license
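
i.e. start the file with an SPDX tag matching the license boilerplate
below (GPL v2 only), something like:

  // SPDX-License-Identifier: GPL-2.0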

> + * Framebuffer driver for mdpy (mediated virtual pci display device).
> + *
> + * See mdpy-defs.h for device specs
> + *
> + *   (c) Gerd Hoffmann <kraxel@redhat.com>
> + *
> + * Using some code snippets from simplefb and cirrusfb.
> + *
> + * This program is free software; you can redistribute it and/or modify it
> + * under the terms and conditions of the GNU General Public License,
> + * version 2, as published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope it will be useful, but WITHOUT
> + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
> + * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
> + * more details.
> + */
> +#include <linux/errno.h>
> +#include <linux/fb.h>
> +#include <linux/io.h>
> +#include <linux/pci.h>
> +#include <linux/module.h>
> +#include <drm/drm_fourcc.h>
> +#include "mdpy-defs.h"
> +
> +static const struct fb_fix_screeninfo mdpy_fb_fix = {
> +	.id		= "mdpy-fb",
> +	.type		= FB_TYPE_PACKED_PIXELS,
> +	.visual		= FB_VISUAL_TRUECOLOR,
> +	.accel		= FB_ACCEL_NONE,
> +};
> +
> +static const struct fb_var_screeninfo mdpy_fb_var = {
> +	.height		= -1,
> +	.width		= -1,
> +	.activate	= FB_ACTIVATE_NOW,
> +	.vmode		= FB_VMODE_NONINTERLACED,
> +
> +	.bits_per_pixel = 32,
> +	.transp.offset	= 24,
> +	.red.offset	= 16,
> +	.green.offset	= 8,
> +	.blue.offset	= 0,
> +	.transp.length	= 8,
> +	.red.length	= 8,
> +	.green.length	= 8,
> +	.blue.length	= 8,
> +};
> +
> +#define PSEUDO_PALETTE_SIZE 16
> +
> +struct mdpy_fb_par {
> +	u32 palette[PSEUDO_PALETTE_SIZE];
> +};
> +
> +static int mdpy_fb_setcolreg(u_int regno, u_int red, u_int green, u_int blue,
> +			      u_int transp, struct fb_info *info)
> +{
> +	u32 *pal = info->pseudo_palette;
> +	u32 cr = red >> (16 - info->var.red.length);
> +	u32 cg = green >> (16 - info->var.green.length);
> +	u32 cb = blue >> (16 - info->var.blue.length);
> +	u32 value;
> +
> +	if (regno >= PSEUDO_PALETTE_SIZE)
> +		return -EINVAL;
> +
> +	value = (cr << info->var.red.offset) |
> +		(cg << info->var.green.offset) |
> +		(cb << info->var.blue.offset);
> +	if (info->var.transp.length > 0) {
> +		u32 mask = (1 << info->var.transp.length) - 1;

Technically wants a new line here, I'm torn whether I'd ignore that
warning.
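
That is, with a blank line after the declaration:

		u32 mask = (1 << info->var.transp.length) - 1;

		mask <<= info->var.transp.offset;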

> +		mask <<= info->var.transp.offset;
> +		value |= mask;
> +	}
> +	pal[regno] = value;
> +
> +	return 0;
> +}
> +
> +static void mdpy_fb_destroy(struct fb_info *info)
> +{
> +	if (info->screen_base)
> +		iounmap(info->screen_base);
> +}
> +
> +static struct fb_ops mdpy_fb_ops = {
> +	.owner		= THIS_MODULE,
> +	.fb_destroy	= mdpy_fb_destroy,
> +	.fb_setcolreg	= mdpy_fb_setcolreg,
> +	.fb_fillrect	= cfb_fillrect,
> +	.fb_copyarea	= cfb_copyarea,
> +	.fb_imageblit	= cfb_imageblit,
> +};
> +
> +static int mdpy_fb_probe(struct pci_dev *pdev,
> +			 const struct pci_device_id *ent)
> +{
> +	struct fb_info *info;
> +	struct mdpy_fb_par *par;
> +	u32 format, width, height;
> +	int ret;
> +
> +	ret = pci_enable_device(pdev);
> +	if (ret < 0)
> +		return ret;
> +
> +	ret = pci_request_regions(pdev, "mdpy-fb");
> +	if (ret < 0)
> +		return ret;
> +
> +	pci_read_config_dword(pdev, MDPY_FORMAT_OFFSET, &format);
> +	pci_read_config_dword(pdev, MDPY_WIDTH_OFFSET,	&width);
> +	pci_read_config_dword(pdev, MDPY_HEIGHT_OFFSET, &height);
> +	if (format != DRM_FORMAT_XRGB8888) {
> +		dev_err(&pdev->dev, "format mismatch (0x%x != 0x%x)\n",
> +			format, DRM_FORMAT_XRGB8888);
> +		return -EINVAL;
> +	}
> +	if (width < 100	 || width > 10000) {
> +		dev_err(&pdev->dev, "width (%d) out of range\n", width);
> +		return -EINVAL;
> +	}
> +	if (height < 100 || height > 10000) {
> +		dev_err(&pdev->dev, "height (%d) out of range\n", height);
> +		return -EINVAL;
> +	}
> +	dev_info(&pdev->dev, "mdpy found: %dx%d framebuffer\n",
> +		 width, height);
> +
> +	info = framebuffer_alloc(sizeof(struct mdpy_fb_par), &pdev->dev);
> +	if (!info) {
> +		ret = -ENOMEM;
> +		goto err_release_regions;
> +	}
> +	pci_set_drvdata(pdev, info);
> +	par = info->par;
> +
> +	info->fix = mdpy_fb_fix;
> +	info->fix.smem_start = pci_resource_start(pdev, 0);
> +	info->fix.smem_len = pci_resource_len(pdev, 0);
> +	info->fix.line_length = width * 4;
> +
> +	info->var = mdpy_fb_var;
> +	info->var.xres = width;
> +	info->var.yres = height;
> +	info->var.xres_virtual = width;
> +	info->var.yres_virtual = height;
> +
> +	info->screen_size = info->fix.smem_len;
> +	info->screen_base = ioremap(info->fix.smem_start,
> +				    info->screen_size);
> +	if (!info->screen_base) {
> +		dev_err(&pdev->dev, "ioremap(pcibar) failed\n");
> +		ret = -EIO;
> +		goto err_release_fb;
> +	}
> +
> +	info->apertures = alloc_apertures(1);
> +	if (!info->apertures) {
> +		ret = -ENOMEM;
> +		goto err_unmap;
> +	}
> +	info->apertures->ranges[0].base = info->fix.smem_start;
> +	info->apertures->ranges[0].size = info->fix.smem_len;
> +
> +	info->fbops = &mdpy_fb_ops;
> +	info->flags = FBINFO_DEFAULT;
> +	info->pseudo_palette = par->palette;
> +
> +	ret = register_framebuffer(info);
> +	if (ret < 0) {
> +		dev_err(&pdev->dev,
> +			"mdpy-fb device register failed: %d\n", ret);
> +		goto err_unmap;
> +	}
> +
> +	dev_info(&pdev->dev, "fb%d registered\n", info->node);
> +	return 0;
> +
> +err_unmap:
> +	iounmap(info->screen_base);
> +
> +err_release_fb:
> +	framebuffer_release(info);
> +
> +err_release_regions:
> +	pci_release_regions(pdev);
> +
> +	return ret;
> +}
> +
> +static void mdpy_fb_remove(struct pci_dev *pdev)
> +{
> +	struct fb_info *info = pci_get_drvdata(pdev);
> +
> +	unregister_framebuffer(info);
> +	framebuffer_release(info);
> +}
> +
> +static struct pci_device_id mdpy_fb_pci_table[] = {
> +	{
> +		.vendor	   = MDPY_PCI_VENDOR_ID,
> +		.device	   = MDPY_PCI_DEVICE_ID,
> +		.subvendor = MDPY_PCI_SUBVENDOR_ID,
> +		.subdevice = MDPY_PCI_SUBDEVICE_ID,
> +	},{

Space after comma, as in the first patch this is the only error from
checkpatch.  Thanks,

Alex

> +		/* end of list */
> +	}
> +};
> +
> +static struct pci_driver mdpy_fb_pci_driver = {
> +	.name		= "mdpy-fb",
> +	.id_table	= mdpy_fb_pci_table,
> +	.probe		= mdpy_fb_probe,
> +	.remove		= mdpy_fb_remove,
> +};
> +
> +static int __init mdpy_fb_init(void)
> +{
> +	int ret;
> +
> +	ret = pci_register_driver(&mdpy_fb_pci_driver);
> +	if (ret)
> +		return ret;
> +
> +	return 0;
> +}
> +
> +module_init(mdpy_fb_init);
> +
> +MODULE_DEVICE_TABLE(pci, mdpy_fb_pci_table);
> +MODULE_LICENSE("GPL v2");
> diff --git a/samples/Kconfig b/samples/Kconfig
> index a0c104adda..755430c788 100644
> --- a/samples/Kconfig
> +++ b/samples/Kconfig
> @@ -119,6 +119,15 @@ config SAMPLE_VFIO_MDEV_MDPY
>  	  mediated device.  It is a simple framebuffer and supports
>  	  the region display interface (VFIO_GFX_PLANE_TYPE_REGION).
>  
> +config SAMPLE_VFIO_MDEV_MDPY_FB
> +	tristate "Build VFIO mdpy example guest fbdev driver -- loadable module only"
> +	depends on FB && m
> +	select FB_CFB_FILLRECT
> +	select FB_CFB_COPYAREA
> +	select FB_CFB_IMAGEBLIT
> +	help
> +	  Guest fbdev driver for the virtual display sample driver.
> +
>  config SAMPLE_STATX
>  	bool "Build example extended-stat using code"
>  	depends on BROKEN
> diff --git a/samples/vfio-mdev/Makefile b/samples/vfio-mdev/Makefile
> index 031d6b88e9..7a5790aaec 100644
> --- a/samples/vfio-mdev/Makefile
> +++ b/samples/vfio-mdev/Makefile
> @@ -1,2 +1,3 @@
>  obj-$(CONFIG_SAMPLE_VFIO_MDEV_MTTY) += mtty.o
>  obj-$(CONFIG_SAMPLE_VFIO_MDEV_MDPY) += mdpy.o
> +obj-$(CONFIG_SAMPLE_VFIO_MDEV_MDPY_FB) += mdpy-fb.o

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 3/3] sample: vfio bochs vbe display (host device for bochs-drm)
  2018-04-09 10:35 ` [PATCH 3/3] sample: vfio bochs vbe display (host device for bochs-drm) Gerd Hoffmann
@ 2018-04-24  3:05   ` Alex Williamson
  0 siblings, 0 replies; 41+ messages in thread
From: Alex Williamson @ 2018-04-24  3:05 UTC (permalink / raw)
  To: Gerd Hoffmann; +Cc: kvm, kwankhede, open list

On Mon,  9 Apr 2018 12:35:13 +0200
Gerd Hoffmann <kraxel@redhat.com> wrote:

> Display device, demo-ing the vfio dmabuf display interface
> (VFIO_GFX_PLANE_TYPE_DMABUF).  Compatible enough to qemu stdvga
> that bochs-drm.ko can be used as guest driver.
> 
> Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
> ---
>  samples/vfio-mdev/mbochs.c | 1379 ++++++++++++++++++++++++++++++++++++++++++++
>  samples/Kconfig            |   13 +
>  samples/vfio-mdev/Makefile |    1 +
>  3 files changed, 1393 insertions(+)
>  create mode 100644 samples/vfio-mdev/mbochs.c

A bit more checkpatch error heavy on this one, I won't point out each
one, but please try to resolve them where appropriate.
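
FWIW, the full list for a single file is easy to get with:

  ./scripts/checkpatch.pl -f samples/vfio-mdev/mbochs.c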

> diff --git a/samples/vfio-mdev/mbochs.c b/samples/vfio-mdev/mbochs.c
> new file mode 100644
> index 0000000000..fc91523190
> --- /dev/null
> +++ b/samples/vfio-mdev/mbochs.c
> @@ -0,0 +1,1379 @@
> +/*
> + * Mediated virtual PCI display host device driver
> + *
> + * Emulate enough of qemu stdvga to make bochs-drm.ko happy.  That is
> + * basically the vram memory bar and the bochs dispi interface vbe
> + * registers in the mmio register bar.	Specifically it does *not*
> + * include any legacy vga stuff.  Device looks a lot like "qemu -device
> + * secondary-vga".
> + *
> + *   (c) Gerd Hoffmann <kraxel@redhat.com>
> + *
> + * based on mtty driver which is:
> + *   Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
> + *	 Author: Neo Jia <cjia@nvidia.com>
> + *		 Kirti Wankhede <kwankhede@nvidia.com>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + */
> +#include <linux/init.h>
> +#include <linux/module.h>
> +#include <linux/device.h>
> +#include <linux/kernel.h>
> +#include <linux/slab.h>
> +#include <linux/vmalloc.h>
> +#include <linux/cdev.h>
> +#include <linux/vfio.h>
> +#include <linux/iommu.h>
> +#include <linux/sysfs.h>
> +#include <linux/mdev.h>
> +#include <linux/pci.h>
> +#include <linux/dma-buf.h>
> +#include <linux/highmem.h>
> +#include <drm/drm_fourcc.h>
> +#include <drm/drm_rect.h>
> +#include <drm/drm_modeset_lock.h>
> +#include <drm/drm_plane.h>
> +
> +
> +#define VBE_DISPI_INDEX_ID		0x0
> +#define VBE_DISPI_INDEX_XRES		0x1
> +#define VBE_DISPI_INDEX_YRES		0x2
> +#define VBE_DISPI_INDEX_BPP		0x3
> +#define VBE_DISPI_INDEX_ENABLE		0x4
> +#define VBE_DISPI_INDEX_BANK		0x5
> +#define VBE_DISPI_INDEX_VIRT_WIDTH	0x6
> +#define VBE_DISPI_INDEX_VIRT_HEIGHT	0x7
> +#define VBE_DISPI_INDEX_X_OFFSET	0x8
> +#define VBE_DISPI_INDEX_Y_OFFSET	0x9
> +#define VBE_DISPI_INDEX_VIDEO_MEMORY_64K 0xa
> +#define VBE_DISPI_INDEX_COUNT		0xb
> +
> +#define VBE_DISPI_ID0			0xB0C0
> +#define VBE_DISPI_ID1			0xB0C1
> +#define VBE_DISPI_ID2			0xB0C2
> +#define VBE_DISPI_ID3			0xB0C3
> +#define VBE_DISPI_ID4			0xB0C4
> +#define VBE_DISPI_ID5			0xB0C5
> +
> +#define VBE_DISPI_DISABLED		0x00
> +#define VBE_DISPI_ENABLED		0x01
> +#define VBE_DISPI_GETCAPS		0x02
> +#define VBE_DISPI_8BIT_DAC		0x20
> +#define VBE_DISPI_LFB_ENABLED		0x40
> +#define VBE_DISPI_NOCLEARMEM		0x80
> +
> +
> +#define MBOCHS_NAME		  "mbochs"
> +#define MBOCHS_CLASS_NAME	  "mbochs"
> +
> +#define MBOCHS_CONFIG_SPACE_SIZE  0xff
> +#define MBOCHS_MMIO_BAR_OFFSET	  PAGE_SIZE

I like that you're using region offsets other than the algorithm
vfio-pci uses, helps to weed out users that assume a region number to
offset mapping rather than consuming the offset field of region info.
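
For reference, a well-behaved userspace does something along these
lines (sketch only, error handling omitted; device_fd is the vfio
device file descriptor):

	struct vfio_region_info info = {
		.argsz = sizeof(info),
		.index = VFIO_PCI_BAR0_REGION_INDEX,
	};

	ioctl(device_fd, VFIO_DEVICE_GET_REGION_INFO, &info);
	/* use info.offset as reported, never compute it from the index */
	mmap(NULL, info.size, PROT_READ | PROT_WRITE, MAP_SHARED,
	     device_fd, (off_t)info.offset);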

> +#define MBOCHS_MMIO_BAR_SIZE	  PAGE_SIZE
> +#define MBOCHS_MEMORY_BAR_OFFSET  (MBOCHS_MMIO_BAR_OFFSET + MBOCHS_MMIO_BAR_SIZE)
> +
> +#define STORE_LE16(addr, val)	(*(u16 *)addr = val)
> +#define STORE_LE32(addr, val)	(*(u32 *)addr = val)
> +
> +
> +MODULE_LICENSE("GPL v2");
> +
> +static int max_mbytes = 256;
> +module_param_named(count, max_mbytes, int, 0444);
> +MODULE_PARM_DESC(mem, "megabytes available to " MBOCHS_NAME " devices");
> +
> +
> +#define MBOCHS_TYPE_1 "small"
> +#define MBOCHS_TYPE_2 "medium"
> +#define MBOCHS_TYPE_3 "large"
> +
> +static const struct mbochs_type {
> +	const char *name;
> +	u32 mbytes;
> +} mbochs_types[] = {
> +	{
> +		.name	= MBOCHS_CLASS_NAME "-" MBOCHS_TYPE_1,
> +		.mbytes = 4,
> +	},{
> +		.name	= MBOCHS_CLASS_NAME "-" MBOCHS_TYPE_2,
> +		.mbytes = 16,
> +	},{
> +		.name	= MBOCHS_CLASS_NAME "-" MBOCHS_TYPE_3,
> +		.mbytes = 64,
> +	},
> +};
> +
> +
> +static dev_t		mbochs_devt;
> +static struct class	*mbochs_class;
> +static struct cdev	mbochs_cdev;
> +static struct device	mbochs_dev;
> +static int		mbochs_used_mbytes;
> +
> +struct mbochs_mode {
> +	u32 drm_format;
> +	u32 bytepp;
> +	u32 width;
> +	u32 height;
> +	u32 stride;
> +	u32 __pad;
> +	u64 offset;
> +	u64 size;
> +};
> +
> +struct mbochs_dmabuf {
> +	struct mbochs_mode mode;
> +	u32 id;
> +	struct page **pages;
> +	pgoff_t pagecount;
> +	struct dma_buf *buf;
> +	struct mdev_state *mdev_state;
> +	struct list_head next;
> +	bool unlinked;
> +};
> +
> +/* State of each mdev device */
> +struct mdev_state {
> +	u8 *vconfig;
> +	u64 bar_mask[3];
> +	u32 memory_bar_mask;
> +	struct mutex ops_lock;
> +	struct mdev_device *mdev;
> +	struct vfio_device_info dev_info;
> +
> +	const struct mbochs_type *type;
> +	u16 vbe[VBE_DISPI_INDEX_COUNT];
> +	u64 memsize;
> +	struct page **pages;
> +	pgoff_t pagecount;
> +
> +	struct list_head dmabufs;
> +	u32 active_id;
> +	u32 next_id;
> +};
> +
> +static const char *vbe_name_list[VBE_DISPI_INDEX_COUNT] = {
> +	[ VBE_DISPI_INDEX_ID		   ] = "id",
> +	[ VBE_DISPI_INDEX_XRES		   ] = "xres",
> +	[ VBE_DISPI_INDEX_YRES		   ] = "yres",
> +	[ VBE_DISPI_INDEX_BPP		   ] = "bpp",
> +	[ VBE_DISPI_INDEX_ENABLE	   ] = "enable",
> +	[ VBE_DISPI_INDEX_BANK		   ] = "bank",
> +	[ VBE_DISPI_INDEX_VIRT_WIDTH	   ] = "virt-width",
> +	[ VBE_DISPI_INDEX_VIRT_HEIGHT	   ] = "virt-height",
> +	[ VBE_DISPI_INDEX_X_OFFSET	   ] = "x-offset",
> +	[ VBE_DISPI_INDEX_Y_OFFSET	   ] = "y-offset",
> +	[ VBE_DISPI_INDEX_VIDEO_MEMORY_64K ] = "video-mem",
> +};
> +
> +static const char *vbe_name(u32 index)
> +{
> +	if (index < ARRAY_SIZE(vbe_name_list))
> +		return vbe_name_list[index];
> +	return "(invalid)";
> +}
> +
> +static const struct mbochs_type *mbochs_find_type(struct kobject *kobj)
> +{
> +	int i;
> +
> +	for (i = 0; i < ARRAY_SIZE(mbochs_types); i++)
> +		if (strcmp(mbochs_types[i].name, kobj->name) == 0)
> +			return mbochs_types + i;
> +	return NULL;
> +}
> +
> +static void mbochs_create_config_space(struct mdev_state *mdev_state)
> +{
> +	STORE_LE16((u16 *) &mdev_state->vconfig[PCI_VENDOR_ID],
> +		   0x1234);
> +	STORE_LE16((u16 *) &mdev_state->vconfig[PCI_DEVICE_ID],
> +		   0x1111);
> +	STORE_LE16((u16 *) &mdev_state->vconfig[PCI_SUBSYSTEM_VENDOR_ID],
> +		   PCI_SUBVENDOR_ID_REDHAT_QUMRANET);
> +	STORE_LE16((u16 *) &mdev_state->vconfig[PCI_SUBSYSTEM_ID],
> +		   PCI_SUBDEVICE_ID_QEMU);
> +
> +	STORE_LE16((u16 *) &mdev_state->vconfig[PCI_COMMAND],
> +		   PCI_COMMAND_IO | PCI_COMMAND_MEMORY);
> +	STORE_LE16((u16 *) &mdev_state->vconfig[PCI_CLASS_DEVICE],
> +		   PCI_CLASS_DISPLAY_OTHER);
> +	mdev_state->vconfig[PCI_CLASS_REVISION] =  0x01;
> +
> +	STORE_LE32((u32 *) &mdev_state->vconfig[PCI_BASE_ADDRESS_0],
> +		   PCI_BASE_ADDRESS_SPACE_MEMORY |
> +		   PCI_BASE_ADDRESS_MEM_TYPE_32	 |
> +		   PCI_BASE_ADDRESS_MEM_PREFETCH);
> +	mdev_state->bar_mask[0] = ~(mdev_state->memsize) + 1;
> +
> +	STORE_LE32((u32 *) &mdev_state->vconfig[PCI_BASE_ADDRESS_2],
> +		   PCI_BASE_ADDRESS_SPACE_MEMORY |
> +		   PCI_BASE_ADDRESS_MEM_TYPE_32);
> +	mdev_state->bar_mask[2] = ~(MBOCHS_MMIO_BAR_SIZE) + 1;
> +}
> +
> +static int mbochs_check_framebuffer(struct mdev_state *mdev_state,
> +				    struct mbochs_mode *mode)
> +{
> +	struct device *dev = mdev_dev(mdev_state->mdev);
> +	u16 *vbe = mdev_state->vbe;
> +	u32 virt_width;
> +
> +	WARN_ON(!mutex_is_locked(&mdev_state->ops_lock));
> +
> +	if (!(vbe[VBE_DISPI_INDEX_ENABLE] & VBE_DISPI_ENABLED))
> +		goto nofb;
> +
> +	memset(mode, 0, sizeof(*mode));
> +	switch (vbe[VBE_DISPI_INDEX_BPP]) {
> +	case 32:
> +		mode->drm_format = DRM_FORMAT_XRGB8888;
> +		mode->bytepp = 4;
> +		break;
> +	default:
> +		dev_info_ratelimited(dev, "%s: bpp %d not supported\n", __func__,
> +				     vbe[VBE_DISPI_INDEX_BPP]);
> +		goto nofb;
> +	}
> +
> +	mode->width  = vbe[VBE_DISPI_INDEX_XRES];
> +	mode->height = vbe[VBE_DISPI_INDEX_YRES];
> +	virt_width  = vbe[VBE_DISPI_INDEX_VIRT_WIDTH];
> +	if (virt_width < mode->width)
> +		virt_width = mode->width;
> +	mode->stride = virt_width * mode->bytepp;
> +	mode->size   = (u64)mode->stride * mode->height;
> +	mode->offset = ((u64)vbe[VBE_DISPI_INDEX_X_OFFSET] * mode->bytepp +
> +		       (u64)vbe[VBE_DISPI_INDEX_Y_OFFSET] * mode->stride);
> +
> +	if (mode->width < 64 || mode->height < 64) {
> +		dev_info_ratelimited(dev, "%s: invalid resolution %dx%d\n",
> +				     __func__, mode->width, mode->height);
> +		goto nofb;
> +	}
> +	if (mode->offset + mode->size > mdev_state->memsize) {
> +		dev_info_ratelimited(dev, "%s: framebuffer memory overflow\n",
> +				     __func__);
> +		goto nofb;
> +	}
> +
> +	return 0;
> +
> +nofb:
> +	memset(mode, 0, sizeof(*mode));
> +	return -EINVAL;
> +}
> +
> +static bool mbochs_modes_equal(struct mbochs_mode *mode1,
> +			       struct mbochs_mode *mode2)
> +{
> +	return memcmp(mode1, mode2, sizeof(struct mbochs_mode)) == 0;
> +}
> +
> +static void handle_pci_cfg_write(struct mdev_state *mdev_state, u16 offset,
> +				 char *buf, u32 count)
> +{
> +	struct device *dev = mdev_dev(mdev_state->mdev);
> +	int index = (offset - PCI_BASE_ADDRESS_0) / 0x04;
> +	u32 cfg_addr;
> +
> +	switch (offset) {
> +	case PCI_BASE_ADDRESS_0:
> +	case PCI_BASE_ADDRESS_2:
> +		cfg_addr = *(u32 *)buf;
> +
> +		if (cfg_addr == 0xffffffff) {
> +			cfg_addr = (cfg_addr & mdev_state->bar_mask[index]);
> +		} else {
> +			cfg_addr &= PCI_BASE_ADDRESS_MEM_MASK;
> +			if (cfg_addr)
> +				dev_info(dev, "BAR #%d @ 0x%x\n",
> +					 index, cfg_addr);
> +		}
> +
> +		cfg_addr |= (mdev_state->vconfig[offset] &
> +			     ~PCI_BASE_ADDRESS_MEM_MASK);
> +		STORE_LE32(&mdev_state->vconfig[offset], cfg_addr);
> +		break;
> +	}
> +}
> +
> +static void handle_mmio_write(struct mdev_state *mdev_state, u16 offset,
> +			      char *buf, u32 count)
> +{
> +	struct device *dev = mdev_dev(mdev_state->mdev);
> +	int index;
> +	u16 reg16;
> +
> +	switch (offset) {
> +	case 0x400 ... 0x41f: /* vga ioports remapped */
> +		goto unhandled;
> +	case 0x500 ... 0x515: /* bochs dispi interface */
> +		if (count != 2)
> +			goto unhandled;
> +		index = (offset - 0x500) / 2;
> +		reg16 = *(u16*)buf;
> +		if (index < ARRAY_SIZE(mdev_state->vbe))
> +			mdev_state->vbe[index] = reg16;
> +		dev_dbg(dev, "%s: vbe write %d = %d (%s)\n",
> +			__func__, index, reg16, vbe_name(index));
> +		break;
> +	case 0x600 ... 0x607: /* qemu extended regs */
> +		goto unhandled;
> +	default:
> +	unhandled:
> +		dev_dbg(dev, "%s: @0x%03x, count %d (unhandled)\n",
> +			__func__, offset, count);
> +		break;
> +	}
> +}
> +
> +static void handle_mmio_read(struct mdev_state *mdev_state, u16 offset,
> +			     char *buf, u32 count)
> +{
> +	struct device *dev = mdev_dev(mdev_state->mdev);
> +	u16 reg16 = 0;
> +	int index;
> +
> +	switch (offset) {
> +	case 0x500 ... 0x515: /* bochs dispi interface */
> +		if (count != 2)
> +			goto unhandled;
> +		index = (offset - 0x500) / 2;
> +		if (index < ARRAY_SIZE(mdev_state->vbe))
> +			reg16 = mdev_state->vbe[index];
> +		dev_dbg(dev, "%s: vbe read %d = %d (%s)\n",
> +			__func__, index, reg16, vbe_name(index));
> +		*(u16*)buf = reg16;
> +		break;
> +	default:
> +	unhandled:
> +		dev_dbg(dev, "%s: @0x%03x, count %d (unhandled)\n",
> +			__func__, offset, count);
> +		memset(buf, 0, count);
> +		break;
> +	}
> +}
> +
> +static ssize_t mdev_access(struct mdev_device *mdev, char *buf, size_t count,
> +			   loff_t pos, bool is_write)
> +{
> +	struct mdev_state *mdev_state = mdev_get_drvdata(mdev);
> +	struct device *dev = mdev_dev(mdev);
> +	int ret = 0;
> +
> +	mutex_lock(&mdev_state->ops_lock);
> +
> +	if (pos < MBOCHS_CONFIG_SPACE_SIZE) {
> +		if (is_write) {
> +			handle_pci_cfg_write(mdev_state, pos, buf, count);
> +		} else {
> +			memcpy(buf, (mdev_state->vconfig + pos), count);
> +		}
> +
> +	} else if (pos >= MBOCHS_MMIO_BAR_OFFSET &&
> +		   pos + count <= MBOCHS_MEMORY_BAR_OFFSET) {
> +		pos -= MBOCHS_MMIO_BAR_OFFSET;
> +		if (is_write) {
> +			handle_mmio_write(mdev_state, pos, buf, count);
> +		} else {
> +			handle_mmio_read(mdev_state, pos, buf, count);
> +		}
> +
> +	} else {
> +		dev_dbg(dev, "%s: %s @0x%llx (unhandled)\n",
> +			__func__, is_write ? "WR" : "RD", pos);
> +		ret = -1;
> +		goto accessfailed;
> +	}
> +
> +	ret = count;
> +
> +
> +accessfailed:
> +	mutex_unlock(&mdev_state->ops_lock);
> +
> +	return ret;
> +}
> +
> +int mbochs_reset(struct mdev_device *mdev)
> +{
> +	struct mdev_state *mdev_state = mdev_get_drvdata(mdev);
> +	u32 size64k = mdev_state->memsize / (64 * 1024);
> +	int i;
> +
> +	for (i = 0; i < ARRAY_SIZE(mdev_state->vbe); i++)
> +		mdev_state->vbe[i] = 0;
> +	mdev_state->vbe[VBE_DISPI_INDEX_ID] = VBE_DISPI_ID5;
> +	mdev_state->vbe[VBE_DISPI_INDEX_VIDEO_MEMORY_64K] = size64k;
> +	return 0;
> +}
> +
> +int mbochs_create(struct kobject *kobj, struct mdev_device *mdev)
> +{
> +	const struct mbochs_type *type = mbochs_find_type(kobj);
> +	struct device *dev = mdev_dev(mdev);
> +	struct mdev_state *mdev_state;
> +
> +	if (!type)
> +		type = &mbochs_types[0];
> +	if (type->mbytes + mbochs_used_mbytes > max_mbytes)
> +		return -ENOMEM;
> +
> +	mdev_state = kzalloc(sizeof(struct mdev_state), GFP_KERNEL);
> +	if (mdev_state == NULL)
> +		return -ENOMEM;
> +
> +	mdev_state->vconfig = kzalloc(MBOCHS_CONFIG_SPACE_SIZE, GFP_KERNEL);
> +	if (mdev_state->vconfig == NULL)
> +		goto err_mem;
> +
> +	mdev_state->memsize = type->mbytes * 1024 * 1024;
> +	mdev_state->pagecount = mdev_state->memsize >> PAGE_SHIFT;
> +	mdev_state->pages = kzalloc(mdev_state->pagecount * sizeof(struct page*),
> +				    GFP_KERNEL);
> +	if (!mdev_state->pages)
> +		goto err_mem;
> +
> +	dev_info(dev, "%s: %s, %d MB, %ld pages\n", __func__,
> +		 kobj->name, type->mbytes, mdev_state->pagecount);
> +
> +	mutex_init(&mdev_state->ops_lock);
> +	mdev_state->mdev = mdev;
> +	mdev_set_drvdata(mdev, mdev_state);
> +	INIT_LIST_HEAD(&mdev_state->dmabufs);
> +	mdev_state->next_id = 1;
> +
> +	mdev_state->type = type;
> +	mbochs_create_config_space(mdev_state);
> +	mbochs_reset(mdev);
> +
> +	mbochs_used_mbytes += type->mbytes;
> +	return 0;
> +
> +err_mem:
> +	kfree(mdev_state->vconfig);
> +	kfree(mdev_state);
> +	return -ENOMEM;
> +}
> +
> +int mbochs_remove(struct mdev_device *mdev)
> +{
> +	struct mdev_state *mdev_state = mdev_get_drvdata(mdev);
> +
> +	mbochs_used_mbytes -= mdev_state->type->mbytes;
> +	mdev_set_drvdata(mdev, NULL);
> +	kfree(mdev_state->pages);
> +	kfree(mdev_state->vconfig);
> +	kfree(mdev_state);
> +	return 0;
> +}
> +
> +ssize_t mbochs_read(struct mdev_device *mdev, char __user *buf,
> +		    size_t count, loff_t *ppos)
> +{
> +	unsigned int done = 0;
> +	int ret;
> +
> +	while (count) {
> +		size_t filled;
> +
> +		if (count >= 4 && !(*ppos % 4)) {
> +			u32 val;
> +
> +			ret =  mdev_access(mdev, (char *)&val, sizeof(val),
> +					   *ppos, false);
> +			if (ret <= 0)
> +				goto read_err;
> +
> +			if (copy_to_user(buf, &val, sizeof(val)))
> +				goto read_err;
> +
> +			filled = 4;
> +		} else if (count >= 2 && !(*ppos % 2)) {
> +			u16 val;
> +
> +			ret = mdev_access(mdev, (char *)&val, sizeof(val),
> +					  *ppos, false);
> +			if (ret <= 0)
> +				goto read_err;
> +
> +			if (copy_to_user(buf, &val, sizeof(val)))
> +				goto read_err;
> +
> +			filled = 2;
> +		} else {
> +			u8 val;
> +
> +			ret = mdev_access(mdev, (char *)&val, sizeof(val),
> +					  *ppos, false);
> +			if (ret <= 0)
> +				goto read_err;
> +
> +			if (copy_to_user(buf, &val, sizeof(val)))
> +				goto read_err;
> +
> +			filled = 1;
> +		}
> +
> +		count -= filled;
> +		done += filled;
> +		*ppos += filled;
> +		buf += filled;
> +	}
> +
> +	return done;
> +
> +read_err:
> +	return -EFAULT;
> +}
> +
> +ssize_t mbochs_write(struct mdev_device *mdev, const char __user *buf,
> +		     size_t count, loff_t *ppos)
> +{
> +	unsigned int done = 0;
> +	int ret;
> +
> +	while (count) {
> +		size_t filled;
> +
> +		if (count >= 4 && !(*ppos % 4)) {
> +			u32 val;
> +
> +			if (copy_from_user(&val, buf, sizeof(val)))
> +				goto write_err;
> +
> +			ret = mdev_access(mdev, (char *)&val, sizeof(val),
> +					  *ppos, true);
> +			if (ret <= 0)
> +				goto write_err;
> +
> +			filled = 4;
> +		} else if (count >= 2 && !(*ppos % 2)) {
> +			u16 val;
> +
> +			if (copy_from_user(&val, buf, sizeof(val)))
> +				goto write_err;
> +
> +			ret = mdev_access(mdev, (char *)&val, sizeof(val),
> +					  *ppos, true);
> +			if (ret <= 0)
> +				goto write_err;
> +
> +			filled = 2;
> +		} else {
> +			u8 val;
> +
> +			if (copy_from_user(&val, buf, sizeof(val)))
> +				goto write_err;
> +
> +			ret = mdev_access(mdev, (char *)&val, sizeof(val),
> +					  *ppos, true);
> +			if (ret <= 0)
> +				goto write_err;
> +
> +			filled = 1;
> +		}
> +		count -= filled;
> +		done += filled;
> +		*ppos += filled;
> +		buf += filled;
> +	}
> +
> +	return done;
> +write_err:
> +	return -EFAULT;
> +}
> +
> +struct page *__mbochs_get_page(struct mdev_state *mdev_state, pgoff_t pgoff)
> +{
> +	WARN_ON(!mutex_is_locked(&mdev_state->ops_lock));
> +
> +	if (!mdev_state->pages[pgoff]) {
> +		mdev_state->pages[pgoff] =
> +			alloc_pages(GFP_HIGHUSER | __GFP_ZERO, 0);
> +		if (!mdev_state->pages[pgoff])
> +			return NULL;
> +	}
> +
> +	get_page(mdev_state->pages[pgoff]);
> +	return mdev_state->pages[pgoff];
> +}
> +
> +struct page *mbochs_get_page(struct mdev_state *mdev_state, pgoff_t pgoff)
> +{
> +	struct page *page;
> +
> +	if (WARN_ON(pgoff >= mdev_state->pagecount))
> +		return NULL;
> +
> +	mutex_lock(&mdev_state->ops_lock);
> +	page = __mbochs_get_page(mdev_state, pgoff);
> +	mutex_unlock(&mdev_state->ops_lock);
> +
> +	return page;
> +}
> +
> +void mbochs_put_pages(struct mdev_state *mdev_state)
> +{
> +	struct device *dev = mdev_dev(mdev_state->mdev);
> +	int i, count = 0;
> +
> +	WARN_ON(!mutex_is_locked(&mdev_state->ops_lock));
> +
> +	for (i = 0; i < mdev_state->pagecount; i++) {
> +		if (!mdev_state->pages[i])
> +			continue;
> +		put_page(mdev_state->pages[i]);
> +		mdev_state->pages[i] = NULL;
> +		count++;
> +	}
> +	dev_dbg(dev, "%s: %d pages released\n", __func__, count);
> +}
> +
> +static int mbochs_region_vm_fault(struct vm_fault *vmf)
> +{
> +	struct vm_area_struct *vma = vmf->vma;
> +	struct mdev_state *mdev_state = vma->vm_private_data;
> +	pgoff_t page_offset = (vmf->address - vma->vm_start) >> PAGE_SHIFT;
> +
> +	if (page_offset >= mdev_state->pagecount)
> +		return VM_FAULT_SIGBUS;
> +
> +	vmf->page = mbochs_get_page(mdev_state, page_offset);
> +	if (!vmf->page)
> +		return VM_FAULT_SIGBUS;
> +
> +	return 0;
> +}
> +
> +static const struct vm_operations_struct mbochs_region_vm_ops = {
> +	.fault = mbochs_region_vm_fault,
> +};
> +
> +int mbochs_mmap(struct mdev_device *mdev, struct vm_area_struct *vma)
> +{
> +	struct mdev_state *mdev_state = mdev_get_drvdata(mdev);
> +
> +	if (vma->vm_pgoff != MBOCHS_MEMORY_BAR_OFFSET >> PAGE_SHIFT)
> +		return -EINVAL;
> +	if (vma->vm_end < vma->vm_start)
> +		return -EINVAL;
> +	if (vma->vm_end - vma->vm_start > mdev_state->memsize)
> +		return -EINVAL;
> +	if ((vma->vm_flags & VM_SHARED) == 0)
> +		return -EINVAL;
> +
> +	vma->vm_ops = &mbochs_region_vm_ops;
> +	vma->vm_private_data = mdev_state;
> +	return 0;
> +}
> +
> +static int mbochs_dmabuf_vm_fault(struct vm_fault *vmf)
> +{
> +	struct vm_area_struct *vma = vmf->vma;
> +	struct mbochs_dmabuf *dmabuf = vma->vm_private_data;
> +
> +	if (WARN_ON(vmf->pgoff >= dmabuf->pagecount))
> +		return VM_FAULT_SIGBUS;
> +
> +	vmf->page = dmabuf->pages[vmf->pgoff];
> +	get_page(vmf->page);
> +	return 0;
> +}
> +
> +static const struct vm_operations_struct mbochs_dmabuf_vm_ops = {
> +	.fault = mbochs_dmabuf_vm_fault,
> +};
> +
> +static int mbochs_mmap_dmabuf(struct dma_buf *buf, struct vm_area_struct *vma)
> +{
> +	struct mbochs_dmabuf *dmabuf = buf->priv;
> +	struct device *dev = mdev_dev(dmabuf->mdev_state->mdev);
> +
> +	dev_dbg(dev, "%s: %d\n", __func__, dmabuf->id);
> +
> +	if ((vma->vm_flags & VM_SHARED) == 0)
> +		return -EINVAL;
> +
> +	vma->vm_ops = &mbochs_dmabuf_vm_ops;
> +	vma->vm_private_data = dmabuf;
> +	return 0;
> +}
> +
> +static void mbochs_print_dmabuf(struct mbochs_dmabuf *dmabuf,
> +				const char *prefix)
> +{
> +	struct device *dev = mdev_dev(dmabuf->mdev_state->mdev);
> +	u32 fourcc = dmabuf->mode.drm_format;
> +
> +	dev_dbg(dev, "%s/%d: %c%c%c%c, %dx%d, stride %d, off 0x%llx, size 0x%llx, pages %ld\n",
> +		prefix, dmabuf->id,
> +		fourcc ? ((fourcc >>  0) & 0xff) : '-',
> +		fourcc ? ((fourcc >>  8) & 0xff) : '-',
> +		fourcc ? ((fourcc >> 16) & 0xff) : '-',
> +		fourcc ? ((fourcc >> 24) & 0xff) : '-',
> +		dmabuf->mode.width, dmabuf->mode.height, dmabuf->mode.stride,
> +		dmabuf->mode.offset, dmabuf->mode.size, dmabuf->pagecount);
> +}
> +
> +static struct sg_table *mbochs_map_dmabuf(struct dma_buf_attachment *at,
> +					  enum dma_data_direction direction)
> +{
> +	struct mbochs_dmabuf *dmabuf = at->dmabuf->priv;
> +	struct device *dev = mdev_dev(dmabuf->mdev_state->mdev);
> +	struct sg_table *sg;
> +
> +	dev_dbg(dev, "%s: %d\n", __func__, dmabuf->id);
> +
> +	sg = kzalloc(sizeof(*sg), GFP_KERNEL);
> +	if (!sg)
> +		goto err1;
> +	if (sg_alloc_table_from_pages(sg, dmabuf->pages, dmabuf->pagecount,
> +				      0, dmabuf->mode.size, GFP_KERNEL) < 0)
> +		goto err2;
> +	if (!dma_map_sg(at->dev, sg->sgl, sg->nents, direction))
> +		goto err3;
> +
> +	return sg;
> +
> +err3:
> +	sg_free_table(sg);
> +err2:
> +	kfree(sg);
> +err1:
> +	return ERR_PTR(-ENOMEM);
> +}
> +
> +static void mbochs_unmap_dmabuf(struct dma_buf_attachment *at,
> +				struct sg_table *sg,
> +				enum dma_data_direction direction)
> +{
> +	struct mbochs_dmabuf *dmabuf = at->dmabuf->priv;
> +	struct device *dev = mdev_dev(dmabuf->mdev_state->mdev);
> +
> +	dev_dbg(dev, "%s: %d\n", __func__, dmabuf->id);
> +
> +	sg_free_table(sg);
> +	kfree(sg);
> +}
> +
> +static void mbochs_release_dmabuf(struct dma_buf *buf)
> +{
> +	struct mbochs_dmabuf *dmabuf = buf->priv;
> +	struct mdev_state *mdev_state = dmabuf->mdev_state;
> +	struct device *dev = mdev_dev(mdev_state->mdev);
> +	pgoff_t pg;
> +
> +	dev_dbg(dev, "%s: %d\n", __func__, dmabuf->id);
> +
> +	for (pg = 0; pg < dmabuf->pagecount; pg++)
> +		put_page(dmabuf->pages[pg]);
> +
> +	mutex_lock(&mdev_state->ops_lock);
> +	dmabuf->buf = NULL;
> +	if (dmabuf->unlinked)
> +		kfree(dmabuf);
> +	mutex_unlock(&mdev_state->ops_lock);
> +}
> +
> +static void *mbochs_kmap_atomic_dmabuf(struct dma_buf *buf, unsigned long page_num)
> +{
> +	struct mbochs_dmabuf *dmabuf = buf->priv;
> +	struct page *page = dmabuf->pages[page_num];
> +
> +	return kmap_atomic(page);
> +}
> +
> +static void *mbochs_kmap_dmabuf(struct dma_buf *buf, unsigned long page_num)
> +{
> +	struct mbochs_dmabuf *dmabuf = buf->priv;
> +	struct page *page = dmabuf->pages[page_num];
> +
> +	return kmap(page);
> +}
> +
> +static struct dma_buf_ops mbochs_dmabuf_ops = {
> +	.map_dma_buf	  = mbochs_map_dmabuf,
> +	.unmap_dma_buf	  = mbochs_unmap_dmabuf,
> +	.release	  = mbochs_release_dmabuf,
> +	.map_atomic	  = mbochs_kmap_atomic_dmabuf,
> +	.map		  = mbochs_kmap_dmabuf,
> +	.mmap		  = mbochs_mmap_dmabuf,
> +};
> +
> +static struct mbochs_dmabuf *mbochs_dmabuf_alloc(struct mdev_state *mdev_state,
> +						 struct mbochs_mode *mode)
> +{
> +	struct mbochs_dmabuf *dmabuf;
> +	pgoff_t page_offset, pg;
> +
> +	WARN_ON(!mutex_is_locked(&mdev_state->ops_lock));
> +
> +	dmabuf = kzalloc(sizeof(struct mbochs_dmabuf), GFP_KERNEL);
> +	if (!dmabuf)
> +		return NULL;
> +
> +	dmabuf->mode = *mode;
> +	dmabuf->id = mdev_state->next_id++;
> +	dmabuf->pagecount = DIV_ROUND_UP(mode->size, PAGE_SIZE);
> +	dmabuf->pages = kzalloc(dmabuf->pagecount * sizeof(struct page*),
> +				GFP_KERNEL);
> +	if (!dmabuf->pages)
> +		goto err_free_dmabuf;
> +
> +	page_offset = dmabuf->mode.offset >> PAGE_SHIFT;
> +	for (pg = 0; pg < dmabuf->pagecount; pg++) {
> +		dmabuf->pages[pg] = __mbochs_get_page(mdev_state, page_offset + pg);
> +		if (!dmabuf->pages[pg])
> +			goto err_free_pages;
> +	}
> +
> +	dmabuf->mdev_state = mdev_state;
> +	list_add(&dmabuf->next, &mdev_state->dmabufs);
> +
> +	mbochs_print_dmabuf(dmabuf, __func__);
> +	return dmabuf;
> +
> +err_free_pages:
> +	while (pg > 0)
> +		put_page(dmabuf->pages[--pg]);
> +	kfree(dmabuf->pages);
> +err_free_dmabuf:
> +	kfree(dmabuf);
> +	return NULL;
> +}
> +
> +static struct mbochs_dmabuf *
> +mbochs_dmabuf_find_by_mode(struct mdev_state *mdev_state,
> +			   struct mbochs_mode *mode)
> +{
> +	struct mbochs_dmabuf *dmabuf;
> +
> +	WARN_ON(!mutex_is_locked(&mdev_state->ops_lock));
> +
> +	list_for_each_entry(dmabuf, &mdev_state->dmabufs, next)
> +		if (mbochs_modes_equal(&dmabuf->mode, mode))
> +			return dmabuf;
> +
> +	return NULL;
> +}
> +
> +static struct mbochs_dmabuf *
> +mbochs_dmabuf_find_by_id(struct mdev_state *mdev_state, u32 id)
> +{
> +	struct mbochs_dmabuf *dmabuf;
> +
> +	WARN_ON(!mutex_is_locked(&mdev_state->ops_lock));
> +
> +	list_for_each_entry(dmabuf, &mdev_state->dmabufs, next)
> +		if (dmabuf->id == id)
> +			return dmabuf;
> +
> +	return NULL;
> +}
> +
> +static int mbochs_dmabuf_export(struct mbochs_dmabuf *dmabuf)
> +{
> +	struct mdev_state *mdev_state = dmabuf->mdev_state;
> +	struct device *dev = mdev_dev(mdev_state->mdev);
> +	DEFINE_DMA_BUF_EXPORT_INFO(exp_info);
> +	struct dma_buf *buf;
> +
> +	WARN_ON(!mutex_is_locked(&mdev_state->ops_lock));
> +
> +	if (!IS_ALIGNED(dmabuf->mode.offset, PAGE_SIZE)) {
> +		dev_info_ratelimited(dev, "%s: framebuffer not page-aligned\n",
> +				     __func__);
> +		return -EINVAL;
> +	}
> +
> +	exp_info.ops = &mbochs_dmabuf_ops;
> +	exp_info.size = dmabuf->mode.size;
> +	exp_info.priv = dmabuf;
> +
> +	buf = dma_buf_export(&exp_info);
> +	if (IS_ERR(buf)) {
> +		dev_info_ratelimited(dev, "%s: dma_buf_export failed: %ld\n",
> +				     __func__, PTR_ERR(buf));
> +		return PTR_ERR(buf);
> +	}
> +
> +	dmabuf->buf = buf;
> +	dev_dbg(dev, "%s: %d\n", __func__, dmabuf->id);
> +	return 0;
> +}
> +
> +int mbochs_get_region_info(struct mdev_device *mdev,
> +			   struct vfio_region_info *region_info,
> +			   u16 *cap_type_id, void **cap_type)
> +{
> +	struct mdev_state *mdev_state;
> +
> +	mdev_state = mdev_get_drvdata(mdev);
> +	if (!mdev_state)
> +		return -EINVAL;
> +
> +	if (region_info->index >= VFIO_PCI_NUM_REGIONS)
> +		return -EINVAL;
> +
> +	switch (region_info->index) {
> +	case VFIO_PCI_CONFIG_REGION_INDEX:
> +		region_info->offset = 0;
> +		region_info->size   = MBOCHS_CONFIG_SPACE_SIZE;
> +		region_info->flags  = (VFIO_REGION_INFO_FLAG_READ |
> +				       VFIO_REGION_INFO_FLAG_WRITE);
> +		break;
> +	case VFIO_PCI_BAR0_REGION_INDEX:
> +		region_info->offset = MBOCHS_MEMORY_BAR_OFFSET;
> +		region_info->size   = mdev_state->memsize;
> +		region_info->flags  = (VFIO_REGION_INFO_FLAG_READ  |
> +				       VFIO_REGION_INFO_FLAG_WRITE |
> +				       VFIO_REGION_INFO_FLAG_MMAP);

As with the region version, BAR0 doesn't actually seem to support
read(2)/write(2).
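
Supporting read(2)/write(2) there would just mean routing the memory
bar through mdev_access() as well; an untested sketch, reusing
__mbochs_get_page() (the read/write path only issues small aligned
accesses, so a single access never crosses a page):

	} else if (pos >= MBOCHS_MEMORY_BAR_OFFSET &&
		   pos + count <=
		   MBOCHS_MEMORY_BAR_OFFSET + mdev_state->memsize) {
		struct page *pg;
		void *map;

		pos -= MBOCHS_MEMORY_BAR_OFFSET;
		pg = __mbochs_get_page(mdev_state, pos >> PAGE_SHIFT);
		if (!pg) {
			ret = -1;
			goto accessfailed;
		}
		map = kmap(pg);
		if (is_write)
			memcpy(map + (pos & ~PAGE_MASK), buf, count);
		else
			memcpy(buf, map + (pos & ~PAGE_MASK), count);
		kunmap(pg);
		put_page(pg);
	}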

Nice set of drivers, thanks for providing these as samples!  Thanks,

Alex

> +		break;
> +	case VFIO_PCI_BAR2_REGION_INDEX:
> +		region_info->offset = MBOCHS_MMIO_BAR_OFFSET;
> +		region_info->size   = MBOCHS_MMIO_BAR_SIZE;
> +		region_info->flags  = (VFIO_REGION_INFO_FLAG_READ  |
> +				       VFIO_REGION_INFO_FLAG_WRITE);
> +		break;
> +	default:
> +		region_info->size   = 0;
> +		region_info->offset = 0;
> +		region_info->flags  = 0;
> +	}
> +
> +	return 0;
> +}
> +
> +int mbochs_get_irq_info(struct mdev_device *mdev, struct vfio_irq_info *irq_info)
> +{
> +	irq_info->count = 0;
> +	return 0;
> +}
> +
> +int mbochs_get_device_info(struct mdev_device *mdev,
> +			 struct vfio_device_info *dev_info)
> +{
> +	dev_info->flags = VFIO_DEVICE_FLAGS_PCI;
> +	dev_info->num_regions = VFIO_PCI_NUM_REGIONS;
> +	dev_info->num_irqs = VFIO_PCI_NUM_IRQS;
> +	return 0;
> +}
> +
> +int mbochs_query_gfx_plane(struct mdev_device *mdev,
> +			   struct vfio_device_gfx_plane_info *plane)
> +{
> +	struct mdev_state *mdev_state = mdev_get_drvdata(mdev);
> +	struct device *dev = mdev_dev(mdev);
> +	struct mbochs_dmabuf *dmabuf;
> +	struct mbochs_mode mode;
> +	int ret;
> +
> +	if (plane->flags & VFIO_GFX_PLANE_TYPE_PROBE) {
> +		if (plane->flags == (VFIO_GFX_PLANE_TYPE_PROBE |
> +				     VFIO_GFX_PLANE_TYPE_DMABUF))
> +			return 0;
> +		return -EINVAL;
> +	}
> +
> +	if (plane->flags != VFIO_GFX_PLANE_TYPE_DMABUF)
> +		return -EINVAL;
> +
> +	plane->drm_format_mod = 0;
> +	plane->x_pos	      = 0;
> +	plane->y_pos	      = 0;
> +	plane->x_hot	      = 0;
> +	plane->y_hot	      = 0;
> +
> +	mutex_lock(&mdev_state->ops_lock);
> +
> +	ret = -EINVAL;
> +	if (plane->drm_plane_type == DRM_PLANE_TYPE_PRIMARY)
> +		ret = mbochs_check_framebuffer(mdev_state, &mode);
> +	if (ret < 0) {
> +		plane->drm_format     = 0;
> +		plane->width	      = 0;
> +		plane->height	      = 0;
> +		plane->stride	      = 0;
> +		plane->size	      = 0;
> +		plane->dmabuf_id      = 0;
> +		goto done;
> +	}
> +
> +	dmabuf = mbochs_dmabuf_find_by_mode(mdev_state, &mode);
> +	if (!dmabuf)
> +		dmabuf = mbochs_dmabuf_alloc(mdev_state, &mode);
> +	if (!dmabuf) {
> +		mutex_unlock(&mdev_state->ops_lock);
> +		return -ENOMEM;
> +	}
> +
> +	plane->drm_format     = dmabuf->mode.drm_format;
> +	plane->width	      = dmabuf->mode.width;
> +	plane->height	      = dmabuf->mode.height;
> +	plane->stride	      = dmabuf->mode.stride;
> +	plane->size	      = dmabuf->mode.size;
> +	plane->dmabuf_id      = dmabuf->id;
> +
> +done:
> +	if (plane->drm_plane_type == DRM_PLANE_TYPE_PRIMARY &&
> +	    mdev_state->active_id != plane->dmabuf_id) {
> +		dev_dbg(dev, "%s: primary: %d => %d\n", __func__,
> +			mdev_state->active_id, plane->dmabuf_id);
> +		mdev_state->active_id = plane->dmabuf_id;
> +	}
> +	mutex_unlock(&mdev_state->ops_lock);
> +	return 0;
> +}
> +
> +int mbochs_get_gfx_dmabuf(struct mdev_device *mdev,
> +			  u32 id)
> +{
> +	struct mdev_state *mdev_state = mdev_get_drvdata(mdev);
> +	struct mbochs_dmabuf *dmabuf;
> +
> +	mutex_lock(&mdev_state->ops_lock);
> +
> +	dmabuf = mbochs_dmabuf_find_by_id(mdev_state, id);
> +	if (!dmabuf) {
> +		mutex_unlock(&mdev_state->ops_lock);
> +		return -ENOENT;
> +	}
> +
> +	if (!dmabuf->buf)
> +		mbochs_dmabuf_export(dmabuf);
> +
> +	mutex_unlock(&mdev_state->ops_lock);
> +
> +	if (!dmabuf->buf)
> +		return -EINVAL;
> +
> +	return dma_buf_fd(dmabuf->buf, 0);
> +}
> +
> +static long mbochs_ioctl(struct mdev_device *mdev, unsigned int cmd,
> +			unsigned long arg)
> +{
> +	int ret = 0;
> +	unsigned long minsz;
> +	struct mdev_state *mdev_state;
> +
> +	mdev_state = mdev_get_drvdata(mdev);
> +
> +	switch (cmd) {
> +	case VFIO_DEVICE_GET_INFO:
> +	{
> +		struct vfio_device_info info;
> +
> +		minsz = offsetofend(struct vfio_device_info, num_irqs);
> +
> +		if (copy_from_user(&info, (void __user *)arg, minsz))
> +			return -EFAULT;
> +
> +		if (info.argsz < minsz)
> +			return -EINVAL;
> +
> +		ret = mbochs_get_device_info(mdev, &info);
> +		if (ret)
> +			return ret;
> +
> +		memcpy(&mdev_state->dev_info, &info, sizeof(info));
> +
> +		if (copy_to_user((void __user *)arg, &info, minsz))
> +			return -EFAULT;
> +
> +		return 0;
> +	}
> +	case VFIO_DEVICE_GET_REGION_INFO:
> +	{
> +		struct vfio_region_info info;
> +		u16 cap_type_id = 0;
> +		void *cap_type = NULL;
> +
> +		minsz = offsetofend(struct vfio_region_info, offset);
> +
> +		if (copy_from_user(&info, (void __user *)arg, minsz))
> +			return -EFAULT;
> +
> +		if (info.argsz < minsz)
> +			return -EINVAL;
> +
> +		ret = mbochs_get_region_info(mdev, &info, &cap_type_id,
> +					   &cap_type);
> +		if (ret)
> +			return ret;
> +
> +		if (copy_to_user((void __user *)arg, &info, minsz))
> +			return -EFAULT;
> +
> +		return 0;
> +	}
> +
> +	case VFIO_DEVICE_GET_IRQ_INFO:
> +	{
> +		struct vfio_irq_info info;
> +
> +		minsz = offsetofend(struct vfio_irq_info, count);
> +
> +		if (copy_from_user(&info, (void __user *)arg, minsz))
> +			return -EFAULT;
> +
> +		if ((info.argsz < minsz) ||
> +		    (info.index >= mdev_state->dev_info.num_irqs))
> +			return -EINVAL;
> +
> +		ret = mbochs_get_irq_info(mdev, &info);
> +		if (ret)
> +			return ret;
> +
> +		if (copy_to_user((void __user *)arg, &info, minsz))
> +			return -EFAULT;
> +
> +		return 0;
> +	}
> +
> +	case VFIO_DEVICE_QUERY_GFX_PLANE:
> +	{
> +		struct vfio_device_gfx_plane_info plane;
> +
> +		minsz = offsetofend(struct vfio_device_gfx_plane_info,
> +				    region_index);
> +
> +		if (copy_from_user(&plane, (void __user *)arg, minsz))
> +			return -EFAULT;
> +
> +		if (plane.argsz < minsz)
> +			return -EINVAL;
> +
> +		ret = mbochs_query_gfx_plane(mdev, &plane);
> +		if (ret)
> +			return ret;
> +
> +		if (copy_to_user((void __user *)arg, &plane, minsz))
> +			return -EFAULT;
> +
> +		return 0;
> +	}
> +
> +	case VFIO_DEVICE_GET_GFX_DMABUF:
> +	{
> +		u32 dmabuf_id;
> +
> +		if (get_user(dmabuf_id, (__u32 __user *)arg))
> +			return -EFAULT;
> +
> +		return mbochs_get_gfx_dmabuf(mdev, dmabuf_id);
> +	}
> +
> +	case VFIO_DEVICE_SET_IRQS:
> +		return -EINVAL;
> +
> +	case VFIO_DEVICE_RESET:
> +		return mbochs_reset(mdev);
> +	}
> +	return -ENOTTY;
> +}
> +
> +int mbochs_open(struct mdev_device *mdev)
> +{
> +	if (!try_module_get(THIS_MODULE))
> +		return -ENODEV;
> +
> +	return 0;
> +}
> +
> +void mbochs_close(struct mdev_device *mdev)
> +{
> +	struct mdev_state *mdev_state = mdev_get_drvdata(mdev);
> +	struct mbochs_dmabuf *dmabuf, *tmp;
> +
> +	mutex_lock(&mdev_state->ops_lock);
> +
> +	list_for_each_entry_safe(dmabuf, tmp, &mdev_state->dmabufs, next) {
> +		list_del(&dmabuf->next);
> +		if (dmabuf->buf) {
> +			/* free in mbochs_release_dmabuf() */
> +			dmabuf->unlinked = true;
> +		} else {
> +			kfree(dmabuf);
> +		}
> +	}
> +	mbochs_put_pages(mdev_state);
> +
> +	mutex_unlock(&mdev_state->ops_lock);
> +	module_put(THIS_MODULE);
> +}
> +
> +static ssize_t
> +memory_show(struct device *dev, struct device_attribute *attr,
> +	    char *buf)
> +{
> +	struct mdev_device *mdev = mdev_from_dev(dev);
> +	struct mdev_state *mdev_state = mdev_get_drvdata(mdev);
> +
> +	return sprintf(buf, "%d MB\n", mdev_state->type->mbytes);
> +}
> +static DEVICE_ATTR_RO(memory);
> +
> +static struct attribute *mdev_dev_attrs[] = {
> +	&dev_attr_memory.attr,
> +	NULL,
> +};
> +
> +static const struct attribute_group mdev_dev_group = {
> +	.name  = "vendor",
> +	.attrs = mdev_dev_attrs,
> +};
> +
> +const struct attribute_group *mdev_dev_groups[] = {
> +	&mdev_dev_group,
> +	NULL,
> +};
> +
> +static ssize_t
> +name_show(struct kobject *kobj, struct device *dev, char *buf)
> +{
> +	return sprintf(buf, "%s\n", kobj->name);
> +}
> +MDEV_TYPE_ATTR_RO(name);
> +
> +static ssize_t
> +description_show(struct kobject *kobj, struct device *dev, char *buf)
> +{
> +	const struct mbochs_type *type = mbochs_find_type(kobj);
> +
> +	return sprintf(buf, "virtual display, %d MB video memory\n",
> +		       type ? type->mbytes  : 0);
> +}
> +MDEV_TYPE_ATTR_RO(description);
> +
> +static ssize_t
> +available_instances_show(struct kobject *kobj, struct device *dev, char *buf)
> +{
> +	const struct mbochs_type *type = mbochs_find_type(kobj);
> +	int count = (max_mbytes - mbochs_used_mbytes) / type->mbytes;
> +
> +	return sprintf(buf, "%d\n", count);
> +}
> +MDEV_TYPE_ATTR_RO(available_instances);
> +
> +static ssize_t device_api_show(struct kobject *kobj, struct device *dev,
> +			       char *buf)
> +{
> +	return sprintf(buf, "%s\n", VFIO_DEVICE_API_PCI_STRING);
> +}
> +MDEV_TYPE_ATTR_RO(device_api);
> +
> +static struct attribute *mdev_types_attrs[] = {
> +	&mdev_type_attr_name.attr,
> +	&mdev_type_attr_description.attr,
> +	&mdev_type_attr_device_api.attr,
> +	&mdev_type_attr_available_instances.attr,
> +	NULL,
> +};
> +
> +static struct attribute_group mdev_type_group1 = {
> +	.name  = MBOCHS_TYPE_1,
> +	.attrs = mdev_types_attrs,
> +};
> +
> +static struct attribute_group mdev_type_group2 = {
> +	.name  = MBOCHS_TYPE_2,
> +	.attrs = mdev_types_attrs,
> +};
> +
> +static struct attribute_group mdev_type_group3 = {
> +	.name  = MBOCHS_TYPE_3,
> +	.attrs = mdev_types_attrs,
> +};
> +
> +static struct attribute_group *mdev_type_groups[] = {
> +	&mdev_type_group1,
> +	&mdev_type_group2,
> +	&mdev_type_group3,
> +	NULL,
> +};
> +
> +static const struct mdev_parent_ops mdev_fops = {
> +	.owner			= THIS_MODULE,
> +	.mdev_attr_groups	= mdev_dev_groups,
> +	.supported_type_groups	= mdev_type_groups,
> +	.create			= mbochs_create,
> +	.remove			= mbochs_remove,
> +	.open			= mbochs_open,
> +	.release		= mbochs_close,
> +	.read			= mbochs_read,
> +	.write			= mbochs_write,
> +	.ioctl			= mbochs_ioctl,
> +	.mmap			= mbochs_mmap,
> +};
> +
> +static const struct file_operations vd_fops = {
> +	.owner		= THIS_MODULE,
> +};
> +
> +static void mbochs_device_release(struct device *dev)
> +{
> +	/* nothing */
> +}
> +
> +static int __init mbochs_dev_init(void)
> +{
> +	int ret = 0;
> +
> +	ret = alloc_chrdev_region(&mbochs_devt, 0, MINORMASK, MBOCHS_NAME);
> +	if (ret < 0) {
> +		pr_err("Error: failed to register mbochs_dev, err: %d\n", ret);
> +		return ret;
> +	}
> +	cdev_init(&mbochs_cdev, &vd_fops);
> +	cdev_add(&mbochs_cdev, mbochs_devt, MINORMASK);
> +	pr_info("%s: major %d\n", __func__, MAJOR(mbochs_devt));
> +
> +	mbochs_class = class_create(THIS_MODULE, MBOCHS_CLASS_NAME);
> +	if (IS_ERR(mbochs_class)) {
> +		pr_err("Error: failed to register mbochs_dev class\n");
> +		ret = PTR_ERR(mbochs_class);
> +		goto failed1;
> +	}
> +	mbochs_dev.class = mbochs_class;
> +	mbochs_dev.release = mbochs_device_release;
> +	dev_set_name(&mbochs_dev, "%s", MBOCHS_NAME);
> +
> +	ret = device_register(&mbochs_dev);
> +	if (ret)
> +		goto failed2;
> +
> +	ret = mdev_register_device(&mbochs_dev, &mdev_fops);
> +	if (ret)
> +		goto failed3;
> +
> +	return 0;
> +
> +failed3:
> +	device_unregister(&mbochs_dev);
> +failed2:
> +	class_destroy(mbochs_class);
> +failed1:
> +	cdev_del(&mbochs_cdev);
> +	unregister_chrdev_region(mbochs_devt, MINORMASK);
> +	return ret;
> +}
> +
> +static void __exit mbochs_dev_exit(void)
> +{
> +	mbochs_dev.bus = NULL;
> +	mdev_unregister_device(&mbochs_dev);
> +
> +	device_unregister(&mbochs_dev);
> +	cdev_del(&mbochs_cdev);
> +	unregister_chrdev_region(mbochs_devt, MINORMASK);
> +	class_destroy(mbochs_class);
> +	mbochs_class = NULL;
> +}
> +
> +module_init(mbochs_dev_init)
> +module_exit(mbochs_dev_exit)
> diff --git a/samples/Kconfig b/samples/Kconfig
> index 755430c788..5de0674cdd 100644
> --- a/samples/Kconfig
> +++ b/samples/Kconfig
> @@ -128,6 +128,19 @@ config SAMPLE_VFIO_MDEV_MDPY_FB
>  	help
>  	  Guest fbdev driver for the virtual display sample driver.
>  
> +config SAMPLE_VFIO_MDEV_MBOCHS
> +	tristate "Build VFIO mdpy example mediated device sample code -- loadable modules only"
> +	depends on VFIO_MDEV_DEVICE && m
> +	help
> +	  Build a virtual display sample driver for use as a VFIO
> +	  mediated device.  It supports the dmabuf display interface
> +	  (VFIO_GFX_PLANE_TYPE_DMABUF).
> +	  Emulate enough of qemu stdvga to make bochs-drm.ko happy.
> +	  That is basically the vram memory bar and the bochs dispi
> +	  interface vbe registers in the mmio register bar.
> +	  Specifically it does *not* include any legacy vga stuff.
> +	  Device looks a lot like "qemu -device secondary-vga".
> +
>  config SAMPLE_STATX
>  	bool "Build example extended-stat using code"
>  	depends on BROKEN
> diff --git a/samples/vfio-mdev/Makefile b/samples/vfio-mdev/Makefile
> index 7a5790aaec..7db889ca13 100644
> --- a/samples/vfio-mdev/Makefile
> +++ b/samples/vfio-mdev/Makefile
> @@ -1,3 +1,4 @@
>  obj-$(CONFIG_SAMPLE_VFIO_MDEV_MTTY) += mtty.o
>  obj-$(CONFIG_SAMPLE_VFIO_MDEV_MDPY) += mdpy.o
>  obj-$(CONFIG_SAMPLE_VFIO_MDEV_MDPY_FB) += mdpy-fb.o
> +obj-$(CONFIG_SAMPLE_VFIO_MDEV_MBOCHS) += mbochs.o

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 1/3] sample: vfio mdev display - host device
  2018-04-24  2:41   ` Alex Williamson
@ 2018-04-24  6:29     ` Gerd Hoffmann
  0 siblings, 0 replies; 41+ messages in thread
From: Gerd Hoffmann @ 2018-04-24  6:29 UTC (permalink / raw)
  To: Alex Williamson; +Cc: kvm, kwankhede, open list

  Hi,

> > +/* pci ids */
> > +#define MDPY_PCI_VENDOR_ID	0x1b36 /* redhat */
> > +#define MDPY_PCI_DEVICE_ID	0x00f0
> 
> I don't see this on pci-ids, so I assume we're just squatting on an
> ID.  How do we do that without risking that we don't interfere with
> some future user?  Are we relying on this being a non-default sample
> device?  Should we just ask for an allocation?

It's grabbed from the qemu id range.
Allocating one is probably a good idea even for a sample device.

> > +#define MDPY_PCI_SUBVENDOR_ID	PCI_SUBVENDOR_ID_REDHAT_QUMRANET
> > +#define MDPY_PCI_SUBDEVICE_ID	PCI_SUBDEVICE_ID_QEMU
> > +
> > +/* pci cfg space offsets for fb config (dword) */
> > +#define MDPY_FORMAT_OFFSET	0x40
> > +#define MDPY_WIDTH_OFFSET	0x44
> > +#define MDPY_HEIGHT_OFFSET	0x48
> 
> As I understand, these are just registers in PCI config space outside
> of any capabilities.  Wouldn't it be more correct to put these within a
> vendor defined capability?

Can do that.
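
For the record, roughly what I have in mind, as a sketch (keeping the
0x40 offset for the capability; the MDPY_VENDORCAP_* names are made
up, the format/width/height dwords would then live inside the cap):

	/* vendor capability: 3 byte header, then format/width/height */
	STORE_LE16((u16 *) &mdev_state->vconfig[PCI_STATUS],
		   PCI_STATUS_CAP_LIST);
	mdev_state->vconfig[PCI_CAPABILITY_LIST] = MDPY_VENDORCAP_OFFSET;

	mdev_state->vconfig[MDPY_VENDORCAP_OFFSET + 0] = PCI_CAP_ID_VNDR;
	mdev_state->vconfig[MDPY_VENDORCAP_OFFSET + 1] = 0; /* next ptr */
	mdev_state->vconfig[MDPY_VENDORCAP_OFFSET + 2] = MDPY_VENDORCAP_SIZE;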

> > +		region_info->size   = mdev_state->memsize;
> > +		region_info->flags  = (VFIO_REGION_INFO_FLAG_READ  |
> > +				       VFIO_REGION_INFO_FLAG_WRITE |
> > +				       VFIO_REGION_INFO_FLAG_MMAP);
> 
> This doesn't appear to be true, the read and write functions call the
> access function which only handles the config space region.  Are these
> really mmap-only regions?

Yes, they are mmap-only.

> read/write access support is often useful
> for tracing and debugging, QEMU will break if x-no-mmap=on is used.

Hmm, can look into adding that, should not be that difficult after all.
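
For mdpy that boils down to a memcpy in mdpy_access(), something like
this sketch (assuming the framebuffer is the vmalloc'ed memblk):

	} else if (pos >= MDPY_MEMORY_BAR_OFFSET &&
		   pos + count <=
		   MDPY_MEMORY_BAR_OFFSET + mdev_state->memsize) {
		pos -= MDPY_MEMORY_BAR_OFFSET;
		if (is_write)
			memcpy(mdev_state->memblk + pos, buf, count);
		else
			memcpy(buf, mdev_state->memblk + pos, count);
	}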

cheers,
  Gerd

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [libvirt] [PATCH 0/3] sample: vfio mdev display devices.
  2018-04-23 21:40   ` Alex Williamson
@ 2018-04-24  7:17     ` Gerd Hoffmann
  2018-04-24 17:35       ` Alex Williamson
  2018-04-24 19:50     ` Kirti Wankhede
  1 sibling, 1 reply; 41+ messages in thread
From: Gerd Hoffmann @ 2018-04-24  7:17 UTC (permalink / raw)
  To: Alex Williamson
  Cc: kvm, Erik Skultety, libvirt, Tina Zhang, kwankhede, intel-gvt-dev

  Hi,

> Here's another proposal that's really growing on me:
> 
>  * Fix the vendor drivers!  Allow devices to be opened and probed
>    without these external dependencies.

Hmm.  If you try to use gvt with tcg, wouldn't qemu then think "device
probed ok, all green" even though that isn't the case?

cheers,
  Gerd

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [libvirt] [PATCH 0/3] sample: vfio mdev display devices.
  2018-04-24  7:17     ` Gerd Hoffmann
@ 2018-04-24 17:35       ` Alex Williamson
  2018-04-25  9:49         ` Zhang, Tina
  0 siblings, 1 reply; 41+ messages in thread
From: Alex Williamson @ 2018-04-24 17:35 UTC (permalink / raw)
  To: Gerd Hoffmann
  Cc: kvm, Erik Skultety, libvirt, Tina Zhang, kwankhede, intel-gvt-dev

On Tue, 24 Apr 2018 09:17:37 +0200
Gerd Hoffmann <kraxel@redhat.com> wrote:

>   Hi,
> 
> > Here's another proposal that's really growing on me:
> > 
> >  * Fix the vendor drivers!  Allow devices to be opened and probed
> >    without these external dependencies.  
> 
> Hmm.  If you try use gvt with tcg then, wouldn't qemu think "device
> probed ok, all green" then even though that isn't the case?

Well, is there a way to make it work with tcg?  That would be the best
solution.  Perhaps KVM could be handled as an accelerator rather than a
required component.  I don't really understand how the page tracking
interface is used and why it's not required by NVIDIA if it's so
fundamental to GVT-g.  Otherwise, are there other points at which the
device could refuse to be enabled?  For instance, what if the write to
enable bus-master in the PCI command register returned an error when the
device isn't fully configured?  Paolo had suggested offline that maybe
there could be a read-only mode of the device that allows probing.  I
think that would be a fair bit of work and complexity to support, but
I'm open to those sorts of ideas.  I can't be sure the NVIDIA
requirement isn't purely for accounting purposes within their own
proprietary userspace manager, without any real technical requirement.
Hoping Intel and NVIDIA can comment on these so we really understand
why these are in place before we bend over backwards for a secondary API
interface. Thanks,

Alex

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [libvirt] [PATCH 0/3] sample: vfio mdev display devices.
  2018-04-23 21:40   ` Alex Williamson
  2018-04-24  7:17     ` Gerd Hoffmann
@ 2018-04-24 19:50     ` Kirti Wankhede
  2018-04-24 22:59       ` Alex Williamson
  1 sibling, 1 reply; 41+ messages in thread
From: Kirti Wankhede @ 2018-04-24 19:50 UTC (permalink / raw)
  To: Alex Williamson, Gerd Hoffmann
  Cc: Neo Jia, kvm, Erik Skultety, libvirt, Tina Zhang, intel-gvt-dev



On 4/24/2018 3:10 AM, Alex Williamson wrote:
> On Wed, 18 Apr 2018 12:31:53 -0600
> Alex Williamson <alex.williamson@redhat.com> wrote:
> 
>> On Mon,  9 Apr 2018 12:35:10 +0200
>> Gerd Hoffmann <kraxel@redhat.com> wrote:
>>
>>> This little series adds three drivers, for demo-ing and testing vfio
>>> display interface code.  There is one mdev device for each interface
>>> type (mdpy.ko for region and mbochs.ko for dmabuf).  
>>
>> Erik Skultety brought up a good question today regarding how libvirt is
>> meant to handle these different flavors of display interfaces and
>> knowing whether a given mdev device has display support at all.  It
>> seems that we cannot simply use the default display=auto because
>> libvirt needs to specifically configure gl support for a dmabuf type
>> interface versus not having such a requirement for a region interface,
>> perhaps even removing the emulated graphics in some cases (though I
>> don't think we have boot graphics through either solution yet).
>> Additionally, GVT-g seems to need the x-igd-opregion support
>> enabled(?), which is a non-starter for libvirt as it's an experimental
>> option!
>>
>> Currently the only way to determine display support is through the
>> VFIO_DEVICE_QUERY_GFX_PLANE ioctl, but for libvirt to probe that on
>> their own they'd need to get to the point where they could open the
>> vfio device and perform the ioctl.  That means opening a vfio
>> container, adding the group, setting the iommu type, and getting the
>> device.  I was initially a bit appalled at asking libvirt to do that,
>> but the alternative is to put this information in sysfs, but doing that
>> we risk that we need to describe every nuance of the mdev device
>> through sysfs and it becomes a dumping ground for every possible
>> feature an mdev device might have.
>>

One or two sysfs files for each feature shouldn't be that much
overhead? In the kernel, other subsystems expose capabilities through
sysfs; the PCI subsystem, for example, adds a 'boot_vga' file for VGA
devices which returns 0/1 depending on whether it is the boot VGA
device. Similarly 'd3cold_allowed', 'msi_bus'...

>> So I was ready to return and suggest that maybe libvirt should probe
>> the device to know about these ancillary configuration details, but
>> then I remembered that both mdev vGPU vendors had external dependencies
>> to even allow probing the device.  KVMGT will fail to open the device
>> if it's not associated with an instance of KVM and NVIDIA vGPU, I
>> believe, will fail if the vGPU manager process cannot find the QEMU
>> instance to extract the VM UUID.  (Both of these were bad ideas)
> 
> Here's another proposal that's really growing on me:
> 
>  * Fix the vendor drivers!  Allow devices to be opened and probed
>    without these external dependencies.
>  * Libvirt uses the existing vfio API to open the device and probe the
>    necessary ioctls, if it can't probe the device, the feature is
>    unavailable, ie. display=off, no migration.
> 

I'm trying to think of a simpler mechanism using sysfs that could work
for any feature and let libvirt check source-destination migration
compatibility before initiating migration.

I have another proposal:
* Add an ioctl VFIO_DEVICE_PROBE_FEATURES:
struct vfio_device_features {
    __u32 argsz;
    __u32 features;
}

Define bit for each feature:
#define VFIO_DEVICE_FEATURE_DISPLAY_REGION	(1 << 0)
#define VFIO_DEVICE_FEATURE_DISPLAY_DMABUF	(1 << 1)
#define VFIO_DEVICE_FEATURE_MIGRATION		(1 << 2)

* Vendor driver returns bitmask of supported features during
initialization phase.

* In the vfio core module, trap this ioctl for each device in
vfio_device_fops_unl_ioctl(), check the features bitmask returned by the
vendor driver, and add a sysfs file if the feature is supported by that
device. This sysfs file would return 0/1.

For migration this bit will only indicate whether the host driver
supports the migration feature.
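
For illustration, userspace usage of the proposed ioctl might look
like this (the ioctl and feature bits are part of this proposal, not
an existing API):

	struct vfio_device_features feat = { .argsz = sizeof(feat) };

	if (ioctl(device_fd, VFIO_DEVICE_PROBE_FEATURES, &feat) == 0) {
		if (feat.features & VFIO_DEVICE_FEATURE_DISPLAY_REGION)
			; /* configure a region-based display */
		if (feat.features & VFIO_DEVICE_FEATURE_MIGRATION)
			; /* device is a migration candidate */
	}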

For a source-destination compatibility check libvirt would need more
data/variables to check, like:
* whether the same 'mdev_type' device is creatable at the destination,
   i.e. if ('mdev_type'->available_instances > 0)

* whether host_driver_version at source and destination are compatible.
Host drivers from the same release branch should be mostly compatible,
but if there are major changes in structures or APIs, host drivers from
different branches might not be compatible. For example, if source and
destination are from different branches and one of the structures has
changed, then data collected at the source might not be compatible with
the structures at the destination, and typecasting it to the changed
structures would mess up the migrated data during restoration.

* whether guest_driver_version is compatible with the host driver at the
destination.
For mdev devices, the guest driver communicates with the host driver in
some form. If there are changes in the structures/APIs of that
communication, the guest driver at the source might not be compatible
with the host driver at the destination.

The 'available_instances' sysfs file already exists; the latter two
should be added by the vendor driver, and libvirt can use all three for
the migration compatibility check.

Thanks,
Kirti

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [libvirt] [PATCH 0/3] sample: vfio mdev display devices.
  2018-04-24 19:50     ` Kirti Wankhede
@ 2018-04-24 22:59       ` Alex Williamson
  2018-04-25 15:30         ` Kirti Wankhede
  0 siblings, 1 reply; 41+ messages in thread
From: Alex Williamson @ 2018-04-24 22:59 UTC (permalink / raw)
  To: Kirti Wankhede
  Cc: Neo Jia, kvm, Erik Skultety, libvirt, Tina Zhang, Gerd Hoffmann,
	intel-gvt-dev

On Wed, 25 Apr 2018 01:20:08 +0530
Kirti Wankhede <kwankhede@nvidia.com> wrote:

> On 4/24/2018 3:10 AM, Alex Williamson wrote:
> > On Wed, 18 Apr 2018 12:31:53 -0600
> > Alex Williamson <alex.williamson@redhat.com> wrote:
> >   
> >> On Mon,  9 Apr 2018 12:35:10 +0200
> >> Gerd Hoffmann <kraxel@redhat.com> wrote:
> >>  
> >>> This little series adds three drivers, for demo-ing and testing vfio
> >>> display interface code.  There is one mdev device for each interface
> >>> type (mdpy.ko for region and mbochs.ko for dmabuf).    
> >>
> >> Erik Skultety brought up a good question today regarding how libvirt is
> >> meant to handle these different flavors of display interfaces and
> >> knowing whether a given mdev device has display support at all.  It
> >> seems that we cannot simply use the default display=auto because
> >> libvirt needs to specifically configure gl support for a dmabuf type
> >> interface versus not having such a requirement for a region interface,
> >> perhaps even removing the emulated graphics in some cases (though I
> >> don't think we have boot graphics through either solution yet).
> >> Additionally, GVT-g seems to need the x-igd-opregion support
> >> enabled(?), which is a non-starter for libvirt as it's an experimental
> >> option!
> >>
> >> Currently the only way to determine display support is through the
> >> VFIO_DEVICE_QUERY_GFX_PLANE ioctl, but for libvirt to probe that on
> >> their own they'd need to get to the point where they could open the
> >> vfio device and perform the ioctl.  That means opening a vfio
> >> container, adding the group, setting the iommu type, and getting the
> >> device.  I was initially a bit appalled at asking libvirt to do that,
> >> but the alternative is to put this information in sysfs, but doing that
> >> we risk that we need to describe every nuance of the mdev device
> >> through sysfs and it becomes a dumping ground for every possible
> >> feature an mdev device might have.
> >>  
> 
> One or two sysfs file for each feature shouldn't be that much of over
> head? In kernel, other subsystem modules expose capability through
> sysfs, like PCI subsystem adds 'boot_vga' file for VGA device which
> returns 0/1 depending on if its boot VGA device. Similarly
> 'd3cold_allowed', 'msi_bus'...

Obviously we could add sysfs files, but unlike properties that the PCI
core exposes about struct pci_dev fields, the idea of a vfio_device is
much more abstract.  Each bus driver creates its own device
representation, so we have a top level vfio_device referencing through
an opaque pointer a vfio_pci_device, vfio_platform_device, or
mdev_device, and each mdev vendor driver creates its own private data
structure below the mdev_device.  So it's not quite as simple as one new
attribute "show" function to handle all devices of that bus_type.  We
need a consistent implementation in each bus driver and vendor driver
or we need to figure out how to percolate the information up to the
vfio core.  Your idea below seems to take the percolate approach.
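
As a rough illustration of that percolate approach, the bus driver
could hand vfio-core a feature bitmap at registration time (the extra
argument below is an assumption, not the current vfio_add_group_dev()
signature):

int vfio_add_group_dev(struct device *dev,
		       const struct vfio_device_ops *ops,
		       void *device_data, u32 features);

/* vfio-core could then expose each feature bit as a per-device sysfs
 * attribute without reaching into bus-driver private data. */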
 
> >> So I was ready to return and suggest that maybe libvirt should probe
> >> the device to know about these ancillary configuration details, but
> >> then I remembered that both mdev vGPU vendors had external dependencies
> >> to even allow probing the device.  KVMGT will fail to open the device
> >> if it's not associated with an instance of KVM and NVIDIA vGPU, I
> >> believe, will fail if the vGPU manager process cannot find the QEMU
> >> instance to extract the VM UUID.  (Both of these were bad ideas)  
> > 
> > Here's another proposal that's really growing on me:
> > 
> >  * Fix the vendor drivers!  Allow devices to be opened and probed
> >    without these external dependencies.
> >  * Libvirt uses the existing vfio API to open the device and probe the
> >    necessary ioctls, if it can't probe the device, the feature is
> >    unavailable, ie. display=off, no migration.
> >   
> 
> I'm trying to think simpler mechanism using sysfs that could work for
> any feature and knowing source-destination migration compatibility check
> by libvirt before initiating migration.
> 
> I have another proposal:
> * Add a ioctl VFIO_DEVICE_PROBE_FEATURES
> struct vfio_device_features {
>     __u32 argsz;
>     __u32 features;
> }
> 
> Define bit for each feature:
> #define VFIO_DEVICE_FEATURE_DISPLAY_REGION	(1 << 0)
> #define VFIO_DEVICE_FEATURE_DISPLAY_DMABUF	(1 << 1)
> #define VFIO_DEVICE_FEATURE_MIGRATION		(1 << 2)
> 
> * Vendor driver returns bitmask of supported features during
> initialization phase.
> 
> * In vfio core module, trap this ioctl for each device  in
> vfio_device_fops_unl_ioctl(),

Whoops, chicken-and-egg problem: VFIO_GROUP_GET_DEVICE_FD is our
blocking point with mdev drivers; we can't get a device fd, so we can't
call an ioctl on the device fd.

> check features bitmask returned by vendor
> driver and add a sysfs file if feature is supported that device. This
> sysfs file would return 0/1.

I don't understand why we need an ioctl interface; if the user can get
to the device fd then we have existing interfaces to probe these
things. It seems like you're just wanting to pass a features bitmap
through to vfio_add_group_dev() that vfio-core would expose through
sysfs, but a list of feature bits doesn't convey enough info except for
the most basic uses.
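
For context, the existing probe path once a device fd is reachable
looks roughly like this (group number and device name are
placeholders, error handling omitted):

	int container = open("/dev/vfio/vfio", O_RDWR);
	int group = open("/dev/vfio/42", O_RDWR);	/* placeholder group */
	int device;

	ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
	ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);
	device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, "<mdev uuid>");

	struct vfio_device_gfx_plane_info plane = {
		.argsz = sizeof(plane),
		.flags = VFIO_GFX_PLANE_TYPE_PROBE | VFIO_GFX_PLANE_TYPE_DMABUF,
	};
	ioctl(device, VFIO_DEVICE_QUERY_GFX_PLANE, &plane);	/* 0: dmabuf display */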
 
> For migration this bit will only indicate if host driver supports
> migration feature.
> 
> For source and destination compatibility check libvirt would need more
> data/variables to check like,
> * if same type of 'mdev_type' device create-able at destination,
>    i.e. if ('mdev_type'->available_instances > 0)
> 
> * if host_driver_version at source and destination are compatible.
> Host driver from same release branch should be mostly compatible, but if
> there are major changes in structures or APIs, host drivers from
> different branches might not be compatible, for example, if source and
> destination are from different branches and one of the structure had
> changed, then data collected at source might not be compatible with
> structures at destination and typecasting it to changed structures would
> mess up migrated data during restoration.

Of course now you're asking that libvirt understand the release
versioning scheme of every vendor driver and that it remain
programmatically consistent.  We can't even do this with in-kernel
drivers.  And in the end, still the best we can do is guess.
 
> * if guest_driver_version is compatible with host driver at destination.
> For mdev devices, guest driver communicates with host driver in some
> form. If there are changes in structures/APIs of such communication,
> guest driver at source might not be compatible with host driver at
> destination.

And another guess plus now the guest driver is involved which libvirt
has no visibility to.
 
> 'available_instances' sysfs already exist, later two should be added by
> vendor driver which libvirt can use for migration compatibility check.

As noted previously, display and migration are not necessarily
mdev-only features; it's possible that vfio-pci or vfio-platform could
also implement these, so the sysfs interface cannot be restricted to
the mdev template and lifecycle interface.

One more try... we have a vfio_group fd.  This is created by the bus
drivers calling vfio_add_group_dev() and registers a struct device, a
struct vfio_device_ops, and private data.  Typically we only wire the
device_ops to the resulting file descriptor we get from
VFIO_GROUP_GET_DEVICE_FD, but could we enable sort of a nested ioctl
through the group fd?  The ioctl would need to take a string arg to
match to a device name, plus an ioctl cmd and arg for the device_ops
ioctl.  The group ioctl would need to filter cmds to known, benign
queries.  We'd also need to verify that the allowed ioctls have no
dependencies on setup done in device_ops.open().  *_INFO and
QUERY_GFX_PLANE ioctls would be the only candidates.  Bus drivers could
of course keep an open count in their private data so they know how the
ioctl is being called (if necessary) and the group fd only allows a
single open, so there's no risk that another user could interact with
the group in bad ways once the device is opened (and of course we use
file level access control on the group device file anyway).  This is
sort of a rethink of Paolo's suggestion of a read-only fd, but the fd
is the existing group fd and any references to the device would only be
held around the calling of the nested ioctl.  Could it work?  Thanks,

Alex

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [libvirt] [PATCH 0/3] sample: vfio mdev display devices.
  2018-04-24 17:35       ` Alex Williamson
@ 2018-04-25  9:49         ` Zhang, Tina
  0 siblings, 0 replies; 41+ messages in thread
From: Zhang, Tina @ 2018-04-25  9:49 UTC (permalink / raw)
  To: Alex Williamson, Gerd Hoffmann
  Cc: libvirt, kwankhede, intel-gvt-dev, kvm, Erik Skultety



> -----Original Message-----
> From: intel-gvt-dev [mailto:intel-gvt-dev-bounces@lists.freedesktop.org] On
> Behalf Of Alex Williamson
> Sent: Wednesday, April 25, 2018 1:36 AM
> To: Gerd Hoffmann <kraxel@redhat.com>
> Cc: kvm@vger.kernel.org; Erik Skultety <eskultet@redhat.com>; libvirt <libvir-
> list@redhat.com>; Zhang, Tina <tina.zhang@intel.com>;
> kwankhede@nvidia.com; intel-gvt-dev@lists.freedesktop.org
> Subject: Re: [PATCH 0/3] sample: vfio mdev display devices.
> 
> On Tue, 24 Apr 2018 09:17:37 +0200
> Gerd Hoffmann <kraxel@redhat.com> wrote:
> 
> >   Hi,
> >
> > > Here's another proposal that's really growing on me:
> > >
> > >  * Fix the vendor drivers!  Allow devices to be opened and probed
> > >    without these external dependencies.
> >
> > Hmm.  If you try use gvt with tcg then, wouldn't qemu think "device
> > probed ok, all green" then even though that isn't the case?
> 
> Well, is there a way to make it work with tcg?  That would be the best solution.
> Perhaps KVM could be handled as an accelerator rather than a required
> component.  I don't really understand how the page tracking interface is used
> and why it's not required by NVIDIA if it's so fundamental to GVT-g.  Otherwise,

GVT-g needs the hypervisor's (Xen's or KVM's) help to trap guest GPU
page table updates, so that GVT-g can update the shadow page table
correctly in the host, with host physical addresses rather than guest
physical addresses. As this page table is in memory, GVT-g needs the
hypervisor's help to write-protect it, so that the updates can be
trapped in time.

> are there other points at which the device could refuse to be enabled, for
> instance what if the write to enable bus-master in the PCI command register
> returned an error if the device isn't fully configured.  Paolo had suggested offline

If we add some logic to let GVT-g support the basic VFIO APIs even in
the tcg use case, would the following be reasonable?
1. A dummy vGPU is created with a UUID.
2. When VFIO_DEVICE_GET_INFO is invoked by libvirt, GVT-g reports that
this vGPU is actually a dummy one and cannot work (see the sketch after
the list).
3. Then libvirt chooses not to boot a VM with this dummy vGPU.
4. Maybe we also need some logic to let a VM with this dummy vGPU boot
and work just as if there were no vGPU support.
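
A sketch of step 2 from the userspace side, assuming a new flag were
defined for it (VFIO_DEVICE_FLAGS_DUMMY and reject_vgpu() below are
placeholders, not existing API):

	struct vfio_device_info info = { .argsz = sizeof(info) };

	ioctl(device_fd, VFIO_DEVICE_GET_INFO, &info);
	if (info.flags & VFIO_DEVICE_FLAGS_DUMMY)	/* assumed new flag */
		reject_vgpu();	/* libvirt declines to boot with this vGPU */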

Thanks.

BR,
Tina

> that maybe there could be a read-only mode of the device that allows probing.  I
> think that would be a fair bit of work and complexity to support, but I'm open to
> those sorts of ideas.  I can't be sure the NVIDIA requirement isn't purely for
> accounting purposes within their own proprietary userspace manager, without
> any real technical requirement.
> Hoping Intel and NVIDIA can comment on these so we really understand why
> these are in place before we bend over backwards for a secondary API interface.
> Thanks,
> 
> Alex
> _______________________________________________
> intel-gvt-dev mailing list
> intel-gvt-dev@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gvt-dev

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [libvirt] [PATCH 0/3] sample: vfio mdev display devices.
  2018-04-24 22:59       ` Alex Williamson
@ 2018-04-25 15:30         ` Kirti Wankhede
  2018-04-25 18:00           ` Alex Williamson
  0 siblings, 1 reply; 41+ messages in thread
From: Kirti Wankhede @ 2018-04-25 15:30 UTC (permalink / raw)
  To: Alex Williamson, Erik Skultety
  Cc: Neo Jia, kvm, libvirt, Tina Zhang, Gerd Hoffmann, intel-gvt-dev



On 4/25/2018 4:29 AM, Alex Williamson wrote:
> On Wed, 25 Apr 2018 01:20:08 +0530
> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> 
>> On 4/24/2018 3:10 AM, Alex Williamson wrote:
>>> On Wed, 18 Apr 2018 12:31:53 -0600
>>> Alex Williamson <alex.williamson@redhat.com> wrote:
>>>   
>>>> On Mon,  9 Apr 2018 12:35:10 +0200
>>>> Gerd Hoffmann <kraxel@redhat.com> wrote:
>>>>  
>>>>> This little series adds three drivers, for demo-ing and testing vfio
>>>>> display interface code.  There is one mdev device for each interface
>>>>> type (mdpy.ko for region and mbochs.ko for dmabuf).    
>>>>
>>>> Erik Skultety brought up a good question today regarding how libvirt is
>>>> meant to handle these different flavors of display interfaces and
>>>> knowing whether a given mdev device has display support at all.  It
>>>> seems that we cannot simply use the default display=auto because
>>>> libvirt needs to specifically configure gl support for a dmabuf type
>>>> interface versus not having such a requirement for a region interface,
>>>> perhaps even removing the emulated graphics in some cases (though I
>>>> don't think we have boot graphics through either solution yet).
>>>> Additionally, GVT-g seems to need the x-igd-opregion support
>>>> enabled(?), which is a non-starter for libvirt as it's an experimental
>>>> option!
>>>>
>>>> Currently the only way to determine display support is through the
>>>> VFIO_DEVICE_QUERY_GFX_PLANE ioctl, but for libvirt to probe that on
>>>> their own they'd need to get to the point where they could open the
>>>> vfio device and perform the ioctl.  That means opening a vfio
>>>> container, adding the group, setting the iommu type, and getting the
>>>> device.  I was initially a bit appalled at asking libvirt to do that,
>>>> but the alternative is to put this information in sysfs, but doing that
>>>> we risk that we need to describe every nuance of the mdev device
>>>> through sysfs and it becomes a dumping ground for every possible
>>>> feature an mdev device might have.
>>>>  
>>
>> One or two sysfs file for each feature shouldn't be that much of over
>> head? In kernel, other subsystem modules expose capability through
>> sysfs, like PCI subsystem adds 'boot_vga' file for VGA device which
>> returns 0/1 depending on if its boot VGA device. Similarly
>> 'd3cold_allowed', 'msi_bus'...
> 
> Obviously we could add sysfs files, but unlike properties that the PCI
> core exposes about struct pci_dev fields, the idea of a vfio_device is
> much more abstract.  Each bus driver creates its own device
> representation, so we have a top level vfio_device referencing through
> an opaque pointer a vfio_pci_device, vfio_platform_device, or
> mdev_device, and each mdev vendor driver creates its own private data
> structure below the mdev_device.  So it's not quite a simple as one new
> attribute "show" function to handle all devices of that bus_type.  We
> need a consistent implementation in each bus driver and vendor driver
> or we need to figure out how to percolate the information up to the
> vfio core.  Your idea below seems to take the percolate approach.
>  
>>>> So I was ready to return and suggest that maybe libvirt should probe
>>>> the device to know about these ancillary configuration details, but
>>>> then I remembered that both mdev vGPU vendors had external dependencies
>>>> to even allow probing the device.  KVMGT will fail to open the device
>>>> if it's not associated with an instance of KVM and NVIDIA vGPU, I
>>>> believe, will fail if the vGPU manager process cannot find the QEMU
>>>> instance to extract the VM UUID.  (Both of these were bad ideas)  
>>>
>>> Here's another proposal that's really growing on me:
>>>
>>>  * Fix the vendor drivers!  Allow devices to be opened and probed
>>>    without these external dependencies.
>>>  * Libvirt uses the existing vfio API to open the device and probe the
>>>    necessary ioctls, if it can't probe the device, the feature is
>>>    unavailable, ie. display=off, no migration.
>>>   
>>
>> I'm trying to think simpler mechanism using sysfs that could work for
>> any feature and knowing source-destination migration compatibility check
>> by libvirt before initiating migration.
>>
>> I have another proposal:
>> * Add a ioctl VFIO_DEVICE_PROBE_FEATURES
>> struct vfio_device_features {
>>     __u32 argsz;
>>     __u32 features;
>> }
>>
>> Define bit for each feature:
>> #define VFIO_DEVICE_FEATURE_DISPLAY_REGION	(1 << 0)
>> #define VFIO_DEVICE_FEATURE_DISPLAY_DMABUF	(1 << 1)
>> #define VFIO_DEVICE_FEATURE_MIGRATION		(1 << 2)
>>
>> * Vendor driver returns bitmask of supported features during
>> initialization phase.
>>
>> * In vfio core module, trap this ioctl for each device  in
>> vfio_device_fops_unl_ioctl(),
> 
> Whoops, chicken and egg problem, VFIO_GROUP_GET_DEVICE_FD is our
> blocking point with mdev drivers, we can't get a device fd, so we can't
> call an ioctl on the device fd.
> 

I'm sorry, I thought we could expose features when QEMU initializes, but
libvirt needs to know the supported features before QEMU initializes.


>> check features bitmask returned by vendor
>> driver and add a sysfs file if feature is supported that device. This
>> sysfs file would return 0/1.
> 
> I don't understand why we have an ioctl interface, if the user can get
> to the device fd then we have existing interfaces to probe these
> things, it seems like you're just wanting to pass a features bitmap
> through to vfio_add_group_dev() that vfio-core would expose through
> sysfs, but a list of feature bits doesn't convey enough info except for
> the most basic uses.
>  

Yes, vfio_add_group_dev() seems to be a better way to convey features to
the vfio core.

>> For migration this bit will only indicate if host driver supports
>> migration feature.
>>
>> For source and destination compatibility check libvirt would need more
>> data/variables to check like,
>> * if same type of 'mdev_type' device create-able at destination,
>>    i.e. if ('mdev_type'->available_instances > 0)
>>
>> * if host_driver_version at source and destination are compatible.
>> Host driver from same release branch should be mostly compatible, but if
>> there are major changes in structures or APIs, host drivers from
>> different branches might not be compatible, for example, if source and
>> destination are from different branches and one of the structure had
>> changed, then data collected at source might not be compatible with
>> structures at destination and typecasting it to changed structures would
>> mess up migrated data during restoration.
> 
> Of course now you're asking that libvirt understand the release
> versioning scheme of every vendor driver and that it remain
> programatically consistent.  We can't even do this with in-kernel
> drivers.  And in the end, still the best we can do is guess.
>

Libvirt doesn't need to understand the version; libvirt only needs to
strcmp the version strings from source and destination. If those are
equal, then libvirt would understand that they are compatible.


>> * if guest_driver_version is compatible with host driver at destination.
>> For mdev devices, guest driver communicates with host driver in some
>> form. If there are changes in structures/APIs of such communication,
>> guest driver at source might not be compatible with host driver at
>> destination.
> 
> And another guess plus now the guest driver is involved which libvirt
> has no visibility to.
>  

As above, libvirt only needs to do a strcmp.

>> 'available_instances' sysfs already exist, later two should be added by
>> vendor driver which libvirt can use for migration compatibility check.
> 
> As noted previously, display and migration are not necessarily
> mdev-only features, it's possible that vfio-pci or vfio-platform could
> also implement these, so the sysfs interface cannot be restricted to
> the mdev template and lifecycle interface.
> 

I agree.
The feature bitmask passed to the vfio core is not mdev specific. But
here 'available_instances' for the migration compatibility check is mdev
specific. If the mdev device is not creatable at the destination, there
is no point in libvirt initiating migration.

> One more try... we have a vfio_group fd.  This is created by the bus
> drivers calling vfio_add_group_dev() and registers a struct device, a
> struct vfio_device_ops, and private data.  Typically we only wire the
> device_ops to the resulting file descriptor we get from
> VFIO_GROUP_GET_DEVICE_FD, but could we enable sort of a nested ioctl
> through the group fd?  The ioctl would need to take a string arg to
> match to a device name, plus an ioctl cmd and arg for the device_ops
> ioctl.  The group ioctl would need to filter cmds to known, benign
> queries.  We'd also need to verify that the allowed ioctls have no
> dependencies on setup done in device_ops.open().

So these ioctls would be called without the device's open() call;
doesn't this seem to go against the standard file-operations model?

Thanks,
Kirti

>  *_INFO and
> QUERY_GFX_PLANE ioctls would be the only candidates.  Bus drivers could
> of course keep an open count in their private data so they know how the
> ioctl is being called (if necessary) and the group fd only allows a
> single open, so there's no risk that another user could interact with
> the group in bad ways once the device is opened (and of course we use
> file level access control on the group device file anyway).  This is
> sort of a rethink of Paolo's suggestion of a read-only fd, but the fd
> is the existing group fd and any references to the device would only be
> held around the calling of the nested ioctl.  Could it work?  Thanks,
> 

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [libvirt] [PATCH 0/3] sample: vfio mdev display devices.
  2018-04-25 15:30         ` Kirti Wankhede
@ 2018-04-25 18:00           ` Alex Williamson
  2018-04-25 19:52             ` Dr. David Alan Gilbert
  0 siblings, 1 reply; 41+ messages in thread
From: Alex Williamson @ 2018-04-25 18:00 UTC (permalink / raw)
  To: Kirti Wankhede
  Cc: Neo Jia, kvm, Erik Skultety, libvirt, Dr. David Alan Gilbert,
	Tina Zhang, Jiri Denemark, Gerd Hoffmann, Laine Stump,
	intel-gvt-dev

On Wed, 25 Apr 2018 21:00:39 +0530
Kirti Wankhede <kwankhede@nvidia.com> wrote:

> On 4/25/2018 4:29 AM, Alex Williamson wrote:
> > On Wed, 25 Apr 2018 01:20:08 +0530
> > Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >   
> >> On 4/24/2018 3:10 AM, Alex Williamson wrote:  
> >>> On Wed, 18 Apr 2018 12:31:53 -0600
> >>> Alex Williamson <alex.williamson@redhat.com> wrote:
> >>>     
> >>>> On Mon,  9 Apr 2018 12:35:10 +0200
> >>>> Gerd Hoffmann <kraxel@redhat.com> wrote:
> >>>>    
> >>>>> This little series adds three drivers, for demo-ing and testing vfio
> >>>>> display interface code.  There is one mdev device for each interface
> >>>>> type (mdpy.ko for region and mbochs.ko for dmabuf).      
> >>>>
> >>>> Erik Skultety brought up a good question today regarding how libvirt is
> >>>> meant to handle these different flavors of display interfaces and
> >>>> knowing whether a given mdev device has display support at all.  It
> >>>> seems that we cannot simply use the default display=auto because
> >>>> libvirt needs to specifically configure gl support for a dmabuf type
> >>>> interface versus not having such a requirement for a region interface,
> >>>> perhaps even removing the emulated graphics in some cases (though I
> >>>> don't think we have boot graphics through either solution yet).
> >>>> Additionally, GVT-g seems to need the x-igd-opregion support
> >>>> enabled(?), which is a non-starter for libvirt as it's an experimental
> >>>> option!
> >>>>
> >>>> Currently the only way to determine display support is through the
> >>>> VFIO_DEVICE_QUERY_GFX_PLANE ioctl, but for libvirt to probe that on
> >>>> their own they'd need to get to the point where they could open the
> >>>> vfio device and perform the ioctl.  That means opening a vfio
> >>>> container, adding the group, setting the iommu type, and getting the
> >>>> device.  I was initially a bit appalled at asking libvirt to do that,
> >>>> but the alternative is to put this information in sysfs, but doing that
> >>>> we risk that we need to describe every nuance of the mdev device
> >>>> through sysfs and it becomes a dumping ground for every possible
> >>>> feature an mdev device might have.
...    
> >>>> So I was ready to return and suggest that maybe libvirt should probe
> >>>> the device to know about these ancillary configuration details, but
> >>>> then I remembered that both mdev vGPU vendors had external dependencies
> >>>> to even allow probing the device.  KVMGT will fail to open the device
> >>>> if it's not associated with an instance of KVM and NVIDIA vGPU, I
> >>>> believe, will fail if the vGPU manager process cannot find the QEMU
> >>>> instance to extract the VM UUID.  (Both of these were bad ideas)    
> >>>
> >>> Here's another proposal that's really growing on me:
> >>>
> >>>  * Fix the vendor drivers!  Allow devices to be opened and probed
> >>>    without these external dependencies.
> >>>  * Libvirt uses the existing vfio API to open the device and probe the
> >>>    necessary ioctls, if it can't probe the device, the feature is
> >>>    unavailable, ie. display=off, no migration.
> >>>     
> >>
> >> I'm trying to think simpler mechanism using sysfs that could work for
> >> any feature and knowing source-destination migration compatibility check
> >> by libvirt before initiating migration.
> >>
> >> I have another proposal:
> >> * Add a ioctl VFIO_DEVICE_PROBE_FEATURES
> >> struct vfio_device_features {
> >>     __u32 argsz;
> >>     __u32 features;
> >> }
> >>
> >> Define bit for each feature:
> >> #define VFIO_DEVICE_FEATURE_DISPLAY_REGION	(1 << 0)
> >> #define VFIO_DEVICE_FEATURE_DISPLAY_DMABUF	(1 << 1)
> >> #define VFIO_DEVICE_FEATURE_MIGRATION		(1 << 2)
> >>
> >> * Vendor driver returns bitmask of supported features during
> >> initialization phase.
> >>
> >> * In vfio core module, trap this ioctl for each device  in
> >> vfio_device_fops_unl_ioctl(),  
> > 
> > Whoops, chicken and egg problem, VFIO_GROUP_GET_DEVICE_FD is our
> > blocking point with mdev drivers, we can't get a device fd, so we can't
> > call an ioctl on the device fd.
> >   
> 
> I'm sorry, I thought we could expose features when QEMU initialize, but
> libvirt needs to know supported features before QEMU initialize.
> 
> 
> >> check features bitmask returned by vendor
> >> driver and add a sysfs file if feature is supported that device. This
> >> sysfs file would return 0/1.  
> > 
> > I don't understand why we have an ioctl interface, if the user can get
> > to the device fd then we have existing interfaces to probe these
> > things, it seems like you're just wanting to pass a features bitmap
> > through to vfio_add_group_dev() that vfio-core would expose through
> > sysfs, but a list of feature bits doesn't convey enough info except for
> > the most basic uses.
> >    
> 
> Yes, vfio_add_group_dev() seems to be better way to convey features to
> vfio core.
> 
> >> For migration this bit will only indicate if host driver supports
> >> migration feature.
> >>
> >> For source and destination compatibility check libvirt would need more
> >> data/variables to check like,
> >> * if same type of 'mdev_type' device create-able at destination,
> >>    i.e. if ('mdev_type'->available_instances > 0)
> >>
> >> * if host_driver_version at source and destination are compatible.
> >> Host driver from same release branch should be mostly compatible, but if
> >> there are major changes in structures or APIs, host drivers from
> >> different branches might not be compatible, for example, if source and
> >> destination are from different branches and one of the structure had
> >> changed, then data collected at source might not be compatible with
> >> structures at destination and typecasting it to changed structures would
> >> mess up migrated data during restoration.  
> > 
> > Of course now you're asking that libvirt understand the release
> > versioning scheme of every vendor driver and that it remain
> > programatically consistent.  We can't even do this with in-kernel
> > drivers.  And in the end, still the best we can do is guess.
> >  
> 
> Libvirt doesn't need to understand the version, libvirt need to do
> strcmp version string from source and destination. If those are equal,
> then libvirt would understand that they are compatible.

Who's to say that the driver version and migration compatibility have
any relation at all?  Some drivers might focus on designing their own
migration interface that can maintain compatibility across versions
(QEMU does this); some drivers may only allow identical-version
migration (which is going to frustrate upper level management tools and
customers - RHEL goes to great lengths to support cross-version
migration).  We cannot have a one-size-fits-all rule here where the
driver version completely defines migration compatibility.

> >> * if guest_driver_version is compatible with host driver at destination.
> >> For mdev devices, guest driver communicates with host driver in some
> >> form. If there are changes in structures/APIs of such communication,
> >> guest driver at source might not be compatible with host driver at
> >> destination.  
> > 
> > And another guess plus now the guest driver is involved which libvirt
> > has no visibility to.
> >    
> 
> Like above libvirt need to do strcmp.

Insufficient, imo

> >> 'available_instances' sysfs already exist, later two should be added by
> >> vendor driver which libvirt can use for migration compatibility check.  
> > 
> > As noted previously, display and migration are not necessarily
> > mdev-only features, it's possible that vfio-pci or vfio-platform could
> > also implement these, so the sysfs interface cannot be restricted to
> > the mdev template and lifecycle interface.
> >   
> 
> I agree.
> Feature bitmask passed to vfio core is not mdev specific. But here
> 'available_instances' for migration compatibility check is mdev
> specific. If mdev device is not create-able at destination, there is no
> point in initiating migration by libvirt.

'available_instances' for migration compatibility check...?  We use
available_instances to know whether we have the resources to create a
given mdev type.  It's certainly a prerequisite to have a device of the
identical type at the migration target and how we define what is an
identical device for a directly assigned PCI device is yet another
overly complicated rat hole.  But an identical device doesn't
necessarily imply migration compatibility and I think that's the
problem we're tackling.  We cannot assume based only on the device type
that migration is compatible; that's basically saying we're never going
to have any bugs or oversights or new features in the migration stream.

Chatting with Laine, it may be worth a step back to include migration
experts and people up the stack with more visibility into how openstack
operates.  The issue here is that if vfio gains migration support then
we have a portion of the migration stream that is not under the control
of QEMU, we cannot necessarily tie it to a QEMU machine type and we
cannot necessarily dictate how the vfio bus driver (vendor driver)
handles versioning and compatibility.  My intent was to expose some
sort of migration information through the vfio API so that upper level
tools could determine source and target compatibility, but this in
itself is I think something new that those tools need to agree how it
might be done.  How would something like openstack want to handle not
only finding a migration target with a compatible device, but also
verifying if the device supports the migration format of the source
device?

Alternatively, should we do anything?  Is the problem too hard and we
should let the driver return an error when it receives an incompatible
migration stream, aborting the migration?

> > One more try... we have a vfio_group fd.  This is created by the bus
> > drivers calling vfio_add_group_dev() and registers a struct device, a
> > struct vfio_device_ops, and private data.  Typically we only wire the
> > device_ops to the resulting file descriptor we get from
> > VFIO_GROUP_GET_DEVICE_FD, but could we enable sort of a nested ioctl
> > through the group fd?  The ioctl would need to take a string arg to
> > match to a device name, plus an ioctl cmd and arg for the device_ops
> > ioctl.  The group ioctl would need to filter cmds to known, benign
> > queries.  We'd also need to verify that the allowed ioctls have no
> > dependencies on setup done in device_ops.open().  
> 
> So these ioctls would be called without devices open() call, doesn't
> this seem to be against file operations standard?

vfio_device_ops is modeled largely after file operations, but I don't
think we're bound by that for the interaction between vfio-core and the
vfio bus drivers.  We could make a separate callback for unprivileged
ioctls, but that seems like more work per driver when we really want to
maintain the identical API; we just want to provide a more limited
interface and change the calling point.
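
To make the nested group-fd ioctl concrete, a hypothetical sketch of
the argument it might take (names and layout are illustrative, not an
existing VFIO API):

struct vfio_group_device_query {
	__u32 argsz;
	__u32 flags;
	char  name[256];	/* device within the group to match */
	__u32 cmd;		/* filtered device_ops ioctl, e.g. *_INFO */
	__u64 arg;		/* pointer to that ioctl's own argument */
};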

An issue I thought of for migration though is that this path wouldn't
have access to the migration region and therefore if we place a header
within that region containing the compatibility and versioning
information, the user still couldn't access it.  This doesn't seem to
be a blocker though as we could put that information within the region
capability that defines the region as used for migration.  Possibly a
device could have multiple migration regions with different formats
for backwards compatibility; of course then we'd need a way to
determine which to use and which combinations have been validated.
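
A hypothetical shape for such a migration region capability, following
the existing vfio_info_cap_header pattern (fields are illustrative, not
a merged API):

struct vfio_region_info_cap_migration {
	struct vfio_info_cap_header header;
	__u32 stream_version;	/* migration stream format version */
	__u32 flags;
	__u8  compat[64];	/* opaque vendor compatibility blob */
};
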
Thanks,

Alex

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [libvirt] [PATCH 0/3] sample: vfio mdev display devices.
  2018-04-25 18:00           ` Alex Williamson
@ 2018-04-25 19:52             ` Dr. David Alan Gilbert
  2018-04-26 18:45               ` Kirti Wankhede
  0 siblings, 1 reply; 41+ messages in thread
From: Dr. David Alan Gilbert @ 2018-04-25 19:52 UTC (permalink / raw)
  To: Alex Williamson
  Cc: Neo Jia, kvm, Erik Skultety, libvirt, Tina Zhang, Kirti Wankhede,
	Gerd Hoffmann, Laine Stump, Jiri Denemark, intel-gvt-dev

* Alex Williamson (alex.williamson@redhat.com) wrote:
> On Wed, 25 Apr 2018 21:00:39 +0530
> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> 
> > On 4/25/2018 4:29 AM, Alex Williamson wrote:
> > > On Wed, 25 Apr 2018 01:20:08 +0530
> > > Kirti Wankhede <kwankhede@nvidia.com> wrote:
> > >   
> > >> On 4/24/2018 3:10 AM, Alex Williamson wrote:  
> > >>> On Wed, 18 Apr 2018 12:31:53 -0600
> > >>> Alex Williamson <alex.williamson@redhat.com> wrote:
> > >>>     
> > >>>> On Mon,  9 Apr 2018 12:35:10 +0200
> > >>>> Gerd Hoffmann <kraxel@redhat.com> wrote:
> > >>>>    
> > >>>>> This little series adds three drivers, for demo-ing and testing vfio
> > >>>>> display interface code.  There is one mdev device for each interface
> > >>>>> type (mdpy.ko for region and mbochs.ko for dmabuf).      
> > >>>>
> > >>>> Erik Skultety brought up a good question today regarding how libvirt is
> > >>>> meant to handle these different flavors of display interfaces and
> > >>>> knowing whether a given mdev device has display support at all.  It
> > >>>> seems that we cannot simply use the default display=auto because
> > >>>> libvirt needs to specifically configure gl support for a dmabuf type
> > >>>> interface versus not having such a requirement for a region interface,
> > >>>> perhaps even removing the emulated graphics in some cases (though I
> > >>>> don't think we have boot graphics through either solution yet).
> > >>>> Additionally, GVT-g seems to need the x-igd-opregion support
> > >>>> enabled(?), which is a non-starter for libvirt as it's an experimental
> > >>>> option!
> > >>>>
> > >>>> Currently the only way to determine display support is through the
> > >>>> VFIO_DEVICE_QUERY_GFX_PLANE ioctl, but for libvirt to probe that on
> > >>>> their own they'd need to get to the point where they could open the
> > >>>> vfio device and perform the ioctl.  That means opening a vfio
> > >>>> container, adding the group, setting the iommu type, and getting the
> > >>>> device.  I was initially a bit appalled at asking libvirt to do that,
> > >>>> but the alternative is to put this information in sysfs, but doing that
> > >>>> we risk that we need to describe every nuance of the mdev device
> > >>>> through sysfs and it becomes a dumping ground for every possible
> > >>>> feature an mdev device might have.
> ...    
> > >>>> So I was ready to return and suggest that maybe libvirt should probe
> > >>>> the device to know about these ancillary configuration details, but
> > >>>> then I remembered that both mdev vGPU vendors had external dependencies
> > >>>> to even allow probing the device.  KVMGT will fail to open the device
> > >>>> if it's not associated with an instance of KVM and NVIDIA vGPU, I
> > >>>> believe, will fail if the vGPU manager process cannot find the QEMU
> > >>>> instance to extract the VM UUID.  (Both of these were bad ideas)    
> > >>>
> > >>> Here's another proposal that's really growing on me:
> > >>>
> > >>>  * Fix the vendor drivers!  Allow devices to be opened and probed
> > >>>    without these external dependencies.
> > >>>  * Libvirt uses the existing vfio API to open the device and probe the
> > >>>    necessary ioctls, if it can't probe the device, the feature is
> > >>>    unavailable, ie. display=off, no migration.
> > >>>     
> > >>
> > >> I'm trying to think simpler mechanism using sysfs that could work for
> > >> any feature and knowing source-destination migration compatibility check
> > >> by libvirt before initiating migration.
> > >>
> > >> I have another proposal:
> > >> * Add a ioctl VFIO_DEVICE_PROBE_FEATURES
> > >> struct vfio_device_features {
> > >>     __u32 argsz;
> > >>     __u32 features;
> > >> }
> > >>
> > >> Define bit for each feature:
> > >> #define VFIO_DEVICE_FEATURE_DISPLAY_REGION	(1 << 0)
> > >> #define VFIO_DEVICE_FEATURE_DISPLAY_DMABUF	(1 << 1)
> > >> #define VFIO_DEVICE_FEATURE_MIGRATION		(1 << 2)
> > >>
> > >> * Vendor driver returns bitmask of supported features during
> > >> initialization phase.
> > >>
> > >> * In vfio core module, trap this ioctl for each device  in
> > >> vfio_device_fops_unl_ioctl(),  
> > > 
> > > Whoops, chicken and egg problem, VFIO_GROUP_GET_DEVICE_FD is our
> > > blocking point with mdev drivers, we can't get a device fd, so we can't
> > > call an ioctl on the device fd.
> > >   
> > 
> > I'm sorry, I thought we could expose features when QEMU initialize, but
> > libvirt needs to know supported features before QEMU initialize.
> > 
> > 
> > >> check features bitmask returned by vendor
> > >> driver and add a sysfs file if feature is supported that device. This
> > >> sysfs file would return 0/1.  
> > > 
> > > I don't understand why we have an ioctl interface, if the user can get
> > > to the device fd then we have existing interfaces to probe these
> > > things, it seems like you're just wanting to pass a features bitmap
> > > through to vfio_add_group_dev() that vfio-core would expose through
> > > sysfs, but a list of feature bits doesn't convey enough info except for
> > > the most basic uses.
> > >    
> > 
> > Yes, vfio_add_group_dev() seems to be better way to convey features to
> > vfio core.
> > 
> > >> For migration this bit will only indicate if host driver supports
> > >> migration feature.
> > >>
> > >> For source and destination compatibility check libvirt would need more
> > >> data/variables to check like,
> > >> * if same type of 'mdev_type' device create-able at destination,
> > >>    i.e. if ('mdev_type'->available_instances > 0)
> > >>
> > >> * if host_driver_version at source and destination are compatible.
> > >> Host driver from same release branch should be mostly compatible, but if
> > >> there are major changes in structures or APIs, host drivers from
> > >> different branches might not be compatible, for example, if source and
> > >> destination are from different branches and one of the structure had
> > >> changed, then data collected at source might not be compatible with
> > >> structures at destination and typecasting it to changed structures would
> > >> mess up migrated data during restoration.  
> > > 
> > > Of course now you're asking that libvirt understand the release
> > > versioning scheme of every vendor driver and that it remain
> > > programatically consistent.  We can't even do this with in-kernel
> > > drivers.  And in the end, still the best we can do is guess.
> > >  
> > 
> > Libvirt doesn't need to understand the version, libvirt need to do
> > strcmp version string from source and destination. If those are equal,
> > then libvirt would understand that they are compatible.
> 
> Who's to say that the driver version and migration compatibility have
> any relation at all?  Some drivers might focus on designing their own
> migration interface that can maintain compatibility across versions
> (QEMU does this), some drivers may only allow identical version
> migration (which is going to frustrate upper level management tools and
> customers - RHEL goes to great extents to support cross version
> migration).  We cannot have a one size fits all here that driver version
> defines completely the migration compatibility.

I'll agree; I don't know enough about these devices, but to give you
some examples of things I'd expect to work:
   a) A user adds new machines to their data centre with a larger/newer
version of the same vendor's GPU; in some cases that should work
(depending on vendor details etc.).
   b) The same thing, but with identical hardware and a newer driver on
the destination.

Obviously there will be some cut-offs that say some versions are
incompatible; but for normal migration we jump through serious hoops
to make sure stuff works, and customers will expect the same with some
VFIO devices.

> > >> * if guest_driver_version is compatible with host driver at destination.
> > >> For mdev devices, guest driver communicates with host driver in some
> > >> form. If there are changes in structures/APIs of such communication,
> > >> guest driver at source might not be compatible with host driver at
> > >> destination.  
> > > 
> > > And another guess plus now the guest driver is involved which libvirt
> > > has no visibility to.
> > >    
> > 
> > Like above libvirt need to do strcmp.
> 
> Insufficient, imo
> 
> > >> 'available_instances' sysfs already exist, later two should be added by
> > >> vendor driver which libvirt can use for migration compatibility check.  
> > > 
> > > As noted previously, display and migration are not necessarily
> > > mdev-only features, it's possible that vfio-pci or vfio-platform could
> > > also implement these, so the sysfs interface cannot be restricted to
> > > the mdev template and lifecycle interface.
> > >   
> > 
> > I agree.
> > Feature bitmask passed to vfio core is not mdev specific. But here
> > 'available_instances' for migration compatibility check is mdev
> > specific. If mdev device is not create-able at destination, there is no
> > point in initiating migration by libvirt.
> 
> 'available_instances' for migration compatibility check...?  We use
> available_instances to know whether we have the resources to create a
> given mdev type.  It's certainly a prerequisite to have a device of the
> identical type at the migration target and how we define what is an
> identical device for a directly assigned PCI device is yet another
> overly complicated rat hole.  But an identical device doesn't
> necessarily imply migration compatibility and I think that's the
> problem we're tackling.  We cannot assume based only on the device type
> that migration is compatible, that's basically saying we're never going
> to have any bugs or oversights or new features in the migration stream.

Those things certainly happen: state that we forgot to transfer, new
features enabled on devices, devices configured in different ways.

> Chatting with Laine, it may be worth a step back to include migration
> experts and people up the stack with more visibility to how openstack
> operates.  The issue here is that if vfio gains migration support then
> we have a portion of the migration stream that is not under the control
> of QEMU, we cannot necessarily tie it to a QEMU machine type and we
> cannot necessarily dictate how the vfio bus driver (vendor driver)
> handles versioning and compatibility.  My intent was to expose some
> sort of migration information through the vfio API so that upper level
> tools could determine source and target compatibility, but this in
> itself is I think something new that those tools need to agree how it
> might be done.  How would something like openstack want to handle not
> only finding a migration target with a compatible device, but also
> verifying if the device supports the migration format of the source
> device?
> 
> Alternatively, should we do anything?  Is the problem too hard and we
> should let the driver return an error when it receives an incompatible
> migration stream, aborting the migration?

It's a bit nasty; if you've hit the 'evacuate host' button, then what
happens when you've got some incompatible hosts?

Dave

> > > One more try... we have a vfio_group fd.  This is created by the bus
> > > drivers calling vfio_add_group_dev() and registers a struct device, a
> > > struct vfio_device_ops, and private data.  Typically we only wire the
> > > device_ops to the resulting file descriptor we get from
> > > VFIO_GROUP_GET_DEVICE_FD, but could we enable sort of a nested ioctl
> > > through the group fd?  The ioctl would need to take a string arg to
> > > match to a device name, plus an ioctl cmd and arg for the device_ops
> > > ioctl.  The group ioctl would need to filter cmds to known, benign
> > > queries.  We'd also need to verify that the allowed ioctls have no
> > > dependencies on setup done in device_ops.open().  
> > 
> > So these ioctls would be called without devices open() call, doesn't
> > this seem to be against file operations standard?
> 
> vfio_device_ops is modeled largely after file operations, but I don't
> think we're bound by that for the interaction between vfio-core and the
> vfio bus drivers.  We could make a separate callback for unprivileged
> ioctls, but that seems like more work per driver when we really want to
> maintain the identical API, we just want to provide a more limited
> interface and change the calling point.
> 
> An issue I thought of for migration though is that this path wouldn't
> have access to the migration region and therefore if we place a header
> within that region containing the compatibility and versioning
> information, the user still couldn't access it.  This doesn't seem to
> be a blocker though as we could put that information within the region
> capability that defines the region as used for migration.  Possibly a
> device could have multiple migration regions with different formats
> for backwards compatibility, of course then we'd need a way to
> determine which to use and which combinations have been validated.
> Thanks,
> 
> Alex
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 2/3] sample: vfio mdev display - guest driver
  2018-04-09 10:35 ` [PATCH 2/3] sample: vfio mdev display - guest driver Gerd Hoffmann
  2018-04-11 20:39   ` Bjorn Helgaas
  2018-04-24  2:51   ` Alex Williamson
@ 2018-04-25 21:03   ` Konrad Rzeszutek Wilk
  2 siblings, 0 replies; 41+ messages in thread
From: Konrad Rzeszutek Wilk @ 2018-04-25 21:03 UTC (permalink / raw)
  To: Gerd Hoffmann; +Cc: kvm, alex.williamson, kwankhede, open list

> new file mode 100644
> index 0000000000..0ebd8feead
> --- /dev/null
> +++ b/samples/vfio-mdev/mdpy-fb.c
> @@ -0,0 +1,232 @@
> +/*
> + * Framebuffer driver for mdpy (mediated virtual pci display device).
> + *
> + * See mdpy-defs.h for device specs
> + *
> + *   (c) Gerd Hoffmann <kraxel@redhat.com>

Year?

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [libvirt] [PATCH 0/3] sample: vfio mdev display devices.
  2018-04-18 18:31 ` [libvirt] [PATCH 0/3] sample: vfio mdev display devices Alex Williamson
  2018-04-19  8:40   ` Gerd Hoffmann
  2018-04-23 21:40   ` Alex Williamson
@ 2018-04-26  3:44   ` Tian, Kevin
  2018-04-26  6:14     ` Gerd Hoffmann
  2 siblings, 1 reply; 41+ messages in thread
From: Tian, Kevin @ 2018-04-26  3:44 UTC (permalink / raw)
  To: Alex Williamson, Gerd Hoffmann
  Cc: kvm, Erik Skultety, libvirt, Zhang, Tina, kwankhede, intel-gvt-dev

> From: Alex Williamson
> Sent: Thursday, April 19, 2018 2:32 AM
> 
> That almost begins to look reasonable, but then we can only expose this
> for mdev devices, what if we were to hack a back door into a directly
> assigned GPU that tracks the location of active display in the
> framebuffer and implement the GFX_PLANE interface for that?  We have no
> sysfs representation for either the template or the actual device for
> anything other than mdev.  This inconsistency with physically assigned
> devices has been one of my arguments against enhancing mdev sysfs.
> 

One possible option is to wrap a directly assigned GPU into an mdev. The
parent driver could be a dummy PCI driver which does basic PCI
initialization and then provides hooks for vendor-specific hacks.
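
A minimal sketch of how such a dummy parent could register with the
current mdev API (all dummy_gpu_* names are hypothetical; struct
mdev_parent_ops and mdev_register_device() are the existing interfaces):

static const struct mdev_parent_ops dummy_gpu_mdev_ops = {
	.owner			= THIS_MODULE,
	.supported_type_groups	= dummy_gpu_type_groups, /* one 1:1 type */
	.create			= dummy_gpu_create,
	.remove			= dummy_gpu_remove,
	.ioctl			= dummy_gpu_ioctl,	/* vendor hooks here */
	.mmap			= dummy_gpu_mmap,
};

static int dummy_gpu_probe(struct pci_dev *pdev,
			   const struct pci_device_id *id)
{
	int ret;

	ret = pci_enable_device(pdev);	/* basic PCI init only */
	if (ret)
		return ret;

	/* expose the whole assigned GPU as a single mdev instance */
	return mdev_register_device(&pdev->dev, &dummy_gpu_mdev_ops);
}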

Thanks
Kevin

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [libvirt] [PATCH 0/3] sample: vfio mdev display devices.
  2018-04-26  3:44   ` Tian, Kevin
@ 2018-04-26  6:14     ` Gerd Hoffmann
  2018-04-26 15:44       ` Alex Williamson
  0 siblings, 1 reply; 41+ messages in thread
From: Gerd Hoffmann @ 2018-04-26  6:14 UTC (permalink / raw)
  To: Tian, Kevin
  Cc: kvm, Erik Skultety, libvirt, kwankhede, Zhang, Tina, intel-gvt-dev

On Thu, Apr 26, 2018 at 03:44:15AM +0000, Tian, Kevin wrote:
> > From: Alex Williamson
> > Sent: Thursday, April 19, 2018 2:32 AM
> > 
> > That almost begins to look reasonable, but then we can only expose this
> > for mdev devices, what if we were to hack a back door into a directly
> > assigned GPU that tracks the location of active display in the
> > framebuffer and implement the GFX_PLANE interface for that?  We have no
> > sysfs representation for either the template or the actual device for
> > anything other than mdev.  This inconsistency with physically assigned
> > devices has been one of my arguments against enhancing mdev sysfs.
> 
> One possible option is to wrap a directly assigned GPU into an mdev. The
> parent driver could be a dummy PCI driver which does basic PCI
> initialization and then provides hooks for vendor-specific hacks.

Throwing amdgpu into the mix: looks like they have vgpu support too, but
using SR-IOV instead of mdev.  Having VFIO_GFX support surely looks
useful there.  Adding an mdev dependency to the VFIO_GFX API would make
things more complicated there for (IMHO) no good reason ...

cheers,
  Gerd

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [libvirt] [PATCH 0/3] sample: vfio mdev display devices.
  2018-04-26  6:14     ` Gerd Hoffmann
@ 2018-04-26 15:44       ` Alex Williamson
  0 siblings, 0 replies; 41+ messages in thread
From: Alex Williamson @ 2018-04-26 15:44 UTC (permalink / raw)
  To: Gerd Hoffmann
  Cc: Tian, Kevin, kvm, Erik Skultety, libvirt, Zhang, Tina, kwankhede,
	intel-gvt-dev

On Thu, 26 Apr 2018 08:14:27 +0200
Gerd Hoffmann <kraxel@redhat.com> wrote:

> On Thu, Apr 26, 2018 at 03:44:15AM +0000, Tian, Kevin wrote:
> > > From: Alex Williamson
> > > Sent: Thursday, April 19, 2018 2:32 AM
> > > 
> > > That almost begins to look reasonable, but then we can only expose this
> > > for mdev devices, what if we were to hack a back door into a directly
> > > assigned GPU that tracks the location of active display in the
> > > framebuffer and implement the GFX_PLANE interface for that?  We have no
> > > sysfs representation for either the template or the actual device for
> > > anything other than mdev.  This inconsistency with physically assigned
> > > devices has been one of my arguments against enhancing mdev sysfs.  
> > 
> > One possible option is to wrap a directly assigned GPU into an mdev. The
> > parent driver could be a dummy PCI driver which does basic PCI
> > initialization and then provides hooks for vendor-specific hacks.
> 
> Throwing amdgpu into the mix: looks like they have vgpu support too, but
> using SR-IOV instead of mdev.  Having VFIO_GFX support surely looks
> useful there.  Adding an mdev dependency to the VFIO_GFX API would make
> things more complicated there for (IMHO) no good reason ...

Yes, it may be that a device wanting to implement display or migration
might take the mdev approach, but that should be a choice of the
implementation, not a requirement imposed by the API.  Thanks,

Alex

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [libvirt] [PATCH 0/3] sample: vfio mdev display devices.
  2018-04-25 19:52             ` Dr. David Alan Gilbert
@ 2018-04-26 18:45               ` Kirti Wankhede
  2018-04-26 18:55                 ` Dr. David Alan Gilbert
  2018-05-04  8:39                 ` [libvirt] " Erik Skultety
  0 siblings, 2 replies; 41+ messages in thread
From: Kirti Wankhede @ 2018-04-26 18:45 UTC (permalink / raw)
  To: Dr. David Alan Gilbert, Alex Williamson
  Cc: Neo Jia, kvm, Erik Skultety, libvirt, Tina Zhang, Gerd Hoffmann,
	Laine Stump, Jiri Denemark, intel-gvt-dev



On 4/26/2018 1:22 AM, Dr. David Alan Gilbert wrote:
> * Alex Williamson (alex.williamson@redhat.com) wrote:
>> On Wed, 25 Apr 2018 21:00:39 +0530
>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
>>
>>> On 4/25/2018 4:29 AM, Alex Williamson wrote:
>>>> On Wed, 25 Apr 2018 01:20:08 +0530
>>>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
>>>>   
>>>>> On 4/24/2018 3:10 AM, Alex Williamson wrote:  
>>>>>> On Wed, 18 Apr 2018 12:31:53 -0600
>>>>>> Alex Williamson <alex.williamson@redhat.com> wrote:
>>>>>>     
>>>>>>> On Mon,  9 Apr 2018 12:35:10 +0200
>>>>>>> Gerd Hoffmann <kraxel@redhat.com> wrote:
>>>>>>>    
>>>>>>>> This little series adds three drivers, for demo-ing and testing vfio
>>>>>>>> display interface code.  There is one mdev device for each interface
>>>>>>>> type (mdpy.ko for region and mbochs.ko for dmabuf).      
>>>>>>>
>>>>>>> Erik Skultety brought up a good question today regarding how libvirt is
>>>>>>> meant to handle these different flavors of display interfaces and
>>>>>>> knowing whether a given mdev device has display support at all.  It
>>>>>>> seems that we cannot simply use the default display=auto because
>>>>>>> libvirt needs to specifically configure gl support for a dmabuf type
>>>>>>> interface versus not having such a requirement for a region interface,
>>>>>>> perhaps even removing the emulated graphics in some cases (though I
>>>>>>> don't think we have boot graphics through either solution yet).
>>>>>>> Additionally, GVT-g seems to need the x-igd-opregion support
>>>>>>> enabled(?), which is a non-starter for libvirt as it's an experimental
>>>>>>> option!
>>>>>>>
>>>>>>> Currently the only way to determine display support is through the
>>>>>>> VFIO_DEVICE_QUERY_GFX_PLANE ioctl, but for libvirt to probe that on
>>>>>>> their own they'd need to get to the point where they could open the
>>>>>>> vfio device and perform the ioctl.  That means opening a vfio
>>>>>>> container, adding the group, setting the iommu type, and getting the
>>>>>>> device.  I was initially a bit appalled at asking libvirt to do that,
>>>>>>> but the alternative is to put this information in sysfs, but doing that
>>>>>>> we risk that we need to describe every nuance of the mdev device
>>>>>>> through sysfs and it becomes a dumping ground for every possible
>>>>>>> feature an mdev device might have.
>> ...    
>>>>>>> So I was ready to return and suggest that maybe libvirt should probe
>>>>>>> the device to know about these ancillary configuration details, but
>>>>>>> then I remembered that both mdev vGPU vendors had external dependencies
>>>>>>> to even allow probing the device.  KVMGT will fail to open the device
>>>>>>> if it's not associated with an instance of KVM and NVIDIA vGPU, I
>>>>>>> believe, will fail if the vGPU manager process cannot find the QEMU
>>>>>>> instance to extract the VM UUID.  (Both of these were bad ideas)    
>>>>>>
>>>>>> Here's another proposal that's really growing on me:
>>>>>>
>>>>>>  * Fix the vendor drivers!  Allow devices to be opened and probed
>>>>>>    without these external dependencies.
>>>>>>  * Libvirt uses the existing vfio API to open the device and probe the
>>>>>>    necessary ioctls, if it can't probe the device, the feature is
>>>>>>    unavailable, ie. display=off, no migration.
>>>>>>     
>>>>>
>>>>> I'm trying to think simpler mechanism using sysfs that could work for
>>>>> any feature and knowing source-destination migration compatibility check
>>>>> by libvirt before initiating migration.
>>>>>
>>>>> I have another proposal:
>>>>> * Add a ioctl VFIO_DEVICE_PROBE_FEATURES
>>>>> struct vfio_device_features {
>>>>>     __u32 argsz;
>>>>>     __u32 features;
>>>>> }
>>>>>
>>>>> Define bit for each feature:
>>>>> #define VFIO_DEVICE_FEATURE_DISPLAY_REGION	(1 << 0)
>>>>> #define VFIO_DEVICE_FEATURE_DISPLAY_DMABUF	(1 << 1)
>>>>> #define VFIO_DEVICE_FEATURE_MIGRATION		(1 << 2)
>>>>>
>>>>> * Vendor driver returns bitmask of supported features during
>>>>> initialization phase.
>>>>>
>>>>> * In vfio core module, trap this ioctl for each device  in
>>>>> vfio_device_fops_unl_ioctl(),  
>>>>
>>>> Whoops, chicken and egg problem, VFIO_GROUP_GET_DEVICE_FD is our
>>>> blocking point with mdev drivers, we can't get a device fd, so we can't
>>>> call an ioctl on the device fd.
>>>>   
>>>
>>> I'm sorry, I thought we could expose features when QEMU initialize, but
>>> libvirt needs to know supported features before QEMU initialize.
>>>
>>>
>>>>> check features bitmask returned by vendor
>>>>> driver and add a sysfs file if feature is supported that device. This
>>>>> sysfs file would return 0/1.  
>>>>
>>>> I don't understand why we have an ioctl interface, if the user can get
>>>> to the device fd then we have existing interfaces to probe these
>>>> things, it seems like you're just wanting to pass a features bitmap
>>>> through to vfio_add_group_dev() that vfio-core would expose through
>>>> sysfs, but a list of feature bits doesn't convey enough info except for
>>>> the most basic uses.
>>>>    
>>>
>>> Yes, vfio_add_group_dev() seems to be better way to convey features to
>>> vfio core.
>>>
>>>>> For migration this bit will only indicate if host driver supports
>>>>> migration feature.
>>>>>
>>>>> For source and destination compatibility check libvirt would need more
>>>>> data/variables to check like,
>>>>> * if same type of 'mdev_type' device create-able at destination,
>>>>>    i.e. if ('mdev_type'->available_instances > 0)
>>>>>
>>>>> * if host_driver_version at source and destination are compatible.
>>>>> Host driver from same release branch should be mostly compatible, but if
>>>>> there are major changes in structures or APIs, host drivers from
>>>>> different branches might not be compatible, for example, if source and
>>>>> destination are from different branches and one of the structure had
>>>>> changed, then data collected at source might not be compatible with
>>>>> structures at destination and typecasting it to changed structures would
>>>>> mess up migrated data during restoration.  
>>>>
>>>> Of course now you're asking that libvirt understand the release
>>>> versioning scheme of every vendor driver and that it remain
>>>> programatically consistent.  We can't even do this with in-kernel
>>>> drivers.  And in the end, still the best we can do is guess.
>>>>  
>>>
>>> Libvirt doesn't need to understand the version, libvirt need to do
>>> strcmp version string from source and destination. If those are equal,
>>> then libvirt would understand that they are compatible.
>>
>> Who's to say that the driver version and migration compatibility have
>> any relation at all?  Some drivers might focus on designing their own
>> migration interface that can maintain compatibility across versions
>> (QEMU does this), some drivers may only allow identical version
>> migration (which is going to frustrate upper level management tools and
>> customers - RHEL goes to great extents to support cross version
>> migration).  We cannot have a one size fits all here that driver version
>> defines completely the migration compatibility.
> 
> I'll agree; I don't know enough about these devices, but to give you
> some example of things I'd expect to work:
>    a) User adds new machines to their data centre with larger/newer
> version of the same vendors GPU; in some cases that should work
> (depending on vendor details etc)
>    b) The same thing but with identical hardware but a newer driver on
> the destination.
> 
> Obviously there will be some cut offs that say some versions are
> incompatible;  but for normal migration we jump through serious hoops
> to make sure stuff works; customers will expect the same with some
> VFIO devices.
> 

How does libvirt check that cut-off where some versions are incompatible?


>>>>> * if guest_driver_version is compatible with host driver at destination.
>>>>> For mdev devices, guest driver communicates with host driver in some
>>>>> form. If there are changes in structures/APIs of such communication,
>>>>> guest driver at source might not be compatible with host driver at
>>>>> destination.  
>>>>
>>>> And another guess plus now the guest driver is involved which libvirt
>>>> has no visibility to.
>>>>    
>>>
>>> Like above libvirt need to do strcmp.
>>
>> Insufficient, imo
>>
>>>>> 'available_instances' sysfs already exist, later two should be added by
>>>>> vendor driver which libvirt can use for migration compatibility check.  
>>>>
>>>> As noted previously, display and migration are not necessarily
>>>> mdev-only features, it's possible that vfio-pci or vfio-platform could
>>>> also implement these, so the sysfs interface cannot be restricted to
>>>> the mdev template and lifecycle interface.
>>>>   
>>>
>>> I agree.
>>> Feature bitmask passed to vfio core is not mdev specific. But here
>>> 'available_instances' for migration compatibility check is mdev
>>> specific. If mdev device is not create-able at destination, there is no
>>> point in initiating migration by libvirt.
>>
>> 'available_instances' for migration compatibility check...?  We use
>> available_instances to know whether we have the resources to create a
>> given mdev type.  It's certainly a prerequisite to have a device of the
>> identical type at the migration target and how we define what is an
>> identical device for a directly assigned PCI device is yet another
>> overly complicated rat hole.  But an identical device doesn't
>> necessarily imply migration compatibility and I think that's the
>> problem we're tackling.  We cannot assume based only on the device type
>> that migration is compatible, that's basically saying we're never going
>> to have any bugs or oversights or new features in the migration stream.
> 
> Those things certainly happen; state that we forgot to transfer, new
> features enables on devices, devices configured in different ways.
> 

How does libvirt check migration compatibility for other devices across
QEMU versions, where the source supports a device but the destination,
running an older QEMU version, doesn't support that device or the device
doesn't exist on that system?

Thanks,
Kirti

>> Chatting with Laine, it may be worth a step back to include migration
>> experts and people up the stack with more visibility to how openstack
>> operates.  The issue here is that if vfio gains migration support then
>> we have a portion of the migration stream that is not under the control
>> of QEMU, we cannot necessarily tie it to a QEMU machine type and we
>> cannot necessarily dictate how the vfio bus driver (vendor driver)
>> handles versioning and compatibility.  My intent was to expose some
>> sort of migration information through the vfio API so that upper level
>> tools could determine source and target compatibility, but this in
>> itself is I think something new that those tools need to agree how it
>> might be done.  How would something like openstack want to handle not
>> only finding a migration target with a compatible device, but also
>> verifying if the device supports the migration format of the source
>> device?
>>
>> Alternatively, should we do anything?  Is the problem too hard and we
>> should let the driver return an error when it receives an incompatible
>> migration stream, aborting the migration?
> 
> It's a bit nasty; if you've hit the 'evacuate host' button then what
> happens when you've got some incompatible hosts.
> 
> Dave
> 
>>>> One more try... we have a vfio_group fd.  This is created by the bus
>>>> drivers calling vfio_add_group_dev() and registers a struct device, a
>>>> struct vfio_device_ops, and private data.  Typically we only wire the
>>>> device_ops to the resulting file descriptor we get from
>>>> VFIO_GROUP_GET_DEVICE_FD, but could we enable sort of a nested ioctl
>>>> through the group fd?  The ioctl would need to take a string arg to
>>>> match to a device name, plus an ioctl cmd and arg for the device_ops
>>>> ioctl.  The group ioctl would need to filter cmds to known, benign
>>>> queries.  We'd also need to verify that the allowed ioctls have no
>>>> dependencies on setup done in device_ops.open().  
>>>
>>> So these ioctls would be called without devices open() call, doesn't
>>> this seem to be against file operations standard?
>>
>> vfio_device_ops is modeled largely after file operations, but I don't
>> think we're bound by that for the interaction between vfio-core and the
>> vfio bus drivers.  We could make a separate callback for unprivileged
>> ioctls, but that seems like more work per driver when we really want to
>> maintain the identical API, we just want to provide a more limited
>> interface and change the calling point.
>>
>> An issue I thought of for migration though is that this path wouldn't
>> have access to the migration region and therefore if we place a header
>> within that region containing the compatibility and versioning
>> information, the user still couldn't access it.  This doesn't seem to
>> be a blocker though as we could put that information within the region
>> capability that defines the region as used for migration.  Possibly a
>> device could have multiple migration regions with different formats
>> for backwards compatibility, of course then we'd need a way to
>> determine which to use and which combinations have been validated.
>> Thanks,
>>
>> Alex
> --
> Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
> 

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [libvirt] [PATCH 0/3] sample: vfio mdev display devices.
  2018-04-26 18:45               ` Kirti Wankhede
@ 2018-04-26 18:55                 ` Dr. David Alan Gilbert
  2018-04-27 17:21                   ` Alex Williamson
  2018-05-03 18:58                   ` [libvirt] Expose vfio device display/migration to libvirt and above, was " Alex Williamson
  2018-05-04  8:39                 ` [libvirt] " Erik Skultety
  1 sibling, 2 replies; 41+ messages in thread
From: Dr. David Alan Gilbert @ 2018-04-26 18:55 UTC (permalink / raw)
  To: Kirti Wankhede
  Cc: Neo Jia, kvm, Erik Skultety, libvirt, Tina Zhang, Gerd Hoffmann,
	Laine Stump, Jiri Denemark, intel-gvt-dev

* Kirti Wankhede (kwankhede@nvidia.com) wrote:
> 
> 
> On 4/26/2018 1:22 AM, Dr. David Alan Gilbert wrote:
> > * Alex Williamson (alex.williamson@redhat.com) wrote:
> >> On Wed, 25 Apr 2018 21:00:39 +0530
> >> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >>
> >>> On 4/25/2018 4:29 AM, Alex Williamson wrote:
> >>>> On Wed, 25 Apr 2018 01:20:08 +0530
> >>>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >>>>   
> >>>>> On 4/24/2018 3:10 AM, Alex Williamson wrote:  
> >>>>>> On Wed, 18 Apr 2018 12:31:53 -0600
> >>>>>> Alex Williamson <alex.williamson@redhat.com> wrote:
> >>>>>>     
> >>>>>>> On Mon,  9 Apr 2018 12:35:10 +0200
> >>>>>>> Gerd Hoffmann <kraxel@redhat.com> wrote:
> >>>>>>>    
> >>>>>>>> This little series adds three drivers, for demo-ing and testing vfio
> >>>>>>>> display interface code.  There is one mdev device for each interface
> >>>>>>>> type (mdpy.ko for region and mbochs.ko for dmabuf).      
> >>>>>>>
> >>>>>>> Erik Skultety brought up a good question today regarding how libvirt is
> >>>>>>> meant to handle these different flavors of display interfaces and
> >>>>>>> knowing whether a given mdev device has display support at all.  It
> >>>>>>> seems that we cannot simply use the default display=auto because
> >>>>>>> libvirt needs to specifically configure gl support for a dmabuf type
> >>>>>>> interface versus not having such a requirement for a region interface,
> >>>>>>> perhaps even removing the emulated graphics in some cases (though I
> >>>>>>> don't think we have boot graphics through either solution yet).
> >>>>>>> Additionally, GVT-g seems to need the x-igd-opregion support
> >>>>>>> enabled(?), which is a non-starter for libvirt as it's an experimental
> >>>>>>> option!
> >>>>>>>
> >>>>>>> Currently the only way to determine display support is through the
> >>>>>>> VFIO_DEVICE_QUERY_GFX_PLANE ioctl, but for libvirt to probe that on
> >>>>>>> their own they'd need to get to the point where they could open the
> >>>>>>> vfio device and perform the ioctl.  That means opening a vfio
> >>>>>>> container, adding the group, setting the iommu type, and getting the
> >>>>>>> device.  I was initially a bit appalled at asking libvirt to do that,
> >>>>>>> but the alternative is to put this information in sysfs, but doing that
> >>>>>>> we risk that we need to describe every nuance of the mdev device
> >>>>>>> through sysfs and it becomes a dumping ground for every possible
> >>>>>>> feature an mdev device might have.
> >> ...    
> >>>>>>> So I was ready to return and suggest that maybe libvirt should probe
> >>>>>>> the device to know about these ancillary configuration details, but
> >>>>>>> then I remembered that both mdev vGPU vendors had external dependencies
> >>>>>>> to even allow probing the device.  KVMGT will fail to open the device
> >>>>>>> if it's not associated with an instance of KVM and NVIDIA vGPU, I
> >>>>>>> believe, will fail if the vGPU manager process cannot find the QEMU
> >>>>>>> instance to extract the VM UUID.  (Both of these were bad ideas)    
> >>>>>>
> >>>>>> Here's another proposal that's really growing on me:
> >>>>>>
> >>>>>>  * Fix the vendor drivers!  Allow devices to be opened and probed
> >>>>>>    without these external dependencies.
> >>>>>>  * Libvirt uses the existing vfio API to open the device and probe the
> >>>>>>    necessary ioctls, if it can't probe the device, the feature is
> >>>>>>    unavailable, ie. display=off, no migration.
> >>>>>>     
> >>>>>
> >>>>> I'm trying to think simpler mechanism using sysfs that could work for
> >>>>> any feature and knowing source-destination migration compatibility check
> >>>>> by libvirt before initiating migration.
> >>>>>
> >>>>> I have another proposal:
> >>>>> * Add a ioctl VFIO_DEVICE_PROBE_FEATURES
> >>>>> struct vfio_device_features {
> >>>>>     __u32 argsz;
> >>>>>     __u32 features;
> >>>>> }
> >>>>>
> >>>>> Define bit for each feature:
> >>>>> #define VFIO_DEVICE_FEATURE_DISPLAY_REGION	(1 << 0)
> >>>>> #define VFIO_DEVICE_FEATURE_DISPLAY_DMABUF	(1 << 1)
> >>>>> #define VFIO_DEVICE_FEATURE_MIGRATION		(1 << 2)
> >>>>>
> >>>>> * Vendor driver returns bitmask of supported features during
> >>>>> initialization phase.
> >>>>>
> >>>>> * In vfio core module, trap this ioctl for each device  in
> >>>>> vfio_device_fops_unl_ioctl(),  
> >>>>
> >>>> Whoops, chicken and egg problem, VFIO_GROUP_GET_DEVICE_FD is our
> >>>> blocking point with mdev drivers, we can't get a device fd, so we can't
> >>>> call an ioctl on the device fd.
> >>>>   
> >>>
> >>> I'm sorry, I thought we could expose features when QEMU initialize, but
> >>> libvirt needs to know supported features before QEMU initialize.
> >>>
> >>>
> >>>>> check features bitmask returned by vendor
> >>>>> driver and add a sysfs file if feature is supported that device. This
> >>>>> sysfs file would return 0/1.  
> >>>>
> >>>> I don't understand why we have an ioctl interface, if the user can get
> >>>> to the device fd then we have existing interfaces to probe these
> >>>> things, it seems like you're just wanting to pass a features bitmap
> >>>> through to vfio_add_group_dev() that vfio-core would expose through
> >>>> sysfs, but a list of feature bits doesn't convey enough info except for
> >>>> the most basic uses.
> >>>>    
> >>>
> >>> Yes, vfio_add_group_dev() seems to be better way to convey features to
> >>> vfio core.
> >>>
> >>>>> For migration this bit will only indicate if host driver supports
> >>>>> migration feature.
> >>>>>
> >>>>> For source and destination compatibility check libvirt would need more
> >>>>> data/variables to check like,
> >>>>> * if same type of 'mdev_type' device create-able at destination,
> >>>>>    i.e. if ('mdev_type'->available_instances > 0)
> >>>>>
> >>>>> * if host_driver_version at source and destination are compatible.
> >>>>> Host driver from same release branch should be mostly compatible, but if
> >>>>> there are major changes in structures or APIs, host drivers from
> >>>>> different branches might not be compatible, for example, if source and
> >>>>> destination are from different branches and one of the structure had
> >>>>> changed, then data collected at source might not be compatible with
> >>>>> structures at destination and typecasting it to changed structures would
> >>>>> mess up migrated data during restoration.  
> >>>>
> >>>> Of course now you're asking that libvirt understand the release
> >>>> versioning scheme of every vendor driver and that it remain
> >>>> programatically consistent.  We can't even do this with in-kernel
> >>>> drivers.  And in the end, still the best we can do is guess.
> >>>>  
> >>>
> >>> Libvirt doesn't need to understand the version, libvirt need to do
> >>> strcmp version string from source and destination. If those are equal,
> >>> then libvirt would understand that they are compatible.
> >>
> >> Who's to say that the driver version and migration compatibility have
> >> any relation at all?  Some drivers might focus on designing their own
> >> migration interface that can maintain compatibility across versions
> >> (QEMU does this), some drivers may only allow identical version
> >> migration (which is going to frustrate upper level management tools and
> >> customers - RHEL goes to great extents to support cross version
> >> migration).  We cannot have a one size fits all here that driver version
> >> defines completely the migration compatibility.
> > 
> > I'll agree; I don't know enough about these devices, but to give you
> > some example of things I'd expect to work:
> >    a) User adds new machines to their data centre with larger/newer
> > version of the same vendors GPU; in some cases that should work
> > (depending on vendor details etc)
> >    b) The same thing but with identical hardware but a newer driver on
> > the destination.
> > 
> > Obviously there will be some cut offs that say some versions are
> > incompatible;  but for normal migration we jump through serious hoops
> > to make sure stuff works; customers will expect the same with some
> > VFIO devices.
> > 
> 
> How does libvirt check that cut-off where some versions are incompatible?


We have versioned 'machine types' - so for example QEMU has
  pc-i440fx-2.11
  pc-i440fx-2.10

machine types; any version of qemu that supports machine type
pc-i440fx-2.10 should behave the same to its emulated devices.
If we change the behaviour then we tie it to the new machine type;
so the behaviour of a device in pc-i440fx-2.11 might be a bit different.
Occasionally we'll kill off old machine types (actually we should do it
more!), but certainly when we do downstream versions we tie them to
machine types as well.

We also have some migration-capability flags, so some features can only
be used if both sides have that flag, and also Libvirt has some checking
of host CPU flags.
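
Roughly, inside QEMU that pinning is implemented with per-machine-type
compat property tables; a simplified sketch (the driver and property
names below are invented, GlobalProperty is the real mechanism):

/* pc-i440fx-2.10 freezes the old default for a device's new feature */
static GlobalProperty pc_compat_2_10[] = {
	{ .driver = "some-device", .property = "new-feature", .value = "off" },
};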

> >>>>> * if guest_driver_version is compatible with host driver at destination.
> >>>>> For mdev devices, guest driver communicates with host driver in some
> >>>>> form. If there are changes in structures/APIs of such communication,
> >>>>> guest driver at source might not be compatible with host driver at
> >>>>> destination.  
> >>>>
> >>>> And another guess plus now the guest driver is involved which libvirt
> >>>> has no visibility to.
> >>>>    
> >>>
> >>> Like above libvirt need to do strcmp.
> >>
> >> Insufficient, imo
> >>
> >>>>> 'available_instances' sysfs already exist, later two should be added by
> >>>>> vendor driver which libvirt can use for migration compatibility check.  
> >>>>
> >>>> As noted previously, display and migration are not necessarily
> >>>> mdev-only features, it's possible that vfio-pci or vfio-platform could
> >>>> also implement these, so the sysfs interface cannot be restricted to
> >>>> the mdev template and lifecycle interface.
> >>>>   
> >>>
> >>> I agree.
> >>> Feature bitmask passed to vfio core is not mdev specific. But here
> >>> 'available_instances' for migration compatibility check is mdev
> >>> specific. If mdev device is not create-able at destination, there is no
> >>> point in initiating migration by libvirt.
> >>
> >> 'available_instances' for migration compatibility check...?  We use
> >> available_instances to know whether we have the resources to create a
> >> given mdev type.  It's certainly a prerequisite to have a device of the
> >> identical type at the migration target and how we define what is an
> >> identical device for a directly assigned PCI device is yet another
> >> overly complicated rat hole.  But an identical device doesn't
> >> necessarily imply migration compatibility and I think that's the
> >> problem we're tackling.  We cannot assume based only on the device type
> >> that migration is compatible, that's basically saying we're never going
> >> to have any bugs or oversights or new features in the migration stream.
> > 
> > Those things certainly happen; state that we forgot to transfer, new
> > features enables on devices, devices configured in different ways.
> > 
> 
> How does libvirt check migration compatibility for other devices across
> QEMU versions, where the source supports a device but the destination,
> running an older QEMU version, doesn't support that device or the device
> doesn't exist on that system?

Libvirt inspects the QEMU binary to get lists of devices and capabilities;
I'll leave it to the libvirt guys to add more detail if needed.

Dave

> Thanks,
> Kirti
> 
> >> Chatting with Laine, it may be worth a step back to include migration
> >> experts and people up the stack with more visibility to how openstack
> >> operates.  The issue here is that if vfio gains migration support then
> >> we have a portion of the migration stream that is not under the control
> >> of QEMU, we cannot necessarily tie it to a QEMU machine type and we
> >> cannot necessarily dictate how the vfio bus driver (vendor driver)
> >> handles versioning and compatibility.  My intent was to expose some
> >> sort of migration information through the vfio API so that upper level
> >> tools could determine source and target compatibility, but this in
> >> itself is I think something new that those tools need to agree how it
> >> might be done.  How would something like openstack want to handle not
> >> only finding a migration target with a compatible device, but also
> >> verifying if the device supports the migration format of the source
> >> device?
> >>
> >> Alternatively, should we do anything?  Is the problem too hard and we
> >> should let the driver return an error when it receives an incompatible
> >> migration stream, aborting the migration?
> > 
> > It's a bit nasty; if you've hit the 'evacuate host' button then what
> > happens when you've got some incompatible hosts.
> > 
> > Dave
> > 
> >>>> One more try... we have a vfio_group fd.  This is created by the bus
> >>>> drivers calling vfio_add_group_dev() and registers a struct device, a
> >>>> struct vfio_device_ops, and private data.  Typically we only wire the
> >>>> device_ops to the resulting file descriptor we get from
> >>>> VFIO_GROUP_GET_DEVICE_FD, but could we enable sort of a nested ioctl
> >>>> through the group fd?  The ioctl would need to take a string arg to
> >>>> match to a device name, plus an ioctl cmd and arg for the device_ops
> >>>> ioctl.  The group ioctl would need to filter cmds to known, benign
> >>>> queries.  We'd also need to verify that the allowed ioctls have no
> >>>> dependencies on setup done in device_ops.open().  
> >>>
> >>> So these ioctls would be called without devices open() call, doesn't
> >>> this seem to be against file operations standard?
> >>
> >> vfio_device_ops is modeled largely after file operations, but I don't
> >> think we're bound by that for the interaction between vfio-core and the
> >> vfio bus drivers.  We could make a separate callback for unprivileged
> >> ioctls, but that seems like more work per driver when we really want to
> >> maintain the identical API, we just want to provide a more limited
> >> interface and change the calling point.
> >>
> >> An issue I thought of for migration though is that this path wouldn't
> >> have access to the migration region and therefore if we place a header
> >> within that region containing the compatibility and versioning
> >> information, the user still couldn't access it.  This doesn't seem to
> >> be a blocker though as we could put that information within the region
> >> capability that defines the region as used for migration.  Possibly a
> >> device could have multiple migration regions with different formats
> >> for backwards compatibility, of course then we'd need a way to
> >> determine which to use and which combinations have been validated.
> >> Thanks,
> >>
> >> Alex
> > --
> > Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
> > 
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [libvirt] [PATCH 0/3] sample: vfio mdev display devices.
  2018-04-26 18:55                 ` Dr. David Alan Gilbert
@ 2018-04-27 17:21                   ` Alex Williamson
  2018-05-03 18:58                   ` [libvirt] Expose vfio device display/migration to libvirt and above, was " Alex Williamson
  1 sibling, 0 replies; 41+ messages in thread
From: Alex Williamson @ 2018-04-27 17:21 UTC (permalink / raw)
  To: Dr. David Alan Gilbert
  Cc: Neo Jia, kvm, Erik Skultety, libvirt, Tina Zhang, Kirti Wankhede,
	Gerd Hoffmann, Laine Stump, Jiri Denemark, intel-gvt-dev

On Thu, 26 Apr 2018 19:55:23 +0100
"Dr. David Alan Gilbert" <dgilbert@redhat.com> wrote:

> * Kirti Wankhede (kwankhede@nvidia.com) wrote:
> > 
> > 
> > On 4/26/2018 1:22 AM, Dr. David Alan Gilbert wrote:  
> > > * Alex Williamson (alex.williamson@redhat.com) wrote:  
> > >> On Wed, 25 Apr 2018 21:00:39 +0530
> > >> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> > >>  
> > >>> On 4/25/2018 4:29 AM, Alex Williamson wrote:  
> > >>>> On Wed, 25 Apr 2018 01:20:08 +0530
> > >>>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> > >>>>     
> > >>>>> On 4/24/2018 3:10 AM, Alex Williamson wrote:    
> > >>>>>> On Wed, 18 Apr 2018 12:31:53 -0600
> > >>>>>> Alex Williamson <alex.williamson@redhat.com> wrote:
> > >>>>>>       
> > >>>>>>> On Mon,  9 Apr 2018 12:35:10 +0200
> > >>>>>>> Gerd Hoffmann <kraxel@redhat.com> wrote:
> > >>>>>>>      
> > >>>>>>>> This little series adds three drivers, for demo-ing and testing vfio
> > >>>>>>>> display interface code.  There is one mdev device for each interface
> > >>>>>>>> type (mdpy.ko for region and mbochs.ko for dmabuf).        
> > >>>>>>>
> > >>>>>>> Erik Skultety brought up a good question today regarding how libvirt is
> > >>>>>>> meant to handle these different flavors of display interfaces and
> > >>>>>>> knowing whether a given mdev device has display support at all.  It
> > >>>>>>> seems that we cannot simply use the default display=auto because
> > >>>>>>> libvirt needs to specifically configure gl support for a dmabuf type
> > >>>>>>> interface versus not having such a requirement for a region interface,
> > >>>>>>> perhaps even removing the emulated graphics in some cases (though I
> > >>>>>>> don't think we have boot graphics through either solution yet).
> > >>>>>>> Additionally, GVT-g seems to need the x-igd-opregion support
> > >>>>>>> enabled(?), which is a non-starter for libvirt as it's an experimental
> > >>>>>>> option!
> > >>>>>>>
> > >>>>>>> Currently the only way to determine display support is through the
> > >>>>>>> VFIO_DEVICE_QUERY_GFX_PLANE ioctl, but for libvirt to probe that on
> > >>>>>>> their own they'd need to get to the point where they could open the
> > >>>>>>> vfio device and perform the ioctl.  That means opening a vfio
> > >>>>>>> container, adding the group, setting the iommu type, and getting the
> > >>>>>>> device.  I was initially a bit appalled at asking libvirt to do that,
> > >>>>>>> but the alternative is to put this information in sysfs, but doing that
> > >>>>>>> we risk that we need to describe every nuance of the mdev device
> > >>>>>>> through sysfs and it becomes a dumping ground for every possible
> > >>>>>>> feature an mdev device might have.  
> > >> ...      
> > >>>>>>> So I was ready to return and suggest that maybe libvirt should probe
> > >>>>>>> the device to know about these ancillary configuration details, but
> > >>>>>>> then I remembered that both mdev vGPU vendors had external dependencies
> > >>>>>>> to even allow probing the device.  KVMGT will fail to open the device
> > >>>>>>> if it's not associated with an instance of KVM and NVIDIA vGPU, I
> > >>>>>>> believe, will fail if the vGPU manager process cannot find the QEMU
> > >>>>>>> instance to extract the VM UUID.  (Both of these were bad ideas)      
> > >>>>>>
> > >>>>>> Here's another proposal that's really growing on me:
> > >>>>>>
> > >>>>>>  * Fix the vendor drivers!  Allow devices to be opened and probed
> > >>>>>>    without these external dependencies.
> > >>>>>>  * Libvirt uses the existing vfio API to open the device and probe the
> > >>>>>>    necessary ioctls, if it can't probe the device, the feature is
> > >>>>>>    unavailable, ie. display=off, no migration.
> > >>>>>>       
> > >>>>>
> > >>>>> I'm trying to think simpler mechanism using sysfs that could work for
> > >>>>> any feature and knowing source-destination migration compatibility check
> > >>>>> by libvirt before initiating migration.
> > >>>>>
> > >>>>> I have another proposal:
> > >>>>> * Add a ioctl VFIO_DEVICE_PROBE_FEATURES
> > >>>>> struct vfio_device_features {
> > >>>>>     __u32 argsz;
> > >>>>>     __u32 features;
> > >>>>> }
> > >>>>>
> > >>>>> Define bit for each feature:
> > >>>>> #define VFIO_DEVICE_FEATURE_DISPLAY_REGION	(1 << 0)
> > >>>>> #define VFIO_DEVICE_FEATURE_DISPLAY_DMABUF	(1 << 1)
> > >>>>> #define VFIO_DEVICE_FEATURE_MIGRATION		(1 << 2)
> > >>>>>
> > >>>>> * Vendor driver returns bitmask of supported features during
> > >>>>> initialization phase.
> > >>>>>
> > >>>>> * In vfio core module, trap this ioctl for each device  in
> > >>>>> vfio_device_fops_unl_ioctl(),    
> > >>>>
> > >>>> Whoops, chicken and egg problem, VFIO_GROUP_GET_DEVICE_FD is our
> > >>>> blocking point with mdev drivers, we can't get a device fd, so we can't
> > >>>> call an ioctl on the device fd.
> > >>>>     
> > >>>
> > >>> I'm sorry, I thought we could expose features when QEMU initialize, but
> > >>> libvirt needs to know supported features before QEMU initialize.
> > >>>
> > >>>  
> > >>>>> check features bitmask returned by vendor
> > >>>>> driver and add a sysfs file if feature is supported that device. This
> > >>>>> sysfs file would return 0/1.    
> > >>>>
> > >>>> I don't understand why we have an ioctl interface, if the user can get
> > >>>> to the device fd then we have existing interfaces to probe these
> > >>>> things, it seems like you're just wanting to pass a features bitmap
> > >>>> through to vfio_add_group_dev() that vfio-core would expose through
> > >>>> sysfs, but a list of feature bits doesn't convey enough info except for
> > >>>> the most basic uses.
> > >>>>      
> > >>>
> > >>> Yes, vfio_add_group_dev() seems to be better way to convey features to
> > >>> vfio core.
> > >>>  
> > >>>>> For migration this bit will only indicate if host driver supports
> > >>>>> migration feature.
> > >>>>>
> > >>>>> For source and destination compatibility check libvirt would need more
> > >>>>> data/variables to check like,
> > >>>>> * if same type of 'mdev_type' device create-able at destination,
> > >>>>>    i.e. if ('mdev_type'->available_instances > 0)
> > >>>>>
> > >>>>> * if host_driver_version at source and destination are compatible.
> > >>>>> Host driver from same release branch should be mostly compatible, but if
> > >>>>> there are major changes in structures or APIs, host drivers from
> > >>>>> different branches might not be compatible, for example, if source and
> > >>>>> destination are from different branches and one of the structure had
> > >>>>> changed, then data collected at source might not be compatible with
> > >>>>> structures at destination and typecasting it to changed structures would
> > >>>>> mess up migrated data during restoration.    
> > >>>>
> > >>>> Of course now you're asking that libvirt understand the release
> > >>>> versioning scheme of every vendor driver and that it remain
> > >>>> programatically consistent.  We can't even do this with in-kernel
> > >>>> drivers.  And in the end, still the best we can do is guess.
> > >>>>    
> > >>>
> > >>> Libvirt doesn't need to understand the version, libvirt need to do
> > >>> strcmp version string from source and destination. If those are equal,
> > >>> then libvirt would understand that they are compatible.  
> > >>
> > >> Who's to say that the driver version and migration compatibility have
> > >> any relation at all?  Some drivers might focus on designing their own
> > >> migration interface that can maintain compatibility across versions
> > >> (QEMU does this), some drivers may only allow identical version
> > >> migration (which is going to frustrate upper level management tools and
> > >> customers - RHEL goes to great extents to support cross version
> > >> migration).  We cannot have a one size fits all here that driver version
> > >> defines completely the migration compatibility.  
> > > 
> > > I'll agree; I don't know enough about these devices, but to give you
> > > some example of things I'd expect to work:
> > >    a) User adds new machines to their data centre with larger/newer
> > > version of the same vendors GPU; in some cases that should work
> > > (depending on vendor details etc)
> > >    b) The same thing but with identical hardware but a newer driver on
> > > the destination.
> > > 
> > > Obviously there will be some cut offs that say some versions are
> > > incompatible;  but for normal migration we jump through serious hoops
> > > to make sure stuff works; customers will expect the same with some
> > > VFIO devices.
> > >   
> > 
> > How does libvirt check that cut-off where some versions are incompatible?
> 
> 
> We have versioned 'machine types' - so for example QEMU has
>   pc-i440fx-2.11
>   pc-i440fx-2.10
> 
> machine types; any version of qemu that supports machine type
> pc-i440fx-2.10 should behave the same to its emulated devices.
> If we change the behaviour then we tie it to the new machine type;
> so the behaviour of a device in pc-i440fx-2.11 might be a bit different.
> Occasionally we'll kill off old machine types (actually we should do it
> more!), but certainly when we do downstream versions we tie them to
> machine types as well.
> 
> We also have some migration-capability flags, so some features can only
> be used if both sides have that flag, and also Libvirt has some checking
> of host CPU flags.

I think this sort of host compatibility checking for CPU flags is the
part where we need some libvirt input on how they'd like to extend this
for device compatibility.  A complication here is whether it's
reasonable for libvirt to collect migration compatibility data except
for the actual target device.  For instance, if the user model is to
create mdev devices on demand, the vendor driver might be upgraded
between system startup and migration, I don't think we can assume the
migration information remains static or is necessarily the same for
each mdev type provided by the vendor driver, or maybe for each parent
device.  Is it possible that libvirt would evaluate a migration target
device to this extent immediately before the migration?  How would
openstack handle managing a datacenter with such a model?

> > >>>>> * if guest_driver_version is compatible with host driver at destination.
> > >>>>> For mdev devices, guest driver communicates with host driver in some
> > >>>>> form. If there are changes in structures/APIs of such communication,
> > >>>>> guest driver at source might not be compatible with host driver at
> > >>>>> destination.    
> > >>>>
> > >>>> And another guess plus now the guest driver is involved which libvirt
> > >>>> has no visibility to.
> > >>>>      
> > >>>
> > >>> Like above libvirt need to do strcmp.  
> > >>
> > >> Insufficient, imo
> > >>  
> > >>>>> 'available_instances' sysfs already exist, later two should be added by
> > >>>>> vendor driver which libvirt can use for migration compatibility check.    
> > >>>>
> > >>>> As noted previously, display and migration are not necessarily
> > >>>> mdev-only features, it's possible that vfio-pci or vfio-platform could
> > >>>> also implement these, so the sysfs interface cannot be restricted to
> > >>>> the mdev template and lifecycle interface.
> > >>>>     
> > >>>
> > >>> I agree.
> > >>> Feature bitmask passed to vfio core is not mdev specific. But here
> > >>> 'available_instances' for migration compatibility check is mdev
> > >>> specific. If mdev device is not create-able at destination, there is no
> > >>> point in initiating migration by libvirt.  
> > >>
> > >> 'available_instances' for migration compatibility check...?  We use
> > >> available_instances to know whether we have the resources to create a
> > >> given mdev type.  It's certainly a prerequisite to have a device of the
> > >> identical type at the migration target and how we define what is an
> > >> identical device for a directly assigned PCI device is yet another
> > >> overly complicated rat hole.  But an identical device doesn't
> > >> necessarily imply migration compatibility and I think that's the
> > >> problem we're tackling.  We cannot assume based only on the device type
> > >> that migration is compatible, that's basically saying we're never going
> > >> to have any bugs or oversights or new features in the migration stream.  
> > > 
> > > Those things certainly happen; state that we forgot to transfer, new
> > > features enables on devices, devices configured in different ways.
> > >   
> > 
> > How does libvirt check migration compatibility for other devices across
> > QEMU versions, where the source supports a device but the destination,
> > running an older QEMU version, doesn't support that device or the device
> > doesn't exist on that system?
> 
> Libvirt inspects the QEMU binary to get lists of devices and capabilities;
> I'll leave it to the libvirt guys to add more detail if needed.

Right, so do we need a way to invoke QEMU with a device to report the
migration capabilities of that device?  To this point, I think the
migration viability of a target system has been entirely encompassed
within QEMU's ability to support the versioned machine type and the
compatibility of CPU flags, devices have not been considered as their
compatibility is guaranteed within a machine type and version.  Thanks,

Alex

^ permalink raw reply	[flat|nested] 41+ messages in thread

* [libvirt] Expose vfio device display/migration to libvirt and above, was Re: [PATCH 0/3] sample: vfio mdev display devices.
  2018-04-26 18:55                 ` Dr. David Alan Gilbert
  2018-04-27 17:21                   ` Alex Williamson
@ 2018-05-03 18:58                   ` Alex Williamson
  2018-05-04  7:49                     ` Erik Skultety
                                       ` (2 more replies)
  1 sibling, 3 replies; 41+ messages in thread
From: Alex Williamson @ 2018-05-03 18:58 UTC (permalink / raw)
  Cc: Neo Jia, kvm, Erik Skultety, libvirt, Dr. David Alan Gilbert,
	Tina Zhang, Kirti Wankhede, Gerd Hoffmann, Laine Stump,
	Jiri Denemark, intel-gvt-dev

Hi,

The previous discussion hasn't produced results, so let's start over.
Here's the situation:

 - We currently have kernel and QEMU support for the QEMU vfio-pci
   display option.

 - The default for this option is 'auto', so the device will attempt to
   generate a display if the underlying device supports it, currently
   only GVTg and some future release of NVIDIA vGPU (plus Gerd's
   sample mdpy and mbochs).

 - The display option is implemented via two different mechanisms, a
   vfio region (NVIDIA, mdpy) or a dma-buf (GVTg, mbochs); the QEMU
   command-line form is sketched just after this list.

 - Displays using dma-buf require OpenGL support, displays making
   use of region support do not.

 - Enabling OpenGL support requires specific VM configurations, which
   libvirt /may/ want to facilitate.

 - Probing display support for a given device is complicated by the
   fact that GVTg and NVIDIA both impose requirements on the process
   opening the device file descriptor through the vfio API:

   - GVTg requires a KVM association or will fail to allow the device
     to be opened.

   - NVIDIA requires that their vgpu-manager process can locate a UUID
     for the VM via the process commandline.

   - These are both horrible impositions and prevent libvirt from
     simply probing the device itself.
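
For reference (as mentioned in the list above), the option takes roughly
this form on the QEMU command line:

  -device vfio-pci,sysfsdev=/sys/bus/mdev/devices/<uuid>,display=auto

where display accepts on, off, or auto.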

The above has pressed the need for investigating some sort of
alternative API through which libvirt might introspect a vfio device
and with vfio device migration on the horizon, it's natural that some
sort of support for migration state compatibility for the device need be
considered as a second user of such an API.  However, we currently have
no concept of migration compatibility on a per-device level as there
are no migratable devices that live outside of the QEMU code base.
It's therefore assumed that per device migration compatibility is
encompassed by the versioned machine type for the overall VM.  We need
participation all the way to the top of the VM management stack to
resolve this issue, and it's dragging down the (possibly) simpler
question of how we resolve the display situation.  Therefore I'm
looking for alternatives for display that work within what we have
available to us at the moment.

Erik Skultety, who initially raised the display question, has identified
one possible solution, which is to simply make the display configuration
the user's problem (apologies if I've misinterpreted Erik).  I believe
this would work something like:

 - libvirt identifies a version of QEMU that includes 'display' support
   for vfio-pci devices and defaults to adding display=off for every
   vfio-pci device [have we chosen the wrong default (auto) in QEMU?].

 - New XML support would allow a user to enable display support on the
   vfio device.

 - Resolving any OpenGL dependencies of that change would be left to
   the user.

A nice aspect of this is that policy decisions are left to the user and
clearly no interface changes are necessary, perhaps with the exception
of deciding whether we've made the wrong default choice for vfio-pci
devices in QEMU.

On the other hand, if we do want to give libvirt a mechanism to probe
the display support for a device, we can make a simplified QEMU
instance be the mechanism through which we do that.  For example the
script[1] can be provided with either a PCI device or sysfs path to an
mdev device and run a minimal VM instance meeting the requirements of
both GVTg and NVIDIA to report the display support and GL requirements
for a device.  There are clearly some unrefined and atrocious bits of
this script, but it's only a proof of concept, the process management
can be improved and we can decide whether we want to provide a qmp
mechanism to introspect the device rather than grep'ing error
messages.  The goal is simply to show that we could choose to embrace
QEMU and use it not as a VM, but simply a tool for poking at a device
given the restrictions the mdev vendor drivers have already imposed.

So I think the question bounces back to libvirt, does libvirt want
enough information about the display requirements for a given device to
automatically attempt to add GL support for it, effectively a policy of
'if it's supported try to enable it', or should we leave well enough
alone and let the user choose to enable it?

Maybe some guiding questions:

 - Will dma-buf always require GL support?

 - Does GL support limit our ability to have a display over a remote
   connection?

 - Do region-based displays also work with GL support, even if not
   required?

Furthermore, should QEMU vfio-pci flip the default to 'off' for
compatibility?  Thanks,

Alex

[1] https://gist.github.com/awilliam/2ccd31e85923ac8135694a7db2306646

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [libvirt] Expose vfio device display/migration to libvirt and above, was Re: [PATCH 0/3] sample: vfio mdev display devices.
  2018-05-03 18:58                   ` [libvirt] Expose vfio device display/migration to libvirt and above, was " Alex Williamson
@ 2018-05-04  7:49                     ` Erik Skultety
  2018-05-04 16:03                       ` Alex Williamson
  2018-05-04  9:16                     ` Daniel P. Berrangé
  2018-05-07  6:15                     ` Gerd Hoffmann
  2 siblings, 1 reply; 41+ messages in thread
From: Erik Skultety @ 2018-05-04  7:49 UTC (permalink / raw)
  To: Alex Williamson
  Cc: Neo Jia, kvm, libvirt, Dr. David Alan Gilbert, Tina Zhang,
	Kirti Wankhede, Gerd Hoffmann, Laine Stump, Jiri Denemark,
	intel-gvt-dev

On Thu, May 03, 2018 at 12:58:00PM -0600, Alex Williamson wrote:
> Hi,
>
> The previous discussion hasn't produced results, so let's start over.
> Here's the situation:
>
>  - We currently have kernel and QEMU support for the QEMU vfio-pci
>    display option.
>
>  - The default for this option is 'auto', so the device will attempt to
>    generate a display if the underlying device supports it, currently
>    only GVTg and some future release of NVIDIA vGPU (plus Gerd's
>    sample mdpy and mbochs).
>
>  - The display option is implemented via two different mechanism, a
>    vfio region (NVIDIA, mdpy) or a dma-buf (GVTg, mbochs).
>
>  - Displays using dma-buf require OpenGL support, displays making
>    use of region support do not.
>
>  - Enabling OpenGL support requires specific VM configurations, which
>    libvirt /may/ want to facilitate.
>
>  - Probing display support for a given device is complicated by the
>    fact that GVTg and NVIDIA both impose requirements on the process
>    opening the device file descriptor through the vfio API:
>
>    - GVTg requires a KVM association or will fail to allow the device
>      to be opened.

How exactly is this association checked?

>
>    - NVIDIA requires that their vgpu-manager process can locate a UUID
>      for the VM via the process commandline.
>
>    - These are both horrible impositions and prevent libvirt from
>      simply probing the device itself.

So I feel like we're trying to solve a problem coming from one layer on a
bunch of different layers, which inherently prevents us from producing a
viable long-term solution without dragging in a significant amount of hacky,
nasty code, and it is not the missing sysfs attributes I have in mind. Why
does NVIDIA's vgpu-manager need to locate the UUID of a qemu VM? I assume
that's to prevent multiple VM instances from trying to use the same mdev
device, in which case can't the vgpu-manager simply track how many "open"
and "close" calls have been made to the same device? This is just from a
layman's perspective, but it would allow the following:
    - when libvirt starts, it initializes all its drivers (let's focus on
      QEMU)
    - as part of this initialization, libvirt probes QEMU for capabilities and
      caches them in order to use them when spawning VMs

Now, if we (theoretically) can settle on easing the restrictions Alex has
mentioned, we in fact could introduce a QMP command to probe these devices and
provide libvirt with useful information at that point in time. Of course, since
the 3rd party vendor is "de-coupled" from qemu, libvirt would have no way to
find out that the driver has changed in the meantime, thus still using the old
information we gathered, ergo potentially causing the QEMU process to fail
eventually. But then again, there's very often a strong recommendation to reboot
your host after a driver update, especially in NVIDIA's case, which means this
fact wouldn't matter. However, there's also a significant drawback to my
proposal which probably renders it completely useless (but we can continue from
there...) and that is the devices would either have to be present already (not
an option) or QEMU would need to be enhanced in a way, that it would create a
dummy device during QMP probing, open it, collect the information libvirt
needs, close it and remove it. If the driver doesn't change in the meantime,
this should be sufficient for a VM to be successfully instantiated with a
display, right?

>
> The above has pressed the need for investigating some sort of
> alternative API through which libvirt might introspect a vfio device
> and with vfio device migration on the horizon, it's natural that some
> sort of support for migration state compatibility for the device need be
> considered as a second user of such an API.  However, we currently have
> no concept of migration compatibility on a per-device level as there
> are no migratable devices that live outside of the QEMU code base.
> It's therefore assumed that per device migration compatibility is
> encompassed by the versioned machine type for the overall VM.  We need
> participation all the way to the top of the VM management stack to
> resolve this issue and it's dragging down the (possibly) more simple
> question of how do we resolve the display situation.  Therefore I'm
> looking for alternatives for display that work within what we have
> available to us at the moment.
>
> Erik Skultety, who initially raised the display question, has identified
> one possible solution, which is to simply make the display configuration
> the user's problem (apologies if I've misinterpreted Erik).  I believe
> this would work something like:
>
>  - libvirt identifies a version of QEMU that includes 'display' support
>    for vfio-pci devices and defaults to adding display=off for every
>    vfio-pci device [have we chosen the wrong default (auto) in QEMU?].

From libvirt's POV, this would mean a new XML attribute 'display' on the mdev
host device type with a default value of 'off', potentially extending this to
'auto' once we have enough information to base our decision on. We'll need to
combine this with a new attribute value for the <video> element that would
prevent adding an emulated VGA any time <graphics> (spice, VNC) is requested,
but that's something we'd need to do anyway, so I'm just mentioning it.

>
>  - New XML support would allow a user to enable display support on the
>    vfio device.
>
>  - Resolving any OpenGL dependencies of that change would be left to
>    the user.
>
> A nice aspect of this is that policy decisions are left to the user and
> clearly no interface changes are necessary, perhaps with the exception
> of deciding whether we've made the wrong default choice for vfio-pci
> devices in QEMU.

It's common practice that we offload decisions like this to users
(including the management layer, e.g. openstack, ovirt).

>
> On the other hand, if we do want to give libvirt a mechanism to probe
> the display support for a device, we can make a simplified QEMU
> instance be the mechanism through which we do that.  For example the
> script[1] can be provided with either a PCI device or sysfs path to an
> mdev device and run a minimal VM instance meeting the requirements of
> both GVTg and NVIDIA to report the display support and GL requirements
> for a device.  There are clearly some unrefined and atrocious bits of
> this script, but it's only a proof of concept, the process management
> can be improved and we can decide whether we want to provide qmp
> mechanism to introspect the device rather than grep'ing error
> messages.  The goal is simply to show that we could choose to embrace

If for nothing else, error messages change, so that's not a viable way; QMP is
a much more standardized approach. But then again, as I mentioned above,
libvirt currently probes for capabilities during its start.

> QEMU and use it not as a VM, but simply a tool for poking at a device
> given the restrictions the mdev vendor drivers have already imposed.
>
> So I think the question bounces back to libvirt, does libvirt want
> enough information about the display requirements for a given device to
> automatically attempt to add GL support for it, effectively a policy of
> 'if it's supported try to enable it', or should we leave well enough
> alone and let the user choose to enable it?
>
> Maybe some guiding questions:
>
>  - Will dma-buf always require GL support?
>
>  - Does GL support limit our ability to have a display over a remote
>    connection?
>
>  - Do region-based displays also work with GL support, even if not
>    required?

Yeah, these are IMHO really tough to answer because we can't really predict
the future, which again favours a new libvirt attribute. Even if we decided
that we truly need a dummy VM as a tool for libvirt to probe this info, I
still feel like this should be done higher up in the virtualization stack,
with libvirt again just being a tool that does things the way it's told to.
But I'd very much like to hear Dan's opinion, since besides libvirt he can
cover openstack too.

Regards,
Erik

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [libvirt] [PATCH 0/3] sample: vfio mdev display devices.
  2018-04-26 18:45               ` Kirti Wankhede
  2018-04-26 18:55                 ` Dr. David Alan Gilbert
@ 2018-05-04  8:39                 ` Erik Skultety
  1 sibling, 0 replies; 41+ messages in thread
From: Erik Skultety @ 2018-05-04  8:39 UTC (permalink / raw)
  To: Kirti Wankhede
  Cc: Neo Jia, kvm, libvirt, Dr. David Alan Gilbert, Tina Zhang,
	Gerd Hoffmann, Laine Stump, Jiri Denemark, intel-gvt-dev

On Fri, Apr 27, 2018 at 12:15:01AM +0530, Kirti Wankhede wrote:
>
>
> On 4/26/2018 1:22 AM, Dr. David Alan Gilbert wrote:
> > * Alex Williamson (alex.williamson@redhat.com) wrote:
> >> On Wed, 25 Apr 2018 21:00:39 +0530
> >> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >>
> >>> On 4/25/2018 4:29 AM, Alex Williamson wrote:
> >>>> On Wed, 25 Apr 2018 01:20:08 +0530
> >>>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >>>>
> >>>>> On 4/24/2018 3:10 AM, Alex Williamson wrote:
> >>>>>> On Wed, 18 Apr 2018 12:31:53 -0600
> >>>>>> Alex Williamson <alex.williamson@redhat.com> wrote:
> >>>>>>
> >>>>>>> On Mon,  9 Apr 2018 12:35:10 +0200
> >>>>>>> Gerd Hoffmann <kraxel@redhat.com> wrote:
> >>>>>>>
> >>>>>>>> This little series adds three drivers, for demo-ing and testing vfio
> >>>>>>>> display interface code.  There is one mdev device for each interface
> >>>>>>>> type (mdpy.ko for region and mbochs.ko for dmabuf).
> >>>>>>>
> >>>>>>> Erik Skultety brought up a good question today regarding how libvirt is
> >>>>>>> meant to handle these different flavors of display interfaces and
> >>>>>>> knowing whether a given mdev device has display support at all.  It
> >>>>>>> seems that we cannot simply use the default display=auto because
> >>>>>>> libvirt needs to specifically configure gl support for a dmabuf type
> >>>>>>> interface versus not having such a requirement for a region interface,
> >>>>>>> perhaps even removing the emulated graphics in some cases (though I
> >>>>>>> don't think we have boot graphics through either solution yet).
> >>>>>>> Additionally, GVT-g seems to need the x-igd-opregion support
> >>>>>>> enabled(?), which is a non-starter for libvirt as it's an experimental
> >>>>>>> option!
> >>>>>>>
> >>>>>>> Currently the only way to determine display support is through the
> >>>>>>> VFIO_DEVICE_QUERY_GFX_PLANE ioctl, but for libvirt to probe that on
> >>>>>>> their own they'd need to get to the point where they could open the
> >>>>>>> vfio device and perform the ioctl.  That means opening a vfio
> >>>>>>> container, adding the group, setting the iommu type, and getting the
> >>>>>>> device.  I was initially a bit appalled at asking libvirt to do that,
> >>>>>>> but the alternative is to put this information in sysfs, and doing that
> >>>>>>> we risk needing to describe every nuance of the mdev device
> >>>>>>> through sysfs and it becomes a dumping ground for every possible
> >>>>>>> feature an mdev device might have.
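
For reference, the probe sequence described in the quoted paragraph above
maps onto the existing vfio uapi roughly as follows (a minimal sketch with
error checking mostly elided; the group path and device name are
illustrative):

#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

int probe_display(const char *group_path, const char *mdev_uuid)
{
	int container = open("/dev/vfio/vfio", O_RDWR);
	int group = open(group_path, O_RDWR);	/* e.g. "/dev/vfio/26" */
	struct vfio_group_status status = { .argsz = sizeof(status) };
	struct vfio_device_gfx_plane_info plane = {
		.argsz = sizeof(plane),
		.flags = VFIO_GFX_PLANE_TYPE_PROBE | VFIO_GFX_PLANE_TYPE_DMABUF,
	};
	int device;

	ioctl(group, VFIO_GROUP_GET_STATUS, &status);	/* viability check elided */
	ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
	ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);

	/* this is the step the vendor drivers currently block on (KVM/UUID) */
	device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, mdev_uuid);
	if (device < 0)
		return -1;

	/* returns 0 if a dmabuf-type display is supported; swap in
	 * VFIO_GFX_PLANE_TYPE_REGION to probe region-type support */
	return ioctl(device, VFIO_DEVICE_QUERY_GFX_PLANE, &plane);
}
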
> >> ...
> >>>>>>> So I was ready to return and suggest that maybe libvirt should probe
> >>>>>>> the device to know about these ancillary configuration details, but
> >>>>>>> then I remembered that both mdev vGPU vendors had external dependencies
> >>>>>>> to even allow probing the device.  KVMGT will fail to open the device
> >>>>>>> if it's not associated with an instance of KVM and NVIDIA vGPU, I
> >>>>>>> believe, will fail if the vGPU manager process cannot find the QEMU
> >>>>>>> instance to extract the VM UUID.  (Both of these were bad ideas)
> >>>>>>
> >>>>>> Here's another proposal that's really growing on me:
> >>>>>>
> >>>>>>  * Fix the vendor drivers!  Allow devices to be opened and probed
> >>>>>>    without these external dependencies.
> >>>>>>  * Libvirt uses the existing vfio API to open the device and probe the
> >>>>>>    necessary ioctls, if it can't probe the device, the feature is
> >>>>>>    unavailable, ie. display=off, no migration.
> >>>>>>
> >>>>>
> >>>>> I'm trying to think of a simpler mechanism using sysfs that could work
> >>>>> for any feature and let libvirt check source-destination migration
> >>>>> compatibility before initiating the migration.
> >>>>>
> >>>>> I have another proposal:
> >>>>> * Add an ioctl VFIO_DEVICE_PROBE_FEATURES
> >>>>> struct vfio_device_features {
> >>>>>     __u32 argsz;
> >>>>>     __u32 features;
> >>>>> }
> >>>>>
> >>>>> Define bit for each feature:
> >>>>> #define VFIO_DEVICE_FEATURE_DISPLAY_REGION	(1 << 0)
> >>>>> #define VFIO_DEVICE_FEATURE_DISPLAY_DMABUF	(1 << 1)
> >>>>> #define VFIO_DEVICE_FEATURE_MIGRATION		(1 << 2)
> >>>>>
> >>>>> * Vendor driver returns bitmask of supported features during
> >>>>> initialization phase.
> >>>>>
> >>>>> * In vfio core module, trap this ioctl for each device  in
> >>>>> vfio_device_fops_unl_ioctl(),
> >>>>
> >>>> Whoops, chicken and egg problem, VFIO_GROUP_GET_DEVICE_FD is our
> >>>> blocking point with mdev drivers, we can't get a device fd, so we can't
> >>>> call an ioctl on the device fd.
> >>>>
> >>>
> >>> I'm sorry, I thought we could expose features when QEMU initializes, but
> >>> libvirt needs to know the supported features before QEMU initializes.
> >>>
> >>>
> >>>>> check the features bitmask returned by the vendor
> >>>>> driver and add a sysfs file if the feature is supported by that device.
> >>>>> This sysfs file would return 0/1.
> >>>>
> >>>> I don't understand why we have an ioctl interface, if the user can get
> >>>> to the device fd then we have existing interfaces to probe these
> >>>> things, it seems like you're just wanting to pass a features bitmap
> >>>> through to vfio_add_group_dev() that vfio-core would expose through
> >>>> sysfs, but a list of feature bits doesn't convey enough info except for
> >>>> the most basic uses.
> >>>>
> >>>
> >>> Yes, vfio_add_group_dev() seems to be a better way to convey features to
> >>> vfio core.
> >>>
> >>>>> For migration this bit will only indicate if host driver supports
> >>>>> migration feature.
> >>>>>
> >>>>> For source and destination compatibility checks libvirt would need more
> >>>>> data/variables to check, like:
> >>>>> * whether the same 'mdev_type' device is create-able at the destination,
> >>>>>    i.e. if ('mdev_type'->available_instances > 0)
> >>>>>
> >>>>> * whether host_driver_version at source and destination are compatible.
> >>>>> Host drivers from the same release branch should be mostly compatible,
> >>>>> but if there are major changes in structures or APIs, host drivers from
> >>>>> different branches might not be compatible; for example, if source and
> >>>>> destination are from different branches and one of the structures had
> >>>>> changed, then data collected at the source might not be compatible with
> >>>>> the structures at the destination, and typecasting it to the changed
> >>>>> structures would mess up the migrated data during restoration.
> >>>>
> >>>> Of course now you're asking that libvirt understand the release
> >>>> versioning scheme of every vendor driver and that it remain
> >>>> programmatically consistent.  We can't even do this with in-kernel
> >>>> drivers.  And in the end, still the best we can do is guess.
> >>>>
> >>>
> >>> Libvirt doesn't need to understand the version; libvirt needs to strcmp
> >>> the version strings from source and destination. If those are equal,
> >>> then libvirt would understand that they are compatible.
> >>
> >> Who's to say that the driver version and migration compatibility have
> >> any relation at all?  Some drivers might focus on designing their own
> >> migration interface that can maintain compatibility across versions
> >> (QEMU does this), some drivers may only allow identical version
> >> migration (which is going to frustrate upper level management tools and
> >> customers - RHEL goes to great extents to support cross version
> >> migration).  We cannot have a one size fits all here that driver version
> >> defines completely the migration compatibility.
> >
> > I'll agree; I don't know enough about these devices, but to give you
> > some examples of things I'd expect to work:
> >    a) User adds new machines to their data centre with a larger/newer
> > version of the same vendor's GPU; in some cases that should work
> > (depending on vendor details etc)
> >    b) The same thing but with identical hardware but a newer driver on
> > the destination.
> >
> > Obviously there will be some cut-offs that say some versions are
> > incompatible;  but for normal migration we jump through serious hoops
> > to make sure stuff works; customers will expect the same with some
> > VFIO devices.
> >
>
> How does libvirt check the cut-off where some versions are incompatible?
>
>
> >>>>> * if guest_driver_version is compatible with host driver at destination.
> >>>>> For mdev devices, guest driver communicates with host driver in some
> >>>>> form. If there are changes in structures/APIs of such communication,
> >>>>> guest driver at source might not be compatible with host driver at
> >>>>> destination.
> >>>>
> >>>> And another guess plus now the guest driver is involved which libvirt
> >>>> has no visibility to.
> >>>>
> >>>
> >>> Like above, libvirt needs to do a strcmp.
> >>
> >> Insufficient, imo
> >>
> >>>>> The 'available_instances' sysfs attribute already exists; the latter
> >>>>> two should be added by the vendor driver, and libvirt can use them for
> >>>>> the migration compatibility check.
> >>>>
> >>>> As noted previously, display and migration are not necessarily
> >>>> mdev-only features, it's possible that vfio-pci or vfio-platform could
> >>>> also implement these, so the sysfs interface cannot be restricted to
> >>>> the mdev template and lifecycle interface.
> >>>>
> >>>
> >>> I agree.
> >>> The feature bitmask passed to vfio core is not mdev-specific. But here
> >>> 'available_instances' for the migration compatibility check is
> >>> mdev-specific. If the mdev device is not create-able at the destination,
> >>> there is no point in libvirt initiating the migration.
> >>
> >> 'available_instances' for migration compatibility check...?  We use
> >> available_instances to know whether we have the resources to create a
> >> given mdev type.  It's certainly a prerequisite to have a device of the
> >> identical type at the migration target and how we define what is an
> >> identical device for a directly assigned PCI device is yet another
> >> overly complicated rat hole.  But an identical device doesn't
> >> necessarily imply migration compatibility and I think that's the
> >> problem we're tackling.  We cannot assume based only on the device type
> >> that migration is compatible, that's basically saying we're never going
> >> to have any bugs or oversights or new features in the migration stream.
> >
> > Those things certainly happen; state that we forgot to transfer, new
> > features enables on devices, devices configured in different ways.
> >
>
> How does libvirt check migration compatibility for other devices across QEMU
> versions, where the source supports a device but the destination, running an
> older QEMU version, doesn't support that device or the device doesn't exist
> on that system?

We spoke about this on the call, but I'll write it down anyway so that we have
a track of it. Currently, libvirt doesn't support migration of a domain with
devices living outside of QEMU, therefore we'd need a completely new schema to
support this. The other thing I mentioned regarding probing of the migration
capabilities was that we should really consider openstack as both the consumer
and a commander of libvirt, because it's openstack that maintains a global
view of all the hosts in a cluster. So rather than poking libvirt to probe a
random host, I assume they'd already like to have this information beforehand
so that they can incorporate the logic in their scheduler; IOW, at the point
where migration is about to happen, openstack should IMHO already know which
host is capable of hosting the VM to be migrated. That being said, it would
most probably be libvirt who provides openstack with this information, just
like openstack probes libvirt for domain capabilities, but the idea stays the
same: ideally we'd have this information before users decide to migrate a VM.

Thanks,
Erik

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [libvirt] Expose vfio device display/migration to libvirt and above, was Re: [PATCH 0/3] sample: vfio mdev display devices.
  2018-05-03 18:58                   ` [libvirt] Expose vfio device display/migration to libvirt and above, was " Alex Williamson
  2018-05-04  7:49                     ` Erik Skultety
@ 2018-05-04  9:16                     ` Daniel P. Berrangé
  2018-05-04 17:06                       ` Alex Williamson
  2018-05-07  6:15                     ` Gerd Hoffmann
  2 siblings, 1 reply; 41+ messages in thread
From: Daniel P. Berrangé @ 2018-05-04  9:16 UTC (permalink / raw)
  To: Alex Williamson
  Cc: Neo Jia, kvm, Erik Skultety, libvirt, Dr. David Alan Gilbert,
	Tina Zhang, Kirti Wankhede, Gerd Hoffmann, Laine Stump,
	Jiri Denemark, intel-gvt-dev

On Thu, May 03, 2018 at 12:58:00PM -0600, Alex Williamson wrote:
> Hi,
> 
> The previous discussion hasn't produced results, so let's start over.
> Here's the situation:
> 
>  - We currently have kernel and QEMU support for the QEMU vfio-pci
>    display option.
> 
>  - The default for this option is 'auto', so the device will attempt to
>    generate a display if the underlying device supports it, currently
>    only GVTg and some future release of NVIDIA vGPU (plus Gerd's
>    sample mdpy and mbochs).
> 
>  - The display option is implemented via two different mechanisms, a
>    vfio region (NVIDIA, mdpy) or a dma-buf (GVTg, mbochs).
> 
>  - Displays using dma-buf require OpenGL support, displays making
>    use of region support do not.
> 
>  - Enabling OpenGL support requires specific VM configurations, which
>    libvirt /may/ want to facilitate.
> 
>  - Probing display support for a given device is complicated by the
>    fact that GVTg and NVIDIA both impose requirements on the process
>    opening the device file descriptor through the vfio API:
> 
>    - GVTg requires a KVM association or will fail to allow the device
>      to be opened.
> 
>    - NVIDIA requires that their vgpu-manager process can locate a UUID
>      for the VM via the process commandline.
> 
>    - These are both horrible impositions and prevent libvirt from
>      simply probing the device itself.

Agreed, these requirements are just horrific. Probing for features
should not require this level of environmental setup. I can
just about understand & accept how we ended up here, because this
scenario is not one that was strongly considered when the first impls
were being done. I don't think we should accept it as a long term
requirement though.

> Erik Skultety, who initially raised the display question, has identified
> one possible solution, which is to simply make the display configuration
> the user's problem (apologies if I've misinterpreted Erik).  I believe
> this would work something like:
> 
>  - libvirt identifies a version of QEMU that includes 'display' support
>    for vfio-pci devices and defaults to adding display=off for every
>    vfio-pci device [have we chosen the wrong default (auto) in QEMU?].
> 
>  - New XML support would allow a user to enable display support on the
>    vfio device.
> 
>  - Resolving any OpenGL dependencies of that change would be left to
>    the user.
> 
> A nice aspect of this is that policy decisions are left to the user and
> clearly no interface changes are necessary, perhaps with the exception
> of deciding whether we've made the wrong default choice for vfio-pci
> devices in QEMU.

Unless I'm mis-understanding, this isn't really a solution to the
problem, rather it is us simply giving up and telling someone else
to try to fix the problem. The 'user' here is not a human - it is
simply the next level up in the mgmt stack, eg OpenStack or oVirt.
If we can't solve it acceptably in libvirt code, I don't have much
hope that OpenStack can solve it in their code, since they have
even stronger need to automate everything.

> On the other hand, if we do want to give libvirt a mechanism to probe
> the display support for a device, we can make a simplified QEMU
> instance be the mechanism through which we do that.  For example the
> script[1] can be provided with either a PCI device or sysfs path to an
> mdev device and run a minimal VM instance meeting the requirements of
> both GVTg and NVIDIA to report the display support and GL requirements
> for a device.  There are clearly some unrefined and atrocious bits of
> this script, but it's only a proof of concept, the process management
> can be improved and we can decide whether we want to provide qmp
> mechanism to introspect the device rather than grep'ing error
> messages.  The goal is simply to show that we could choose to embrace
> QEMU and use it not as a VM, but simply a tool for poking at a device
> given the restrictions the mdev vendor drivers have already imposed.

Feels like a pretty heavyweight solution that just encourages the
drivers to continue down the undesirable path they're already on,
possibly making the situation even worse over time.


Regards,
Daniel
-- 
|: https://berrange.com      -o-    https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org         -o-            https://fstop138.berrange.com :|
|: https://entangle-photo.org    -o-    https://www.instagram.com/dberrange :|

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [libvirt] Expose vfio device display/migration to libvirt and above, was Re: [PATCH 0/3] sample: vfio mdev display devices.
  2018-05-04  7:49                     ` Erik Skultety
@ 2018-05-04 16:03                       ` Alex Williamson
  2018-05-07  6:25                         ` Gerd Hoffmann
  2018-05-10 11:00                         ` Erik Skultety
  0 siblings, 2 replies; 41+ messages in thread
From: Alex Williamson @ 2018-05-04 16:03 UTC (permalink / raw)
  To: Erik Skultety
  Cc: Neo Jia, kvm, libvirt, Dr. David Alan Gilbert, Tina Zhang,
	Kirti Wankhede, Gerd Hoffmann, Laine Stump, Jiri Denemark,
	intel-gvt-dev

On Fri, 4 May 2018 09:49:44 +0200
Erik Skultety <eskultet@redhat.com> wrote:

> On Thu, May 03, 2018 at 12:58:00PM -0600, Alex Williamson wrote:
> > Hi,
> >
> > The previous discussion hasn't produced results, so let's start over.
> > Here's the situation:
> >
> >  - We currently have kernel and QEMU support for the QEMU vfio-pci
> >    display option.
> >
> >  - The default for this option is 'auto', so the device will attempt to
> >    generate a display if the underlying device supports it, currently
> >    only GVTg and some future release of NVIDIA vGPU (plus Gerd's
> >    sample mdpy and mbochs).
> >
> >  - The display option is implemented via two different mechanisms, a
> >    vfio region (NVIDIA, mdpy) or a dma-buf (GVTg, mbochs).
> >
> >  - Displays using dma-buf require OpenGL support, displays making
> >    use of region support do not.
> >
> >  - Enabling OpenGL support requires specific VM configurations, which
> >    libvirt /may/ want to facilitate.
> >
> >  - Probing display support for a given device is complicated by the
> >    fact that GVTg and NVIDIA both impose requirements on the process
> >    opening the device file descriptor through the vfio API:
> >
> >    - GVTg requires a KVM association or will fail to allow the device
> >      to be opened.  
> 
> How exactly is this association checked?

The intel_vgpu_open() callback for the mdev device registers a vfio
group notifier for VFIO_GROUP_NOTIFY_SET_KVM events. The KVM pointer is
already registered via the addition of the vfio group to the vfio-kvm
pseudo device, so the registration synchronously triggers the notifier
callback and the result is tested slightly later in the open path in
kvmgt_guest_init().
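
Sketched in code, the vendor-driver side of that flow looks something like
the following (a simplified sketch, not the actual kvmgt code; my_mdev_state
and its fields are illustrative):

#include <linux/mdev.h>
#include <linux/notifier.h>
#include <linux/vfio.h>

struct my_mdev_state {
	struct notifier_block group_notifier;
	struct kvm *kvm;
};

static int my_group_notifier(struct notifier_block *nb, unsigned long action,
			     void *data)
{
	struct my_mdev_state *state =
		container_of(nb, struct my_mdev_state, group_notifier);

	/* data is the struct kvm pointer, NULL when the association drops */
	if (action == VFIO_GROUP_NOTIFY_SET_KVM)
		state->kvm = data;
	return NOTIFY_OK;
}

static int my_open(struct mdev_device *mdev)
{
	struct my_mdev_state *state = mdev_get_drvdata(mdev);
	unsigned long events = VFIO_GROUP_NOTIFY_SET_KVM;
	int ret;

	state->group_notifier.notifier_call = my_group_notifier;
	/* fires synchronously if the group is already bound to a KVM via
	 * the vfio-kvm pseudo device, as described above */
	ret = vfio_register_notifier(mdev_dev(mdev), VFIO_GROUP_NOTIFY,
				     &events, &state->group_notifier);
	if (ret)
		return ret;

	if (!state->kvm) {	/* the check kvmgt does later in guest init */
		vfio_unregister_notifier(mdev_dev(mdev), VFIO_GROUP_NOTIFY,
					 &state->group_notifier);
		return -ESRCH;
	}
	return 0;
}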
 
> >
> >    - NVIDIA requires that their vgpu-manager process can locate a
> > UUID for the VM via the process commandline.
> >
> >    - These are both horrible impositions and prevent libvirt from
> >      simply probing the device itself.  
> 
> So I feel like we're trying to solve a problem coming from one layer
> on a bunch of different layers, which inherently prevents us from
> producing a viable long-term solution without dragging in a significant
> amount of hacky, nasty code, and it's not the missing sysfs attributes
> I have in mind. Why does NVIDIA's vgpu-manager need to locate a UUID
> of a qemu VM? I assume that's to prevent multiple VM instances trying
> to use the same mdev device, in which case can't the vgpu-manager
> track references to how many "open" and "close" calls have been made

Hard to say, NVIDIA hasn't been terribly forthcoming about this
requirement, but probably not multiple users of the same mdev device
as that's already prevented through vfio in general.  Intel has
discussed that their requirement is to be able to track VM page table
updates so they can update their shadow tables, so effectively rather
than mediating interactions directly with the device, they're using a
KVM back channel to manage the DMA translation address space for the
device.

The flip side is that while these requirements are annoying and hard
for non-VM users to deal with, is there a next logical point in the
interaction with the vfio device where the vendor driver can reasonably
impose those requirements?  For instance, both vendors expose a
vfio-pci interface, so they could prevent the user driver from enabling
bus master in the PCI command register, but that's a fairly subtle
failure: typically drivers wouldn't even bother to read back after a
write to the bus-master bit to see if it sticks, and this sort of
enabling is done by the guest, not the hypervisor.  There's really no
error path for a write to the device.
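
To make that concrete, a vendor driver could gate the bit in its emulated
config space along these lines (a hypothetical sketch; the structure and its
fields are made up, not taken from any actual vendor driver):

#include <linux/types.h>
#include <linux/pci_regs.h>

struct my_mdev_cfg {
	u8   vconfig[256];	/* emulated PCI config space */
	bool assoc_ok;		/* external (KVM/UUID) requirement met? */
};

static void my_cfg_write_command(struct my_mdev_cfg *cfg, u16 cmd)
{
	/* silently refuse bus mastering until the requirement is met;
	 * the guest gets no error, and typical drivers never read the
	 * bit back: the "subtle failure" described above */
	if (!cfg->assoc_ok)
		cmd &= ~PCI_COMMAND_MASTER;

	cfg->vconfig[PCI_COMMAND]     = cmd & 0xff;
	cfg->vconfig[PCI_COMMAND + 1] = cmd >> 8;
}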

> to the same device? This is just from a layman's perspective, but it
> would allow the following:
>     - when libvirt starts, it initializes all its drivers (let's
> focus on QEMU)
>     - as part of this initialization, libvirt probes QEMU for
> capabilities and caches them in order to use them when spawning VMs
> 
> Now, if we (theoretically) can settle on easing the restrictions Alex
> has mentioned, we in fact could introduce a QMP command to probe
> these devices and provide libvirt with useful information at that
> point in time. Of course, since the 3rd party vendor is "de-coupled"
> from qemu, libvirt would have no way to find out that the driver has
> changed in the meantime, thus still using the old information we
> gathered, ergo potentially causing the QEMU process to fail
> eventually. But then again, there's very often a strong
> recommendation to reboot your host after a driver update, especially
> in NVIDIA's case, which means this fact wouldn't matter. However,
> there's also a significant drawback to my proposal which probably
> renders it completely useless (but we can continue from there...) and
> that is the devices would either have to be present already (not an
> option) or QEMU would need to be enhanced in a way, that it would
> create a dummy device during QMP probing, open it, collect the
> information libvirt needs, close it and remove it. If the driver
> doesn't change in the meantime, this should be sufficient for a VM to
> be successfully instantiated with a display, right?

I don't think this last requirement is possible, QEMU is as clueless
about the capabilities of an mdev device as anyone else until that
device is opened and probed, so how would we invent this "dummy
device"?  I don't really see how there's any ability for
pre-determination of the device capabilities, we can only probe the
actual device we intend to use.

> > The above has pressed the need for investigating some sort of
> > alternative API through which libvirt might introspect a vfio device
> > and with vfio device migration on the horizon, it's natural that
> > some sort of support for migration state compatibility for the
> > device need be considered as a second user of such an API.
> > However, we currently have no concept of migration compatibility on
> > a per-device level as there are no migratable devices that live
> > outside of the QEMU code base. It's therefore assumed that per
> > device migration compatibility is encompassed by the versioned
> > machine type for the overall VM.  We need participation all the way
> > to the top of the VM management stack to resolve this issue and
> > it's dragging down the (possibly) more simple question of how do we
> > resolve the display situation.  Therefore I'm looking for
> > alternatives for display that work within what we have available to
> > us at the moment.
> >
> > Erik Skultety, who initially raised the display question, has
> > identified one possible solution, which is to simply make the
> > display configuration the user's problem (apologies if I've
> > misinterpreted Erik).  I believe this would work something like:
> >
> >  - libvirt identifies a version of QEMU that includes 'display'
> > support for vfio-pci devices and defaults to adding display=off for
> > every vfio-pci device [have we chosen the wrong default (auto) in
> > QEMU?].  
> 
> From libvirt's POV, this would mean a new XML attribute 'display' on the
> mdev host device type with a default value of 'off', potentially
> extending this to 'auto' once we have enough information to base our
> decision on. We'll need to combine this with a new attribute value
> for the <video> element that would prevent adding an emulated VGA any
> time <graphics> (spice,VNC) is requested, but that's something we'd
> need to do anyway, so I'm just mentioning it.

This raises another question: is the configuration of the emulated
graphics a factor in handling the mdev device's display option?
AFAIK, neither vGPU vendor provides a VBIOS for boot graphics, so even
with a display option, we're mostly targeting a secondary graphics
head, otherwise the user will be running headless until the guest OS
drivers initialize.

> >  - New XML support would allow a user to enable display support on
> > the vfio device.
> >
> >  - Resolving any OpenGL dependencies of that change would be left to
> >    the user.
> >
> > A nice aspect of this is that policy decisions are left to the user
> > and clearly no interface changes are necessary, perhaps with the
> > exception of deciding whether we've made the wrong default choice
> > for vfio-pci devices in QEMU.  
> 
> It's a common practice that we offload decisions like this to users
> (including management layer, i.e. openstack, ovirt).
> 
> >
> > On the other hand, if we do want to give libvirt a mechanism to
> > probe the display support for a device, we can make a simplified
> > QEMU instance be the mechanism through which we do that.  For
> > example the script[1] can be provided with either a PCI device or
> > sysfs path to an mdev device and run a minimal VM instance meeting
> > the requirements of both GVTg and NVIDIA to report the display
> > support and GL requirements for a device.  There are clearly some
> > unrefined and atrocious bits of this script, but it's only a proof
> > of concept, the process management can be improved and we can
> > decide whether we want to provide qmp mechanism to introspect the
> > device rather than grep'ing error messages.  The goal is simply to
> > show that we could choose to embrace  
> 
> if not for anything else, error messages change, so that's not a way,
> QMP is a much more standardized approach, but then again, as I
> mentioned above, at the moment, libvirt probes for capabilities
> during its start.

Right, and none of these device capabilities are currently present via
qmp, and in fact the VM fails to start in my example script when GL is
needed but not present, so there's no QMP interface to probe until a
configuration is found that the VM at least initializes w/o error.

> > QEMU and use it not as a VM, but simply a tool for poking at a
> > device given the restrictions the mdev vendor drivers have already
> > imposed.
> >
> > So I think the question bounces back to libvirt, does libvirt want
> > enough information about the display requirements for a given
> > device to automatically attempt to add GL support for it,
> > effectively a policy of 'if it's supported try to enable it', or
> > should we leave well enough alone and let the user choose to enable
> > it?
> >
> > Maybe some guiding questions:
> >
> >  - Will dma-buf always require GL support?
> >
> >  - Does GL support limit our ability to have a display over a remote
> >    connection?
> >
> >  - Do region-based displays also work with GL support, even if not
> >    required?  
> 
> Yeah, these are IMHO really tough to answer because we can't really
> predict the future, which again favours a new libvirt attribute more.
> Even if we decided that we truly need a dummy VM as tool for libvirt
> to probe this info, I still feel like this should be done up in the
> virtualization stack and libvirt again would be just a tool to do
> stuff the way it's told to do it. But I'd very much like to hear
> Dan's opinion, since beside libvirt he can cover openstack too.

I've learned from Gerd offline that remote connections are possible,
requiring maybe yet a different set of options, so I'm leaning even
further in the direction that libvirt can really only provide the user
with options, but cannot reasonably infer the intentions of the user's
configuration even if device capabilities were exposed.  Thanks,

Alex

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [libvirt] Expose vfio device display/migration to libvirt and above, was Re: [PATCH 0/3] sample: vfio mdev display devices.
  2018-05-04  9:16                     ` Daniel P. Berrangé
@ 2018-05-04 17:06                       ` Alex Williamson
  0 siblings, 0 replies; 41+ messages in thread
From: Alex Williamson @ 2018-05-04 17:06 UTC (permalink / raw)
  To: Daniel P. Berrangé
  Cc: Neo Jia, kvm, Erik Skultety, libvirt, Dr. David Alan Gilbert,
	Tina Zhang, Kirti Wankhede, Gerd Hoffmann, Laine Stump,
	Jiri Denemark, intel-gvt-dev

On Fri, 4 May 2018 10:16:09 +0100
Daniel P. Berrangé <berrange@redhat.com> wrote:

> On Thu, May 03, 2018 at 12:58:00PM -0600, Alex Williamson wrote:
> > Hi,
> > 
> > The previous discussion hasn't produced results, so let's start over.
> > Here's the situation:
> > 
> >  - We currently have kernel and QEMU support for the QEMU vfio-pci
> >    display option.
> > 
> >  - The default for this option is 'auto', so the device will attempt to
> >    generate a display if the underlying device supports it, currently
> >    only GVTg and some future release of NVIDIA vGPU (plus Gerd's
> >    sample mdpy and mbochs).
> > 
> >  - The display option is implemented via two different mechanisms, a
> >    vfio region (NVIDIA, mdpy) or a dma-buf (GVTg, mbochs).
> > 
> >  - Displays using dma-buf require OpenGL support, displays making
> >    use of region support do not.
> > 
> >  - Enabling OpenGL support requires specific VM configurations, which
> >    libvirt /may/ want to facilitate.
> > 
> >  - Probing display support for a given device is complicated by the
> >    fact that GVTg and NVIDIA both impose requirements on the process
> >    opening the device file descriptor through the vfio API:
> > 
> >    - GVTg requires a KVM association or will fail to allow the device
> >      to be opened.
> > 
> >    - NVIDIA requires that their vgpu-manager process can locate a UUID
> >      for the VM via the process commandline.
> > 
> >    - These are both horrible impositions and prevent libvirt from
> >      simply probing the device itself.  
> 
> Agreed, these requirements are just horrific. Probing for features
> should not require this level of environmental setup. I can
> just about understand & accept how we ended up here, because this
> scenario is not one that was strongly considered when the first impls
> were being done. I don't think we should accept it as a long term
> requirement though.
> 
> > Erik Skultety, who initially raised the display question, has identified
> > one possible solution, which is to simply make the display configuration
> > the user's problem (apologies if I've misinterpreted Erik).  I believe
> > this would work something like:
> > 
> >  - libvirt identifies a version of QEMU that includes 'display' support
> >    for vfio-pci devices and defaults to adding display=off for every
> >    vfio-pci device [have we chosen the wrong default (auto) in QEMU?].
> > 
> >  - New XML support would allow a user to enable display support on the
> >    vfio device.
> > 
> >  - Resolving any OpenGL dependencies of that change would be left to
> >    the user.
> > 
> > A nice aspect of this is that policy decisions are left to the user and
> > clearly no interface changes are necessary, perhaps with the exception
> > of deciding whether we've made the wrong default choice for vfio-pci
> > devices in QEMU.  
> 
> Unless I'm mis-understanding, this isn't really a solution to the
> problem, rather it is us simply giving up and telling someone else
> to try to fix the problem. The 'user' here is not a human - it is
> simply the next level up in the mgmt stack, eg OpenStack or oVirt.
> If we can't solve it acceptably in libvirt code, I don't have much
> hope that OpenStack can solve it in their code, since they have
> even stronger need to automate everything.

But to solve this at any level other than the user suggests there is
one "right" answer to automatically configuring the device.  Is there?
If a device supports a display, does the user necessarily want to
enable it?  If there's a difference between enabling a display for a
local user or a remote user, is there any reasonable expectation that
we can automatically make that determination?

> > On the other hand, if we do want to give libvirt a mechanism to probe
> > the display support for a device, we can make a simplified QEMU
> > instance be the mechanism through which we do that.  For example the
> > script[1] can be provided with either a PCI device or sysfs path to an
> > mdev device and run a minimal VM instance meeting the requirements of
> > both GVTg and NVIDIA to report the display support and GL requirements
> > for a device.  There are clearly some unrefined and atrocious bits of
> > this script, but it's only a proof of concept, the process management
> > can be improved and we can decide whether we want to provide qmp
> > mechanism to introspect the device rather than grep'ing error
> > messages.  The goal is simply to show that we could choose to embrace
> > QEMU and use it not as a VM, but simply a tool for poking at a device
> > given the restrictions the mdev vendor drivers have already imposed.  
> 
> Feels like a pretty heavyweight solution that just encourages the
> drivers to continue down the undesirable path they're already on,
> possibly making the situation even worse over time.

I'm not getting the impression that the vendor drivers are considering
a change, or necessarily can change.  The NVIDIA UUID requirement
certainly seems arbitrary, but page tracking via KVM seems to be more
directly useful to maintaining the address space of the device relative
to the VM, even if it really wasn't the intent of the mdev interface.
Perhaps we could introduce vfio interfaces to replace this, but is that
just adding an unnecessary layer of interaction for all but this probe
activity?  Maybe the KVM interface should never have been added, but
given that it exists, does it make sense to say that it can't be used,
or required?  Thanks,

Alex


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [libvirt] Expose vfio device display/migration to libvirt and above, was Re: [PATCH 0/3] sample: vfio mdev display devices.
  2018-05-03 18:58                   ` [libvirt] Expose vfio device display/migration to libvirt and above, was " Alex Williamson
  2018-05-04  7:49                     ` Erik Skultety
  2018-05-04  9:16                     ` Daniel P. Berrangé
@ 2018-05-07  6:15                     ` Gerd Hoffmann
  2 siblings, 0 replies; 41+ messages in thread
From: Gerd Hoffmann @ 2018-05-07  6:15 UTC (permalink / raw)
  To: Alex Williamson
  Cc: Neo Jia, kvm, Erik Skultety, libvirt, Dr. David Alan Gilbert,
	Tina Zhang, Kirti Wankhede, Laine Stump, Jiri Denemark,
	intel-gvt-dev

  Hi,

> Maybe some guiding questions:
> 
>  - Will dma-buf always require GL support?

Yes.

>  - Does GL support limit our ability to have a display over a remote
>    connection?

Currently yes, although the plan is to support gl displays remotely in
spice.  The workflow will be completely different though.  Non-gl spice
uses the classic display channel; the plan for gl spice is to feed the
dma-bufs into the gpu's video encoder, then send out a video stream.

>  - Do region-based displays also work with GL support, even if not
>    required?

Yes.  Any qemu display device works with gl-enabled UI.

cheers,
  Gerd

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [libvirt] Expose vfio device display/migration to libvirt and above, was Re: [PATCH 0/3] sample: vfio mdev display devices.
  2018-05-04 16:03                       ` Alex Williamson
@ 2018-05-07  6:25                         ` Gerd Hoffmann
  2018-07-20  4:56                           ` Yuan, Hang
  2018-05-10 11:00                         ` Erik Skultety
  1 sibling, 1 reply; 41+ messages in thread
From: Gerd Hoffmann @ 2018-05-07  6:25 UTC (permalink / raw)
  To: Alex Williamson
  Cc: Neo Jia, kvm, Erik Skultety, libvirt, Dr. David Alan Gilbert,
	Tina Zhang, Kirti Wankhede, Laine Stump, Jiri Denemark,
	intel-gvt-dev

  Hi,

> This raises another question: is the configuration of the emulated
> graphics a factor in handling the mdev device's display option?
> AFAIK, neither vGPU vendor provides a VBIOS for boot graphics, so even
> with a display option, we're mostly targeting a secondary graphics
> head, otherwise the user will be running headless until the guest OS
> drivers initialize.

Right now yes, no boot display for vgpu devices.  I'm trying to fix that
with ramfb.  There are still a bunch of rough edges and details to be
hashed out.  It'll probably be UEFI-only.

cheers,
  Gerd

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [libvirt] Expose vfio device display/migration to libvirt and above, was Re: [PATCH 0/3] sample: vfio mdev display devices.
  2018-05-04 16:03                       ` Alex Williamson
  2018-05-07  6:25                         ` Gerd Hoffmann
@ 2018-05-10 11:00                         ` Erik Skultety
  2018-05-10 15:57                           ` Alex Williamson
  1 sibling, 1 reply; 41+ messages in thread
From: Erik Skultety @ 2018-05-10 11:00 UTC (permalink / raw)
  To: Alex Williamson
  Cc: Neo Jia, kvm, libvirt, Dr. David Alan Gilbert, Tina Zhang,
	Kirti Wankhede, Gerd Hoffmann, Laine Stump, Jiri Denemark,
	intel-gvt-dev

...

> > Now, if we (theoretically) can settle on easing the restrictions Alex
> > has mentioned, we in fact could introduce a QMP command to probe
> > these devices and provide libvirt with useful information at that
> > point in time. Of course, since the 3rd party vendor is "de-coupled"
> > from qemu, libvirt would have no way to find out that the driver has
> > changed in the meantime, thus still using the old information we
> > gathered, ergo potentially causing the QEMU process to fail
> > eventually. But then again, there's very often a strong
> > recommendation to reboot your host after a driver update, especially
> > in NVIDIA's case, which means this fact wouldn't matter. However,
> > there's also a significant drawback to my proposal which probably
> > renders it completely useless (but we can continue from there...) and
> > that is the devices would either have to be present already (not an
> > option) or QEMU would need to be enhanced in a way, that it would
> > create a dummy device during QMP probing, open it, collect the
> > information libvirt needs, close it and remove it. If the driver
> > doesn't change in the meantime, this should be sufficient for a VM to
> > be successfully instantiated with a display, right?
>
> I don't think this last requirement is possible, QEMU is as clueless
> about the capabilities of an mdev device as anyone else until that
> device is opened and probed, so how would we invent this "dummy
> device"?  I don't really see how there's any ability for
> pre-determination of the device capabilities, we can only probe the
> actual device we intend to use.

Hmm, let's say libvirt is able to create mdevs. Do the vendor drivers impose
any kind of limitation on whether a specific device type, or a specific
instance of a type, does or does not present certain features like display or
migration in comparison to the other types/instances? IOW, I would assume that
once the driver version supports display/migration, any mdev instance of any
mdev type the driver supports will "inherit" the support for display/migration.
If this assumption works, libvirt, knowing there are some mdev-capable parent
devices, could technically create a dummy instance of the first type it can
for each parent device (see the sketch below), pass the UUID to a qemu QMP
query command, and qemu would then open and probe the device, returning the
capabilities, which libvirt would then cache. Next time a VM is due to start,
libvirt can use the device UUID to check the capabilities we cached and try
setting the appropriate config options. However, as you've mentioned, this
approach is fairly policy-driven, which doesn't fit with what libvirt's goal
is. Would such a suggestion help at all from QEMU's POV?
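
That dummy-instance flow could be sketched roughly like this over the mdev
sysfs interface (the parent address and type name are illustrative, and this
is not what libvirt actually does today):

parent=/sys/bus/pci/devices/0000:00:02.0
type=$(ls "$parent/mdev_supported_types" | head -n 1)
uuid=$(uuidgen)

# create the dummy instance on the parent device
echo "$uuid" > "$parent/mdev_supported_types/$type/create"

# ... start the minimal QEMU instance, issue the (hypothetical) QMP
# probe against the device, cache the reported capabilities ...

# tear the dummy instance down again
echo 1 > "/sys/bus/mdev/devices/$uuid/remove"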

>
> > > The above has pressed the need for investigating some sort of
> > > alternative API through which libvirt might introspect a vfio device
> > > and with vfio device migration on the horizon, it's natural that
> > > some sort of support for migration state compatibility for the
> > > device need be considered as a second user of such an API.
> > > However, we currently have no concept of migration compatibility on
> > > a per-device level as there are no migratable devices that live
> > > outside of the QEMU code base. It's therefore assumed that per
> > > device migration compatibility is encompassed by the versioned
> > > machine type for the overall VM.  We need participation all the way
> > > to the top of the VM management stack to resolve this issue and
> > > it's dragging down the (possibly) more simple question of how do we
> > > resolve the display situation.  Therefore I'm looking for
> > > alternatives for display that work within what we have available to
> > > us at the moment.
> > >
> > > Erik Skultety, who initially raised the display question, has
> > > identified one possible solution, which is to simply make the
> > > display configuration the user's problem (apologies if I've
> > > misinterpreted Erik).  I believe this would work something like:
> > >
> > >  - libvirt identifies a version of QEMU that includes 'display'
> > > support for vfio-pci devices and defaults to adding display=off for
> > > every vfio-pci device [have we chosen the wrong default (auto) in
> > > QEMU?].
> >
> > From libvirt's POV, this would mean a new XML attribute 'display' on
> > the mdev host device type with a default value of 'off', potentially
> > extending this to 'auto' once we have enough information to base our
> > decision on. We'll need to combine this with a new attribute value
> > for the <video> element that would prevent adding an emulated VGA any
> > time <graphics> (spice,VNC) is requested, but that's something we'd
> > need to do anyway, so I'm just mentioning it.
>
> > This raises another question: is the configuration of the emulated
> > graphics a factor in handling the mdev device's display option?
> AFAIK, neither vGPU vendor provides a VBIOS for boot graphics, so even

Good point, I forgot that we don't have boot graphics yet, in which case no,
having the 'none' value isn't a factor here; libvirt can continue adding an
emulated VGA device just to have some boot output. I'm also curious how the
display on the secondary GPU is going to be presented to the end user, but
that's out of scope for libvirt.

> with a display option, we're mostly targeting a secondary graphics
> head, otherwise the user will be running headless until the guest OS
> drivers initialize.
>
> > >  - New XML support would allow a user to enable display support on
> > > the vfio device.
> > >
> > >  - Resolving any OpenGL dependencies of that change would be left to
> > >    the user.
> > >
> > > A nice aspect of this is that policy decisions are left to the user
> > > and clearly no interface changes are necessary, perhaps with the
> > > exception of deciding whether we've made the wrong default choice
> > > for vfio-pci devices in QEMU.
> >
> > It's a common practice that we offload decisions like this to users
> > (including management layer, i.e. openstack, ovirt).
> >
> > >
> > > On the other hand, if we do want to give libvirt a mechanism to
> > > probe the display support for a device, we can make a simplified
> > > QEMU instance be the mechanism through which we do that.  For
> > > example the script[1] can be provided with either a PCI device or
> > > sysfs path to an mdev device and run a minimal VM instance meeting
> > > the requirements of both GVTg and NVIDIA to report the display
> > > support and GL requirements for a device.  There are clearly some
> > > unrefined and atrocious bits of this script, but it's only a proof
> > > of concept, the process management can be improved and we can
> > > decide whether we want to provide qmp mechanism to introspect the
> > > device rather than grep'ing error messages.  The goal is simply to
> > > show that we could choose to embrace
> >
> > if not for anything else, error messages change, so that's not a way,
> > QMP is a much more standardized approach, but then again, as I
> > mentioned above, at the moment, libvirt probes for capabilities
> > during its start.
>
> Right, and none of these device capabilities are currently present via
> qmp, and in fact the VM fails to start in my example script when GL is
> needed but not present, so there's no QMP interface to probe until a
> configuration is found that the VM at least initializes w/o error.
>
> > > QEMU and use it not as a VM, but simply a tool for poking at a
> > > device given the restrictions the mdev vendor drivers have already
> > > imposed.
> > >
> > > So I think the question bounces back to libvirt, does libvirt want
> > > enough information about the display requirements for a given
> > > device to automatically attempt to add GL support for it,
> > > effectively a policy of 'if it's supported try to enable it', or
> > > should we leave well enough alone and let the user choose to enable
> > > it?
> > >
> > > Maybe some guiding questions:
> > >
> > >  - Will dma-buf always require GL support?
> > >
> > >  - Does GL support limit our ability to have a display over a remote
> > >    connection?
> > >
> > >  - Do region-based displays also work with GL support, even if not
> > >    required?
> >
> > Yeah, these are IMHO really tough to answer because we can't really
> > predict the future, which again favours a new libvirt attribute more.
> > Even if we decided that we truly need a dummy VM as tool for libvirt
> > to probe this info, I still feel like this should be done up in the
> > virtualization stack and libvirt again would be just a tool to do
> > stuff the way it's told to do it. But I'd very much like to hear
> > Dan's opinion, since beside libvirt he can cover openstack too.
>
> I've learned from Gerd offline that remote connections are possible,
> requiring maybe yet a different set of options, so I'm leaning even
> further in the direction that libvirt can really only provide the user
> with options, but cannot reasonably infer the intentions of the user's
> configuration even if device capabilities were exposed.  Thanks,

Agreed, this would turn out to be extremely policy-based, but like Daniel,
I'm really not sure whether these things can be determined in an automated
way at any level. Sure, ovirt could present a set of contextual menus so a
'human' user would make the call (even a wrong one, for that matter), but not
so much for openstack I guess.

Erik

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [libvirt] Expose vfio device display/migration to libvirt and above, was Re: [PATCH 0/3] sample: vfio mdev display devices.
  2018-05-10 11:00                         ` Erik Skultety
@ 2018-05-10 15:57                           ` Alex Williamson
  0 siblings, 0 replies; 41+ messages in thread
From: Alex Williamson @ 2018-05-10 15:57 UTC (permalink / raw)
  To: Erik Skultety
  Cc: Neo Jia, kvm, libvirt, Dr. David Alan Gilbert, Tina Zhang,
	Kirti Wankhede, Gerd Hoffmann, Laine Stump, Jiri Denemark,
	intel-gvt-dev

On Thu, 10 May 2018 13:00:29 +0200
Erik Skultety <eskultet@redhat.com> wrote:

> ...
> 
> > > Now, if we (theoretically) can settle on easing the restrictions Alex
> > > has mentioned, we in fact could introduce a QMP command to probe
> > > these devices and provide libvirt with useful information at that
> > > point in time. Of course, since the 3rd party vendor is "de-coupled"
> > > from qemu, libvirt would have no way to find out that the driver has
> > > changed in the meantime, thus still using the old information we
> > > gathered, ergo potentially causing the QEMU process to fail
> > > eventually. But then again, there's very often a strong
> > > recommendation to reboot your host after a driver update, especially
> > > in NVIDIA's case, which means this fact wouldn't matter. However,
> > > there's also a significant drawback to my proposal which probably
> > > renders it completely useless (but we can continue from there...) and
> > > that is the devices would either have to be present already (not an
> > > option) or QEMU would need to be enhanced in a way, that it would
> > > create a dummy device during QMP probing, open it, collect the
> > > information libvirt needs, close it and remove it. If the driver
> > > doesn't change in the meantime, this should be sufficient for a VM to
> > > be successfully instantiated with a display, right?  
> >
> > I don't think this last requirement is possible, QEMU is as clueless
> > about the capabilities of an mdev device as anyone else until that
> > device is opened and probed, so how would we invent this "dummy
> > device"?  I don't really see how there's any ability for
> > pre-determination of the device capabilities, we can only probe the
> > actual device we intend to use.  
> 
> Hmm, let's say libvirt is able to create mdevs. Do the vendor drivers impose
> any kind of limitations on whether a specific device-type or a specific
> instance of a type does or does not present certain features like display or
> migration in comparison to the other types/instances? IOW I would assume that
> once the driver version does support display/migration, any mdev instance of any
> mdev type the driver supports will "inherit" the support for display/migration.
> If this assumption works, libvirt, knowing there are some mdev capable parent
> devices, could technically create a dummy instance of the first type it can for
> each parent device, passing the UUID to qemu QMP query command, qemu would then
> open and probe the device, returning the capabilities which libvirt would then
> cache. Next time a VM is due to start, libvirt can use the device UUID to check
> the capabilities we cached and try setting appropriate config options. However,
> as you've mentioned, this approach is fairly policy-driven, which doesn't cope
> with what libvirt's goal is. Would such a suggestion help at all from QEMU's
> POV?

There is no guarantee that all mdevs are equal for a given vendor.  For
instance we know that the smallest vGPU instance for Intel is intended
for compute offload; it's configured with barely enough framebuffer and
screen resolution for a working desktop.  Does it necessarily make
sense that it would support all of the same capabilities as a more
desktop focused mdev instance?  For that matter, can we necessarily
guarantee that all mdev types for a given parent device are the same
class of device?  For a GPU parent device we might have some VGA class
devices supporting a display and some 3D controllers which don't.  So I
think the operative word above is "assumption".  You can make whatever
assumptions you want, but they're only that, there's nothing that binds
the mdev vendor driver to those assumptions.
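
For reference, the sysfs sequence the dummy-instance idea implies would
be roughly the following (a sketch only; the parent path and type name
are illustrative and vary per parent device and vendor driver):

  # pick a parent device and one of the types it advertises
  parent=/sys/class/mdev_bus/0000:00:02.0
  type=$(ls $parent/mdev_supported_types | head -n1)
  uuid=$(uuidgen)

  # create a dummy instance and hand $uuid to QEMU for probing ...
  echo $uuid > $parent/mdev_supported_types/$type/create

  # ... then tear it down again afterwards
  echo 1 > /sys/bus/mdev/devices/$uuid/remove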

> > > > The above has pressed the need for investigating some sort of
> > > > alternative API through which libvirt might introspect a vfio device
> > > > and with vfio device migration on the horizon, it's natural that
> > > > some sort of support for migration state compatibility for the
> > > > device need be considered as a second user of such an API.
> > > > However, we currently have no concept of migration compatibility on
> > > > a per-device level as there are no migratable devices that live
> > > > outside of the QEMU code base. It's therefore assumed that per
> > > > device migration compatibility is encompassed by the versioned
> > > > machine type for the overall VM.  We need participation all the way
> > > > to the top of the VM management stack to resolve this issue and
> > > > it's dragging down the (possibly) simpler question of how we
> > > > resolve the display situation.  Therefore I'm looking for
> > > > alternatives for display that work within what we have available to
> > > > us at the moment.
> > > >
> > > > Erik Skultety, who initially raised the display question, has
> > > > identified one possible solution, which is to simply make the
> > > > display configuration the user's problem (apologies if I've
> > > > misinterpreted Erik).  I believe this would work something like:
> > > >
> > > >  - libvirt identifies a version of QEMU that includes 'display'
> > > > support for vfio-pci devices and defaults to adding display=off for
> > > > every vfio-pci device [have we chosen the wrong default (auto) in
> > > > QEMU?].  
> > >
> > > From libvirt's POV, a new XML attribute 'display' on the mdev host
> > > device type should default to 'off', potentially extending this to
> > > 'auto' once we have enough information to base our decision on.
> > > We'll need to combine this with a new attribute value for the
> > > <video> element that would prevent adding an emulated VGA any time
> > > <graphics> (spice, VNC) is requested, but that's something we'd
> > > need to do anyway, so I'm just mentioning it.  
> >
> > This raises another question: is the configuration of the emulated
> > graphics a factor in handling the mdev device's display option?
> > AFAIK, neither vGPU vendor provides a VBIOS for boot graphics, so even  
> 
> Good point, I forgot that we don't have boot graphics yet, in which case
> no, the 'none' value isn't a factor here; libvirt can continue adding an
> emulated VGA device just to have some boot output. I'm also curious how
> the display on the secondary GPU is going to be presented to the end
> user, but that's out of scope for libvirt.

I don't believe the guest behavior necessarily changes; depending on
the guest OS capabilities, the emulated and assigned/mdev graphics are
separate displays and the user can configure which to use.  The change
is that there are now ways to get to that mdev display that are in-band
for the hypervisor, such as virt-viewer.  I haven't actually managed to
get this to work yet, but I can see that a second display should be
offered when this is configured properly.
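
For concreteness, the 'display' attribute Erik describes above would
presumably end up as something like this in the domain XML (purely
illustrative; the final attribute name and placement are libvirt's
call, and the UUID is a placeholder):

  <hostdev mode='subsystem' type='mdev' model='vfio-pci' display='on'>
    <source>
      <address uuid='...'/>
    </source>
  </hostdev>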

> > with a display option, we're mostly targeting a secondary graphics
> > head, otherwise the user will be running headless until the guest OS
> > drivers initialize.
> >  
> > > >  - New XML support would allow a user to enable display support on
> > > > the vfio device.
> > > >
> > > >  - Resolving any OpenGL dependencies of that change would be left to
> > > >    the user.
> > > >
> > > > A nice aspect of this is that policy decisions are left to the user
> > > > and clearly no interface changes are necessary, perhaps with the
> > > > exception of deciding whether we've made the wrong default choice
> > > > for vfio-pci devices in QEMU.  
> > >
> > > It's common practice that we offload decisions like this to users
> > > (including the management layer, e.g. openstack, ovirt).
> > >  
> > > >
> > > > On the other hand, if we do want to give libvirt a mechanism to
> > > > probe the display support for a device, we can make a simplified
> > > > QEMU instance be the mechanism through which we do that.  For
> > > > example the script[1] can be provided with either a PCI device or
> > > > sysfs path to an mdev device and run a minimal VM instance meeting
> > > > the requirements of both GVTg and NVIDIA to report the display
> > > > support and GL requirements for a device.  There are clearly some
> > > > unrefined and atrocious bits of this script, but it's only a proof
> > > > of concept; the process management can be improved and we can
> > > > decide whether we want to provide a QMP mechanism to introspect the
> > > > device rather than grep'ing error messages.  The goal is simply to
> > > > show that we could choose to embrace  
> > >
> > > if nothing else, error messages change, so that's not a viable way;
> > > QMP is a much more standardized approach. But then again, as I
> > > mentioned above, at the moment libvirt probes for capabilities
> > > during its start.  
> >
> > Right, and none of these device capabilities are currently exposed via
> > QMP, and in fact the VM fails to start in my example script when GL is
> > needed but not present, so there's no QMP interface to probe until a
> > configuration is found with which the VM at least initializes w/o error.
> >  
> > > > QEMU and use it not as a VM, but simply a tool for poking at a
> > > > device given the restrictions the mdev vendor drivers have already
> > > > imposed.
> > > >
> > > > So I think the question bounces back to libvirt: does libvirt want
> > > > enough information about the display requirements for a given
> > > > device to automatically attempt to add GL support for it,
> > > > effectively a policy of 'if it's supported try to enable it', or
> > > > should we leave well enough alone and let the user choose to enable
> > > > it?
> > > >
> > > > Maybe some guiding questions:
> > > >
> > > >  - Will dma-buf always require GL support?
> > > >
> > > >  - Does GL support limit our ability to have a display over a remote
> > > >    connection?
> > > >
> > > >  - Do region-based displays also work with GL support, even if not
> > > >    required?  
> > >
> > > Yeah, these are IMHO really tough to answer because we can't really
> > > predict the future, which again favours a new libvirt attribute.
> > > Even if we decided that we truly need a dummy VM as a tool for
> > > libvirt to probe this info, I still feel like this should be done
> > > higher up in the virtualization stack, with libvirt again being just
> > > a tool that does what it's told. But I'd very much like to hear
> > > Dan's opinion, since besides libvirt he can cover openstack too.  
> >
> > I've learned from Gerd offline that remote connections are possible,
> > requiring maybe yet a different set of options, so I'm leaning even
> > further in the direction that libvirt can really only provide the user
> > with options, but cannot reasonably infer the intentions of the user's
> > configuration even if device capabilities were exposed.  Thanks,  
> 
> Agreed, this would end up being extremely policy-based, but like Daniel, I'm
> really not sure whether these can be determined in an automated way on any
> level. Sure, ovirt could present a set of contextual menus so a 'human' user
> could make the call (even a wrong one for that matter), but not so much
> openstack I guess.

Perhaps the idea of a local display really has no place in either an
ovirt or openstack configuration, so if everything works with GL and
SPICE will use something GL compatible (and *assuming* the overhead of
enabling that thing is trivial), perhaps data center management tools
would simply always direct libvirt to use such a configuration.  They'd
then need to know whether display is supported, or have things wired
such that the current default of display=auto will always work when
it's available.
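
For illustration, such a GL-capable SPICE configuration maps to
something like the following on the QEMU command line (a sketch under
current option spellings; the mdev UUID and socket path are
placeholders, and the rest of the VM config is elided):

  qemu-system-x86_64 \
    ... \
    -spice unix,addr=/run/vm.spice,disable-ticketing,gl=on \
    -device vfio-pci,sysfsdev=/sys/bus/mdev/devices/<uuid>,display=auto

Thanks,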

Alex

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [libvirt] Expose vfio device display/migration to libvirt and above, was Re: [PATCH 0/3] sample: vfio mdev display devices.
  2018-05-07  6:25                         ` Gerd Hoffmann
@ 2018-07-20  4:56                           ` Yuan, Hang
  2018-08-08  7:43                             ` Gerd Hoffmann
  0 siblings, 1 reply; 41+ messages in thread
From: Yuan, Hang @ 2018-07-20  4:56 UTC (permalink / raw)
  To: Gerd Hoffmann, Alex Williamson
  Cc: Neo Jia, kvm, Erik Skultety, libvirt, Dr. David Alan Gilbert,
	Zhang, Tina, Kirti Wankhede, Laine Stump, Jiri Denemark,
	intel-gvt-dev

Hi Gerd,

May I ask about the status of the boot display support work? I'm interested in trying it in some real use cases.

Thanks,
Henry

> -----Original Message-----
> From: intel-gvt-dev [mailto:intel-gvt-dev-bounces@lists.freedesktop.org] On
> Behalf Of Gerd Hoffmann
> Sent: Monday, May 7, 2018 2:26 PM
> To: Alex Williamson <alex.williamson@redhat.com>
> Cc: Neo Jia <cjia@nvidia.com>; kvm@vger.kernel.org; Erik Skultety
> <eskultet@redhat.com>; libvirt <libvir-list@redhat.com>; Dr. David Alan
> Gilbert <dgilbert@redhat.com>; Zhang, Tina <tina.zhang@intel.com>; Kirti
> Wankhede <kwankhede@nvidia.com>; Laine Stump <laine@redhat.com>;
> Daniel P. Berrange <berrange@redhat.com>; Jiri Denemark
> <jdenemar@redhat.com>; intel-gvt-dev@lists.freedesktop.org
> Subject: Re: Expose vfio device display/migration to libvirt and above, was Re:
> [PATCH 0/3] sample: vfio mdev display devices.
> 
>   Hi,
> 
> > This raises another question, is the configuration of the emulated
> > graphics a factor in the handling the mdev device's display option?
> > AFAIK, neither vGPU vendor provides a VBIOS for boot graphics, so even
> > with a display option, we're mostly targeting a secondary graphics
> > head, otherwise the user will be running headless until the guest OS
> > drivers initialize.
> 
> Right now yes, no boot display for vgpu devices.  I'm trying to fix that with
> ramfb.  There are a bunch of rough edges still and details to hashed out.  It'll
> probably be uefi only.
> 
> cheers,
>   Gerd

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [libvirt] Expose vfio device display/migration to libvirt and above, was Re: [PATCH 0/3] sample: vfio mdev display devices.
  2018-07-20  4:56                           ` Yuan, Hang
@ 2018-08-08  7:43                             ` Gerd Hoffmann
  0 siblings, 0 replies; 41+ messages in thread
From: Gerd Hoffmann @ 2018-08-08  7:43 UTC (permalink / raw)
  To: Yuan, Hang
  Cc: Neo Jia, kvm, Erik Skultety, libvirt, Dr. David Alan Gilbert,
	Kirti Wankhede, Zhang, Tina, Laine Stump, Jiri Denemark,
	intel-gvt-dev

On Fri, Jul 20, 2018 at 04:56:15AM +0000, Yuan, Hang wrote:
> Hi Gerd,
> 
> Can I know your status on the boot display support work? I'm interested to try it in some real use cases.

https://git.kraxel.org/cgit/qemu/log/?h=sirius/ramfb-vfio

Most of the bits needed (general ramfb support) are merged upstream and will be in 3.0.

Wiring up ramfb for vfio display devices is in the branch listed above
and should follow in 3.1.
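
With the branch, trying it should amount to flipping on the ramfb
property of vfio-pci (assuming the property name from the branch; it
may still change before merge), roughly:

  -device vfio-pci,sysfsdev=/sys/bus/mdev/devices/<uuid>,display=on,ramfb=on

Note this will likely be UEFI-only, so the guest needs to boot with
OVMF.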

cheers,
  Gerd

^ permalink raw reply	[flat|nested] 41+ messages in thread

end of thread, other threads:[~2018-08-08  7:43 UTC | newest]

Thread overview: 41+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <20180409103513.8020-1-kraxel@redhat.com>
2018-04-09 10:35 ` [PATCH 1/3] sample: vfio mdev display - host device Gerd Hoffmann
2018-04-24  2:41   ` Alex Williamson
2018-04-24  6:29     ` Gerd Hoffmann
2018-04-09 10:35 ` [PATCH 2/3] sample: vfio mdev display - guest driver Gerd Hoffmann
2018-04-11 20:39   ` Bjorn Helgaas
2018-04-24  2:51   ` Alex Williamson
2018-04-25 21:03   ` Konrad Rzeszutek Wilk
2018-04-09 10:35 ` [PATCH 3/3] sample: vfio bochs vbe display (host device for bochs-drm) Gerd Hoffmann
2018-04-24  3:05   ` Alex Williamson
2018-04-18 18:31 ` [libvirt] [PATCH 0/3] sample: vfio mdev display devices Alex Williamson
2018-04-19  8:40   ` Gerd Hoffmann
2018-04-19 10:03     ` Zhenyu Wang
2018-04-19 14:20     ` Alex Williamson
2018-04-19 14:54     ` Paolo Bonzini
2018-04-23 21:40   ` Alex Williamson
2018-04-24  7:17     ` Gerd Hoffmann
2018-04-24 17:35       ` Alex Williamson
2018-04-25  9:49         ` Zhang, Tina
2018-04-24 19:50     ` Kirti Wankhede
2018-04-24 22:59       ` Alex Williamson
2018-04-25 15:30         ` Kirti Wankhede
2018-04-25 18:00           ` Alex Williamson
2018-04-25 19:52             ` Dr. David Alan Gilbert
2018-04-26 18:45               ` Kirti Wankhede
2018-04-26 18:55                 ` Dr. David Alan Gilbert
2018-04-27 17:21                   ` Alex Williamson
2018-05-03 18:58                   ` [libvirt] Expose vfio device display/migration to libvirt and above, was " Alex Williamson
2018-05-04  7:49                     ` Erik Skultety
2018-05-04 16:03                       ` Alex Williamson
2018-05-07  6:25                         ` Gerd Hoffmann
2018-07-20  4:56                           ` Yuan, Hang
2018-08-08  7:43                             ` Gerd Hoffmann
2018-05-10 11:00                         ` Erik Skultety
2018-05-10 15:57                           ` Alex Williamson
2018-05-04  9:16                     ` Daniel P. Berrangé
2018-05-04 17:06                       ` Alex Williamson
2018-05-07  6:15                     ` Gerd Hoffmann
2018-05-04  8:39                 ` [libvirt] " Erik Skultety
2018-04-26  3:44   ` Tian, Kevin
2018-04-26  6:14     ` Gerd Hoffmann
2018-04-26 15:44       ` Alex Williamson
