linuxppc-dev.lists.ozlabs.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] vfio: enabled and supported on power (v7)
       [not found] <20120821113534.GS29724@truffula.fritz.box>
@ 2012-09-04  7:33 ` Alexey Kardashevskiy
  2012-09-04  7:35   ` [PATCH] powerpc-powernv: added tce_get callback for powernv platform Alexey Kardashevskiy
                     ` (3 more replies)
  0 siblings, 4 replies; 25+ messages in thread
From: Alexey Kardashevskiy @ 2012-09-04  7:33 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, David Gibson
  Cc: Alexey Kardashevskiy, linuxppc-dev, Alex Williamson, Paul Mackerras

Cc: David Gibson <david@gibson.dropbear.id.au>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
 arch/powerpc/include/asm/iommu.h    |    3 +
 drivers/iommu/Kconfig               |    8 +
 drivers/vfio/Kconfig                |    6 +
 drivers/vfio/Makefile               |    1 +
 drivers/vfio/vfio_iommu_spapr_tce.c |  440 +++++++++++++++++++++++++++++++++++
 include/linux/vfio.h                |   29 +++
 6 files changed, 487 insertions(+)
 create mode 100644 drivers/vfio/vfio_iommu_spapr_tce.c

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index 957a83f..c64bce7 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -66,6 +66,9 @@ struct iommu_table {
 	unsigned long  it_halfpoint; /* Breaking point for small/large allocs */
 	spinlock_t     it_lock;      /* Protects it_map */
 	unsigned long *it_map;       /* A simple allocation bitmap for now */
+#ifdef CONFIG_IOMMU_API
+	struct iommu_group *it_group;
+#endif
 };
 
 struct scatterlist;
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index 3bd9fff..19cf2d9 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -162,4 +162,12 @@ config TEGRA_IOMMU_SMMU
 	  space through the SMMU (System Memory Management Unit)
 	  hardware included on Tegra SoCs.
 
+config SPAPR_TCE_IOMMU
+	bool "sPAPR TCE IOMMU Support"
+	depends on PPC_PSERIES
+	select IOMMU_API
+	help
+	  Enables bits of IOMMU API required by VFIO. The iommu_ops is
+	  still not implemented.
+
 endif # IOMMU_SUPPORT
diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
index 7cd5dec..b464687 100644
--- a/drivers/vfio/Kconfig
+++ b/drivers/vfio/Kconfig
@@ -3,10 +3,16 @@ config VFIO_IOMMU_TYPE1
 	depends on VFIO
 	default n
 
+config VFIO_IOMMU_SPAPR_TCE
+	tristate
+	depends on VFIO && SPAPR_TCE_IOMMU
+	default n
+
 menuconfig VFIO
 	tristate "VFIO Non-Privileged userspace driver framework"
 	depends on IOMMU_API
 	select VFIO_IOMMU_TYPE1 if X86
+	select VFIO_IOMMU_SPAPR_TCE if PPC_POWERNV
 	help
 	  VFIO provides a framework for secure userspace device drivers.
 	  See Documentation/vfio.txt for more details.
diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
index 2398d4a..72bfabc 100644
--- a/drivers/vfio/Makefile
+++ b/drivers/vfio/Makefile
@@ -1,3 +1,4 @@
 obj-$(CONFIG_VFIO) += vfio.o
 obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o
+obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o
 obj-$(CONFIG_VFIO_PCI) += pci/
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
new file mode 100644
index 0000000..21f1909
--- /dev/null
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -0,0 +1,440 @@
+/*
+ * VFIO: IOMMU DMA mapping support for TCE on POWER
+ *
+ * Copyright (C) 2012 IBM Corp.  All rights reserved.
+ *     Author: Alexey Kardashevskiy <aik@ozlabs.ru>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Derived from original vfio_iommu_x86.c:
+ * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
+ *     Author: Alex Williamson <alex.williamson@redhat.com>
+ */
+
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/err.h>
+#include <linux/vfio.h>
+#include <linux/spinlock.h>
+#include <asm/iommu.h>
+
+#define DRIVER_VERSION  "0.1"
+#define DRIVER_AUTHOR   "aik@ozlabs.ru"
+#define DRIVER_DESC     "VFIO IOMMU SPAPR TCE"
+
+
+/*
+ * SPAPR TCE API
+ */
+static void tce_free(struct iommu_table *tbl, unsigned long entry,
+		unsigned long tce)
+{
+	struct page *page = pfn_to_page(tce >> PAGE_SHIFT);
+
+	WARN_ON(!page);
+	if (page) {
+		if (tce & VFIO_SPAPR_TCE_WRITE)
+			SetPageDirty(page);
+		put_page(page);
+	}
+	ppc_md.tce_free(tbl, entry, 1);
+}
+
+static long tce_put(struct iommu_table *tbl,
+		unsigned long entry, uint64_t tce, uint32_t flags)
+{
+	int ret;
+	unsigned long oldtce, kva, offset;
+	struct page *page = NULL;
+	enum dma_data_direction direction = DMA_NONE;
+
+	switch (flags & VFIO_SPAPR_TCE_PUT_MASK) {
+	case VFIO_SPAPR_TCE_READ:
+		direction = DMA_TO_DEVICE;
+		break;
+	case VFIO_SPAPR_TCE_WRITE:
+		direction = DMA_FROM_DEVICE;
+		break;
+	case VFIO_SPAPR_TCE_BIDIRECTIONAL:
+		direction = DMA_BIDIRECTIONAL;
+		break;
+	}
+
+	oldtce = ppc_md.tce_get(tbl, entry);
+
+	/* Free page if still allocated */
+	if (oldtce & VFIO_SPAPR_TCE_PUT_MASK)
+		tce_free(tbl, entry, oldtce);
+
+	/* Map new TCE */
+	if (direction != DMA_NONE) {
+		offset = (tce & IOMMU_PAGE_MASK) - (tce & PAGE_MASK);
+		ret = get_user_pages_fast(tce & PAGE_MASK, 1,
+				direction != DMA_TO_DEVICE, &page);
+		BUG_ON(ret > 1);
+		if (ret < 1) {
+			printk(KERN_ERR "tce_vfio: get_user_pages_fast failed "
+					"tce=%llx ioba=%lx ret=%d\n",
+					tce, entry << IOMMU_PAGE_SHIFT, ret);
+			if (!ret)
+				ret = -EFAULT;
+			goto unlock_exit;
+		}
+
+		kva = (unsigned long) page_address(page);
+		kva += offset;
+		BUG_ON(!kva);
+		if (WARN_ON(kva & ~IOMMU_PAGE_MASK))
+			return -EINVAL;
+
+		/* Preserve access bits */
+		kva |= flags & VFIO_SPAPR_TCE_PUT_MASK;
+
+		/* tce_build receives a virtual address */
+		entry += tbl->it_offset;	/* Offset into real TCE table */
+		ret = ppc_md.tce_build(tbl, entry, 1, kva, direction, NULL);
+
+		/* tce_build() only returns non-zero for transient errors */
+		if (unlikely(ret)) {
+			printk(KERN_ERR "tce_vfio: Failed to add TCE\n");
+			ret = -EIO;
+			goto unlock_exit;
+		}
+	}
+	/* Flush/invalidate TLB caches if necessary */
+	if (ppc_md.tce_flush)
+		ppc_md.tce_flush(tbl);
+
+	/* Make sure updates are seen by hardware */
+	mb();
+
+unlock_exit:
+	if (ret && page)
+		put_page(page);
+
+	if (ret)
+		printk(KERN_ERR "tce_vfio: tce_put failed on tce=%llx "
+				"ioba=%lx kva=%lx\n", tce,
+				entry << IOMMU_PAGE_SHIFT, kva);
+	return ret;
+}
+
+/*
+ * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
+ */
+
+/*
+ * The container descriptor supports only a single group per container.
+ * Required by the API as the container is not supplied with the IOMMU group
+ * at the moment of initialization.
+ */
+struct tce_container {
+	struct iommu_table *tbl;
+};
+
+static void *tce_iommu_open(unsigned long arg)
+{
+	struct tce_container *container;
+
+	if (arg != VFIO_SPAPR_TCE_IOMMU) {
+		printk(KERN_ERR "tce_vfio: Wrong IOMMU type\n");
+		return ERR_PTR(-EINVAL);
+	}
+
+	container = kzalloc(sizeof(*container), GFP_KERNEL);
+	if (!container)
+		return ERR_PTR(-ENOMEM);
+
+	return container;
+}
+
+static void tce_iommu_release(void *iommu_data)
+{
+	struct tce_container *container = iommu_data;
+	struct iommu_table *tbl = container->tbl;
+	unsigned long i, tce;
+
+	/* Unmap leftovers */
+	spin_lock_irq(&tbl->it_lock);
+	for (i = tbl->it_offset; i < tbl->it_offset + tbl->it_size; ++i) {
+		tce = ppc_md.tce_get(tbl, i);
+		if (tce & VFIO_SPAPR_TCE_PUT_MASK)
+			tce_free(tbl, i, tce);
+	}
+	/* Flush/invalidate TLB caches if necessary */
+	if (ppc_md.tce_flush)
+		ppc_md.tce_flush(tbl);
+
+	/* Make sure updates are seen by hardware */
+	mb();
+
+	spin_unlock_irq(&tbl->it_lock);
+
+	kfree(container);
+}
+
+static long tce_iommu_ioctl(void *iommu_data,
+				 unsigned int cmd, unsigned long arg)
+{
+	struct tce_container *container = iommu_data;
+	unsigned long minsz;
+	long ret;
+
+	switch (cmd) {
+	case VFIO_CHECK_EXTENSION: {
+		return (arg == VFIO_SPAPR_TCE_IOMMU) ? 1 : 0;
+	}
+	case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
+		struct vfio_iommu_spapr_tce_info info;
+		struct iommu_table *tbl = container->tbl;
+
+		minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
+				dma64_window_size);
+
+		if (copy_from_user(&info, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (info.argsz < minsz)
+			return -EINVAL;
+
+		if (!tbl)
+			return -ENXIO;
+
+		info.dma32_window_start = tbl->it_offset << IOMMU_PAGE_SHIFT;
+		info.dma32_window_size = tbl->it_size << IOMMU_PAGE_SHIFT;
+		info.dma64_window_start = 0;
+		info.dma64_window_size = 0;
+		info.flags = 0;
+
+		return copy_to_user((void __user *)arg, &info, minsz);
+	}
+	case VFIO_IOMMU_SPAPR_TCE_PUT: {
+		struct vfio_iommu_spapr_tce_put par;
+		struct iommu_table *tbl = container->tbl;
+
+		minsz = offsetofend(struct vfio_iommu_spapr_tce_put, tce);
+
+		if (copy_from_user(&par, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (par.argsz < minsz)
+			return -EINVAL;
+
+		if (!tbl) {
+			return -ENXIO;
+		}
+
+		spin_lock_irq(&tbl->it_lock);
+		ret = tce_put(tbl, par.ioba >> IOMMU_PAGE_SHIFT,
+				par.tce, par.flags);
+		spin_unlock_irq(&tbl->it_lock);
+
+		return ret;
+	}
+	default:
+		printk(KERN_WARNING "tce_vfio: unexpected cmd %x\n", cmd);
+	}
+
+	return -ENOTTY;
+}
+
+static int tce_iommu_attach_group(void *iommu_data,
+		struct iommu_group *iommu_group)
+{
+	struct tce_container *container = iommu_data;
+	struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
+
+	printk(KERN_DEBUG "tce_vfio: Attaching group #%u to iommu %p\n",
+			iommu_group_id(iommu_group), iommu_group);
+	if (container->tbl) {
+		printk(KERN_WARNING "tce_vfio: Only one group per IOMMU "
+				"container is allowed, "
+				"existing id=%d, attaching id=%d\n",
+				iommu_group_id(container->tbl->it_group),
+				iommu_group_id(iommu_group));
+		return -EBUSY;
+	}
+
+	container->tbl = tbl;
+
+	return 0;
+}
+
+static void tce_iommu_detach_group(void *iommu_data,
+		struct iommu_group *iommu_group)
+{
+	struct tce_container *container = iommu_data;
+	struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
+
+	BUG_ON(!tbl);
+	if (tbl != container->tbl) {
+		printk(KERN_WARNING "tce_vfio: detaching group #%u, expected "
+				"group is #%u\n", iommu_group_id(iommu_group),
+				iommu_group_id(tbl->it_group));
+		return;
+	}
+	printk(KERN_DEBUG "tce_vfio: detaching group #%u from iommu %p\n",
+			iommu_group_id(iommu_group), iommu_group);
+}
+
+const struct vfio_iommu_driver_ops tce_iommu_driver_ops = {
+	.name		= "iommu-vfio-powerpc",
+	.owner		= THIS_MODULE,
+	.open		= tce_iommu_open,
+	.release	= tce_iommu_release,
+	.ioctl		= tce_iommu_ioctl,
+	.attach_group	= tce_iommu_attach_group,
+	.detach_group	= tce_iommu_detach_group,
+};
+
+/*
+ * Add/delete devices support (hotplug, module_init, module_exit)
+ */
+static int add_device(struct device *dev)
+{
+	struct iommu_table *tbl;
+	int ret = 0;
+
+	if (dev->iommu_group) {
+		printk(KERN_WARNING "tce_vfio: device %s is already in iommu "
+				"group %d, skipping\n", dev->kobj.name,
+				iommu_group_id(dev->iommu_group));
+		return -EBUSY;
+	}
+
+	tbl = get_iommu_table_base(dev);
+	if (!tbl) {
+		printk(KERN_DEBUG "tce_vfio: skipping device %s with no tbl\n",
+				dev->kobj.name);
+		return 0;
+	}
+
+	printk(KERN_DEBUG "tce_vfio: adding %s to iommu group %d\n",
+			dev->kobj.name, iommu_group_id(tbl->it_group));
+
+	ret = iommu_group_add_device(tbl->it_group, dev);
+	if (ret < 0)
+		printk(KERN_ERR "tce_vfio: %s has not been added, ret=%d\n",
+				dev->kobj.name, ret);
+
+	return ret;
+}
+
+static void del_device(struct device *dev)
+{
+	iommu_group_remove_device(dev);
+}
+
+static int iommu_bus_notifier(struct notifier_block *nb,
+			      unsigned long action, void *data)
+{
+	struct device *dev = data;
+
+	switch (action) {
+	case BUS_NOTIFY_ADD_DEVICE:
+		return add_device(dev);
+	case BUS_NOTIFY_DEL_DEVICE:
+		del_device(dev);
+		return 0;
+	default:
+		return 0;
+	}
+}
+
+static struct notifier_block tce_iommu_bus_nb = {
+	.notifier_call = iommu_bus_notifier,
+};
+
+void group_release(void *iommu_data)
+{
+	struct iommu_table *tbl = iommu_data;
+	tbl->it_group = NULL;
+}
+
+static int __init tce_iommu_init(void)
+{
+	struct pci_dev *pdev = NULL;
+	struct iommu_table *tbl;
+	struct iommu_group *grp;
+
+	/* If the current platform does not support tce_get
+	   we are unable to clean TCE table properly and
+	   therefore it is better not to touch it at all */
+	if (!ppc_md.tce_get) {
+		printk(KERN_ERR "tce_vfio: ppc_md.tce_get isn't implemented\n");
+		return -EOPNOTSUPP;
+	}
+
+	bus_register_notifier(&pci_bus_type, &tce_iommu_bus_nb);
+
+	/* Allocate and initialize VFIO groups */
+	for_each_pci_dev(pdev) {
+		tbl = get_iommu_table_base(&pdev->dev);
+		if (!tbl)
+			continue;
+
+		/* Skip already initialized */
+		if (tbl->it_group)
+			continue;
+
+		grp = iommu_group_alloc();
+		if (IS_ERR(grp)) {
+			printk(KERN_INFO "tce_vfio: cannot create "
+					"new IOMMU group, ret=%ld\n",
+					PTR_ERR(grp));
+			return -EFAULT;
+		}
+		tbl->it_group = grp;
+		iommu_group_set_iommudata(grp, tbl, group_release);
+	}
+
+	/* Add PCI devices to VFIO groups */
+	for_each_pci_dev(pdev)
+		add_device(&pdev->dev);
+
+	return vfio_register_iommu_driver(&tce_iommu_driver_ops);
+}
+
+static void __exit tce_iommu_cleanup(void)
+{
+	struct pci_dev *pdev = NULL;
+	struct iommu_table *tbl;
+	struct iommu_group *grp = NULL;
+
+	bus_unregister_notifier(&pci_bus_type, &tce_iommu_bus_nb);
+
+	/* Delete PCI devices from VFIO groups */
+	for_each_pci_dev(pdev)
+		del_device(&pdev->dev);
+
+	/* Release VFIO groups */
+	for_each_pci_dev(pdev) {
+		tbl = get_iommu_table_base(&pdev->dev);
+		if (!tbl)
+			continue;
+		grp = tbl->it_group;
+
+		/* Skip (already) uninitialized */
+		if (!grp)
+			continue;
+
+		/* Do actual release, group_release() is expected to work */
+		iommu_group_put(grp);
+		BUG_ON(tbl->it_group);
+	}
+
+	vfio_unregister_iommu_driver(&tce_iommu_driver_ops);
+}
+
+module_init(tce_iommu_init);
+module_exit(tce_iommu_cleanup);
+
+MODULE_VERSION(DRIVER_VERSION);
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR(DRIVER_AUTHOR);
+MODULE_DESCRIPTION(DRIVER_DESC);
+
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index 0a4f180..2c0a927 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -99,6 +99,7 @@ extern void vfio_unregister_iommu_driver(
 /* Extensions */
 
 #define VFIO_TYPE1_IOMMU		1
+#define VFIO_SPAPR_TCE_IOMMU		2
 
 /*
  * The IOCTL interface is designed for extensibility by embedding the
@@ -442,4 +443,32 @@ struct vfio_iommu_type1_dma_unmap {
 
 #define VFIO_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14)
 
+/* -------- API for SPAPR TCE (Server POWERPC) IOMMU -------- */
+
+struct vfio_iommu_spapr_tce_info {
+	__u32 argsz;
+	__u32 flags;
+	__u32 dma32_window_start;
+	__u32 dma32_window_size;
+	__u64 dma64_window_start;
+	__u64 dma64_window_size;
+};
+
+#define VFIO_IOMMU_SPAPR_TCE_GET_INFO	_IO(VFIO_TYPE, VFIO_BASE + 12)
+
+struct vfio_iommu_spapr_tce_put {
+	__u32 argsz;
+	__u32 flags;
+#define VFIO_SPAPR_TCE_READ		1
+#define VFIO_SPAPR_TCE_WRITE		2
+#define VFIO_SPAPR_TCE_BIDIRECTIONAL	(VFIO_SPAPR_TCE_READ|VFIO_SPAPR_TCE_WRITE)
+#define VFIO_SPAPR_TCE_PUT_MASK		VFIO_SPAPR_TCE_BIDIRECTIONAL
+	__u64 ioba;
+	__u64 tce;
+};
+
+#define VFIO_IOMMU_SPAPR_TCE_PUT	_IO(VFIO_TYPE, VFIO_BASE + 13)
+
+/* ***************************************************************** */
+
 #endif /* VFIO_H */
-- 
1.7.10.4

^ permalink raw reply related	[flat|nested] 25+ messages in thread

* [PATCH] powerpc-powernv: added tce_get callback for powernv platform
  2012-09-04  7:33 ` [PATCH] vfio: enabled and supported on power (v7) Alexey Kardashevskiy
@ 2012-09-04  7:35   ` Alexey Kardashevskiy
  2012-09-04 19:41     ` Benjamin Herrenschmidt
  2012-09-04  7:36   ` [PATCH] powerpc-kvm: fixing page alignment for TCE Alexey Kardashevskiy
                     ` (2 subsequent siblings)
  3 siblings, 1 reply; 25+ messages in thread
From: Alexey Kardashevskiy @ 2012-09-04  7:35 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: Alexey Kardashevskiy, linuxppc-dev, Paul Mackerras, David Gibson

The upcoming VFIO support requires a way to know which
entry in the TCE map is not empty in order to do cleanup
at QEMU exit/crash. This patch adds such functionality
to POWERNV platform code.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
 arch/powerpc/platforms/powernv/pci.c |    6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index be3cfc5..61f8068 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -447,6 +447,11 @@ static void pnv_tce_free(struct iommu_table *tbl, long index, long npages)
 		pnv_tce_invalidate(tbl, tces, tcep - 1);
 }
 
+static unsigned long pnv_tce_get(struct iommu_table *tbl, long index)
+{
+	return ((u64 *)tbl->it_base)[index - tbl->it_offset] & IOMMU_PAGE_MASK;
+}
+
 void pnv_pci_setup_iommu_table(struct iommu_table *tbl,
 			       void *tce_mem, u64 tce_size,
 			       u64 dma_offset)
@@ -597,6 +602,7 @@ void __init pnv_pci_init(void)
 	ppc_md.pci_dma_dev_setup = pnv_pci_dma_dev_setup;
 	ppc_md.tce_build = pnv_tce_build;
 	ppc_md.tce_free = pnv_tce_free;
+	ppc_md.tce_get = pnv_tce_get;
 	ppc_md.pci_probe_mode = pnv_pci_probe_mode;
 	set_pci_dma_ops(&dma_iommu_ops);
 
-- 
1.7.10.4

^ permalink raw reply related	[flat|nested] 25+ messages in thread

* [PATCH] powerpc-kvm: fixing page alignment for TCE
  2012-09-04  7:33 ` [PATCH] vfio: enabled and supported on power (v7) Alexey Kardashevskiy
  2012-09-04  7:35   ` [PATCH] powerpc-powernv: added tce_get callback for powernv platform Alexey Kardashevskiy
@ 2012-09-04  7:36   ` Alexey Kardashevskiy
  2012-09-20  9:01     ` Alexander Graf
  2012-09-04  7:36   ` [PATCH] powerpc-powernv: align BARs to PAGE_SIZE on powernv platform Alexey Kardashevskiy
  2012-09-10 16:02   ` [PATCH] vfio: enabled and supported on power (v7) Alex Williamson
  3 siblings, 1 reply; 25+ messages in thread
From: Alexey Kardashevskiy @ 2012-09-04  7:36 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: Alexey Kardashevskiy, linuxppc-dev, Paul Mackerras, kvm-ppc,
	David Gibson

From: Paul Mackerras <paulus@samba.org>

TODO: ask Paul to make a proper message.

This is the fix for a host kernel compiled with a page size
other than 4K (TCE page size). In the case of a 64K page size,
the host used to lose address bits in hpte_rpn().
The patch fixes it.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
 arch/powerpc/kvm/book3s_64_mmu_hv.c |    9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 80a5775..a41f11b 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -503,7 +503,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	struct kvm *kvm = vcpu->kvm;
 	unsigned long *hptep, hpte[3], r;
 	unsigned long mmu_seq, psize, pte_size;
-	unsigned long gfn, hva, pfn;
+	unsigned long gpa, gfn, hva, pfn;
 	struct kvm_memory_slot *memslot;
 	unsigned long *rmap;
 	struct revmap_entry *rev;
@@ -541,15 +541,14 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 
 	/* Translate the logical address and get the page */
 	psize = hpte_page_size(hpte[0], r);
-	gfn = hpte_rpn(r, psize);
+	gpa = (r & HPTE_R_RPN & ~(psize - 1)) | (ea & (psize - 1));
+	gfn = gpa >> PAGE_SHIFT;
 	memslot = gfn_to_memslot(kvm, gfn);
 
 	/* No memslot means it's an emulated MMIO region */
-	if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) {
-		unsigned long gpa = (gfn << PAGE_SHIFT) | (ea & (psize - 1));
+	if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID))
 		return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea,
 					      dsisr & DSISR_ISSTORE);
-	}
 
 	if (!kvm->arch.using_mmu_notifiers)
 		return -EFAULT;		/* should never get here */
-- 
1.7.10.4

^ permalink raw reply related	[flat|nested] 25+ messages in thread

* [PATCH] powerpc-powernv: align BARs to PAGE_SIZE on powernv platform
  2012-09-04  7:33 ` [PATCH] vfio: enabled and supported on power (v7) Alexey Kardashevskiy
  2012-09-04  7:35   ` [PATCH] powerpc-powernv: added tce_get callback for powernv platform Alexey Kardashevskiy
  2012-09-04  7:36   ` [PATCH] powerpc-kvm: fixing page alignment for TCE Alexey Kardashevskiy
@ 2012-09-04  7:36   ` Alexey Kardashevskiy
  2012-09-04 19:45     ` Benjamin Herrenschmidt
  2012-09-10 16:02   ` [PATCH] vfio: enabled and supported on power (v7) Alex Williamson
  3 siblings, 1 reply; 25+ messages in thread
From: Alexey Kardashevskiy @ 2012-09-04  7:36 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: Alexey Kardashevskiy, linuxppc-dev, Paul Mackerras, David Gibson

VFIO adds a separate memory region for every BAR and tries
to mmap() it to provide direct BAR mapping to the guest.
If it succeeds, QEMU registers this address with kvm_set_phys_mem().
However it is not always possible because such a BAR should
be host page size aligned. In this case VFIO uses the "slow" path
and emulates BAR access in QEMU.

In order to avoid "slow" path, BARs have to be PAGE_SIZE aligned
in the host kernel and this is what the patch does.

The patch adds powernv platform specific hook which makes all
BARs sizes 64K aligned. The pci_reassigndev_resource_alignment()
function from drivers/pci/pci.c has been used as a reference.

This is purely an optimization patch; things will work without
it, just a bit slower.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
 arch/powerpc/platforms/powernv/setup.c |   26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c
index db1ad1c..331838e 100644
--- a/arch/powerpc/platforms/powernv/setup.c
+++ b/arch/powerpc/platforms/powernv/setup.c
@@ -25,6 +25,7 @@
 #include <linux/of.h>
 #include <linux/interrupt.h>
 #include <linux/bug.h>
+#include <linux/pci.h>
 
 #include <asm/machdep.h>
 #include <asm/firmware.h>
@@ -179,6 +180,30 @@ static int __init pnv_probe(void)
 	return 1;
 }
 
+static void pnv_pcibios_fixup_resources(struct pci_dev *pdev)
+{
+	struct resource *r;
+	int i;
+
+	/*
+	 * Aligning resources to PAGE_SIZE in order to
+	 * support "fast" path for PCI BAR access under VFIO
+	 * which maps every BAR individually to the guest
+	 * so BARs have to be PAGE aligned.
+	 */
+	for (i = 0; i <= PCI_ROM_RESOURCE; i++) {
+		r = &pdev->resource[i];
+		if (!r->flags)
+			continue;
+		pr_debug("powernv: %s, aligning BAR#%d %llx..%llx",
+			pdev->dev.kobj.name, i, r->start, r->end);
+		r->end = PAGE_ALIGN(r->end - r->start + 1) - 1;
+		r->start = 0;
+		r->flags |= IORESOURCE_UNSET;
+		pr_debug(" to  %llx..%llx\n", r->start, r->end);
+	}
+}
+
 define_machine(powernv) {
 	.name			= "PowerNV",
 	.probe			= pnv_probe,
@@ -189,6 +214,7 @@ define_machine(powernv) {
 	.progress		= pnv_progress,
 	.power_save             = power7_idle,
 	.calibrate_decr		= generic_calibrate_decr,
+	.pcibios_fixup_resources= pnv_pcibios_fixup_resources,
 #ifdef CONFIG_KEXEC
 	.kexec_cpu_down		= pnv_kexec_cpu_down,
 #endif
-- 
1.7.10.4

^ permalink raw reply related	[flat|nested] 25+ messages in thread

* Re: [PATCH] powerpc-powernv: added tce_get callback for powernv platform
  2012-09-04  7:35   ` [PATCH] powerpc-powernv: added tce_get callback for powernv platform Alexey Kardashevskiy
@ 2012-09-04 19:41     ` Benjamin Herrenschmidt
  2012-09-04 22:35       ` David Gibson
  2012-09-05  0:19       ` Alexey Kardashevskiy
  0 siblings, 2 replies; 25+ messages in thread
From: Benjamin Herrenschmidt @ 2012-09-04 19:41 UTC (permalink / raw)
  To: Alexey Kardashevskiy; +Cc: linuxppc-dev, Paul Mackerras, David Gibson

On Tue, 2012-09-04 at 17:35 +1000, Alexey Kardashevskiy wrote:
> The upcoming VFIO support requires a way to know which
> entry in the TCE map is not empty in order to do cleanup
> at QEMU exit/crash. This patch adds such functionality
> to POWERNV platform code.
> 
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> ---
>  arch/powerpc/platforms/powernv/pci.c |    6 ++++++
>  1 file changed, 6 insertions(+)
> 
> diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
> index be3cfc5..61f8068 100644
> --- a/arch/powerpc/platforms/powernv/pci.c
> +++ b/arch/powerpc/platforms/powernv/pci.c
> @@ -447,6 +447,11 @@ static void pnv_tce_free(struct iommu_table *tbl, long index, long npages)
>  		pnv_tce_invalidate(tbl, tces, tcep - 1);
>  }
>  
> +static unsigned long pnv_tce_get(struct iommu_table *tbl, long index)
> +{
> +	return ((u64 *)tbl->it_base)[index - tbl->it_offset] & IOMMU_PAGE_MASK;
> +}

Why the masking here ?

Cheers,
Ben.

>  void pnv_pci_setup_iommu_table(struct iommu_table *tbl,
>  			       void *tce_mem, u64 tce_size,
>  			       u64 dma_offset)
> @@ -597,6 +602,7 @@ void __init pnv_pci_init(void)
>  	ppc_md.pci_dma_dev_setup = pnv_pci_dma_dev_setup;
>  	ppc_md.tce_build = pnv_tce_build;
>  	ppc_md.tce_free = pnv_tce_free;
> +	ppc_md.tce_get = pnv_tce_get;
>  	ppc_md.pci_probe_mode = pnv_pci_probe_mode;
>  	set_pci_dma_ops(&dma_iommu_ops);
>  

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] powerpc-powernv: align BARs to PAGE_SIZE on powernv platform
  2012-09-04  7:36   ` [PATCH] powerpc-powernv: align BARs to PAGE_SIZE on powernv platform Alexey Kardashevskiy
@ 2012-09-04 19:45     ` Benjamin Herrenschmidt
  2012-09-05  0:55       ` Alexey Kardashevskiy
  0 siblings, 1 reply; 25+ messages in thread
From: Benjamin Herrenschmidt @ 2012-09-04 19:45 UTC (permalink / raw)
  To: Alexey Kardashevskiy
  Cc: linuxppc-dev, Alex Williamson, Paul Mackerras, David Gibson

On Tue, 2012-09-04 at 17:36 +1000, Alexey Kardashevskiy wrote:
> VFIO adds a separate memory region for every BAR and tries
> to mmap() it to provide direct BAR mapping to the guest.
> If it succeedes, QEMU registers this address with kvm_set_phys_mem().
> However it is not always possible because such a BAR should
> be host page size aligned. In this case VFIO uses "slow" path
> and emulated BAR access in QEMU.
> 
> In order to avoid "slow" path, BARs have to be PAGE_SIZE aligned
> in the host kernel and this is what the patch does.
> 
> The patch adds powernv platform specific hook which makes all
> BARs sizes 64K aligned. The pci_reassigndev_resource_alignment()
> function from drivers/pci/pci.c has been used as a reference.
> 
> This is purely an optimization patch, the things will work without
> it, just a bit slower.

It's still bad in more ways than I care to explain...

The main one is that you do the "fixup" in a very wrong place anyway and
it might cause cases of overlapping BARs.

In any case this is wrong. It's a VFIO design bug and needs to be fixed
there (CC'ing Alex).

IE. We need a way to know where the BAR is within a page at which point
VFIO can still map the page, but can also properly take into account the
offset.

We also need a way to tell VFIO userspace that it's OK to use the fast
path for such small BARs. It's not for all host platforms. We know it's
ok for PowerNV because we know the devices are grouped by PEs and the PE
granularity is larger than a page but that's not necessarily going to be
the case on all powerpc platforms that support KVM.

Cheers,
Ben.

> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> ---
>  arch/powerpc/platforms/powernv/setup.c |   26 ++++++++++++++++++++++++++
>  1 file changed, 26 insertions(+)
> 
> diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c
> index db1ad1c..331838e 100644
> --- a/arch/powerpc/platforms/powernv/setup.c
> +++ b/arch/powerpc/platforms/powernv/setup.c
> @@ -25,6 +25,7 @@
>  #include <linux/of.h>
>  #include <linux/interrupt.h>
>  #include <linux/bug.h>
> +#include <linux/pci.h>
>  
>  #include <asm/machdep.h>
>  #include <asm/firmware.h>
> @@ -179,6 +180,30 @@ static int __init pnv_probe(void)
>  	return 1;
>  }
>  
> +static void pnv_pcibios_fixup_resources(struct pci_dev *pdev)
> +{
> +	struct resource *r;
> +	int i;
> +
> +	/*
> +	 * Aligning resources to PAGE_SIZE in order to
> +	 * support "fast" path for PCI BAR access under VFIO
> +	 * which maps every BAR individually to the guest
> +	 * so BARs have to be PAGE aligned.
> +	 */
> +	for (i = 0; i <= PCI_ROM_RESOURCE; i++) {
> +		r = &pdev->resource[i];
> +		if (!r->flags)
> +			continue;
> +		pr_debug("powernv: %s, aligning BAR#%d %llx..%llx",
> +			pdev->dev.kobj.name, i, r->start, r->end);
> +		r->end = PAGE_ALIGN(r->end - r->start + 1) - 1;
> +		r->start = 0;
> +		r->flags |= IORESOURCE_UNSET;
> +		pr_debug(" to  %llx..%llx\n", r->start, r->end);
> +	}
> +}
> +
>  define_machine(powernv) {
>  	.name			= "PowerNV",
>  	.probe			= pnv_probe,
> @@ -189,6 +214,7 @@ define_machine(powernv) {
>  	.progress		= pnv_progress,
>  	.power_save             = power7_idle,
>  	.calibrate_decr		= generic_calibrate_decr,
> +	.pcibios_fixup_resources= pnv_pcibios_fixup_resources,
>  #ifdef CONFIG_KEXEC
>  	.kexec_cpu_down		= pnv_kexec_cpu_down,
>  #endif

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] powerpc-powernv: added tce_get callback for powernv platform
  2012-09-04 19:41     ` Benjamin Herrenschmidt
@ 2012-09-04 22:35       ` David Gibson
  2012-09-05  0:19       ` Alexey Kardashevskiy
  1 sibling, 0 replies; 25+ messages in thread
From: David Gibson @ 2012-09-04 22:35 UTC (permalink / raw)
  To: Benjamin Herrenschmidt; +Cc: Alexey Kardashevskiy, Paul Mackerras, linuxppc-dev

On Wed, Sep 05, 2012 at 05:41:42AM +1000, Benjamin Herrenschmidt wrote:
> On Tue, 2012-09-04 at 17:35 +1000, Alexey Kardashevskiy wrote:
> > The upcoming VFIO support requires a way to know which
> > entry in the TCE map is not empty in order to do cleanup
> > at QEMU exit/crash. This patch adds such functionality
> > to POWERNV platform code.
> > 
> > Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> > ---
> >  arch/powerpc/platforms/powernv/pci.c |    6 ++++++
> >  1 file changed, 6 insertions(+)
> > 
> > diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
> > index be3cfc5..61f8068 100644
> > --- a/arch/powerpc/platforms/powernv/pci.c
> > +++ b/arch/powerpc/platforms/powernv/pci.c
> > @@ -447,6 +447,11 @@ static void pnv_tce_free(struct iommu_table *tbl, long index, long npages)
> >  		pnv_tce_invalidate(tbl, tces, tcep - 1);
> >  }
> >  
> > +static unsigned long pnv_tce_get(struct iommu_table *tbl, long index)
> > +{
> > +	return ((u64 *)tbl->it_base)[index - tbl->it_offset] & IOMMU_PAGE_MASK;
> > +}
> 
> Why the masking here ?

Yes.  Especially since you're masking out the permission bits which
are actually the ones you want to determine if a TCE is empty or not.

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] powerpc-powernv: added tce_get callback for powernv platform
  2012-09-04 19:41     ` Benjamin Herrenschmidt
  2012-09-04 22:35       ` David Gibson
@ 2012-09-05  0:19       ` Alexey Kardashevskiy
  2012-09-05  0:32         ` Benjamin Herrenschmidt
  1 sibling, 1 reply; 25+ messages in thread
From: Alexey Kardashevskiy @ 2012-09-05  0:19 UTC (permalink / raw)
  To: Benjamin Herrenschmidt; +Cc: linuxppc-dev, Paul Mackerras, David Gibson

On 05/09/12 05:41, Benjamin Herrenschmidt wrote:
> On Tue, 2012-09-04 at 17:35 +1000, Alexey Kardashevskiy wrote:
>> The upcoming VFIO support requires a way to know which
>> entry in the TCE map is not empty in order to do cleanup
>> at QEMU exit/crash. This patch adds such functionality
>> to POWERNV platform code.
>>
>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>> ---
>>   arch/powerpc/platforms/powernv/pci.c |    6 ++++++
>>   1 file changed, 6 insertions(+)
>>
>> diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
>> index be3cfc5..61f8068 100644
>> --- a/arch/powerpc/platforms/powernv/pci.c
>> +++ b/arch/powerpc/platforms/powernv/pci.c
>> @@ -447,6 +447,11 @@ static void pnv_tce_free(struct iommu_table *tbl, long index, long npages)
>>   		pnv_tce_invalidate(tbl, tces, tcep - 1);
>>   }
>>
>> +static unsigned long pnv_tce_get(struct iommu_table *tbl, long index)
>> +{
>> +	return ((u64 *)tbl->it_base)[index - tbl->it_offset] & IOMMU_PAGE_MASK;
>> +}
>
> Why the masking here ?


Oops. No reason. Will remove.


>
> Cheers,
> Ben.
>
>>   void pnv_pci_setup_iommu_table(struct iommu_table *tbl,
>>   			       void *tce_mem, u64 tce_size,
>>   			       u64 dma_offset)
>> @@ -597,6 +602,7 @@ void __init pnv_pci_init(void)
>>   	ppc_md.pci_dma_dev_setup = pnv_pci_dma_dev_setup;
>>   	ppc_md.tce_build = pnv_tce_build;
>>   	ppc_md.tce_free = pnv_tce_free;
>> +	ppc_md.tce_get = pnv_tce_get;
>>   	ppc_md.pci_probe_mode = pnv_pci_probe_mode;
>>   	set_pci_dma_ops(&dma_iommu_ops);
>>
>
>


-- 
Alexey

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] powerpc-powernv: added tce_get callback for powernv platform
  2012-09-05  0:19       ` Alexey Kardashevskiy
@ 2012-09-05  0:32         ` Benjamin Herrenschmidt
  0 siblings, 0 replies; 25+ messages in thread
From: Benjamin Herrenschmidt @ 2012-09-05  0:32 UTC (permalink / raw)
  To: Alexey Kardashevskiy; +Cc: linuxppc-dev, Paul Mackerras, David Gibson

On Wed, 2012-09-05 at 10:19 +1000, Alexey Kardashevskiy wrote:
> >> +static unsigned long pnv_tce_get(struct iommu_table *tbl, long
> index)
> >> +{
> >> +    return ((u64 *)tbl->it_base)[index - tbl->it_offset] &
> IOMMU_PAGE_MASK;
> >> +}
> >
> > Why the masking here ?
> 
> 
> Oops. No reason. Will remove.

Right. The caller wants to know both whether the low bits are set and
whether there's an address set up.

On the H_PUT_TCE path, you want to make sure:

 - If any of the low bit is set, set the TCE entry & get_page()
 - If none, then clear the whole entry (ignore the high bits passed by
the guest) and maybe put_page() the old page

IE the TCE either contains a valid page address + low bit(s) or all 0

That way, on the cleanup path, you can check the low bits only to decide
whether to cleanup, and if any is set, you know both your direction
(writeable vs. read only) and whether something was there at all.

You do not want to ever compare the high bits (address) to 0. While we
never do it in practice I suspect, there's no fundamental reason why a
physical address of 0 is incorrect in a TCE.

Cheers,
Ben.
 

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] powerpc-powernv: align BARs to PAGE_SIZE on powernv platform
  2012-09-04 19:45     ` Benjamin Herrenschmidt
@ 2012-09-05  0:55       ` Alexey Kardashevskiy
  2012-09-05  1:16         ` Benjamin Herrenschmidt
  0 siblings, 1 reply; 25+ messages in thread
From: Alexey Kardashevskiy @ 2012-09-05  0:55 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: linuxppc-dev, Alex Williamson, Paul Mackerras, David Gibson

On 05/09/12 05:45, Benjamin Herrenschmidt wrote:
> On Tue, 2012-09-04 at 17:36 +1000, Alexey Kardashevskiy wrote:
>> VFIO adds a separate memory region for every BAR and tries
>> to mmap() it to provide direct BAR mapping to the guest.
>> If it succeeds, QEMU registers this address with kvm_set_phys_mem().
>> However it is not always possible because such a BAR should
>> be host page size aligned. In this case VFIO uses "slow" path
>> and emulated BAR access in QEMU.
>>
>> In order to avoid "slow" path, BARs have to be PAGE_SIZE aligned
>> in the host kernel and this is what the patch does.
>>
>> The patch adds powernv platform specific hook which makes all
>> BARs sizes 64K aligned. The pci_reassigndev_resource_alignment()
>> function from drivers/pci/pci.c has been used as a reference.
>>
>> This is purely an optimization patch, the things will work without
>> it, just a bit slower.
>
> It's still bad in more ways than I care to explain...

Well it is right before pci_reassigndev_resource_alignment() which is 
common and does the same thing.

> The main one is that you do the "fixup" in a very wrong place anyway and
> it might cause cases of overlapping BARs.

As far as I can tell it may only happen if someone tries to align resource 
via kernel command line.

But ok. I trust you :)

> In any case this is wrong. It's a VFIO design bug and needs to be fixed
> there (CC'ing Alex).

It can be fixed in VFIO only if VFIO will stop treating functions 
separately and start mapping group's MMIO space as a whole thing. But this 
is not going to happen.

The example of the problem is NEC USB PCI which has 3 functions, each has 
one BAR, these BARs are 4K aligned and I cannot see how it can be fixed 
with 64K page size and VFIO creating memory regions per BAR (not per PHB).


> IE. We need a way to know where the BAR is within a page at which point
> VFIO can still map the page, but can also properly take into account the
> offset.

It is not about VFIO, it is about KVM. I cannot put non-aligned page to 
kvm_set_phys_mem(). Cannot understand how we would solve this.


You better discuss it with David, my vocab is weak.



> We also need a way to tell VFIO userspace that it's OK to use the fast
> path for such small BARs. It's not for all host platforms. We know it's
> ok for PowerNV because we know the devices are grouped by PEs and the PE
> granularity is larger than a page but that's not necessarily going to be
> the case on all powerpc platforms that support KVM.
>
> Cheers,
> Ben.
>
>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>> ---
>>   arch/powerpc/platforms/powernv/setup.c |   26 ++++++++++++++++++++++++++
>>   1 file changed, 26 insertions(+)
>>
>> diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c
>> index db1ad1c..331838e 100644
>> --- a/arch/powerpc/platforms/powernv/setup.c
>> +++ b/arch/powerpc/platforms/powernv/setup.c
>> @@ -25,6 +25,7 @@
>>   #include <linux/of.h>
>>   #include <linux/interrupt.h>
>>   #include <linux/bug.h>
>> +#include <linux/pci.h>
>>
>>   #include <asm/machdep.h>
>>   #include <asm/firmware.h>
>> @@ -179,6 +180,30 @@ static int __init pnv_probe(void)
>>   	return 1;
>>   }
>>
>> +static void pnv_pcibios_fixup_resources(struct pci_dev *pdev)
>> +{
>> +	struct resource *r;
>> +	int i;
>> +
>> +	/*
>> +	 * Aligning resources to PAGE_SIZE in order to
>> +	 * support "fast" path for PCI BAR access under VFIO
>> +	 * which maps every BAR individually to the guest
>> +	 * so BARs have to be PAGE aligned.
>> +	 */
>> +	for (i = 0; i <= PCI_ROM_RESOURCE; i++) {
>> +		r = &pdev->resource[i];
>> +		if (!r->flags)
>> +			continue;
>> +		pr_debug("powernv: %s, aligning BAR#%d %llx..%llx",
>> +			pdev->dev.kobj.name, i, r->start, r->end);
>> +		r->end = PAGE_ALIGN(r->end - r->start + 1) - 1;
>> +		r->start = 0;
>> +		r->flags |= IORESOURCE_UNSET;
>> +		pr_debug(" to  %llx..%llx\n", r->start, r->end);
>> +	}
>> +}
>> +
>>   define_machine(powernv) {
>>   	.name			= "PowerNV",
>>   	.probe			= pnv_probe,
>> @@ -189,6 +214,7 @@ define_machine(powernv) {
>>   	.progress		= pnv_progress,
>>   	.power_save             = power7_idle,
>>   	.calibrate_decr		= generic_calibrate_decr,
>> +	.pcibios_fixup_resources= pnv_pcibios_fixup_resources,
>>   #ifdef CONFIG_KEXEC
>>   	.kexec_cpu_down		= pnv_kexec_cpu_down,
>>   #endif
>
>


-- 
Alexey

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] powerpc-powernv: align BARs to PAGE_SIZE on powernv platform
  2012-09-05  0:55       ` Alexey Kardashevskiy
@ 2012-09-05  1:16         ` Benjamin Herrenschmidt
  2012-09-05  4:57           ` Alex Williamson
  0 siblings, 1 reply; 25+ messages in thread
From: Benjamin Herrenschmidt @ 2012-09-05  1:16 UTC (permalink / raw)
  To: Alexey Kardashevskiy
  Cc: linuxppc-dev, Alex Williamson, Paul Mackerras, David Gibson


> > It's still bad in more ways than I care to explain...
> 
> Well it is right before pci_reassigndev_resource_alignment() which is 
> common and does the same thing.
> 
> > The main one is that you do the "fixup" in a very wrong place anyway and
> > it might cause cases of overlapping BARs.
> 
> As far as I can tell it may only happen if someone tries to align resource 
> via kernel command line.
> 
> But ok. I trust you :)

I have reasons to believe that this realignment crap is wrong too :-)

> > In any case this is wrong. It's a VFIO design bug and needs to be fixed
> > there (CC'ing Alex).
> 
> It can be fixed in VFIO only if VFIO will stop treating functions 
> separately and start mapping group's MMIO space as a whole thing. But this 
> is not going to happen.

It still can be fixed without that...

> The example of the problem is NEC USB PCI which has 3 functions, each has 
> one BAR, these BARs are 4K aligned and I cannot see how it can be fixed 
> with 64K page size and VFIO creating memory regions per BAR (not per PHB).

VFIO can perfectly well realize it's the same MR or even map the same
area 3 times and create 3 MRs, both options work. All it needs is to
know the offset of the BAR inside the page.

> > IE. We need a way to know where the BAR is within a page at which point
> > VFIO can still map the page, but can also properly take into account the
> > offset.
> 
> It is not about VFIO, it is about KVM. I cannot put non-aligned page to 
> kvm_set_phys_mem(). Cannot understand how we would solve this.

No, VFIO still maps the whole page and creates an MR for the whole page,
that's fine. But you still need to know the offset within the page.

Now the main problem here is going to be that the guest itself might
reallocate the BAR and move it around (well, its version of the BAR
which isn't the real thing), and so we cannot create a direct MMU
mapping between -that- and the real BAR.

IE. We can only allow that direct mapping if the guest BAR mapping has
the same "offset within page" as the host BAR mapping. 

Our guests don't mess with BARs but SLOF does ... it's really tempting
to look into bringing the whole BAR allocation back into qemu and out of
SLOF :-( (We might have to if we ever do hotplug anyway). That way qemu
could set offsets that match appropriately.
 
Cheers,
Ben.

> You better discuss it with David, my vocab is weak.
> 
> 
> 
> > We also need a way to tell VFIO userspace that it's OK to use the fast
> > path for such small BARs. It's not for all host platforms. We know it's
> > ok for PowerNV because we know the devices are grouped by PEs and the PE
> > granularity is larger than a page but that's not necessarily going to be
> > the case on all powerpc platforms that support KVM.
> >
> > Cheers,
> > Ben.
> >
> >> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> >> ---
> >>   arch/powerpc/platforms/powernv/setup.c |   26 ++++++++++++++++++++++++++
> >>   1 file changed, 26 insertions(+)
> >>
> >> diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c
> >> index db1ad1c..331838e 100644
> >> --- a/arch/powerpc/platforms/powernv/setup.c
> >> +++ b/arch/powerpc/platforms/powernv/setup.c
> >> @@ -25,6 +25,7 @@
> >>   #include <linux/of.h>
> >>   #include <linux/interrupt.h>
> >>   #include <linux/bug.h>
> >> +#include <linux/pci.h>
> >>
> >>   #include <asm/machdep.h>
> >>   #include <asm/firmware.h>
> >> @@ -179,6 +180,30 @@ static int __init pnv_probe(void)
> >>   	return 1;
> >>   }
> >>
> >> +static void pnv_pcibios_fixup_resources(struct pci_dev *pdev)
> >> +{
> >> +	struct resource *r;
> >> +	int i;
> >> +
> >> +	/*
> >> +	 * Aligning resources to PAGE_SIZE in order to
> >> +	 * support "fast" path for PCI BAR access under VFIO
> >> +	 * which maps every BAR individually to the guest
> >> +	 * so BARs have to be PAGE aligned.
> >> +	 */
> >> +	for (i = 0; i <= PCI_ROM_RESOURCE; i++) {
> >> +		r = &pdev->resource[i];
> >> +		if (!r->flags)
> >> +			continue;
> >> +		pr_debug("powernv: %s, aligning BAR#%d %llx..%llx",
> >> +			pdev->dev.kobj.name, i, r->start, r->end);
> >> +		r->end = PAGE_ALIGN(r->end - r->start + 1) - 1;
> >> +		r->start = 0;
> >> +		r->flags |= IORESOURCE_UNSET;
> >> +		pr_debug(" to  %llx..%llx\n", r->start, r->end);
> >> +	}
> >> +}
> >> +
> >>   define_machine(powernv) {
> >>   	.name			= "PowerNV",
> >>   	.probe			= pnv_probe,
> >> @@ -189,6 +214,7 @@ define_machine(powernv) {
> >>   	.progress		= pnv_progress,
> >>   	.power_save             = power7_idle,
> >>   	.calibrate_decr		= generic_calibrate_decr,
> >> +	.pcibios_fixup_resources= pnv_pcibios_fixup_resources,
> >>   #ifdef CONFIG_KEXEC
> >>   	.kexec_cpu_down		= pnv_kexec_cpu_down,
> >>   #endif
> >
> >
> 
> 

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] powerpc-powernv: align BARs to PAGE_SIZE on powernv platform
  2012-09-05  1:16         ` Benjamin Herrenschmidt
@ 2012-09-05  4:57           ` Alex Williamson
  2012-09-05  5:17             ` Benjamin Herrenschmidt
  0 siblings, 1 reply; 25+ messages in thread
From: Alex Williamson @ 2012-09-05  4:57 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: Alexey Kardashevskiy, linuxppc-dev, Paul Mackerras, David Gibson

On Wed, 2012-09-05 at 11:16 +1000, Benjamin Herrenschmidt wrote:
> > > It's still bad in more ways than I care to explain...
> > 
> > Well it is right before pci_reassigndev_resource_alignment() which is 
> > common and does the same thing.
> > 
> > > The main one is that you do the "fixup" in a very wrong place anyway and
> > > it might cause cases of overlapping BARs.
> > 
> > As far as I can tell it may only happen if someone tries to align resource 
> > via kernel command line.
> > 
> > But ok. I trust you :)
> 
> I have reasons to believe that this realignment crap is wrong too :-)
> 
> > > In any case this is wrong. It's a VFIO design bug and needs to be fixed
> > > there (CC'ing Alex).
> > 
> > It can be fixed in VFIO only if VFIO will stop treating functions 
> > separately and start mapping group's MMIO space as a whole thing. But this 
> > is not going to happen.
> 
> It still can be fixed without that...
> 
> > The example of the problem is NEC USB PCI which has 3 functions, each has 
> > one BAR, these BARs are 4K aligned and I cannot see how it can be fixed 
> > with 64K page size and VFIO creating memory regions per BAR (not per PHB).
> 
> VFIO can perfectly well realize it's the same MR or even map the same
> area 3 times and create 3 MRs, both options work. All it needs is to
> know the offset of the BAR inside the page.

Yep, I think I agree...

> > > IE. We need a way to know where the BAR is within a page at which point
> > > VFIO can still map the page, but can also properly take into account the
> > > offset.
> > 
> > It is not about VFIO, it is about KVM. I cannot put non-aligned page to 
> > kvm_set_phys_mem(). Cannot understand how we would solve this.
> 
> No, VFIO still maps the whole page and creates an MR for the whole page,
> that's fine. But you still need to know the offset within the page.

Do we need an extra region info field, or is it sufficient that we
define a region to be mmap'able with getpagesize() pages when the MMAP
flag is set and simply offset the region within the device fd?  ex.

BAR0: 0x10000 /* no offset */
BAR1: 0x21000 /* 4k offset */
BAR2: 0x32000 /* 8k offset */

A second level optimization might make these 0x10000, 0x11000, 0x12000.

This will obviously require some arch hooks w/in vfio as we can't do
this on x86 since we can't guarantee that whatever lives in the
overflow/gaps is in the same group and power is going to need to make
sure we don't accidentally allow msix table mapping... in fact hiding
the msix table might be a lot more troublesome on 64k page hosts.

> Now the main problem here is going to be that the guest itself might
> reallocate the BAR and move it around (well, it's version of the BAR
> which isn't the real thing), and so we cannot create a direct MMU
> mapping between -that- and the real BAR.
> 
> IE. We can only allow that direct mapping if the guest BAR mapping has
> the same "offset within page" as the host BAR mapping. 

Euw...

> Our guests don't mess with BARs but SLOF does ... it's really tempting
> to look into bringing the whole BAR allocation back into qemu and out of
> SLOF :-( (We might have to if we ever do hotplug anyway). That way qemu
> could set offsets that match appropriately.

BTW, as I mentioned elsewhere, I'm on vacation this week, but I'll try
to keep up as much as I have time for.

Thanks,

Alex

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] powerpc-powernv: align BARs to PAGE_SIZE on powernv platform
  2012-09-05  4:57           ` Alex Williamson
@ 2012-09-05  5:17             ` Benjamin Herrenschmidt
  2012-09-05  5:27               ` Alexey Kardashevskiy
  0 siblings, 1 reply; 25+ messages in thread
From: Benjamin Herrenschmidt @ 2012-09-05  5:17 UTC (permalink / raw)
  To: Alex Williamson
  Cc: Alexey Kardashevskiy, linuxppc-dev, Paul Mackerras, David Gibson

On Tue, 2012-09-04 at 22:57 -0600, Alex Williamson wrote:

> Do we need an extra region info field, or is it sufficient that we
> define a region to be mmap'able with getpagesize() pages when the MMAP
> flag is set and simply offset the region within the device fd?  ex.

Alexey ? You mentioned you had ways to get at the offset with the
existing interfaces ?

> BAR0: 0x10000 /* no offset */
> BAR1: 0x21000 /* 4k offset */
> BAR2: 0x32000 /* 8k offset */
> 
> A second level optimization might make these 0x10000, 0x11000, 0x12000.
> 
> This will obviously require some arch hooks w/in vfio as we can't do
> this on x86 since we can't guarantee that whatever lives in the
> overflow/gaps is in the same group and power is going to need to make
> sure we don't accidentally allow msix table mapping... in fact hiding
> the msix table might be a lot more troublesome on 64k page hosts.

Fortunately, our guests don't access the msix table directly anyway, at
least most of the time :-) There's a paravirt API for it, and our iommu
makes sure that if for some reason the guest still accesses it and does
the wrong thing to it, the side effects will be contained to the guest.

> > Now the main problem here is going to be that the guest itself might
> > reallocate the BAR and move it around (well, it's version of the BAR
> > which isn't the real thing), and so we cannot create a direct MMU
> > mapping between -that- and the real BAR.
> > 
> > IE. We can only allow that direct mapping if the guest BAR mapping has
> > the same "offset within page" as the host BAR mapping. 
> 
> Euw...

Yeah sucks :-) Basically, let's say page size is 64K. Host side BAR
(real BAR) is at 0xf0001000.

qemu maps 0xf0000000..0xf000ffff to a virtual address inside QEMU,
itself 64k aligned, let's say 0x80000000 and knows that the BAR is at
offset 0x1000 in there.

However, the KVM "MR" API is such that we can only map PAGE_SIZE regions
into the guest as well, so if the guest assigns a value ADDR to the
guest BAR, let's say 0x40002000, all KVM can do is an MR that maps
0x40000000 (guest physical) to 0x80000000 (qemu). Any access within that
64K page will have the low bits transferred directly from guest to HW.

So the guest will end up having that 0x2000 offset instead of the 0x1000
needed to actually access the BAR. FAIL.

There are ways to fix that but all are nasty.

 - In theory, we have the capability (and use it today) to restrict IO
mappings in the guest to 4K HW pages, so knowing that, KVM could use a
"special" MR that plays tricks here... but that would break all sort of
generic code both in qemu and kvm and generally be very nasty.

 - The best approach is to rely on the fact that our guest kernels don't
do BAR assignment, they rely on FW to do it (ie not at all, unlike x86,
we can't even fixup because in the general case, the hypervisor won't
let us anyway). So we could move our guest BAR allocation code out of
our guest firmware (SLOF) back into qemu (where we had it very early
on), which allows us to make sure that the guest BAR values we assign
have the same "offset within the page" as the host side values. This
would also allow us to avoid messing up too many MRs (this can have a
performance impact with KVM) and eventually handle our "group" regions
instead of individual BARs for mappings. We might need to do that anyway
in the long run for hotplug as our hotplug hypervisor APIs also rely on
the "new" hotplugged devices to have the BARs pre-assigned when they get
handed out to the guest. 

> > Our guests don't mess with BARs but SLOF does ... it's really tempting
> > to look into bringing the whole BAR allocation back into qemu and out of
> > SLOF :-( (We might have to if we ever do hotplug anyway). That way qemu
> > could set offsets that match appropriately.
> 
> BTW, as I mentioned elsewhere, I'm on vacation this week, but I'll try
> to keep up as much as I have time for.

No worries,

Cheers,
Ben.

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] powerpc-powernv: align BARs to PAGE_SIZE on powernv platform
  2012-09-05  5:17             ` Benjamin Herrenschmidt
@ 2012-09-05  5:27               ` Alexey Kardashevskiy
  2012-09-10 17:06                 ` Alex Williamson
  0 siblings, 1 reply; 25+ messages in thread
From: Alexey Kardashevskiy @ 2012-09-05  5:27 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: linuxppc-dev, Alex Williamson, Paul Mackerras, David Gibson

On 05/09/12 15:17, Benjamin Herrenschmidt wrote:
> On Tue, 2012-09-04 at 22:57 -0600, Alex Williamson wrote:
>
>> Do we need an extra region info field, or is it sufficient that we
>> define a region to be mmap'able with getpagesize() pages when the MMAP
>> flag is set and simply offset the region within the device fd?  ex.
>
> Alexey ? You mentioned you had ways to get at the offset with the
> existing interfaces ?


Yes, VFIO_DEVICE_GET_REGION_INFO ioctl of vfio-pci host driver, the "info" 
struct has an "offset" field.
I just do not have a place to use it in the QEMU right now as the guest 
does the same allocation as the host does (by accident).


>> BAR0: 0x10000 /* no offset */
>> BAR1: 0x21000 /* 4k offset */
>> BAR2: 0x32000 /* 8k offset */
>>
>> A second level optimization might make these 0x10000, 0x11000, 0x12000.
>>
>> This will obviously require some arch hooks w/in vfio as we can't do
>> this on x86 since we can't guarantee that whatever lives in the
>> overflow/gaps is in the same group and power is going to need to make
>> sure we don't accidentally allow msix table mapping... in fact hiding
>> the msix table might be a lot more troublesome on 64k page hosts.
>
> Fortunately, our guests don't access the msix table directly anyway, at
> least most of the time :-)


Not at all in our case. It took me some time to push a QEMU patch which 
changes msix table :)


> There's a paravirt API for it, and our iommu
> makes sure that if for some reason the guest still accesses it and does
> the wrong thing to it, the side effects will be contained to the guest.

>>> Now the main problem here is going to be that the guest itself might
>>> reallocate the BAR and move it around (well, its version of the BAR
>>> which isn't the real thing), and so we cannot create a direct MMU
>>> mapping between -that- and the real BAR.
>>>
>>> IE. We can only allow that direct mapping if the guest BAR mapping has
>>> the same "offset within page" as the host BAR mapping.
>>
>> Euw...
>
> Yeah sucks :-) Basically, let's say page size is 64K. Host side BAR
> (real BAR) is at 0xf0001000.
>
> qemu maps 0xf0000000..0xf000ffff to a virtual address inside QEMU,
> itself 64k aligned, let's say 0x80000000 and knows that the BAR is at
> offset 0x1000 in there.
>
> However, the KVM "MR" API is such that we can only map PAGE_SIZE regions
> into the guest as well, so if the guest assigns a value ADDR to the
> guest BAR, let's say 0x40002000, all KVM can do is an MR that maps
> 0x40000000 (guest physical) to 0x80000000 (qemu). Any access within that
> 64K page will have the low bits transferred directly from guest to HW.
>
> So the guest will end up having that 0x2000 offset instead of the 0x1000
> needed to actually access the BAR. FAIL.
>
> There are ways to fix that but all are nasty.
>
>   - In theory, we have the capability (and use it today) to restrict IO
> mappings in the guest to 4K HW pages, so knowing that, KVM could use a
> "special" MR that plays tricks here... but that would break all sort of
> generic code both in qemu and kvm and generally be very nasty.
>
>   - The best approach is to rely on the fact that our guest kernels don't
> do BAR assignment, they rely on FW to do it (ie not at all, unlike x86,
> we can't even fixup because in the general case, the hypervisor won't
> let us anyway). So we could move our guest BAR allocation code out of
> our guest firmware (SLOF) back into qemu (where we had it very early
> on), which allows us to make sure that the guest BAR values we assign
> have the same "offset within the page" as the host side values. This
> would also allow us to avoid messing up too many MRs (this can have a
> performance impact with KVM) and eventually handle our "group" regions
> instead of individual BARs for mappings. We might need to do that anyway
> in the long run for hotplug as our hotplug hypervisor APIs also rely on
> the "new" hotplugged devices to have the BARs pre-assigned when they get
> handed out to the guest.
>
>>> Our guests don't mess with BARs but SLOF does ... it's really tempting
>>> to look into bringing the whole BAR allocation back into qemu and out of
>>> SLOF :-( (We might have to if we ever do hotplug anyway). That way qemu
>>> could set offsets that match appropriately.
>>
>> BTW, as I mentioned elsewhere, I'm on vacation this week, but I'll try
>> to keep up as much as I have time for.
>
> No worries,


-- 
Alexey

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] vfio: enabled and supported on power (v7)
  2012-09-04  7:33 ` [PATCH] vfio: enabled and supported on power (v7) Alexey Kardashevskiy
                     ` (2 preceding siblings ...)
  2012-09-04  7:36   ` [PATCH] powerpc-powernv: align BARs to PAGE_SIZE on powernv platform Alexey Kardashevskiy
@ 2012-09-10 16:02   ` Alex Williamson
  2012-09-11  8:28     ` Alexey Kardashevskiy
  3 siblings, 1 reply; 25+ messages in thread
From: Alex Williamson @ 2012-09-10 16:02 UTC (permalink / raw)
  To: Alexey Kardashevskiy; +Cc: Paul Mackerras, linuxppc-dev, David Gibson

On Tue, 2012-09-04 at 17:33 +1000, Alexey Kardashevskiy wrote:
> Cc: David Gibson <david@gibson.dropbear.id.au>
> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
> Cc: Paul Mackerras <paulus@samba.org>
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> ---

Please at least cc kvm@vger as well since we list that as the devel list
for vfio.

>  arch/powerpc/include/asm/iommu.h    |    3 +

I'll need an ack from Ben or Paul for this change.

>  drivers/iommu/Kconfig               |    8 +
>  drivers/vfio/Kconfig                |    6 +
>  drivers/vfio/Makefile               |    1 +
>  drivers/vfio/vfio_iommu_spapr_tce.c |  440 +++++++++++++++++++++++++++++++++++
>  include/linux/vfio.h                |   29 +++
>  6 files changed, 487 insertions(+)
>  create mode 100644 drivers/vfio/vfio_iommu_spapr_tce.c
> 
> diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
> index 957a83f..c64bce7 100644
> --- a/arch/powerpc/include/asm/iommu.h
> +++ b/arch/powerpc/include/asm/iommu.h
> @@ -66,6 +66,9 @@ struct iommu_table {
>  	unsigned long  it_halfpoint; /* Breaking point for small/large allocs */
>  	spinlock_t     it_lock;      /* Protects it_map */
>  	unsigned long *it_map;       /* A simple allocation bitmap for now */
> +#ifdef CONFIG_IOMMU_API
> +	struct iommu_group *it_group;
> +#endif
>  };

This seems to only be valid when vfio_iommu_spapr_tce is loaded, which
is a bit misleading.

>  
>  struct scatterlist;
> diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
> index 3bd9fff..19cf2d9 100644
> --- a/drivers/iommu/Kconfig
> +++ b/drivers/iommu/Kconfig
> @@ -162,4 +162,12 @@ config TEGRA_IOMMU_SMMU
>  	  space through the SMMU (System Memory Management Unit)
>  	  hardware included on Tegra SoCs.
>  
> +config SPAPR_TCE_IOMMU
> +	bool "sPAPR TCE IOMMU Support"
> +	depends on PPC_PSERIES
> +	select IOMMU_API
> +	help
> +	  Enables bits of IOMMU API required by VFIO. The iommu_ops is
> +	  still not implemented.
> +
>  endif # IOMMU_SUPPORT
> diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
> index 7cd5dec..b464687 100644
> --- a/drivers/vfio/Kconfig
> +++ b/drivers/vfio/Kconfig
> @@ -3,10 +3,16 @@ config VFIO_IOMMU_TYPE1
>  	depends on VFIO
>  	default n
>  
> +config VFIO_IOMMU_SPAPR_TCE
> +	tristate
> +	depends on VFIO && SPAPR_TCE_IOMMU
> +	default n
> +
>  menuconfig VFIO
>  	tristate "VFIO Non-Privileged userspace driver framework"
>  	depends on IOMMU_API
>  	select VFIO_IOMMU_TYPE1 if X86
> +	select VFIO_IOMMU_SPAPR_TCE if PPC_POWERNV
>  	help
>  	  VFIO provides a framework for secure userspace device drivers.
>  	  See Documentation/vfio.txt for more details.
> diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
> index 2398d4a..72bfabc 100644
> --- a/drivers/vfio/Makefile
> +++ b/drivers/vfio/Makefile
> @@ -1,3 +1,4 @@
>  obj-$(CONFIG_VFIO) += vfio.o
>  obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o
> +obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o
>  obj-$(CONFIG_VFIO_PCI) += pci/
> diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
> new file mode 100644
> index 0000000..21f1909
> --- /dev/null
> +++ b/drivers/vfio/vfio_iommu_spapr_tce.c
> @@ -0,0 +1,440 @@
> +/*
> + * VFIO: IOMMU DMA mapping support for TCE on POWER
> + *
> + * Copyright (C) 2012 IBM Corp.  All rights reserved.
> + *     Author: Alexey Kardashevskiy <aik@ozlabs.ru>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + *
> + * Derived from original vfio_iommu_x86.c:

Should this be _type1?  Only the mail archives are going to remember
there was a _x86, so the renamed version is probably a better reference.

> + * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
> + *     Author: Alex Williamson <alex.williamson@redhat.com>
> + */
> +
> +#include <linux/module.h>
> +#include <linux/pci.h>
> +#include <linux/slab.h>
> +#include <linux/uaccess.h>
> +#include <linux/err.h>
> +#include <linux/vfio.h>
> +#include <linux/spinlock.h>
> +#include <asm/iommu.h>
> +
> +#define DRIVER_VERSION  "0.1"
> +#define DRIVER_AUTHOR   "aik@ozlabs.ru"
> +#define DRIVER_DESC     "VFIO IOMMU SPAPR TCE"
> +
> +
> +/*
> + * SPAPR TCE API
> + */
> +static void tce_free(struct iommu_table *tbl, unsigned long entry,
> +		unsigned long tce)
> +{
> +	struct page *page = pfn_to_page(tce >> PAGE_SHIFT);
> +
> +	WARN_ON(!page);
> +	if (page) {
> +		if (tce & VFIO_SPAPR_TCE_WRITE)
> +			SetPageDirty(page);
> +		put_page(page);
> +	}
> +	ppc_md.tce_free(tbl, entry, 1);
> +}
> +
> +static long tce_put(struct iommu_table *tbl,
> +		unsigned long entry, uint64_t tce, uint32_t flags)
> +{
> +	int ret;
> +	unsigned long oldtce, kva, offset;
> +	struct page *page = NULL;
> +	enum dma_data_direction direction = DMA_NONE;
> +
> +	switch (flags & VFIO_SPAPR_TCE_PUT_MASK) {
> +	case VFIO_SPAPR_TCE_READ:
> +		direction = DMA_TO_DEVICE;
> +		break;
> +	case VFIO_SPAPR_TCE_WRITE:
> +		direction = DMA_FROM_DEVICE;
> +		break;
> +	case VFIO_SPAPR_TCE_BIDIRECTIONAL:
> +		direction = DMA_BIDIRECTIONAL;
> +		break;
> +	}
> +
> +	oldtce = ppc_md.tce_get(tbl, entry);
> +
> +	/* Free page if still allocated */
> +	if (oldtce & VFIO_SPAPR_TCE_PUT_MASK)
> +		tce_free(tbl, entry, oldtce);
> +
> +	/* Map new TCE */
> +	if (direction != DMA_NONE) {
> +		offset = (tce & IOMMU_PAGE_MASK) - (tce & PAGE_MASK);
> +		ret = get_user_pages_fast(tce & PAGE_MASK, 1,
> +				direction != DMA_TO_DEVICE, &page);
> +		BUG_ON(ret > 1);

Can this happen?

> +		if (ret < 1) {
> +			printk(KERN_ERR "tce_vfio: get_user_pages_fast failed "
> +					"tce=%llx ioba=%lx ret=%d\n",
> +					tce, entry << IOMMU_PAGE_SHIFT, ret);
> +			if (!ret)
> +				ret = -EFAULT;
> +			goto unlock_exit;
> +		}
> +
> +		kva = (unsigned long) page_address(page);
> +		kva += offset;
> +		BUG_ON(!kva);

Same here, can it happen?  If so, should it BUG or catch the below
EINVAL?

> +		if (WARN_ON(kva & ~IOMMU_PAGE_MASK))
> +			return -EINVAL;

Page leak?  Don't we want to do a put_page(), which means we probably
want a goto exit here.

> +
> +		/* Preserve access bits */
> +		kva |= flags & VFIO_SPAPR_TCE_PUT_MASK;
> +
> +		/* tce_build receives a virtual address */
> +		entry += tbl->it_offset;	/* Offset into real TCE table */
> +		ret = ppc_md.tce_build(tbl, entry, 1, kva, direction, NULL);
> +
> +		/* tce_build() only returns non-zero for transient errors */
> +		if (unlikely(ret)) {
> +			printk(KERN_ERR "tce_vfio: Failed to add TCE\n");
> +			ret = -EIO;
> +			goto unlock_exit;
> +		}
> +	}
> +	/* Flush/invalidate TLB caches if necessary */
> +	if (ppc_md.tce_flush)
> +		ppc_md.tce_flush(tbl);
> +
> +	/* Make sure updates are seen by hardware */
> +	mb();
> +
> +unlock_exit:

unlock seems wrong here, I had to go re-read the code looking for the
lock.

> +	if (ret && page)
> +		put_page(page);
> +
> +	if (ret)
> +		printk(KERN_ERR "tce_vfio: tce_put failed on tce=%llx "
> +				"ioba=%lx kva=%lx\n", tce,
> +				entry << IOMMU_PAGE_SHIFT, kva);
> +	return ret;
> +}
> +
> +/*
> + * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
> + */
> +
> +/*
> + * The container descriptor supports only a single group per container.
> + * Required by the API as the container is not supplied with the IOMMU group
> + * at the moment of initialization.
> + */
> +struct tce_container {
> +	struct iommu_table *tbl;
> +};
> +
> +static void *tce_iommu_open(unsigned long arg)
> +{
> +	struct tce_container *container;
> +
> +	if (arg != VFIO_SPAPR_TCE_IOMMU) {
> +		printk(KERN_ERR "tce_vfio: Wrong IOMMU type\n");
> +		return ERR_PTR(-EINVAL);
> +	}
> +
> +	container = kzalloc(sizeof(*container), GFP_KERNEL);
> +	if (!container)
> +		return ERR_PTR(-ENOMEM);
> +
> +	return container;
> +}
> +
> +static void tce_iommu_release(void *iommu_data)
> +{
> +	struct tce_container *container = iommu_data;
> +	struct iommu_table *tbl = container->tbl;
> +	unsigned long i, tce;
> +

This will segfault if releasing a container that never had a device
attached.

> +	/* Unmap leftovers */
> +	spin_lock_irq(&tbl->it_lock);
> +	for (i = tbl->it_offset; i < tbl->it_offset + tbl->it_size; ++i) {
> +		tce = ppc_md.tce_get(tbl, i);
> +		if (tce & VFIO_SPAPR_TCE_PUT_MASK)
> +			tce_free(tbl, i, tce);
> +	}
> +	/* Flush/invalidate TLB caches if necessary */
> +	if (ppc_md.tce_flush)
> +		ppc_md.tce_flush(tbl);
> +
> +	/* Make sure updates are seen by hardware */
> +	mb();
> +
> +	spin_unlock_irq(&tbl->it_lock);
> +
> +	kfree(container);
> +}
> +
> +static long tce_iommu_ioctl(void *iommu_data,
> +				 unsigned int cmd, unsigned long arg)
> +{
> +	struct tce_container *container = iommu_data;
> +	unsigned long minsz;
> +	long ret;
> +
> +	switch (cmd) {
> +	case VFIO_CHECK_EXTENSION: {
> +		return (arg == VFIO_SPAPR_TCE_IOMMU) ? 1 : 0;
> +	}
> +	case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
> +		struct vfio_iommu_spapr_tce_info info;
> +		struct iommu_table *tbl = container->tbl;
> +
> +		minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
> +				dma64_window_size);
> +
> +		if (copy_from_user(&info, (void __user *)arg, minsz))
> +			return -EFAULT;
> +
> +		if (info.argsz < minsz)
> +			return -EINVAL;
> +
> +		if (!tbl)
> +			return -ENXIO;

nit: why not check this earlier?

> +
> +		info.dma32_window_start = tbl->it_offset << IOMMU_PAGE_SHIFT;
> +		info.dma32_window_size = tbl->it_size << IOMMU_PAGE_SHIFT;
> +		info.dma64_window_start = 0;
> +		info.dma64_window_size = 0;
> +		info.flags = 0;
> +
> +		return copy_to_user((void __user *)arg, &info, minsz);
> +	}
> +	case VFIO_IOMMU_SPAPR_TCE_PUT: {
> +		struct vfio_iommu_spapr_tce_put par;
> +		struct iommu_table *tbl = container->tbl;
> +
> +		minsz = offsetofend(struct vfio_iommu_spapr_tce_put, tce);
> +
> +		if (copy_from_user(&par, (void __user *)arg, minsz))
> +			return -EFAULT;
> +
> +		if (par.argsz < minsz)
> +			return -EINVAL;
> +
> +		if (!tbl) {
> +			return -ENXIO;
> +		}

Same, plus drop the braces.

> +
> +		spin_lock_irq(&tbl->it_lock);
> +		ret = tce_put(tbl, par.ioba >> IOMMU_PAGE_SHIFT,
> +				par.tce, par.flags);
> +		spin_unlock_irq(&tbl->it_lock);
> +
> +		return ret;
> +	}

Is "PUT" really the name we want for this?

> +	default:
> +		printk(KERN_WARNING "tce_vfio: unexpected cmd %x\n", cmd);
> +	}
> +
> +	return -ENOTTY;
> +}
> +
> +static int tce_iommu_attach_group(void *iommu_data,
> +		struct iommu_group *iommu_group)
> +{
> +	struct tce_container *container = iommu_data;
> +	struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
> +
> +	printk(KERN_DEBUG "tce_vfio: Attaching group #%u to iommu %p\n",
> +			iommu_group_id(iommu_group), iommu_group);

Let's use pr_debug() and friends throughout.

> +	if (container->tbl) {
> +		printk(KERN_WARNING "tce_vfio: Only one group per IOMMU "
> +				"container is allowed, "
> +				"existing id=%d, attaching id=%d\n",
> +				iommu_group_id(container->tbl->it_group),
> +				iommu_group_id(iommu_group));
> +		return -EBUSY;
> +	}
> +

_type1 has a lock to avoid races here, I think you might need one too.

> +	container->tbl = tbl;
> +
> +	return 0;
> +}
> +
> +static void tce_iommu_detach_group(void *iommu_data,
> +		struct iommu_group *iommu_group)
> +{
> +	struct tce_container *container = iommu_data;
> +	struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
> +
> +	BUG_ON(!tbl);

Needed?  If so, why is there no check on attach?

> +	if (tbl != container->tbl) {
> +		printk(KERN_WARNING "tce_vfio: detaching group #%u, expected "
> +				"group is #%u\n", iommu_group_id(iommu_group),
> +				iommu_group_id(tbl->it_group));
> +		return;
> +	}
> +	printk(KERN_DEBUG "tce_vfio: detaching group #%u from iommu %p\n",
> +			iommu_group_id(iommu_group), iommu_group);

container->tbl = NULL?

> +}
> +
> +const struct vfio_iommu_driver_ops tce_iommu_driver_ops = {
> +	.name		= "iommu-vfio-powerpc",
> +	.owner		= THIS_MODULE,
> +	.open		= tce_iommu_open,
> +	.release	= tce_iommu_release,
> +	.ioctl		= tce_iommu_ioctl,
> +	.attach_group	= tce_iommu_attach_group,
> +	.detach_group	= tce_iommu_detach_group,
> +};
> +
> +/*
> + * Add/delete devices support (hotplug, module_init, module_exit)
> + */
> +static int add_device(struct device *dev)
> +{
> +	struct iommu_table *tbl;
> +	int ret = 0;
> +
> +	if (dev->iommu_group) {
> +		printk(KERN_WARNING "tce_vfio: device %s is already in iommu "
> +				"group %d, skipping\n", dev->kobj.name,

Watch line wrapping on strings.

> +				iommu_group_id(dev->iommu_group));
> +		return -EBUSY;
> +	}
> +
> +	tbl = get_iommu_table_base(dev);
> +	if (!tbl) {
> +		printk(KERN_DEBUG "tce_vfio: skipping device %s with no tbl\n",
> +				dev->kobj.name);
> +		return 0;
> +	}
> +
> +	printk(KERN_DEBUG "tce_vfio: adding %s to iommu group %d\n",
> +			dev->kobj.name, iommu_group_id(tbl->it_group));
> +
> +	ret = iommu_group_add_device(tbl->it_group, dev);
> +	if (ret < 0)
> +		printk(KERN_ERR "tce_vfio: %s has not been added, ret=%d\n",
> +				dev->kobj.name, ret);
> +
> +	return ret;
> +}
> +
> +static void del_device(struct device *dev)
> +{
> +	iommu_group_remove_device(dev);
> +}
> +
> +static int iommu_bus_notifier(struct notifier_block *nb,
> +			      unsigned long action, void *data)
> +{
> +	struct device *dev = data;
> +
> +	switch (action) {
> +	case BUS_NOTIFY_ADD_DEVICE:
> +		return add_device(dev);
> +	case BUS_NOTIFY_DEL_DEVICE:
> +		del_device(dev);
> +		return 0;
> +	default:
> +		return 0;
> +	}
> +}
> +
> +static struct notifier_block tce_iommu_bus_nb = {
> +	.notifier_call = iommu_bus_notifier,
> +};
> +
> +void group_release(void *iommu_data)
> +{
> +	struct iommu_table *tbl = iommu_data;
> +	tbl->it_group = NULL;
> +}
> +
> +static int __init tce_iommu_init(void)
> +{
> +	struct pci_dev *pdev = NULL;
> +	struct iommu_table *tbl;
> +	struct iommu_group *grp;
> +
> +	/* If the current platform does not support tce_get
> +	   we are unable to clean TCE table properly and
> +	   therefore it is better not to touch it at all */
> +	if (!ppc_md.tce_get) {
> +		printk(KERN_ERR "tce_vfio: ppc_md.tce_get isn't implemented\n");
> +		return -EOPNOTSUPP;
> +	}
> +
> +	bus_register_notifier(&pci_bus_type, &tce_iommu_bus_nb);
> +
> +	/* Allocate and initialize VFIO groups */

s/VFIO groups/IOMMU groups/

> +	for_each_pci_dev(pdev) {
> +		tbl = get_iommu_table_base(&pdev->dev);
> +		if (!tbl)
> +			continue;
> +
> +		/* Skip already initialized */
> +		if (tbl->it_group)
> +			continue;
> +
> +		grp = iommu_group_alloc();
> +		if (IS_ERR(grp)) {
> +			printk(KERN_INFO "tce_vfio: cannot create "
> +					"new IOMMU group, ret=%ld\n",
> +					PTR_ERR(grp));
> +			return -EFAULT;
> +		}
> +		tbl->it_group = grp;
> +		iommu_group_set_iommudata(grp, tbl, group_release);
> +	}
> +
> +	/* Add PCI devices to VFIO groups */
> +	for_each_pci_dev(pdev)
> +		add_device(&pdev->dev);
> +
> +	return vfio_register_iommu_driver(&tce_iommu_driver_ops);
> +}
> +
> +static void __exit tce_iommu_cleanup(void)
> +{
> +	struct pci_dev *pdev = NULL;
> +	struct iommu_table *tbl;
> +	struct iommu_group *grp = NULL;
> +
> +	bus_unregister_notifier(&pci_bus_type, &tce_iommu_bus_nb);
> +
> +	/* Delete PCI devices from VFIO groups */
> +	for_each_pci_dev(pdev)
> +		del_device(&pdev->dev);
> +
> +	/* Release VFIO groups */
> +	for_each_pci_dev(pdev) {
> +		tbl = get_iommu_table_base(&pdev->dev);
> +		if (!tbl)
> +			continue;
> +		grp = tbl->it_group;
> +
> +		/* Skip (already) uninitialized */
> +		if (!grp)
> +			continue;
> +
> +		/* Do actual release, group_release() is expected to work */
> +		iommu_group_put(grp);
> +		BUG_ON(tbl->it_group);
> +	}
> +


It troubles me a bit that you're using the vfio driver to initialize and
tear down IOMMU groups on your platform.  VFIO makes use of IOMMU groups
and is the only user so far, but they're hopefully useful beyond this.
In fact, VFIO used to manage assembling all groups from data provided by
the IOMMU but David wanted to see IOMMU groups be a more universally
available feature, so it's odd to see POWER implementing it this way.

> +	vfio_unregister_iommu_driver(&tce_iommu_driver_ops);
> +}
> +
> +module_init(tce_iommu_init);
> +module_exit(tce_iommu_cleanup);
> +
> +MODULE_VERSION(DRIVER_VERSION);
> +MODULE_LICENSE("GPL v2");
> +MODULE_AUTHOR(DRIVER_AUTHOR);
> +MODULE_DESCRIPTION(DRIVER_DESC);
> +
> diff --git a/include/linux/vfio.h b/include/linux/vfio.h
> index 0a4f180..2c0a927 100644
> --- a/include/linux/vfio.h
> +++ b/include/linux/vfio.h
> @@ -99,6 +99,7 @@ extern void vfio_unregister_iommu_driver(
>  /* Extensions */
>  
>  #define VFIO_TYPE1_IOMMU		1
> +#define VFIO_SPAPR_TCE_IOMMU		2
>  
>  /*
>   * The IOCTL interface is designed for extensibility by embedding the
> @@ -442,4 +443,32 @@ struct vfio_iommu_type1_dma_unmap {
>  
>  #define VFIO_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14)
>  
> +/* -------- API for SPAPR TCE (Server POWERPC) IOMMU -------- */
> +
> +struct vfio_iommu_spapr_tce_info {
> +	__u32 argsz;
> +	__u32 flags;
> +	__u32 dma32_window_start;
> +	__u32 dma32_window_size;
> +	__u64 dma64_window_start;
> +	__u64 dma64_window_size;
> +};
> +
> +#define VFIO_IOMMU_SPAPR_TCE_GET_INFO	_IO(VFIO_TYPE, VFIO_BASE + 12)
> +
> +struct vfio_iommu_spapr_tce_put {
> +	__u32 argsz;
> +	__u32 flags;
> +#define VFIO_SPAPR_TCE_READ		1
> +#define VFIO_SPAPR_TCE_WRITE		2
> +#define VFIO_SPAPR_TCE_BIDIRECTIONAL	(VFIO_SPAPR_TCE_READ|VFIO_SPAPR_TCE_WRITE)
> +#define VFIO_SPAPR_TCE_PUT_MASK		VFIO_SPAPR_TCE_BIDIRECTIONAL
> +	__u64 ioba;
> +	__u64 tce;
> +};

Ok, so if READ & WRITE are both clear and ioba is set, that's an
"unmap"?  This is exactly why _type1 has a MAP and UNMAP, to make it
clear which fields are necessary for which call.  I think we should
probably do the same here.  Besides, _put makes me think there should be
a _get; do these have some unique meaning in POWER?

> +
> +#define VFIO_IOMMU_SPAPR_TCE_PUT	_IO(VFIO_TYPE, VFIO_BASE + 13)
> +

Please document what all of the above means.  Thanks,

Alex

> +/* ***************************************************************** */
> +
>  #endif /* VFIO_H */

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] powerpc-powernv: align BARs to PAGE_SIZE on powernv platform
  2012-09-05  5:27               ` Alexey Kardashevskiy
@ 2012-09-10 17:06                 ` Alex Williamson
  0 siblings, 0 replies; 25+ messages in thread
From: Alex Williamson @ 2012-09-10 17:06 UTC (permalink / raw)
  To: Alexey Kardashevskiy; +Cc: Paul Mackerras, linuxppc-dev, David Gibson

On Wed, 2012-09-05 at 15:27 +1000, Alexey Kardashevskiy wrote:
> On 05/09/12 15:17, Benjamin Herrenschmidt wrote:
> > On Tue, 2012-09-04 at 22:57 -0600, Alex Williamson wrote:
> >
> >> Do we need an extra region info field, or is it sufficient that we
> >> define a region to be mmap'able with getpagesize() pages when the MMAP
> >> flag is set and simply offset the region within the device fd?  ex.
> >
> > Alexey ? You mentioned you had ways to get at the offset with the
> > existing interfaces ?
> 
> 
> Yes, VFIO_DEVICE_GET_REGION_INFO ioctl of vfio-pci host driver, the "info" 
> struct has an "offset" field.
> I just do not have a place to use it in the QEMU right now as the guest 
> does the same allocation as the host does (by accident).

Yep, this is the offset into the device fd though.  We currently used a
fixed 40bit region for each BAR, but that's mostly a leftover from
before the API described the offset.  It's a little bit of an
optimization on the kernel side to convert offset->BAR w/o a lookup, but
we're hopefully mmap'ing as much as possible, modulo the page size
issues here.

> >> BAR0: 0x10000 /* no offset */
> >> BAR1: 0x21000 /* 4k offset */
> >> BAR2: 0x32000 /* 8k offset */
> >>
> >> A second level optimization might make these 0x10000, 0x11000, 0x12000.
> >>
> >> This will obviously require some arch hooks w/in vfio as we can't do
> >> this on x86 since we can't guarantee that whatever lives in the
> >> overflow/gaps is in the same group and power is going to need to make
> >> sure we don't accidentally allow msix table mapping... in fact hiding
> >> the msix table might be a lot more troublesome on 64k page hosts.
> >
> > Fortunately, our guests don't access the msix table directly anyway, at
> > least most of the time :-)
> 
> 
> Not at all in our case. It took me some time to push a QEMU patch which 
> changes msix table :)

vfio needs to be safe regardless of whether it's being used by qemu or
some other userspace driver though.

> > There's a paravirt API for it, and our iommu
> > makes sure that if for some reason the guest still accesses it and does
> > the wrong thing to it, the side effects will be contained to the guest.

If direct access to the MSIX table neither leaks information nor leads
to exploitable holes, then I have no problem allowing a platform hook to
make it mmap'able.  We should be looking at this for x86 too on
platforms where we have interrupt remapping capabilities.

> >>> Now the main problem here is going to be that the guest itself might
> >>> reallocate the BAR and move it around (well, it's version of the BAR
> >>> which isn't the real thing), and so we cannot create a direct MMU
> >>> mapping between -that- and the real BAR.
> >>>
> >>> IE. We can only allow that direct mapping if the guest BAR mapping has
> >>> the same "offset within page" as the host BAR mapping.
> >>
> >> Euw...
> >
> > Yeah sucks :-) Basically, let's say page size is 64K. Host side BAR
> > (real BAR) is at 0xf0001000.
> >
> > qemu maps 0xf0000000..0xf000ffff to a virtual address inside QEMU,
> > itself 64k aligned, let's say 0x80000000 and knows that the BAR is at
> > offset 0x1000 in there.
> >
> > However, the KVM "MR" API is such that we can only map PAGE_SIZE regions
> > into the guest as well, so if the guest assigns a value ADDR to the
> > guest BAR, let's say 0x40002000, all KVM can do is an MR that maps
> > 0x40000000 (guest physical) to 0x80000000 (qemu). Any access within that
> > 64K page will have the low bits transferred directly from guest to HW.
> >
> > So the guest will end up having that 0x2000 offset instead of the 0x1000
> > needed to actually access the BAR. FAIL.
> >
> > There are ways to fix that but all are nasty.
> >
> >   - In theory, we have the capability (and use it today) to restrict IO
> > mappings in the guest to 4K HW pages, so knowing that, KVM could use a
> > "special" MR that plays tricks here... but that would break all sort of
> > generic code both in qemu and kvm and generally be very nasty.
> >
> >   - The best approach is to rely on the fact that our guest kernels don't
> > do BAR assignment, they rely on FW to do it (ie not at all, unlike x86,
> > we can't even fixup because in the general case, the hypervisor won't
> > let us anyway). So we could move our guest BAR allocation code out of
> > our guest firmware (SLOF) back into qemu (where we had it very early
> > on), which allows us to make sure that the guest BAR values we assign
> > have the same "offset within the page" as the host side values. This
> > would also allow us to avoid messing up too many MRs (this can have a
> > performance impact with KVM) and eventually handle our "group" regions
> > instead of individual BARs for mappings. We might need to do that anyway
> > in the long run for hotplug as our hotplug hypervisor APIs also rely on
> > the "new" hotplugged devices to have the BARs pre-assigned when they get
> > handed out to the guest.

Ok, now it's making more sense how the original patch here was
beneficial.  If the physical BAR is 64k aligned we could expose the BAR
as being at least 64k and therefore everything would line up.  We have
the same issue on x86 where devices might have <4k BARs and we'd prefer
to mmap them.  So far we've successfully ignored them because they're
not "high performance" devices and because we can't always assume the
extra space is unused or even safe to access.  Obviously the problem
gets much worse at 64k.

You know that the MMIO space consumed by a group is 64k aligned, but
individual BARs are not.  Guest and host may end up with different BAR
offsets within a 64k page though, so that doesn't buy us much.  Blech.
Yeah, relying on the guest not changing mappings is pretty ugly, but may
be a solution.  For the general case though we'd really prefer each BAR
to be treated as a minimum 64k then exposed to the user (qemu) as the
same.  Doing that across a system is pretty wasteful and remapping is
not always possible.  You almost want a setting per PE to specify how
BARs are aligned (and x86 might want the same, but it's not as clear how
to define what devices to apply it to).  Hard problem.  Thanks,

Alex

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] vfio: enabled and supported on power (v7)
  2012-09-10 16:02   ` [PATCH] vfio: enabled and supported on power (v7) Alex Williamson
@ 2012-09-11  8:28     ` Alexey Kardashevskiy
  2012-09-13 22:34       ` Alex Williamson
  0 siblings, 1 reply; 25+ messages in thread
From: Alexey Kardashevskiy @ 2012-09-11  8:28 UTC (permalink / raw)
  To: Alex Williamson; +Cc: Paul Mackerras, linuxppc-dev, David Gibson

On 11/09/12 02:02, Alex Williamson wrote:
> On Tue, 2012-09-04 at 17:33 +1000, Alexey Kardashevskiy wrote:
>> Cc: David Gibson <david@gibson.dropbear.id.au>
>> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
>> Cc: Paul Mackerras <paulus@samba.org>
>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>> ---
>
> Please at least cc kvm@vger as well since we list that as the devel list
> for vfio.
>
>>   arch/powerpc/include/asm/iommu.h    |    3 +
>
> I'll need an ack from Ben or Paul for this change.
>
>>   drivers/iommu/Kconfig               |    8 +
>>   drivers/vfio/Kconfig                |    6 +
>>   drivers/vfio/Makefile               |    1 +
>>   drivers/vfio/vfio_iommu_spapr_tce.c |  440 +++++++++++++++++++++++++++++++++++
>>   include/linux/vfio.h                |   29 +++
>>   6 files changed, 487 insertions(+)
>>   create mode 100644 drivers/vfio/vfio_iommu_spapr_tce.c
>>
>> diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
>> index 957a83f..c64bce7 100644
>> --- a/arch/powerpc/include/asm/iommu.h
>> +++ b/arch/powerpc/include/asm/iommu.h
>> @@ -66,6 +66,9 @@ struct iommu_table {
>>   	unsigned long  it_halfpoint; /* Breaking point for small/large allocs */
>>   	spinlock_t     it_lock;      /* Protects it_map */
>>   	unsigned long *it_map;       /* A simple allocation bitmap for now */
>> +#ifdef CONFIG_IOMMU_API
>> +	struct iommu_group *it_group;
>> +#endif
>>   };
>
> This seems to only be valid when vfio_iommu_spapr_tce is loaded, which
> is a bit misleading.
>
>>
>>   struct scatterlist;
>> diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
>> index 3bd9fff..19cf2d9 100644
>> --- a/drivers/iommu/Kconfig
>> +++ b/drivers/iommu/Kconfig
>> @@ -162,4 +162,12 @@ config TEGRA_IOMMU_SMMU
>>   	  space through the SMMU (System Memory Management Unit)
>>   	  hardware included on Tegra SoCs.
>>
>> +config SPAPR_TCE_IOMMU
>> +	bool "sPAPR TCE IOMMU Support"
>> +	depends on PPC_PSERIES
>> +	select IOMMU_API
>> +	help
>> +	  Enables bits of IOMMU API required by VFIO. The iommu_ops is
>> +	  still not implemented.
>> +
>>   endif # IOMMU_SUPPORT
>> diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
>> index 7cd5dec..b464687 100644
>> --- a/drivers/vfio/Kconfig
>> +++ b/drivers/vfio/Kconfig
>> @@ -3,10 +3,16 @@ config VFIO_IOMMU_TYPE1
>>   	depends on VFIO
>>   	default n
>>
>> +config VFIO_IOMMU_SPAPR_TCE
>> +	tristate
>> +	depends on VFIO && SPAPR_TCE_IOMMU
>> +	default n
>> +
>>   menuconfig VFIO
>>   	tristate "VFIO Non-Privileged userspace driver framework"
>>   	depends on IOMMU_API
>>   	select VFIO_IOMMU_TYPE1 if X86
>> +	select VFIO_IOMMU_SPAPR_TCE if PPC_POWERNV
>>   	help
>>   	  VFIO provides a framework for secure userspace device drivers.
>>   	  See Documentation/vfio.txt for more details.
>> diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
>> index 2398d4a..72bfabc 100644
>> --- a/drivers/vfio/Makefile
>> +++ b/drivers/vfio/Makefile
>> @@ -1,3 +1,4 @@
>>   obj-$(CONFIG_VFIO) += vfio.o
>>   obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o
>> +obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o
>>   obj-$(CONFIG_VFIO_PCI) += pci/
>> diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
>> new file mode 100644
>> index 0000000..21f1909
>> --- /dev/null
>> +++ b/drivers/vfio/vfio_iommu_spapr_tce.c
>> @@ -0,0 +1,440 @@
>> +/*
>> + * VFIO: IOMMU DMA mapping support for TCE on POWER
>> + *
>> + * Copyright (C) 2012 IBM Corp.  All rights reserved.
>> + *     Author: Alexey Kardashevskiy <aik@ozlabs.ru>
>> + *
>> + * This program is free software; you can redistribute it and/or modify
>> + * it under the terms of the GNU General Public License version 2 as
>> + * published by the Free Software Foundation.
>> + *
>> + * Derived from original vfio_iommu_x86.c:
>
> Should this be _type1?  Only the mail archives are going to remember
> there was a _x86, so the renamed version is probably a better reference.
>
>> + * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
>> + *     Author: Alex Williamson <alex.williamson@redhat.com>
>> + */
>> +
>> +#include <linux/module.h>
>> +#include <linux/pci.h>
>> +#include <linux/slab.h>
>> +#include <linux/uaccess.h>
>> +#include <linux/err.h>
>> +#include <linux/vfio.h>
>> +#include <linux/spinlock.h>
>> +#include <asm/iommu.h>
>> +
>> +#define DRIVER_VERSION  "0.1"
>> +#define DRIVER_AUTHOR   "aik@ozlabs.ru"
>> +#define DRIVER_DESC     "VFIO IOMMU SPAPR TCE"
>> +
>> +
>> +/*
>> + * SPAPR TCE API
>> + */
>> +static void tce_free(struct iommu_table *tbl, unsigned long entry,
>> +		unsigned long tce)
>> +{
>> +	struct page *page = pfn_to_page(tce >> PAGE_SHIFT);
>> +
>> +	WARN_ON(!page);
>> +	if (page) {
>> +		if (tce & VFIO_SPAPR_TCE_WRITE)
>> +			SetPageDirty(page);
>> +		put_page(page);
>> +	}
>> +	ppc_md.tce_free(tbl, entry, 1);
>> +}
>> +
>> +static long tce_put(struct iommu_table *tbl,
>> +		unsigned long entry, uint64_t tce, uint32_t flags)
>> +{
>> +	int ret;
>> +	unsigned long oldtce, kva, offset;
>> +	struct page *page = NULL;
>> +	enum dma_data_direction direction = DMA_NONE;
>> +
>> +	switch (flags & VFIO_SPAPR_TCE_PUT_MASK) {
>> +	case VFIO_SPAPR_TCE_READ:
>> +		direction = DMA_TO_DEVICE;
>> +		break;
>> +	case VFIO_SPAPR_TCE_WRITE:
>> +		direction = DMA_FROM_DEVICE;
>> +		break;
>> +	case VFIO_SPAPR_TCE_BIDIRECTIONAL:
>> +		direction = DMA_BIDIRECTIONAL;
>> +		break;
>> +	}
>> +
>> +	oldtce = ppc_md.tce_get(tbl, entry);
>> +
>> +	/* Free page if still allocated */
>> +	if (oldtce & VFIO_SPAPR_TCE_PUT_MASK)
>> +		tce_free(tbl, entry, oldtce);
>> +
>> +	/* Map new TCE */
>> +	if (direction != DMA_NONE) {
>> +		offset = (tce & IOMMU_PAGE_MASK) - (tce & PAGE_MASK);
>> +		ret = get_user_pages_fast(tce & PAGE_MASK, 1,
>> +				direction != DMA_TO_DEVICE, &page);
>> +		BUG_ON(ret > 1);
>
> Can this happen?
>
>> +		if (ret < 1) {
>> +			printk(KERN_ERR "tce_vfio: get_user_pages_fast failed "
>> +					"tce=%llx ioba=%lx ret=%d\n",
>> +					tce, entry << IOMMU_PAGE_SHIFT, ret);
>> +			if (!ret)
>> +				ret = -EFAULT;
>> +			goto unlock_exit;
>> +		}
>> +
>> +		kva = (unsigned long) page_address(page);
>> +		kva += offset;
>> +		BUG_ON(!kva);
>
> Same here, can it happen?  If so, should it BUG or catch the below
> EINVAL?
>
>> +		if (WARN_ON(kva & ~IOMMU_PAGE_MASK))
>> +			return -EINVAL;
>
> Page leak?  Don't we want to do a put_page(), which means we probably
> want a goto exit here.
>
>> +
>> +		/* Preserve access bits */
>> +		kva |= flags & VFIO_SPAPR_TCE_PUT_MASK;
>> +
>> +		/* tce_build receives a virtual address */
>> +		entry += tbl->it_offset;	/* Offset into real TCE table */
>> +		ret = ppc_md.tce_build(tbl, entry, 1, kva, direction, NULL);
>> +
>> +		/* tce_build() only returns non-zero for transient errors */
>> +		if (unlikely(ret)) {
>> +			printk(KERN_ERR "tce_vfio: Failed to add TCE\n");
>> +			ret = -EIO;
>> +			goto unlock_exit;
>> +		}
>> +	}
>> +	/* Flush/invalidate TLB caches if necessary */
>> +	if (ppc_md.tce_flush)
>> +		ppc_md.tce_flush(tbl);
>> +
>> +	/* Make sure updates are seen by hardware */
>> +	mb();
>> +
>> +unlock_exit:
>
> unlock seems wrong here, I had to go re-read the code looking for the
> lock.
>
>> +	if (ret && page)
>> +		put_page(page);
>> +
>> +	if (ret)
>> +		printk(KERN_ERR "tce_vfio: tce_put failed on tce=%llx "
>> +				"ioba=%lx kva=%lx\n", tce,
>> +				entry << IOMMU_PAGE_SHIFT, kva);
>> +	return ret;
>> +}
>> +
>> +/*
>> + * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
>> + */
>> +
>> +/*
>> + * The container descriptor supports only a single group per container.
>> + * Required by the API as the container is not supplied with the IOMMU group
>> + * at the moment of initialization.
>> + */
>> +struct tce_container {
>> +	struct iommu_table *tbl;
>> +};
>> +
>> +static void *tce_iommu_open(unsigned long arg)
>> +{
>> +	struct tce_container *container;
>> +
>> +	if (arg != VFIO_SPAPR_TCE_IOMMU) {
>> +		printk(KERN_ERR "tce_vfio: Wrong IOMMU type\n");
>> +		return ERR_PTR(-EINVAL);
>> +	}
>> +
>> +	container = kzalloc(sizeof(*container), GFP_KERNEL);
>> +	if (!container)
>> +		return ERR_PTR(-ENOMEM);
>> +
>> +	return container;
>> +}
>> +
>> +static void tce_iommu_release(void *iommu_data)
>> +{
>> +	struct tce_container *container = iommu_data;
>> +	struct iommu_table *tbl = container->tbl;
>> +	unsigned long i, tce;
>> +
>
> This will segfault if releasing a container that never had a device
> attached.
>
>> +	/* Unmap leftovers */
>> +	spin_lock_irq(&tbl->it_lock);
>> +	for (i = tbl->it_offset; i < tbl->it_offset + tbl->it_size; ++i) {
>> +		tce = ppc_md.tce_get(tbl, i);
>> +		if (tce & VFIO_SPAPR_TCE_PUT_MASK)
>> +			tce_free(tbl, i, tce);
>> +	}
>> +	/* Flush/invalidate TLB caches if necessary */
>> +	if (ppc_md.tce_flush)
>> +		ppc_md.tce_flush(tbl);
>> +
>> +	/* Make sure updates are seen by hardware */
>> +	mb();
>> +
>> +	spin_unlock_irq(&tbl->it_lock);
>> +
>> +	kfree(container);
>> +}
>> +
>> +static long tce_iommu_ioctl(void *iommu_data,
>> +				 unsigned int cmd, unsigned long arg)
>> +{
>> +	struct tce_container *container = iommu_data;
>> +	unsigned long minsz;
>> +	long ret;
>> +
>> +	switch (cmd) {
>> +	case VFIO_CHECK_EXTENSION: {
>> +		return (arg == VFIO_SPAPR_TCE_IOMMU) ? 1 : 0;
>> +	}
>> +	case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
>> +		struct vfio_iommu_spapr_tce_info info;
>> +		struct iommu_table *tbl = container->tbl;
>> +
>> +		minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
>> +				dma64_window_size);
>> +
>> +		if (copy_from_user(&info, (void __user *)arg, minsz))
>> +			return -EFAULT;
>> +
>> +		if (info.argsz < minsz)
>> +			return -EINVAL;
>> +
>> +		if (!tbl)
>> +			return -ENXIO;
>
> nit: why not check this earlier?
>
>> +
>> +		info.dma32_window_start = tbl->it_offset << IOMMU_PAGE_SHIFT;
>> +		info.dma32_window_size = tbl->it_size << IOMMU_PAGE_SHIFT;
>> +		info.dma64_window_start = 0;
>> +		info.dma64_window_size = 0;
>> +		info.flags = 0;
>> +
>> +		return copy_to_user((void __user *)arg, &info, minsz);
>> +	}
>> +	case VFIO_IOMMU_SPAPR_TCE_PUT: {
>> +		struct vfio_iommu_spapr_tce_put par;
>> +		struct iommu_table *tbl = container->tbl;
>> +
>> +		minsz = offsetofend(struct vfio_iommu_spapr_tce_put, tce);
>> +
>> +		if (copy_from_user(&par, (void __user *)arg, minsz))
>> +			return -EFAULT;
>> +
>> +		if (par.argsz < minsz)
>> +			return -EINVAL;
>> +
>> +		if (!tbl) {
>> +			return -ENXIO;
>> +		}
>
> Same, plus drop the braces.
>
>> +
>> +		spin_lock_irq(&tbl->it_lock);
>> +		ret = tce_put(tbl, par.ioba >> IOMMU_PAGE_SHIFT,
>> +				par.tce, par.flags);
>> +		spin_unlock_irq(&tbl->it_lock);
>> +
>> +		return ret;
>> +	}
>
> Is "PUT" really the name we want for this?


Yes, it is a single H_PUT_TCE hypercall from the POWER architecture spec.


>> +	default:
>> +		printk(KERN_WARNING "tce_vfio: unexpected cmd %x\n", cmd);
>> +	}
>> +
>> +	return -ENOTTY;
>> +}
>> +
>> +static int tce_iommu_attach_group(void *iommu_data,
>> +		struct iommu_group *iommu_group)
>> +{
>> +	struct tce_container *container = iommu_data;
>> +	struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
>> +
>> +	printk(KERN_DEBUG "tce_vfio: Attaching group #%u to iommu %p\n",
>> +			iommu_group_id(iommu_group), iommu_group);
>
> Let's use pr_debug() and friends throughout.
>
>> +	if (container->tbl) {
>> +		printk(KERN_WARNING "tce_vfio: Only one group per IOMMU "
>> +				"container is allowed, "
>> +				"existing id=%d, attaching id=%d\n",
>> +				iommu_group_id(container->tbl->it_group),
>> +				iommu_group_id(iommu_group));
>> +		return -EBUSY;
>> +	}
>> +
>
> _type1 has a lock to avoid races here, I think you might need one too.
>
>> +	container->tbl = tbl;
>> +
>> +	return 0;
>> +}
>> +
>> +static void tce_iommu_detach_group(void *iommu_data,
>> +		struct iommu_group *iommu_group)
>> +{
>> +	struct tce_container *container = iommu_data;
>> +	struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
>> +
>> +	BUG_ON(!tbl);
>
> Needed?  If so, why is there no check on attach?

Added to attach() :)


>
>> +	if (tbl != container->tbl) {
>> +		printk(KERN_WARNING "tce_vfio: detaching group #%u, expected "
>> +				"group is #%u\n", iommu_group_id(iommu_group),
>> +				iommu_group_id(tbl->it_group));
>> +		return;
>> +	}
>> +	printk(KERN_DEBUG "tce_vfio: detaching group #%u from iommu %p\n",
>> +			iommu_group_id(iommu_group), iommu_group);
>
> container->tbl = NULL?


Then I won't be able to release pages in tce_iommu_release().
Releasing pages in tce_iommu_detach_group() caused some other problems;
I cannot recall now which ones.


>> +}
>> +
>> +const struct vfio_iommu_driver_ops tce_iommu_driver_ops = {
>> +	.name		= "iommu-vfio-powerpc",
>> +	.owner		= THIS_MODULE,
>> +	.open		= tce_iommu_open,
>> +	.release	= tce_iommu_release,
>> +	.ioctl		= tce_iommu_ioctl,
>> +	.attach_group	= tce_iommu_attach_group,
>> +	.detach_group	= tce_iommu_detach_group,
>> +};
>> +
>> +/*
>> + * Add/delete devices support (hotplug, module_init, module_exit)
>> + */
>> +static int add_device(struct device *dev)
>> +{
>> +	struct iommu_table *tbl;
>> +	int ret = 0;
>> +
>> +	if (dev->iommu_group) {
>> +		printk(KERN_WARNING "tce_vfio: device %s is already in iommu "
>> +				"group %d, skipping\n", dev->kobj.name,
>
> Watch line wrapping on strings.

Pardon?


>> +				iommu_group_id(dev->iommu_group));
>> +		return -EBUSY;
>> +	}
>> +
>> +	tbl = get_iommu_table_base(dev);
>> +	if (!tbl) {
>> +		printk(KERN_DEBUG "tce_vfio: skipping device %s with no tbl\n",
>> +				dev->kobj.name);
>> +		return 0;
>> +	}
>> +
>> +	printk(KERN_DEBUG "tce_vfio: adding %s to iommu group %d\n",
>> +			dev->kobj.name, iommu_group_id(tbl->it_group));
>> +
>> +	ret = iommu_group_add_device(tbl->it_group, dev);
>> +	if (ret < 0)
>> +		printk(KERN_ERR "tce_vfio: %s has not been added, ret=%d\n",
>> +				dev->kobj.name, ret);
>> +
>> +	return ret;
>> +}
>> +
>> +static void del_device(struct device *dev)
>> +{
>> +	iommu_group_remove_device(dev);
>> +}
>> +
>> +static int iommu_bus_notifier(struct notifier_block *nb,
>> +			      unsigned long action, void *data)
>> +{
>> +	struct device *dev = data;
>> +
>> +	switch (action) {
>> +	case BUS_NOTIFY_ADD_DEVICE:
>> +		return add_device(dev);
>> +	case BUS_NOTIFY_DEL_DEVICE:
>> +		del_device(dev);
>> +		return 0;
>> +	default:
>> +		return 0;
>> +	}
>> +}
>> +
>> +static struct notifier_block tce_iommu_bus_nb = {
>> +	.notifier_call = iommu_bus_notifier,
>> +};
>> +
>> +void group_release(void *iommu_data)
>> +{
>> +	struct iommu_table *tbl = iommu_data;
>> +	tbl->it_group = NULL;
>> +}
>> +
>> +static int __init tce_iommu_init(void)
>> +{
>> +	struct pci_dev *pdev = NULL;
>> +	struct iommu_table *tbl;
>> +	struct iommu_group *grp;
>> +
>> +	/* If the current platform does not support tce_get
>> +	   we are unable to clean TCE table properly and
>> +	   therefore it is better not to touch it at all */
>> +	if (!ppc_md.tce_get) {
>> +		printk(KERN_ERR "tce_vfio: ppc_md.tce_get isn't implemented\n");
>> +		return -EOPNOTSUPP;
>> +	}
>> +
>> +	bus_register_notifier(&pci_bus_type, &tce_iommu_bus_nb);
>> +
>> +	/* Allocate and initialize VFIO groups */
>
> s/VFIO groups/IOMMU groups/
>
>> +	for_each_pci_dev(pdev) {
>> +		tbl = get_iommu_table_base(&pdev->dev);
>> +		if (!tbl)
>> +			continue;
>> +
>> +		/* Skip already initialized */
>> +		if (tbl->it_group)
>> +			continue;
>> +
>> +		grp = iommu_group_alloc();
>> +		if (IS_ERR(grp)) {
>> +			printk(KERN_INFO "tce_vfio: cannot create "
>> +					"new IOMMU group, ret=%ld\n",
>> +					PTR_ERR(grp));
>> +			return -EFAULT;
>> +		}
>> +		tbl->it_group = grp;
>> +		iommu_group_set_iommudata(grp, tbl, group_release);
>> +	}
>> +
>> +	/* Add PCI devices to VFIO groups */
>> +	for_each_pci_dev(pdev)
>> +		add_device(&pdev->dev);
>> +
>> +	return vfio_register_iommu_driver(&tce_iommu_driver_ops);
>> +}
>> +
>> +static void __exit tce_iommu_cleanup(void)
>> +{
>> +	struct pci_dev *pdev = NULL;
>> +	struct iommu_table *tbl;
>> +	struct iommu_group *grp = NULL;
>> +
>> +	bus_unregister_notifier(&pci_bus_type, &tce_iommu_bus_nb);
>> +
>> +	/* Delete PCI devices from VFIO groups */
>> +	for_each_pci_dev(pdev)
>> +		del_device(&pdev->dev);
>> +
>> +	/* Release VFIO groups */
>> +	for_each_pci_dev(pdev) {
>> +		tbl = get_iommu_table_base(&pdev->dev);
>> +		if (!tbl)
>> +			continue;
>> +		grp = tbl->it_group;
>> +
>> +		/* Skip (already) uninitialized */
>> +		if (!grp)
>> +			continue;
>> +
>> +		/* Do actual release, group_release() is expected to work */
>> +		iommu_group_put(grp);
>> +		BUG_ON(tbl->it_group);
>> +	}
>> +
>
>
> It troubles me a bit that you're using the vfio driver to initialize and
> tear down IOMMU groups on your platform.


I am not following you here. Could you please explain a bit?



> VFIO makes use of IOMMU groups
> and is the only user so far, but they're hopefully useful beyond this.
> In fact, VFIO used to manage assembling all groups from data provided by
> the IOMMU but David wanted to see IOMMU groups be a more universally
> available feature, so it's odd to see POWER implementing it this way.


David, help! :)


>> +	vfio_unregister_iommu_driver(&tce_iommu_driver_ops);
>> +}
>> +
>> +module_init(tce_iommu_init);
>> +module_exit(tce_iommu_cleanup);
>> +
>> +MODULE_VERSION(DRIVER_VERSION);
>> +MODULE_LICENSE("GPL v2");
>> +MODULE_AUTHOR(DRIVER_AUTHOR);
>> +MODULE_DESCRIPTION(DRIVER_DESC);
>> +
>> diff --git a/include/linux/vfio.h b/include/linux/vfio.h
>> index 0a4f180..2c0a927 100644
>> --- a/include/linux/vfio.h
>> +++ b/include/linux/vfio.h
>> @@ -99,6 +99,7 @@ extern void vfio_unregister_iommu_driver(
>>   /* Extensions */
>>
>>   #define VFIO_TYPE1_IOMMU		1
>> +#define VFIO_SPAPR_TCE_IOMMU		2
>>
>>   /*
>>    * The IOCTL interface is designed for extensibility by embedding the
>> @@ -442,4 +443,32 @@ struct vfio_iommu_type1_dma_unmap {
>>
>>   #define VFIO_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14)
>>
>> +/* -------- API for SPAPR TCE (Server POWERPC) IOMMU -------- */
>> +
>> +struct vfio_iommu_spapr_tce_info {
>> +	__u32 argsz;
>> +	__u32 flags;
>> +	__u32 dma32_window_start;
>> +	__u32 dma32_window_size;
>> +	__u64 dma64_window_start;
>> +	__u64 dma64_window_size;
>> +};
>> +
>> +#define VFIO_IOMMU_SPAPR_TCE_GET_INFO	_IO(VFIO_TYPE, VFIO_BASE + 12)
>> +
>> +struct vfio_iommu_spapr_tce_put {
>> +	__u32 argsz;
>> +	__u32 flags;
>> +#define VFIO_SPAPR_TCE_READ		1
>> +#define VFIO_SPAPR_TCE_WRITE		2
>> +#define VFIO_SPAPR_TCE_BIDIRECTIONAL	(VFIO_SPAPR_TCE_READ|VFIO_SPAPR_TCE_WRITE)
>> +#define VFIO_SPAPR_TCE_PUT_MASK		VFIO_SPAPR_TCE_BIDIRECTIONAL
>> +	__u64 ioba;
>> +	__u64 tce;
>> +};
>
> Ok, so if READ & WRITE are both clear and ioba is set, that's an
> "unmap"?  This is exactly why _type1 has a MAP and UNMAP, to make it
> clear which fields are necessary for which call.  I think we should
> probably do the same here.  Besides, _put makes me think there should be
> a _get; do these have some unique meaning in POWER?


It is a single H_PUT_TCE for putting a record into the TCE table. The guest 
calls H_PUT_TCE, QEMU replaces the address and simply forwards the call to 
the host. Calling them map/unmap makes it less clear for powerpc people :)


>
>> +
>> +#define VFIO_IOMMU_SPAPR_TCE_PUT	_IO(VFIO_TYPE, VFIO_BASE + 13)
>> +
>
> Please document what all of the above means.  Thanks,


Something like this?
/*
  * The VFIO_IOMMU_SPAPR_TCE_PUT is implemented as the H_PUT_TCE hypercall.
  * ioba - I/O Bus Address for indexing into TCE table
  * tce - logical address of storage
  *
  * Non-zero flags mean adding a new page into the table.
  * Zero flags mean releasing the existing page and clearing the
  * TCE table entry.
  */




> Alex
>
>> +/* ***************************************************************** */
>> +
>>   #endif /* VFIO_H */
>
>
>


-- 
Alexey

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] vfio: enabled and supported on power (v7)
  2012-09-11  8:28     ` Alexey Kardashevskiy
@ 2012-09-13 22:34       ` Alex Williamson
  2012-09-13 22:41         ` Scott Wood
  2012-09-14  0:51         ` Alexey Kardashevskiy
  0 siblings, 2 replies; 25+ messages in thread
From: Alex Williamson @ 2012-09-13 22:34 UTC (permalink / raw)
  To: Alexey Kardashevskiy; +Cc: Paul Mackerras, linuxppc-dev, David Gibson

On Tue, 2012-09-11 at 18:28 +1000, Alexey Kardashevskiy wrote:
> On 11/09/12 02:02, Alex Williamson wrote:
> > On Tue, 2012-09-04 at 17:33 +1000, Alexey Kardashevskiy wrote:
> >> Cc: David Gibson <david@gibson.dropbear.id.au>
> >> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
> >> Cc: Paul Mackerras <paulus@samba.org>
> >> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> >> ---
> >
> > Please at least cc kvm@vger as well since we list that as the devel list
> > for vfio.
> >
> >>   arch/powerpc/include/asm/iommu.h    |    3 +
> >
> > I'll need an ack from Ben or Paul for this change.
> >
> >>   drivers/iommu/Kconfig               |    8 +
> >>   drivers/vfio/Kconfig                |    6 +
> >>   drivers/vfio/Makefile               |    1 +
> >>   drivers/vfio/vfio_iommu_spapr_tce.c |  440 +++++++++++++++++++++++++++++++++++
> >>   include/linux/vfio.h                |   29 +++
> >>   6 files changed, 487 insertions(+)
> >>   create mode 100644 drivers/vfio/vfio_iommu_spapr_tce.c
> >>
> >> diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
> >> index 957a83f..c64bce7 100644
> >> --- a/arch/powerpc/include/asm/iommu.h
> >> +++ b/arch/powerpc/include/asm/iommu.h
> >> @@ -66,6 +66,9 @@ struct iommu_table {
> >>   	unsigned long  it_halfpoint; /* Breaking point for small/large allocs */
> >>   	spinlock_t     it_lock;      /* Protects it_map */
> >>   	unsigned long *it_map;       /* A simple allocation bitmap for now */
> >> +#ifdef CONFIG_IOMMU_API
> >> +	struct iommu_group *it_group;
> >> +#endif
> >>   };
> >
> > This seems to only be valid when vfio_iommu_spapr_tce is loaded, which
> > is a bit misleading.
> >
> >>
> >>   struct scatterlist;
> >> diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
> >> index 3bd9fff..19cf2d9 100644
> >> --- a/drivers/iommu/Kconfig
> >> +++ b/drivers/iommu/Kconfig
> >> @@ -162,4 +162,12 @@ config TEGRA_IOMMU_SMMU
> >>   	  space through the SMMU (System Memory Management Unit)
> >>   	  hardware included on Tegra SoCs.
> >>
> >> +config SPAPR_TCE_IOMMU
> >> +	bool "sPAPR TCE IOMMU Support"
> >> +	depends on PPC_PSERIES
> >> +	select IOMMU_API
> >> +	help
> >> +	  Enables bits of IOMMU API required by VFIO. The iommu_ops is
> >> +	  still not implemented.
> >> +
> >>   endif # IOMMU_SUPPORT
> >> diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
> >> index 7cd5dec..b464687 100644
> >> --- a/drivers/vfio/Kconfig
> >> +++ b/drivers/vfio/Kconfig
> >> @@ -3,10 +3,16 @@ config VFIO_IOMMU_TYPE1
> >>   	depends on VFIO
> >>   	default n
> >>
> >> +config VFIO_IOMMU_SPAPR_TCE
> >> +	tristate
> >> +	depends on VFIO && SPAPR_TCE_IOMMU
> >> +	default n
> >> +
> >>   menuconfig VFIO
> >>   	tristate "VFIO Non-Privileged userspace driver framework"
> >>   	depends on IOMMU_API
> >>   	select VFIO_IOMMU_TYPE1 if X86
> >> +	select VFIO_IOMMU_SPAPR_TCE if PPC_POWERNV
> >>   	help
> >>   	  VFIO provides a framework for secure userspace device drivers.
> >>   	  See Documentation/vfio.txt for more details.
> >> diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
> >> index 2398d4a..72bfabc 100644
> >> --- a/drivers/vfio/Makefile
> >> +++ b/drivers/vfio/Makefile
> >> @@ -1,3 +1,4 @@
> >>   obj-$(CONFIG_VFIO) += vfio.o
> >>   obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o
> >> +obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o
> >>   obj-$(CONFIG_VFIO_PCI) += pci/
> >> diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
> >> new file mode 100644
> >> index 0000000..21f1909
> >> --- /dev/null
> >> +++ b/drivers/vfio/vfio_iommu_spapr_tce.c
> >> @@ -0,0 +1,440 @@
> >> +/*
> >> + * VFIO: IOMMU DMA mapping support for TCE on POWER
> >> + *
> >> + * Copyright (C) 2012 IBM Corp.  All rights reserved.
> >> + *     Author: Alexey Kardashevskiy <aik@ozlabs.ru>
> >> + *
> >> + * This program is free software; you can redistribute it and/or modify
> >> + * it under the terms of the GNU General Public License version 2 as
> >> + * published by the Free Software Foundation.
> >> + *
> >> + * Derived from original vfio_iommu_x86.c:
> >
> > Should this be _type1?  Only the mail archives are going to remember
> > there was a _x86, so the renamed version is probably a better reference.
> >
> >> + * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
> >> + *     Author: Alex Williamson <alex.williamson@redhat.com>
> >> + */
> >> +
> >> +#include <linux/module.h>
> >> +#include <linux/pci.h>
> >> +#include <linux/slab.h>
> >> +#include <linux/uaccess.h>
> >> +#include <linux/err.h>
> >> +#include <linux/vfio.h>
> >> +#include <linux/spinlock.h>
> >> +#include <asm/iommu.h>
> >> +
> >> +#define DRIVER_VERSION  "0.1"
> >> +#define DRIVER_AUTHOR   "aik@ozlabs.ru"
> >> +#define DRIVER_DESC     "VFIO IOMMU SPAPR TCE"
> >> +
> >> +
> >> +/*
> >> + * SPAPR TCE API
> >> + */
> >> +static void tce_free(struct iommu_table *tbl, unsigned long entry,
> >> +		unsigned long tce)
> >> +{
> >> +	struct page *page = pfn_to_page(tce >> PAGE_SHIFT);
> >> +
> >> +	WARN_ON(!page);
> >> +	if (page) {
> >> +		if (tce & VFIO_SPAPR_TCE_WRITE)
> >> +			SetPageDirty(page);
> >> +		put_page(page);
> >> +	}
> >> +	ppc_md.tce_free(tbl, entry, 1);
> >> +}
> >> +
> >> +static long tce_put(struct iommu_table *tbl,
> >> +		unsigned long entry, uint64_t tce, uint32_t flags)
> >> +{
> >> +	int ret;
> >> +	unsigned long oldtce, kva, offset;
> >> +	struct page *page = NULL;
> >> +	enum dma_data_direction direction = DMA_NONE;
> >> +
> >> +	switch (flags & VFIO_SPAPR_TCE_PUT_MASK) {
> >> +	case VFIO_SPAPR_TCE_READ:
> >> +		direction = DMA_TO_DEVICE;
> >> +		break;
> >> +	case VFIO_SPAPR_TCE_WRITE:
> >> +		direction = DMA_FROM_DEVICE;
> >> +		break;
> >> +	case VFIO_SPAPR_TCE_BIDIRECTIONAL:
> >> +		direction = DMA_BIDIRECTIONAL;
> >> +		break;
> >> +	}
> >> +
> >> +	oldtce = ppc_md.tce_get(tbl, entry);
> >> +
> >> +	/* Free page if still allocated */
> >> +	if (oldtce & VFIO_SPAPR_TCE_PUT_MASK)
> >> +		tce_free(tbl, entry, oldtce);
> >> +
> >> +	/* Map new TCE */
> >> +	if (direction != DMA_NONE) {
> >> +		offset = (tce & IOMMU_PAGE_MASK) - (tce & PAGE_MASK);
> >> +		ret = get_user_pages_fast(tce & PAGE_MASK, 1,
> >> +				direction != DMA_TO_DEVICE, &page);
> >> +		BUG_ON(ret > 1);
> >
> > Can this happen?
> >
> >> +		if (ret < 1) {
> >> +			printk(KERN_ERR "tce_vfio: get_user_pages_fast failed "
> >> +					"tce=%llx ioba=%lx ret=%d\n",
> >> +					tce, entry << IOMMU_PAGE_SHIFT, ret);
> >> +			if (!ret)
> >> +				ret = -EFAULT;
> >> +			goto unlock_exit;
> >> +		}
> >> +
> >> +		kva = (unsigned long) page_address(page);
> >> +		kva += offset;
> >> +		BUG_ON(!kva);
> >
> > Same here, can it happen?  If so, should it BUG or catch the below
> > EINVAL?
> >
> >> +		if (WARN_ON(kva & ~IOMMU_PAGE_MASK))
> >> +			return -EINVAL;
> >
> > Page leak?  Don't we want to do a put_page(), which means we probably
> > want a goto exit here.
> >
> >> +
> >> +		/* Preserve access bits */
> >> +		kva |= flags & VFIO_SPAPR_TCE_PUT_MASK;
> >> +
> >> +		/* tce_build receives a virtual address */
> >> +		entry += tbl->it_offset;	/* Offset into real TCE table */
> >> +		ret = ppc_md.tce_build(tbl, entry, 1, kva, direction, NULL);
> >> +
> >> +		/* tce_build() only returns non-zero for transient errors */
> >> +		if (unlikely(ret)) {
> >> +			printk(KERN_ERR "tce_vfio: Failed to add TCE\n");
> >> +			ret = -EIO;
> >> +			goto unlock_exit;
> >> +		}
> >> +	}
> >> +	/* Flush/invalidate TLB caches if necessary */
> >> +	if (ppc_md.tce_flush)
> >> +		ppc_md.tce_flush(tbl);
> >> +
> >> +	/* Make sure updates are seen by hardware */
> >> +	mb();
> >> +
> >> +unlock_exit:
> >
> > unlock seems wrong here, I had to go re-read the code looking for the
> > lock.
> >
> >> +	if (ret && page)
> >> +		put_page(page);
> >> +
> >> +	if (ret)
> >> +		printk(KERN_ERR "tce_vfio: tce_put failed on tce=%llx "
> >> +				"ioba=%lx kva=%lx\n", tce,
> >> +				entry << IOMMU_PAGE_SHIFT, kva);
> >> +	return ret;
> >> +}
> >> +
> >> +/*
> >> + * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
> >> + */
> >> +
> >> +/*
> >> + * The container descriptor supports only a single group per container.
> >> + * Required by the API as the container is not supplied with the IOMMU group
> >> + * at the moment of initialization.
> >> + */
> >> +struct tce_container {
> >> +	struct iommu_table *tbl;
> >> +};
> >> +
> >> +static void *tce_iommu_open(unsigned long arg)
> >> +{
> >> +	struct tce_container *container;
> >> +
> >> +	if (arg != VFIO_SPAPR_TCE_IOMMU) {
> >> +		printk(KERN_ERR "tce_vfio: Wrong IOMMU type\n");
> >> +		return ERR_PTR(-EINVAL);
> >> +	}
> >> +
> >> +	container = kzalloc(sizeof(*container), GFP_KERNEL);
> >> +	if (!container)
> >> +		return ERR_PTR(-ENOMEM);
> >> +
> >> +	return container;
> >> +}
> >> +
> >> +static void tce_iommu_release(void *iommu_data)
> >> +{
> >> +	struct tce_container *container = iommu_data;
> >> +	struct iommu_table *tbl = container->tbl;
> >> +	unsigned long i, tce;
> >> +
> >
> > This will segfault if releasing a container that never had a device
> > attached.
> >
> >> +	/* Unmap leftovers */
> >> +	spin_lock_irq(&tbl->it_lock);
> >> +	for (i = tbl->it_offset; i < tbl->it_offset + tbl->it_size; ++i) {
> >> +		tce = ppc_md.tce_get(tbl, i);
> >> +		if (tce & VFIO_SPAPR_TCE_PUT_MASK)
> >> +			tce_free(tbl, i, tce);
> >> +	}
> >> +	/* Flush/invalidate TLB caches if necessary */
> >> +	if (ppc_md.tce_flush)
> >> +		ppc_md.tce_flush(tbl);
> >> +
> >> +	/* Make sure updates are seen by hardware */
> >> +	mb();
> >> +
> >> +	spin_unlock_irq(&tbl->it_lock);
> >> +
> >> +	kfree(container);
> >> +}
> >> +
> >> +static long tce_iommu_ioctl(void *iommu_data,
> >> +				 unsigned int cmd, unsigned long arg)
> >> +{
> >> +	struct tce_container *container = iommu_data;
> >> +	unsigned long minsz;
> >> +	long ret;
> >> +
> >> +	switch (cmd) {
> >> +	case VFIO_CHECK_EXTENSION: {
> >> +		return (arg == VFIO_SPAPR_TCE_IOMMU) ? 1 : 0;
> >> +	}
> >> +	case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
> >> +		struct vfio_iommu_spapr_tce_info info;
> >> +		struct iommu_table *tbl = container->tbl;
> >> +
> >> +		minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
> >> +				dma64_window_size);
> >> +
> >> +		if (copy_from_user(&info, (void __user *)arg, minsz))
> >> +			return -EFAULT;
> >> +
> >> +		if (info.argsz < minsz)
> >> +			return -EINVAL;
> >> +
> >> +		if (!tbl)
> >> +			return -ENXIO;
> >
> > nit: why not check this earlier?
> >
> >> +
> >> +		info.dma32_window_start = tbl->it_offset << IOMMU_PAGE_SHIFT;
> >> +		info.dma32_window_size = tbl->it_size << IOMMU_PAGE_SHIFT;
> >> +		info.dma64_window_start = 0;
> >> +		info.dma64_window_size = 0;
> >> +		info.flags = 0;
> >> +
> >> +		return copy_to_user((void __user *)arg, &info, minsz);
> >> +	}
> >> +	case VFIO_IOMMU_SPAPR_TCE_PUT: {
> >> +		struct vfio_iommu_spapr_tce_put par;
> >> +		struct iommu_table *tbl = container->tbl;
> >> +
> >> +		minsz = offsetofend(struct vfio_iommu_spapr_tce_put, tce);
> >> +
> >> +		if (copy_from_user(&par, (void __user *)arg, minsz))
> >> +			return -EFAULT;
> >> +
> >> +		if (par.argsz < minsz)
> >> +			return -EINVAL;
> >> +
> >> +		if (!tbl) {
> >> +			return -ENXIO;
> >> +		}
> >
> > Same, plus drop the braces.
> >
> >> +
> >> +		spin_lock_irq(&tbl->it_lock);
> >> +		ret = tce_put(tbl, par.ioba >> IOMMU_PAGE_SHIFT,
> >> +				par.tce, par.flags);
> >> +		spin_unlock_irq(&tbl->it_lock);
> >> +
> >> +		return ret;
> >> +	}
> >
> > Is "PUT" really the name we want for this?
> 
> 
> Yes, it is a single H_PUT_TCE hypercall from POWER architecture spec.

Ok, if it makes sense on your arch, I won't complain (too much) about
it.

> >> +	default:
> >> +		printk(KERN_WARNING "tce_vfio: unexpected cmd %x\n", cmd);
> >> +	}
> >> +
> >> +	return -ENOTTY;
> >> +}
> >> +
> >> +static int tce_iommu_attach_group(void *iommu_data,
> >> +		struct iommu_group *iommu_group)
> >> +{
> >> +	struct tce_container *container = iommu_data;
> >> +	struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
> >> +
> >> +	printk(KERN_DEBUG "tce_vfio: Attaching group #%u to iommu %p\n",
> >> +			iommu_group_id(iommu_group), iommu_group);
> >
> > Let's use pr_debug() and friends throughout.
> >
> >> +	if (container->tbl) {
> >> +		printk(KERN_WARNING "tce_vfio: Only one group per IOMMU "
> >> +				"container is allowed, "
> >> +				"existing id=%d, attaching id=%d\n",
> >> +				iommu_group_id(container->tbl->it_group),
> >> +				iommu_group_id(iommu_group));
> >> +		return -EBUSY;
> >> +	}
> >> +
> >
> > _type1 has a lock to avoid races here, I think you might need one too.
> >
> >> +	container->tbl = tbl;
> >> +
> >> +	return 0;
> >> +}
> >> +
> >> +static void tce_iommu_detach_group(void *iommu_data,
> >> +		struct iommu_group *iommu_group)
> >> +{
> >> +	struct tce_container *container = iommu_data;
> >> +	struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
> >> +
> >> +	BUG_ON(!tbl);
> >
> > Needed?  If so, why is there no check on attach?
> 
> Added to attach() :)
> 
> 
> >
> >> +	if (tbl != container->tbl) {
> >> +		printk(KERN_WARNING "tce_vfio: detaching group #%u, expected "
> >> +				"group is #%u\n", iommu_group_id(iommu_group),
> >> +				iommu_group_id(tbl->it_group));
> >> +		return;
> >> +	}
> >> +	printk(KERN_DEBUG "tce_vfio: detaching group #%u from iommu %p\n",
> >> +			iommu_group_id(iommu_group), iommu_group);
> >
> > container->tbl = NULL?
> 
> 
> Then I won't be able to release pages in tce_iommu_release().
> Releasing pages in tce_iommu_detach_group() caused some other problems, 
> cannot recall now which ones.

What happens if you hot unplug a group from one VM and add it to
another?  ie. we've detached it from one container and add it to another
in a different instance.  Does it cause problems here?

> >> +}
> >> +
> >> +const struct vfio_iommu_driver_ops tce_iommu_driver_ops = {
> >> +	.name		= "iommu-vfio-powerpc",
> >> +	.owner		= THIS_MODULE,
> >> +	.open		= tce_iommu_open,
> >> +	.release	= tce_iommu_release,
> >> +	.ioctl		= tce_iommu_ioctl,
> >> +	.attach_group	= tce_iommu_attach_group,
> >> +	.detach_group	= tce_iommu_detach_group,
> >> +};
> >> +
> >> +/*
> >> + * Add/delete devices support (hotplug, module_init, module_exit)
> >> + */
> >> +static int add_device(struct device *dev)
> >> +{
> >> +	struct iommu_table *tbl;
> >> +	int ret = 0;
> >> +
> >> +	if (dev->iommu_group) {
> >> +		printk(KERN_WARNING "tce_vfio: device %s is already in iommu "
> >> +				"group %d, skipping\n", dev->kobj.name,
> >
> > Watch line wrapping on strings.
> 
> Pardon?

Just suggesting that you try to wrap lines so that strings are
searchable — for instance, so that I can search cscope for "is already
in iommu group".  It's generally accepted that printks can break 80 cols
for this.

> >> +				iommu_group_id(dev->iommu_group));
> >> +		return -EBUSY;
> >> +	}
> >> +
> >> +	tbl = get_iommu_table_base(dev);
> >> +	if (!tbl) {
> >> +		printk(KERN_DEBUG "tce_vfio: skipping device %s with no tbl\n",
> >> +				dev->kobj.name);
> >> +		return 0;
> >> +	}
> >> +
> >> +	printk(KERN_DEBUG "tce_vfio: adding %s to iommu group %d\n",
> >> +			dev->kobj.name, iommu_group_id(tbl->it_group));
> >> +
> >> +	ret = iommu_group_add_device(tbl->it_group, dev);
> >> +	if (ret < 0)
> >> +		printk(KERN_ERR "tce_vfio: %s has not been added, ret=%d\n",
> >> +				dev->kobj.name, ret);
> >> +
> >> +	return ret;
> >> +}
> >> +
> >> +static void del_device(struct device *dev)
> >> +{
> >> +	iommu_group_remove_device(dev);
> >> +}
> >> +
> >> +static int iommu_bus_notifier(struct notifier_block *nb,
> >> +			      unsigned long action, void *data)
> >> +{
> >> +	struct device *dev = data;
> >> +
> >> +	switch (action) {
> >> +	case BUS_NOTIFY_ADD_DEVICE:
> >> +		return add_device(dev);
> >> +	case BUS_NOTIFY_DEL_DEVICE:
> >> +		del_device(dev);
> >> +		return 0;
> >> +	default:
> >> +		return 0;
> >> +	}
> >> +}
> >> +
> >> +static struct notifier_block tce_iommu_bus_nb = {
> >> +	.notifier_call = iommu_bus_notifier,
> >> +};
> >> +
> >> +void group_release(void *iommu_data)
> >> +{
> >> +	struct iommu_table *tbl = iommu_data;
> >> +	tbl->it_group = NULL;
> >> +}
> >> +
> >> +static int __init tce_iommu_init(void)
> >> +{
> >> +	struct pci_dev *pdev = NULL;
> >> +	struct iommu_table *tbl;
> >> +	struct iommu_group *grp;
> >> +
> >> +	/* If the current platform does not support tce_get
> >> +	   we are unable to clean TCE table properly and
> >> +	   therefore it is better not to touch it at all */
> >> +	if (!ppc_md.tce_get) {
> >> +		printk(KERN_ERR "tce_vfio: ppc_md.tce_get isn't implemented\n");
> >> +		return -EOPNOTSUPP;
> >> +	}
> >> +
> >> +	bus_register_notifier(&pci_bus_type, &tce_iommu_bus_nb);
> >> +
> >> +	/* Allocate and initialize VFIO groups */
> >
> > s/VFIO groups/IOMMU groups/
> >
> >> +	for_each_pci_dev(pdev) {
> >> +		tbl = get_iommu_table_base(&pdev->dev);
> >> +		if (!tbl)
> >> +			continue;
> >> +
> >> +		/* Skip already initialized */
> >> +		if (tbl->it_group)
> >> +			continue;
> >> +
> >> +		grp = iommu_group_alloc();
> >> +		if (IS_ERR(grp)) {
> >> +			printk(KERN_INFO "tce_vfio: cannot create "
> >> +					"new IOMMU group, ret=%ld\n",
> >> +					PTR_ERR(grp));
> >> +			return -EFAULT;
> >> +		}
> >> +		tbl->it_group = grp;
> >> +		iommu_group_set_iommudata(grp, tbl, group_release);
> >> +	}
> >> +
> >> +	/* Add PCI devices to VFIO groups */
> >> +	for_each_pci_dev(pdev)
> >> +		add_device(&pdev->dev);
> >> +
> >> +	return vfio_register_iommu_driver(&tce_iommu_driver_ops);
> >> +}
> >> +
> >> +static void __exit tce_iommu_cleanup(void)
> >> +{
> >> +	struct pci_dev *pdev = NULL;
> >> +	struct iommu_table *tbl;
> >> +	struct iommu_group *grp = NULL;
> >> +
> >> +	bus_unregister_notifier(&pci_bus_type, &tce_iommu_bus_nb);
> >> +
> >> +	/* Delete PCI devices from VFIO groups */
> >> +	for_each_pci_dev(pdev)
> >> +		del_device(&pdev->dev);
> >> +
> >> +	/* Release VFIO groups */
> >> +	for_each_pci_dev(pdev) {
> >> +		tbl = get_iommu_table_base(&pdev->dev);
> >> +		if (!tbl)
> >> +			continue;
> >> +		grp = tbl->it_group;
> >> +
> >> +		/* Skip (already) uninitialized */
> >> +		if (!grp)
> >> +			continue;
> >> +
> >> +		/* Do actual release, group_release() is expected to work */
> >> +		iommu_group_put(grp);
> >> +		BUG_ON(tbl->it_group);
> >> +	}
> >> +
> >
> >
> > It troubles me a bit that you're using the vfio driver to initialize and
> > tear down IOMMU groups on your platform.
> 
> 
> I am not following you here. Could you please explain a bit?

IOMMU groups are theoretically not just for VFIO.  They expose DMA
dependencies between devices for anyone who cares to know about it.
VFIO happens to care very much about that, but is hopefully not the only
consumer.  So it's a little bit like writing a driver for a device on a
new bus and incorporating the bus topology handling code into the device
driver.  IOMMU groups should be created and managed independent of VFIO.

> > VFIO makes use of IOMMU groups
> > and is the only user so far, but they're hopefully useful beyond this.
> > In fact, VFIO used to manage assembling all groups from data provided by
> > the IOMMU but David wanted to see IOMMU groups be a more universally
> > available feature, so it's odd to see POWER implementing it this way.
> 
> 
> David, help! :)
> 
> 
> >> +	vfio_unregister_iommu_driver(&tce_iommu_driver_ops);
> >> +}
> >> +
> >> +module_init(tce_iommu_init);
> >> +module_exit(tce_iommu_cleanup);
> >> +
> >> +MODULE_VERSION(DRIVER_VERSION);
> >> +MODULE_LICENSE("GPL v2");
> >> +MODULE_AUTHOR(DRIVER_AUTHOR);
> >> +MODULE_DESCRIPTION(DRIVER_DESC);
> >> +
> >> diff --git a/include/linux/vfio.h b/include/linux/vfio.h
> >> index 0a4f180..2c0a927 100644
> >> --- a/include/linux/vfio.h
> >> +++ b/include/linux/vfio.h
> >> @@ -99,6 +99,7 @@ extern void vfio_unregister_iommu_driver(
> >>   /* Extensions */
> >>
> >>   #define VFIO_TYPE1_IOMMU		1
> >> +#define VFIO_SPAPR_TCE_IOMMU		2
> >>
> >>   /*
> >>    * The IOCTL interface is designed for extensibility by embedding the
> >> @@ -442,4 +443,32 @@ struct vfio_iommu_type1_dma_unmap {
> >>
> >>   #define VFIO_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14)
> >>
> >> +/* -------- API for SPAPR TCE (Server POWERPC) IOMMU -------- */
> >> +
> >> +struct vfio_iommu_spapr_tce_info {
> >> +	__u32 argsz;
> >> +	__u32 flags;
> >> +	__u32 dma32_window_start;
> >> +	__u32 dma32_window_size;
> >> +	__u64 dma64_window_start;
> >> +	__u64 dma64_window_size;
> >> +};
> >> +
> >> +#define VFIO_IOMMU_SPAPR_TCE_GET_INFO	_IO(VFIO_TYPE, VFIO_BASE + 12)
> >> +
> >> +struct vfio_iommu_spapr_tce_put {
> >> +	__u32 argsz;
> >> +	__u32 flags;
> >> +#define VFIO_SPAPR_TCE_READ		1
> >> +#define VFIO_SPAPR_TCE_WRITE		2
> >> +#define VFIO_SPAPR_TCE_BIDIRECTIONAL	(VFIO_SPAPR_TCE_READ|VFIO_SPAPR_TCE_WRITE)
> >> +#define VFIO_SPAPR_TCE_PUT_MASK		VFIO_SPAPR_TCE_BIDIRECTIONAL
> >> +	__u64 ioba;
> >> +	__u64 tce;
> >> +};
> >
> > Ok, so if READ & WRITE are both clear and ioba is set, that's an
> > "unmap"?  This is exactly why _type1 has a MAP and UNMAP, to make it
> > clear which fields are necessary for which call.  I think we should
> > probably do the same here.  Besides, _put makes me think there should be
> > a _get; do these have some unique meaning in POWER?
> 
> 
> It is a single H_PUT_TCE for putting a record into TCE table. The guest 
> calls H_PUT_TCE, QEMU replaces the address and simply forwards the call to 
> the host. Calling them map/unmap makes it less clear for powerpc people :)

In the unmap case we take an ioba and lookup a tce to clear, in the map
case we take an ioba and tce and insert them into the table.  It's valid
to document this and use a single ioctl, but I've opted on x86 to have
separate ioctls because the documentation falls out cleaner when there
aren't fields that are only used in certain conditions.  Do you really
want any userspace driver making use of this to know about powerpc
H_PUT_TCE or would it make more sense to have a MAP and UNMAP call?  I
think it would be better for the VFIO API if we had some consistency in
the mapping ioctls where possible.

> >
> >> +
> >> +#define VFIO_IOMMU_SPAPR_TCE_PUT	_IO(VFIO_TYPE, VFIO_BASE + 13)
> >> +
> >
> > Please document what all of the above means.  Thanks,
> 
> 
> Something like this?
> /*
>   * The VFIO_IOMMU_SPAPR_TCE_PUT is implemented as the H_PUT_TCE hypercall.
>   * ioba - I/O Bus Address for indexing into TCE table
>   * tce - logical address of storage
>   *
>   * The non-zero flags means adding new page into the table.
>   * The zero flags means releasing the existing page and clearing the
>   * TCE table entry.
>   */

Do you only want VFIO drivers to work on POWER if they're written by
POWER people?  Ideally there are a few simple concepts: a) devices have
an I/O virtual address space.  On x86 we call this the iova and it's
effectively a zero-based, 64bit (not really, but close enough) address
space.  You seem to have two smaller windows, one in 32bit space,
another in 64bit space (maybe we could name these more consistently).
b) Userspace has a buffer that they want to map and unmap to an iova,
potentially with some access flags.  That's all you need to know to use
the x86 _type1 VFIO IOMMU API.  Why do I need to know about H_PUT_TCE to
use this interface?  Let's assume there might be some VFIO drivers some
day that aren't written by POWER people.  Thanks,

Alex

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] vfio: enabled and supported on power (v7)
  2012-09-13 22:34       ` Alex Williamson
@ 2012-09-13 22:41         ` Scott Wood
  2012-09-13 22:55           ` Alex Williamson
  2012-09-14  0:51         ` Alexey Kardashevskiy
  1 sibling, 1 reply; 25+ messages in thread
From: Scott Wood @ 2012-09-13 22:41 UTC (permalink / raw)
  To: Alex Williamson
  Cc: Alexey Kardashevskiy, linuxppc-dev, Paul Mackerras, David Gibson

On Thu, Sep 13, 2012 at 04:34:59PM -0600, Alex Williamson wrote:
> Do you only want VFIO drivers to work on POWER if they're written by
> POWER people?  Ideally there are a few simple concepts: a) devices have
> an I/O virtual address space.  On x86 we call this the iova and it's
> effectively a zero-based, 64bit (not really, but close enough) address
> space.  You seem to have two smaller windows, one in 32bit space,
> another in 64bit space (maybe we could name these more consistently).
> b) Userspace has a buffer that they want to map and unmap to an iova,
> potentially with some access flags.  That's all you need to know to use
> the x86 _type1 VFIO IOMMU API.  Why do I need to know about H_PUT_TCE to
> use this interface?  Let's assume there might be some VFIO drivers some
> day that aren't written by POWER people.  Thanks,

I'm not familiar with the POWER IOMMU, but certainly with our chips it
would help allow generic drivers to work if there were a type of mapping
operation where the IOMMU driver decides the IOVA and returns it, instead
of the driver trying to choose the IOVA with no knowledge of the IOMMU's
constraints.

-Scott

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] vfio: enabled and supported on power (v7)
  2012-09-13 22:41         ` Scott Wood
@ 2012-09-13 22:55           ` Alex Williamson
  0 siblings, 0 replies; 25+ messages in thread
From: Alex Williamson @ 2012-09-13 22:55 UTC (permalink / raw)
  To: Scott Wood
  Cc: Alexey Kardashevskiy, linuxppc-dev, Paul Mackerras, David Gibson

On Thu, 2012-09-13 at 17:41 -0500, Scott Wood wrote:
> On Thu, Sep 13, 2012 at 04:34:59PM -0600, Alex Williamson wrote:
> > Do you only want VFIO drivers to work on POWER if they're written by
> > POWER people?  Ideally there are a few simple concepts: a) devices have
> > an I/O virtual address space.  On x86 we call this the iova and it's
> > effectively a zero-based, 64bit (not really, but close enough) address
> > space.  You seem to have two smaller windows, one in 32bit space,
> > another in 64bit space (maybe we could name these more consistently).
> > b) Userspace has a buffer that they want to map and unmap to an iova,
> > potentially with some access flags.  That's all you need to know to use
> > the x86 _type1 VFIO IOMMU API.  Why do I need to know about H_PUT_TCE to
> > use this interface?  Let's assume there might be some VFIO drivers some
> > day that aren't written by POWER people.  Thanks,
> 
> I'm not familiar with the POWER IOMMU, but certainly with our chips it
> would help allow generic drivers to work if there were a type of mapping
> operation where the IOMMU driver decides the IOVA and returns it, instead
> of the driver trying to choose the IOVA with no knowledge of the IOMMU's
> constraints.

That sounds reasonable to me.  If we had IOMMU API support for that in
the kernel on x86, it would only take an ALLOC_IOVA flag in the MAP
ioctl, returning the iova in the mapping structure, and the addition of
a capability to let userspace know it's there.  Thanks,

Alex

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] vfio: enabled and supported on power (v7)
  2012-09-13 22:34       ` Alex Williamson
  2012-09-13 22:41         ` Scott Wood
@ 2012-09-14  0:51         ` Alexey Kardashevskiy
  2012-09-14  4:35           ` Alex Williamson
  1 sibling, 1 reply; 25+ messages in thread
From: Alexey Kardashevskiy @ 2012-09-14  0:51 UTC (permalink / raw)
  To: Alex Williamson; +Cc: Paul Mackerras, linuxppc-dev, David Gibson

On 14/09/12 08:34, Alex Williamson wrote:
> On Tue, 2012-09-11 at 18:28 +1000, Alexey Kardashevskiy wrote:
>> On 11/09/12 02:02, Alex Williamson wrote:
>>> On Tue, 2012-09-04 at 17:33 +1000, Alexey Kardashevskiy wrote:
>>>> Cc: David Gibson <david@gibson.dropbear.id.au>
>>>> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
>>>> Cc: Paul Mackerras <paulus@samba.org>
>>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>>>> ---
>>>
>>> Please at least cc kvm@vger as well since we list that as the devel list
>>> for vfio.
>>>
>>>>    arch/powerpc/include/asm/iommu.h    |    3 +
>>>
>>> I'll need an ack from Ben or Paul for this change.
>>>
>>>>    drivers/iommu/Kconfig               |    8 +
>>>>    drivers/vfio/Kconfig                |    6 +
>>>>    drivers/vfio/Makefile               |    1 +
>>>>    drivers/vfio/vfio_iommu_spapr_tce.c |  440 +++++++++++++++++++++++++++++++++++
>>>>    include/linux/vfio.h                |   29 +++
>>>>    6 files changed, 487 insertions(+)
>>>>    create mode 100644 drivers/vfio/vfio_iommu_spapr_tce.c
>>>>
>>>> diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
>>>> index 957a83f..c64bce7 100644
>>>> --- a/arch/powerpc/include/asm/iommu.h
>>>> +++ b/arch/powerpc/include/asm/iommu.h
>>>> @@ -66,6 +66,9 @@ struct iommu_table {
>>>>    	unsigned long  it_halfpoint; /* Breaking point for small/large allocs */
>>>>    	spinlock_t     it_lock;      /* Protects it_map */
>>>>    	unsigned long *it_map;       /* A simple allocation bitmap for now */
>>>> +#ifdef CONFIG_IOMMU_API
>>>> +	struct iommu_group *it_group;
>>>> +#endif
>>>>    };
>>>
>>> This seems to only be valid when vfio_iommu_spapr_tce is loaded, which
>>> is a bit misleading.
>>>
>>>>
>>>>    struct scatterlist;
>>>> diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
>>>> index 3bd9fff..19cf2d9 100644
>>>> --- a/drivers/iommu/Kconfig
>>>> +++ b/drivers/iommu/Kconfig
>>>> @@ -162,4 +162,12 @@ config TEGRA_IOMMU_SMMU
>>>>    	  space through the SMMU (System Memory Management Unit)
>>>>    	  hardware included on Tegra SoCs.
>>>>
>>>> +config SPAPR_TCE_IOMMU
>>>> +	bool "sPAPR TCE IOMMU Support"
>>>> +	depends on PPC_PSERIES
>>>> +	select IOMMU_API
>>>> +	help
>>>> +	  Enables bits of IOMMU API required by VFIO. The iommu_ops is
>>>> +	  still not implemented.
>>>> +
>>>>    endif # IOMMU_SUPPORT
>>>> diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
>>>> index 7cd5dec..b464687 100644
>>>> --- a/drivers/vfio/Kconfig
>>>> +++ b/drivers/vfio/Kconfig
>>>> @@ -3,10 +3,16 @@ config VFIO_IOMMU_TYPE1
>>>>    	depends on VFIO
>>>>    	default n
>>>>
>>>> +config VFIO_IOMMU_SPAPR_TCE
>>>> +	tristate
>>>> +	depends on VFIO && SPAPR_TCE_IOMMU
>>>> +	default n
>>>> +
>>>>    menuconfig VFIO
>>>>    	tristate "VFIO Non-Privileged userspace driver framework"
>>>>    	depends on IOMMU_API
>>>>    	select VFIO_IOMMU_TYPE1 if X86
>>>> +	select VFIO_IOMMU_SPAPR_TCE if PPC_POWERNV
>>>>    	help
>>>>    	  VFIO provides a framework for secure userspace device drivers.
>>>>    	  See Documentation/vfio.txt for more details.
>>>> diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
>>>> index 2398d4a..72bfabc 100644
>>>> --- a/drivers/vfio/Makefile
>>>> +++ b/drivers/vfio/Makefile
>>>> @@ -1,3 +1,4 @@
>>>>    obj-$(CONFIG_VFIO) += vfio.o
>>>>    obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o
>>>> +obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o
>>>>    obj-$(CONFIG_VFIO_PCI) += pci/
>>>> diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
>>>> new file mode 100644
>>>> index 0000000..21f1909
>>>> --- /dev/null
>>>> +++ b/drivers/vfio/vfio_iommu_spapr_tce.c
>>>> @@ -0,0 +1,440 @@
>>>> +/*
>>>> + * VFIO: IOMMU DMA mapping support for TCE on POWER
>>>> + *
>>>> + * Copyright (C) 2012 IBM Corp.  All rights reserved.
>>>> + *     Author: Alexey Kardashevskiy <aik@ozlabs.ru>
>>>> + *
>>>> + * This program is free software; you can redistribute it and/or modify
>>>> + * it under the terms of the GNU General Public License version 2 as
>>>> + * published by the Free Software Foundation.
>>>> + *
>>>> + * Derived from original vfio_iommu_x86.c:
>>>
>>> Should this be _type1?  Only the mail archives are going to remember
>>> there was a _x86, so the renamed version is probably a better reference.
>>>
>>>> + * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
>>>> + *     Author: Alex Williamson <alex.williamson@redhat.com>
>>>> + */
>>>> +
>>>> +#include <linux/module.h>
>>>> +#include <linux/pci.h>
>>>> +#include <linux/slab.h>
>>>> +#include <linux/uaccess.h>
>>>> +#include <linux/err.h>
>>>> +#include <linux/vfio.h>
>>>> +#include <linux/spinlock.h>
>>>> +#include <asm/iommu.h>
>>>> +
>>>> +#define DRIVER_VERSION  "0.1"
>>>> +#define DRIVER_AUTHOR   "aik@ozlabs.ru"
>>>> +#define DRIVER_DESC     "VFIO IOMMU SPAPR TCE"
>>>> +
>>>> +
>>>> +/*
>>>> + * SPAPR TCE API
>>>> + */
>>>> +static void tce_free(struct iommu_table *tbl, unsigned long entry,
>>>> +		unsigned long tce)
>>>> +{
>>>> +	struct page *page = pfn_to_page(tce >> PAGE_SHIFT);
>>>> +
>>>> +	WARN_ON(!page);
>>>> +	if (page) {
>>>> +		if (tce & VFIO_SPAPR_TCE_WRITE)
>>>> +			SetPageDirty(page);
>>>> +		put_page(page);
>>>> +	}
>>>> +	ppc_md.tce_free(tbl, entry, 1);
>>>> +}
>>>> +
>>>> +static long tce_put(struct iommu_table *tbl,
>>>> +		unsigned long entry, uint64_t tce, uint32_t flags)
>>>> +{
>>>> +	int ret;
>>>> +	unsigned long oldtce, kva, offset;
>>>> +	struct page *page = NULL;
>>>> +	enum dma_data_direction direction = DMA_NONE;
>>>> +
>>>> +	switch (flags & VFIO_SPAPR_TCE_PUT_MASK) {
>>>> +	case VFIO_SPAPR_TCE_READ:
>>>> +		direction = DMA_TO_DEVICE;
>>>> +		break;
>>>> +	case VFIO_SPAPR_TCE_WRITE:
>>>> +		direction = DMA_FROM_DEVICE;
>>>> +		break;
>>>> +	case VFIO_SPAPR_TCE_BIDIRECTIONAL:
>>>> +		direction = DMA_BIDIRECTIONAL;
>>>> +		break;
>>>> +	}
>>>> +
>>>> +	oldtce = ppc_md.tce_get(tbl, entry);
>>>> +
>>>> +	/* Free page if still allocated */
>>>> +	if (oldtce & VFIO_SPAPR_TCE_PUT_MASK)
>>>> +		tce_free(tbl, entry, oldtce);
>>>> +
>>>> +	/* Map new TCE */
>>>> +	if (direction != DMA_NONE) {
>>>> +		offset = (tce & IOMMU_PAGE_MASK) - (tce & PAGE_MASK);
>>>> +		ret = get_user_pages_fast(tce & PAGE_MASK, 1,
>>>> +				direction != DMA_TO_DEVICE, &page);
>>>> +		BUG_ON(ret > 1);
>>>
>>> Can this happen?
>>>
>>>> +		if (ret < 1) {
>>>> +			printk(KERN_ERR "tce_vfio: get_user_pages_fast failed "
>>>> +					"tce=%llx ioba=%lx ret=%d\n",
>>>> +					tce, entry << IOMMU_PAGE_SHIFT, ret);
>>>> +			if (!ret)
>>>> +				ret = -EFAULT;
>>>> +			goto unlock_exit;
>>>> +		}
>>>> +
>>>> +		kva = (unsigned long) page_address(page);
>>>> +		kva += offset;
>>>> +		BUG_ON(!kva);
>>>
>>> Same here, can it happen?  If so, should it BUG or catch the below
>>> EINVAL?
>>>
>>>> +		if (WARN_ON(kva & ~IOMMU_PAGE_MASK))
>>>> +			return -EINVAL;
>>>
>>> Page leak?  Don't we want to do a put_page(), which means we probably
>>> want a goto exit here.
>>>
>>>> +
>>>> +		/* Preserve access bits */
>>>> +		kva |= flags & VFIO_SPAPR_TCE_PUT_MASK;
>>>> +
>>>> +		/* tce_build receives a virtual address */
>>>> +		entry += tbl->it_offset;	/* Offset into real TCE table */
>>>> +		ret = ppc_md.tce_build(tbl, entry, 1, kva, direction, NULL);
>>>> +
>>>> +		/* tce_build() only returns non-zero for transient errors */
>>>> +		if (unlikely(ret)) {
>>>> +			printk(KERN_ERR "tce_vfio: Failed to add TCE\n");
>>>> +			ret = -EIO;
>>>> +			goto unlock_exit;
>>>> +		}
>>>> +	}
>>>> +	/* Flush/invalidate TLB caches if necessary */
>>>> +	if (ppc_md.tce_flush)
>>>> +		ppc_md.tce_flush(tbl);
>>>> +
>>>> +	/* Make sure updates are seen by hardware */
>>>> +	mb();
>>>> +
>>>> +unlock_exit:
>>>
>>> unlock seems wrong here, I had to go re-read the code looking for the
>>> lock.
>>>
>>>> +	if (ret && page)
>>>> +		put_page(page);
>>>> +
>>>> +	if (ret)
>>>> +		printk(KERN_ERR "tce_vfio: tce_put failed on tce=%llx "
>>>> +				"ioba=%lx kva=%lx\n", tce,
>>>> +				entry << IOMMU_PAGE_SHIFT, kva);
>>>> +	return ret;
>>>> +}
>>>> +
>>>> +/*
>>>> + * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
>>>> + */
>>>> +
>>>> +/*
>>>> + * The container descriptor supports only a single group per container.
>>>> + * Required by the API as the container is not supplied with the IOMMU group
>>>> + * at the moment of initialization.
>>>> + */
>>>> +struct tce_container {
>>>> +	struct iommu_table *tbl;
>>>> +};
>>>> +
>>>> +static void *tce_iommu_open(unsigned long arg)
>>>> +{
>>>> +	struct tce_container *container;
>>>> +
>>>> +	if (arg != VFIO_SPAPR_TCE_IOMMU) {
>>>> +		printk(KERN_ERR "tce_vfio: Wrong IOMMU type\n");
>>>> +		return ERR_PTR(-EINVAL);
>>>> +	}
>>>> +
>>>> +	container = kzalloc(sizeof(*container), GFP_KERNEL);
>>>> +	if (!container)
>>>> +		return ERR_PTR(-ENOMEM);
>>>> +
>>>> +	return container;
>>>> +}
>>>> +
>>>> +static void tce_iommu_release(void *iommu_data)
>>>> +{
>>>> +	struct tce_container *container = iommu_data;
>>>> +	struct iommu_table *tbl = container->tbl;
>>>> +	unsigned long i, tce;
>>>> +
>>>
>>> This will segfault if releasing a container that never had an a device
>>> attached.
>>>
>>>> +	/* Unmap leftovers */
>>>> +	spin_lock_irq(&tbl->it_lock);
>>>> +	for (i = tbl->it_offset; i < tbl->it_offset + tbl->it_size; ++i) {
>>>> +		tce = ppc_md.tce_get(tbl, i);
>>>> +		if (tce & VFIO_SPAPR_TCE_PUT_MASK)
>>>> +			tce_free(tbl, i, tce);
>>>> +	}
>>>> +	/* Flush/invalidate TLB caches if necessary */
>>>> +	if (ppc_md.tce_flush)
>>>> +		ppc_md.tce_flush(tbl);
>>>> +
>>>> +	/* Make sure updates are seen by hardware */
>>>> +	mb();
>>>> +
>>>> +	spin_unlock_irq(&tbl->it_lock);
>>>> +
>>>> +	kfree(container);
>>>> +}
>>>> +
>>>> +static long tce_iommu_ioctl(void *iommu_data,
>>>> +				 unsigned int cmd, unsigned long arg)
>>>> +{
>>>> +	struct tce_container *container = iommu_data;
>>>> +	unsigned long minsz;
>>>> +	long ret;
>>>> +
>>>> +	switch (cmd) {
>>>> +	case VFIO_CHECK_EXTENSION: {
>>>> +		return (arg == VFIO_SPAPR_TCE_IOMMU) ? 1 : 0;
>>>> +	}
>>>> +	case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
>>>> +		struct vfio_iommu_spapr_tce_info info;
>>>> +		struct iommu_table *tbl = container->tbl;
>>>> +
>>>> +		minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
>>>> +				dma64_window_size);
>>>> +
>>>> +		if (copy_from_user(&info, (void __user *)arg, minsz))
>>>> +			return -EFAULT;
>>>> +
>>>> +		if (info.argsz < minsz)
>>>> +			return -EINVAL;
>>>> +
>>>> +		if (!tbl)
>>>> +			return -ENXIO;
>>>
>>> nit: why not check this earlier?
>>>
>>>> +
>>>> +		info.dma32_window_start = tbl->it_offset << IOMMU_PAGE_SHIFT;
>>>> +		info.dma32_window_size = tbl->it_size << IOMMU_PAGE_SHIFT;
>>>> +		info.dma64_window_start = 0;
>>>> +		info.dma64_window_size = 0;
>>>> +		info.flags = 0;
>>>> +
>>>> +		return copy_to_user((void __user *)arg, &info, minsz);
>>>> +	}
>>>> +	case VFIO_IOMMU_SPAPR_TCE_PUT: {
>>>> +		struct vfio_iommu_spapr_tce_put par;
>>>> +		struct iommu_table *tbl = container->tbl;
>>>> +
>>>> +		minsz = offsetofend(struct vfio_iommu_spapr_tce_put, tce);
>>>> +
>>>> +		if (copy_from_user(&par, (void __user *)arg, minsz))
>>>> +			return -EFAULT;
>>>> +
>>>> +		if (par.argsz < minsz)
>>>> +			return -EINVAL;
>>>> +
>>>> +		if (!tbl) {
>>>> +			return -ENXIO;
>>>> +		}
>>>
>>> Same, plus drop the braces.
>>>
>>>> +
>>>> +		spin_lock_irq(&tbl->it_lock);
>>>> +		ret = tce_put(tbl, par.ioba >> IOMMU_PAGE_SHIFT,
>>>> +				par.tce, par.flags);
>>>> +		spin_unlock_irq(&tbl->it_lock);
>>>> +
>>>> +		return ret;
>>>> +	}
>>>
>>> Is "PUT" really the name we want for this?
>>
>>
>> Yes, it is a single H_PUT_TCE hypercall from POWER architecture spec.
>
> Ok, if it makes sense on your arch, I won't complain (too much) about
> it.
>
>>>> +	default:
>>>> +		printk(KERN_WARNING "tce_vfio: unexpected cmd %x\n", cmd);
>>>> +	}
>>>> +
>>>> +	return -ENOTTY;
>>>> +}
>>>> +
>>>> +static int tce_iommu_attach_group(void *iommu_data,
>>>> +		struct iommu_group *iommu_group)
>>>> +{
>>>> +	struct tce_container *container = iommu_data;
>>>> +	struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
>>>> +
>>>> +	printk(KERN_DEBUG "tce_vfio: Attaching group #%u to iommu %p\n",
>>>> +			iommu_group_id(iommu_group), iommu_group);
>>>
>>> Let's use pr_debug() and friends throughout.
>>>
>>>> +	if (container->tbl) {
>>>> +		printk(KERN_WARNING "tce_vfio: Only one group per IOMMU "
>>>> +				"container is allowed, "
>>>> +				"existing id=%d, attaching id=%d\n",
>>>> +				iommu_group_id(container->tbl->it_group),
>>>> +				iommu_group_id(iommu_group));
>>>> +		return -EBUSY;
>>>> +	}
>>>> +
>>>
>>> _type1 has a lock to avoid races here, I think you might need one too.
>>>
>>>> +	container->tbl = tbl;
>>>> +
>>>> +	return 0;
>>>> +}
>>>> +
>>>> +static void tce_iommu_detach_group(void *iommu_data,
>>>> +		struct iommu_group *iommu_group)
>>>> +{
>>>> +	struct tce_container *container = iommu_data;
>>>> +	struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
>>>> +
>>>> +	BUG_ON(!tbl);
>>>
>>> Needed?  If so, why is there no check on attach?
>>
>> Added to attach() :)
>>
>>
>>>
>>>> +	if (tbl != container->tbl) {
>>>> +		printk(KERN_WARNING "tce_vfio: detaching group #%u, expected "
>>>> +				"group is #%u\n", iommu_group_id(iommu_group),
>>>> +				iommu_group_id(tbl->it_group));
>>>> +		return;
>>>> +	}
>>>> +	printk(KERN_DEBUG "tce_vfio: detaching group #%u from iommu %p\n",
>>>> +			iommu_group_id(iommu_group), iommu_group);
>>>
>>> container->tbl = NULL?
>>
>>
>> Then I won't be able to release pages in tce_iommu_release().
>> Releasing pages in tce_iommu_detach_group() caused some other problems,
>> cannot recall now which ones.
>
> What happens if you hot unplug a group from one VM and add it to
> another?  ie. we've detached it from one container and add it to another
> in a different instance.  Does it cause problems here?


Then the container will be released as just one group per container is 
supported at the moment, no? Cannot check though as we do not support 
hotplug yet.



>>>> +}
>>>> +
>>>> +const struct vfio_iommu_driver_ops tce_iommu_driver_ops = {
>>>> +	.name		= "iommu-vfio-powerpc",
>>>> +	.owner		= THIS_MODULE,
>>>> +	.open		= tce_iommu_open,
>>>> +	.release	= tce_iommu_release,
>>>> +	.ioctl		= tce_iommu_ioctl,
>>>> +	.attach_group	= tce_iommu_attach_group,
>>>> +	.detach_group	= tce_iommu_detach_group,
>>>> +};
>>>> +
>>>> +/*
>>>> + * Add/delete devices support (hotplug, module_init, module_exit)
>>>> + */
>>>> +static int add_device(struct device *dev)
>>>> +{
>>>> +	struct iommu_table *tbl;
>>>> +	int ret = 0;
>>>> +
>>>> +	if (dev->iommu_group) {
>>>> +		printk(KERN_WARNING "tce_vfio: device %s is already in iommu "
>>>> +				"group %d, skipping\n", dev->kobj.name,
>>>
>>> Watch line wrapping on strings.
>>
>> Pardon?
>
> Just suggesting that you try to wrap lines so that strings are
> searchable.  For instance, can I search cscope for "is already in iommu
> group".  It's generally accepted that printks can break 80 cols for
> this.

Aaaa. Did not know that this is accepted but was always annoyed to wrap 
this way, thanks :)


>>>> +				iommu_group_id(dev->iommu_group));
>>>> +		return -EBUSY;
>>>> +	}
>>>> +
>>>> +	tbl = get_iommu_table_base(dev);
>>>> +	if (!tbl) {
>>>> +		printk(KERN_DEBUG "tce_vfio: skipping device %s with no tbl\n",
>>>> +				dev->kobj.name);
>>>> +		return 0;
>>>> +	}
>>>> +
>>>> +	printk(KERN_DEBUG "tce_vfio: adding %s to iommu group %d\n",
>>>> +			dev->kobj.name, iommu_group_id(tbl->it_group));
>>>> +
>>>> +	ret = iommu_group_add_device(tbl->it_group, dev);
>>>> +	if (ret < 0)
>>>> +		printk(KERN_ERR "tce_vfio: %s has not been added, ret=%d\n",
>>>> +				dev->kobj.name, ret);
>>>> +
>>>> +	return ret;
>>>> +}
>>>> +
>>>> +static void del_device(struct device *dev)
>>>> +{
>>>> +	iommu_group_remove_device(dev);
>>>> +}
>>>> +
>>>> +static int iommu_bus_notifier(struct notifier_block *nb,
>>>> +			      unsigned long action, void *data)
>>>> +{
>>>> +	struct device *dev = data;
>>>> +
>>>> +	switch (action) {
>>>> +	case BUS_NOTIFY_ADD_DEVICE:
>>>> +		return add_device(dev);
>>>> +	case BUS_NOTIFY_DEL_DEVICE:
>>>> +		del_device(dev);
>>>> +		return 0;
>>>> +	default:
>>>> +		return 0;
>>>> +	}
>>>> +}
>>>> +
>>>> +static struct notifier_block tce_iommu_bus_nb = {
>>>> +	.notifier_call = iommu_bus_notifier,
>>>> +};
>>>> +
>>>> +void group_release(void *iommu_data)
>>>> +{
>>>> +	struct iommu_table *tbl = iommu_data;
>>>> +	tbl->it_group = NULL;
>>>> +}
>>>> +
>>>> +static int __init tce_iommu_init(void)
>>>> +{
>>>> +	struct pci_dev *pdev = NULL;
>>>> +	struct iommu_table *tbl;
>>>> +	struct iommu_group *grp;
>>>> +
>>>> +	/* If the current platform does not support tce_get
>>>> +	   we are unable to clean TCE table properly and
>>>> +	   therefore it is better not to touch it at all */
>>>> +	if (!ppc_md.tce_get) {
>>>> +		printk(KERN_ERR "tce_vfio: ppc_md.tce_get isn't implemented\n");
>>>> +		return -EOPNOTSUPP;
>>>> +	}
>>>> +
>>>> +	bus_register_notifier(&pci_bus_type, &tce_iommu_bus_nb);
>>>> +
>>>> +	/* Allocate and initialize VFIO groups */
>>>
>>> s/VFIO groups/IOMMU groups/
>>>
>>>> +	for_each_pci_dev(pdev) {
>>>> +		tbl = get_iommu_table_base(&pdev->dev);
>>>> +		if (!tbl)
>>>> +			continue;
>>>> +
>>>> +		/* Skip already initialized */
>>>> +		if (tbl->it_group)
>>>> +			continue;
>>>> +
>>>> +		grp = iommu_group_alloc();
>>>> +		if (IS_ERR(grp)) {
>>>> +			printk(KERN_INFO "tce_vfio: cannot create "
>>>> +					"new IOMMU group, ret=%ld\n",
>>>> +					PTR_ERR(grp));
>>>> +			return -EFAULT;
>>>> +		}
>>>> +		tbl->it_group = grp;
>>>> +		iommu_group_set_iommudata(grp, tbl, group_release);
>>>> +	}
>>>> +
>>>> +	/* Add PCI devices to VFIO groups */
>>>> +	for_each_pci_dev(pdev)
>>>> +		add_device(&pdev->dev);
>>>> +
>>>> +	return vfio_register_iommu_driver(&tce_iommu_driver_ops);
>>>> +}
>>>> +
>>>> +static void __exit tce_iommu_cleanup(void)
>>>> +{
>>>> +	struct pci_dev *pdev = NULL;
>>>> +	struct iommu_table *tbl;
>>>> +	struct iommu_group *grp = NULL;
>>>> +
>>>> +	bus_unregister_notifier(&pci_bus_type, &tce_iommu_bus_nb);
>>>> +
>>>> +	/* Delete PCI devices from VFIO groups */
>>>> +	for_each_pci_dev(pdev)
>>>> +		del_device(&pdev->dev);
>>>> +
>>>> +	/* Release VFIO groups */
>>>> +	for_each_pci_dev(pdev) {
>>>> +		tbl = get_iommu_table_base(&pdev->dev);
>>>> +		if (!tbl)
>>>> +			continue;
>>>> +		grp = tbl->it_group;
>>>> +
>>>> +		/* Skip (already) uninitialized */
>>>> +		if (!grp)
>>>> +			continue;
>>>> +
>>>> +		/* Do actual release, group_release() is expected to work */
>>>> +		iommu_group_put(grp);
>>>> +		BUG_ON(tbl->it_group);
>>>> +	}
>>>> +
>>>
>>>
>>> It troubles me a bit that you're using the vfio driver to initialize and
>>> tear down IOMMU groups on your platform.
>>
>>
>> I am not following you here. Could you please explain a bit?
>
> IOMMU groups are theoretically not just for VFIO.  They expose DMA
> dependencies between devices for anyone who cares to know about it.
> VFIO happens to care very much about that, but is hopefully not the only
> consumer.  So it's a little bit like writing a driver for a device on a
> new bus and incorporating the bus topology handling code into the device
> driver.  IOMMU groups should be created and managed independent of VFIO.

Do you mean that we create groups only for PCI devices? Well, moving group 
creation to where the actual powerpc groups are allocated (pci scan) is 
problematic right now as iommu_init() is called too late.


>>> VFIO makes use of IOMMU groups
>>> and is the only user so far, but they're hopefully useful beyond this.
>>> In fact, VFIO used to manage assembling all groups from data provided by
>>> the IOMMU but David wanted to see IOMMU groups be a more universally
>>> available feature, so it's odd to see POWER implementing it this way.
>>
>>
>> David, help! :)
>>
>>
>>>> +	vfio_unregister_iommu_driver(&tce_iommu_driver_ops);
>>>> +}
>>>> +
>>>> +module_init(tce_iommu_init);
>>>> +module_exit(tce_iommu_cleanup);
>>>> +
>>>> +MODULE_VERSION(DRIVER_VERSION);
>>>> +MODULE_LICENSE("GPL v2");
>>>> +MODULE_AUTHOR(DRIVER_AUTHOR);
>>>> +MODULE_DESCRIPTION(DRIVER_DESC);
>>>> +
>>>> diff --git a/include/linux/vfio.h b/include/linux/vfio.h
>>>> index 0a4f180..2c0a927 100644
>>>> --- a/include/linux/vfio.h
>>>> +++ b/include/linux/vfio.h
>>>> @@ -99,6 +99,7 @@ extern void vfio_unregister_iommu_driver(
>>>>    /* Extensions */
>>>>
>>>>    #define VFIO_TYPE1_IOMMU		1
>>>> +#define VFIO_SPAPR_TCE_IOMMU		2
>>>>
>>>>    /*
>>>>     * The IOCTL interface is designed for extensibility by embedding the
>>>> @@ -442,4 +443,32 @@ struct vfio_iommu_type1_dma_unmap {
>>>>
>>>>    #define VFIO_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14)
>>>>
>>>> +/* -------- API for SPAPR TCE (Server POWERPC) IOMMU -------- */
>>>> +
>>>> +struct vfio_iommu_spapr_tce_info {
>>>> +	__u32 argsz;
>>>> +	__u32 flags;
>>>> +	__u32 dma32_window_start;
>>>> +	__u32 dma32_window_size;
>>>> +	__u64 dma64_window_start;
>>>> +	__u64 dma64_window_size;
>>>> +};
>>>> +
>>>> +#define VFIO_IOMMU_SPAPR_TCE_GET_INFO	_IO(VFIO_TYPE, VFIO_BASE + 12)
>>>> +
>>>> +struct vfio_iommu_spapr_tce_put {
>>>> +	__u32 argsz;
>>>> +	__u32 flags;
>>>> +#define VFIO_SPAPR_TCE_READ		1
>>>> +#define VFIO_SPAPR_TCE_WRITE		2
>>>> +#define VFIO_SPAPR_TCE_BIDIRECTIONAL	(VFIO_SPAPR_TCE_READ|VFIO_SPAPR_TCE_WRITE)
>>>> +#define VFIO_SPAPR_TCE_PUT_MASK		VFIO_SPAPR_TCE_BIDIRECTIONAL
>>>> +	__u64 ioba;
>>>> +	__u64 tce;
>>>> +};
>>>
>>> Ok, so if READ & WRITE are both clear and ioba is set, that's an
>>> "unmap"?  This is exactly why _type1 has a MAP and UNMAP, to make it
>>> clear which fields are necessary for which call.  I think we should
>>> probably do the same here.  Besides, _put makes me think there should be
>>> a _get; do these have some unique meaning in POWER?
>>
>>
>> It is a single H_PUT_TCE for putting a record into TCE table. The guest
>> calls H_PUT_TCE, QEMU replaces the address and simply forwards the call to
>> the host. Calling them map/unmap makes it less clear for powerpc people :)
>
> In the unmap case we take an ioba and lookup a tce to clear, in the map
> case we take an ioba and tce and insert them into the table.  It's valid
> to document this and use a single ioctl, but I've opted on x86 to have
> separate ioctls because the documentation falls out cleaner when there
> aren't fields that are only used in certain conditions.  Do you really
> want any userspace driver making use of this to know about powerpc
> H_PUT_TCE or would it make more sense to have a MAP and UNMAP call?  I
> think it would be better for the VFIO API if we had some consistency in
> the mapping ioctls where possible.


I would think that passing things through "as is" as much as possible is the 
best thing here as the aim is KVM. Maybe one day we will implement H_PUT_TCE 
in the kernel, so splitting H_PUT_TCE into map+unmap and then combining them 
back in the kernel (because we will have an H_PUT_TCE handler) is a bit ugly.


>>>> +#define VFIO_IOMMU_SPAPR_TCE_PUT	_IO(VFIO_TYPE, VFIO_BASE + 13)
>>>> +
>>>
>>> Please document what all of the above means.  Thanks,
>>
>>
>> Something like this?
>> /*
>>    * The VFIO_IOMMU_SPAPR_TCE_PUT is implemented as the H_PUT_TCE hypercall.
>>    * ioba - I/O Bus Address for indexing into TCE table
>>    * tce - logical address of storage
>>    *
>>    * The non-zero flags means adding new page into the table.
>>    * The zero flags means releasing the existing page and clearing the
>>    * TCE table entry.
>>    */
>
> Do you only want VFIO drivers to work on POWER if they're written by
> POWER people?  Ideally there are a few simple concepts: a) devices have
> an I/O virtual address space.  On x86 we call this the iova and it's
> effectively a zero-based, 64bit (not really, but close enough) address
> space.  You seem to have two smaller windows, one in 32bit space,
> another in 64bit space (maybe we could name these more consistently).
> b) Userspace has a buffer that they want to map and unmap to an iova,
> potentially with some access flags.  That's all you need to know to use
> the x86 _type1 VFIO IOMMU API.


Don't you have to map the entire RAM to the PCI bus? You use a listener whose 
purpose is not very clear. This is extra knowledge beyond the qemu-to-host 
interface which the user space program should know.


> Why do I need to know about H_PUT_TCE to
> use this interface?  Let's assume there might be some VFIO drivers some
> day that aren't written by POWER people.  Thanks,

Example of such a driver? My imagination is weak :)


-- 
Alexey

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] vfio: enabled and supported on power (v7)
  2012-09-14  0:51         ` Alexey Kardashevskiy
@ 2012-09-14  4:35           ` Alex Williamson
  2012-10-11  8:19             ` Alexey Kardashevskiy
  0 siblings, 1 reply; 25+ messages in thread
From: Alex Williamson @ 2012-09-14  4:35 UTC (permalink / raw)
  To: Alexey Kardashevskiy; +Cc: Paul Mackerras, linuxppc-dev, David Gibson

On Fri, 2012-09-14 at 10:51 +1000, Alexey Kardashevskiy wrote:
> On 14/09/12 08:34, Alex Williamson wrote:
> > On Tue, 2012-09-11 at 18:28 +1000, Alexey Kardashevskiy wrote:
> >> On 11/09/12 02:02, Alex Williamson wrote:
> >>> On Tue, 2012-09-04 at 17:33 +1000, Alexey Kardashevskiy wrote:
> >>>> Cc: David Gibson <david@gibson.dropbear.id.au>
> >>>> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
> >>>> Cc: Paul Mackerras <paulus@samba.org>
> >>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> >>>> ---
> >>>
> >>> Please at least cc kvm@vger as well since we list that as the devel list
> >>> for vfio.
> >>>
> >>>>    arch/powerpc/include/asm/iommu.h    |    3 +
> >>>
> >>> I'll need an ack from Ben or Paul for this change.
> >>>
> >>>>    drivers/iommu/Kconfig               |    8 +
> >>>>    drivers/vfio/Kconfig                |    6 +
> >>>>    drivers/vfio/Makefile               |    1 +
> >>>>    drivers/vfio/vfio_iommu_spapr_tce.c |  440 +++++++++++++++++++++++++++++++++++
> >>>>    include/linux/vfio.h                |   29 +++
> >>>>    6 files changed, 487 insertions(+)
> >>>>    create mode 100644 drivers/vfio/vfio_iommu_spapr_tce.c
> >>>>
> >>>> diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
> >>>> index 957a83f..c64bce7 100644
> >>>> --- a/arch/powerpc/include/asm/iommu.h
> >>>> +++ b/arch/powerpc/include/asm/iommu.h
> >>>> @@ -66,6 +66,9 @@ struct iommu_table {
> >>>>    	unsigned long  it_halfpoint; /* Breaking point for small/large allocs */
> >>>>    	spinlock_t     it_lock;      /* Protects it_map */
> >>>>    	unsigned long *it_map;       /* A simple allocation bitmap for now */
> >>>> +#ifdef CONFIG_IOMMU_API
> >>>> +	struct iommu_group *it_group;
> >>>> +#endif
> >>>>    };
> >>>
> >>> This seems to only be valid when vfio_iommu_spapr_tce is loaded, which
> >>> is a bit misleading.
> >>>
> >>>>
> >>>>    struct scatterlist;
> >>>> diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
> >>>> index 3bd9fff..19cf2d9 100644
> >>>> --- a/drivers/iommu/Kconfig
> >>>> +++ b/drivers/iommu/Kconfig
> >>>> @@ -162,4 +162,12 @@ config TEGRA_IOMMU_SMMU
> >>>>    	  space through the SMMU (System Memory Management Unit)
> >>>>    	  hardware included on Tegra SoCs.
> >>>>
> >>>> +config SPAPR_TCE_IOMMU
> >>>> +	bool "sPAPR TCE IOMMU Support"
> >>>> +	depends on PPC_PSERIES
> >>>> +	select IOMMU_API
> >>>> +	help
> >>>> +	  Enables bits of IOMMU API required by VFIO. The iommu_ops is
> >>>> +	  still not implemented.
> >>>> +
> >>>>    endif # IOMMU_SUPPORT
> >>>> diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
> >>>> index 7cd5dec..b464687 100644
> >>>> --- a/drivers/vfio/Kconfig
> >>>> +++ b/drivers/vfio/Kconfig
> >>>> @@ -3,10 +3,16 @@ config VFIO_IOMMU_TYPE1
> >>>>    	depends on VFIO
> >>>>    	default n
> >>>>
> >>>> +config VFIO_IOMMU_SPAPR_TCE
> >>>> +	tristate
> >>>> +	depends on VFIO && SPAPR_TCE_IOMMU
> >>>> +	default n
> >>>> +
> >>>>    menuconfig VFIO
> >>>>    	tristate "VFIO Non-Privileged userspace driver framework"
> >>>>    	depends on IOMMU_API
> >>>>    	select VFIO_IOMMU_TYPE1 if X86
> >>>> +	select VFIO_IOMMU_SPAPR_TCE if PPC_POWERNV
> >>>>    	help
> >>>>    	  VFIO provides a framework for secure userspace device drivers.
> >>>>    	  See Documentation/vfio.txt for more details.
> >>>> diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
> >>>> index 2398d4a..72bfabc 100644
> >>>> --- a/drivers/vfio/Makefile
> >>>> +++ b/drivers/vfio/Makefile
> >>>> @@ -1,3 +1,4 @@
> >>>>    obj-$(CONFIG_VFIO) += vfio.o
> >>>>    obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o
> >>>> +obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o
> >>>>    obj-$(CONFIG_VFIO_PCI) += pci/
> >>>> diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
> >>>> new file mode 100644
> >>>> index 0000000..21f1909
> >>>> --- /dev/null
> >>>> +++ b/drivers/vfio/vfio_iommu_spapr_tce.c
> >>>> @@ -0,0 +1,440 @@
> >>>> +/*
> >>>> + * VFIO: IOMMU DMA mapping support for TCE on POWER
> >>>> + *
> >>>> + * Copyright (C) 2012 IBM Corp.  All rights reserved.
> >>>> + *     Author: Alexey Kardashevskiy <aik@ozlabs.ru>
> >>>> + *
> >>>> + * This program is free software; you can redistribute it and/or modify
> >>>> + * it under the terms of the GNU General Public License version 2 as
> >>>> + * published by the Free Software Foundation.
> >>>> + *
> >>>> + * Derived from original vfio_iommu_x86.c:
> >>>
> >>> Should this be _type1?  Only the mail archives are going to remember
> >>> there was a _x86, so the renamed version is probably a better reference.
> >>>
> >>>> + * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
> >>>> + *     Author: Alex Williamson <alex.williamson@redhat.com>
> >>>> + */
> >>>> +
> >>>> +#include <linux/module.h>
> >>>> +#include <linux/pci.h>
> >>>> +#include <linux/slab.h>
> >>>> +#include <linux/uaccess.h>
> >>>> +#include <linux/err.h>
> >>>> +#include <linux/vfio.h>
> >>>> +#include <linux/spinlock.h>
> >>>> +#include <asm/iommu.h>
> >>>> +
> >>>> +#define DRIVER_VERSION  "0.1"
> >>>> +#define DRIVER_AUTHOR   "aik@ozlabs.ru"
> >>>> +#define DRIVER_DESC     "VFIO IOMMU SPAPR TCE"
> >>>> +
> >>>> +
> >>>> +/*
> >>>> + * SPAPR TCE API
> >>>> + */
> >>>> +static void tce_free(struct iommu_table *tbl, unsigned long entry,
> >>>> +		unsigned long tce)
> >>>> +{
> >>>> +	struct page *page = pfn_to_page(tce >> PAGE_SHIFT);
> >>>> +
> >>>> +	WARN_ON(!page);
> >>>> +	if (page) {
> >>>> +		if (tce & VFIO_SPAPR_TCE_WRITE)
> >>>> +			SetPageDirty(page);
> >>>> +		put_page(page);
> >>>> +	}
> >>>> +	ppc_md.tce_free(tbl, entry, 1);
> >>>> +}
> >>>> +
> >>>> +static long tce_put(struct iommu_table *tbl,
> >>>> +		unsigned long entry, uint64_t tce, uint32_t flags)
> >>>> +{
> >>>> +	int ret;
> >>>> +	unsigned long oldtce, kva, offset;
> >>>> +	struct page *page = NULL;
> >>>> +	enum dma_data_direction direction = DMA_NONE;
> >>>> +
> >>>> +	switch (flags & VFIO_SPAPR_TCE_PUT_MASK) {
> >>>> +	case VFIO_SPAPR_TCE_READ:
> >>>> +		direction = DMA_TO_DEVICE;
> >>>> +		break;
> >>>> +	case VFIO_SPAPR_TCE_WRITE:
> >>>> +		direction = DMA_FROM_DEVICE;
> >>>> +		break;
> >>>> +	case VFIO_SPAPR_TCE_BIDIRECTIONAL:
> >>>> +		direction = DMA_BIDIRECTIONAL;
> >>>> +		break;
> >>>> +	}
> >>>> +
> >>>> +	oldtce = ppc_md.tce_get(tbl, entry);
> >>>> +
> >>>> +	/* Free page if still allocated */
> >>>> +	if (oldtce & VFIO_SPAPR_TCE_PUT_MASK)
> >>>> +		tce_free(tbl, entry, oldtce);
> >>>> +
> >>>> +	/* Map new TCE */
> >>>> +	if (direction != DMA_NONE) {
> >>>> +		offset = (tce & IOMMU_PAGE_MASK) - (tce & PAGE_MASK);
> >>>> +		ret = get_user_pages_fast(tce & PAGE_MASK, 1,
> >>>> +				direction != DMA_TO_DEVICE, &page);
> >>>> +		BUG_ON(ret > 1);
> >>>
> >>> Can this happen?
> >>>
> >>>> +		if (ret < 1) {
> >>>> +			printk(KERN_ERR "tce_vfio: get_user_pages_fast failed "
> >>>> +					"tce=%llx ioba=%lx ret=%d\n",
> >>>> +					tce, entry << IOMMU_PAGE_SHIFT, ret);
> >>>> +			if (!ret)
> >>>> +				ret = -EFAULT;
> >>>> +			goto unlock_exit;
> >>>> +		}
> >>>> +
> >>>> +		kva = (unsigned long) page_address(page);
> >>>> +		kva += offset;
> >>>> +		BUG_ON(!kva);
> >>>
> >>> Same here, can it happen?  If so, should it BUG or catch the below
> >>> EINVAL?
> >>>
> >>>> +		if (WARN_ON(kva & ~IOMMU_PAGE_MASK))
> >>>> +			return -EINVAL;
> >>>
> >>> Page leak?  Don't we want to do a put_page(), which means we probably
> >>> want a goto exit here.
> >>>
> >>>> +
> >>>> +		/* Preserve access bits */
> >>>> +		kva |= flags & VFIO_SPAPR_TCE_PUT_MASK;
> >>>> +
> >>>> +		/* tce_build receives a virtual address */
> >>>> +		entry += tbl->it_offset;	/* Offset into real TCE table */
> >>>> +		ret = ppc_md.tce_build(tbl, entry, 1, kva, direction, NULL);
> >>>> +
> >>>> +		/* tce_build() only returns non-zero for transient errors */
> >>>> +		if (unlikely(ret)) {
> >>>> +			printk(KERN_ERR "tce_vfio: Failed to add TCE\n");
> >>>> +			ret = -EIO;
> >>>> +			goto unlock_exit;
> >>>> +		}
> >>>> +	}
> >>>> +	/* Flush/invalidate TLB caches if necessary */
> >>>> +	if (ppc_md.tce_flush)
> >>>> +		ppc_md.tce_flush(tbl);
> >>>> +
> >>>> +	/* Make sure updates are seen by hardware */
> >>>> +	mb();
> >>>> +
> >>>> +unlock_exit:
> >>>
> >>> unlock seems wrong here, I had to go re-read the code looking for the
> >>> lock.
> >>>
> >>>> +	if (ret && page)
> >>>> +		put_page(page);
> >>>> +
> >>>> +	if (ret)
> >>>> +		printk(KERN_ERR "tce_vfio: tce_put failed on tce=%llx "
> >>>> +				"ioba=%lx kva=%lx\n", tce,
> >>>> +				entry << IOMMU_PAGE_SHIFT, kva);
> >>>> +	return ret;
> >>>> +}
> >>>> +
> >>>> +/*
> >>>> + * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
> >>>> + */
> >>>> +
> >>>> +/*
> >>>> + * The container descriptor supports only a single group per container.
> >>>> + * Required by the API as the container is not supplied with the IOMMU group
> >>>> + * at the moment of initialization.
> >>>> + */
> >>>> +struct tce_container {
> >>>> +	struct iommu_table *tbl;
> >>>> +};
> >>>> +
> >>>> +static void *tce_iommu_open(unsigned long arg)
> >>>> +{
> >>>> +	struct tce_container *container;
> >>>> +
> >>>> +	if (arg != VFIO_SPAPR_TCE_IOMMU) {
> >>>> +		printk(KERN_ERR "tce_vfio: Wrong IOMMU type\n");
> >>>> +		return ERR_PTR(-EINVAL);
> >>>> +	}
> >>>> +
> >>>> +	container = kzalloc(sizeof(*container), GFP_KERNEL);
> >>>> +	if (!container)
> >>>> +		return ERR_PTR(-ENOMEM);
> >>>> +
> >>>> +	return container;
> >>>> +}
> >>>> +
> >>>> +static void tce_iommu_release(void *iommu_data)
> >>>> +{
> >>>> +	struct tce_container *container = iommu_data;
> >>>> +	struct iommu_table *tbl = container->tbl;
> >>>> +	unsigned long i, tce;
> >>>> +
> >>>
> >>> This will segfault if releasing a container that never had an a device
> >>> attached.
> >>>
> >>>> +	/* Unmap leftovers */
> >>>> +	spin_lock_irq(&tbl->it_lock);
> >>>> +	for (i = tbl->it_offset; i < tbl->it_offset + tbl->it_size; ++i) {
> >>>> +		tce = ppc_md.tce_get(tbl, i);
> >>>> +		if (tce & VFIO_SPAPR_TCE_PUT_MASK)
> >>>> +			tce_free(tbl, i, tce);
> >>>> +	}
> >>>> +	/* Flush/invalidate TLB caches if necessary */
> >>>> +	if (ppc_md.tce_flush)
> >>>> +		ppc_md.tce_flush(tbl);
> >>>> +
> >>>> +	/* Make sure updates are seen by hardware */
> >>>> +	mb();
> >>>> +
> >>>> +	spin_unlock_irq(&tbl->it_lock);
> >>>> +
> >>>> +	kfree(container);
> >>>> +}
> >>>> +
> >>>> +static long tce_iommu_ioctl(void *iommu_data,
> >>>> +				 unsigned int cmd, unsigned long arg)
> >>>> +{
> >>>> +	struct tce_container *container = iommu_data;
> >>>> +	unsigned long minsz;
> >>>> +	long ret;
> >>>> +
> >>>> +	switch (cmd) {
> >>>> +	case VFIO_CHECK_EXTENSION: {
> >>>> +		return (arg == VFIO_SPAPR_TCE_IOMMU) ? 1 : 0;
> >>>> +	}
> >>>> +	case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
> >>>> +		struct vfio_iommu_spapr_tce_info info;
> >>>> +		struct iommu_table *tbl = container->tbl;
> >>>> +
> >>>> +		minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
> >>>> +				dma64_window_size);
> >>>> +
> >>>> +		if (copy_from_user(&info, (void __user *)arg, minsz))
> >>>> +			return -EFAULT;
> >>>> +
> >>>> +		if (info.argsz < minsz)
> >>>> +			return -EINVAL;
> >>>> +
> >>>> +		if (!tbl)
> >>>> +			return -ENXIO;
> >>>
> >>> nit: why not check this earlier?
> >>>
> >>>> +
> >>>> +		info.dma32_window_start = tbl->it_offset << IOMMU_PAGE_SHIFT;
> >>>> +		info.dma32_window_size = tbl->it_size << IOMMU_PAGE_SHIFT;
> >>>> +		info.dma64_window_start = 0;
> >>>> +		info.dma64_window_size = 0;
> >>>> +		info.flags = 0;
> >>>> +
> >>>> +		return copy_to_user((void __user *)arg, &info, minsz);
> >>>> +	}
> >>>> +	case VFIO_IOMMU_SPAPR_TCE_PUT: {
> >>>> +		struct vfio_iommu_spapr_tce_put par;
> >>>> +		struct iommu_table *tbl = container->tbl;
> >>>> +
> >>>> +		minsz = offsetofend(struct vfio_iommu_spapr_tce_put, tce);
> >>>> +
> >>>> +		if (copy_from_user(&par, (void __user *)arg, minsz))
> >>>> +			return -EFAULT;
> >>>> +
> >>>> +		if (par.argsz < minsz)
> >>>> +			return -EINVAL;
> >>>> +
> >>>> +		if (!tbl) {
> >>>> +			return -ENXIO;
> >>>> +		}
> >>>
> >>> Same, plus drop the braces.
> >>>
> >>>> +
> >>>> +		spin_lock_irq(&tbl->it_lock);
> >>>> +		ret = tce_put(tbl, par.ioba >> IOMMU_PAGE_SHIFT,
> >>>> +				par.tce, par.flags);
> >>>> +		spin_unlock_irq(&tbl->it_lock);
> >>>> +
> >>>> +		return ret;
> >>>> +	}
> >>>
> >>> Is "PUT" really the name we want for this?
> >>
> >>
> >> Yes, it is a single H_PUT_TCE hypercall from POWER architecture spec.
> >
> > Ok, if it makes sense on your arch, I won't complain (too much) about
> > it.
> >
> >>>> +	default:
> >>>> +		printk(KERN_WARNING "tce_vfio: unexpected cmd %x\n", cmd);
> >>>> +	}
> >>>> +
> >>>> +	return -ENOTTY;
> >>>> +}
> >>>> +
> >>>> +static int tce_iommu_attach_group(void *iommu_data,
> >>>> +		struct iommu_group *iommu_group)
> >>>> +{
> >>>> +	struct tce_container *container = iommu_data;
> >>>> +	struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
> >>>> +
> >>>> +	printk(KERN_DEBUG "tce_vfio: Attaching group #%u to iommu %p\n",
> >>>> +			iommu_group_id(iommu_group), iommu_group);
> >>>
> >>> Let's use pr_debug() and friends throughout.
> >>>
> >>>> +	if (container->tbl) {
> >>>> +		printk(KERN_WARNING "tce_vfio: Only one group per IOMMU "
> >>>> +				"container is allowed, "
> >>>> +				"existing id=%d, attaching id=%d\n",
> >>>> +				iommu_group_id(container->tbl->it_group),
> >>>> +				iommu_group_id(iommu_group));
> >>>> +		return -EBUSY;
> >>>> +	}
> >>>> +
> >>>
> >>> _type1 has a lock to avoid races here, I think you might need one too.
> >>>
> >>>> +	container->tbl = tbl;
> >>>> +
> >>>> +	return 0;
> >>>> +}
> >>>> +
> >>>> +static void tce_iommu_detach_group(void *iommu_data,
> >>>> +		struct iommu_group *iommu_group)
> >>>> +{
> >>>> +	struct tce_container *container = iommu_data;
> >>>> +	struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
> >>>> +
> >>>> +	BUG_ON(!tbl);
> >>>
> >>> Needed?  If so, why is there no check on attach?
> >>
> >> Added to attach() :)
> >>
> >>
> >>>
> >>>> +	if (tbl != container->tbl) {
> >>>> +		printk(KERN_WARNING "tce_vfio: detaching group #%u, expected "
> >>>> +				"group is #%u\n", iommu_group_id(iommu_group),
> >>>> +				iommu_group_id(tbl->it_group));
> >>>> +		return;
> >>>> +	}
> >>>> +	printk(KERN_DEBUG "tce_vfio: detaching group #%u from iommu %p\n",
> >>>> +			iommu_group_id(iommu_group), iommu_group);
> >>>
> >>> container->tbl = NULL?
> >>
> >>
> >> Then I won't be able to release pages in tce_iommu_release().
> >> Releasing pages in tce_iommu_detach_group() caused some other problems,
> >> cannot recall now which ones.
> >
> > What happens if you hot unplug a group from one VM and add it to
> > another?  ie. we've detached it from one container and add it to another
> > in a different instance.  Does it cause problems here?
> 
> 
> Then the container will be released as just one group per container is 
> supported at the moment, no? Cannot check though as we do not support 
> hotplug yet.

But you still have a race where the group is detached but the container
is not yet released, so the group can be attached to another container in
a different instance.

> >>>> +}
> >>>> +
> >>>> +const struct vfio_iommu_driver_ops tce_iommu_driver_ops = {
> >>>> +	.name		= "iommu-vfio-powerpc",
> >>>> +	.owner		= THIS_MODULE,
> >>>> +	.open		= tce_iommu_open,
> >>>> +	.release	= tce_iommu_release,
> >>>> +	.ioctl		= tce_iommu_ioctl,
> >>>> +	.attach_group	= tce_iommu_attach_group,
> >>>> +	.detach_group	= tce_iommu_detach_group,
> >>>> +};
> >>>> +
> >>>> +/*
> >>>> + * Add/delete devices support (hotplug, module_init, module_exit)
> >>>> + */
> >>>> +static int add_device(struct device *dev)
> >>>> +{
> >>>> +	struct iommu_table *tbl;
> >>>> +	int ret = 0;
> >>>> +
> >>>> +	if (dev->iommu_group) {
> >>>> +		printk(KERN_WARNING "tce_vfio: device %s is already in iommu "
> >>>> +				"group %d, skipping\n", dev->kobj.name,
> >>>
> >>> Watch line wrapping on strings.
> >>
> >> Pardon?
> >
> > Just suggesting that you try to wrap lines so that strings are
> > searchable.  For instance, can I search cscope for "is already in iommu
> > group".  It's generally accepted that printks can break 80 cols for
> > this.
> 
> Aaaa. Did not know that this is accepted but was always annoyed to wrap 
> this way, thanks :)
> 
> 
> >>>> +				iommu_group_id(dev->iommu_group));
> >>>> +		return -EBUSY;
> >>>> +	}
> >>>> +
> >>>> +	tbl = get_iommu_table_base(dev);
> >>>> +	if (!tbl) {
> >>>> +		printk(KERN_DEBUG "tce_vfio: skipping device %s with no tbl\n",
> >>>> +				dev->kobj.name);
> >>>> +		return 0;
> >>>> +	}
> >>>> +
> >>>> +	printk(KERN_DEBUG "tce_vfio: adding %s to iommu group %d\n",
> >>>> +			dev->kobj.name, iommu_group_id(tbl->it_group));
> >>>> +
> >>>> +	ret = iommu_group_add_device(tbl->it_group, dev);
> >>>> +	if (ret < 0)
> >>>> +		printk(KERN_ERR "tce_vfio: %s has not been added, ret=%d\n",
> >>>> +				dev->kobj.name, ret);
> >>>> +
> >>>> +	return ret;
> >>>> +}
> >>>> +
> >>>> +static void del_device(struct device *dev)
> >>>> +{
> >>>> +	iommu_group_remove_device(dev);
> >>>> +}
> >>>> +
> >>>> +static int iommu_bus_notifier(struct notifier_block *nb,
> >>>> +			      unsigned long action, void *data)
> >>>> +{
> >>>> +	struct device *dev = data;
> >>>> +
> >>>> +	switch (action) {
> >>>> +	case BUS_NOTIFY_ADD_DEVICE:
> >>>> +		return add_device(dev);
> >>>> +	case BUS_NOTIFY_DEL_DEVICE:
> >>>> +		del_device(dev);
> >>>> +		return 0;
> >>>> +	default:
> >>>> +		return 0;
> >>>> +	}
> >>>> +}
> >>>> +
> >>>> +static struct notifier_block tce_iommu_bus_nb = {
> >>>> +	.notifier_call = iommu_bus_notifier,
> >>>> +};
> >>>> +
> >>>> +void group_release(void *iommu_data)
> >>>> +{
> >>>> +	struct iommu_table *tbl = iommu_data;
> >>>> +	tbl->it_group = NULL;
> >>>> +}
> >>>> +
> >>>> +static int __init tce_iommu_init(void)
> >>>> +{
> >>>> +	struct pci_dev *pdev = NULL;
> >>>> +	struct iommu_table *tbl;
> >>>> +	struct iommu_group *grp;
> >>>> +
> >>>> +	/* If the current platform does not support tce_get
> >>>> +	   we are unable to clean TCE table properly and
> >>>> +	   therefore it is better not to touch it at all */
> >>>> +	if (!ppc_md.tce_get) {
> >>>> +		printk(KERN_ERR "tce_vfio: ppc_md.tce_get isn't implemented\n");
> >>>> +		return -EOPNOTSUPP;
> >>>> +	}
> >>>> +
> >>>> +	bus_register_notifier(&pci_bus_type, &tce_iommu_bus_nb);
> >>>> +
> >>>> +	/* Allocate and initialize VFIO groups */
> >>>
> >>> s/VFIO groups/IOMMU groups/
> >>>
> >>>> +	for_each_pci_dev(pdev) {
> >>>> +		tbl = get_iommu_table_base(&pdev->dev);
> >>>> +		if (!tbl)
> >>>> +			continue;
> >>>> +
> >>>> +		/* Skip already initialized */
> >>>> +		if (tbl->it_group)
> >>>> +			continue;
> >>>> +
> >>>> +		grp = iommu_group_alloc();
> >>>> +		if (IS_ERR(grp)) {
> >>>> +			printk(KERN_INFO "tce_vfio: cannot create "
> >>>> +					"new IOMMU group, ret=%ld\n",
> >>>> +					PTR_ERR(grp));
> >>>> +			return -EFAULT;
> >>>> +		}
> >>>> +		tbl->it_group = grp;
> >>>> +		iommu_group_set_iommudata(grp, tbl, group_release);
> >>>> +	}
> >>>> +
> >>>> +	/* Add PCI devices to VFIO groups */
> >>>> +	for_each_pci_dev(pdev)
> >>>> +		add_device(&pdev->dev);
> >>>> +
> >>>> +	return vfio_register_iommu_driver(&tce_iommu_driver_ops);
> >>>> +}
> >>>> +
> >>>> +static void __exit tce_iommu_cleanup(void)
> >>>> +{
> >>>> +	struct pci_dev *pdev = NULL;
> >>>> +	struct iommu_table *tbl;
> >>>> +	struct iommu_group *grp = NULL;
> >>>> +
> >>>> +	bus_unregister_notifier(&pci_bus_type, &tce_iommu_bus_nb);
> >>>> +
> >>>> +	/* Delete PCI devices from VFIO groups */
> >>>> +	for_each_pci_dev(pdev)
> >>>> +		del_device(&pdev->dev);
> >>>> +
> >>>> +	/* Release VFIO groups */
> >>>> +	for_each_pci_dev(pdev) {
> >>>> +		tbl = get_iommu_table_base(&pdev->dev);
> >>>> +		if (!tbl)
> >>>> +			continue;
> >>>> +		grp = tbl->it_group;
> >>>> +
> >>>> +		/* Skip (already) uninitialized */
> >>>> +		if (!grp)
> >>>> +			continue;
> >>>> +
> >>>> +		/* Do actual release, group_release() is expected to work */
> >>>> +		iommu_group_put(grp);
> >>>> +		BUG_ON(tbl->it_group);
> >>>> +	}
> >>>> +
> >>>
> >>>
> >>> It troubles me a bit that you're using the vfio driver to initialize and
> >>> tear down IOMMU groups on your platform.
> >>
> >>
> >> I am not following you here. Could you please explain a bit?
> >
> > IOMMU groups are theoretically not just for VFIO.  They expose DMA
> > dependencies between devices for anyone who cares to know about it.
> > VFIO happens to care very much about that, but is hopefully not the only
> > consumer.  So it's a little bit like writing a driver for a device on a
> > new bus and incorporating the bus topology handling code into the device
> > driver.  IOMMU groups should be created and managed independent of VFIO.
> 
> Do you mean that we create groups only for PCI devices? Well, moving groups 
> creation where the actual powerpc groups are allocated (pci scan) is 
> problematic right now as iommu_init() is called too late.

I mean IOMMU group creation should be independent of VFIO.  I'm not sure
how to make that ordering work on POWER, but integrating them into your
VFIO driver is contrary to many of the arguments that were made for
making IOMMU groups part of the base device model.

> >>> VFIO makes use of IOMMU groups
> >>> and is the only user so far, but they're hopefully useful beyond this.
> >>> In fact, VFIO used to manage assembling all groups from data provided by
> >>> the IOMMU but David wanted to see IOMMU groups be a more universally
> >>> available feature, so it's odd to see POWER implementing it this way.
> >>
> >>
> >> David, help! :)
> >>
> >>
> >>>> +	vfio_unregister_iommu_driver(&tce_iommu_driver_ops);
> >>>> +}
> >>>> +
> >>>> +module_init(tce_iommu_init);
> >>>> +module_exit(tce_iommu_cleanup);
> >>>> +
> >>>> +MODULE_VERSION(DRIVER_VERSION);
> >>>> +MODULE_LICENSE("GPL v2");
> >>>> +MODULE_AUTHOR(DRIVER_AUTHOR);
> >>>> +MODULE_DESCRIPTION(DRIVER_DESC);
> >>>> +
> >>>> diff --git a/include/linux/vfio.h b/include/linux/vfio.h
> >>>> index 0a4f180..2c0a927 100644
> >>>> --- a/include/linux/vfio.h
> >>>> +++ b/include/linux/vfio.h
> >>>> @@ -99,6 +99,7 @@ extern void vfio_unregister_iommu_driver(
> >>>>    /* Extensions */
> >>>>
> >>>>    #define VFIO_TYPE1_IOMMU		1
> >>>> +#define VFIO_SPAPR_TCE_IOMMU		2
> >>>>
> >>>>    /*
> >>>>     * The IOCTL interface is designed for extensibility by embedding the
> >>>> @@ -442,4 +443,32 @@ struct vfio_iommu_type1_dma_unmap {
> >>>>
> >>>>    #define VFIO_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14)
> >>>>
> >>>> +/* -------- API for SPAPR TCE (Server POWERPC) IOMMU -------- */
> >>>> +
> >>>> +struct vfio_iommu_spapr_tce_info {
> >>>> +	__u32 argsz;
> >>>> +	__u32 flags;
> >>>> +	__u32 dma32_window_start;
> >>>> +	__u32 dma32_window_size;
> >>>> +	__u64 dma64_window_start;
> >>>> +	__u64 dma64_window_size;
> >>>> +};
> >>>> +
> >>>> +#define VFIO_IOMMU_SPAPR_TCE_GET_INFO	_IO(VFIO_TYPE, VFIO_BASE + 12)
> >>>> +
> >>>> +struct vfio_iommu_spapr_tce_put {
> >>>> +	__u32 argsz;
> >>>> +	__u32 flags;
> >>>> +#define VFIO_SPAPR_TCE_READ		1
> >>>> +#define VFIO_SPAPR_TCE_WRITE		2
> >>>> +#define VFIO_SPAPR_TCE_BIDIRECTIONAL	(VFIO_SPAPR_TCE_READ|VFIO_SPAPR_TCE_WRITE)
> >>>> +#define VFIO_SPAPR_TCE_PUT_MASK		VFIO_SPAPR_TCE_BIDIRECTIONAL
> >>>> +	__u64 ioba;
> >>>> +	__u64 tce;
> >>>> +};
> >>>
> >>> Ok, so if READ & WRITE are both clear and ioba is set, that's an
> >>> "unmap"?  This is exactly why _type1 has a MAP and UNMAP, to make it
> >>> clear which fields are necessary for which call.  I think we should
> >>> probably do the same here.  Besides, _put makes me think there should be
> >>> a _get; do these have some unique meaning in POWER?
> >>
> >>
> >> It is a single H_PUT_TCE for putting a record into TCE table. The guest
> >> calls H_PUT_TCE, QEMU replaces the address and simply forwards the call to
> >> the host. Calling them map/unmap makes it less clear for powerpc people :)
> >
> > In the unmap case we take an ioba and lookup a tce to clear, in the map
> > case we take an ioba and tce and insert them into the table.  It's valid
> > to document this and use a single ioctl, but I've opted on x86 to have
> > separate ioctls because the documentation falls out cleaner when there
> > aren't fields that are only used in certain conditions.  Do you really
> > want any userspace driver making use of this to know about powerpc
> > H_PUT_TCE or would it make more sense to have a MAP and UNMAP call?  I
> > think it would be better for the VFIO API if we had some consistency in
> > the mapping ioctls where possible.
> 
> 
> I would think that passing through "as is" as much as possible is the best 
> thing here as the aim is KVM. May be one day we will implement H_PUT_TCE in 
> the kernel, so splitting H_PUT_TCE to map+unmap and then combining it back 
> in the kernel (because we will have H_PUT_TCE handler) is a bit ugly.

No, KVM is a use case for VFIO; we shouldn't assume it's _the_ use
case.  Exposing it "as is" means anyone trying to write a VFIO userspace
driver needs to know about the implementation of H_PUT_TCE to make the
driver work on POWER.  The fact that the same hypercall is made for a
map or unmap is really irrelevant to the VFIO API.

> >>>> +#define VFIO_IOMMU_SPAPR_TCE_PUT	_IO(VFIO_TYPE, VFIO_BASE + 13)
> >>>> +
> >>>
> >>> Please document what all of the above means.  Thanks,
> >>
> >>
> >> Something like this?
> >> /*
> >>    * The VFIO_IOMMU_SPAPR_TCE_PUT is implemented as the H_PUT_TCE hypercall.
> >>    * ioba - I/O Bus Address for indexing into TCE table
> >>    * tce - logical address of storage
> >>    *
> >>    * The non-zero flags means adding new page into the table.
> >>    * The zero flags means releasing the existing page and clearing the
> >>    * TCE table entry.
> >>    */
> >
> > Do you only want VFIO drivers to work on POWER if they're written by
> > POWER people?  Ideally there are a few simple concepts: a) devices have
> > an I/O virtual address space.  On x86 we call this the iova and it's
> > effectively a zero-based, 64bit (not really, but close enough) address
> > space.  You seem to have two smaller windows, one in 32bit space,
> > another in 64bit space (maybe we could name these more consistently).
> > b) Userspace has a buffer that they want to map and unmap to an iova,
> > potentially with some access flags.  That's all you need to know to use
> > the x86 _type1 VFIO IOMMU API.
> 
> 
> Do not you have to map entire RAM to PCI bus? You use listener which 
> purpose is not very clear. This is an extra knowledge beyond qemu-to-host 
> interface which the user space program should know.

In the x86 case, the buffer we want to map is all of guest RAM.  Some of
that changes dynamically, so we have a listener setup to make updates.
The only thing magic about doing that is that the device is then able to
DMA to any part of guest RAM and therefore the guest doesn't need to
know the IOMMU exists.  Device assignment is therefore transparent on
x86.

> > Why do I need to know about H_PUT_TCE to
> > use this interface?  Let's assume there might be some VFIO drivers some
> > day that aren't written by POWER people.  Thanks,
> 
> Example of such a driver? My imagination is weak :)

See Tom Lyon's original user level drivers:

https://github.com/pugs/vfio-user-level-drivers

These are against the original version of VFIO so no longer work, but
he's got drivers for common devices like Intel 82576 & 82599 SR-IOV
NICs.  There are special use cases and special devices where it makes
sense to have a driver in userspace.  Ideally a VFIO driver for a NIC
would work with fairly minimal IOMMU abstractions between x86 and POWER,
but if you design the SPAPR VFIO IOMMU API so that users need to
understand how H_PUT_TCE works to port their driver to POWER, you might
find it more difficult to leverage such drivers.  Thanks,

Alex

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] powerpc-kvm: fixing page alignment for TCE
  2012-09-04  7:36   ` [PATCH] powerpc-kvm: fixing page alignment for TCE Alexey Kardashevskiy
@ 2012-09-20  9:01     ` Alexander Graf
  0 siblings, 0 replies; 25+ messages in thread
From: Alexander Graf @ 2012-09-20  9:01 UTC (permalink / raw)
  To: Alexey Kardashevskiy; +Cc: Paul Mackerras, linuxppc-dev, kvm-ppc, David Gibson


On 04.09.2012, at 09:36, Alexey Kardashevskiy wrote:

> From: Paul Mackerras <paulus@samba.org>
>
> TODO: ask Paul to make a proper message.

TODO?

Also, Ben or Paul, please ack if you think it's correct.


Alex

>
> This is the fix for a host kernel compiled with a page size
> other than 4K (TCE page size). In the case of a 64K page size,
> the host used to lose address bits in hpte_rpn().
> The patch fixes it.
>
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> ---
> arch/powerpc/kvm/book3s_64_mmu_hv.c |    9 ++++-----
> 1 file changed, 4 insertions(+), 5 deletions(-)
>
> diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
> index 80a5775..a41f11b 100644
> --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
> +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
> @@ -503,7 +503,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
> 	struct kvm *kvm = vcpu->kvm;
> 	unsigned long *hptep, hpte[3], r;
> 	unsigned long mmu_seq, psize, pte_size;
> -	unsigned long gfn, hva, pfn;
> +	unsigned long gpa, gfn, hva, pfn;
> 	struct kvm_memory_slot *memslot;
> 	unsigned long *rmap;
> 	struct revmap_entry *rev;
> @@ -541,15 +541,14 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
>
> 	/* Translate the logical address and get the page */
> 	psize = hpte_page_size(hpte[0], r);
> -	gfn = hpte_rpn(r, psize);
> +	gpa = (r & HPTE_R_RPN & ~(psize - 1)) | (ea & (psize - 1));
> +	gfn = gpa >> PAGE_SHIFT;
> 	memslot = gfn_to_memslot(kvm, gfn);
>
> 	/* No memslot means it's an emulated MMIO region */
> -	if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) {
> -		unsigned long gpa = (gfn << PAGE_SHIFT) | (ea & (psize - 1));
> +	if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID))
> 		return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea,
> 					      dsisr & DSISR_ISSTORE);
> -	}
>
> 	if (!kvm->arch.using_mmu_notifiers)
> 		return -EFAULT;		/* should never get here */
> --
> 1.7.10.4
>
> --
> To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] vfio: enabled and supported on power (v7)
  2012-09-14  4:35           ` Alex Williamson
@ 2012-10-11  8:19             ` Alexey Kardashevskiy
  2012-10-11 18:09               ` Alex Williamson
  0 siblings, 1 reply; 25+ messages in thread
From: Alexey Kardashevskiy @ 2012-10-11  8:19 UTC (permalink / raw)
  To: Alex Williamson; +Cc: Paul Mackerras, linuxppc-dev, David Gibson

Ok I'm back, nothing seems to have happened during the last month :)


On 14/09/12 14:35, Alex Williamson wrote:
> On Fri, 2012-09-14 at 10:51 +1000, Alexey Kardashevskiy wrote:
>> On 14/09/12 08:34, Alex Williamson wrote:
>>> On Tue, 2012-09-11 at 18:28 +1000, Alexey Kardashevskiy wrote:
>>>> On 11/09/12 02:02, Alex Williamson wrote:
>>>>> On Tue, 2012-09-04 at 17:33 +1000, Alexey Kardashevskiy wrote:
>>>>>> Cc: David Gibson <david@gibson.dropbear.id.au>
>>>>>> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
>>>>>> Cc: Paul Mackerras <paulus@samba.org>
>>>>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>>>>>> ---
>>>>>
>>>>> Please at least cc kvm@vger as well since we list that as the devel list
>>>>> for vfio.
>>>>>
>>>>>>     arch/powerpc/include/asm/iommu.h    |    3 +
>>>>>
>>>>> I'll need an ack from Ben or Paul for this change.
>>>>>
>>>>>>     drivers/iommu/Kconfig               |    8 +
>>>>>>     drivers/vfio/Kconfig                |    6 +
>>>>>>     drivers/vfio/Makefile               |    1 +
>>>>>>     drivers/vfio/vfio_iommu_spapr_tce.c |  440 +++++++++++++++++++++++++++++++++++
>>>>>>     include/linux/vfio.h                |   29 +++
>>>>>>     6 files changed, 487 insertions(+)
>>>>>>     create mode 100644 drivers/vfio/vfio_iommu_spapr_tce.c
>>>>>>
>>>>>> diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
>>>>>> index 957a83f..c64bce7 100644
>>>>>> --- a/arch/powerpc/include/asm/iommu.h
>>>>>> +++ b/arch/powerpc/include/asm/iommu.h
>>>>>> @@ -66,6 +66,9 @@ struct iommu_table {
>>>>>>     	unsigned long  it_halfpoint; /* Breaking point for small/large allocs */
>>>>>>     	spinlock_t     it_lock;      /* Protects it_map */
>>>>>>     	unsigned long *it_map;       /* A simple allocation bitmap for now */
>>>>>> +#ifdef CONFIG_IOMMU_API
>>>>>> +	struct iommu_group *it_group;
>>>>>> +#endif
>>>>>>     };
>>>>>
>>>>> This seems to only be valid when vfio_iommu_spapr_tce is loaded, which
>>>>> is a bit misleading.
>>>>>
>>>>>>
>>>>>>     struct scatterlist;
>>>>>> diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
>>>>>> index 3bd9fff..19cf2d9 100644
>>>>>> --- a/drivers/iommu/Kconfig
>>>>>> +++ b/drivers/iommu/Kconfig
>>>>>> @@ -162,4 +162,12 @@ config TEGRA_IOMMU_SMMU
>>>>>>     	  space through the SMMU (System Memory Management Unit)
>>>>>>     	  hardware included on Tegra SoCs.
>>>>>>
>>>>>> +config SPAPR_TCE_IOMMU
>>>>>> +	bool "sPAPR TCE IOMMU Support"
>>>>>> +	depends on PPC_PSERIES
>>>>>> +	select IOMMU_API
>>>>>> +	help
>>>>>> +	  Enables bits of IOMMU API required by VFIO. The iommu_ops is
>>>>>> +	  still not implemented.
>>>>>> +
>>>>>>     endif # IOMMU_SUPPORT
>>>>>> diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
>>>>>> index 7cd5dec..b464687 100644
>>>>>> --- a/drivers/vfio/Kconfig
>>>>>> +++ b/drivers/vfio/Kconfig
>>>>>> @@ -3,10 +3,16 @@ config VFIO_IOMMU_TYPE1
>>>>>>     	depends on VFIO
>>>>>>     	default n
>>>>>>
>>>>>> +config VFIO_IOMMU_SPAPR_TCE
>>>>>> +	tristate
>>>>>> +	depends on VFIO && SPAPR_TCE_IOMMU
>>>>>> +	default n
>>>>>> +
>>>>>>     menuconfig VFIO
>>>>>>     	tristate "VFIO Non-Privileged userspace driver framework"
>>>>>>     	depends on IOMMU_API
>>>>>>     	select VFIO_IOMMU_TYPE1 if X86
>>>>>> +	select VFIO_IOMMU_SPAPR_TCE if PPC_POWERNV
>>>>>>     	help
>>>>>>     	  VFIO provides a framework for secure userspace device drivers.
>>>>>>     	  See Documentation/vfio.txt for more details.
>>>>>> diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
>>>>>> index 2398d4a..72bfabc 100644
>>>>>> --- a/drivers/vfio/Makefile
>>>>>> +++ b/drivers/vfio/Makefile
>>>>>> @@ -1,3 +1,4 @@
>>>>>>     obj-$(CONFIG_VFIO) += vfio.o
>>>>>>     obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o
>>>>>> +obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o
>>>>>>     obj-$(CONFIG_VFIO_PCI) += pci/
>>>>>> diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
>>>>>> new file mode 100644
>>>>>> index 0000000..21f1909
>>>>>> --- /dev/null
>>>>>> +++ b/drivers/vfio/vfio_iommu_spapr_tce.c
>>>>>> @@ -0,0 +1,440 @@
>>>>>> +/*
>>>>>> + * VFIO: IOMMU DMA mapping support for TCE on POWER
>>>>>> + *
>>>>>> + * Copyright (C) 2012 IBM Corp.  All rights reserved.
>>>>>> + *     Author: Alexey Kardashevskiy <aik@ozlabs.ru>
>>>>>> + *
>>>>>> + * This program is free software; you can redistribute it and/or modify
>>>>>> + * it under the terms of the GNU General Public License version 2 as
>>>>>> + * published by the Free Software Foundation.
>>>>>> + *
>>>>>> + * Derived from original vfio_iommu_x86.c:
>>>>>
>>>>> Should this be _type1?  Only the mail archives are going to remember
>>>>> there was a _x86, so the renamed version is probably a better reference.
>>>>>
>>>>>> + * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
>>>>>> + *     Author: Alex Williamson <alex.williamson@redhat.com>
>>>>>> + */
>>>>>> +
>>>>>> +#include <linux/module.h>
>>>>>> +#include <linux/pci.h>
>>>>>> +#include <linux/slab.h>
>>>>>> +#include <linux/uaccess.h>
>>>>>> +#include <linux/err.h>
>>>>>> +#include <linux/vfio.h>
>>>>>> +#include <linux/spinlock.h>
>>>>>> +#include <asm/iommu.h>
>>>>>> +
>>>>>> +#define DRIVER_VERSION  "0.1"
>>>>>> +#define DRIVER_AUTHOR   "aik@ozlabs.ru"
>>>>>> +#define DRIVER_DESC     "VFIO IOMMU SPAPR TCE"
>>>>>> +
>>>>>> +
>>>>>> +/*
>>>>>> + * SPAPR TCE API
>>>>>> + */
>>>>>> +static void tce_free(struct iommu_table *tbl, unsigned long entry,
>>>>>> +		unsigned long tce)
>>>>>> +{
>>>>>> +	struct page *page = pfn_to_page(tce >> PAGE_SHIFT);
>>>>>> +
>>>>>> +	WARN_ON(!page);
>>>>>> +	if (page) {
>>>>>> +		if (tce & VFIO_SPAPR_TCE_WRITE)
>>>>>> +			SetPageDirty(page);
>>>>>> +		put_page(page);
>>>>>> +	}
>>>>>> +	ppc_md.tce_free(tbl, entry, 1);
>>>>>> +}
>>>>>> +
>>>>>> +static long tce_put(struct iommu_table *tbl,
>>>>>> +		unsigned long entry, uint64_t tce, uint32_t flags)
>>>>>> +{
>>>>>> +	int ret;
>>>>>> +	unsigned long oldtce, kva, offset;
>>>>>> +	struct page *page = NULL;
>>>>>> +	enum dma_data_direction direction = DMA_NONE;
>>>>>> +
>>>>>> +	switch (flags & VFIO_SPAPR_TCE_PUT_MASK) {
>>>>>> +	case VFIO_SPAPR_TCE_READ:
>>>>>> +		direction = DMA_TO_DEVICE;
>>>>>> +		break;
>>>>>> +	case VFIO_SPAPR_TCE_WRITE:
>>>>>> +		direction = DMA_FROM_DEVICE;
>>>>>> +		break;
>>>>>> +	case VFIO_SPAPR_TCE_BIDIRECTIONAL:
>>>>>> +		direction = DMA_BIDIRECTIONAL;
>>>>>> +		break;
>>>>>> +	}
>>>>>> +
>>>>>> +	oldtce = ppc_md.tce_get(tbl, entry);
>>>>>> +
>>>>>> +	/* Free page if still allocated */
>>>>>> +	if (oldtce & VFIO_SPAPR_TCE_PUT_MASK)
>>>>>> +		tce_free(tbl, entry, oldtce);
>>>>>> +
>>>>>> +	/* Map new TCE */
>>>>>> +	if (direction != DMA_NONE) {
>>>>>> +		offset = (tce & IOMMU_PAGE_MASK) - (tce & PAGE_MASK);
>>>>>> +		ret = get_user_pages_fast(tce & PAGE_MASK, 1,
>>>>>> +				direction != DMA_TO_DEVICE, &page);
>>>>>> +		BUG_ON(ret > 1);
>>>>>
>>>>> Can this happen?
>>>>>
>>>>>> +		if (ret < 1) {
>>>>>> +			printk(KERN_ERR "tce_vfio: get_user_pages_fast failed "
>>>>>> +					"tce=%llx ioba=%lx ret=%d\n",
>>>>>> +					tce, entry << IOMMU_PAGE_SHIFT, ret);
>>>>>> +			if (!ret)
>>>>>> +				ret = -EFAULT;
>>>>>> +			goto unlock_exit;
>>>>>> +		}
>>>>>> +
>>>>>> +		kva = (unsigned long) page_address(page);
>>>>>> +		kva += offset;
>>>>>> +		BUG_ON(!kva);
>>>>>
>>>>> Same here, can it happen?  If so, should it BUG or catch the below
>>>>> EINVAL?
>>>>>
>>>>>> +		if (WARN_ON(kva & ~IOMMU_PAGE_MASK))
>>>>>> +			return -EINVAL;
>>>>>
>>>>> Page leak?  Don't we want to do a put_page(), which means we probably
>>>>> want a goto exit here.
>>>>>
>>>>>> +
>>>>>> +		/* Preserve access bits */
>>>>>> +		kva |= flags & VFIO_SPAPR_TCE_PUT_MASK;
>>>>>> +
>>>>>> +		/* tce_build receives a virtual address */
>>>>>> +		entry += tbl->it_offset;	/* Offset into real TCE table */
>>>>>> +		ret = ppc_md.tce_build(tbl, entry, 1, kva, direction, NULL);
>>>>>> +
>>>>>> +		/* tce_build() only returns non-zero for transient errors */
>>>>>> +		if (unlikely(ret)) {
>>>>>> +			printk(KERN_ERR "tce_vfio: Failed to add TCE\n");
>>>>>> +			ret = -EIO;
>>>>>> +			goto unlock_exit;
>>>>>> +		}
>>>>>> +	}
>>>>>> +	/* Flush/invalidate TLB caches if necessary */
>>>>>> +	if (ppc_md.tce_flush)
>>>>>> +		ppc_md.tce_flush(tbl);
>>>>>> +
>>>>>> +	/* Make sure updates are seen by hardware */
>>>>>> +	mb();
>>>>>> +
>>>>>> +unlock_exit:
>>>>>
>>>>> unlock seems wrong here, I had to go re-read the code looking for the
>>>>> lock.
>>>>>
>>>>>> +	if (ret && page)
>>>>>> +		put_page(page);
>>>>>> +
>>>>>> +	if (ret)
>>>>>> +		printk(KERN_ERR "tce_vfio: tce_put failed on tce=%llx "
>>>>>> +				"ioba=%lx kva=%lx\n", tce,
>>>>>> +				entry << IOMMU_PAGE_SHIFT, kva);
>>>>>> +	return ret;
>>>>>> +}
>>>>>> +
>>>>>> +/*
>>>>>> + * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
>>>>>> + */
>>>>>> +
>>>>>> +/*
>>>>>> + * The container descriptor supports only a single group per container.
>>>>>> + * Required by the API as the container is not supplied with the IOMMU group
>>>>>> + * at the moment of initialization.
>>>>>> + */
>>>>>> +struct tce_container {
>>>>>> +	struct iommu_table *tbl;
>>>>>> +};
>>>>>> +
>>>>>> +static void *tce_iommu_open(unsigned long arg)
>>>>>> +{
>>>>>> +	struct tce_container *container;
>>>>>> +
>>>>>> +	if (arg != VFIO_SPAPR_TCE_IOMMU) {
>>>>>> +		printk(KERN_ERR "tce_vfio: Wrong IOMMU type\n");
>>>>>> +		return ERR_PTR(-EINVAL);
>>>>>> +	}
>>>>>> +
>>>>>> +	container = kzalloc(sizeof(*container), GFP_KERNEL);
>>>>>> +	if (!container)
>>>>>> +		return ERR_PTR(-ENOMEM);
>>>>>> +
>>>>>> +	return container;
>>>>>> +}
>>>>>> +
>>>>>> +static void tce_iommu_release(void *iommu_data)
>>>>>> +{
>>>>>> +	struct tce_container *container = iommu_data;
>>>>>> +	struct iommu_table *tbl = container->tbl;
>>>>>> +	unsigned long i, tce;
>>>>>> +
>>>>>
>>>>> This will segfault if releasing a container that never had an a device
>>>>> attached.
>>>>>
>>>>>> +	/* Unmap leftovers */
>>>>>> +	spin_lock_irq(&tbl->it_lock);
>>>>>> +	for (i = tbl->it_offset; i < tbl->it_offset + tbl->it_size; ++i) {
>>>>>> +		tce = ppc_md.tce_get(tbl, i);
>>>>>> +		if (tce & VFIO_SPAPR_TCE_PUT_MASK)
>>>>>> +			tce_free(tbl, i, tce);
>>>>>> +	}
>>>>>> +	/* Flush/invalidate TLB caches if necessary */
>>>>>> +	if (ppc_md.tce_flush)
>>>>>> +		ppc_md.tce_flush(tbl);
>>>>>> +
>>>>>> +	/* Make sure updates are seen by hardware */
>>>>>> +	mb();
>>>>>> +
>>>>>> +	spin_unlock_irq(&tbl->it_lock);
>>>>>> +
>>>>>> +	kfree(container);
>>>>>> +}
>>>>>> +
>>>>>> +static long tce_iommu_ioctl(void *iommu_data,
>>>>>> +				 unsigned int cmd, unsigned long arg)
>>>>>> +{
>>>>>> +	struct tce_container *container = iommu_data;
>>>>>> +	unsigned long minsz;
>>>>>> +	long ret;
>>>>>> +
>>>>>> +	switch (cmd) {
>>>>>> +	case VFIO_CHECK_EXTENSION: {
>>>>>> +		return (arg == VFIO_SPAPR_TCE_IOMMU) ? 1 : 0;
>>>>>> +	}
>>>>>> +	case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
>>>>>> +		struct vfio_iommu_spapr_tce_info info;
>>>>>> +		struct iommu_table *tbl = container->tbl;
>>>>>> +
>>>>>> +		minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
>>>>>> +				dma64_window_size);
>>>>>> +
>>>>>> +		if (copy_from_user(&info, (void __user *)arg, minsz))
>>>>>> +			return -EFAULT;
>>>>>> +
>>>>>> +		if (info.argsz < minsz)
>>>>>> +			return -EINVAL;
>>>>>> +
>>>>>> +		if (!tbl)
>>>>>> +			return -ENXIO;
>>>>>
>>>>> nit: why not check this earlier?
>>>>>
>>>>>> +
>>>>>> +		info.dma32_window_start = tbl->it_offset << IOMMU_PAGE_SHIFT;
>>>>>> +		info.dma32_window_size = tbl->it_size << IOMMU_PAGE_SHIFT;
>>>>>> +		info.dma64_window_start = 0;
>>>>>> +		info.dma64_window_size = 0;
>>>>>> +		info.flags = 0;
>>>>>> +
>>>>>> +		return copy_to_user((void __user *)arg, &info, minsz);
>>>>>> +	}
>>>>>> +	case VFIO_IOMMU_SPAPR_TCE_PUT: {
>>>>>> +		struct vfio_iommu_spapr_tce_put par;
>>>>>> +		struct iommu_table *tbl = container->tbl;
>>>>>> +
>>>>>> +		minsz = offsetofend(struct vfio_iommu_spapr_tce_put, tce);
>>>>>> +
>>>>>> +		if (copy_from_user(&par, (void __user *)arg, minsz))
>>>>>> +			return -EFAULT;
>>>>>> +
>>>>>> +		if (par.argsz < minsz)
>>>>>> +			return -EINVAL;
>>>>>> +
>>>>>> +		if (!tbl) {
>>>>>> +			return -ENXIO;
>>>>>> +		}
>>>>>
>>>>> Same, plus drop the braces.
>>>>>
>>>>>> +
>>>>>> +		spin_lock_irq(&tbl->it_lock);
>>>>>> +		ret = tce_put(tbl, par.ioba >> IOMMU_PAGE_SHIFT,
>>>>>> +				par.tce, par.flags);
>>>>>> +		spin_unlock_irq(&tbl->it_lock);
>>>>>> +
>>>>>> +		return ret;
>>>>>> +	}
>>>>>
>>>>> Is "PUT" really the name we want for this?
>>>>
>>>>
>>>> Yes, it is a single H_PUT_TCE hypercall from POWER architecture spec.
>>>
>>> Ok, if it makes sense on your arch, I won't complain (too much) about
>>> it.
>>>
>>>>>> +	default:
>>>>>> +		printk(KERN_WARNING "tce_vfio: unexpected cmd %x\n", cmd);
>>>>>> +	}
>>>>>> +
>>>>>> +	return -ENOTTY;
>>>>>> +}
>>>>>> +
>>>>>> +static int tce_iommu_attach_group(void *iommu_data,
>>>>>> +		struct iommu_group *iommu_group)
>>>>>> +{
>>>>>> +	struct tce_container *container = iommu_data;
>>>>>> +	struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
>>>>>> +
>>>>>> +	printk(KERN_DEBUG "tce_vfio: Attaching group #%u to iommu %p\n",
>>>>>> +			iommu_group_id(iommu_group), iommu_group);
>>>>>
>>>>> Let's use pr_debug() and friends throughout.
>>>>>
>>>>>> +	if (container->tbl) {
>>>>>> +		printk(KERN_WARNING "tce_vfio: Only one group per IOMMU "
>>>>>> +				"container is allowed, "
>>>>>> +				"existing id=%d, attaching id=%d\n",
>>>>>> +				iommu_group_id(container->tbl->it_group),
>>>>>> +				iommu_group_id(iommu_group));
>>>>>> +		return -EBUSY;
>>>>>> +	}
>>>>>> +
>>>>>
>>>>> _type1 has a lock to avoid races here, I think you might need one too.
>>>>>
>>>>>> +	container->tbl = tbl;
>>>>>> +
>>>>>> +	return 0;
>>>>>> +}
>>>>>> +
>>>>>> +static void tce_iommu_detach_group(void *iommu_data,
>>>>>> +		struct iommu_group *iommu_group)
>>>>>> +{
>>>>>> +	struct tce_container *container = iommu_data;
>>>>>> +	struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
>>>>>> +
>>>>>> +	BUG_ON(!tbl);
>>>>>
>>>>> Needed?  If so, why is there no check on attach?
>>>>
>>>> Added to attach() :)
>>>>
>>>>
>>>>>
>>>>>> +	if (tbl != container->tbl) {
>>>>>> +		printk(KERN_WARNING "tce_vfio: detaching group #%u, expected "
>>>>>> +				"group is #%u\n", iommu_group_id(iommu_group),
>>>>>> +				iommu_group_id(tbl->it_group));
>>>>>> +		return;
>>>>>> +	}
>>>>>> +	printk(KERN_DEBUG "tce_vfio: detaching group #%u from iommu %p\n",
>>>>>> +			iommu_group_id(iommu_group), iommu_group);
>>>>>
>>>>> container->tbl = NULL?
>>>>
>>>>
>>>> Then I won't be able to release pages in tce_iommu_release().
>>>> Releasing pages in tce_iommu_detach_group() caused some other problems,
>>>> cannot recall now which ones.
>>>
>>> What happens if you hot unplug a group from one VM and add it to
>>> another?  ie. we've detached it from one container and add it to another
>>> in a different instance.  Does it cause problems here?
>>
>>
>> Then the container will be released as just one group per container is
>> supported at the moment, no? Cannot check though as we do not support
>> hotplug yet.
>
> But you still have a race where the group is detached, but the container
> is not yet released and can be attached to another container in a
> different instance.


Yeah... Moved cleanup to detach(), trying to reproduce the issue I had 
before but to no avail.


>>>>>> +}
>>>>>> +
>>>>>> +const struct vfio_iommu_driver_ops tce_iommu_driver_ops = {
>>>>>> +	.name		= "iommu-vfio-powerpc",
>>>>>> +	.owner		= THIS_MODULE,
>>>>>> +	.open		= tce_iommu_open,
>>>>>> +	.release	= tce_iommu_release,
>>>>>> +	.ioctl		= tce_iommu_ioctl,
>>>>>> +	.attach_group	= tce_iommu_attach_group,
>>>>>> +	.detach_group	= tce_iommu_detach_group,
>>>>>> +};
>>>>>> +
>>>>>> +/*
>>>>>> + * Add/delete devices support (hotplug, module_init, module_exit)
>>>>>> + */
>>>>>> +static int add_device(struct device *dev)
>>>>>> +{
>>>>>> +	struct iommu_table *tbl;
>>>>>> +	int ret = 0;
>>>>>> +
>>>>>> +	if (dev->iommu_group) {
>>>>>> +		printk(KERN_WARNING "tce_vfio: device %s is already in iommu "
>>>>>> +				"group %d, skipping\n", dev->kobj.name,
>>>>>
>>>>> Watch line wrapping on strings.
>>>>
>>>> Pardon?
>>>
>>> Just suggesting that you try to wrap lines so that strings are
>>> searchable.  For instance, can I search cscope for "is already in iommu
>>> group".  It's generally accepted that printks can break 80 cols for
>>> this.
>>
>> Aaaa. Did not know that this is accepted but was always annoyed to wrap
>> this way, thanks :)
>>
>>
>>>>>> +				iommu_group_id(dev->iommu_group));
>>>>>> +		return -EBUSY;
>>>>>> +	}
>>>>>> +
>>>>>> +	tbl = get_iommu_table_base(dev);
>>>>>> +	if (!tbl) {
>>>>>> +		printk(KERN_DEBUG "tce_vfio: skipping device %s with no tbl\n",
>>>>>> +				dev->kobj.name);
>>>>>> +		return 0;
>>>>>> +	}
>>>>>> +
>>>>>> +	printk(KERN_DEBUG "tce_vfio: adding %s to iommu group %d\n",
>>>>>> +			dev->kobj.name, iommu_group_id(tbl->it_group));
>>>>>> +
>>>>>> +	ret = iommu_group_add_device(tbl->it_group, dev);
>>>>>> +	if (ret < 0)
>>>>>> +		printk(KERN_ERR "tce_vfio: %s has not been added, ret=%d\n",
>>>>>> +				dev->kobj.name, ret);
>>>>>> +
>>>>>> +	return ret;
>>>>>> +}
>>>>>> +
>>>>>> +static void del_device(struct device *dev)
>>>>>> +{
>>>>>> +	iommu_group_remove_device(dev);
>>>>>> +}
>>>>>> +
>>>>>> +static int iommu_bus_notifier(struct notifier_block *nb,
>>>>>> +			      unsigned long action, void *data)
>>>>>> +{
>>>>>> +	struct device *dev = data;
>>>>>> +
>>>>>> +	switch (action) {
>>>>>> +	case BUS_NOTIFY_ADD_DEVICE:
>>>>>> +		return add_device(dev);
>>>>>> +	case BUS_NOTIFY_DEL_DEVICE:
>>>>>> +		del_device(dev);
>>>>>> +		return 0;
>>>>>> +	default:
>>>>>> +		return 0;
>>>>>> +	}
>>>>>> +}
>>>>>> +
>>>>>> +static struct notifier_block tce_iommu_bus_nb = {
>>>>>> +	.notifier_call = iommu_bus_notifier,
>>>>>> +};
>>>>>> +
>>>>>> +void group_release(void *iommu_data)
>>>>>> +{
>>>>>> +	struct iommu_table *tbl = iommu_data;
>>>>>> +	tbl->it_group = NULL;
>>>>>> +}
>>>>>> +
>>>>>> +static int __init tce_iommu_init(void)
>>>>>> +{
>>>>>> +	struct pci_dev *pdev = NULL;
>>>>>> +	struct iommu_table *tbl;
>>>>>> +	struct iommu_group *grp;
>>>>>> +
>>>>>> +	/* If the current platform does not support tce_get
>>>>>> +	   we are unable to clean TCE table properly and
>>>>>> +	   therefore it is better not to touch it at all */
>>>>>> +	if (!ppc_md.tce_get) {
>>>>>> +		printk(KERN_ERR "tce_vfio: ppc_md.tce_get isn't implemented\n");
>>>>>> +		return -EOPNOTSUPP;
>>>>>> +	}
>>>>>> +
>>>>>> +	bus_register_notifier(&pci_bus_type, &tce_iommu_bus_nb);
>>>>>> +
>>>>>> +	/* Allocate and initialize VFIO groups */
>>>>>
>>>>> s/VFIO groups/IOMMU groups/
>>>>>
>>>>>> +	for_each_pci_dev(pdev) {
>>>>>> +		tbl = get_iommu_table_base(&pdev->dev);
>>>>>> +		if (!tbl)
>>>>>> +			continue;
>>>>>> +
>>>>>> +		/* Skip already initialized */
>>>>>> +		if (tbl->it_group)
>>>>>> +			continue;
>>>>>> +
>>>>>> +		grp = iommu_group_alloc();
>>>>>> +		if (IS_ERR(grp)) {
>>>>>> +			printk(KERN_INFO "tce_vfio: cannot create "
>>>>>> +					"new IOMMU group, ret=%ld\n",
>>>>>> +					PTR_ERR(grp));
>>>>>> +			return -EFAULT;
>>>>>> +		}
>>>>>> +		tbl->it_group = grp;
>>>>>> +		iommu_group_set_iommudata(grp, tbl, group_release);
>>>>>> +	}
>>>>>> +
>>>>>> +	/* Add PCI devices to VFIO groups */
>>>>>> +	for_each_pci_dev(pdev)
>>>>>> +		add_device(&pdev->dev);
>>>>>> +
>>>>>> +	return vfio_register_iommu_driver(&tce_iommu_driver_ops);
>>>>>> +}
>>>>>> +
>>>>>> +static void __exit tce_iommu_cleanup(void)
>>>>>> +{
>>>>>> +	struct pci_dev *pdev = NULL;
>>>>>> +	struct iommu_table *tbl;
>>>>>> +	struct iommu_group *grp = NULL;
>>>>>> +
>>>>>> +	bus_unregister_notifier(&pci_bus_type, &tce_iommu_bus_nb);
>>>>>> +
>>>>>> +	/* Delete PCI devices from VFIO groups */
>>>>>> +	for_each_pci_dev(pdev)
>>>>>> +		del_device(&pdev->dev);
>>>>>> +
>>>>>> +	/* Release VFIO groups */
>>>>>> +	for_each_pci_dev(pdev) {
>>>>>> +		tbl = get_iommu_table_base(&pdev->dev);
>>>>>> +		if (!tbl)
>>>>>> +			continue;
>>>>>> +		grp = tbl->it_group;
>>>>>> +
>>>>>> +		/* Skip (already) uninitialized */
>>>>>> +		if (!grp)
>>>>>> +			continue;
>>>>>> +
>>>>>> +		/* Do actual release, group_release() is expected to work */
>>>>>> +		iommu_group_put(grp);
>>>>>> +		BUG_ON(tbl->it_group);
>>>>>> +	}
>>>>>> +
>>>>>
>>>>>
>>>>> It troubles me a bit that you're using the vfio driver to initialize and
>>>>> tear down IOMMU groups on your platform.
>>>>
>>>>
>>>> I am not following you here. Could you please explain a bit?
>>>
>>> IOMMU groups are theoretically not just for VFIO.  They expose DMA
>>> dependencies between devices for anyone who cares to know about it.
>>> VFIO happens to care very much about that, but is hopefully not the only
>>> consumer.  So it's a little bit like writing a driver for a device on a
>>> new bus and incorporating the bus topology handling code into the device
>>> driver.  IOMMU groups should be created and managed independent of VFIO.
>>
>> Do you mean that we create groups only for PCI devices? Well, moving groups
>> creation where the actual powerpc groups are allocated (pci scan) is
>> problematic right now as iommu_init() is called too late.
>
> I mean IOMMU group creation should be independent of VFIO.  I'm not sure
> how to make that ordering work on POWER, but integrating them into your
> VFIO driver is contrary to many of the arguments that were made for
> making IOMMU groups part of the base device model.


I still do not get it. The creation code itself does not depend on VFIO.
And yes, I would like to create groups from the platform code when the 
actual IOMMU tables are created, the only problem is that iommu_init() is 
called too late - after PCI scan (subsys_initcall(pcibios_init) from 
arch/powerpc/kernel/pci_64.c), iommu_init is subsys_initcall as well.
I could move tce_iommu_init/tce_iommu_cleanup as a module somewhere in 
arch/powerpc but moving iommu_init() earlier looks better, then I would 
create IOMMU groups exactly when their POWER counterparts are created.



>>>>> VFIO makes use of IOMMU groups
>>>>> and is the only user so far, but they're hopefully useful beyond this.
>>>>> In fact, VFIO used to manage assembling all groups from data provided by
>>>>> the IOMMU but David wanted to see IOMMU groups be a more universally
>>>>> available feature, so it's odd to see POWER implementing it this way.
>>>>
>>>>
>>>> David, help! :)
>>>>
>>>>
>>>>>> +	vfio_unregister_iommu_driver(&tce_iommu_driver_ops);
>>>>>> +}
>>>>>> +
>>>>>> +module_init(tce_iommu_init);
>>>>>> +module_exit(tce_iommu_cleanup);
>>>>>> +
>>>>>> +MODULE_VERSION(DRIVER_VERSION);
>>>>>> +MODULE_LICENSE("GPL v2");
>>>>>> +MODULE_AUTHOR(DRIVER_AUTHOR);
>>>>>> +MODULE_DESCRIPTION(DRIVER_DESC);
>>>>>> +
>>>>>> diff --git a/include/linux/vfio.h b/include/linux/vfio.h
>>>>>> index 0a4f180..2c0a927 100644
>>>>>> --- a/include/linux/vfio.h
>>>>>> +++ b/include/linux/vfio.h
>>>>>> @@ -99,6 +99,7 @@ extern void vfio_unregister_iommu_driver(
>>>>>>     /* Extensions */
>>>>>>
>>>>>>     #define VFIO_TYPE1_IOMMU		1
>>>>>> +#define VFIO_SPAPR_TCE_IOMMU		2
>>>>>>
>>>>>>     /*
>>>>>>      * The IOCTL interface is designed for extensibility by embedding the
>>>>>> @@ -442,4 +443,32 @@ struct vfio_iommu_type1_dma_unmap {
>>>>>>
>>>>>>     #define VFIO_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14)
>>>>>>
>>>>>> +/* -------- API for SPAPR TCE (Server POWERPC) IOMMU -------- */
>>>>>> +
>>>>>> +struct vfio_iommu_spapr_tce_info {
>>>>>> +	__u32 argsz;
>>>>>> +	__u32 flags;
>>>>>> +	__u32 dma32_window_start;
>>>>>> +	__u32 dma32_window_size;
>>>>>> +	__u64 dma64_window_start;
>>>>>> +	__u64 dma64_window_size;
>>>>>> +};
>>>>>> +
>>>>>> +#define VFIO_IOMMU_SPAPR_TCE_GET_INFO	_IO(VFIO_TYPE, VFIO_BASE + 12)
>>>>>> +
>>>>>> +struct vfio_iommu_spapr_tce_put {
>>>>>> +	__u32 argsz;
>>>>>> +	__u32 flags;
>>>>>> +#define VFIO_SPAPR_TCE_READ		1
>>>>>> +#define VFIO_SPAPR_TCE_WRITE		2
>>>>>> +#define VFIO_SPAPR_TCE_BIDIRECTIONAL	(VFIO_SPAPR_TCE_READ|VFIO_SPAPR_TCE_WRITE)
>>>>>> +#define VFIO_SPAPR_TCE_PUT_MASK		VFIO_SPAPR_TCE_BIDIRECTIONAL
>>>>>> +	__u64 ioba;
>>>>>> +	__u64 tce;
>>>>>> +};
>>>>>
>>>>> Ok, so if READ & WRITE are both clear and ioba is set, that's an
>>>>> "unmap"?  This is exactly why _type1 has a MAP and UNMAP, to make it
>>>>> clear which fields are necessary for which call.  I think we should
>>>>> probably do the same here.  Besides, _put makes me think there should be
>>>>> a _get; do these have some unique meaning in POWER?
>>>>
>>>>
>>>> It is a single H_PUT_TCE for putting a record into TCE table. The guest
>>>> calls H_PUT_TCE, QEMU replaces the address and simply forwards the call to
>>>> the host. Calling them map/unmap makes it less clear for powerpc people :)
>>>
>>> In the unmap case we take an ioba and lookup a tce to clear, in the map
>>> case we take an ioba and tce and insert them into the table.  It's valid
>>> to document this and use a single ioctl, but I've opted on x86 to have
>>> separate ioctls because the documentation falls out cleaner when there
>>> aren't fields that are only used in certain conditions.  Do you really
>>> want any userspace driver making use of this to know about powerpc
>>> H_PUT_TCE or would it make more sense to have a MAP and UNMAP call?  I
>>> think it would be better for the VFIO API if we had some consistency in
>>> the mapping ioctls where possible.
>>
>>
>> I would think that passing through "as is" as much as possible is the best
>> thing here as the aim is KVM. May be one day we will implement H_PUT_TCE in
>> the kernel, so splitting H_PUT_TCE to map+unmap and then combining it back
>> in the kernel (because we will have H_PUT_TCE handler) is a bit ugly.
>
> No, KVM is a use case for VFIO, we shouldn't be assume it's _the_ use
> case.  Exposing it "as is" means anyone trying to write a VFIO userspace
> driver needs to know about the implementation of H_PUT_TCE to make the
> driver work on POWER.  The fact that the same hypercall is made for a
> map or unmap is really irrelevant to the VFIO API.
>
>>>>>> +#define VFIO_IOMMU_SPAPR_TCE_PUT	_IO(VFIO_TYPE, VFIO_BASE + 13)
>>>>>> +
>>>>>
>>>>> Please document what all of the above means.  Thanks,
>>>>
>>>>
>>>> Something like this?
>>>> /*
>>>>     * The VFIO_IOMMU_SPAPR_TCE_PUT is implemented as the H_PUT_TCE hypercall.
>>>>     * ioba - I/O Bus Address for indexing into TCE table
>>>>     * tce - logical address of storage
>>>>     *
>>>>     * The non-zero flags means adding new page into the table.
>>>>     * The zero flags means releasing the existing page and clearing the
>>>>     * TCE table entry.
>>>>     */
>>>
>>> Do you only want VFIO drivers to work on POWER if they're written by
>>> POWER people?  Ideally there are a few simple concepts: a) devices have
>>> an I/O virtual address space.  On x86 we call this the iova and it's
>>> effectively a zero-based, 64bit (not really, but close enough) address
>>> space.  You seem to have two smaller windows, one in 32bit space,
>>> another in 64bit space (maybe we could name these more consistently).
>>> b) Userspace has a buffer that they want to map and unmap to an iova,
>>> potentially with some access flags.  That's all you need to know to use
>>> the x86 _type1 VFIO IOMMU API.
>>
>>
>> Do not you have to map entire RAM to PCI bus? You use listener which
>> purpose is not very clear. This is an extra knowledge beyond qemu-to-host
>> interface which the user space program should know.
>
> In the x86 case, the buffer we want to map is all of guest RAM.  Some of
> that changes dynamically, so we have a listener setup to make updates.
> The only thing magic about doing that is that the device is then able to
> DMA to any part of guest RAM and therefore the guest doesn't need to
> know the IOMMU exists.  Device assignment is therefore transparent on
> x86.

>>> Why do I need to know about H_PUT_TCE to
>>> use this interface?  Let's assume there might be some VFIO drivers some
>>> day that aren't written by POWER people.  Thanks,
>>
>> Example of such a driver? My imagination is weak :)
>
> See Tom Lyon's original user level drivers:
>
> https://github.com/pugs/vfio-user-level-drivers
>
> These are against the original version of VFIO so no longer work, but
> he's got drivers for common devices like Intel 82576 & 82599 SR-IOV
> NICs.  There are special use cases and special devices where it makes
> sense to have a driver in userspace.  Ideally a VFIO driver for a NIC
> would work with fairly minimal IOMMU abstractions between x86 and POWER,
> but if you design the SPAPR VFIO IOMMU API so that users need to
> understand how H_PUT_TCE works to port their driver to POWER, you might
> find it more difficult to leverage such drivers.  Thanks,

A user space driver needs to know the DMA window if it wants to work on 
POWER because this is how the PHB is configured. So it needs to know something 
about POWER. It could be fixed easily - we stop using IOMMU types and make 
map/unmap/info independent from the IOMMU type. You'll have to implement 
DMA window properties (which is the entire RAM) and then we'll put in the spec 
that VFIO users have to call DMA map/unmap only for addresses within the 
returned DMA window boundaries. I thought this is what we wanted to avoid...
Or implement a third type of IOMMU - "VFIO user space driver" (and implement 
dma_alloc() which would allocate an address for DMA on the PCI bus) and name 
the first two as "TYPE1 KVM" and "SPAPR TCE KVM".


-- 
Alexey

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] vfio: enabled and supported on power (v7)
  2012-10-11  8:19             ` Alexey Kardashevskiy
@ 2012-10-11 18:09               ` Alex Williamson
  0 siblings, 0 replies; 25+ messages in thread
From: Alex Williamson @ 2012-10-11 18:09 UTC (permalink / raw)
  To: Alexey Kardashevskiy; +Cc: Paul Mackerras, linuxppc-dev, David Gibson

On Thu, 2012-10-11 at 19:19 +1100, Alexey Kardashevskiy wrote:
> Ok I'm back, nothing seems to have happened during the last month :)

Nope, not much ;)  Note that I added a hack to avoid the INTx EOI
problem, I expect it should work for you too.

> On 14/09/12 14:35, Alex Williamson wrote:
> > On Fri, 2012-09-14 at 10:51 +1000, Alexey Kardashevskiy wrote:
> >> On 14/09/12 08:34, Alex Williamson wrote:
> >>> On Tue, 2012-09-11 at 18:28 +1000, Alexey Kardashevskiy wrote:
> >>>> On 11/09/12 02:02, Alex Williamson wrote:
> >>>>> On Tue, 2012-09-04 at 17:33 +1000, Alexey Kardashevskiy wrote:
> >>>>>> Cc: David Gibson <david@gibson.dropbear.id.au>
> >>>>>> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
> >>>>>> Cc: Paul Mackerras <paulus@samba.org>
> >>>>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> >>>>>> ---
> >>>>>
> >>>>> Please at least cc kvm@vger as well since we list that as the devel list
> >>>>> for vfio.
> >>>>>
> >>>>>>     arch/powerpc/include/asm/iommu.h    |    3 +
> >>>>>
> >>>>> I'll need an ack from Ben or Paul for this change.
> >>>>>
> >>>>>>     drivers/iommu/Kconfig               |    8 +
> >>>>>>     drivers/vfio/Kconfig                |    6 +
> >>>>>>     drivers/vfio/Makefile               |    1 +
> >>>>>>     drivers/vfio/vfio_iommu_spapr_tce.c |  440 +++++++++++++++++++++++++++++++++++
> >>>>>>     include/linux/vfio.h                |   29 +++
> >>>>>>     6 files changed, 487 insertions(+)
> >>>>>>     create mode 100644 drivers/vfio/vfio_iommu_spapr_tce.c
> >>>>>>
> >>>>>> diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
> >>>>>> index 957a83f..c64bce7 100644
> >>>>>> --- a/arch/powerpc/include/asm/iommu.h
> >>>>>> +++ b/arch/powerpc/include/asm/iommu.h
> >>>>>> @@ -66,6 +66,9 @@ struct iommu_table {
> >>>>>>     	unsigned long  it_halfpoint; /* Breaking point for small/large allocs */
> >>>>>>     	spinlock_t     it_lock;      /* Protects it_map */
> >>>>>>     	unsigned long *it_map;       /* A simple allocation bitmap for now */
> >>>>>> +#ifdef CONFIG_IOMMU_API
> >>>>>> +	struct iommu_group *it_group;
> >>>>>> +#endif
> >>>>>>     };
> >>>>>
> >>>>> This seems to only be valid when vfio_iommu_spapr_tce is loaded, which
> >>>>> is a bit misleading.
> >>>>>
> >>>>>>
> >>>>>>     struct scatterlist;
> >>>>>> diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
> >>>>>> index 3bd9fff..19cf2d9 100644
> >>>>>> --- a/drivers/iommu/Kconfig
> >>>>>> +++ b/drivers/iommu/Kconfig
> >>>>>> @@ -162,4 +162,12 @@ config TEGRA_IOMMU_SMMU
> >>>>>>     	  space through the SMMU (System Memory Management Unit)
> >>>>>>     	  hardware included on Tegra SoCs.
> >>>>>>
> >>>>>> +config SPAPR_TCE_IOMMU
> >>>>>> +	bool "sPAPR TCE IOMMU Support"
> >>>>>> +	depends on PPC_PSERIES
> >>>>>> +	select IOMMU_API
> >>>>>> +	help
> >>>>>> +	  Enables bits of IOMMU API required by VFIO. The iommu_ops is
> >>>>>> +	  still not implemented.
> >>>>>> +
> >>>>>>     endif # IOMMU_SUPPORT
> >>>>>> diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
> >>>>>> index 7cd5dec..b464687 100644
> >>>>>> --- a/drivers/vfio/Kconfig
> >>>>>> +++ b/drivers/vfio/Kconfig
> >>>>>> @@ -3,10 +3,16 @@ config VFIO_IOMMU_TYPE1
> >>>>>>     	depends on VFIO
> >>>>>>     	default n
> >>>>>>
> >>>>>> +config VFIO_IOMMU_SPAPR_TCE
> >>>>>> +	tristate
> >>>>>> +	depends on VFIO && SPAPR_TCE_IOMMU
> >>>>>> +	default n
> >>>>>> +
> >>>>>>     menuconfig VFIO
> >>>>>>     	tristate "VFIO Non-Privileged userspace driver framework"
> >>>>>>     	depends on IOMMU_API
> >>>>>>     	select VFIO_IOMMU_TYPE1 if X86
> >>>>>> +	select VFIO_IOMMU_SPAPR_TCE if PPC_POWERNV
> >>>>>>     	help
> >>>>>>     	  VFIO provides a framework for secure userspace device drivers.
> >>>>>>     	  See Documentation/vfio.txt for more details.
> >>>>>> diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
> >>>>>> index 2398d4a..72bfabc 100644
> >>>>>> --- a/drivers/vfio/Makefile
> >>>>>> +++ b/drivers/vfio/Makefile
> >>>>>> @@ -1,3 +1,4 @@
> >>>>>>     obj-$(CONFIG_VFIO) += vfio.o
> >>>>>>     obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o
> >>>>>> +obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o
> >>>>>>     obj-$(CONFIG_VFIO_PCI) += pci/
> >>>>>> diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
> >>>>>> new file mode 100644
> >>>>>> index 0000000..21f1909
> >>>>>> --- /dev/null
> >>>>>> +++ b/drivers/vfio/vfio_iommu_spapr_tce.c
> >>>>>> @@ -0,0 +1,440 @@
> >>>>>> +/*
> >>>>>> + * VFIO: IOMMU DMA mapping support for TCE on POWER
> >>>>>> + *
> >>>>>> + * Copyright (C) 2012 IBM Corp.  All rights reserved.
> >>>>>> + *     Author: Alexey Kardashevskiy <aik@ozlabs.ru>
> >>>>>> + *
> >>>>>> + * This program is free software; you can redistribute it and/or modify
> >>>>>> + * it under the terms of the GNU General Public License version 2 as
> >>>>>> + * published by the Free Software Foundation.
> >>>>>> + *
> >>>>>> + * Derived from original vfio_iommu_x86.c:
> >>>>>
> >>>>> Should this be _type1?  Only the mail archives are going to remember
> >>>>> there was a _x86, so the renamed version is probably a better reference.
> >>>>>
> >>>>>> + * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
> >>>>>> + *     Author: Alex Williamson <alex.williamson@redhat.com>
> >>>>>> + */
> >>>>>> +
> >>>>>> +#include <linux/module.h>
> >>>>>> +#include <linux/pci.h>
> >>>>>> +#include <linux/slab.h>
> >>>>>> +#include <linux/uaccess.h>
> >>>>>> +#include <linux/err.h>
> >>>>>> +#include <linux/vfio.h>
> >>>>>> +#include <linux/spinlock.h>
> >>>>>> +#include <asm/iommu.h>
> >>>>>> +
> >>>>>> +#define DRIVER_VERSION  "0.1"
> >>>>>> +#define DRIVER_AUTHOR   "aik@ozlabs.ru"
> >>>>>> +#define DRIVER_DESC     "VFIO IOMMU SPAPR TCE"
> >>>>>> +
> >>>>>> +
> >>>>>> +/*
> >>>>>> + * SPAPR TCE API
> >>>>>> + */
> >>>>>> +static void tce_free(struct iommu_table *tbl, unsigned long entry,
> >>>>>> +		unsigned long tce)
> >>>>>> +{
> >>>>>> +	struct page *page = pfn_to_page(tce >> PAGE_SHIFT);
> >>>>>> +
> >>>>>> +	WARN_ON(!page);
> >>>>>> +	if (page) {
> >>>>>> +		if (tce & VFIO_SPAPR_TCE_WRITE)
> >>>>>> +			SetPageDirty(page);
> >>>>>> +		put_page(page);
> >>>>>> +	}
> >>>>>> +	ppc_md.tce_free(tbl, entry, 1);
> >>>>>> +}
> >>>>>> +
> >>>>>> +static long tce_put(struct iommu_table *tbl,
> >>>>>> +		unsigned long entry, uint64_t tce, uint32_t flags)
> >>>>>> +{
> >>>>>> +	int ret;
> >>>>>> +	unsigned long oldtce, kva, offset;
> >>>>>> +	struct page *page = NULL;
> >>>>>> +	enum dma_data_direction direction = DMA_NONE;
> >>>>>> +
> >>>>>> +	switch (flags & VFIO_SPAPR_TCE_PUT_MASK) {
> >>>>>> +	case VFIO_SPAPR_TCE_READ:
> >>>>>> +		direction = DMA_TO_DEVICE;
> >>>>>> +		break;
> >>>>>> +	case VFIO_SPAPR_TCE_WRITE:
> >>>>>> +		direction = DMA_FROM_DEVICE;
> >>>>>> +		break;
> >>>>>> +	case VFIO_SPAPR_TCE_BIDIRECTIONAL:
> >>>>>> +		direction = DMA_BIDIRECTIONAL;
> >>>>>> +		break;
> >>>>>> +	}
> >>>>>> +
> >>>>>> +	oldtce = ppc_md.tce_get(tbl, entry);
> >>>>>> +
> >>>>>> +	/* Free page if still allocated */
> >>>>>> +	if (oldtce & VFIO_SPAPR_TCE_PUT_MASK)
> >>>>>> +		tce_free(tbl, entry, oldtce);
> >>>>>> +
> >>>>>> +	/* Map new TCE */
> >>>>>> +	if (direction != DMA_NONE) {
> >>>>>> +		offset = (tce & IOMMU_PAGE_MASK) - (tce & PAGE_MASK);
> >>>>>> +		ret = get_user_pages_fast(tce & PAGE_MASK, 1,
> >>>>>> +				direction != DMA_TO_DEVICE, &page);
> >>>>>> +		BUG_ON(ret > 1);
> >>>>>
> >>>>> Can this happen?
> >>>>>
> >>>>>> +		if (ret < 1) {
> >>>>>> +			printk(KERN_ERR "tce_vfio: get_user_pages_fast failed "
> >>>>>> +					"tce=%llx ioba=%lx ret=%d\n",
> >>>>>> +					tce, entry << IOMMU_PAGE_SHIFT, ret);
> >>>>>> +			if (!ret)
> >>>>>> +				ret = -EFAULT;
> >>>>>> +			goto unlock_exit;
> >>>>>> +		}
> >>>>>> +
> >>>>>> +		kva = (unsigned long) page_address(page);
> >>>>>> +		kva += offset;
> >>>>>> +		BUG_ON(!kva);
> >>>>>
> >>>>> Same here, can it happen?  If so, should it BUG or catch the below
> >>>>> EINVAL?
> >>>>>
> >>>>>> +		if (WARN_ON(kva & ~IOMMU_PAGE_MASK))
> >>>>>> +			return -EINVAL;
> >>>>>
> >>>>> Page leak?  Don't we want to do a put_page(), which means we probably
> >>>>> want a goto exit here.
> >>>>>
> >>>>>> +
> >>>>>> +		/* Preserve access bits */
> >>>>>> +		kva |= flags & VFIO_SPAPR_TCE_PUT_MASK;
> >>>>>> +
> >>>>>> +		/* tce_build receives a virtual address */
> >>>>>> +		entry += tbl->it_offset;	/* Offset into real TCE table */
> >>>>>> +		ret = ppc_md.tce_build(tbl, entry, 1, kva, direction, NULL);
> >>>>>> +
> >>>>>> +		/* tce_build() only returns non-zero for transient errors */
> >>>>>> +		if (unlikely(ret)) {
> >>>>>> +			printk(KERN_ERR "tce_vfio: Failed to add TCE\n");
> >>>>>> +			ret = -EIO;
> >>>>>> +			goto unlock_exit;
> >>>>>> +		}
> >>>>>> +	}
> >>>>>> +	/* Flush/invalidate TLB caches if necessary */
> >>>>>> +	if (ppc_md.tce_flush)
> >>>>>> +		ppc_md.tce_flush(tbl);
> >>>>>> +
> >>>>>> +	/* Make sure updates are seen by hardware */
> >>>>>> +	mb();
> >>>>>> +
> >>>>>> +unlock_exit:
> >>>>>
> >>>>> unlock seems wrong here, I had to go re-read the code looking for the
> >>>>> lock.
> >>>>>
> >>>>>> +	if (ret && page)
> >>>>>> +		put_page(page);
> >>>>>> +
> >>>>>> +	if (ret)
> >>>>>> +		printk(KERN_ERR "tce_vfio: tce_put failed on tce=%llx "
> >>>>>> +				"ioba=%lx kva=%lx\n", tce,
> >>>>>> +				entry << IOMMU_PAGE_SHIFT, kva);
> >>>>>> +	return ret;
> >>>>>> +}
> >>>>>> +
> >>>>>> +/*
> >>>>>> + * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
> >>>>>> + */
> >>>>>> +
> >>>>>> +/*
> >>>>>> + * The container descriptor supports only a single group per container.
> >>>>>> + * Required by the API as the container is not supplied with the IOMMU group
> >>>>>> + * at the moment of initialization.
> >>>>>> + */
> >>>>>> +struct tce_container {
> >>>>>> +	struct iommu_table *tbl;
> >>>>>> +};
> >>>>>> +
> >>>>>> +static void *tce_iommu_open(unsigned long arg)
> >>>>>> +{
> >>>>>> +	struct tce_container *container;
> >>>>>> +
> >>>>>> +	if (arg != VFIO_SPAPR_TCE_IOMMU) {
> >>>>>> +		printk(KERN_ERR "tce_vfio: Wrong IOMMU type\n");
> >>>>>> +		return ERR_PTR(-EINVAL);
> >>>>>> +	}
> >>>>>> +
> >>>>>> +	container = kzalloc(sizeof(*container), GFP_KERNEL);
> >>>>>> +	if (!container)
> >>>>>> +		return ERR_PTR(-ENOMEM);
> >>>>>> +
> >>>>>> +	return container;
> >>>>>> +}
> >>>>>> +
> >>>>>> +static void tce_iommu_release(void *iommu_data)
> >>>>>> +{
> >>>>>> +	struct tce_container *container = iommu_data;
> >>>>>> +	struct iommu_table *tbl = container->tbl;
> >>>>>> +	unsigned long i, tce;
> >>>>>> +
> >>>>>
> >>>>> This will segfault if releasing a container that never had an a device
> >>>>> attached.
> >>>>>
> >>>>>> +	/* Unmap leftovers */
> >>>>>> +	spin_lock_irq(&tbl->it_lock);
> >>>>>> +	for (i = tbl->it_offset; i < tbl->it_offset + tbl->it_size; ++i) {
> >>>>>> +		tce = ppc_md.tce_get(tbl, i);
> >>>>>> +		if (tce & VFIO_SPAPR_TCE_PUT_MASK)
> >>>>>> +			tce_free(tbl, i, tce);
> >>>>>> +	}
> >>>>>> +	/* Flush/invalidate TLB caches if necessary */
> >>>>>> +	if (ppc_md.tce_flush)
> >>>>>> +		ppc_md.tce_flush(tbl);
> >>>>>> +
> >>>>>> +	/* Make sure updates are seen by hardware */
> >>>>>> +	mb();
> >>>>>> +
> >>>>>> +	spin_unlock_irq(&tbl->it_lock);
> >>>>>> +
> >>>>>> +	kfree(container);
> >>>>>> +}
> >>>>>> +
> >>>>>> +static long tce_iommu_ioctl(void *iommu_data,
> >>>>>> +				 unsigned int cmd, unsigned long arg)
> >>>>>> +{
> >>>>>> +	struct tce_container *container = iommu_data;
> >>>>>> +	unsigned long minsz;
> >>>>>> +	long ret;
> >>>>>> +
> >>>>>> +	switch (cmd) {
> >>>>>> +	case VFIO_CHECK_EXTENSION: {
> >>>>>> +		return (arg == VFIO_SPAPR_TCE_IOMMU) ? 1 : 0;
> >>>>>> +	}
> >>>>>> +	case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
> >>>>>> +		struct vfio_iommu_spapr_tce_info info;
> >>>>>> +		struct iommu_table *tbl = container->tbl;
> >>>>>> +
> >>>>>> +		minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
> >>>>>> +				dma64_window_size);
> >>>>>> +
> >>>>>> +		if (copy_from_user(&info, (void __user *)arg, minsz))
> >>>>>> +			return -EFAULT;
> >>>>>> +
> >>>>>> +		if (info.argsz < minsz)
> >>>>>> +			return -EINVAL;
> >>>>>> +
> >>>>>> +		if (!tbl)
> >>>>>> +			return -ENXIO;
> >>>>>
> >>>>> nit: why not check this earlier?
> >>>>>
> >>>>>> +
> >>>>>> +		info.dma32_window_start = tbl->it_offset << IOMMU_PAGE_SHIFT;
> >>>>>> +		info.dma32_window_size = tbl->it_size << IOMMU_PAGE_SHIFT;
> >>>>>> +		info.dma64_window_start = 0;
> >>>>>> +		info.dma64_window_size = 0;
> >>>>>> +		info.flags = 0;
> >>>>>> +
> >>>>>> +		return copy_to_user((void __user *)arg, &info, minsz);
> >>>>>> +	}
> >>>>>> +	case VFIO_IOMMU_SPAPR_TCE_PUT: {
> >>>>>> +		struct vfio_iommu_spapr_tce_put par;
> >>>>>> +		struct iommu_table *tbl = container->tbl;
> >>>>>> +
> >>>>>> +		minsz = offsetofend(struct vfio_iommu_spapr_tce_put, tce);
> >>>>>> +
> >>>>>> +		if (copy_from_user(&par, (void __user *)arg, minsz))
> >>>>>> +			return -EFAULT;
> >>>>>> +
> >>>>>> +		if (par.argsz < minsz)
> >>>>>> +			return -EINVAL;
> >>>>>> +
> >>>>>> +		if (!tbl) {
> >>>>>> +			return -ENXIO;
> >>>>>> +		}
> >>>>>
> >>>>> Same, plus drop the braces.
> >>>>>
> >>>>>> +
> >>>>>> +		spin_lock_irq(&tbl->it_lock);
> >>>>>> +		ret = tce_put(tbl, par.ioba >> IOMMU_PAGE_SHIFT,
> >>>>>> +				par.tce, par.flags);
> >>>>>> +		spin_unlock_irq(&tbl->it_lock);
> >>>>>> +
> >>>>>> +		return ret;
> >>>>>> +	}
> >>>>>
> >>>>> Is "PUT" really the name we want for this?
> >>>>
> >>>>
> >>>> Yes, it is a single H_PUT_TCE hypercall from POWER architecture spec.
> >>>
> >>> Ok, if it makes sense on your arch, I won't complain (too much) about
> >>> it.
> >>>
> >>>>>> +	default:
> >>>>>> +		printk(KERN_WARNING "tce_vfio: unexpected cmd %x\n", cmd);
> >>>>>> +	}
> >>>>>> +
> >>>>>> +	return -ENOTTY;
> >>>>>> +}
> >>>>>> +
> >>>>>> +static int tce_iommu_attach_group(void *iommu_data,
> >>>>>> +		struct iommu_group *iommu_group)
> >>>>>> +{
> >>>>>> +	struct tce_container *container = iommu_data;
> >>>>>> +	struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
> >>>>>> +
> >>>>>> +	printk(KERN_DEBUG "tce_vfio: Attaching group #%u to iommu %p\n",
> >>>>>> +			iommu_group_id(iommu_group), iommu_group);
> >>>>>
> >>>>> Let's use pr_debug() and friends throughout.
> >>>>>
> >>>>>> +	if (container->tbl) {
> >>>>>> +		printk(KERN_WARNING "tce_vfio: Only one group per IOMMU "
> >>>>>> +				"container is allowed, "
> >>>>>> +				"existing id=%d, attaching id=%d\n",
> >>>>>> +				iommu_group_id(container->tbl->it_group),
> >>>>>> +				iommu_group_id(iommu_group));
> >>>>>> +		return -EBUSY;
> >>>>>> +	}
> >>>>>> +
> >>>>>
> >>>>> _type1 has a lock to avoid races here, I think you might need one too.
> >>>>>
> >>>>>> +	container->tbl = tbl;
> >>>>>> +
> >>>>>> +	return 0;
> >>>>>> +}
> >>>>>> +
> >>>>>> +static void tce_iommu_detach_group(void *iommu_data,
> >>>>>> +		struct iommu_group *iommu_group)
> >>>>>> +{
> >>>>>> +	struct tce_container *container = iommu_data;
> >>>>>> +	struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
> >>>>>> +
> >>>>>> +	BUG_ON(!tbl);
> >>>>>
> >>>>> Needed?  If so, why is there no check on attach?
> >>>>
> >>>> Added to attach() :)
> >>>>
> >>>>
> >>>>>
> >>>>>> +	if (tbl != container->tbl) {
> >>>>>> +		printk(KERN_WARNING "tce_vfio: detaching group #%u, expected "
> >>>>>> +				"group is #%u\n", iommu_group_id(iommu_group),
> >>>>>> +				iommu_group_id(tbl->it_group));
> >>>>>> +		return;
> >>>>>> +	}
> >>>>>> +	printk(KERN_DEBUG "tce_vfio: detaching group #%u from iommu %p\n",
> >>>>>> +			iommu_group_id(iommu_group), iommu_group);
> >>>>>
> >>>>> container->tbl = NULL?
> >>>>
> >>>>
> >>>> Then I won't be able to release pages in tce_iommu_release().
> >>>> Releasing pages in tce_iommu_detach_group() caused some other problems,
> >>>> cannot recall now which ones.
> >>>
> >>> What happens if you hot unplug a group from one VM and add it to
> >>> another?  ie. we've detached it from one container and add it to another
> >>> in a different instance.  Does it cause problems here?
> >>
> >>
> >> Then the container will be released as just one group per container is
> >> supported at the moment, no? Cannot check though as we do not support
> >> hotplug yet.
> >
> > But you still have a race where the group is detached, but the container
> > is not yet released and can be attached to another container in a
> > different instance.
> 
> 
> Yeah... Moved cleanup to detach(), trying to reproduce the issue I had 
> before but to no avail.
> 
> 
> >>>>>> +}
> >>>>>> +
> >>>>>> +const struct vfio_iommu_driver_ops tce_iommu_driver_ops = {
> >>>>>> +	.name		= "iommu-vfio-powerpc",
> >>>>>> +	.owner		= THIS_MODULE,
> >>>>>> +	.open		= tce_iommu_open,
> >>>>>> +	.release	= tce_iommu_release,
> >>>>>> +	.ioctl		= tce_iommu_ioctl,
> >>>>>> +	.attach_group	= tce_iommu_attach_group,
> >>>>>> +	.detach_group	= tce_iommu_detach_group,
> >>>>>> +};
> >>>>>> +
> >>>>>> +/*
> >>>>>> + * Add/delete devices support (hotplug, module_init, module_exit)
> >>>>>> + */
> >>>>>> +static int add_device(struct device *dev)
> >>>>>> +{
> >>>>>> +	struct iommu_table *tbl;
> >>>>>> +	int ret = 0;
> >>>>>> +
> >>>>>> +	if (dev->iommu_group) {
> >>>>>> +		printk(KERN_WARNING "tce_vfio: device %s is already in iommu "
> >>>>>> +				"group %d, skipping\n", dev->kobj.name,
> >>>>>
> >>>>> Watch line wrapping on strings.
> >>>>
> >>>> Pardon?
> >>>
> >>> Just suggesting that you try to wrap lines so that strings are
> >>> searchable.  For instance, can I search cscope for "is already in iommu
> >>> group".  It's generally accepted that printks can break 80 cols for
> >>> this.
> >>
> >> Aaaa. Did not know that this is accepted but was always annoyed to wrap
> >> this way, thanks :)
> >>
> >>
> >>>>>> +				iommu_group_id(dev->iommu_group));
> >>>>>> +		return -EBUSY;
> >>>>>> +	}
> >>>>>> +
> >>>>>> +	tbl = get_iommu_table_base(dev);
> >>>>>> +	if (!tbl) {
> >>>>>> +		printk(KERN_DEBUG "tce_vfio: skipping device %s with no tbl\n",
> >>>>>> +				dev->kobj.name);
> >>>>>> +		return 0;
> >>>>>> +	}
> >>>>>> +
> >>>>>> +	printk(KERN_DEBUG "tce_vfio: adding %s to iommu group %d\n",
> >>>>>> +			dev->kobj.name, iommu_group_id(tbl->it_group));
> >>>>>> +
> >>>>>> +	ret = iommu_group_add_device(tbl->it_group, dev);
> >>>>>> +	if (ret < 0)
> >>>>>> +		printk(KERN_ERR "tce_vfio: %s has not been added, ret=%d\n",
> >>>>>> +				dev->kobj.name, ret);
> >>>>>> +
> >>>>>> +	return ret;
> >>>>>> +}
> >>>>>> +
> >>>>>> +static void del_device(struct device *dev)
> >>>>>> +{
> >>>>>> +	iommu_group_remove_device(dev);
> >>>>>> +}
> >>>>>> +
> >>>>>> +static int iommu_bus_notifier(struct notifier_block *nb,
> >>>>>> +			      unsigned long action, void *data)
> >>>>>> +{
> >>>>>> +	struct device *dev = data;
> >>>>>> +
> >>>>>> +	switch (action) {
> >>>>>> +	case BUS_NOTIFY_ADD_DEVICE:
> >>>>>> +		return add_device(dev);
> >>>>>> +	case BUS_NOTIFY_DEL_DEVICE:
> >>>>>> +		del_device(dev);
> >>>>>> +		return 0;
> >>>>>> +	default:
> >>>>>> +		return 0;
> >>>>>> +	}
> >>>>>> +}
> >>>>>> +
> >>>>>> +static struct notifier_block tce_iommu_bus_nb = {
> >>>>>> +	.notifier_call = iommu_bus_notifier,
> >>>>>> +};
> >>>>>> +
> >>>>>> +void group_release(void *iommu_data)
> >>>>>> +{
> >>>>>> +	struct iommu_table *tbl = iommu_data;
> >>>>>> +	tbl->it_group = NULL;
> >>>>>> +}
> >>>>>> +
> >>>>>> +static int __init tce_iommu_init(void)
> >>>>>> +{
> >>>>>> +	struct pci_dev *pdev = NULL;
> >>>>>> +	struct iommu_table *tbl;
> >>>>>> +	struct iommu_group *grp;
> >>>>>> +
> >>>>>> +	/* If the current platform does not support tce_get
> >>>>>> +	   we are unable to clean TCE table properly and
> >>>>>> +	   therefore it is better not to touch it at all */
> >>>>>> +	if (!ppc_md.tce_get) {
> >>>>>> +		printk(KERN_ERR "tce_vfio: ppc_md.tce_get isn't implemented\n");
> >>>>>> +		return -EOPNOTSUPP;
> >>>>>> +	}
> >>>>>> +
> >>>>>> +	bus_register_notifier(&pci_bus_type, &tce_iommu_bus_nb);
> >>>>>> +
> >>>>>> +	/* Allocate and initialize VFIO groups */
> >>>>>
> >>>>> s/VFIO groups/IOMMU groups/
> >>>>>
> >>>>>> +	for_each_pci_dev(pdev) {
> >>>>>> +		tbl = get_iommu_table_base(&pdev->dev);
> >>>>>> +		if (!tbl)
> >>>>>> +			continue;
> >>>>>> +
> >>>>>> +		/* Skip already initialized */
> >>>>>> +		if (tbl->it_group)
> >>>>>> +			continue;
> >>>>>> +
> >>>>>> +		grp = iommu_group_alloc();
> >>>>>> +		if (IS_ERR(grp)) {
> >>>>>> +			printk(KERN_INFO "tce_vfio: cannot create "
> >>>>>> +					"new IOMMU group, ret=%ld\n",
> >>>>>> +					PTR_ERR(grp));
> >>>>>> +			return -EFAULT;
> >>>>>> +		}
> >>>>>> +		tbl->it_group = grp;
> >>>>>> +		iommu_group_set_iommudata(grp, tbl, group_release);
> >>>>>> +	}
> >>>>>> +
> >>>>>> +	/* Add PCI devices to VFIO groups */
> >>>>>> +	for_each_pci_dev(pdev)
> >>>>>> +		add_device(&pdev->dev);
> >>>>>> +
> >>>>>> +	return vfio_register_iommu_driver(&tce_iommu_driver_ops);
> >>>>>> +}
> >>>>>> +
> >>>>>> +static void __exit tce_iommu_cleanup(void)
> >>>>>> +{
> >>>>>> +	struct pci_dev *pdev = NULL;
> >>>>>> +	struct iommu_table *tbl;
> >>>>>> +	struct iommu_group *grp = NULL;
> >>>>>> +
> >>>>>> +	bus_unregister_notifier(&pci_bus_type, &tce_iommu_bus_nb);
> >>>>>> +
> >>>>>> +	/* Delete PCI devices from VFIO groups */
> >>>>>> +	for_each_pci_dev(pdev)
> >>>>>> +		del_device(&pdev->dev);
> >>>>>> +
> >>>>>> +	/* Release VFIO groups */
> >>>>>> +	for_each_pci_dev(pdev) {
> >>>>>> +		tbl = get_iommu_table_base(&pdev->dev);
> >>>>>> +		if (!tbl)
> >>>>>> +			continue;
> >>>>>> +		grp = tbl->it_group;
> >>>>>> +
> >>>>>> +		/* Skip (already) uninitialized */
> >>>>>> +		if (!grp)
> >>>>>> +			continue;
> >>>>>> +
> >>>>>> +		/* Do actual release, group_release() is expected to work */
> >>>>>> +		iommu_group_put(grp);
> >>>>>> +		BUG_ON(tbl->it_group);
> >>>>>> +	}
> >>>>>> +
> >>>>>
> >>>>>
> >>>>> It troubles me a bit that you're using the vfio driver to initialize and
> >>>>> tear down IOMMU groups on your platform.
> >>>>
> >>>>
> >>>> I am not following you here. Could you please explain a bit?
> >>>
> >>> IOMMU groups are theoretically not just for VFIO.  They expose DMA
> >>> dependencies between devices for anyone who cares to know about it.
> >>> VFIO happens to care very much about that, but is hopefully not the only
> >>> consumer.  So it's a little bit like writing a driver for a device on a
> >>> new bus and incorporating the bus topology handling code into the device
> >>> driver.  IOMMU groups should be created and managed independent of VFIO.
> >>
> >> Do you mean that we create groups only for PCI devices? Well, moving groups
> >> creation where the actual powerpc groups are allocated (pci scan) is
> >> problematic right now as iommu_init() is called too late.
> >
> > I mean IOMMU group creation should be independent of VFIO.  I'm not sure
> > how to make that ordering work on POWER, but integrating them into your
> > VFIO driver is contrary to many of the arguments that were made for
> > making IOMMU groups part of the base device model.
> 
> 
> I still do not get it. The creation code itself does not depend on VFIO.
> And yes, I would like to create groups from the platform code when the 
> actual IOMMU tables are created, the only problem is that iommu_init() is 
> called too late - after PCI scan (subsys_initcall(pcibios_init) from 
> arch/powerpc/kernel/pci_64.c), iommu_init is subsys_initcall as well.
> I could move tce_iommu_init/tce_iommu_cleanup as a module somewhere in 
> arch/powerpc but moving iommu_init() earlier looks better, then I would 
> create IOMMU groups exactly when their POWER counterparts are created.

That sounds like a better integration of IOMMU groups into POWER than
only enabling them for VFIO.

> >>>>> VFIO makes use of IOMMU groups
> >>>>> and is the only user so far, but they're hopefully useful beyond this.
> >>>>> In fact, VFIO used to manage assembling all groups from data provided by
> >>>>> the IOMMU but David wanted to see IOMMU groups be a more universally
> >>>>> available feature, so it's odd to see POWER implementing it this way.
> >>>>
> >>>>
> >>>> David, help! :)
> >>>>
> >>>>
> >>>>>> +	vfio_unregister_iommu_driver(&tce_iommu_driver_ops);
> >>>>>> +}
> >>>>>> +
> >>>>>> +module_init(tce_iommu_init);
> >>>>>> +module_exit(tce_iommu_cleanup);
> >>>>>> +
> >>>>>> +MODULE_VERSION(DRIVER_VERSION);
> >>>>>> +MODULE_LICENSE("GPL v2");
> >>>>>> +MODULE_AUTHOR(DRIVER_AUTHOR);
> >>>>>> +MODULE_DESCRIPTION(DRIVER_DESC);
> >>>>>> +
> >>>>>> diff --git a/include/linux/vfio.h b/include/linux/vfio.h
> >>>>>> index 0a4f180..2c0a927 100644
> >>>>>> --- a/include/linux/vfio.h
> >>>>>> +++ b/include/linux/vfio.h
> >>>>>> @@ -99,6 +99,7 @@ extern void vfio_unregister_iommu_driver(
> >>>>>>     /* Extensions */
> >>>>>>
> >>>>>>     #define VFIO_TYPE1_IOMMU		1
> >>>>>> +#define VFIO_SPAPR_TCE_IOMMU		2
> >>>>>>
> >>>>>>     /*
> >>>>>>      * The IOCTL interface is designed for extensibility by embedding the
> >>>>>> @@ -442,4 +443,32 @@ struct vfio_iommu_type1_dma_unmap {
> >>>>>>
> >>>>>>     #define VFIO_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14)
> >>>>>>
> >>>>>> +/* -------- API for SPAPR TCE (Server POWERPC) IOMMU -------- */
> >>>>>> +
> >>>>>> +struct vfio_iommu_spapr_tce_info {
> >>>>>> +	__u32 argsz;
> >>>>>> +	__u32 flags;
> >>>>>> +	__u32 dma32_window_start;
> >>>>>> +	__u32 dma32_window_size;
> >>>>>> +	__u64 dma64_window_start;
> >>>>>> +	__u64 dma64_window_size;
> >>>>>> +};
> >>>>>> +
> >>>>>> +#define VFIO_IOMMU_SPAPR_TCE_GET_INFO	_IO(VFIO_TYPE, VFIO_BASE + 12)
> >>>>>> +
> >>>>>> +struct vfio_iommu_spapr_tce_put {
> >>>>>> +	__u32 argsz;
> >>>>>> +	__u32 flags;
> >>>>>> +#define VFIO_SPAPR_TCE_READ		1
> >>>>>> +#define VFIO_SPAPR_TCE_WRITE		2
> >>>>>> +#define VFIO_SPAPR_TCE_BIDIRECTIONAL	(VFIO_SPAPR_TCE_READ|VFIO_SPAPR_TCE_WRITE)
> >>>>>> +#define VFIO_SPAPR_TCE_PUT_MASK		VFIO_SPAPR_TCE_BIDIRECTIONAL
> >>>>>> +	__u64 ioba;
> >>>>>> +	__u64 tce;
> >>>>>> +};
> >>>>>
> >>>>> Ok, so if READ & WRITE are both clear and ioba is set, that's an
> >>>>> "unmap"?  This is exactly why _type1 has a MAP and UNMAP, to make it
> >>>>> clear which fields are necessary for which call.  I think we should
> >>>>> probably do the same here.  Besides, _put makes me think there should be
> >>>>> a _get; do these have some unique meaning in POWER?
> >>>>
> >>>>
> >>>> It is a single H_PUT_TCE for putting a record into TCE table. The guest
> >>>> calls H_PUT_TCE, QEMU replaces the address and simply forwards the call to
> >>>> the host. Calling them map/unmap makes it less clear for powerpc people :)
> >>>
> >>> In the unmap case we take an ioba and lookup a tce to clear, in the map
> >>> case we take an ioba and tce and insert them into the table.  It's valid
> >>> to document this and use a single ioctl, but I've opted on x86 to have
> >>> separate ioctls because the documentation falls out cleaner when there
> >>> aren't fields that are only used in certain conditions.  Do you really
> >>> want any userspace driver making use of this to know about powerpc
> >>> H_PUT_TCE or would it make more sense to have a MAP and UNMAP call?  I
> >>> think it would be better for the VFIO API if we had some consistency in
> >>> the mapping ioctls where possible.
> >>
> >>
> >> I would think that passing through "as is" as much as possible is the best
> >> thing here as the aim is KVM. May be one day we will implement H_PUT_TCE in
> >> the kernel, so splitting H_PUT_TCE to map+unmap and then combining it back
> >> in the kernel (because we will have H_PUT_TCE handler) is a bit ugly.
> >
> > No, KVM is a use case for VFIO, we shouldn't be assume it's _the_ use
> > case.  Exposing it "as is" means anyone trying to write a VFIO userspace
> > driver needs to know about the implementation of H_PUT_TCE to make the
> > driver work on POWER.  The fact that the same hypercall is made for a
> > map or unmap is really irrelevant to the VFIO API.
> >
> >>>>>> +#define VFIO_IOMMU_SPAPR_TCE_PUT	_IO(VFIO_TYPE, VFIO_BASE + 13)
> >>>>>> +
> >>>>>
> >>>>> Please document what all of the above means.  Thanks,
> >>>>
> >>>>
> >>>> Something like this?
> >>>> /*
> >>>>     * The VFIO_IOMMU_SPAPR_TCE_PUT is implemented as the H_PUT_TCE hypercall.
> >>>>     * ioba - I/O Bus Address for indexing into TCE table
> >>>>     * tce - logical address of storage
> >>>>     *
> >>>>     * The non-zero flags means adding new page into the table.
> >>>>     * The zero flags means releasing the existing page and clearing the
> >>>>     * TCE table entry.
> >>>>     */
> >>>
> >>> Do you only want VFIO drivers to work on POWER if they're written by
> >>> POWER people?  Ideally there are a few simple concepts: a) devices have
> >>> an I/O virtual address space.  On x86 we call this the iova and it's
> >>> effectively a zero-based, 64bit (not really, but close enough) address
> >>> space.  You seem to have two smaller windows, one in 32bit space,
> >>> another in 64bit space (maybe we could name these more consistently).
> >>> b) Userspace has a buffer that they want to map and unmap to an iova,
> >>> potentially with some access flags.  That's all you need to know to use
> >>> the x86 _type1 VFIO IOMMU API.
> >>
> >>
> >> Do not you have to map entire RAM to PCI bus? You use listener which
> >> purpose is not very clear. This is an extra knowledge beyond qemu-to-host
> >> interface which the user space program should know.
> >
> > In the x86 case, the buffer we want to map is all of guest RAM.  Some of
> > that changes dynamically, so we have a listener setup to make updates.
> > The only thing magic about doing that is that the device is then able to
> > DMA to any part of guest RAM and therefore the guest doesn't need to
> > know the IOMMU exists.  Device assignment is therefore transparent on
> > x86.
> 
> >>> Why do I need to know about H_PUT_TCE to
> >>> use this interface?  Let's assume there might be some VFIO drivers some
> >>> day that aren't written by POWER people.  Thanks,
> >>
> >> Example of such a driver? My imagination is weak :)
> >
> > See Tom Lyon's original user level drivers:
> >
> > https://github.com/pugs/vfio-user-level-drivers
> >
> > These are against the original version of VFIO so no longer work, but
> > he's got drivers for common devices like Intel 82576 & 82599 SR-IOV
> > NICs.  There are special use cases and special devices where it makes
> > sense to have a driver in userspace.  Ideally a VFIO driver for a NIC
> > would work with fairly minimal IOMMU abstractions between x86 and POWER,
> > but if you design the SPAPR VFIO IOMMU API so that users need to
> > understand how H_PUT_TCE works to port their driver to POWER, you might
> > find it more difficult to leverage such drivers.  Thanks,
> 
> A user space driver needs to know the DMA window if it wants to work on 
> POWER because this is how PHB is configured. So it needs to know something 
> about POWER.

No, it only needs to know the window.

>  It could be fixed easily - we stop using IOMMU types and make 
> map/unmap/info independent from the IOMMU type.

Exactly, try to implement map/unmap/info and where possible, try to use
similar parameters to the existing iommu backends so we have some
consistency.  Adding a window is a trivial addition and doesn't require
the user to know anything about H_PUT_TCE.

> You'll have to implement 
> DMA window properties (which is entire RAM) and then we'll put to the spec 
> that VFIO users have to call DMA map/unmap only for addresses within the 
> returned DMA window boundaries. I thought this is what we wanted to avoid...

Yes, now you've gone too far.  Trying to define a single interface to
support any possible IOMMU is an exercise in madness.  We've tried it
before.  What I would like to agree on is simply that info, map, and
unmap should exist for all IOMMUs and should use similar parameters
where possible.  Your info needs to describe windows, mine doesn't,
that's fine.  Is there any reason your map or unmap needs to be
different?  If we limit the differences to only those necessary we'll
make it much easier for userspace drivers to share code.

> Or implement third type of IOMMU - "VFIO user space driver" (and implement 
> dma_alloc() which would allocate an address for DMA on PCI bus) and name 
> the first twos as "TYPE1 KVM" and "SPAPR TCE KVM".

And this 3rd type just offloads userspace by managing the iova window
allocations itself?  I would think this would be best to leave to a
userspace library and made easy to write by using consistent interfaces.
Thanks,

Alex

^ permalink raw reply	[flat|nested] 25+ messages in thread

end of thread, other threads:[~2012-10-11 18:09 UTC | newest]

Thread overview: 25+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <20120821113534.GS29724@truffula.fritz.box>
2012-09-04  7:33 ` [PATCH] vfio: enabled and supported on power (v7) Alexey Kardashevskiy
2012-09-04  7:35   ` [PATCH] powerpc-powernv: added tce_get callback for powernv platform Alexey Kardashevskiy
2012-09-04 19:41     ` Benjamin Herrenschmidt
2012-09-04 22:35       ` David Gibson
2012-09-05  0:19       ` Alexey Kardashevskiy
2012-09-05  0:32         ` Benjamin Herrenschmidt
2012-09-04  7:36   ` [PATCH] powerpc-kvm: fixing page alignment for TCE Alexey Kardashevskiy
2012-09-20  9:01     ` Alexander Graf
2012-09-04  7:36   ` [PATCH] powerpc-powernv: align BARs to PAGE_SIZE on powernv platform Alexey Kardashevskiy
2012-09-04 19:45     ` Benjamin Herrenschmidt
2012-09-05  0:55       ` Alexey Kardashevskiy
2012-09-05  1:16         ` Benjamin Herrenschmidt
2012-09-05  4:57           ` Alex Williamson
2012-09-05  5:17             ` Benjamin Herrenschmidt
2012-09-05  5:27               ` Alexey Kardashevskiy
2012-09-10 17:06                 ` Alex Williamson
2012-09-10 16:02   ` [PATCH] vfio: enabled and supported on power (v7) Alex Williamson
2012-09-11  8:28     ` Alexey Kardashevskiy
2012-09-13 22:34       ` Alex Williamson
2012-09-13 22:41         ` Scott Wood
2012-09-13 22:55           ` Alex Williamson
2012-09-14  0:51         ` Alexey Kardashevskiy
2012-09-14  4:35           ` Alex Williamson
2012-10-11  8:19             ` Alexey Kardashevskiy
2012-10-11 18:09               ` Alex Williamson

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).