* [PATCH] vfio powerpc: enabled and supported on powernv platform
From: Alexey Kardashevskiy @ 2012-11-20  0:48 UTC (permalink / raw)
  To: Alex Williamson
  Cc: Alexey Kardashevskiy, Benjamin Herrenschmidt, Paul Mackerras,
	linuxppc-dev, linux-kernel, kvm, David Gibson

VFIO implements the platform-independent parts such as
a PCI driver, BAR access (via read/write on a file descriptor
or direct mapping when possible) and IRQ signaling.
The platform-dependent part includes IOMMU initialization
and handling.

This patch initializes IOMMU groups based on the IOMMU
configuration discovered during the PCI scan; only the POWERNV
platform is supported at the moment.

The patch also implements a VFIO IOMMU driver which
manages DMA mapping/unmapping requests coming from
the client (currently QEMU). It also returns DMA window
information to let the client initialize the device tree
for the guest OS properly. Although this driver has been
tested only on POWERNV, it should work on any platform
supporting TCE tables.

To enable VFIO on POWER, enable the SPAPR_TCE_IOMMU config
option.
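
For illustration, a minimal sketch of a userspace client driving
this interface (error handling omitted; the group number 26 and
the page-aligned buffer "buf" are made-up examples, everything
else comes from <linux/vfio.h> as extended by this patch):

	#include <fcntl.h>
	#include <sys/ioctl.h>
	#include <linux/vfio.h>

	int container = open("/dev/vfio/vfio", O_RDWR);
	int group = open("/dev/vfio/26", O_RDWR);
	struct vfio_iommu_spapr_tce_info info = { .argsz = sizeof(info) };
	struct vfio_iommu_type1_dma_map map = { .argsz = sizeof(map) };

	/* Attach the group to a container and select this IOMMU model */
	ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
	ioctl(container, VFIO_SET_IOMMU, VFIO_SPAPR_TCE_IOMMU);

	/* Discover the 32-bit DMA window to present in the device tree */
	ioctl(container, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);

	/* Map one IOMMU page of userspace memory for DMA */
	map.vaddr = (__u64)(unsigned long)buf;
	map.iova = info.dma32_window_start;
	map.size = 4096;	/* IOMMU_PAGE_SIZE */
	map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
	ioctl(container, VFIO_IOMMU_MAP_DMA, &map);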

Cc: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
 arch/powerpc/include/asm/iommu.h     |    6 +
 arch/powerpc/kernel/iommu.c          |  140 +++++++++++++++++++
 arch/powerpc/platforms/powernv/pci.c |  135 +++++++++++++++++++
 drivers/iommu/Kconfig                |    8 ++
 drivers/vfio/Kconfig                 |    6 +
 drivers/vfio/Makefile                |    1 +
 drivers/vfio/vfio_iommu_spapr_tce.c  |  247 ++++++++++++++++++++++++++++++++++
 include/linux/vfio.h                 |   20 +++
 8 files changed, 563 insertions(+)
 create mode 100644 drivers/vfio/vfio_iommu_spapr_tce.c

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index cbfe678..5ba66cb 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -64,30 +64,33 @@ struct iommu_pool {
 } ____cacheline_aligned_in_smp;
 
 struct iommu_table {
 	unsigned long  it_busno;     /* Bus number this table belongs to */
 	unsigned long  it_size;      /* Size of iommu table in entries */
 	unsigned long  it_offset;    /* Offset into global table */
 	unsigned long  it_base;      /* mapped address of tce table */
 	unsigned long  it_index;     /* which iommu table this is */
 	unsigned long  it_type;      /* type: PCI or Virtual Bus */
 	unsigned long  it_blocksize; /* Entries in each block (cacheline) */
 	unsigned long  poolsize;
 	unsigned long  nr_pools;
 	struct iommu_pool large_pool;
 	struct iommu_pool pools[IOMMU_NR_POOLS];
 	unsigned long *it_map;       /* A simple allocation bitmap for now */
+#ifdef CONFIG_IOMMU_API
+	struct iommu_group *it_group;
+#endif
 };
 
 struct scatterlist;
 
 static inline void set_iommu_table_base(struct device *dev, void *base)
 {
 	dev->archdata.dma_data.iommu_table_base = base;
 }
 
 static inline void *get_iommu_table_base(struct device *dev)
 {
 	return dev->archdata.dma_data.iommu_table_base;
 }
 
 /* Frees table for an individual device node */
@@ -135,17 +138,20 @@ static inline void pci_iommu_init(void) { }
 extern void alloc_dart_table(void);
 #if defined(CONFIG_PPC64) && defined(CONFIG_PM)
 static inline void iommu_save(void)
 {
 	if (ppc_md.iommu_save)
 		ppc_md.iommu_save();
 }
 
 static inline void iommu_restore(void)
 {
 	if (ppc_md.iommu_restore)
 		ppc_md.iommu_restore();
 }
 #endif
 
+extern long iommu_put_tces(struct iommu_table *tbl, unsigned long entry, uint64_t tce,
+		enum dma_data_direction direction, unsigned long pages);
+
 #endif /* __KERNEL__ */
 #endif /* _ASM_IOMMU_H */
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index ff5a6ce..94f614b 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -32,30 +32,31 @@
 #include <linux/dma-mapping.h>
 #include <linux/bitmap.h>
 #include <linux/iommu-helper.h>
 #include <linux/crash_dump.h>
 #include <linux/hash.h>
 #include <linux/fault-inject.h>
 #include <linux/pci.h>
 #include <asm/io.h>
 #include <asm/prom.h>
 #include <asm/iommu.h>
 #include <asm/pci-bridge.h>
 #include <asm/machdep.h>
 #include <asm/kdump.h>
 #include <asm/fadump.h>
 #include <asm/vio.h>
+#include <asm/tce.h>
 
 #define DBG(...)
 
 static int novmerge;
 
 static void __iommu_free(struct iommu_table *, dma_addr_t, unsigned int);
 
 static int __init setup_iommu(char *str)
 {
 	if (!strcmp(str, "novmerge"))
 		novmerge = 1;
 	else if (!strcmp(str, "vmerge"))
 		novmerge = 0;
 	return 1;
 }
@@ -844,15 +845,154 @@ void *iommu_alloc_coherent(struct device *dev, struct iommu_table *tbl,
 }
 
 void iommu_free_coherent(struct iommu_table *tbl, size_t size,
 			 void *vaddr, dma_addr_t dma_handle)
 {
 	if (tbl) {
 		unsigned int nio_pages;
 
 		size = PAGE_ALIGN(size);
 		nio_pages = size >> IOMMU_PAGE_SHIFT;
 		iommu_free(tbl, dma_handle, nio_pages);
 		size = PAGE_ALIGN(size);
 		free_pages((unsigned long)vaddr, get_order(size));
 	}
 }
+
+#ifdef CONFIG_IOMMU_API
+/*
+ * SPAPR TCE API
+ */
+static struct page *free_tce(struct iommu_table *tbl, unsigned long entry)
+{
+	struct page *page = NULL;
+	unsigned long oldtce;
+
+	oldtce = ppc_md.tce_get(tbl, entry);
+
+	if (!(oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)))
+		return NULL;
+
+	page = pfn_to_page(oldtce >> PAGE_SHIFT);
+
+	WARN_ON(!page);
+	if (page && (oldtce & TCE_PCI_WRITE))
+		SetPageDirty(page);
+	ppc_md.tce_free(tbl, entry, 1);
+
+	return page;
+}
+
+static int put_tce(struct iommu_table *tbl, unsigned long entry,
+		uint64_t tce, enum dma_data_direction direction)
+{
+	int ret;
+	struct page *page = NULL;
+	unsigned long kva, offset;
+
+	/* Map new TCE */
+	offset = (tce & IOMMU_PAGE_MASK) - (tce & PAGE_MASK);
+	ret = get_user_pages_fast(tce & PAGE_MASK, 1,
+			direction != DMA_TO_DEVICE, &page);
+	if (ret < 1) {
+		printk(KERN_ERR "tce_vfio: get_user_pages_fast failed tce=%llx ioba=%lx ret=%d\n",
+				tce, entry << IOMMU_PAGE_SHIFT, ret);
+		if (!ret)
+			ret = -EFAULT;
+	}
+
+	kva = (unsigned long) page_address(page);
+	kva += offset;
+
+	/* tce_build receives a virtual address */
+	entry += tbl->it_offset; /* Offset into real TCE table */
+	ret = ppc_md.tce_build(tbl, entry, 1, kva, direction, NULL);
+
+	/* tce_build() only returns non-zero for transient errors */
+	if (unlikely(ret)) {
+		printk(KERN_ERR "tce_vfio: tce_put failed on tce=%llx ioba=%lx kva=%lx ret=%d\n",
+				tce, entry << IOMMU_PAGE_SHIFT, kva, ret);
+		put_page(page);
+		return -EIO;
+	}
+
+	return 0;
+}
+
+static void tce_flush(struct iommu_table *tbl)
+{
+	/* Flush/invalidate TLB caches if necessary */
+	if (ppc_md.tce_flush)
+		ppc_md.tce_flush(tbl);
+
+	/* Make sure updates are seen by hardware */
+	mb();
+}
+
+long iommu_put_tces(struct iommu_table *tbl, unsigned long entry, uint64_t tce,
+		enum dma_data_direction direction, unsigned long pages)
+{
+	int i, ret = 0, pages_to_put = 0;
+	struct page *page;
+	struct iommu_pool *pool = get_pool(tbl, entry);
+	struct page **oldpages;
+	const int oldpagesnum = PAGE_SIZE/sizeof(*oldpages);
+
+	BUILD_BUG_ON(PAGE_SIZE < IOMMU_PAGE_SIZE);
+
+	/* Handle a single page request without allocation
+	   of pages-to-release array */
+	if (pages == 1) {
+		spin_lock(&(pool->lock));
+		page = free_tce(tbl, entry);
+
+		if (direction != DMA_NONE)
+			ret = put_tce(tbl, entry, tce, direction);
+
+		tce_flush(tbl);
+
+		if (page)
+			put_page(page);
+
+		spin_unlock(&(pool->lock));
+		return ret;
+	}
+
+	/* Releasing multiple pages */
+	/* Allocate an array for pages to be released after TCE table
+	   is updated */
+	oldpages = kmalloc(PAGE_SIZE, GFP_KERNEL);
+	if (!oldpages)
+		return -ENOMEM;
+
+	spin_lock(&(pool->lock));
+
+	for (i = 0; (i < pages) && !ret; ++i, ++entry, tce += IOMMU_PAGE_SIZE) {
+		page = free_tce(tbl, entry);
+		if (page) {
+			oldpages[pages_to_put] = page;
+			++pages_to_put;
+		}
+
+		if (direction != DMA_NONE)
+			ret = put_tce(tbl, entry, tce, direction);
+
+		/* Release old pages if we reached the end of oldpages[] or
+		   it is the last page or we are about to exit the loop */
+		if ((pages_to_put == oldpagesnum) || (i == pages - 1) || ret) {
+			tce_flush(tbl);
+
+			/* Release pages after removing them from TCE table */
+			while (pages_to_put) {
+				--pages_to_put;
+				put_page(oldpages[pages_to_put]);
+			}
+		}
+	}
+
+	spin_unlock(&(pool->lock));
+	kfree(oldpages);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_put_tces);
+#endif /* CONFIG_IOMMU_API */
diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index 05205cf..676f4d9 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -8,30 +8,31 @@
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License
  * as published by the Free Software Foundation; either version
  * 2 of the License, or (at your option) any later version.
  */
 
 #include <linux/kernel.h>
 #include <linux/pci.h>
 #include <linux/delay.h>
 #include <linux/string.h>
 #include <linux/init.h>
 #include <linux/bootmem.h>
 #include <linux/irq.h>
 #include <linux/io.h>
 #include <linux/msi.h>
+#include <linux/iommu.h>
 
 #include <asm/sections.h>
 #include <asm/io.h>
 #include <asm/prom.h>
 #include <asm/pci-bridge.h>
 #include <asm/machdep.h>
 #include <asm/ppc-pci.h>
 #include <asm/opal.h>
 #include <asm/iommu.h>
 #include <asm/tce.h>
 #include <asm/abs_addr.h>
 #include <asm/firmware.h>
 
 #include "powernv.h"
 #include "pci.h"
@@ -601,15 +602,149 @@ void __init pnv_pci_init(void)
 	/* Configure IOMMU DMA hooks */
 	ppc_md.pci_dma_dev_setup = pnv_pci_dma_dev_setup;
 	ppc_md.tce_build = pnv_tce_build;
 	ppc_md.tce_free = pnv_tce_free;
 	ppc_md.tce_get = pnv_tce_get;
 	ppc_md.pci_probe_mode = pnv_pci_probe_mode;
 	set_pci_dma_ops(&dma_iommu_ops);
 
 	/* Configure MSIs */
 #ifdef CONFIG_PCI_MSI
 	ppc_md.msi_check_device = pnv_msi_check_device;
 	ppc_md.setup_msi_irqs = pnv_setup_msi_irqs;
 	ppc_md.teardown_msi_irqs = pnv_teardown_msi_irqs;
 #endif
 }
+
+#ifdef CONFIG_IOMMU_API
+/*
+ * IOMMU groups support required by VFIO
+ */
+static int add_device(struct device *dev)
+{
+	struct iommu_table *tbl;
+	int ret = 0;
+
+	if (WARN_ON(dev->iommu_group)) {
+		printk(KERN_WARNING "tce_vfio: device %s is already in iommu group %d, skipping\n",
+				dev->kobj.name,
+				iommu_group_id(dev->iommu_group));
+		return -EBUSY;
+	}
+
+	tbl = get_iommu_table_base(dev);
+	if (!tbl) {
+		pr_debug("tce_vfio: skipping device %s with no tbl\n",
+				dev->kobj.name);
+		return 0;
+	}
+
+	pr_debug("tce_vfio: adding %s to iommu group %d\n",
+			dev->kobj.name, iommu_group_id(tbl->it_group));
+
+	ret = iommu_group_add_device(tbl->it_group, dev);
+	if (ret < 0)
+		printk(KERN_ERR "tce_vfio: %s has not been added, ret=%d\n",
+				dev->kobj.name, ret);
+
+	return ret;
+}
+
+static void del_device(struct device *dev)
+{
+	iommu_group_remove_device(dev);
+}
+
+static int iommu_bus_notifier(struct notifier_block *nb,
+			      unsigned long action, void *data)
+{
+	struct device *dev = data;
+
+	switch (action) {
+	case BUS_NOTIFY_ADD_DEVICE:
+		return add_device(dev);
+	case BUS_NOTIFY_DEL_DEVICE:
+		del_device(dev);
+		return 0;
+	default:
+		return 0;
+	}
+}
+
+static struct notifier_block tce_iommu_bus_nb = {
+	.notifier_call = iommu_bus_notifier,
+};
+
+static void group_release(void *iommu_data)
+{
+	struct iommu_table *tbl = iommu_data;
+	tbl->it_group = NULL;
+}
+
+static int __init tce_iommu_init(void)
+{
+	struct pci_dev *pdev = NULL;
+	struct iommu_table *tbl;
+	struct iommu_group *grp;
+
+	bus_register_notifier(&pci_bus_type, &tce_iommu_bus_nb);
+
+	/* Allocate and initialize IOMMU groups */
+	for_each_pci_dev(pdev) {
+		tbl = get_iommu_table_base(&pdev->dev);
+		if (!tbl)
+			continue;
+
+		/* Skip already initialized */
+		if (tbl->it_group)
+			continue;
+
+		grp = iommu_group_alloc();
+		if (IS_ERR(grp)) {
+			printk(KERN_INFO "tce_vfio: cannot create "
+					"new IOMMU group, ret=%ld\n",
+					PTR_ERR(grp));
+			return PTR_ERR(grp);
+		}
+		tbl->it_group = grp;
+		iommu_group_set_iommudata(grp, tbl, group_release);
+	}
+
+	/* Add PCI devices to VFIO groups */
+	for_each_pci_dev(pdev)
+		add_device(&pdev->dev);
+
+	return 0;
+}
+
+static void __exit tce_iommu_cleanup(void)
+{
+	struct pci_dev *pdev = NULL;
+	struct iommu_table *tbl;
+	struct iommu_group *grp = NULL;
+
+	bus_unregister_notifier(&pci_bus_type, &tce_iommu_bus_nb);
+
+	/* Delete PCI devices from VFIO groups */
+	for_each_pci_dev(pdev)
+		del_device(&pdev->dev);
+
+	/* Release VFIO groups */
+	for_each_pci_dev(pdev) {
+		tbl = get_iommu_table_base(&pdev->dev);
+		if (!tbl)
+			continue;
+		grp = tbl->it_group;
+
+		/* Skip (already) uninitialized */
+		if (!grp)
+			continue;
+
+		/* Do actual release, group_release() is expected to work */
+		iommu_group_put(grp);
+		BUG_ON(tbl->it_group);
+	}
+}
+
+module_init(tce_iommu_init);
+module_exit(tce_iommu_cleanup);
+#endif /* CONFIG_IOMMU_API */
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index 9f69b56..29d11dc 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -175,16 +175,24 @@ config EXYNOS_IOMMU
 	  processor family. This enables H/W multimedia accellerators to see
 	  non-linear physical memory chunks as a linear memory in their
 	  address spaces
 
 	  If unsure, say N here.
 
 config EXYNOS_IOMMU_DEBUG
 	bool "Debugging log for Exynos IOMMU"
 	depends on EXYNOS_IOMMU
 	help
 	  Select this to see the detailed log message that shows what
 	  happens in the IOMMU driver
 
 	  Say N unless you need kernel log message for IOMMU debugging
 
+config SPAPR_TCE_IOMMU
+	bool "sPAPR TCE IOMMU Support"
+	depends on PPC_POWERNV
+	select IOMMU_API
+	help
+	  Enables the bits of the IOMMU API required by VFIO. The
+	  iommu_ops callbacks are not implemented yet.
+
 endif # IOMMU_SUPPORT
diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
index 7cd5dec..b464687 100644
--- a/drivers/vfio/Kconfig
+++ b/drivers/vfio/Kconfig
@@ -1,16 +1,22 @@
 config VFIO_IOMMU_TYPE1
 	tristate
 	depends on VFIO
 	default n
 
+config VFIO_IOMMU_SPAPR_TCE
+	tristate
+	depends on VFIO && SPAPR_TCE_IOMMU
+	default n
+
 menuconfig VFIO
 	tristate "VFIO Non-Privileged userspace driver framework"
 	depends on IOMMU_API
 	select VFIO_IOMMU_TYPE1 if X86
+	select VFIO_IOMMU_SPAPR_TCE if PPC_POWERNV
 	help
 	  VFIO provides a framework for secure userspace device drivers.
 	  See Documentation/vfio.txt for more details.
 
 	  If you don't know what to do here, say N.
 
 source "drivers/vfio/pci/Kconfig"
diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
index 2398d4a..72bfabc 100644
--- a/drivers/vfio/Makefile
+++ b/drivers/vfio/Makefile
@@ -1,3 +1,4 @@
 obj-$(CONFIG_VFIO) += vfio.o
 obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o
+obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o
 obj-$(CONFIG_VFIO_PCI) += pci/
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
new file mode 100644
index 0000000..ac72c74d
--- /dev/null
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -0,0 +1,247 @@
+/*
+ * VFIO: IOMMU DMA mapping support for TCE on POWER
+ *
+ * Copyright (C) 2012 IBM Corp.  All rights reserved.
+ *     Author: Alexey Kardashevskiy <aik@ozlabs.ru>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Derived from original vfio_iommu_type1.c:
+ * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
+ *     Author: Alex Williamson <alex.williamson@redhat.com>
+ */
+
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/err.h>
+#include <linux/vfio.h>
+#include <asm/iommu.h>
+
+#define DRIVER_VERSION  "0.1"
+#define DRIVER_AUTHOR   "aik@ozlabs.ru"
+#define DRIVER_DESC     "VFIO IOMMU SPAPR TCE"
+
+static void tce_iommu_detach_group(void *iommu_data,
+		struct iommu_group *iommu_group);
+
+/*
+ * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
+ */
+
+/*
+ * The container descriptor supports only a single group per container.
+ * Required by the API as the container is not supplied with the IOMMU group
+ * at the moment of initialization.
+ */
+struct tce_container {
+	struct mutex lock;
+	struct iommu_table *tbl;
+};
+
+static void *tce_iommu_open(unsigned long arg)
+{
+	struct tce_container *container;
+
+	if (arg != VFIO_SPAPR_TCE_IOMMU) {
+		printk(KERN_ERR "tce_vfio: Wrong IOMMU type\n");
+		return ERR_PTR(-EINVAL);
+	}
+
+	container = kzalloc(sizeof(*container), GFP_KERNEL);
+	if (!container)
+		return ERR_PTR(-ENOMEM);
+
+	mutex_init(&container->lock);
+
+	return container;
+}
+
+static void tce_iommu_release(void *iommu_data)
+{
+	struct tce_container *container = iommu_data;
+
+	WARN_ON(container->tbl && !container->tbl->it_group);
+	if (container->tbl && container->tbl->it_group)
+		tce_iommu_detach_group(iommu_data, container->tbl->it_group);
+
+	mutex_destroy(&container->lock);
+
+	kfree(container);
+}
+
+static long tce_iommu_ioctl(void *iommu_data,
+				 unsigned int cmd, unsigned long arg)
+{
+	struct tce_container *container = iommu_data;
+	unsigned long minsz;
+
+	switch (cmd) {
+	case VFIO_CHECK_EXTENSION: {
+		return (arg == VFIO_SPAPR_TCE_IOMMU) ? 1 : 0;
+	}
+	case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
+		struct vfio_iommu_spapr_tce_info info;
+		struct iommu_table *tbl = container->tbl;
+
+		if (WARN_ON(!tbl))
+			return -ENXIO;
+
+		minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
+				dma64_window_size);
+
+		if (copy_from_user(&info, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (info.argsz < minsz)
+			return -EINVAL;
+
+		info.dma32_window_start = tbl->it_offset << IOMMU_PAGE_SHIFT;
+		info.dma32_window_size = tbl->it_size << IOMMU_PAGE_SHIFT;
+		info.dma64_window_start = 0;
+		info.dma64_window_size = 0;
+		info.flags = 0;
+
+		if (copy_to_user((void __user *)arg, &info, minsz))
+			return -EFAULT;
+
+		return 0;
+	}
+	case VFIO_IOMMU_MAP_DMA: {
+		vfio_iommu_spapr_tce_dma_map par;
+		struct iommu_table *tbl = container->tbl;
+		enum dma_data_direction direction = DMA_NONE;
+
+		if (WARN_ON(!tbl))
+			return -ENXIO;
+
+		minsz = offsetofend(vfio_iommu_spapr_tce_dma_map, size);
+
+		if (copy_from_user(&par, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (par.argsz < minsz)
+			return -EINVAL;
+
+		if ((par.flags & VFIO_DMA_MAP_FLAG_READ) &&
+				(par.flags & VFIO_DMA_MAP_FLAG_WRITE)) {
+			direction = DMA_BIDIRECTIONAL;
+		} else if (par.flags & VFIO_DMA_MAP_FLAG_READ) {
+			direction = DMA_TO_DEVICE;
+		} else if (par.flags & VFIO_DMA_MAP_FLAG_WRITE) {
+			direction = DMA_FROM_DEVICE;
+		}
+
+		par.size += par.iova & ~IOMMU_PAGE_MASK;
+		par.size = _ALIGN_UP(par.size, IOMMU_PAGE_SIZE);
+
+		return iommu_put_tces(tbl, par.iova >> IOMMU_PAGE_SHIFT,
+				par.vaddr & IOMMU_PAGE_MASK, direction,
+				par.size >> IOMMU_PAGE_SHIFT);
+	}
+	case VFIO_IOMMU_UNMAP_DMA: {
+		vfio_iommu_spapr_tce_dma_unmap par;
+		struct iommu_table *tbl = container->tbl;
+
+		if (WARN_ON(!tbl))
+			return -ENXIO;
+
+		minsz = offsetofend(vfio_iommu_spapr_tce_dma_unmap, size);
+
+		if (copy_from_user(&par, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (par.argsz < minsz)
+			return -EINVAL;
+
+		par.size += par.iova & ~IOMMU_PAGE_MASK;
+		par.size = _ALIGN_UP(par.size, IOMMU_PAGE_SIZE);
+
+		return iommu_put_tces(tbl, par.iova >> IOMMU_PAGE_SHIFT,
+				0, DMA_NONE, par.size >> IOMMU_PAGE_SHIFT);
+	}
+	default:
+		printk(KERN_WARNING "tce_vfio: unexpected cmd %x\n", cmd);
+	}
+
+	return -ENOTTY;
+}
+
+static int tce_iommu_attach_group(void *iommu_data,
+		struct iommu_group *iommu_group)
+{
+	struct tce_container *container = iommu_data;
+	struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
+
+	BUG_ON(!tbl);
+	mutex_lock(&container->lock);
+	pr_debug("tce_vfio: Attaching group #%u to iommu %p\n",
+			iommu_group_id(iommu_group), iommu_group);
+	if (container->tbl) {
+		printk(KERN_WARNING "tce_vfio: Only one group per IOMMU container is allowed, existing id=%d, attaching id=%d\n",
+				iommu_group_id(container->tbl->it_group),
+				iommu_group_id(iommu_group));
+		mutex_unlock(&container->lock);
+		return -EBUSY;
+	}
+
+	container->tbl = tbl;
+	mutex_unlock(&container->lock);
+
+	return 0;
+}
+
+static void tce_iommu_detach_group(void *iommu_data,
+		struct iommu_group *iommu_group)
+{
+	struct tce_container *container = iommu_data;
+	struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
+
+	BUG_ON(!tbl);
+	mutex_lock(&container->lock);
+	if (tbl != container->tbl) {
+		printk(KERN_WARNING "tce_vfio: detaching group #%u, expected group is #%u\n",
+				iommu_group_id(iommu_group),
+				iommu_group_id(tbl->it_group));
+	} else {
+
+		pr_debug("tce_vfio: detaching group #%u from iommu %p\n",
+				iommu_group_id(iommu_group), iommu_group);
+
+		iommu_put_tces(tbl, tbl->it_offset, 0, DMA_NONE, tbl->it_size);
+		container->tbl = NULL;
+	}
+	mutex_unlock(&container->lock);
+}
+
+const struct vfio_iommu_driver_ops tce_iommu_driver_ops = {
+	.name		= "iommu-vfio-powerpc",
+	.owner		= THIS_MODULE,
+	.open		= tce_iommu_open,
+	.release	= tce_iommu_release,
+	.ioctl		= tce_iommu_ioctl,
+	.attach_group	= tce_iommu_attach_group,
+	.detach_group	= tce_iommu_detach_group,
+};
+
+static int __init tce_iommu_init(void)
+{
+	return vfio_register_iommu_driver(&tce_iommu_driver_ops);
+}
+
+static void __exit tce_iommu_cleanup(void)
+{
+	vfio_unregister_iommu_driver(&tce_iommu_driver_ops);
+}
+
+module_init(tce_iommu_init);
+module_exit(tce_iommu_cleanup);
+
+MODULE_VERSION(DRIVER_VERSION);
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR(DRIVER_AUTHOR);
+MODULE_DESCRIPTION(DRIVER_DESC);
+
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index 0a4f180..3ecd65c 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -87,30 +87,31 @@ extern void vfio_unregister_iommu_driver(
  * Simple helper macro for dealing with variable sized structures passed
  * from user space.  This allows us to easily determine if the provided
  * structure is sized to include various fields.
  */
 #define offsetofend(TYPE, MEMBER) ({				\
 	TYPE tmp;						\
 	offsetof(TYPE, MEMBER) + sizeof(tmp.MEMBER); })		\
 
 #endif /* __KERNEL__ */
 
 /* Kernel & User level defines for VFIO IOCTLs. */
 
 /* Extensions */
 
 #define VFIO_TYPE1_IOMMU		1
+#define VFIO_SPAPR_TCE_IOMMU		2
 
 /*
  * The IOCTL interface is designed for extensibility by embedding the
  * structure length (argsz) and flags into structures passed between
  * kernel and userspace.  We therefore use the _IO() macro for these
  * defines to avoid implicitly embedding a size into the ioctl request.
  * As structure fields are added, argsz will increase to match and flag
  * bits will be defined to indicate additional fields with valid data.
  * It's *always* the caller's responsibility to indicate the size of
  * the structure passed by setting argsz appropriately.
  */
 
 #define VFIO_TYPE	(';')
 #define VFIO_BASE	100
 
@@ -430,16 +431,35 @@ struct vfio_iommu_type1_dma_map {
 /**
  * VFIO_IOMMU_UNMAP_DMA - _IOW(VFIO_TYPE, VFIO_BASE + 14, struct vfio_dma_unmap)
  *
  * Unmap IO virtual addresses using the provided struct vfio_dma_unmap.
  * Caller sets argsz.
  */
 struct vfio_iommu_type1_dma_unmap {
 	__u32	argsz;
 	__u32	flags;
 	__u64	iova;				/* IO virtual address */
 	__u64	size;				/* Size of mapping (bytes) */
 };
 
 #define VFIO_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14)
 
+/* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */
+
+struct vfio_iommu_spapr_tce_info {
+	__u32 argsz;
+	__u32 flags;
+	__u32 dma32_window_start;
+	__u32 dma32_window_size;
+	__u64 dma64_window_start;
+	__u64 dma64_window_size;
+};
+
+#define VFIO_IOMMU_SPAPR_TCE_GET_INFO	_IO(VFIO_TYPE, VFIO_BASE + 12)
+
+/* Reuse type1 map/unmap structs as they are the same at the moment */
+typedef struct vfio_iommu_type1_dma_map vfio_iommu_spapr_tce_dma_map;
+typedef struct vfio_iommu_type1_dma_unmap vfio_iommu_spapr_tce_dma_unmap;
+
+/* ***************************************************************** */
+
 #endif /* VFIO_H */
-- 
1.7.10.4


* Re: [PATCH] vfio powerpc: enabled and supported on powernv platform
From: Alex Williamson @ 2012-11-20 18:19 UTC (permalink / raw)
  To: Alexey Kardashevskiy
  Cc: Benjamin Herrenschmidt, Paul Mackerras, linuxppc-dev,
	linux-kernel, kvm, David Gibson

On Tue, 2012-11-20 at 11:48 +1100, Alexey Kardashevskiy wrote:
> VFIO implements the platform-independent parts such as
> a PCI driver, BAR access (via read/write on a file descriptor
> or direct mapping when possible) and IRQ signaling.
> The platform-dependent part includes IOMMU initialization
> and handling.
> 
> This patch initializes IOMMU groups based on the IOMMU
> configuration discovered during the PCI scan; only the POWERNV
> platform is supported at the moment.
> 
> The patch also implements a VFIO IOMMU driver which
> manages DMA mapping/unmapping requests coming from
> the client (currently QEMU). It also returns DMA window
> information to let the client initialize the device tree
> for the guest OS properly. Although this driver has been
> tested only on POWERNV, it should work on any platform
> supporting TCE tables.
> 
> To enable VFIO on POWER, enable the SPAPR_TCE_IOMMU config
> option.
> 
> Cc: David Gibson <david@gibson.dropbear.id.au>
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> ---
>  arch/powerpc/include/asm/iommu.h     |    6 +
>  arch/powerpc/kernel/iommu.c          |  140 +++++++++++++++++++
>  arch/powerpc/platforms/powernv/pci.c |  135 +++++++++++++++++++
>  drivers/iommu/Kconfig                |    8 ++
>  drivers/vfio/Kconfig                 |    6 +
>  drivers/vfio/Makefile                |    1 +
>  drivers/vfio/vfio_iommu_spapr_tce.c  |  247 ++++++++++++++++++++++++++++++++++
>  include/linux/vfio.h                 |   20 +++
>  8 files changed, 563 insertions(+)
>  create mode 100644 drivers/vfio/vfio_iommu_spapr_tce.c
> 
> diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
> index cbfe678..5ba66cb 100644
> --- a/arch/powerpc/include/asm/iommu.h
> +++ b/arch/powerpc/include/asm/iommu.h
> @@ -64,30 +64,33 @@ struct iommu_pool {
>  } ____cacheline_aligned_in_smp;
>  
>  struct iommu_table {
>  	unsigned long  it_busno;     /* Bus number this table belongs to */
>  	unsigned long  it_size;      /* Size of iommu table in entries */
>  	unsigned long  it_offset;    /* Offset into global table */
>  	unsigned long  it_base;      /* mapped address of tce table */
>  	unsigned long  it_index;     /* which iommu table this is */
>  	unsigned long  it_type;      /* type: PCI or Virtual Bus */
>  	unsigned long  it_blocksize; /* Entries in each block (cacheline) */
>  	unsigned long  poolsize;
>  	unsigned long  nr_pools;
>  	struct iommu_pool large_pool;
>  	struct iommu_pool pools[IOMMU_NR_POOLS];
>  	unsigned long *it_map;       /* A simple allocation bitmap for now */
> +#ifdef CONFIG_IOMMU_API
> +	struct iommu_group *it_group;
> +#endif
>  };
>  
>  struct scatterlist;
>  
>  static inline void set_iommu_table_base(struct device *dev, void *base)
>  {
>  	dev->archdata.dma_data.iommu_table_base = base;
>  }
>  
>  static inline void *get_iommu_table_base(struct device *dev)
>  {
>  	return dev->archdata.dma_data.iommu_table_base;
>  }
>  
>  /* Frees table for an individual device node */
> @@ -135,17 +138,20 @@ static inline void pci_iommu_init(void) { }
>  extern void alloc_dart_table(void);
>  #if defined(CONFIG_PPC64) && defined(CONFIG_PM)
>  static inline void iommu_save(void)
>  {
>  	if (ppc_md.iommu_save)
>  		ppc_md.iommu_save();
>  }
>  
>  static inline void iommu_restore(void)
>  {
>  	if (ppc_md.iommu_restore)
>  		ppc_md.iommu_restore();
>  }
>  #endif
>  
> +extern long iommu_put_tces(struct iommu_table *tbl, unsigned long entry, uint64_t tce,
> +		enum dma_data_direction direction, unsigned long pages);
> +
>  #endif /* __KERNEL__ */
>  #endif /* _ASM_IOMMU_H */
> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
> index ff5a6ce..94f614b 100644
> --- a/arch/powerpc/kernel/iommu.c
> +++ b/arch/powerpc/kernel/iommu.c
> @@ -32,30 +32,31 @@
>  #include <linux/dma-mapping.h>
>  #include <linux/bitmap.h>
>  #include <linux/iommu-helper.h>
>  #include <linux/crash_dump.h>
>  #include <linux/hash.h>
>  #include <linux/fault-inject.h>
>  #include <linux/pci.h>
>  #include <asm/io.h>
>  #include <asm/prom.h>
>  #include <asm/iommu.h>
>  #include <asm/pci-bridge.h>
>  #include <asm/machdep.h>
>  #include <asm/kdump.h>
>  #include <asm/fadump.h>
>  #include <asm/vio.h>
> +#include <asm/tce.h>
>  
>  #define DBG(...)
>  
>  static int novmerge;
>  
>  static void __iommu_free(struct iommu_table *, dma_addr_t, unsigned int);
>  
>  static int __init setup_iommu(char *str)
>  {
>  	if (!strcmp(str, "novmerge"))
>  		novmerge = 1;
>  	else if (!strcmp(str, "vmerge"))
>  		novmerge = 0;
>  	return 1;
>  }
> @@ -844,15 +845,154 @@ void *iommu_alloc_coherent(struct device *dev, struct iommu_table *tbl,
>  }
>  
>  void iommu_free_coherent(struct iommu_table *tbl, size_t size,
>  			 void *vaddr, dma_addr_t dma_handle)
>  {
>  	if (tbl) {
>  		unsigned int nio_pages;
>  
>  		size = PAGE_ALIGN(size);
>  		nio_pages = size >> IOMMU_PAGE_SHIFT;
>  		iommu_free(tbl, dma_handle, nio_pages);
>  		size = PAGE_ALIGN(size);
>  		free_pages((unsigned long)vaddr, get_order(size));
>  	}
>  }
> +
> +#ifdef CONFIG_IOMMU_API
> +/*
> + * SPAPR TCE API
> + */
> +static struct page *free_tce(struct iommu_table *tbl, unsigned long entry)
> +{
> +	struct page *page = NULL;

NULL initialization doesn't appear to be necessary
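
I.e. the declaration could simply be:

	struct page *page;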

> +	unsigned long oldtce;
> +
> +	oldtce = ppc_md.tce_get(tbl, entry);
> +
> +	if (!(oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)))
> +		return NULL;
> +
> +	page = pfn_to_page(oldtce >> PAGE_SHIFT);
> +
> +	WARN_ON(!page);
> +	if (page && (oldtce & TCE_PCI_WRITE))
> +		SetPageDirty(page);
> +	ppc_md.tce_free(tbl, entry, 1);
> +
> +	return page;
> +}
> +
> +static int put_tce(struct iommu_table *tbl, unsigned long entry,
> +		uint64_t tce, enum dma_data_direction direction)
> +{
> +	int ret;
> +	struct page *page = NULL;
> +	unsigned long kva, offset;
> +
> +	/* Map new TCE */
> +	offset = (tce & IOMMU_PAGE_MASK) - (tce & PAGE_MASK);
> +	ret = get_user_pages_fast(tce & PAGE_MASK, 1,
> +			direction != DMA_TO_DEVICE, &page);
> +	if (ret < 1) {
> +		printk(KERN_ERR "tce_vfio: get_user_pages_fast failed tce=%llx ioba=%lx ret=%d\n",
> +				tce, entry << IOMMU_PAGE_SHIFT, ret);
> +		if (!ret)
> +			ret = -EFAULT;

Missing return ret?  Otherwise we've got some bogus uses of page below
and we're setting ret for no reason here.
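
I.e. something like this (untested):

	if (ret < 1) {
		printk(KERN_ERR "tce_vfio: get_user_pages_fast failed tce=%llx ioba=%lx ret=%d\n",
				tce, entry << IOMMU_PAGE_SHIFT, ret);
		if (!ret)
			ret = -EFAULT;
		return ret;
	}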

> +	}
> +
> +	kva = (unsigned long) page_address(page);
> +	kva += offset;
> +
> +	/* tce_build receives a virtual address */
> +	entry += tbl->it_offset; /* Offset into real TCE table */
> +	ret = ppc_md.tce_build(tbl, entry, 1, kva, direction, NULL);
> +
> +	/* tce_build() only returns non-zero for transient errors */
> +	if (unlikely(ret)) {
> +		printk(KERN_ERR "tce_vfio: tce_put failed on tce=%llx ioba=%lx kva=%lx ret=%d\n",
> +				tce, entry << IOMMU_PAGE_SHIFT, kva, ret);
> +		put_page(page);
> +		return -EIO;
> +	}
> +
> +	return 0;
> +}
> +
> +static void tce_flush(struct iommu_table *tbl)
> +{
> +	/* Flush/invalidate TLB caches if necessary */
> +	if (ppc_md.tce_flush)
> +		ppc_md.tce_flush(tbl);
> +
> +	/* Make sure updates are seen by hardware */
> +	mb();
> +}
> +
> +long iommu_put_tces(struct iommu_table *tbl, unsigned long entry, uint64_t tce,
> +		enum dma_data_direction direction, unsigned long pages)
> +{
> +	int i, ret = 0, pages_to_put = 0;
> +	struct page *page;
> +	struct iommu_pool *pool = get_pool(tbl, entry);
> +	struct page **oldpages;
> +	const int oldpagesnum = PAGE_SIZE/sizeof(*oldpages);
> +
> +	BUILD_BUG_ON(PAGE_SIZE < IOMMU_PAGE_SIZE);
> +
> +	/* Handle a single page request without allocation
> +	   of pages-to-release array */
> +	if (pages == 1) {
> +		spin_lock(&(pool->lock));
> +		page = free_tce(tbl, entry);
> +
> +		if (direction != DMA_NONE)
> +			ret = put_tce(tbl, entry, tce, direction);
> +
> +		tce_flush(tbl);
> +
> +		if (page)
> +			put_page(page);
> +
> +		spin_unlock(&(pool->lock));
> +		return ret;
> +	}
> +
> +	/* Releasing multiple pages */
> +	/* Allocate an array for pages to be released after TCE table
> +	   is updated */
> +	oldpages = kmalloc(PAGE_SIZE, GFP_KERNEL);
> +	if (!oldpages)
> +		return -ENOMEM;
> +
> +	spin_lock(&(pool->lock));
> +
> +	for (i = 0; (i < pages) && !ret; ++i, ++entry, tce += IOMMU_PAGE_SIZE) {
> +		page = free_tce(tbl, entry);
> +		if (page) {
> +			oldpages[pages_to_put] = page;
> +			++pages_to_put;
> +		}
> +
> +		if (direction != DMA_NONE)
> +			ret = put_tce(tbl, entry, tce, direction);
> +
> +		/* Release old pages if we reached the end of oldpages[] or
> +		   it is the last page or we are about to exit the loop */
> +		if ((pages_to_put == oldpagesnum) || (i == pages - 1) || ret) {
> +			tce_flush(tbl);

Avoiding tce_flush() is the reason for all this extra overhead, right?
I wonder if it'd be cleaner separating map vs unmap, where the map case
can avoid the oldpages array... but that means inserting new mappings on
top of old ones wouldn't put the pages.
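
For reference, the map-only path could then collapse to roughly this
(sketch only, reusing put_tce()/tce_flush() from this patch, and as
noted it would put nothing on map-over-map):

	spin_lock(&(pool->lock));
	for (i = 0; i < pages && !ret; ++i, ++entry, tce += IOMMU_PAGE_SIZE)
		ret = put_tce(tbl, entry, tce, direction);
	tce_flush(tbl);
	spin_unlock(&(pool->lock));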

> +
> +			/* Release pages after removing them from TCE table */
> +			while (pages_to_put) {
> +				--pages_to_put;
> +				put_page(oldpages[pages_to_put]);
> +			}
> +		}
> +	}
> +
> +	spin_unlock(&(pool->lock));
> +	kfree(oldpages);
> +
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(iommu_put_tces);
> +#endif /* CONFIG_IOMMU_API */
> diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
> index 05205cf..676f4d9 100644
> --- a/arch/powerpc/platforms/powernv/pci.c
> +++ b/arch/powerpc/platforms/powernv/pci.c
> @@ -8,30 +8,31 @@
>   * This program is free software; you can redistribute it and/or
>   * modify it under the terms of the GNU General Public License
>   * as published by the Free Software Foundation; either version
>   * 2 of the License, or (at your option) any later version.
>   */
>  
>  #include <linux/kernel.h>
>  #include <linux/pci.h>
>  #include <linux/delay.h>
>  #include <linux/string.h>
>  #include <linux/init.h>
>  #include <linux/bootmem.h>
>  #include <linux/irq.h>
>  #include <linux/io.h>
>  #include <linux/msi.h>
> +#include <linux/iommu.h>
>  
>  #include <asm/sections.h>
>  #include <asm/io.h>
>  #include <asm/prom.h>
>  #include <asm/pci-bridge.h>
>  #include <asm/machdep.h>
>  #include <asm/ppc-pci.h>
>  #include <asm/opal.h>
>  #include <asm/iommu.h>
>  #include <asm/tce.h>
>  #include <asm/abs_addr.h>
>  #include <asm/firmware.h>
>  
>  #include "powernv.h"
>  #include "pci.h"
> @@ -601,15 +602,149 @@ void __init pnv_pci_init(void)
>  	/* Configure IOMMU DMA hooks */
>  	ppc_md.pci_dma_dev_setup = pnv_pci_dma_dev_setup;
>  	ppc_md.tce_build = pnv_tce_build;
>  	ppc_md.tce_free = pnv_tce_free;
>  	ppc_md.tce_get = pnv_tce_get;
>  	ppc_md.pci_probe_mode = pnv_pci_probe_mode;
>  	set_pci_dma_ops(&dma_iommu_ops);
>  
>  	/* Configure MSIs */
>  #ifdef CONFIG_PCI_MSI
>  	ppc_md.msi_check_device = pnv_msi_check_device;
>  	ppc_md.setup_msi_irqs = pnv_setup_msi_irqs;
>  	ppc_md.teardown_msi_irqs = pnv_teardown_msi_irqs;
>  #endif
>  }
> +
> +#ifdef CONFIG_IOMMU_API
> +/*
> + * IOMMU groups support required by VFIO
> + */
> +static int add_device(struct device *dev)
> +{
> +	struct iommu_table *tbl;
> +	int ret = 0;
> +
> +	if (WARN_ON(dev->iommu_group)) {
> +		printk(KERN_WARNING "tce_vfio: device %s is already in iommu group %d, skipping\n",
> +				dev->kobj.name,

dev_name(dev)
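
i.e.:

	printk(KERN_WARNING "tce_vfio: device %s is already in iommu group %d, skipping\n",
			dev_name(dev),
			iommu_group_id(dev->iommu_group));

(and likewise for the other dev->kobj.name uses below.)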

> +				iommu_group_id(dev->iommu_group));
> +		return -EBUSY;
> +	}
> +
> +	tbl = get_iommu_table_base(dev);
> +	if (!tbl) {
> +		pr_debug("tce_vfio: skipping device %s with no tbl\n",
> +				dev->kobj.name);
> +		return 0;
> +	}
> +
> +	pr_debug("tce_vfio: adding %s to iommu group %d\n",
> +			dev->kobj.name, iommu_group_id(tbl->it_group));
> +
> +	ret = iommu_group_add_device(tbl->it_group, dev);
> +	if (ret < 0)
> +		printk(KERN_ERR "tce_vfio: %s has not been added, ret=%d\n",
> +				dev->kobj.name, ret);
> +
> +	return ret;
> +}
> +
> +static void del_device(struct device *dev)
> +{
> +	iommu_group_remove_device(dev);
> +}
> +
> +static int iommu_bus_notifier(struct notifier_block *nb,
> +			      unsigned long action, void *data)
> +{
> +	struct device *dev = data;
> +
> +	switch (action) {
> +	case BUS_NOTIFY_ADD_DEVICE:
> +		return add_device(dev);
> +	case BUS_NOTIFY_DEL_DEVICE:
> +		del_device(dev);
> +		return 0;
> +	default:
> +		return 0;
> +	}
> +}
> +
> +static struct notifier_block tce_iommu_bus_nb = {
> +	.notifier_call = iommu_bus_notifier,
> +};
> +
> +static void group_release(void *iommu_data)
> +{
> +	struct iommu_table *tbl = iommu_data;
> +	tbl->it_group = NULL;
> +}
> +
> +static int __init tce_iommu_init(void)
> +{
> +	struct pci_dev *pdev = NULL;
> +	struct iommu_table *tbl;
> +	struct iommu_group *grp;
> +
> +	bus_register_notifier(&pci_bus_type, &tce_iommu_bus_nb);

There's already a notifier in the iommu code if you were to register an
iommu_ops with the add/remove_device entries.  That would allow you to
remove the notifier block and notifier function below and the second
loop below.  Are you avoiding that to avoid the rest of iommu_ops?

Also, shouldn't this notifier only be registered after the first loop
below?  Otherwise ADD_DEVICE could race with setting up groups, which we
assume are present in the add_device() above.
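
Roughly this ordering (tce_setup_group() here is a hypothetical helper
standing in for the body of your first loop):

	/* Allocate and initialize IOMMU groups first... */
	for_each_pci_dev(pdev)
		tce_setup_group(pdev);	/* hypothetical: the group-alloc body above */

	/* ...then watch for hotplug, so add_device() can rely on groups... */
	bus_register_notifier(&pci_bus_type, &tce_iommu_bus_nb);

	/* ...and only then register what is already there */
	for_each_pci_dev(pdev)
		add_device(&pdev->dev);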

> +
> +	/* Allocate and initialize IOMMU groups */
> +	for_each_pci_dev(pdev) {
> +		tbl = get_iommu_table_base(&pdev->dev);
> +		if (!tbl)
> +			continue;
> +
> +		/* Skip already initialized */
> +		if (tbl->it_group)
> +			continue;
> +
> +		grp = iommu_group_alloc();
> +		if (IS_ERR(grp)) {
> +			printk(KERN_INFO "tce_vfio: cannot create "
> +					"new IOMMU group, ret=%ld\n",
> +					PTR_ERR(grp));
> +			return PTR_ERR(grp);
> +		}
> +		tbl->it_group = grp;
> +		iommu_group_set_iommudata(grp, tbl, group_release);
> +	}
> +
> +	/* Add PCI devices to VFIO groups */
> +	for_each_pci_dev(pdev)
> +		add_device(&pdev->dev);
> +
> +	return 0;
> +}
> +
> +static void __exit tce_iommu_cleanup(void)
> +{
> +	struct pci_dev *pdev = NULL;
> +	struct iommu_table *tbl;
> +	struct iommu_group *grp = NULL;
> +
> +	bus_unregister_notifier(&pci_bus_type, &tce_iommu_bus_nb);
> +
> +	/* Delete PCI devices from VFIO groups */
> +	for_each_pci_dev(pdev)
> +		del_device(&pdev->dev);
> +
> +	/* Release VFIO groups */
> +	for_each_pci_dev(pdev) {
> +		tbl = get_iommu_table_base(&pdev->dev);
> +		if (!tbl)
> +			continue;
> +		grp = tbl->it_group;
> +
> +		/* Skip (already) uninitialized */
> +		if (!grp)
> +			continue;
> +
> +		/* Do actual release, group_release() is expected to work */
> +		iommu_group_put(grp);
> +		BUG_ON(tbl->it_group);
> +	}
> +}
> +
> +module_init(tce_iommu_init);
> +module_exit(tce_iommu_cleanup);
> +#endif /* CONFIG_IOMMU_API */
> diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
> index 9f69b56..29d11dc 100644
> --- a/drivers/iommu/Kconfig
> +++ b/drivers/iommu/Kconfig
> @@ -175,16 +175,24 @@ config EXYNOS_IOMMU
>  	  processor family. This enables H/W multimedia accelerators to see
>  	  non-linear physical memory chunks as a linear memory in their
>  	  address spaces
>  
>  	  If unsure, say N here.
>  
>  config EXYNOS_IOMMU_DEBUG
>  	bool "Debugging log for Exynos IOMMU"
>  	depends on EXYNOS_IOMMU
>  	help
>  	  Select this to see the detailed log message that shows what
>  	  happens in the IOMMU driver
>  
>  	  Say N unless you need kernel log message for IOMMU debugging
>  
> +config SPAPR_TCE_IOMMU
> +	bool "sPAPR TCE IOMMU Support"
> +	depends on PPC_POWERNV
> +	select IOMMU_API
> +	help
> +	  Enables bits of IOMMU API required by VFIO. The iommu_ops is
> +	  still not implemented.
> +
>  endif # IOMMU_SUPPORT

How are you planning to split this up among maintainers?  A powerpc
patch, an iommu kconfig patch, then the vfio changes below for me?

> diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
> index 7cd5dec..b464687 100644
> --- a/drivers/vfio/Kconfig
> +++ b/drivers/vfio/Kconfig
> @@ -1,16 +1,22 @@
>  config VFIO_IOMMU_TYPE1
>  	tristate
>  	depends on VFIO
>  	default n
>  
> +config VFIO_IOMMU_SPAPR_TCE
> +	tristate
> +	depends on VFIO && SPAPR_TCE_IOMMU
> +	default n
> +
>  menuconfig VFIO
>  	tristate "VFIO Non-Privileged userspace driver framework"
>  	depends on IOMMU_API
>  	select VFIO_IOMMU_TYPE1 if X86
> +	select VFIO_IOMMU_SPAPR_TCE if PPC_POWERNV
>  	help
>  	  VFIO provides a framework for secure userspace device drivers.
>  	  See Documentation/vfio.txt for more details.
>  
>  	  If you don't know what to do here, say N.
>  
>  source "drivers/vfio/pci/Kconfig"
> diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
> index 2398d4a..72bfabc 100644
> --- a/drivers/vfio/Makefile
> +++ b/drivers/vfio/Makefile
> @@ -1,3 +1,4 @@
>  obj-$(CONFIG_VFIO) += vfio.o
>  obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o
> +obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o
>  obj-$(CONFIG_VFIO_PCI) += pci/
> diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
> new file mode 100644
> index 0000000..ac72c74d
> --- /dev/null
> +++ b/drivers/vfio/vfio_iommu_spapr_tce.c
> @@ -0,0 +1,247 @@
> +/*
> + * VFIO: IOMMU DMA mapping support for TCE on POWER
> + *
> + * Copyright (C) 2012 IBM Corp.  All rights reserved.
> + *     Author: Alexey Kardashevskiy <aik@ozlabs.ru>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + *
> + * Derived from original vfio_iommu_type1.c:
> + * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
> + *     Author: Alex Williamson <alex.williamson@redhat.com>
> + */
> +
> +#include <linux/module.h>
> +#include <linux/pci.h>
> +#include <linux/slab.h>
> +#include <linux/uaccess.h>
> +#include <linux/err.h>
> +#include <linux/vfio.h>
> +#include <asm/iommu.h>
> +
> +#define DRIVER_VERSION  "0.1"
> +#define DRIVER_AUTHOR   "aik@ozlabs.ru"
> +#define DRIVER_DESC     "VFIO IOMMU SPAPR TCE"
> +
> +static void tce_iommu_detach_group(void *iommu_data,
> +		struct iommu_group *iommu_group);
> +
> +/*
> + * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
> + */
> +
> +/*
> + * The container descriptor supports only a single group per container.
> + * Required by the API as the container is not supplied with the IOMMU group
> + * at the moment of initialization.
> + */
> +struct tce_container {
> +	struct mutex lock;
> +	struct iommu_table *tbl;
> +};
> +
> +static void *tce_iommu_open(unsigned long arg)
> +{
> +	struct tce_container *container;
> +
> +	if (arg != VFIO_SPAPR_TCE_IOMMU) {
> +		printk(KERN_ERR "tce_vfio: Wrong IOMMU type\n");
> +		return ERR_PTR(-EINVAL);
> +	}
> +
> +	container = kzalloc(sizeof(*container), GFP_KERNEL);
> +	if (!container)
> +		return ERR_PTR(-ENOMEM);
> +
> +	mutex_init(&container->lock);
> +
> +	return container;
> +}
> +
> +static void tce_iommu_release(void *iommu_data)
> +{
> +	struct tce_container *container = iommu_data;
> +
> +	WARN_ON(container->tbl && !container->tbl->it_group);
> +	if (container->tbl && container->tbl->it_group)
> +		tce_iommu_detach_group(iommu_data, container->tbl->it_group);
> +
> +	mutex_destroy(&container->lock);
> +
> +	kfree(container);
> +}
> +
> +static long tce_iommu_ioctl(void *iommu_data,
> +				 unsigned int cmd, unsigned long arg)
> +{
> +	struct tce_container *container = iommu_data;
> +	unsigned long minsz;
> +
> +	switch (cmd) {
> +	case VFIO_CHECK_EXTENSION: {
> +		return (arg == VFIO_SPAPR_TCE_IOMMU) ? 1 : 0;
> +	}
> +	case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
> +		struct vfio_iommu_spapr_tce_info info;
> +		struct iommu_table *tbl = container->tbl;
> +
> +		if (WARN_ON(!tbl))
> +			return -ENXIO;
> +
> +		minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
> +				dma64_window_size);
> +
> +		if (copy_from_user(&info, (void __user *)arg, minsz))
> +			return -EFAULT;
> +
> +		if (info.argsz < minsz)
> +			return -EINVAL;
> +
> +		info.dma32_window_start = tbl->it_offset << IOMMU_PAGE_SHIFT;
> +		info.dma32_window_size = tbl->it_size << IOMMU_PAGE_SHIFT;
> +		info.dma64_window_start = 0;
> +		info.dma64_window_size = 0;
> +		info.flags = 0;
> +
> +		if (copy_to_user((void __user *)arg, &info, minsz))
> +			return -EFAULT;
> +
> +		return 0;
> +	}
> +	case VFIO_IOMMU_MAP_DMA: {
> +		vfio_iommu_spapr_tce_dma_map par;

What does "par" stand for?

> +		struct iommu_table *tbl = container->tbl;
> +		enum dma_data_direction direction = DMA_NONE;
> +
> +		if (WARN_ON(!tbl))
> +			return -ENXIO;
> +
> +		minsz = offsetofend(vfio_iommu_spapr_tce_dma_map, size);
> +
> +		if (copy_from_user(&par, (void __user *)arg, minsz))
> +			return -EFAULT;
> +
> +		if (par.argsz < minsz)
> +			return -EINVAL;
> +
> +		if ((par.flags & VFIO_DMA_MAP_FLAG_READ) &&
> +				(par.flags & VFIO_DMA_MAP_FLAG_WRITE)) {
> +			direction = DMA_BIDIRECTIONAL;
> +		} else if (par.flags & VFIO_DMA_MAP_FLAG_READ) {
> +			direction = DMA_TO_DEVICE;
> +		} else if (par.flags & VFIO_DMA_MAP_FLAG_WRITE) {
> +			direction = DMA_FROM_DEVICE;
> +		}
> +
> +		par.size += par.iova & ~IOMMU_PAGE_MASK;
> +		par.size = _ALIGN_UP(par.size, IOMMU_PAGE_SIZE);
> +
> +		return iommu_put_tces(tbl, par.iova >> IOMMU_PAGE_SHIFT,
> +				par.vaddr & IOMMU_PAGE_MASK, direction,
> +				par.size >> IOMMU_PAGE_SHIFT);
> +	}
> +	case VFIO_IOMMU_UNMAP_DMA: {
> +		vfio_iommu_spapr_tce_dma_unmap par;
> +		struct iommu_table *tbl = container->tbl;
> +
> +		if (WARN_ON(!tbl))
> +			return -ENXIO;
> +
> +		minsz = offsetofend(vfio_iommu_spapr_tce_dma_unmap, size);
> +
> +		if (copy_from_user(&par, (void __user *)arg, minsz))
> +			return -EFAULT;
> +
> +		if (par.argsz < minsz)
> +			return -EINVAL;
> +
> +		par.size += par.iova & ~IOMMU_PAGE_MASK;
> +		par.size = _ALIGN_UP(par.size, IOMMU_PAGE_SIZE);
> +
> +		return iommu_put_tces(tbl, par.iova >> IOMMU_PAGE_SHIFT,
> +				0, DMA_NONE, par.size >> IOMMU_PAGE_SHIFT);
> +	}
> +	default:
> +		printk(KERN_WARNING "tce_vfio: unexpected cmd %x\n", cmd);
> +	}
> +
> +	return -ENOTTY;
> +}
> +
> +static int tce_iommu_attach_group(void *iommu_data,
> +		struct iommu_group *iommu_group)
> +{
> +	struct tce_container *container = iommu_data;
> +	struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
> +
> +	BUG_ON(!tbl);
> +	mutex_lock(&container->lock);
> +	pr_debug("tce_vfio: Attaching group #%u to iommu %p\n",
> +			iommu_group_id(iommu_group), iommu_group);
> +	if (container->tbl) {
> +		printk(KERN_WARNING "tce_vfio: Only one group per IOMMU container is allowed, existing id=%d, attaching id=%d\n",
> +				iommu_group_id(container->tbl->it_group),
> +				iommu_group_id(iommu_group));
> +		mutex_unlock(&container->lock);
> +		return -EBUSY;
> +	}
> +
> +	container->tbl = tbl;
> +	mutex_unlock(&container->lock);
> +
> +	return 0;
> +}
> +
> +static void tce_iommu_detach_group(void *iommu_data,
> +		struct iommu_group *iommu_group)
> +{
> +	struct tce_container *container = iommu_data;
> +	struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
> +
> +	BUG_ON(!tbl);
> +	mutex_lock(&container->lock);
> +	if (tbl != container->tbl) {
> +		printk(KERN_WARNING "tce_vfio: detaching group #%u, expected group is #%u\n",
> +				iommu_group_id(iommu_group),
> +				iommu_group_id(tbl->it_group));
> +	} else {
> +
> +		pr_debug("tce_vfio: detaching group #%u from iommu %p\n",
> +				iommu_group_id(iommu_group), iommu_group);
> +
> +		iommu_put_tces(tbl, tbl->it_offset, 0, DMA_NONE, tbl->it_size);
> +		container->tbl = NULL;
> +	}
> +	mutex_unlock(&container->lock);
> +}
> +
> +const struct vfio_iommu_driver_ops tce_iommu_driver_ops = {
> +	.name		= "iommu-vfio-powerpc",
> +	.owner		= THIS_MODULE,
> +	.open		= tce_iommu_open,
> +	.release	= tce_iommu_release,
> +	.ioctl		= tce_iommu_ioctl,
> +	.attach_group	= tce_iommu_attach_group,
> +	.detach_group	= tce_iommu_detach_group,
> +};
> +
> +static int __init tce_iommu_init(void)
> +{
> +	return vfio_register_iommu_driver(&tce_iommu_driver_ops);
> +}
> +
> +static void __exit tce_iommu_cleanup(void)
> +{
> +	vfio_unregister_iommu_driver(&tce_iommu_driver_ops);
> +}
> +
> +module_init(tce_iommu_init);
> +module_exit(tce_iommu_cleanup);
> +
> +MODULE_VERSION(DRIVER_VERSION);
> +MODULE_LICENSE("GPL v2");
> +MODULE_AUTHOR(DRIVER_AUTHOR);
> +MODULE_DESCRIPTION(DRIVER_DESC);
> +
> diff --git a/include/linux/vfio.h b/include/linux/vfio.h
> index 0a4f180..3ecd65c 100644
> --- a/include/linux/vfio.h
> +++ b/include/linux/vfio.h
> @@ -87,30 +87,31 @@ extern void vfio_unregister_iommu_driver(
>   * Simple helper macro for dealing with variable sized structures passed
>   * from user space.  This allows us to easily determine if the provided
>   * structure is sized to include various fields.
>   */
>  #define offsetofend(TYPE, MEMBER) ({				\
>  	TYPE tmp;						\
>  	offsetof(TYPE, MEMBER) + sizeof(tmp.MEMBER); })		\
>  
>  #endif /* __KERNEL__ */
>  
>  /* Kernel & User level defines for VFIO IOCTLs. */
>  
>  /* Extensions */
>  
>  #define VFIO_TYPE1_IOMMU		1
> +#define VFIO_SPAPR_TCE_IOMMU		2
>  
>  /*
>   * The IOCTL interface is designed for extensibility by embedding the
>   * structure length (argsz) and flags into structures passed between
>   * kernel and userspace.  We therefore use the _IO() macro for these
>   * defines to avoid implicitly embedding a size into the ioctl request.
>   * As structure fields are added, argsz will increase to match and flag
>   * bits will be defined to indicate additional fields with valid data.
>   * It's *always* the caller's responsibility to indicate the size of
>   * the structure passed by setting argsz appropriately.
>   */
>  
>  #define VFIO_TYPE	(';')
>  #define VFIO_BASE	100
>  
> @@ -430,16 +431,35 @@ struct vfio_iommu_type1_dma_map {
>  /**
>   * VFIO_IOMMU_UNMAP_DMA - _IOW(VFIO_TYPE, VFIO_BASE + 14, struct vfio_dma_unmap)
>   *
>   * Unmap IO virtual addresses using the provided struct vfio_dma_unmap.
>   * Caller sets argsz.
>   */
>  struct vfio_iommu_type1_dma_unmap {
>  	__u32	argsz;
>  	__u32	flags;
>  	__u64	iova;				/* IO virtual address */
>  	__u64	size;				/* Size of mapping (bytes) */
>  };
>  
>  #define VFIO_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14)
>  
> +/* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */
> +
> +struct vfio_iommu_spapr_tce_info {
> +	__u32 argsz;
> +	__u32 flags;
> +	__u32 dma32_window_start;
> +	__u32 dma32_window_size;
> +	__u64 dma64_window_start;
> +	__u64 dma64_window_size;
> +};
> +
> +#define VFIO_IOMMU_SPAPR_TCE_GET_INFO	_IO(VFIO_TYPE, VFIO_BASE + 12)
> +
> +/* Reuse type1 map/unmap structs as they are the same at the moment */
> +typedef struct vfio_iommu_type1_dma_map vfio_iommu_spapr_tce_dma_map;
> +typedef struct vfio_iommu_type1_dma_unmap vfio_iommu_spapr_tce_dma_unmap;
> +
> +/* ***************************************************************** */
> +
>  #endif /* VFIO_H */

I'm glad you were able to reuse these; after this gets merged we can
rename the structure to something more common and typedef for both type1
and spapr_tce so we don't forget it's shared.  Thanks,

Alex


^ permalink raw reply	[flat|nested] 122+ messages in thread

* RE: [PATCH] vfio powerpc: enabled and supported on powernv platform
  2012-11-20 18:19     ` Alex Williamson
@ 2012-11-22 11:56       ` Sethi Varun-B16395
  -1 siblings, 0 replies; 122+ messages in thread
From: Sethi Varun-B16395 @ 2012-11-22 11:56 UTC (permalink / raw)
  To: Alex Williamson, Alexey Kardashevskiy
  Cc: Benjamin Herrenschmidt, Paul Mackerras, linuxppc-dev,
	linux-kernel, kvm, David Gibson

> -----Original Message-----
> From: linux-kernel-owner@vger.kernel.org [mailto:linux-kernel-
> owner@vger.kernel.org] On Behalf Of Alex Williamson
> Sent: Tuesday, November 20, 2012 11:50 PM
> To: Alexey Kardashevskiy
> Cc: Benjamin Herrenschmidt; Paul Mackerras; linuxppc-
> dev@lists.ozlabs.org; linux-kernel@vger.kernel.org; kvm@vger.kernel.org;
> David Gibson
> Subject: Re: [PATCH] vfio powerpc: enabled and supported on powernv
> platform
> 
> On Tue, 2012-11-20 at 11:48 +1100, Alexey Kardashevskiy wrote:
> > VFIO implements platform independent stuff such as a PCI driver, BAR
> > access (via read/write on a file descriptor or direct mapping when
> > possible) and IRQ signaling.
> > The platform dependent part includes IOMMU initialization and
> > handling.
> >
> > This patch initializes IOMMU groups based on the IOMMU configuration
> > discovered during the PCI scan, only POWERNV platform is supported at
> > the moment.
> >
> > Also the patch implements an VFIO-IOMMU driver which manages DMA
> > mapping/unmapping requests coming from the client (now QEMU). It also
> > returns a DMA window information to let the guest initialize the
> > device tree for a guest OS properly. Although this driver has been
> > tested only on POWERNV, it should work on any platform supporting TCE
> > tables.
> >
> > To enable VFIO on POWER, enable SPAPR_TCE_IOMMU config option.
> >
> > Cc: David Gibson <david@gibson.dropbear.id.au>
> > Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> > ---
> >  arch/powerpc/include/asm/iommu.h     |    6 +
> >  arch/powerpc/kernel/iommu.c          |  140 +++++++++++++++++++
> >  arch/powerpc/platforms/powernv/pci.c |  135 +++++++++++++++++++
> >  drivers/iommu/Kconfig                |    8 ++
> >  drivers/vfio/Kconfig                 |    6 +
> >  drivers/vfio/Makefile                |    1 +
> >  drivers/vfio/vfio_iommu_spapr_tce.c  |  247
> ++++++++++++++++++++++++++++++++++
> >  include/linux/vfio.h                 |   20 +++
> >  8 files changed, 563 insertions(+)
> >  create mode 100644 drivers/vfio/vfio_iommu_spapr_tce.c
> >
> > diff --git a/arch/powerpc/include/asm/iommu.h
> > b/arch/powerpc/include/asm/iommu.h
> > index cbfe678..5ba66cb 100644
> > --- a/arch/powerpc/include/asm/iommu.h
> > +++ b/arch/powerpc/include/asm/iommu.h
> > @@ -64,30 +64,33 @@ struct iommu_pool {  }
> > ____cacheline_aligned_in_smp;
> >
> >  struct iommu_table {
> >  	unsigned long  it_busno;     /* Bus number this table belongs to */
> >  	unsigned long  it_size;      /* Size of iommu table in entries */
> >  	unsigned long  it_offset;    /* Offset into global table */
> >  	unsigned long  it_base;      /* mapped address of tce table */
> >  	unsigned long  it_index;     /* which iommu table this is */
> >  	unsigned long  it_type;      /* type: PCI or Virtual Bus */
> >  	unsigned long  it_blocksize; /* Entries in each block (cacheline)
> */
> >  	unsigned long  poolsize;
> >  	unsigned long  nr_pools;
> >  	struct iommu_pool large_pool;
> >  	struct iommu_pool pools[IOMMU_NR_POOLS];
> >  	unsigned long *it_map;       /* A simple allocation bitmap for now
> */
> > +#ifdef CONFIG_IOMMU_API
> > +	struct iommu_group *it_group;
> > +#endif
> >  };
> >
> >  struct scatterlist;
> >
> >  static inline void set_iommu_table_base(struct device *dev, void
> > *base)  {
> >  	dev->archdata.dma_data.iommu_table_base = base;  }
> >
> >  static inline void *get_iommu_table_base(struct device *dev)  {
> >  	return dev->archdata.dma_data.iommu_table_base;
> >  }
> >
> >  /* Frees table for an individual device node */ @@ -135,17 +138,20 @@
> > static inline void pci_iommu_init(void) { }  extern void
> > alloc_dart_table(void);  #if defined(CONFIG_PPC64) &&
> > defined(CONFIG_PM)  static inline void iommu_save(void)  {
> >  	if (ppc_md.iommu_save)
> >  		ppc_md.iommu_save();
> >  }
> >
> >  static inline void iommu_restore(void)  {
> >  	if (ppc_md.iommu_restore)
> >  		ppc_md.iommu_restore();
> >  }
> >  #endif
> >
> > +extern long iommu_put_tces(struct iommu_table *tbl, unsigned long
> entry, uint64_t tce,
> > +		enum dma_data_direction direction, unsigned long pages);
> > +
> >  #endif /* __KERNEL__ */
> >  #endif /* _ASM_IOMMU_H */
> > diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
> > index ff5a6ce..94f614b 100644
> > --- a/arch/powerpc/kernel/iommu.c
> > +++ b/arch/powerpc/kernel/iommu.c
> > @@ -32,30 +32,31 @@
> >  #include <linux/dma-mapping.h>
> >  #include <linux/bitmap.h>
> >  #include <linux/iommu-helper.h>
> >  #include <linux/crash_dump.h>
> >  #include <linux/hash.h>
> >  #include <linux/fault-inject.h>
> >  #include <linux/pci.h>
> >  #include <asm/io.h>
> >  #include <asm/prom.h>
> >  #include <asm/iommu.h>
> >  #include <asm/pci-bridge.h>
> >  #include <asm/machdep.h>
> >  #include <asm/kdump.h>
> >  #include <asm/fadump.h>
> >  #include <asm/vio.h>
> > +#include <asm/tce.h>
> >
> >  #define DBG(...)
> >
> >  static int novmerge;
> >
> >  static void __iommu_free(struct iommu_table *, dma_addr_t, unsigned
> > int);
> >
> >  static int __init setup_iommu(char *str)  {
> >  	if (!strcmp(str, "novmerge"))
> >  		novmerge = 1;
> >  	else if (!strcmp(str, "vmerge"))
> >  		novmerge = 0;
> >  	return 1;
> >  }
> > @@ -844,15 +845,154 @@ void *iommu_alloc_coherent(struct device *dev,
> > struct iommu_table *tbl,  }
> >
> >  void iommu_free_coherent(struct iommu_table *tbl, size_t size,
> >  			 void *vaddr, dma_addr_t dma_handle)  {
> >  	if (tbl) {
> >  		unsigned int nio_pages;
> >
> >  		size = PAGE_ALIGN(size);
> >  		nio_pages = size >> IOMMU_PAGE_SHIFT;
> >  		iommu_free(tbl, dma_handle, nio_pages);
> >  		size = PAGE_ALIGN(size);
> >  		free_pages((unsigned long)vaddr, get_order(size));
> >  	}
> >  }
> > +
> > +#ifdef CONFIG_IOMMU_API
> > +/*
> > + * SPAPR TCE API
> > + */
> > +static struct page *free_tce(struct iommu_table *tbl, unsigned long
> > +entry) {
> > +	struct page *page = NULL;
> 
> NULL initialization doesn't appear to be necessary
> 
> > +	unsigned long oldtce;
> > +
> > +	oldtce = ppc_md.tce_get(tbl, entry);
> > +
> > +	if (!(oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)))
> > +		return NULL;
> > +
> > +	page = pfn_to_page(oldtce >> PAGE_SHIFT);
> > +
> > +	WARN_ON(!page);
> > +	if (page && (oldtce & TCE_PCI_WRITE))
> > +		SetPageDirty(page);
> > +	ppc_md.tce_free(tbl, entry, 1);
> > +
> > +	return page;
> > +}
> > +
> > +static int put_tce(struct iommu_table *tbl, unsigned long entry,
> > +		uint64_t tce, enum dma_data_direction direction) {
> > +	int ret;
> > +	struct page *page = NULL;
> > +	unsigned long kva, offset;
> > +
> > +	/* Map new TCE */
> > +	offset = (tce & IOMMU_PAGE_MASK) - (tce & PAGE_MASK);
> > +	ret = get_user_pages_fast(tce & PAGE_MASK, 1,
> > +			direction != DMA_TO_DEVICE, &page);
> > +	if (ret < 1) {
> > +		printk(KERN_ERR "tce_vfio: get_user_pages_fast failed
> tce=%llx ioba=%lx ret=%d\n",
> > +				tce, entry << IOMMU_PAGE_SHIFT, ret);
> > +		if (!ret)
> > +			ret = -EFAULT;
> 
> Missing return ret?  Otherwise we've got some bogus uses of page below
> and we're setting ret for no reason here.
> 
> > +	}
> > +
> > +	kva = (unsigned long) page_address(page);
> > +	kva += offset;
> > +
> > +	/* tce_build receives a virtual address */
> > +	entry += tbl->it_offset; /* Offset into real TCE table */
> > +	ret = ppc_md.tce_build(tbl, entry, 1, kva, direction, NULL);
> > +
> > +	/* tce_build() only returns non-zero for transient errors */
> > +	if (unlikely(ret)) {
> > +		printk(KERN_ERR "tce_vfio: tce_put failed on tce=%llx
> ioba=%lx kva=%lx ret=%d\n",
> > +				tce, entry << IOMMU_PAGE_SHIFT, kva, ret);
> > +		put_page(page);
> > +		return -EIO;
> > +	}
> > +
> > +	return 0;
> > +}
> > +
> > +static void tce_flush(struct iommu_table *tbl) {
> > +	/* Flush/invalidate TLB caches if necessary */
> > +	if (ppc_md.tce_flush)
> > +		ppc_md.tce_flush(tbl);
> > +
> > +	/* Make sure updates are seen by hardware */
> > +	mb();
> > +}
> > +
> > +long iommu_put_tces(struct iommu_table *tbl, unsigned long entry,
> uint64_t tce,
> > +		enum dma_data_direction direction, unsigned long pages) {
> > +	int i, ret = 0, pages_to_put = 0;
> > +	struct page *page;
> > +	struct iommu_pool *pool = get_pool(tbl, entry);
> > +	struct page **oldpages;
> > +	const int oldpagesnum = PAGE_SIZE/sizeof(*oldpages);
> > +
> > +	BUILD_BUG_ON(PAGE_SIZE < IOMMU_PAGE_SIZE);
> > +
> > +	/* Handle a single page request without allocation
> > +	   of pages-to-release array */
> > +	if (pages == 1) {
> > +		spin_lock(&(pool->lock));
> > +		page = free_tce(tbl, entry);
> > +
> > +		if (direction != DMA_NONE)
> > +			ret = put_tce(tbl, entry, tce, direction);
> > +
> > +		tce_flush(tbl);
> > +
> > +		if (page)
> > +			put_page(page);
> > +
> > +		spin_unlock(&(pool->lock));
> > +		return ret;
> > +	}
> > +
> > +	/* Releasing multiple pages */
> > +	/* Allocate an array for pages to be released after TCE table
> > +	   is updated */
> > +	oldpages = kmalloc(PAGE_SIZE, GFP_KERNEL);
> > +	if (!oldpages)
> > +		return -ENOMEM;
> > +
> > +	spin_lock(&(pool->lock));
> > +
> > +	for (i = 0; (i < pages) && !ret; ++i, ++entry, tce +=
> IOMMU_PAGE_SIZE) {
> > +		page = free_tce(tbl, entry);
> > +		if (page) {
> > +			oldpages[pages_to_put] = page;
> > +			++pages_to_put;
> > +		}
> > +
> > +		if (direction != DMA_NONE)
> > +			ret = put_tce(tbl, entry, tce, direction);
> > +
> > +		/* Release old pages if we reached the end of oldpages[] or
> > +		   it is the last page or we are about to exit the loop */
> > +		if ((pages_to_put == oldpagesnum) || (i == pages - 1) || ret)
> {
> > +			tce_flush(tbl);
> 
> Avoiding tce_flush() is the reason for all this extra overhead, right?
> I wonder if it'd be cleaner separating map vs unmap, where the map case
> can avoid the oldpages array... but that means inserting new mappings on
> top of old ones wouldn't put the pages.
> 
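Concretely, the map-only half of such a split might look like the hypothetical
helper below (locking elided; it assumes the target entries are already clear,
which is exactly the stacked-mapping caveat above):

	static long iommu_map_tces(struct iommu_table *tbl, unsigned long entry,
			uint64_t tce, enum dma_data_direction direction,
			unsigned long pages)
	{
		long i, ret = 0;

		/* nothing is replaced, so no oldpages[] and no deferred put_page() */
		for (i = 0; (i < pages) && !ret; ++i, ++entry, tce += IOMMU_PAGE_SIZE)
			ret = put_tce(tbl, entry, tce, direction);

		tce_flush(tbl);
		return ret;
	}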
> > +
> > +			/* Release pages after removing them from TCE table */
> > +			while (pages_to_put) {
> > +				--pages_to_put;
> > +				put_page(oldpages[pages_to_put]);
> > +			}
> > +		}
> > +	}
> > +
> > +	spin_unlock(&(pool->lock));
> > +	kfree(oldpages);
> > +
> > +	return ret;
> > +}
> > +EXPORT_SYMBOL_GPL(iommu_put_tces);
> > +#endif /* CONFIG_IOMMU_API */
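For context, the VFIO-IOMMU driver side would presumably drive this export
roughly as follows for a map request (the parameter mapping is inferred from
the signature, not quoted from vfio_iommu_spapr_tce.c):

	/* ioba and size come from the map ioctl, vaddr is the user address */
	ret = iommu_put_tces(tbl, ioba >> IOMMU_PAGE_SHIFT, vaddr,
			DMA_BIDIRECTIONAL, size >> IOMMU_PAGE_SHIFT);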
> > diff --git a/arch/powerpc/platforms/powernv/pci.c
> > b/arch/powerpc/platforms/powernv/pci.c
> > index 05205cf..676f4d9 100644
> > --- a/arch/powerpc/platforms/powernv/pci.c
> > +++ b/arch/powerpc/platforms/powernv/pci.c
> > @@ -8,30 +8,31 @@
> >   * This program is free software; you can redistribute it and/or
> >   * modify it under the terms of the GNU General Public License
> >   * as published by the Free Software Foundation; either version
> >   * 2 of the License, or (at your option) any later version.
> >   */
> >
> >  #include <linux/kernel.h>
> >  #include <linux/pci.h>
> >  #include <linux/delay.h>
> >  #include <linux/string.h>
> >  #include <linux/init.h>
> >  #include <linux/bootmem.h>
> >  #include <linux/irq.h>
> >  #include <linux/io.h>
> >  #include <linux/msi.h>
> > +#include <linux/iommu.h>
> >
> >  #include <asm/sections.h>
> >  #include <asm/io.h>
> >  #include <asm/prom.h>
> >  #include <asm/pci-bridge.h>
> >  #include <asm/machdep.h>
> >  #include <asm/ppc-pci.h>
> >  #include <asm/opal.h>
> >  #include <asm/iommu.h>
> >  #include <asm/tce.h>
> >  #include <asm/abs_addr.h>
> >  #include <asm/firmware.h>
> >
> >  #include "powernv.h"
> >  #include "pci.h"
> > @@ -601,15 +602,149 @@ void __init pnv_pci_init(void)
> >  	/* Configure IOMMU DMA hooks */
> >  	ppc_md.pci_dma_dev_setup = pnv_pci_dma_dev_setup;
> >  	ppc_md.tce_build = pnv_tce_build;
> >  	ppc_md.tce_free = pnv_tce_free;
> >  	ppc_md.tce_get = pnv_tce_get;
> >  	ppc_md.pci_probe_mode = pnv_pci_probe_mode;
> >  	set_pci_dma_ops(&dma_iommu_ops);
> >
> >  	/* Configure MSIs */
> >  #ifdef CONFIG_PCI_MSI
> >  	ppc_md.msi_check_device = pnv_msi_check_device;
> >  	ppc_md.setup_msi_irqs = pnv_setup_msi_irqs;
> >  	ppc_md.teardown_msi_irqs = pnv_teardown_msi_irqs;  #endif  }
> > +
> > +#ifdef CONFIG_IOMMU_API
> > +/*
> > + * IOMMU groups support required by VFIO  */ static int
> > +add_device(struct device *dev) {
> > +	struct iommu_table *tbl;
> > +	int ret = 0;
> > +
> > +	if (WARN_ON(dev->iommu_group)) {
> > +		printk(KERN_WARNING "tce_vfio: device %s is already in iommu
> group %d, skipping\n",
> > +				dev->kobj.name,
> 
> dev_name(dev)
> 
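A hedged sketch of that fix, with only the helper swapped in:

	printk(KERN_WARNING "tce_vfio: device %s is already in iommu group %d, skipping\n",
			dev_name(dev),
			iommu_group_id(dev->iommu_group));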
> > +				iommu_group_id(dev->iommu_group));
> > +		return -EBUSY;
> > +	}
> > +
> > +	tbl = get_iommu_table_base(dev);
> > +	if (!tbl) {
> > +		pr_debug("tce_vfio: skipping device %s with no tbl\n",
> > +				dev->kobj.name);
> > +		return 0;
> > +	}
> > +
> > +	pr_debug("tce_vfio: adding %s to iommu group %d\n",
> > +			dev->kobj.name, iommu_group_id(tbl->it_group));
> > +
> > +	ret = iommu_group_add_device(tbl->it_group, dev);
> > +	if (ret < 0)
> > +		printk(KERN_ERR "tce_vfio: %s has not been added, ret=%d\n",
> > +				dev->kobj.name, ret);
> > +
> > +	return ret;
> > +}
> > +
> > +static void del_device(struct device *dev) {
> > +	iommu_group_remove_device(dev);
> > +}
> > +
> > +static int iommu_bus_notifier(struct notifier_block *nb,
> > +			      unsigned long action, void *data) {
> > +	struct device *dev = data;
> > +
> > +	switch (action) {
> > +	case BUS_NOTIFY_ADD_DEVICE:
> > +		return add_device(dev);
> > +	case BUS_NOTIFY_DEL_DEVICE:
> > +		del_device(dev);
> > +		return 0;
> > +	default:
> > +		return 0;
> > +	}
> > +}
> > +
> > +static struct notifier_block tce_iommu_bus_nb = {
> > +	.notifier_call = iommu_bus_notifier, };
> > +
> > +static void group_release(void *iommu_data) {
> > +	struct iommu_table *tbl = iommu_data;
> > +	tbl->it_group = NULL;
> > +}
> > +
> > +static int __init tce_iommu_init(void) {
> > +	struct pci_dev *pdev = NULL;
> > +	struct iommu_table *tbl;
> > +	struct iommu_group *grp;
> > +
> > +	bus_register_notifier(&pci_bus_type, &tce_iommu_bus_nb);
> 
> There's already a notifier in the iommu code if you were to register an
> iommu_ops with the add/remove_device entries.  That would allow you to
> remove the notifier block and notifier function below and the second loop
> below.  Are you avoiding that to avoid the rest of iommu_ops?
> 
[Sethi Varun-B16395] Could be one reason; they are also associating the iommu group with the TCE table and not with the device.
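For reference, the alternative Alex describes would look roughly like the
sketch below against the 3.7-era iommu_ops (the callback names are made up
here, and the remaining iommu_ops entries are exactly the baggage being
avoided):

	static int tce_iommu_ops_add_device(struct device *dev)
	{
		return add_device(dev);	/* reuse the logic above */
	}

	static void tce_iommu_ops_remove_device(struct device *dev)
	{
		iommu_group_remove_device(dev);
	}

	static struct iommu_ops tce_iommu_ops = {
		.add_device	= tce_iommu_ops_add_device,
		.remove_device	= tce_iommu_ops_remove_device,
		/* domain_init, attach_dev, map, unmap, ... would still be expected */
	};

	/* then, instead of bus_register_notifier(): */
	bus_set_iommu(&pci_bus_type, &tce_iommu_ops);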

> Also, shouldn't this notifier only be registered after the first loop
> below?  Otherwise ADD_DEVICE could race with setting up groups, which we
> assume are present in the add_device() above.
[Sethi Varun-B16395] Isn't this similar to how the notifier is registered in iommu_bus_init? First a notifier is registered and then we check for devices that have already been probed.
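The iommu_bus_init() pattern being referred to is, roughly (from
drivers/iommu/iommu.c of that era):

	bus_register_notifier(bus, &iommu_bus_nb);
	bus_for_each_dev(bus, NULL, ops, add_iommu_group);

i.e. the notifier is registered first and already-probed devices are swept up
afterwards, so the same ordering question applies there too.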

-Varun

^ permalink raw reply	[flat|nested] 122+ messages in thread

* Re: [PATCH] vfio powerpc: enabled and supported on powernv platform
  2012-11-22 11:56       ` Sethi Varun-B16395
@ 2012-11-23  2:02         ` Alexey Kardashevskiy
  -1 siblings, 0 replies; 122+ messages in thread
From: Alexey Kardashevskiy @ 2012-11-23  2:02 UTC (permalink / raw)
  To: Alex Williamson
  Cc: Sethi Varun-B16395, Benjamin Herrenschmidt, Paul Mackerras,
	linuxppc-dev, linux-kernel, kvm, David Gibson

On 22/11/12 22:56, Sethi Varun-B16395 wrote:
>
>
>> -----Original Message-----
>> From: linux-kernel-owner@vger.kernel.org [mailto:linux-kernel-
>> owner@vger.kernel.org] On Behalf Of Alex Williamson
>> Sent: Tuesday, November 20, 2012 11:50 PM
>> To: Alexey Kardashevskiy
>> Cc: Benjamin Herrenschmidt; Paul Mackerras; linuxppc-
>> dev@lists.ozlabs.org; linux-kernel@vger.kernel.org; kvm@vger.kernel.org;
>> David Gibson
>> Subject: Re: [PATCH] vfio powerpc: enabled and supported on powernv
>> platform
>>
>> On Tue, 2012-11-20 at 11:48 +1100, Alexey Kardashevskiy wrote:
>>> VFIO implements platform independent stuff such as a PCI driver, BAR
>>> access (via read/write on a file descriptor or direct mapping when
>>> possible) and IRQ signaling.
>>> The platform dependent part includes IOMMU initialization and
>>> handling.
>>>
>>> This patch initializes IOMMU groups based on the IOMMU configuration
>>> discovered during the PCI scan, only POWERNV platform is supported at
>>> the moment.
>>>
>>> Also the patch implements an VFIO-IOMMU driver which manages DMA
>>> mapping/unmapping requests coming from the client (now QEMU). It also
>>> returns a DMA window information to let the guest initialize the
>>> device tree for a guest OS properly. Although this driver has been
>>> tested only on POWERNV, it should work on any platform supporting TCE
>>> tables.
>>>
>>> To enable VFIO on POWER, enable SPAPR_TCE_IOMMU config option.
>>>
>>> Cc: David Gibson <david@gibson.dropbear.id.au>
>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>>> ---
>>>   arch/powerpc/include/asm/iommu.h     |    6 +
>>>   arch/powerpc/kernel/iommu.c          |  140 +++++++++++++++++++
>>>   arch/powerpc/platforms/powernv/pci.c |  135 +++++++++++++++++++
>>>   drivers/iommu/Kconfig                |    8 ++
>>>   drivers/vfio/Kconfig                 |    6 +
>>>   drivers/vfio/Makefile                |    1 +
>>>   drivers/vfio/vfio_iommu_spapr_tce.c  |  247
>> ++++++++++++++++++++++++++++++++++
>>>   include/linux/vfio.h                 |   20 +++
>>>   8 files changed, 563 insertions(+)
>>>   create mode 100644 drivers/vfio/vfio_iommu_spapr_tce.c
>>>
>>> diff --git a/arch/powerpc/include/asm/iommu.h
>>> b/arch/powerpc/include/asm/iommu.h
>>> index cbfe678..5ba66cb 100644
>>> --- a/arch/powerpc/include/asm/iommu.h
>>> +++ b/arch/powerpc/include/asm/iommu.h
>>> @@ -64,30 +64,33 @@ struct iommu_pool {  }
>>> ____cacheline_aligned_in_smp;
>>>
>>>   struct iommu_table {
>>>   	unsigned long  it_busno;     /* Bus number this table belongs to */
>>>   	unsigned long  it_size;      /* Size of iommu table in entries */
>>>   	unsigned long  it_offset;    /* Offset into global table */
>>>   	unsigned long  it_base;      /* mapped address of tce table */
>>>   	unsigned long  it_index;     /* which iommu table this is */
>>>   	unsigned long  it_type;      /* type: PCI or Virtual Bus */
>>>   	unsigned long  it_blocksize; /* Entries in each block (cacheline)
>> */
>>>   	unsigned long  poolsize;
>>>   	unsigned long  nr_pools;
>>>   	struct iommu_pool large_pool;
>>>   	struct iommu_pool pools[IOMMU_NR_POOLS];
>>>   	unsigned long *it_map;       /* A simple allocation bitmap for now
>> */
>>> +#ifdef CONFIG_IOMMU_API
>>> +	struct iommu_group *it_group;
>>> +#endif
>>>   };
>>>
>>>   struct scatterlist;
>>>
>>>   static inline void set_iommu_table_base(struct device *dev, void
>>> *base)  {
>>>   	dev->archdata.dma_data.iommu_table_base = base;  }
>>>
>>>   static inline void *get_iommu_table_base(struct device *dev)  {
>>>   	return dev->archdata.dma_data.iommu_table_base;
>>>   }
>>>
>>>   /* Frees table for an individual device node */ @@ -135,17 +138,20 @@
>>> static inline void pci_iommu_init(void) { }  extern void
>>> alloc_dart_table(void);  #if defined(CONFIG_PPC64) &&
>>> defined(CONFIG_PM)  static inline void iommu_save(void)  {
>>>   	if (ppc_md.iommu_save)
>>>   		ppc_md.iommu_save();
>>>   }
>>>
>>>   static inline void iommu_restore(void)  {
>>>   	if (ppc_md.iommu_restore)
>>>   		ppc_md.iommu_restore();
>>>   }
>>>   #endif
>>>
>>> +extern long iommu_put_tces(struct iommu_table *tbl, unsigned long
>> entry, uint64_t tce,
>>> +		enum dma_data_direction direction, unsigned long pages);
>>> +
>>>   #endif /* __KERNEL__ */
>>>   #endif /* _ASM_IOMMU_H */
>>> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
>>> index ff5a6ce..94f614b 100644
>>> --- a/arch/powerpc/kernel/iommu.c
>>> +++ b/arch/powerpc/kernel/iommu.c
>>> @@ -32,30 +32,31 @@
>>>   #include <linux/dma-mapping.h>
>>>   #include <linux/bitmap.h>
>>>   #include <linux/iommu-helper.h>
>>>   #include <linux/crash_dump.h>
>>>   #include <linux/hash.h>
>>>   #include <linux/fault-inject.h>
>>>   #include <linux/pci.h>
>>>   #include <asm/io.h>
>>>   #include <asm/prom.h>
>>>   #include <asm/iommu.h>
>>>   #include <asm/pci-bridge.h>
>>>   #include <asm/machdep.h>
>>>   #include <asm/kdump.h>
>>>   #include <asm/fadump.h>
>>>   #include <asm/vio.h>
>>> +#include <asm/tce.h>
>>>
>>>   #define DBG(...)
>>>
>>>   static int novmerge;
>>>
>>>   static void __iommu_free(struct iommu_table *, dma_addr_t, unsigned
>>> int);
>>>
>>>   static int __init setup_iommu(char *str)  {
>>>   	if (!strcmp(str, "novmerge"))
>>>   		novmerge = 1;
>>>   	else if (!strcmp(str, "vmerge"))
>>>   		novmerge = 0;
>>>   	return 1;
>>>   }
>>> @@ -844,15 +845,154 @@ void *iommu_alloc_coherent(struct device *dev,
>>> struct iommu_table *tbl,  }
>>>
>>>   void iommu_free_coherent(struct iommu_table *tbl, size_t size,
>>>   			 void *vaddr, dma_addr_t dma_handle)  {
>>>   	if (tbl) {
>>>   		unsigned int nio_pages;
>>>
>>>   		size = PAGE_ALIGN(size);
>>>   		nio_pages = size >> IOMMU_PAGE_SHIFT;
>>>   		iommu_free(tbl, dma_handle, nio_pages);
>>>   		size = PAGE_ALIGN(size);
>>>   		free_pages((unsigned long)vaddr, get_order(size));
>>>   	}
>>>   }
>>> +
>>> +#ifdef CONFIG_IOMMU_API
>>> +/*
>>> + * SPAPR TCE API
>>> + */
>>> +static struct page *free_tce(struct iommu_table *tbl, unsigned long
>>> +entry) {
>>> +	struct page *page = NULL;
>>
>> NULL initialization doesn't appear to be necessary
>>
>>> +	unsigned long oldtce;
>>> +
>>> +	oldtce = ppc_md.tce_get(tbl, entry);
>>> +
>>> +	if (!(oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)))
>>> +		return NULL;
>>> +
>>> +	page = pfn_to_page(oldtce >> PAGE_SHIFT);
>>> +
>>> +	WARN_ON(!page);
>>> +	if (page && (oldtce & TCE_PCI_WRITE))
>>> +		SetPageDirty(page);
>>> +	ppc_md.tce_free(tbl, entry, 1);
>>> +
>>> +	return page;
>>> +}
>>> +
>>> +static int put_tce(struct iommu_table *tbl, unsigned long entry,
>>> +		uint64_t tce, enum dma_data_direction direction) {
>>> +	int ret;
>>> +	struct page *page = NULL;
>>> +	unsigned long kva, offset;
>>> +
>>> +	/* Map new TCE */
>>> +	offset = (tce & IOMMU_PAGE_MASK) - (tce & PAGE_MASK);
>>> +	ret = get_user_pages_fast(tce & PAGE_MASK, 1,
>>> +			direction != DMA_TO_DEVICE, &page);
>>> +	if (ret < 1) {
>>> +		printk(KERN_ERR "tce_vfio: get_user_pages_fast failed
>> tce=%llx ioba=%lx ret=%d\n",
>>> +				tce, entry << IOMMU_PAGE_SHIFT, ret);
>>> +		if (!ret)
>>> +			ret = -EFAULT;
>>
>> Missing return ret?  Otherwise we've got some bogus uses of page below
>> and we're setting ret for no reason here.
>>
>>> +	}
>>> +
>>> +	kva = (unsigned long) page_address(page);
>>> +	kva += offset;
>>> +
>>> +	/* tce_build receives a virtual address */
>>> +	entry += tbl->it_offset; /* Offset into real TCE table */
>>> +	ret = ppc_md.tce_build(tbl, entry, 1, kva, direction, NULL);
>>> +
>>> +	/* tce_build() only returns non-zero for transient errors */
>>> +	if (unlikely(ret)) {
>>> +		printk(KERN_ERR "tce_vfio: tce_put failed on tce=%llx
>> ioba=%lx kva=%lx ret=%d\n",
>>> +				tce, entry << IOMMU_PAGE_SHIFT, kva, ret);
>>> +		put_page(page);
>>> +		return -EIO;
>>> +	}
>>> +
>>> +	return 0;
>>> +}
>>> +
>>> +static void tce_flush(struct iommu_table *tbl) {
>>> +	/* Flush/invalidate TLB caches if necessary */
>>> +	if (ppc_md.tce_flush)
>>> +		ppc_md.tce_flush(tbl);
>>> +
>>> +	/* Make sure updates are seen by hardware */
>>> +	mb();
>>> +}
>>> +
>>> +long iommu_put_tces(struct iommu_table *tbl, unsigned long entry,
>> uint64_t tce,
>>> +		enum dma_data_direction direction, unsigned long pages) {
>>> +	int i, ret = 0, pages_to_put = 0;
>>> +	struct page *page;
>>> +	struct iommu_pool *pool = get_pool(tbl, entry);
>>> +	struct page **oldpages;
>>> +	const int oldpagesnum = PAGE_SIZE/sizeof(*oldpages);
>>> +
>>> +	BUILD_BUG_ON(PAGE_SIZE < IOMMU_PAGE_SIZE);
>>> +
>>> +	/* Handle a single page request without allocation
>>> +	   of pages-to-release array */
>>> +	if (pages == 1) {
>>> +		spin_lock(&(pool->lock));
>>> +		page = free_tce(tbl, entry);
>>> +
>>> +		if (direction != DMA_NONE)
>>> +			ret = put_tce(tbl, entry, tce, direction);
>>> +
>>> +		tce_flush(tbl);
>>> +
>>> +		if (page)
>>> +			put_page(page);
>>> +
>>> +		spin_unlock(&(pool->lock));
>>> +		return ret;
>>> +	}
>>> +
>>> +	/* Releasing multiple pages */
>>> +	/* Allocate an array for pages to be released after TCE table
>>> +	   is updated */
>>> +	oldpages = kmalloc(PAGE_SIZE, GFP_KERNEL);
>>> +	if (!oldpages)
>>> +		return -ENOMEM;
>>> +
>>> +	spin_lock(&(pool->lock));
>>> +
>>> +	for (i = 0; (i < pages) && !ret; ++i, ++entry, tce +=
>> IOMMU_PAGE_SIZE) {
>>> +		page = free_tce(tbl, entry);
>>> +		if (page) {
>>> +			oldpages[pages_to_put] = page;
>>> +			++pages_to_put;
>>> +		}
>>> +
>>> +		if (direction != DMA_NONE)
>>> +			ret = put_tce(tbl, entry, tce, direction);
>>> +
>>> +		/* Release old pages if we reached the end of oldpages[] or
>>> +		   it is the last page or we are about to exit the loop */
>>> +		if ((pages_to_put == oldpagesnum) || (i == pages - 1) || ret)
>> {
>>> +			tce_flush(tbl);
>>
>> Avoiding tce_flush() is the reason for all this extra overhead, right?
>> I wonder if it'd be cleaner separating map vs unmap, where the map case
>> can avoid the oldpages array... but that means inserting new mappings on
>> top of old ones wouldn't put the pages.


Yes, we do not want to lose pages if the guest forgot to unmap them.


>>> +
>>> +			/* Release pages after removing them from TCE table */
>>> +			while (pages_to_put) {
>>> +				--pages_to_put;
>>> +				put_page(oldpages[pages_to_put]);
>>> +			}
>>> +		}
>>> +	}
>>> +
>>> +	spin_unlock(&(pool->lock));
>>> +	kfree(oldpages);
>>> +
>>> +	return ret;
>>> +}
>>> +EXPORT_SYMBOL_GPL(iommu_put_tces);
>>> +#endif /* CONFIG_IOMMU_API */
>>> diff --git a/arch/powerpc/platforms/powernv/pci.c
>>> b/arch/powerpc/platforms/powernv/pci.c
>>> index 05205cf..676f4d9 100644
>>> --- a/arch/powerpc/platforms/powernv/pci.c
>>> +++ b/arch/powerpc/platforms/powernv/pci.c
>>> @@ -8,30 +8,31 @@
>>>    * This program is free software; you can redistribute it and/or
>>>    * modify it under the terms of the GNU General Public License
>>>    * as published by the Free Software Foundation; either version
>>>    * 2 of the License, or (at your option) any later version.
>>>    */
>>>
>>>   #include <linux/kernel.h>
>>>   #include <linux/pci.h>
>>>   #include <linux/delay.h>
>>>   #include <linux/string.h>
>>>   #include <linux/init.h>
>>>   #include <linux/bootmem.h>
>>>   #include <linux/irq.h>
>>>   #include <linux/io.h>
>>>   #include <linux/msi.h>
>>> +#include <linux/iommu.h>
>>>
>>>   #include <asm/sections.h>
>>>   #include <asm/io.h>
>>>   #include <asm/prom.h>
>>>   #include <asm/pci-bridge.h>
>>>   #include <asm/machdep.h>
>>>   #include <asm/ppc-pci.h>
>>>   #include <asm/opal.h>
>>>   #include <asm/iommu.h>
>>>   #include <asm/tce.h>
>>>   #include <asm/abs_addr.h>
>>>   #include <asm/firmware.h>
>>>
>>>   #include "powernv.h"
>>>   #include "pci.h"
>>> @@ -601,15 +602,149 @@ void __init pnv_pci_init(void)
>>>   	/* Configure IOMMU DMA hooks */
>>>   	ppc_md.pci_dma_dev_setup = pnv_pci_dma_dev_setup;
>>>   	ppc_md.tce_build = pnv_tce_build;
>>>   	ppc_md.tce_free = pnv_tce_free;
>>>   	ppc_md.tce_get = pnv_tce_get;
>>>   	ppc_md.pci_probe_mode = pnv_pci_probe_mode;
>>>   	set_pci_dma_ops(&dma_iommu_ops);
>>>
>>>   	/* Configure MSIs */
>>>   #ifdef CONFIG_PCI_MSI
>>>   	ppc_md.msi_check_device = pnv_msi_check_device;
>>>   	ppc_md.setup_msi_irqs = pnv_setup_msi_irqs;
>>>   	ppc_md.teardown_msi_irqs = pnv_teardown_msi_irqs;  #endif  }
>>> +
>>> +#ifdef CONFIG_IOMMU_API
>>> +/*
>>> + * IOMMU groups support required by VFIO  */ static int
>>> +add_device(struct device *dev) {
>>> +	struct iommu_table *tbl;
>>> +	int ret = 0;
>>> +
>>> +	if (WARN_ON(dev->iommu_group)) {
>>> +		printk(KERN_WARNING "tce_vfio: device %s is already in iommu
>> group %d, skipping\n",
>>> +				dev->kobj.name,
>>
>> dev_name(dev)
>>
>>> +				iommu_group_id(dev->iommu_group));
>>> +		return -EBUSY;
>>> +	}
>>> +
>>> +	tbl = get_iommu_table_base(dev);
>>> +	if (!tbl) {
>>> +		pr_debug("tce_vfio: skipping device %s with no tbl\n",
>>> +				dev->kobj.name);
>>> +		return 0;
>>> +	}
>>> +
>>> +	pr_debug("tce_vfio: adding %s to iommu group %d\n",
>>> +			dev->kobj.name, iommu_group_id(tbl->it_group));
>>> +
>>> +	ret = iommu_group_add_device(tbl->it_group, dev);
>>> +	if (ret < 0)
>>> +		printk(KERN_ERR "tce_vfio: %s has not been added, ret=%d\n",
>>> +				dev->kobj.name, ret);
>>> +
>>> +	return ret;
>>> +}
>>> +
>>> +static void del_device(struct device *dev) {
>>> +	iommu_group_remove_device(dev);
>>> +}
>>> +
>>> +static int iommu_bus_notifier(struct notifier_block *nb,
>>> +			      unsigned long action, void *data) {
>>> +	struct device *dev = data;
>>> +
>>> +	switch (action) {
>>> +	case BUS_NOTIFY_ADD_DEVICE:
>>> +		return add_device(dev);
>>> +	case BUS_NOTIFY_DEL_DEVICE:
>>> +		del_device(dev);
>>> +		return 0;
>>> +	default:
>>> +		return 0;
>>> +	}
>>> +}
>>> +
>>> +static struct notifier_block tce_iommu_bus_nb = {
>>> +	.notifier_call = iommu_bus_notifier, };
>>> +
>>> +static void group_release(void *iommu_data) {
>>> +	struct iommu_table *tbl = iommu_data;
>>> +	tbl->it_group = NULL;
>>> +}
>>> +
>>> +static int __init tce_iommu_init(void) {
>>> +	struct pci_dev *pdev = NULL;
>>> +	struct iommu_table *tbl;
>>> +	struct iommu_group *grp;
>>> +
>>> +	bus_register_notifier(&pci_bus_type, &tce_iommu_bus_nb);
>>
>> There's already a notifier in the iommu code if you were to register an
>> iommu_ops with the add/remove_device entries.  That would allow you to
>> remove the notifier block and notifier function below and the second loop
>> below.  Are you avoiding that to avoid the rest of iommu_ops?

Yes. I need to implement either a small part of iommu_ops (especially the 
part which I think should not be there at all) or a notifier; I cannot see 
how the first is simpler.


> [Sethi Varun-B16395] Could be one reason; they are also associating the iommu group with the TCE table and not with the device.

Also true.
I would actually allocate IOMMU groups right after a new table is discovered, 
but that happens during the PCI scan, which runs before 
subsys_initcall(iommu_init) is called, so I added this first loop.
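That first loop presumably looks something like the sketch below, reconstructed
from the variables declared in tce_iommu_init() above (not quoted from the patch):

	for_each_pci_dev(pdev) {
		tbl = get_iommu_table_base(&pdev->dev);
		if (!tbl || tbl->it_group)
			continue;

		grp = iommu_group_alloc();
		if (IS_ERR(grp))
			return PTR_ERR(grp);

		/* tie the group's lifetime to the table, see group_release() */
		tbl->it_group = grp;
		iommu_group_set_iommudata(grp, tbl, group_release);
	}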


>> Also, shouldn't this notifier only be registered after the first loop
>> below?  Otherwise ADD_DEVICE could race with setting up groups, which we
>> assume are present in the add_device() above.
> [Sethi Varun-B16395] Isn't this similar to how the notifier is registered in iommu_bus_init? First a notifier is registered and then we check for devices that have already been probed.

Yep. Not very familiar with this stuff but if it is done one way and it is 
already upstream, I cannot see why I should go another way :)
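
For reference, the ordering pattern under discussion (register the bus
notifier first, then walk the devices that were already probed) can be
condensed to the sketch below. This is an illustration only; it assumes
the add_device() helper and the tce_iommu_bus_nb notifier block from the
patch, and the _sketch name is made up:

/* Minimal sketch of the register-then-scan pattern, assuming the
 * add_device() helper and tce_iommu_bus_nb from the patch above. */
static int __init tce_iommu_init_sketch(void)
{
	struct pci_dev *pdev = NULL;

	/* From here on, hotplug add/remove events reach the notifier... */
	bus_register_notifier(&pci_bus_type, &tce_iommu_bus_nb);

	/* ...while devices probed before this initcall are added here. */
	for_each_pci_dev(pdev)
		add_device(&pdev->dev);

	return 0;
}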



-- 
Alexey

^ permalink raw reply	[flat|nested] 122+ messages in thread

* [PATCH 0/2] vfio powerpc: implemented and enabled
  2012-11-20 18:19     ` Alex Williamson
@ 2012-11-23  9:03       ` Alexey Kardashevskiy
  -1 siblings, 0 replies; 122+ messages in thread
From: Alexey Kardashevskiy @ 2012-11-23  9:03 UTC (permalink / raw)
  To: Alex Williamson
  Cc: Alexey Kardashevskiy, Benjamin Herrenschmidt, Paul Mackerras,
	David Gibson, linuxppc-dev, linux-kernel, kvm

The series includes the IOMMU implementation and the necessary IOMMU groups initialization.

Alexey Kardashevskiy (2):
  vfio powerpc: implemented IOMMU driver for VFIO
  vfio powerpc: enabled on powernv platform

 arch/powerpc/include/asm/iommu.h     |    6 +
 arch/powerpc/kernel/iommu.c          |  141 +++++++++++++++++++
 arch/powerpc/platforms/powernv/pci.c |  135 +++++++++++++++++++
 drivers/iommu/Kconfig                |    8 ++
 drivers/vfio/Kconfig                 |    6 +
 drivers/vfio/Makefile                |    1 +
 drivers/vfio/vfio_iommu_spapr_tce.c  |  247 ++++++++++++++++++++++++++++++++++
 include/linux/vfio.h                 |   20 +++
 8 files changed, 564 insertions(+)
 create mode 100644 drivers/vfio/vfio_iommu_spapr_tce.c

-- 
1.7.10.4


^ permalink raw reply	[flat|nested] 122+ messages in thread

* [PATCH 1/2] vfio powerpc: implemented IOMMU driver for VFIO
  2012-11-23  9:03       ` Alexey Kardashevskiy
@ 2012-11-23  9:03         ` Alexey Kardashevskiy
  -1 siblings, 0 replies; 122+ messages in thread
From: Alexey Kardashevskiy @ 2012-11-23  9:03 UTC (permalink / raw)
  To: Alex Williamson
  Cc: Alexey Kardashevskiy, Benjamin Herrenschmidt, Paul Mackerras,
	David Gibson, linuxppc-dev, linux-kernel, kvm

VFIO implements platform independent stuff such as
a PCI driver, BAR access (via read/write on a file descriptor
or direct mapping when possible) and IRQ signaling.

The platform dependent part includes IOMMU initialization
and handling. This patch implements an IOMMU driver for VFIO
which maps and unmaps pages for guest IO and provides
information about the DMA window (required by a POWERPC
guest).

The counterpart in QEMU is required to support this functionality.
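
To make the new interface concrete, here is a minimal userspace sketch of
how a client such as QEMU might query the DMA window and map one page
through this driver. Only the SPAPR TCE ioctls and structures come from
this patch; the container/group ioctls are the existing VFIO core API.
The group number, the anonymous buffer and the omitted error handling are
illustrative assumptions:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/vfio.h>

int main(void)
{
	int container = open("/dev/vfio/vfio", O_RDWR);
	int group = open("/dev/vfio/26", O_RDWR);  /* group id is an assumption */
	struct vfio_iommu_spapr_tce_info info;
	vfio_iommu_spapr_tce_dma_map map;
	void *buf = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	/* Bind the group to the container, pick the SPAPR TCE backend */
	ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
	ioctl(container, VFIO_SET_IOMMU, VFIO_SPAPR_TCE_IOMMU);

	/* Ask the driver for the 32-bit DMA window */
	memset(&info, 0, sizeof(info));
	info.argsz = sizeof(info);
	ioctl(container, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);
	printf("dma32 window: %x +%x\n", info.dma32_window_start,
			info.dma32_window_size);

	/* Map one page read/write at the start of the window */
	memset(&map, 0, sizeof(map));
	map.argsz = sizeof(map);
	map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
	map.vaddr = (unsigned long)buf;
	map.iova = info.dma32_window_start;
	map.size = 4096;
	ioctl(container, VFIO_IOMMU_MAP_DMA, &map);

	return 0;
}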

Cc: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
 drivers/vfio/Kconfig                |    6 +
 drivers/vfio/Makefile               |    1 +
 drivers/vfio/vfio_iommu_spapr_tce.c |  247 +++++++++++++++++++++++++++++++++++
 include/linux/vfio.h                |   20 +++
 4 files changed, 274 insertions(+)
 create mode 100644 drivers/vfio/vfio_iommu_spapr_tce.c

diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
index 7cd5dec..b464687 100644
--- a/drivers/vfio/Kconfig
+++ b/drivers/vfio/Kconfig
@@ -3,10 +3,16 @@ config VFIO_IOMMU_TYPE1
 	depends on VFIO
 	default n
 
+config VFIO_IOMMU_SPAPR_TCE
+	tristate
+	depends on VFIO && SPAPR_TCE_IOMMU
+	default n
+
 menuconfig VFIO
 	tristate "VFIO Non-Privileged userspace driver framework"
 	depends on IOMMU_API
 	select VFIO_IOMMU_TYPE1 if X86
+	select VFIO_IOMMU_SPAPR_TCE if PPC_POWERNV
 	help
 	  VFIO provides a framework for secure userspace device drivers.
 	  See Documentation/vfio.txt for more details.
diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
index 2398d4a..72bfabc 100644
--- a/drivers/vfio/Makefile
+++ b/drivers/vfio/Makefile
@@ -1,3 +1,4 @@
 obj-$(CONFIG_VFIO) += vfio.o
 obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o
+obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o
 obj-$(CONFIG_VFIO_PCI) += pci/
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
new file mode 100644
index 0000000..46a6298
--- /dev/null
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -0,0 +1,247 @@
+/*
+ * VFIO: IOMMU DMA mapping support for TCE on POWER
+ *
+ * Copyright (C) 2012 IBM Corp.  All rights reserved.
+ *     Author: Alexey Kardashevskiy <aik@ozlabs.ru>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Derived from original vfio_iommu_type1.c:
+ * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
+ *     Author: Alex Williamson <alex.williamson@redhat.com>
+ */
+
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/err.h>
+#include <linux/vfio.h>
+#include <asm/iommu.h>
+
+#define DRIVER_VERSION  "0.1"
+#define DRIVER_AUTHOR   "aik@ozlabs.ru"
+#define DRIVER_DESC     "VFIO IOMMU SPAPR TCE"
+
+static void tce_iommu_detach_group(void *iommu_data,
+		struct iommu_group *iommu_group);
+
+/*
+ * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
+ */
+
+/*
+ * The container descriptor supports only a single group per container.
+ * Required by the API as the container is not supplied with the IOMMU group
+ * at the moment of initialization.
+ */
+struct tce_container {
+	struct mutex lock;
+	struct iommu_table *tbl;
+};
+
+static void *tce_iommu_open(unsigned long arg)
+{
+	struct tce_container *container;
+
+	if (arg != VFIO_SPAPR_TCE_IOMMU) {
+		printk(KERN_ERR "tce_vfio: Wrong IOMMU type\n");
+		return ERR_PTR(-EINVAL);
+	}
+
+	container = kzalloc(sizeof(*container), GFP_KERNEL);
+	if (!container)
+		return ERR_PTR(-ENOMEM);
+
+	mutex_init(&container->lock);
+
+	return container;
+}
+
+static void tce_iommu_release(void *iommu_data)
+{
+	struct tce_container *container = iommu_data;
+
+	WARN_ON(container->tbl && !container->tbl->it_group);
+	if (container->tbl && container->tbl->it_group)
+		tce_iommu_detach_group(iommu_data, container->tbl->it_group);
+
+	mutex_destroy(&container->lock);
+
+	kfree(container);
+}
+
+static long tce_iommu_ioctl(void *iommu_data,
+				 unsigned int cmd, unsigned long arg)
+{
+	struct tce_container *container = iommu_data;
+	unsigned long minsz;
+
+	switch (cmd) {
+	case VFIO_CHECK_EXTENSION: {
+		return (arg == VFIO_SPAPR_TCE_IOMMU) ? 1 : 0;
+	}
+	case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
+		struct vfio_iommu_spapr_tce_info info;
+		struct iommu_table *tbl = container->tbl;
+
+		if (WARN_ON(!tbl))
+			return -ENXIO;
+
+		minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
+				dma64_window_size);
+
+		if (copy_from_user(&info, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (info.argsz < minsz)
+			return -EINVAL;
+
+		info.dma32_window_start = tbl->it_offset << IOMMU_PAGE_SHIFT;
+		info.dma32_window_size = tbl->it_size << IOMMU_PAGE_SHIFT;
+		info.dma64_window_start = 0;
+		info.dma64_window_size = 0;
+		info.flags = 0;
+
+		if (copy_to_user((void __user *)arg, &info, minsz))
+			return -EFAULT;
+
+		return 0;
+	}
+	case VFIO_IOMMU_MAP_DMA: {
+		vfio_iommu_spapr_tce_dma_map param;
+		struct iommu_table *tbl = container->tbl;
+		enum dma_data_direction direction = DMA_NONE;
+
+		if (WARN_ON(!tbl))
+			return -ENXIO;
+
+		minsz = offsetofend(vfio_iommu_spapr_tce_dma_map, size);
+
+		if (copy_from_user(&param, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (param.argsz < minsz)
+			return -EINVAL;
+
+		if ((param.flags & VFIO_DMA_MAP_FLAG_READ) &&
+				(param.flags & VFIO_DMA_MAP_FLAG_WRITE)) {
+			direction = DMA_BIDIRECTIONAL;
+		} else if (param.flags & VFIO_DMA_MAP_FLAG_READ) {
+			direction = DMA_TO_DEVICE;
+		} else if (param.flags & VFIO_DMA_MAP_FLAG_WRITE) {
+			direction = DMA_FROM_DEVICE;
+		}
+
+		param.size += param.iova & ~IOMMU_PAGE_MASK;
+		param.size = _ALIGN_UP(param.size, IOMMU_PAGE_SIZE);
+
+		return iommu_put_tces(tbl, param.iova >> IOMMU_PAGE_SHIFT,
+				param.vaddr & IOMMU_PAGE_MASK, direction,
+				param.size >> IOMMU_PAGE_SHIFT);
+	}
+	case VFIO_IOMMU_UNMAP_DMA: {
+		vfio_iommu_spapr_tce_dma_unmap param;
+		struct iommu_table *tbl = container->tbl;
+
+		if (WARN_ON(!tbl))
+			return -ENXIO;
+
+		minsz = offsetofend(vfio_iommu_spapr_tce_dma_unmap, size);
+
+		if (copy_from_user(&param, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (param.argsz < minsz)
+			return -EINVAL;
+
+		param.size += param.iova & ~IOMMU_PAGE_MASK;
+		param.size = _ALIGN_UP(param.size, IOMMU_PAGE_SIZE);
+
+		return iommu_put_tces(tbl, param.iova >> IOMMU_PAGE_SHIFT,
+				0, DMA_NONE, param.size >> IOMMU_PAGE_SHIFT);
+	}
+	default:
+		printk(KERN_WARNING "tce_vfio: unexpected cmd %x\n", cmd);
+	}
+
+	return -ENOTTY;
+}
+
+static int tce_iommu_attach_group(void *iommu_data,
+		struct iommu_group *iommu_group)
+{
+	struct tce_container *container = iommu_data;
+	struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
+
+	BUG_ON(!tbl);
+	mutex_lock(&container->lock);
+	pr_debug("tce_vfio: Attaching group #%u to iommu %p\n",
+			iommu_group_id(iommu_group), iommu_group);
+	if (container->tbl) {
+		printk(KERN_WARNING "tce_vfio: Only one group per IOMMU container is allowed, existing id=%d, attaching id=%d\n",
+				iommu_group_id(container->tbl->it_group),
+				iommu_group_id(iommu_group));
+		mutex_unlock(&container->lock);
+		return -EBUSY;
+	}
+
+	container->tbl = tbl;
+	mutex_unlock(&container->lock);
+
+	return 0;
+}
+
+static void tce_iommu_detach_group(void *iommu_data,
+		struct iommu_group *iommu_group)
+{
+	struct tce_container *container = iommu_data;
+	struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
+
+	BUG_ON(!tbl);
+	mutex_lock(&container->lock);
+	if (tbl != container->tbl) {
+		printk(KERN_WARNING "tce_vfio: detaching group #%u, expected group is #%u\n",
+				iommu_group_id(iommu_group),
+				iommu_group_id(tbl->it_group));
+	} else {
+
+		pr_debug("tce_vfio: detaching group #%u from iommu %p\n",
+				iommu_group_id(iommu_group), iommu_group);
+
+		iommu_put_tces(tbl, tbl->it_offset, 0, DMA_NONE, tbl->it_size);
+		container->tbl = NULL;
+	}
+	mutex_unlock(&container->lock);
+}
+
+const struct vfio_iommu_driver_ops tce_iommu_driver_ops = {
+	.name		= "iommu-vfio-powerpc",
+	.owner		= THIS_MODULE,
+	.open		= tce_iommu_open,
+	.release	= tce_iommu_release,
+	.ioctl		= tce_iommu_ioctl,
+	.attach_group	= tce_iommu_attach_group,
+	.detach_group	= tce_iommu_detach_group,
+};
+
+static int __init tce_iommu_init(void)
+{
+	return vfio_register_iommu_driver(&tce_iommu_driver_ops);
+}
+
+static void __exit tce_iommu_cleanup(void)
+{
+	vfio_unregister_iommu_driver(&tce_iommu_driver_ops);
+}
+
+module_init(tce_iommu_init);
+module_exit(tce_iommu_cleanup);
+
+MODULE_VERSION(DRIVER_VERSION);
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR(DRIVER_AUTHOR);
+MODULE_DESCRIPTION(DRIVER_DESC);
+
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index 0a4f180..3ecd65c 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -99,6 +99,7 @@ extern void vfio_unregister_iommu_driver(
 /* Extensions */
 
 #define VFIO_TYPE1_IOMMU		1
+#define VFIO_SPAPR_TCE_IOMMU		2
 
 /*
  * The IOCTL interface is designed for extensibility by embedding the
@@ -442,4 +443,23 @@ struct vfio_iommu_type1_dma_unmap {
 
 #define VFIO_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14)
 
+/* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */
+
+struct vfio_iommu_spapr_tce_info {
+	__u32 argsz;
+	__u32 flags;
+	__u32 dma32_window_start;
+	__u32 dma32_window_size;
+	__u64 dma64_window_start;
+	__u64 dma64_window_size;
+};
+
+#define VFIO_IOMMU_SPAPR_TCE_GET_INFO	_IO(VFIO_TYPE, VFIO_BASE + 12)
+
+/* Reuse type1 map/unmap structs as they are the same at the moment */
+typedef struct vfio_iommu_type1_dma_map vfio_iommu_spapr_tce_dma_map;
+typedef struct vfio_iommu_type1_dma_unmap vfio_iommu_spapr_tce_dma_unmap;
+
+/* ***************************************************************** */
+
 #endif /* VFIO_H */
-- 
1.7.10.4


^ permalink raw reply related	[flat|nested] 122+ messages in thread

* [PATCH 2/2] vfio powerpc: enabled on powernv platform
  2012-11-23  9:03       ` Alexey Kardashevskiy
@ 2012-11-23  9:03         ` Alexey Kardashevskiy
  -1 siblings, 0 replies; 122+ messages in thread
From: Alexey Kardashevskiy @ 2012-11-23  9:03 UTC (permalink / raw)
  To: Alex Williamson
  Cc: Alexey Kardashevskiy, Benjamin Herrenschmidt, Paul Mackerras,
	David Gibson, linuxppc-dev, linux-kernel, kvm

This patch initializes IOMMU groups based on the IOMMU
configuration discovered during the PCI scan on the POWERNV
(POWER non-virtualized) platform. The IOMMU groups are
to be used later by the VFIO driver (PCI pass-through).

It also implements an API for mapping/unmapping pages for
guest PCI drivers and providing DMA window properties.
This API is going to be used later by QEMU-VFIO to handle
h_put_tce hypercalls from the KVM guest.

Although this driver has been tested only on the POWERNV
platform, it should work on any platform which supports
TCE tables.

To enable VFIO on POWER, enable SPAPR_TCE_IOMMU config
option and configure VFIO as required.
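
As an aside on the window arithmetic this API relies on: a request is
rounded out to whole IOMMU pages before being turned into a range of TCE
entries, exactly as the map/unmap ioctl handlers below do. A standalone
sketch, assuming 4K IOMMU pages; the helper name and the local
IOMMU_PAGE_* constants are illustrative stand-ins for the kernel macros:

#include <stdio.h>

#define IOMMU_PAGE_SHIFT	12
#define IOMMU_PAGE_SIZE		(1UL << IOMMU_PAGE_SHIFT)
#define IOMMU_PAGE_MASK		(~(IOMMU_PAGE_SIZE - 1))

/* Round (iova, size) out to whole IOMMU pages and return the number of
 * TCE entries covered; *entry gets the index of the first one. */
static unsigned long iova_to_tce_range(unsigned long iova,
		unsigned long size, unsigned long *entry)
{
	size += iova & ~IOMMU_PAGE_MASK;		/* head fragment */
	size = (size + IOMMU_PAGE_SIZE - 1) & IOMMU_PAGE_MASK; /* align up */
	*entry = iova >> IOMMU_PAGE_SHIFT;
	return size >> IOMMU_PAGE_SHIFT;
}

int main(void)
{
	unsigned long entry;
	unsigned long n = iova_to_tce_range(0x1ffe, 0x3000, &entry);

	printf("entry=%lu npages=%lu\n", entry, n);	/* entry=1 npages=4 */
	return 0;
}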

Cc: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
 arch/powerpc/include/asm/iommu.h     |    6 ++
 arch/powerpc/kernel/iommu.c          |  141 ++++++++++++++++++++++++++++++++++
 arch/powerpc/platforms/powernv/pci.c |  135 ++++++++++++++++++++++++++++++++
 drivers/iommu/Kconfig                |    8 ++
 4 files changed, 290 insertions(+)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index cbfe678..5ba66cb 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -76,6 +76,9 @@ struct iommu_table {
 	struct iommu_pool large_pool;
 	struct iommu_pool pools[IOMMU_NR_POOLS];
 	unsigned long *it_map;       /* A simple allocation bitmap for now */
+#ifdef CONFIG_IOMMU_API
+	struct iommu_group *it_group;
+#endif
 };
 
 struct scatterlist;
@@ -147,5 +150,8 @@ static inline void iommu_restore(void)
 }
 #endif
 
+extern long iommu_put_tces(struct iommu_table *tbl, unsigned long entry, uint64_t tce,
+		enum dma_data_direction direction, unsigned long pages);
+
 #endif /* __KERNEL__ */
 #endif /* _ASM_IOMMU_H */
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index ff5a6ce..c8dad1f 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -44,6 +44,7 @@
 #include <asm/kdump.h>
 #include <asm/fadump.h>
 #include <asm/vio.h>
+#include <asm/tce.h>
 
 #define DBG(...)
 
@@ -856,3 +857,143 @@ void iommu_free_coherent(struct iommu_table *tbl, size_t size,
 		free_pages((unsigned long)vaddr, get_order(size));
 	}
 }
+
+#ifdef CONFIG_IOMMU_API
+/*
+ * SPAPR TCE API
+ */
+static struct page *free_tce(struct iommu_table *tbl, unsigned long entry)
+{
+	struct page *page;
+	unsigned long oldtce;
+
+	oldtce = ppc_md.tce_get(tbl, entry);
+
+	if (!(oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)))
+		return NULL;
+
+	page = pfn_to_page(oldtce >> PAGE_SHIFT);
+
+	WARN_ON(!page);
+	if (page && (oldtce & TCE_PCI_WRITE))
+		SetPageDirty(page);
+	ppc_md.tce_free(tbl, entry, 1);
+
+	return page;
+}
+
+static int put_tce(struct iommu_table *tbl, unsigned long entry,
+		uint64_t tce, enum dma_data_direction direction)
+{
+	int ret;
+	struct page *page = NULL;
+	unsigned long kva, offset;
+
+	/* Map new TCE */
+	offset = (tce & IOMMU_PAGE_MASK) - (tce & PAGE_MASK);
+	ret = get_user_pages_fast(tce & PAGE_MASK, 1,
+			direction != DMA_TO_DEVICE, &page);
+	if (ret < 1) {
+		printk(KERN_ERR "tce_vfio: get_user_pages_fast failed tce=%llx ioba=%lx ret=%d\n",
+				tce, entry << IOMMU_PAGE_SHIFT, ret);
+		if (!ret)
+			ret = -EFAULT;
+		return ret;
+	}
+
+	kva = (unsigned long) page_address(page);
+	kva += offset;
+
+	/* tce_build receives a virtual address */
+	entry += tbl->it_offset; /* Offset into real TCE table */
+	ret = ppc_md.tce_build(tbl, entry, 1, kva, direction, NULL);
+
+	/* tce_build() only returns non-zero for transient errors */
+	if (unlikely(ret)) {
+		printk(KERN_ERR "tce_vfio: tce_put failed on tce=%llx ioba=%lx kva=%lx ret=%d\n",
+				tce, entry << IOMMU_PAGE_SHIFT, kva, ret);
+		put_page(page);
+		return -EIO;
+	}
+
+	return 0;
+}
+
+static void tce_flush(struct iommu_table *tbl)
+{
+	/* Flush/invalidate TLB caches if necessary */
+	if (ppc_md.tce_flush)
+		ppc_md.tce_flush(tbl);
+
+	/* Make sure updates are seen by hardware */
+	mb();
+}
+
+long iommu_put_tces(struct iommu_table *tbl, unsigned long entry, uint64_t tce,
+		enum dma_data_direction direction, unsigned long pages)
+{
+	int i, ret = 0, pages_to_put = 0;
+	struct page *page;
+	struct iommu_pool *pool = get_pool(tbl, entry);
+	struct page **oldpages;
+	const int oldpagesnum = PAGE_SIZE/sizeof(*oldpages);
+
+	BUILD_BUG_ON(PAGE_SIZE < IOMMU_PAGE_SIZE);
+
+	/* Handle a single page request without allocation
+	   of pages-to-release array */
+	if (pages == 1) {
+		spin_lock(&(pool->lock));
+		page = free_tce(tbl, entry);
+
+		if (direction != DMA_NONE)
+			ret = put_tce(tbl, entry, tce, direction);
+
+		tce_flush(tbl);
+
+		if (page)
+			put_page(page);
+
+		spin_unlock(&(pool->lock));
+		return ret;
+	}
+
+	/* Releasing multiple pages */
+	/* Allocate an array for pages to be released after TCE table
+	   is updated */
+	oldpages = kmalloc(PAGE_SIZE, GFP_KERNEL);
+	if (!oldpages)
+		return -ENOMEM;
+
+	spin_lock(&(pool->lock));
+
+	for (i = 0; (i < pages) && !ret; ++i, ++entry, tce += IOMMU_PAGE_SIZE) {
+		page = free_tce(tbl, entry);
+		if (page) {
+			oldpages[pages_to_put] = page;
+			++pages_to_put;
+		}
+
+		if (direction != DMA_NONE)
+			ret = put_tce(tbl, entry, tce, direction);
+
+		/* Release old pages if we reached the end of oldpages[] or
+		   it is the last page or we are about to exit the loop */
+		if ((pages_to_put == oldpagesnum) || (i == pages - 1) || ret) {
+			tce_flush(tbl);
+
+			/* Release pages after removing them from TCE table */
+			while (pages_to_put) {
+				--pages_to_put;
+				put_page(oldpages[pages_to_put]);
+			}
+		}
+	}
+
+	spin_unlock(&(pool->lock));
+	kfree(oldpages);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_put_tces);
+#endif /* CONFIG_IOMMU_API */
diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index 05205cf..660dcc6 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -20,6 +20,7 @@
 #include <linux/irq.h>
 #include <linux/io.h>
 #include <linux/msi.h>
+#include <linux/iommu.h>
 
 #include <asm/sections.h>
 #include <asm/io.h>
@@ -613,3 +614,137 @@ void __init pnv_pci_init(void)
 	ppc_md.teardown_msi_irqs = pnv_teardown_msi_irqs;
 #endif
 }
+
+#ifdef CONFIG_IOMMU_API
+/*
+ * IOMMU groups support required by VFIO
+ */
+static int add_device(struct device *dev)
+{
+	struct iommu_table *tbl;
+	int ret = 0;
+
+	if (WARN_ON(dev->iommu_group)) {
+		printk(KERN_WARNING "tce_vfio: device %s is already in iommu group %d, skipping\n",
+				dev_name(dev),
+				iommu_group_id(dev->iommu_group));
+		return -EBUSY;
+	}
+
+	tbl = get_iommu_table_base(dev);
+	if (!tbl) {
+		pr_debug("tce_vfio: skipping device %s with no tbl\n",
+				dev_name(dev));
+		return 0;
+	}
+
+	pr_debug("tce_vfio: adding %s to iommu group %d\n",
+			dev_name(dev), iommu_group_id(tbl->it_group));
+
+	ret = iommu_group_add_device(tbl->it_group, dev);
+	if (ret < 0)
+		printk(KERN_ERR "tce_vfio: %s has not been added, ret=%d\n",
+				dev_name(dev), ret);
+
+	return ret;
+}
+
+static void del_device(struct device *dev)
+{
+	iommu_group_remove_device(dev);
+}
+
+static int iommu_bus_notifier(struct notifier_block *nb,
+			      unsigned long action, void *data)
+{
+	struct device *dev = data;
+
+	switch (action) {
+	case BUS_NOTIFY_ADD_DEVICE:
+		return add_device(dev);
+	case BUS_NOTIFY_DEL_DEVICE:
+		del_device(dev);
+		return 0;
+	default:
+		return 0;
+	}
+}
+
+static struct notifier_block tce_iommu_bus_nb = {
+	.notifier_call = iommu_bus_notifier,
+};
+
+static void group_release(void *iommu_data)
+{
+	struct iommu_table *tbl = iommu_data;
+	tbl->it_group = NULL;
+}
+
+static int __init tce_iommu_init(void)
+{
+	struct pci_dev *pdev = NULL;
+	struct iommu_table *tbl;
+	struct iommu_group *grp;
+
+	bus_register_notifier(&pci_bus_type, &tce_iommu_bus_nb);
+
+	/* Allocate and initialize IOMMU groups */
+	for_each_pci_dev(pdev) {
+		tbl = get_iommu_table_base(&pdev->dev);
+		if (!tbl)
+			continue;
+
+		/* Skip already initialized */
+		if (tbl->it_group)
+			continue;
+
+		grp = iommu_group_alloc();
+		if (IS_ERR(grp)) {
+			printk(KERN_INFO "tce_vfio: cannot create "
+					"new IOMMU group, ret=%ld\n",
+					PTR_ERR(grp));
+			return PTR_ERR(grp);
+		}
+		tbl->it_group = grp;
+		iommu_group_set_iommudata(grp, tbl, group_release);
+	}
+
+	/* Add PCI devices to VFIO groups */
+	for_each_pci_dev(pdev)
+		add_device(&pdev->dev);
+
+	return 0;
+}
+
+static void __exit tce_iommu_cleanup(void)
+{
+	struct pci_dev *pdev = NULL;
+	struct iommu_table *tbl;
+	struct iommu_group *grp = NULL;
+
+	bus_unregister_notifier(&pci_bus_type, &tce_iommu_bus_nb);
+
+	/* Delete PCI devices from VFIO groups */
+	for_each_pci_dev(pdev)
+		del_device(&pdev->dev);
+
+	/* Release VFIO groups */
+	for_each_pci_dev(pdev) {
+		tbl = get_iommu_table_base(&pdev->dev);
+		if (!tbl)
+			continue;
+		grp = tbl->it_group;
+
+		/* Skip (already) uninitialized */
+		if (!grp)
+			continue;
+
+		/* Do actual release, group_release() is expected to work */
+		iommu_group_put(grp);
+		BUG_ON(tbl->it_group);
+	}
+}
+
+module_init(tce_iommu_init);
+module_exit(tce_iommu_cleanup);
+#endif /* CONFIG_IOMMU_API */
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index 9f69b56..29d11dc 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -187,4 +187,12 @@ config EXYNOS_IOMMU_DEBUG
 
 	  Say N unless you need kernel log message for IOMMU debugging
 
+config SPAPR_TCE_IOMMU
+	bool "sPAPR TCE IOMMU Support"
+	depends on PPC_POWERNV
+	select IOMMU_API
+	help
+	  Enables the bits of the IOMMU API required by VFIO. The iommu_ops
+	  callbacks are not implemented yet.
+
 endif # IOMMU_SUPPORT
-- 
1.7.10.4


^ permalink raw reply related	[flat|nested] 122+ messages in thread

* [PATCH 2/2] vfio powerpc: enabled on powernv platform
@ 2012-11-23  9:03         ` Alexey Kardashevskiy
  0 siblings, 0 replies; 122+ messages in thread
From: Alexey Kardashevskiy @ 2012-11-23  9:03 UTC (permalink / raw)
  To: Alex Williamson
  Cc: kvm, Alexey Kardashevskiy, linux-kernel, Paul Mackerras,
	linuxppc-dev, David Gibson

This patch initializes IOMMU groups based on the IOMMU
configuration discovered during the PCI scan on POWERNV
(POWER non virtualized) platform. The IOMMU groups are
to be used later by VFIO driver (PCI pass through).

It also implements an API for mapping/unmapping pages for
guest PCI drivers and providing DMA window properties.
This API is going to be used later by QEMU-VFIO to handle
h_put_tce hypercalls from the KVM guest.

Although this driver has been tested only on the POWERNV
platform, it should work on any platform which supports
TCE tables.

To enable VFIO on POWER, enable SPAPR_TCE_IOMMU config
option and configure VFIO as required.

Cc: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
 arch/powerpc/include/asm/iommu.h     |    6 ++
 arch/powerpc/kernel/iommu.c          |  141 ++++++++++++++++++++++++++++++++++
 arch/powerpc/platforms/powernv/pci.c |  135 ++++++++++++++++++++++++++++++++
 drivers/iommu/Kconfig                |    8 ++
 4 files changed, 290 insertions(+)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index cbfe678..5ba66cb 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -76,6 +76,9 @@ struct iommu_table {
 	struct iommu_pool large_pool;
 	struct iommu_pool pools[IOMMU_NR_POOLS];
 	unsigned long *it_map;       /* A simple allocation bitmap for now */
+#ifdef CONFIG_IOMMU_API
+	struct iommu_group *it_group;
+#endif
 };
 
 struct scatterlist;
@@ -147,5 +150,8 @@ static inline void iommu_restore(void)
 }
 #endif
 
+extern long iommu_put_tces(struct iommu_table *tbl, unsigned long entry, uint64_t tce,
+		enum dma_data_direction direction, unsigned long pages);
+
 #endif /* __KERNEL__ */
 #endif /* _ASM_IOMMU_H */
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index ff5a6ce..c8dad1f 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -44,6 +44,7 @@
 #include <asm/kdump.h>
 #include <asm/fadump.h>
 #include <asm/vio.h>
+#include <asm/tce.h>
 
 #define DBG(...)
 
@@ -856,3 +857,143 @@ void iommu_free_coherent(struct iommu_table *tbl, size_t size,
 		free_pages((unsigned long)vaddr, get_order(size));
 	}
 }
+
+#ifdef CONFIG_IOMMU_API
+/*
+ * SPAPR TCE API
+ */
+static struct page *free_tce(struct iommu_table *tbl, unsigned long entry)
+{
+	struct page *page;
+	unsigned long oldtce;
+
+	oldtce = ppc_md.tce_get(tbl, entry);
+
+	if (!(oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)))
+		return NULL;
+
+	page = pfn_to_page(oldtce >> PAGE_SHIFT);
+
+	WARN_ON(!page);
+	if (page && (oldtce & TCE_PCI_WRITE))
+		SetPageDirty(page);
+	ppc_md.tce_free(tbl, entry, 1);
+
+	return page;
+}
+
+static int put_tce(struct iommu_table *tbl, unsigned long entry,
+		uint64_t tce, enum dma_data_direction direction)
+{
+	int ret;
+	struct page *page = NULL;
+	unsigned long kva, offset;
+
+	/* Map new TCE */
+	offset = (tce & IOMMU_PAGE_MASK) - (tce & PAGE_MASK);
+	ret = get_user_pages_fast(tce & PAGE_MASK, 1,
+			direction != DMA_TO_DEVICE, &page);
+	if (ret < 1) {
+		printk(KERN_ERR "tce_vfio: get_user_pages_fast failed tce=%llx ioba=%lx ret=%d\n",
+				tce, entry << IOMMU_PAGE_SHIFT, ret);
+		if (!ret)
+			ret = -EFAULT;
+		return ret;
+	}
+
+	kva = (unsigned long) page_address(page);
+	kva += offset;
+
+	/* tce_build receives a virtual address */
+	entry += tbl->it_offset; /* Offset into real TCE table */
+	ret = ppc_md.tce_build(tbl, entry, 1, kva, direction, NULL);
+
+	/* tce_build() only returns non-zero for transient errors */
+	if (unlikely(ret)) {
+		printk(KERN_ERR "tce_vfio: tce_put failed on tce=%llx ioba=%lx kva=%lx ret=%d\n",
+				tce, entry << IOMMU_PAGE_SHIFT, kva, ret);
+		put_page(page);
+		return -EIO;
+	}
+
+	return 0;
+}
+
+static void tce_flush(struct iommu_table *tbl)
+{
+	/* Flush/invalidate TLB caches if necessary */
+	if (ppc_md.tce_flush)
+		ppc_md.tce_flush(tbl);
+
+	/* Make sure updates are seen by hardware */
+	mb();
+}
+
+long iommu_put_tces(struct iommu_table *tbl, unsigned long entry, uint64_t tce,
+		enum dma_data_direction direction, unsigned long pages)
+{
+	int i, ret = 0, pages_to_put = 0;
+	struct page *page;
+	struct iommu_pool *pool = get_pool(tbl, entry);
+	struct page **oldpages;
+	const int oldpagesnum = PAGE_SIZE/sizeof(*oldpages);
+
+	BUILD_BUG_ON(PAGE_SIZE < IOMMU_PAGE_SIZE);
+
+	/* Handle a single page request without allocation
+	   of pages-to-release array */
+	if (pages == 1) {
+		spin_lock(&(pool->lock));
+		page = free_tce(tbl, entry);
+
+		if (direction != DMA_NONE)
+			ret = put_tce(tbl, entry, tce, direction);
+
+		tce_flush(tbl);
+
+		if (page)
+			put_page(page);
+
+		spin_unlock(&(pool->lock));
+		return ret;
+	}
+
+	/* Releasing multiple pages */
+	/* Allocate an array for pages to be released after TCE table
+	   is updated */
+	oldpages = kmalloc(PAGE_SIZE, GFP_KERNEL);
+	if (!oldpages)
+		return -ENOMEM;
+
+	spin_lock(&(pool->lock));
+
+	for (i = 0; (i < pages) && !ret; ++i, ++entry, tce += IOMMU_PAGE_SIZE) {
+		page = free_tce(tbl, entry);
+		if (page) {
+			oldpages[pages_to_put] = page;
+			++pages_to_put;
+		}
+
+		if (direction != DMA_NONE)
+			ret = put_tce(tbl, entry, tce, direction);
+
+		/* Release old pages if we reached the end of oldpages[] or
+		   it is the last page or we are about to exit the loop */
+		if ((pages_to_put == oldpagesnum) || (i == pages - 1) || ret) {
+			tce_flush(tbl);
+
+			/* Release pages after removing them from TCE table */
+			while (pages_to_put) {
+				--pages_to_put;
+				put_page(oldpages[pages_to_put]);
+			}
+		}
+	}
+
+	spin_unlock(&(pool->lock));
+	kfree(oldpages);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_put_tces);
+#endif /* CONFIG_IOMMU_API */
diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index 05205cf..660dcc6 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -20,6 +20,7 @@
 #include <linux/irq.h>
 #include <linux/io.h>
 #include <linux/msi.h>
+#include <linux/iommu.h>
 
 #include <asm/sections.h>
 #include <asm/io.h>
@@ -613,3 +614,137 @@ void __init pnv_pci_init(void)
 	ppc_md.teardown_msi_irqs = pnv_teardown_msi_irqs;
 #endif
 }
+
+#ifdef CONFIG_IOMMU_API
+/*
+ * IOMMU groups support required by VFIO
+ */
+static int add_device(struct device *dev)
+{
+	struct iommu_table *tbl;
+	int ret = 0;
+
+	if (WARN_ON(dev->iommu_group)) {
+		printk(KERN_WARNING "tce_vfio: device %s is already in iommu group %d, skipping\n",
+				dev_name(dev),
+				iommu_group_id(dev->iommu_group));
+		return -EBUSY;
+	}
+
+	tbl = get_iommu_table_base(dev);
+	if (!tbl) {
+		pr_debug("tce_vfio: skipping device %s with no tbl\n",
+				dev_name(dev));
+		return 0;
+	}
+
+	pr_debug("tce_vfio: adding %s to iommu group %d\n",
+			dev_name(dev), iommu_group_id(tbl->it_group));
+
+	ret = iommu_group_add_device(tbl->it_group, dev);
+	if (ret < 0)
+		printk(KERN_ERR "tce_vfio: %s has not been added, ret=%d\n",
+				dev_name(dev), ret);
+
+	return ret;
+}
+
+static void del_device(struct device *dev)
+{
+	iommu_group_remove_device(dev);
+}
+
+static int iommu_bus_notifier(struct notifier_block *nb,
+			      unsigned long action, void *data)
+{
+	struct device *dev = data;
+
+	switch (action) {
+	case BUS_NOTIFY_ADD_DEVICE:
+		return add_device(dev);
+	case BUS_NOTIFY_DEL_DEVICE:
+		del_device(dev);
+		return 0;
+	default:
+		return 0;
+	}
+}
+
+static struct notifier_block tce_iommu_bus_nb = {
+	.notifier_call = iommu_bus_notifier,
+};
+
+static void group_release(void *iommu_data)
+{
+	struct iommu_table *tbl = iommu_data;
+	tbl->it_group = NULL;
+}
+
+static int __init tce_iommu_init(void)
+{
+	struct pci_dev *pdev = NULL;
+	struct iommu_table *tbl;
+	struct iommu_group *grp;
+
+	bus_register_notifier(&pci_bus_type, &tce_iommu_bus_nb);
+
+	/* Allocate and initialize IOMMU groups */
+	for_each_pci_dev(pdev) {
+		tbl = get_iommu_table_base(&pdev->dev);
+		if (!tbl)
+			continue;
+
+		/* Skip already initialized */
+		if (tbl->it_group)
+			continue;
+
+		grp = iommu_group_alloc();
+		if (IS_ERR(grp)) {
+			printk(KERN_INFO "tce_vfio: cannot create "
+					"new IOMMU group, ret=%ld\n",
+					PTR_ERR(grp));
+			return PTR_ERR(grp);
+		}
+		tbl->it_group = grp;
+		iommu_group_set_iommudata(grp, tbl, group_release);
+	}
+
+	/* Add PCI devices to VFIO groups */
+	for_each_pci_dev(pdev)
+		add_device(&pdev->dev);
+
+	return 0;
+}
+
+static void __exit tce_iommu_cleanup(void)
+{
+	struct pci_dev *pdev = NULL;
+	struct iommu_table *tbl;
+	struct iommu_group *grp = NULL;
+
+	bus_unregister_notifier(&pci_bus_type, &tce_iommu_bus_nb);
+
+	/* Delete PCI devices from VFIO groups */
+	for_each_pci_dev(pdev)
+		del_device(&pdev->dev);
+
+	/* Release VFIO groups */
+	for_each_pci_dev(pdev) {
+		tbl = get_iommu_table_base(&pdev->dev);
+		if (!tbl)
+			continue;
+		grp = tbl->it_group;
+
+		/* Skip (already) uninitialized */
+		if (!grp)
+			continue;
+
+		/* Do actual release, group_release() is expected to work */
+		iommu_group_put(grp);
+		BUG_ON(tbl->it_group);
+	}
+}
+
+module_init(tce_iommu_init);
+module_exit(tce_iommu_cleanup);
+#endif /* CONFIG_IOMMU_API */
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index 9f69b56..29d11dc 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -187,4 +187,12 @@ config EXYNOS_IOMMU_DEBUG
 
 	  Say N unless you need kernel log message for IOMMU debugging
 
+config SPAPR_TCE_IOMMU
+	bool "sPAPR TCE IOMMU Support"
+	depends on PPC_POWERNV
+	select IOMMU_API
+	help
+	  Enables the bits of the IOMMU API required by VFIO. The iommu_ops
+	  callbacks are not implemented yet.
+
 endif # IOMMU_SUPPORT
-- 
1.7.10.4
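
For reference, enabling this on a powernv kernel amounts to a .config
fragment along these lines (CONFIG_SPAPR_TCE_IOMMU is the option added by
this patch; the VFIO options are the existing upstream ones):

CONFIG_IOMMU_SUPPORT=y
CONFIG_SPAPR_TCE_IOMMU=y
CONFIG_VFIO=y
CONFIG_VFIO_PCI=y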

^ permalink raw reply related	[flat|nested] 122+ messages in thread

* Re: [PATCH] vfio powerpc: enabled and supported on powernv platform
  2012-11-22 11:56       ` Sethi Varun-B16395
@ 2012-11-26 15:08         ` Alex Williamson
  -1 siblings, 0 replies; 122+ messages in thread
From: Alex Williamson @ 2012-11-26 15:08 UTC (permalink / raw)
  To: Sethi Varun-B16395
  Cc: Alexey Kardashevskiy, Benjamin Herrenschmidt, Paul Mackerras,
	linuxppc-dev, linux-kernel, kvm, David Gibson

On Thu, 2012-11-22 at 11:56 +0000, Sethi Varun-B16395 wrote:
> 
> > -----Original Message-----
> > From: linux-kernel-owner@vger.kernel.org [mailto:linux-kernel-
> > owner@vger.kernel.org] On Behalf Of Alex Williamson
> > Sent: Tuesday, November 20, 2012 11:50 PM
> > To: Alexey Kardashevskiy
> > Cc: Benjamin Herrenschmidt; Paul Mackerras; linuxppc-
> > dev@lists.ozlabs.org; linux-kernel@vger.kernel.org; kvm@vger.kernel.org;
> > David Gibson
> > Subject: Re: [PATCH] vfio powerpc: enabled and supported on powernv
> > platform
> > 
> > On Tue, 2012-11-20 at 11:48 +1100, Alexey Kardashevskiy wrote:
> > > VFIO implements platform independent stuff such as a PCI driver, BAR
> > > access (via read/write on a file descriptor or direct mapping when
> > > possible) and IRQ signaling.
> > > The platform dependent part includes IOMMU initialization and
> > > handling.
> > >
> > > This patch initializes IOMMU groups based on the IOMMU configuration
> > > discovered during the PCI scan, only POWERNV platform is supported at
> > > the moment.
> > >
> > > Also the patch implements an VFIO-IOMMU driver which manages DMA
> > > mapping/unmapping requests coming from the client (now QEMU). It also
> > > returns a DMA window information to let the guest initialize the
> > > device tree for a guest OS properly. Although this driver has been
> > > tested only on POWERNV, it should work on any platform supporting TCE
> > > tables.
> > >
> > > To enable VFIO on POWER, enable SPAPR_TCE_IOMMU config option.
> > >
> > > Cc: David Gibson <david@gibson.dropbear.id.au>
> > > Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> > > ---
> > >  arch/powerpc/include/asm/iommu.h     |    6 +
> > >  arch/powerpc/kernel/iommu.c          |  140 +++++++++++++++++++
> > >  arch/powerpc/platforms/powernv/pci.c |  135 +++++++++++++++++++
> > >  drivers/iommu/Kconfig                |    8 ++
> > >  drivers/vfio/Kconfig                 |    6 +
> > >  drivers/vfio/Makefile                |    1 +
> > >  drivers/vfio/vfio_iommu_spapr_tce.c  |  247
> > ++++++++++++++++++++++++++++++++++
> > >  include/linux/vfio.h                 |   20 +++
> > >  8 files changed, 563 insertions(+)
> > >  create mode 100644 drivers/vfio/vfio_iommu_spapr_tce.c
> > >
> > > diff --git a/arch/powerpc/include/asm/iommu.h
> > > b/arch/powerpc/include/asm/iommu.h
> > > index cbfe678..5ba66cb 100644
> > > --- a/arch/powerpc/include/asm/iommu.h
> > > +++ b/arch/powerpc/include/asm/iommu.h
> > > @@ -64,30 +64,33 @@ struct iommu_pool {  }
> > > ____cacheline_aligned_in_smp;
> > >
> > >  struct iommu_table {
> > >  	unsigned long  it_busno;     /* Bus number this table belongs to */
> > >  	unsigned long  it_size;      /* Size of iommu table in entries */
> > >  	unsigned long  it_offset;    /* Offset into global table */
> > >  	unsigned long  it_base;      /* mapped address of tce table */
> > >  	unsigned long  it_index;     /* which iommu table this is */
> > >  	unsigned long  it_type;      /* type: PCI or Virtual Bus */
> > >  	unsigned long  it_blocksize; /* Entries in each block (cacheline)
> > */
> > >  	unsigned long  poolsize;
> > >  	unsigned long  nr_pools;
> > >  	struct iommu_pool large_pool;
> > >  	struct iommu_pool pools[IOMMU_NR_POOLS];
> > >  	unsigned long *it_map;       /* A simple allocation bitmap for now
> > */
> > > +#ifdef CONFIG_IOMMU_API
> > > +	struct iommu_group *it_group;
> > > +#endif
> > >  };
> > >
> > >  struct scatterlist;
> > >
> > >  static inline void set_iommu_table_base(struct device *dev, void
> > > *base)  {
> > >  	dev->archdata.dma_data.iommu_table_base = base;  }
> > >
> > >  static inline void *get_iommu_table_base(struct device *dev)  {
> > >  	return dev->archdata.dma_data.iommu_table_base;
> > >  }
> > >
> > >  /* Frees table for an individual device node */ @@ -135,17 +138,20 @@
> > > static inline void pci_iommu_init(void) { }  extern void
> > > alloc_dart_table(void);  #if defined(CONFIG_PPC64) &&
> > > defined(CONFIG_PM)  static inline void iommu_save(void)  {
> > >  	if (ppc_md.iommu_save)
> > >  		ppc_md.iommu_save();
> > >  }
> > >
> > >  static inline void iommu_restore(void)  {
> > >  	if (ppc_md.iommu_restore)
> > >  		ppc_md.iommu_restore();
> > >  }
> > >  #endif
> > >
> > > +extern long iommu_put_tces(struct iommu_table *tbl, unsigned long
> > entry, uint64_t tce,
> > > +		enum dma_data_direction direction, unsigned long pages);
> > > +
> > >  #endif /* __KERNEL__ */
> > >  #endif /* _ASM_IOMMU_H */
> > > diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
> > > index ff5a6ce..94f614b 100644
> > > --- a/arch/powerpc/kernel/iommu.c
> > > +++ b/arch/powerpc/kernel/iommu.c
> > > @@ -32,30 +32,31 @@
> > >  #include <linux/dma-mapping.h>
> > >  #include <linux/bitmap.h>
> > >  #include <linux/iommu-helper.h>
> > >  #include <linux/crash_dump.h>
> > >  #include <linux/hash.h>
> > >  #include <linux/fault-inject.h>
> > >  #include <linux/pci.h>
> > >  #include <asm/io.h>
> > >  #include <asm/prom.h>
> > >  #include <asm/iommu.h>
> > >  #include <asm/pci-bridge.h>
> > >  #include <asm/machdep.h>
> > >  #include <asm/kdump.h>
> > >  #include <asm/fadump.h>
> > >  #include <asm/vio.h>
> > > +#include <asm/tce.h>
> > >
> > >  #define DBG(...)
> > >
> > >  static int novmerge;
> > >
> > >  static void __iommu_free(struct iommu_table *, dma_addr_t, unsigned
> > > int);
> > >
> > >  static int __init setup_iommu(char *str)  {
> > >  	if (!strcmp(str, "novmerge"))
> > >  		novmerge = 1;
> > >  	else if (!strcmp(str, "vmerge"))
> > >  		novmerge = 0;
> > >  	return 1;
> > >  }
> > > @@ -844,15 +845,154 @@ void *iommu_alloc_coherent(struct device *dev,
> > > struct iommu_table *tbl,  }
> > >
> > >  void iommu_free_coherent(struct iommu_table *tbl, size_t size,
> > >  			 void *vaddr, dma_addr_t dma_handle)  {
> > >  	if (tbl) {
> > >  		unsigned int nio_pages;
> > >
> > >  		size = PAGE_ALIGN(size);
> > >  		nio_pages = size >> IOMMU_PAGE_SHIFT;
> > >  		iommu_free(tbl, dma_handle, nio_pages);
> > >  		size = PAGE_ALIGN(size);
> > >  		free_pages((unsigned long)vaddr, get_order(size));
> > >  	}
> > >  }
> > > +
> > > +#ifdef CONFIG_IOMMU_API
> > > +/*
> > > + * SPAPR TCE API
> > > + */
> > > +static struct page *free_tce(struct iommu_table *tbl, unsigned long
> > > +entry) {
> > > +	struct page *page = NULL;
> > 
> > NULL initialization doesn't appear to be necessary
> > 
> > > +	unsigned long oldtce;
> > > +
> > > +	oldtce = ppc_md.tce_get(tbl, entry);
> > > +
> > > +	if (!(oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)))
> > > +		return NULL;
> > > +
> > > +	page = pfn_to_page(oldtce >> PAGE_SHIFT);
> > > +
> > > +	WARN_ON(!page);
> > > +	if (page && (oldtce & TCE_PCI_WRITE))
> > > +		SetPageDirty(page);
> > > +	ppc_md.tce_free(tbl, entry, 1);
> > > +
> > > +	return page;
> > > +}
> > > +
> > > +static int put_tce(struct iommu_table *tbl, unsigned long entry,
> > > +		uint64_t tce, enum dma_data_direction direction) {
> > > +	int ret;
> > > +	struct page *page = NULL;
> > > +	unsigned long kva, offset;
> > > +
> > > +	/* Map new TCE */
> > > +	offset = (tce & IOMMU_PAGE_MASK) - (tce & PAGE_MASK);
> > > +	ret = get_user_pages_fast(tce & PAGE_MASK, 1,
> > > +			direction != DMA_TO_DEVICE, &page);
> > > +	if (ret < 1) {
> > > +		printk(KERN_ERR "tce_vfio: get_user_pages_fast failed
> > tce=%llx ioba=%lx ret=%d\n",
> > > +				tce, entry << IOMMU_PAGE_SHIFT, ret);
> > > +		if (!ret)
> > > +			ret = -EFAULT;
> > 
> > Missing return ret?  Otherwise we've got some bogus uses of page below
> > and we're setting ret for no reason here.
> > 
> > > +	}
> > > +
> > > +	kva = (unsigned long) page_address(page);
> > > +	kva += offset;
> > > +
> > > +	/* tce_build receives a virtual address */
> > > +	entry += tbl->it_offset; /* Offset into real TCE table */
> > > +	ret = ppc_md.tce_build(tbl, entry, 1, kva, direction, NULL);
> > > +
> > > +	/* tce_build() only returns non-zero for transient errors */
> > > +	if (unlikely(ret)) {
> > > +		printk(KERN_ERR "tce_vfio: tce_put failed on tce=%llx
> > ioba=%lx kva=%lx ret=%d\n",
> > > +				tce, entry << IOMMU_PAGE_SHIFT, kva, ret);
> > > +		put_page(page);
> > > +		return -EIO;
> > > +	}
> > > +
> > > +	return 0;
> > > +}
> > > +
> > > +static void tce_flush(struct iommu_table *tbl) {
> > > +	/* Flush/invalidate TLB caches if necessary */
> > > +	if (ppc_md.tce_flush)
> > > +		ppc_md.tce_flush(tbl);
> > > +
> > > +	/* Make sure updates are seen by hardware */
> > > +	mb();
> > > +}
> > > +
> > > +long iommu_put_tces(struct iommu_table *tbl, unsigned long entry,
> > uint64_t tce,
> > > +		enum dma_data_direction direction, unsigned long pages) {
> > > +	int i, ret = 0, pages_to_put = 0;
> > > +	struct page *page;
> > > +	struct iommu_pool *pool = get_pool(tbl, entry);
> > > +	struct page **oldpages;
> > > +	const int oldpagesnum = PAGE_SIZE/sizeof(*oldpages);
> > > +
> > > +	BUILD_BUG_ON(PAGE_SIZE < IOMMU_PAGE_SIZE);
> > > +
> > > +	/* Handle a single page request without allocation
> > > +	   of pages-to-release array */
> > > +	if (pages == 1) {
> > > +		spin_lock(&(pool->lock));
> > > +		page = free_tce(tbl, entry);
> > > +
> > > +		if (direction != DMA_NONE)
> > > +			ret = put_tce(tbl, entry, tce, direction);
> > > +
> > > +		tce_flush(tbl);
> > > +
> > > +		if (page)
> > > +			put_page(page);
> > > +
> > > +		spin_unlock(&(pool->lock));
> > > +		return ret;
> > > +	}
> > > +
> > > +	/* Releasing multiple pages */
> > > +	/* Allocate an array for pages to be released after TCE table
> > > +	   is updated */
> > > +	oldpages = kmalloc(PAGE_SIZE, GFP_KERNEL);
> > > +	if (!oldpages)
> > > +		return -ENOMEM;
> > > +
> > > +	spin_lock(&(pool->lock));
> > > +
> > > +	for (i = 0; (i < pages) && !ret; ++i, ++entry, tce +=
> > IOMMU_PAGE_SIZE) {
> > > +		page = free_tce(tbl, entry);
> > > +		if (page) {
> > > +			oldpages[pages_to_put] = page;
> > > +			++pages_to_put;
> > > +		}
> > > +
> > > +		if (direction != DMA_NONE)
> > > +			ret = put_tce(tbl, entry, tce, direction);
> > > +
> > > +		/* Release old pages if we reached the end of oldpages[] or
> > > +		   it is the last page or we are about to exit the loop */
> > > +		if ((pages_to_put == oldpagesnum) || (i == pages - 1) || ret)
> > {
> > > +			tce_flush(tbl);
> > 
> > Avoiding tce_flush() is the reason for all this extra overhead, right?
> > I wonder if it'd be cleaner separating map vs unmap, where the map case
> > can avoid the oldpages array... but that means inserting new mappings on
> > top of old ones wouldn't put the pages.
> > 
> > > +
> > > +			/* Release pages after removing them from TCE table */
> > > +			while (pages_to_put) {
> > > +				--pages_to_put;
> > > +				put_page(oldpages[pages_to_put]);
> > > +			}
> > > +		}
> > > +	}
> > > +
> > > +	spin_unlock(&(pool->lock));
> > > +	kfree(oldpages);
> > > +
> > > +	return ret;
> > > +}
> > > +EXPORT_SYMBOL_GPL(iommu_put_tces);
> > > +#endif /* CONFIG_IOMMU_API */
> > > diff --git a/arch/powerpc/platforms/powernv/pci.c
> > > b/arch/powerpc/platforms/powernv/pci.c
> > > index 05205cf..676f4d9 100644
> > > --- a/arch/powerpc/platforms/powernv/pci.c
> > > +++ b/arch/powerpc/platforms/powernv/pci.c
> > > @@ -8,30 +8,31 @@
> > >   * This program is free software; you can redistribute it and/or
> > >   * modify it under the terms of the GNU General Public License
> > >   * as published by the Free Software Foundation; either version
> > >   * 2 of the License, or (at your option) any later version.
> > >   */
> > >
> > >  #include <linux/kernel.h>
> > >  #include <linux/pci.h>
> > >  #include <linux/delay.h>
> > >  #include <linux/string.h>
> > >  #include <linux/init.h>
> > >  #include <linux/bootmem.h>
> > >  #include <linux/irq.h>
> > >  #include <linux/io.h>
> > >  #include <linux/msi.h>
> > > +#include <linux/iommu.h>
> > >
> > >  #include <asm/sections.h>
> > >  #include <asm/io.h>
> > >  #include <asm/prom.h>
> > >  #include <asm/pci-bridge.h>
> > >  #include <asm/machdep.h>
> > >  #include <asm/ppc-pci.h>
> > >  #include <asm/opal.h>
> > >  #include <asm/iommu.h>
> > >  #include <asm/tce.h>
> > >  #include <asm/abs_addr.h>
> > >  #include <asm/firmware.h>
> > >
> > >  #include "powernv.h"
> > >  #include "pci.h"
> > > @@ -601,15 +602,149 @@ void __init pnv_pci_init(void)
> > >  	/* Configure IOMMU DMA hooks */
> > >  	ppc_md.pci_dma_dev_setup = pnv_pci_dma_dev_setup;
> > >  	ppc_md.tce_build = pnv_tce_build;
> > >  	ppc_md.tce_free = pnv_tce_free;
> > >  	ppc_md.tce_get = pnv_tce_get;
> > >  	ppc_md.pci_probe_mode = pnv_pci_probe_mode;
> > >  	set_pci_dma_ops(&dma_iommu_ops);
> > >
> > >  	/* Configure MSIs */
> > >  #ifdef CONFIG_PCI_MSI
> > >  	ppc_md.msi_check_device = pnv_msi_check_device;
> > >  	ppc_md.setup_msi_irqs = pnv_setup_msi_irqs;
> > >  	ppc_md.teardown_msi_irqs = pnv_teardown_msi_irqs;  #endif  }
> > > +
> > > +#ifdef CONFIG_IOMMU_API
> > > +/*
> > > + * IOMMU groups support required by VFIO  */ static int
> > > +add_device(struct device *dev) {
> > > +	struct iommu_table *tbl;
> > > +	int ret = 0;
> > > +
> > > +	if (WARN_ON(dev->iommu_group)) {
> > > +		printk(KERN_WARNING "tce_vfio: device %s is already in iommu
> > group %d, skipping\n",
> > > +				dev->kobj.name,
> > 
> > dev_name(dev)
> > 
> > > +				iommu_group_id(dev->iommu_group));
> > > +		return -EBUSY;
> > > +	}
> > > +
> > > +	tbl = get_iommu_table_base(dev);
> > > +	if (!tbl) {
> > > +		pr_debug("tce_vfio: skipping device %s with no tbl\n",
> > > +				dev->kobj.name);
> > > +		return 0;
> > > +	}
> > > +
> > > +	pr_debug("tce_vfio: adding %s to iommu group %d\n",
> > > +			dev->kobj.name, iommu_group_id(tbl->it_group));
> > > +
> > > +	ret = iommu_group_add_device(tbl->it_group, dev);
> > > +	if (ret < 0)
> > > +		printk(KERN_ERR "tce_vfio: %s has not been added, ret=%d\n",
> > > +				dev->kobj.name, ret);
> > > +
> > > +	return ret;
> > > +}
> > > +
> > > +static void del_device(struct device *dev) {
> > > +	iommu_group_remove_device(dev);
> > > +}
> > > +
> > > +static int iommu_bus_notifier(struct notifier_block *nb,
> > > +			      unsigned long action, void *data) {
> > > +	struct device *dev = data;
> > > +
> > > +	switch (action) {
> > > +	case BUS_NOTIFY_ADD_DEVICE:
> > > +		return add_device(dev);
> > > +	case BUS_NOTIFY_DEL_DEVICE:
> > > +		del_device(dev);
> > > +		return 0;
> > > +	default:
> > > +		return 0;
> > > +	}
> > > +}
> > > +
> > > +static struct notifier_block tce_iommu_bus_nb = {
> > > +	.notifier_call = iommu_bus_notifier, };
> > > +
> > > +static void group_release(void *iommu_data) {
> > > +	struct iommu_table *tbl = iommu_data;
> > > +	tbl->it_group = NULL;
> > > +}
> > > +
> > > +static int __init tce_iommu_init(void) {
> > > +	struct pci_dev *pdev = NULL;
> > > +	struct iommu_table *tbl;
> > > +	struct iommu_group *grp;
> > > +
> > > +	bus_register_notifier(&pci_bus_type, &tce_iommu_bus_nb);
> > 
> > There's already a notifier in the iommu code if you were to register an
> > iommu_ops with the add/remove_device entries.  That would allow you to
> > remove the notifier block and notifier function below and the second loop
> > below.  Are you avoiding that to avoid the rest of iommu_ops?
> > 
> [Sethi Varun-B16395] Could be one reason; they are also associating
> the iommu group with the tce table and not with the device.

That's fine; the tce table is the level at which they claim to have isolation.

> 
> > Also, shouldn't this notifier only be registered after the first loop
> > below?  Otherwise ADD_DEVICE could race with setting up groups, which we
> > assume are present in the add_device() above.
> [Sethi Varun-B16395] Isn't this similar to how the notifier is
> registered in iommu_bus_init? First a notifier is registered and then
> we check for devices that have already been probed.

It's not quite the same, because the existing notifier callback that adds
devices also creates groups as necessary.  My point here was that we
register a notifier that assumes a group exists prior to setting up the
groups.  In the existing code the order doesn't matter so much because the
system isn't susceptible to hot device adds at that point.  That's likely
the case here too, but registering a notifier before setting up the data
the callback references seems unnecessary.
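
Roughly, I'd expect the init path to create the groups first and only then
register the notifier; an untested sketch, reusing the helpers from your
patch:

static int __init tce_iommu_init(void)
{
	struct pci_dev *pdev = NULL;

	/* Create the groups first so the notifier can rely on them */
	for_each_pci_dev(pdev) {
		struct iommu_table *tbl = get_iommu_table_base(&pdev->dev);
		struct iommu_group *grp;

		if (!tbl || tbl->it_group)
			continue;

		grp = iommu_group_alloc();
		if (IS_ERR(grp))
			return PTR_ERR(grp);
		tbl->it_group = grp;
		iommu_group_set_iommudata(grp, tbl, group_release);
	}

	/* Only now can a hot-added device find its group in add_device() */
	bus_register_notifier(&pci_bus_type, &tce_iommu_bus_nb);

	/* Add the devices that were already probed */
	for_each_pci_dev(pdev)
		add_device(&pdev->dev);

	return 0;
}

Thanks,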

Alex


^ permalink raw reply	[flat|nested] 122+ messages in thread

* Re: [PATCH] vfio powerpc: enabled and supported on powernv platform
  2012-11-23  2:02         ` Alexey Kardashevskiy
@ 2012-11-26 15:18           ` Alex Williamson
  -1 siblings, 0 replies; 122+ messages in thread
From: Alex Williamson @ 2012-11-26 15:18 UTC (permalink / raw)
  To: Alexey Kardashevskiy
  Cc: Sethi Varun-B16395, Benjamin Herrenschmidt, Paul Mackerras,
	linuxppc-dev, linux-kernel, kvm, David Gibson

On Fri, 2012-11-23 at 13:02 +1100, Alexey Kardashevskiy wrote:
> On 22/11/12 22:56, Sethi Varun-B16395 wrote:
> >
> >
> >> -----Original Message-----
> >> From: linux-kernel-owner@vger.kernel.org [mailto:linux-kernel-
> >> owner@vger.kernel.org] On Behalf Of Alex Williamson
> >> Sent: Tuesday, November 20, 2012 11:50 PM
> >> To: Alexey Kardashevskiy
> >> Cc: Benjamin Herrenschmidt; Paul Mackerras; linuxppc-
> >> dev@lists.ozlabs.org; linux-kernel@vger.kernel.org; kvm@vger.kernel.org;
> >> David Gibson
> >> Subject: Re: [PATCH] vfio powerpc: enabled and supported on powernv
> >> platform
> >>
> >> On Tue, 2012-11-20 at 11:48 +1100, Alexey Kardashevskiy wrote:
> >>> VFIO implements platform independent stuff such as a PCI driver, BAR
> >>> access (via read/write on a file descriptor or direct mapping when
> >>> possible) and IRQ signaling.
> >>> The platform dependent part includes IOMMU initialization and
> >>> handling.
> >>>
> >>> This patch initializes IOMMU groups based on the IOMMU configuration
> >>> discovered during the PCI scan, only POWERNV platform is supported at
> >>> the moment.
> >>>
> >>> Also the patch implements an VFIO-IOMMU driver which manages DMA
> >>> mapping/unmapping requests coming from the client (now QEMU). It also
> >>> returns a DMA window information to let the guest initialize the
> >>> device tree for a guest OS properly. Although this driver has been
> >>> tested only on POWERNV, it should work on any platform supporting TCE
> >>> tables.
> >>>
> >>> To enable VFIO on POWER, enable SPAPR_TCE_IOMMU config option.
> >>>
> >>> Cc: David Gibson <david@gibson.dropbear.id.au>
> >>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> >>> ---
> >>>   arch/powerpc/include/asm/iommu.h     |    6 +
> >>>   arch/powerpc/kernel/iommu.c          |  140 +++++++++++++++++++
> >>>   arch/powerpc/platforms/powernv/pci.c |  135 +++++++++++++++++++
> >>>   drivers/iommu/Kconfig                |    8 ++
> >>>   drivers/vfio/Kconfig                 |    6 +
> >>>   drivers/vfio/Makefile                |    1 +
> >>>   drivers/vfio/vfio_iommu_spapr_tce.c  |  247
> >> ++++++++++++++++++++++++++++++++++
> >>>   include/linux/vfio.h                 |   20 +++
> >>>   8 files changed, 563 insertions(+)
> >>>   create mode 100644 drivers/vfio/vfio_iommu_spapr_tce.c
> >>>
> >>> diff --git a/arch/powerpc/include/asm/iommu.h
> >>> b/arch/powerpc/include/asm/iommu.h
> >>> index cbfe678..5ba66cb 100644
> >>> --- a/arch/powerpc/include/asm/iommu.h
> >>> +++ b/arch/powerpc/include/asm/iommu.h
> >>> @@ -64,30 +64,33 @@ struct iommu_pool {  }
> >>> ____cacheline_aligned_in_smp;
> >>>
> >>>   struct iommu_table {
> >>>   	unsigned long  it_busno;     /* Bus number this table belongs to */
> >>>   	unsigned long  it_size;      /* Size of iommu table in entries */
> >>>   	unsigned long  it_offset;    /* Offset into global table */
> >>>   	unsigned long  it_base;      /* mapped address of tce table */
> >>>   	unsigned long  it_index;     /* which iommu table this is */
> >>>   	unsigned long  it_type;      /* type: PCI or Virtual Bus */
> >>>   	unsigned long  it_blocksize; /* Entries in each block (cacheline)
> >> */
> >>>   	unsigned long  poolsize;
> >>>   	unsigned long  nr_pools;
> >>>   	struct iommu_pool large_pool;
> >>>   	struct iommu_pool pools[IOMMU_NR_POOLS];
> >>>   	unsigned long *it_map;       /* A simple allocation bitmap for now
> >> */
> >>> +#ifdef CONFIG_IOMMU_API
> >>> +	struct iommu_group *it_group;
> >>> +#endif
> >>>   };
> >>>
> >>>   struct scatterlist;
> >>>
> >>>   static inline void set_iommu_table_base(struct device *dev, void
> >>> *base)  {
> >>>   	dev->archdata.dma_data.iommu_table_base = base;  }
> >>>
> >>>   static inline void *get_iommu_table_base(struct device *dev)  {
> >>>   	return dev->archdata.dma_data.iommu_table_base;
> >>>   }
> >>>
> >>>   /* Frees table for an individual device node */ @@ -135,17 +138,20 @@
> >>> static inline void pci_iommu_init(void) { }  extern void
> >>> alloc_dart_table(void);  #if defined(CONFIG_PPC64) &&
> >>> defined(CONFIG_PM)  static inline void iommu_save(void)  {
> >>>   	if (ppc_md.iommu_save)
> >>>   		ppc_md.iommu_save();
> >>>   }
> >>>
> >>>   static inline void iommu_restore(void)  {
> >>>   	if (ppc_md.iommu_restore)
> >>>   		ppc_md.iommu_restore();
> >>>   }
> >>>   #endif
> >>>
> >>> +extern long iommu_put_tces(struct iommu_table *tbl, unsigned long
> >> entry, uint64_t tce,
> >>> +		enum dma_data_direction direction, unsigned long pages);
> >>> +
> >>>   #endif /* __KERNEL__ */
> >>>   #endif /* _ASM_IOMMU_H */
> >>> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
> >>> index ff5a6ce..94f614b 100644
> >>> --- a/arch/powerpc/kernel/iommu.c
> >>> +++ b/arch/powerpc/kernel/iommu.c
> >>> @@ -32,30 +32,31 @@
> >>>   #include <linux/dma-mapping.h>
> >>>   #include <linux/bitmap.h>
> >>>   #include <linux/iommu-helper.h>
> >>>   #include <linux/crash_dump.h>
> >>>   #include <linux/hash.h>
> >>>   #include <linux/fault-inject.h>
> >>>   #include <linux/pci.h>
> >>>   #include <asm/io.h>
> >>>   #include <asm/prom.h>
> >>>   #include <asm/iommu.h>
> >>>   #include <asm/pci-bridge.h>
> >>>   #include <asm/machdep.h>
> >>>   #include <asm/kdump.h>
> >>>   #include <asm/fadump.h>
> >>>   #include <asm/vio.h>
> >>> +#include <asm/tce.h>
> >>>
> >>>   #define DBG(...)
> >>>
> >>>   static int novmerge;
> >>>
> >>>   static void __iommu_free(struct iommu_table *, dma_addr_t, unsigned
> >>> int);
> >>>
> >>>   static int __init setup_iommu(char *str)  {
> >>>   	if (!strcmp(str, "novmerge"))
> >>>   		novmerge = 1;
> >>>   	else if (!strcmp(str, "vmerge"))
> >>>   		novmerge = 0;
> >>>   	return 1;
> >>>   }
> >>> @@ -844,15 +845,154 @@ void *iommu_alloc_coherent(struct device *dev,
> >>> struct iommu_table *tbl,  }
> >>>
> >>>   void iommu_free_coherent(struct iommu_table *tbl, size_t size,
> >>>   			 void *vaddr, dma_addr_t dma_handle)  {
> >>>   	if (tbl) {
> >>>   		unsigned int nio_pages;
> >>>
> >>>   		size = PAGE_ALIGN(size);
> >>>   		nio_pages = size >> IOMMU_PAGE_SHIFT;
> >>>   		iommu_free(tbl, dma_handle, nio_pages);
> >>>   		size = PAGE_ALIGN(size);
> >>>   		free_pages((unsigned long)vaddr, get_order(size));
> >>>   	}
> >>>   }
> >>> +
> >>> +#ifdef CONFIG_IOMMU_API
> >>> +/*
> >>> + * SPAPR TCE API
> >>> + */
> >>> +static struct page *free_tce(struct iommu_table *tbl, unsigned long
> >>> +entry) {
> >>> +	struct page *page = NULL;
> >>
> >> NULL initialization doesn't appear to be necessary
> >>
> >>> +	unsigned long oldtce;
> >>> +
> >>> +	oldtce = ppc_md.tce_get(tbl, entry);
> >>> +
> >>> +	if (!(oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)))
> >>> +		return NULL;
> >>> +
> >>> +	page = pfn_to_page(oldtce >> PAGE_SHIFT);
> >>> +
> >>> +	WARN_ON(!page);
> >>> +	if (page && (oldtce & TCE_PCI_WRITE))
> >>> +		SetPageDirty(page);
> >>> +	ppc_md.tce_free(tbl, entry, 1);
> >>> +
> >>> +	return page;
> >>> +}
> >>> +
> >>> +static int put_tce(struct iommu_table *tbl, unsigned long entry,
> >>> +		uint64_t tce, enum dma_data_direction direction) {
> >>> +	int ret;
> >>> +	struct page *page = NULL;
> >>> +	unsigned long kva, offset;
> >>> +
> >>> +	/* Map new TCE */
> >>> +	offset = (tce & IOMMU_PAGE_MASK) - (tce & PAGE_MASK);
> >>> +	ret = get_user_pages_fast(tce & PAGE_MASK, 1,
> >>> +			direction != DMA_TO_DEVICE, &page);
> >>> +	if (ret < 1) {
> >>> +		printk(KERN_ERR "tce_vfio: get_user_pages_fast failed
> >> tce=%llx ioba=%lx ret=%d\n",
> >>> +				tce, entry << IOMMU_PAGE_SHIFT, ret);
> >>> +		if (!ret)
> >>> +			ret = -EFAULT;
> >>
> >> Missing return ret?  Otherwise we've got some bogus uses of page below
> >> and we're setting ret for no reason here.
> >>
> >>> +	}
> >>> +
> >>> +	kva = (unsigned long) page_address(page);
> >>> +	kva += offset;
> >>> +
> >>> +	/* tce_build receives a virtual address */
> >>> +	entry += tbl->it_offset; /* Offset into real TCE table */
> >>> +	ret = ppc_md.tce_build(tbl, entry, 1, kva, direction, NULL);
> >>> +
> >>> +	/* tce_build() only returns non-zero for transient errors */
> >>> +	if (unlikely(ret)) {
> >>> +		printk(KERN_ERR "tce_vfio: tce_put failed on tce=%llx
> >> ioba=%lx kva=%lx ret=%d\n",
> >>> +				tce, entry << IOMMU_PAGE_SHIFT, kva, ret);
> >>> +		put_page(page);
> >>> +		return -EIO;
> >>> +	}
> >>> +
> >>> +	return 0;
> >>> +}
> >>> +
> >>> +static void tce_flush(struct iommu_table *tbl) {
> >>> +	/* Flush/invalidate TLB caches if necessary */
> >>> +	if (ppc_md.tce_flush)
> >>> +		ppc_md.tce_flush(tbl);
> >>> +
> >>> +	/* Make sure updates are seen by hardware */
> >>> +	mb();
> >>> +}
> >>> +
> >>> +long iommu_put_tces(struct iommu_table *tbl, unsigned long entry,
> >> uint64_t tce,
> >>> +		enum dma_data_direction direction, unsigned long pages) {
> >>> +	int i, ret = 0, pages_to_put = 0;
> >>> +	struct page *page;
> >>> +	struct iommu_pool *pool = get_pool(tbl, entry);
> >>> +	struct page **oldpages;
> >>> +	const int oldpagesnum = PAGE_SIZE/sizeof(*oldpages);
> >>> +
> >>> +	BUILD_BUG_ON(PAGE_SIZE < IOMMU_PAGE_SIZE);
> >>> +
> >>> +	/* Handle a single page request without allocation
> >>> +	   of pages-to-release array */
> >>> +	if (pages == 1) {
> >>> +		spin_lock(&(pool->lock));
> >>> +		page = free_tce(tbl, entry);
> >>> +
> >>> +		if (direction != DMA_NONE)
> >>> +			ret = put_tce(tbl, entry, tce, direction);
> >>> +
> >>> +		tce_flush(tbl);
> >>> +
> >>> +		if (page)
> >>> +			put_page(page);
> >>> +
> >>> +		spin_unlock(&(pool->lock));
> >>> +		return ret;
> >>> +	}
> >>> +
> >>> +	/* Releasing multiple pages */
> >>> +	/* Allocate an array for pages to be released after TCE table
> >>> +	   is updated */
> >>> +	oldpages = kmalloc(PAGE_SIZE, GFP_KERNEL);
> >>> +	if (!oldpages)
> >>> +		return -ENOMEM;
> >>> +
> >>> +	spin_lock(&(pool->lock));
> >>> +
> >>> +	for (i = 0; (i < pages) && !ret; ++i, ++entry, tce +=
> >> IOMMU_PAGE_SIZE) {
> >>> +		page = free_tce(tbl, entry);
> >>> +		if (page) {
> >>> +			oldpages[pages_to_put] = page;
> >>> +			++pages_to_put;
> >>> +		}
> >>> +
> >>> +		if (direction != DMA_NONE)
> >>> +			ret = put_tce(tbl, entry, tce, direction);
> >>> +
> >>> +		/* Release old pages if we reached the end of oldpages[] or
> >>> +		   it is the last page or we are about to exit the loop */
> >>> +		if ((pages_to_put == oldpagesnum) || (i == pages - 1) || ret)
> >> {
> >>> +			tce_flush(tbl);
> >>
> >> Avoiding tce_flush() is the reason for all this extra overhead, right?
> >> I wonder if it'd be cleaner separating map vs unmap, where the map case
> >> can avoid the oldpages array... but that means inserting new mappings on
> >> top of old ones wouldn't put the pages.
> 
> 
> Yes, we do not want to lose pages if the guest forgot to unmap them.

Hmm, does that mean we're not actively clearing tce entries or somehow
disabling the iommu window when the iommu is released through vfio?
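
I.e., I'd have expected the release path to clear the whole DMA window,
something like this untested sketch (with DMA_NONE, iommu_put_tces() only
frees the old entries and puts their pages; I'm assuming 'entry' is
relative to the start of the window):

/* Sketch: drop every mapping left in the window on VFIO release */
static void tce_clear_window(struct iommu_table *tbl)
{
	/* The tce argument is ignored for DMA_NONE; nothing new is mapped */
	iommu_put_tces(tbl, 0, 0, DMA_NONE, tbl->it_size);
}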

> >>> +
> >>> +			/* Release pages after removing them from TCE table */
> >>> +			while (pages_to_put) {
> >>> +				--pages_to_put;
> >>> +				put_page(oldpages[pages_to_put]);
> >>> +			}
> >>> +		}
> >>> +	}
> >>> +
> >>> +	spin_unlock(&(pool->lock));
> >>> +	kfree(oldpages);
> >>> +
> >>> +	return ret;
> >>> +}
> >>> +EXPORT_SYMBOL_GPL(iommu_put_tces);
> >>> +#endif /* CONFIG_IOMMU_API */
> >>> diff --git a/arch/powerpc/platforms/powernv/pci.c
> >>> b/arch/powerpc/platforms/powernv/pci.c
> >>> index 05205cf..676f4d9 100644
> >>> --- a/arch/powerpc/platforms/powernv/pci.c
> >>> +++ b/arch/powerpc/platforms/powernv/pci.c
> >>> @@ -8,30 +8,31 @@
> >>>    * This program is free software; you can redistribute it and/or
> >>>    * modify it under the terms of the GNU General Public License
> >>>    * as published by the Free Software Foundation; either version
> >>>    * 2 of the License, or (at your option) any later version.
> >>>    */
> >>>
> >>>   #include <linux/kernel.h>
> >>>   #include <linux/pci.h>
> >>>   #include <linux/delay.h>
> >>>   #include <linux/string.h>
> >>>   #include <linux/init.h>
> >>>   #include <linux/bootmem.h>
> >>>   #include <linux/irq.h>
> >>>   #include <linux/io.h>
> >>>   #include <linux/msi.h>
> >>> +#include <linux/iommu.h>
> >>>
> >>>   #include <asm/sections.h>
> >>>   #include <asm/io.h>
> >>>   #include <asm/prom.h>
> >>>   #include <asm/pci-bridge.h>
> >>>   #include <asm/machdep.h>
> >>>   #include <asm/ppc-pci.h>
> >>>   #include <asm/opal.h>
> >>>   #include <asm/iommu.h>
> >>>   #include <asm/tce.h>
> >>>   #include <asm/abs_addr.h>
> >>>   #include <asm/firmware.h>
> >>>
> >>>   #include "powernv.h"
> >>>   #include "pci.h"
> >>> @@ -601,15 +602,149 @@ void __init pnv_pci_init(void)
> >>>   	/* Configure IOMMU DMA hooks */
> >>>   	ppc_md.pci_dma_dev_setup = pnv_pci_dma_dev_setup;
> >>>   	ppc_md.tce_build = pnv_tce_build;
> >>>   	ppc_md.tce_free = pnv_tce_free;
> >>>   	ppc_md.tce_get = pnv_tce_get;
> >>>   	ppc_md.pci_probe_mode = pnv_pci_probe_mode;
> >>>   	set_pci_dma_ops(&dma_iommu_ops);
> >>>
> >>>   	/* Configure MSIs */
> >>>   #ifdef CONFIG_PCI_MSI
> >>>   	ppc_md.msi_check_device = pnv_msi_check_device;
> >>>   	ppc_md.setup_msi_irqs = pnv_setup_msi_irqs;
> >>>   	ppc_md.teardown_msi_irqs = pnv_teardown_msi_irqs;  #endif  }
> >>> +
> >>> +#ifdef CONFIG_IOMMU_API
> >>> +/*
> >>> + * IOMMU groups support required by VFIO  */ static int
> >>> +add_device(struct device *dev) {
> >>> +	struct iommu_table *tbl;
> >>> +	int ret = 0;
> >>> +
> >>> +	if (WARN_ON(dev->iommu_group)) {
> >>> +		printk(KERN_WARNING "tce_vfio: device %s is already in iommu
> >> group %d, skipping\n",
> >>> +				dev->kobj.name,
> >>
> >> dev_name(dev)
> >>
> >>> +				iommu_group_id(dev->iommu_group));
> >>> +		return -EBUSY;
> >>> +	}
> >>> +
> >>> +	tbl = get_iommu_table_base(dev);
> >>> +	if (!tbl) {
> >>> +		pr_debug("tce_vfio: skipping device %s with no tbl\n",
> >>> +				dev->kobj.name);
> >>> +		return 0;
> >>> +	}
> >>> +
> >>> +	pr_debug("tce_vfio: adding %s to iommu group %d\n",
> >>> +			dev->kobj.name, iommu_group_id(tbl->it_group));
> >>> +
> >>> +	ret = iommu_group_add_device(tbl->it_group, dev);
> >>> +	if (ret < 0)
> >>> +		printk(KERN_ERR "tce_vfio: %s has not been added, ret=%d\n",
> >>> +				dev->kobj.name, ret);
> >>> +
> >>> +	return ret;
> >>> +}
> >>> +
> >>> +static void del_device(struct device *dev) {
> >>> +	iommu_group_remove_device(dev);
> >>> +}
> >>> +
> >>> +static int iommu_bus_notifier(struct notifier_block *nb,
> >>> +			      unsigned long action, void *data) {
> >>> +	struct device *dev = data;
> >>> +
> >>> +	switch (action) {
> >>> +	case BUS_NOTIFY_ADD_DEVICE:
> >>> +		return add_device(dev);
> >>> +	case BUS_NOTIFY_DEL_DEVICE:
> >>> +		del_device(dev);
> >>> +		return 0;
> >>> +	default:
> >>> +		return 0;
> >>> +	}
> >>> +}
> >>> +
> >>> +static struct notifier_block tce_iommu_bus_nb = {
> >>> +	.notifier_call = iommu_bus_notifier, };
> >>> +
> >>> +static void group_release(void *iommu_data) {
> >>> +	struct iommu_table *tbl = iommu_data;
> >>> +	tbl->it_group = NULL;
> >>> +}
> >>> +
> >>> +static int __init tce_iommu_init(void) {
> >>> +	struct pci_dev *pdev = NULL;
> >>> +	struct iommu_table *tbl;
> >>> +	struct iommu_group *grp;
> >>> +
> >>> +	bus_register_notifier(&pci_bus_type, &tce_iommu_bus_nb);
> >>
> >> There's already a notifier in the iommu code if you were to register an
> >> iommu_ops with the add/remove_device entries.  That would allow you to
> >> remove the notifier block and notifier function below and the second loop
> >> below.  Are you avoiding that to avoid the rest of iommu_ops?
> 
> Yes. I need to implement either a small part of iommu_ops (especially the
> part which I think should not be there at all) or a notifier; I cannot see
> how the first is simpler.
> 
> 
> > [Sethi Varun-B16395] Could be one reason, also they are associating the iommu group with the tce table entry and not the device.
> 
> Also true.
> I would actually allocate an IOMMU group right after we discover a new
> one, but discovery happens during the PCI scan, which runs before
> subsys_initcall(iommu_init) is called, so I added the first loop.
> 
> 
> >> Also, shouldn't this notifier only be registered after the first loop
> >> below?  Otherwise ADD_DEVICE could race with setting up groups, which we
> >> assume are present in the add_device() above.
> > [Sethi Varun-B16395] Isn't this similar to how the notifier is registered in iommu_bus_init? First a notifier is registered and then we check for devices that have already been probed.
> 
> Yep. I am not very familiar with this stuff, but if it is done one way and
> it is already upstream, I cannot see why I should go another way :)

The existing notifier callback and loop should be able to operate in
parallel... of course they don't, because we're not actively adding new
devices at the point where the notifier is set up.  IIRC, the notifier
callback blindly uses something that is not set up at the point it's
registered.  That's a bit sloppy.  Maybe I'm mis-remembering; I'll verify
in your new version.  Thanks,

Alex




^ permalink raw reply	[flat|nested] 122+ messages in thread

* Re: [PATCH] vfio powerpc: enabled and supported on powernv platform
@ 2012-11-26 15:18           ` Alex Williamson
  0 siblings, 0 replies; 122+ messages in thread
From: Alex Williamson @ 2012-11-26 15:18 UTC (permalink / raw)
  To: Alexey Kardashevskiy
  Cc: kvm, linux-kernel, Paul Mackerras, Sethi Varun-B16395,
	linuxppc-dev, David Gibson

On Fri, 2012-11-23 at 13:02 +1100, Alexey Kardashevskiy wrote:
> On 22/11/12 22:56, Sethi Varun-B16395 wrote:
> >
> >
> >> -----Original Message-----
> >> From: linux-kernel-owner@vger.kernel.org [mailto:linux-kernel-
> >> owner@vger.kernel.org] On Behalf Of Alex Williamson
> >> Sent: Tuesday, November 20, 2012 11:50 PM
> >> To: Alexey Kardashevskiy
> >> Cc: Benjamin Herrenschmidt; Paul Mackerras; linuxppc-
> >> dev@lists.ozlabs.org; linux-kernel@vger.kernel.org; kvm@vger.kernel.org;
> >> David Gibson
> >> Subject: Re: [PATCH] vfio powerpc: enabled and supported on powernv
> >> platform
> >>
> >> On Tue, 2012-11-20 at 11:48 +1100, Alexey Kardashevskiy wrote:
> >>> VFIO implements platform independent stuff such as a PCI driver, BAR
> >>> access (via read/write on a file descriptor or direct mapping when
> >>> possible) and IRQ signaling.
> >>> The platform dependent part includes IOMMU initialization and
> >>> handling.
> >>>
> >>> This patch initializes IOMMU groups based on the IOMMU configuration
> >>> discovered during the PCI scan, only POWERNV platform is supported at
> >>> the moment.
> >>>
> >>> Also the patch implements an VFIO-IOMMU driver which manages DMA
> >>> mapping/unmapping requests coming from the client (now QEMU). It also
> >>> returns a DMA window information to let the guest initialize the
> >>> device tree for a guest OS properly. Although this driver has been
> >>> tested only on POWERNV, it should work on any platform supporting TCE
> >>> tables.
> >>>
> >>> To enable VFIO on POWER, enable SPAPR_TCE_IOMMU config option.
> >>>
> >>> Cc: David Gibson <david@gibson.dropbear.id.au>
> >>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> >>> ---
> >>>   arch/powerpc/include/asm/iommu.h     |    6 +
> >>>   arch/powerpc/kernel/iommu.c          |  140 +++++++++++++++++++
> >>>   arch/powerpc/platforms/powernv/pci.c |  135 +++++++++++++++++++
> >>>   drivers/iommu/Kconfig                |    8 ++
> >>>   drivers/vfio/Kconfig                 |    6 +
> >>>   drivers/vfio/Makefile                |    1 +
> >>>   drivers/vfio/vfio_iommu_spapr_tce.c  |  247
> >> ++++++++++++++++++++++++++++++++++
> >>>   include/linux/vfio.h                 |   20 +++
> >>>   8 files changed, 563 insertions(+)
> >>>   create mode 100644 drivers/vfio/vfio_iommu_spapr_tce.c
> >>>
> >>> diff --git a/arch/powerpc/include/asm/iommu.h
> >>> b/arch/powerpc/include/asm/iommu.h
> >>> index cbfe678..5ba66cb 100644
> >>> --- a/arch/powerpc/include/asm/iommu.h
> >>> +++ b/arch/powerpc/include/asm/iommu.h
> >>> @@ -64,30 +64,33 @@ struct iommu_pool {  }
> >>> ____cacheline_aligned_in_smp;
> >>>
> >>>   struct iommu_table {
> >>>   	unsigned long  it_busno;     /* Bus number this table belongs to */
> >>>   	unsigned long  it_size;      /* Size of iommu table in entries */
> >>>   	unsigned long  it_offset;    /* Offset into global table */
> >>>   	unsigned long  it_base;      /* mapped address of tce table */
> >>>   	unsigned long  it_index;     /* which iommu table this is */
> >>>   	unsigned long  it_type;      /* type: PCI or Virtual Bus */
> >>>   	unsigned long  it_blocksize; /* Entries in each block (cacheline)
> >> */
> >>>   	unsigned long  poolsize;
> >>>   	unsigned long  nr_pools;
> >>>   	struct iommu_pool large_pool;
> >>>   	struct iommu_pool pools[IOMMU_NR_POOLS];
> >>>   	unsigned long *it_map;       /* A simple allocation bitmap for now
> >> */
> >>> +#ifdef CONFIG_IOMMU_API
> >>> +	struct iommu_group *it_group;
> >>> +#endif
> >>>   };
> >>>
> >>>   struct scatterlist;
> >>>
> >>>   static inline void set_iommu_table_base(struct device *dev, void
> >>> *base)  {
> >>>   	dev->archdata.dma_data.iommu_table_base = base;  }
> >>>
> >>>   static inline void *get_iommu_table_base(struct device *dev)  {
> >>>   	return dev->archdata.dma_data.iommu_table_base;
> >>>   }
> >>>
> >>>   /* Frees table for an individual device node */ @@ -135,17 +138,20 @@
> >>> static inline void pci_iommu_init(void) { }  extern void
> >>> alloc_dart_table(void);  #if defined(CONFIG_PPC64) &&
> >>> defined(CONFIG_PM)  static inline void iommu_save(void)  {
> >>>   	if (ppc_md.iommu_save)
> >>>   		ppc_md.iommu_save();
> >>>   }
> >>>
> >>>   static inline void iommu_restore(void)  {
> >>>   	if (ppc_md.iommu_restore)
> >>>   		ppc_md.iommu_restore();
> >>>   }
> >>>   #endif
> >>>
> >>> +extern long iommu_put_tces(struct iommu_table *tbl, unsigned long
> >> entry, uint64_t tce,
> >>> +		enum dma_data_direction direction, unsigned long pages);
> >>> +
> >>>   #endif /* __KERNEL__ */
> >>>   #endif /* _ASM_IOMMU_H */
> >>> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
> >>> index ff5a6ce..94f614b 100644
> >>> --- a/arch/powerpc/kernel/iommu.c
> >>> +++ b/arch/powerpc/kernel/iommu.c
> >>> @@ -32,30 +32,31 @@
> >>>   #include <linux/dma-mapping.h>
> >>>   #include <linux/bitmap.h>
> >>>   #include <linux/iommu-helper.h>
> >>>   #include <linux/crash_dump.h>
> >>>   #include <linux/hash.h>
> >>>   #include <linux/fault-inject.h>
> >>>   #include <linux/pci.h>
> >>>   #include <asm/io.h>
> >>>   #include <asm/prom.h>
> >>>   #include <asm/iommu.h>
> >>>   #include <asm/pci-bridge.h>
> >>>   #include <asm/machdep.h>
> >>>   #include <asm/kdump.h>
> >>>   #include <asm/fadump.h>
> >>>   #include <asm/vio.h>
> >>> +#include <asm/tce.h>
> >>>
> >>>   #define DBG(...)
> >>>
> >>>   static int novmerge;
> >>>
> >>>   static void __iommu_free(struct iommu_table *, dma_addr_t, unsigned
> >>> int);
> >>>
> >>>   static int __init setup_iommu(char *str)  {
> >>>   	if (!strcmp(str, "novmerge"))
> >>>   		novmerge = 1;
> >>>   	else if (!strcmp(str, "vmerge"))
> >>>   		novmerge = 0;
> >>>   	return 1;
> >>>   }
> >>> @@ -844,15 +845,154 @@ void *iommu_alloc_coherent(struct device *dev,
> >>> struct iommu_table *tbl,  }
> >>>
> >>>   void iommu_free_coherent(struct iommu_table *tbl, size_t size,
> >>>   			 void *vaddr, dma_addr_t dma_handle)  {
> >>>   	if (tbl) {
> >>>   		unsigned int nio_pages;
> >>>
> >>>   		size = PAGE_ALIGN(size);
> >>>   		nio_pages = size >> IOMMU_PAGE_SHIFT;
> >>>   		iommu_free(tbl, dma_handle, nio_pages);
> >>>   		size = PAGE_ALIGN(size);
> >>>   		free_pages((unsigned long)vaddr, get_order(size));
> >>>   	}
> >>>   }
> >>> +
> >>> +#ifdef CONFIG_IOMMU_API
> >>> +/*
> >>> + * SPAPR TCE API
> >>> + */
> >>> +static struct page *free_tce(struct iommu_table *tbl, unsigned long
> >>> +entry) {
> >>> +	struct page *page = NULL;
> >>
> >> NULL initialization doesn't appear to be necessary
> >>
> >>> +	unsigned long oldtce;
> >>> +
> >>> +	oldtce = ppc_md.tce_get(tbl, entry);
> >>> +
> >>> +	if (!(oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)))
> >>> +		return NULL;
> >>> +
> >>> +	page = pfn_to_page(oldtce >> PAGE_SHIFT);
> >>> +
> >>> +	WARN_ON(!page);
> >>> +	if (page && (oldtce & TCE_PCI_WRITE))
> >>> +		SetPageDirty(page);
> >>> +	ppc_md.tce_free(tbl, entry, 1);
> >>> +
> >>> +	return page;
> >>> +}
> >>> +
> >>> +static int put_tce(struct iommu_table *tbl, unsigned long entry,
> >>> +		uint64_t tce, enum dma_data_direction direction) {
> >>> +	int ret;
> >>> +	struct page *page = NULL;
> >>> +	unsigned long kva, offset;
> >>> +
> >>> +	/* Map new TCE */
> >>> +	offset = (tce & IOMMU_PAGE_MASK) - (tce & PAGE_MASK);
> >>> +	ret = get_user_pages_fast(tce & PAGE_MASK, 1,
> >>> +			direction != DMA_TO_DEVICE, &page);
> >>> +	if (ret < 1) {
> >>> +		printk(KERN_ERR "tce_vfio: get_user_pages_fast failed
> >> tce=%llx ioba=%lx ret=%d\n",
> >>> +				tce, entry << IOMMU_PAGE_SHIFT, ret);
> >>> +		if (!ret)
> >>> +			ret = -EFAULT;
> >>
> >> Missing return ret?  Otherwise we've got some bogus uses of page below
> >> and we're setting ret for no reason here.
> >>
> >>> +	}
> >>> +
> >>> +	kva = (unsigned long) page_address(page);
> >>> +	kva += offset;
> >>> +
> >>> +	/* tce_build receives a virtual address */
> >>> +	entry += tbl->it_offset; /* Offset into real TCE table */
> >>> +	ret = ppc_md.tce_build(tbl, entry, 1, kva, direction, NULL);
> >>> +
> >>> +	/* tce_build() only returns non-zero for transient errors */
> >>> +	if (unlikely(ret)) {
> >>> +		printk(KERN_ERR "tce_vfio: tce_put failed on tce=%llx
> >> ioba=%lx kva=%lx ret=%d\n",
> >>> +				tce, entry << IOMMU_PAGE_SHIFT, kva, ret);
> >>> +		put_page(page);
> >>> +		return -EIO;
> >>> +	}
> >>> +
> >>> +	return 0;
> >>> +}
> >>> +
> >>> +static void tce_flush(struct iommu_table *tbl) {
> >>> +	/* Flush/invalidate TLB caches if necessary */
> >>> +	if (ppc_md.tce_flush)
> >>> +		ppc_md.tce_flush(tbl);
> >>> +
> >>> +	/* Make sure updates are seen by hardware */
> >>> +	mb();
> >>> +}
> >>> +
> >>> +long iommu_put_tces(struct iommu_table *tbl, unsigned long entry,
> >> uint64_t tce,
> >>> +		enum dma_data_direction direction, unsigned long pages) {
> >>> +	int i, ret = 0, pages_to_put = 0;
> >>> +	struct page *page;
> >>> +	struct iommu_pool *pool = get_pool(tbl, entry);
> >>> +	struct page **oldpages;
> >>> +	const int oldpagesnum = PAGE_SIZE/sizeof(*oldpages);
> >>> +
> >>> +	BUILD_BUG_ON(PAGE_SIZE < IOMMU_PAGE_SIZE);
> >>> +
> >>> +	/* Handle a single page request without allocation
> >>> +	   of pages-to-release array */
> >>> +	if (pages == 1) {
> >>> +		spin_lock(&(pool->lock));
> >>> +		page = free_tce(tbl, entry);
> >>> +
> >>> +		if (direction != DMA_NONE)
> >>> +			ret = put_tce(tbl, entry, tce, direction);
> >>> +
> >>> +		tce_flush(tbl);
> >>> +
> >>> +		if (page)
> >>> +			put_page(page);
> >>> +
> >>> +		spin_unlock(&(pool->lock));
> >>> +		return ret;
> >>> +	}
> >>> +
> >>> +	/* Releasing multiple pages */
> >>> +	/* Allocate an array for pages to be released after TCE table
> >>> +	   is updated */
> >>> +	oldpages = kmalloc(PAGE_SIZE, GFP_KERNEL);
> >>> +	if (!oldpages)
> >>> +		return -ENOMEM;
> >>> +
> >>> +	spin_lock(&(pool->lock));
> >>> +
> >>> +	for (i = 0; (i < pages) && !ret; ++i, ++entry, tce +=
> >> IOMMU_PAGE_SIZE) {
> >>> +		page = free_tce(tbl, entry);
> >>> +		if (page) {
> >>> +			oldpages[pages_to_put] = page;
> >>> +			++pages_to_put;
> >>> +		}
> >>> +
> >>> +		if (direction != DMA_NONE)
> >>> +			ret = put_tce(tbl, entry, tce, direction);
> >>> +
> >>> +		/* Release old pages if we reached the end of oldpages[] or
> >>> +		   it is the last page or we are about to exit the loop */
> >>> +		if ((pages_to_put == oldpagesnum) || (i == pages - 1) || ret)
> >> {
> >>> +			tce_flush(tbl);
> >>
> >> Avoiding tce_flush() is the reason for all this extra overhead, right?
> >> I wonder if it'd be cleaner separating map vs unmap, where the map case
> >> can avoid the oldpages array... but that means inserting new mappings on
> >> top of old ones wouldn't put the pages.
> 
> 
> Yes, we do not want to lose pages if the guest forgot to unmap them.

Hmm, does that mean we're not actively clearing tce entries or somehow
disabling the iommu window when the iommu is released through vfio?

> >>> +
> >>> +			/* Release pages after removing them from TCE table */
> >>> +			while (pages_to_put) {
> >>> +				--pages_to_put;
> >>> +				put_page(oldpages[pages_to_put]);
> >>> +			}
> >>> +		}
> >>> +	}
> >>> +
> >>> +	spin_unlock(&(pool->lock));
> >>> +	kfree(oldpages);
> >>> +
> >>> +	return ret;
> >>> +}
> >>> +EXPORT_SYMBOL_GPL(iommu_put_tces);
> >>> +#endif /* CONFIG_IOMMU_API */
> >>> diff --git a/arch/powerpc/platforms/powernv/pci.c
> >>> b/arch/powerpc/platforms/powernv/pci.c
> >>> index 05205cf..676f4d9 100644
> >>> --- a/arch/powerpc/platforms/powernv/pci.c
> >>> +++ b/arch/powerpc/platforms/powernv/pci.c
> >>> @@ -8,30 +8,31 @@
> >>>    * This program is free software; you can redistribute it and/or
> >>>    * modify it under the terms of the GNU General Public License
> >>>    * as published by the Free Software Foundation; either version
> >>>    * 2 of the License, or (at your option) any later version.
> >>>    */
> >>>
> >>>   #include <linux/kernel.h>
> >>>   #include <linux/pci.h>
> >>>   #include <linux/delay.h>
> >>>   #include <linux/string.h>
> >>>   #include <linux/init.h>
> >>>   #include <linux/bootmem.h>
> >>>   #include <linux/irq.h>
> >>>   #include <linux/io.h>
> >>>   #include <linux/msi.h>
> >>> +#include <linux/iommu.h>
> >>>
> >>>   #include <asm/sections.h>
> >>>   #include <asm/io.h>
> >>>   #include <asm/prom.h>
> >>>   #include <asm/pci-bridge.h>
> >>>   #include <asm/machdep.h>
> >>>   #include <asm/ppc-pci.h>
> >>>   #include <asm/opal.h>
> >>>   #include <asm/iommu.h>
> >>>   #include <asm/tce.h>
> >>>   #include <asm/abs_addr.h>
> >>>   #include <asm/firmware.h>
> >>>
> >>>   #include "powernv.h"
> >>>   #include "pci.h"
> >>> @@ -601,15 +602,149 @@ void __init pnv_pci_init(void)
> >>>   	/* Configure IOMMU DMA hooks */
> >>>   	ppc_md.pci_dma_dev_setup = pnv_pci_dma_dev_setup;
> >>>   	ppc_md.tce_build = pnv_tce_build;
> >>>   	ppc_md.tce_free = pnv_tce_free;
> >>>   	ppc_md.tce_get = pnv_tce_get;
> >>>   	ppc_md.pci_probe_mode = pnv_pci_probe_mode;
> >>>   	set_pci_dma_ops(&dma_iommu_ops);
> >>>
> >>>   	/* Configure MSIs */
> >>>   #ifdef CONFIG_PCI_MSI
> >>>   	ppc_md.msi_check_device = pnv_msi_check_device;
> >>>   	ppc_md.setup_msi_irqs = pnv_setup_msi_irqs;
> >>>   	ppc_md.teardown_msi_irqs = pnv_teardown_msi_irqs;  #endif  }
> >>> +
> >>> +#ifdef CONFIG_IOMMU_API
> >>> +/*
> >>> + * IOMMU groups support required by VFIO  */ static int
> >>> +add_device(struct device *dev) {
> >>> +	struct iommu_table *tbl;
> >>> +	int ret = 0;
> >>> +
> >>> +	if (WARN_ON(dev->iommu_group)) {
> >>> +		printk(KERN_WARNING "tce_vfio: device %s is already in iommu
> >> group %d, skipping\n",
> >>> +				dev->kobj.name,
> >>
> >> dev_name(dev)
> >>
> >>> +				iommu_group_id(dev->iommu_group));
> >>> +		return -EBUSY;
> >>> +	}
> >>> +
> >>> +	tbl = get_iommu_table_base(dev);
> >>> +	if (!tbl) {
> >>> +		pr_debug("tce_vfio: skipping device %s with no tbl\n",
> >>> +				dev->kobj.name);
> >>> +		return 0;
> >>> +	}
> >>> +
> >>> +	pr_debug("tce_vfio: adding %s to iommu group %d\n",
> >>> +			dev->kobj.name, iommu_group_id(tbl->it_group));
> >>> +
> >>> +	ret = iommu_group_add_device(tbl->it_group, dev);
> >>> +	if (ret < 0)
> >>> +		printk(KERN_ERR "tce_vfio: %s has not been added, ret=%d\n",
> >>> +				dev->kobj.name, ret);
> >>> +
> >>> +	return ret;
> >>> +}
> >>> +
> >>> +static void del_device(struct device *dev) {
> >>> +	iommu_group_remove_device(dev);
> >>> +}
> >>> +
> >>> +static int iommu_bus_notifier(struct notifier_block *nb,
> >>> +			      unsigned long action, void *data) {
> >>> +	struct device *dev = data;
> >>> +
> >>> +	switch (action) {
> >>> +	case BUS_NOTIFY_ADD_DEVICE:
> >>> +		return add_device(dev);
> >>> +	case BUS_NOTIFY_DEL_DEVICE:
> >>> +		del_device(dev);
> >>> +		return 0;
> >>> +	default:
> >>> +		return 0;
> >>> +	}
> >>> +}
> >>> +
> >>> +static struct notifier_block tce_iommu_bus_nb = {
> >>> +	.notifier_call = iommu_bus_notifier, };
> >>> +
> >>> +static void group_release(void *iommu_data) {
> >>> +	struct iommu_table *tbl = iommu_data;
> >>> +	tbl->it_group = NULL;
> >>> +}
> >>> +
> >>> +static int __init tce_iommu_init(void) {
> >>> +	struct pci_dev *pdev = NULL;
> >>> +	struct iommu_table *tbl;
> >>> +	struct iommu_group *grp;
> >>> +
> >>> +	bus_register_notifier(&pci_bus_type, &tce_iommu_bus_nb);
> >>
> >> There's already a notifier in the iommu code if you were to register an
> >> iommu_ops with the add/remove_device entries.  That would allow you to
> >> remove the notifier block and notifier function below and the second loop
> >> below.  Are you avoiding that to avoid the rest of iommu_ops?
> 
> Yes. I need to implement either a small part of iommu_ops (especially the 
> part which I think should not be there at all) or a notifier; I cannot see 
> how the first is simpler.
> 
> 
> > [Sethi Varun-B16395] Could be one reason; also, they are associating the iommu group with the tce table entry and not the device.
> 
> Also true.
> I would actually allocate IOMMU groups right after we discover a new 
> one, but that happens during the PCI scan, which runs before 
> subsys_initcall(iommu_init) is called, so I added this first loop.
> 
> 
> >> Also, shouldn't this notifier only be registered after the first loop
> >> below?  Otherwise ADD_DEVICE could race with setting up groups, which we
> >> assume are present in the add_device() above.
> > [Sethi Varun-B16395] Isn't this similar to how the notifier is registered in iommu_bus_init? First a notifier is registered and then we check for devices that have already been probed.
> 
> Yep. Not very familiar with this stuff but if it is done one way and it is 
> already upstream, I cannot see why I should go another way :)

The existing notifier callback and loop should be able to operate in
parallel... of course they don't because we're not actively adding new
devices at the point where it's set up.  IIRC, the notifier callback
blindly uses something that's not set up at the point it's registered.
That's a bit sloppy.  Maybe I'm misremembering; I'll verify in your new
version.  Thanks,

Alex
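
(For reference, the iommu_ops route discussed above would look roughly
like the sketch below.  This is illustrative, not from any posted patch;
it assumes the add_device()/del_device() helpers quoted earlier, plus
the add_device/remove_device callbacks and bus_set_iommu() that the
iommu core provided at the time.)

/* Sketch: let the IOMMU core's own bus notifier drive device
 * add/remove instead of a private notifier_block.  The cost is
 * providing the rest of struct iommu_ops, which is the part
 * Alexey objects to implementing. */
static int tce_iommu_add_device(struct device *dev)
{
	return add_device(dev);
}

static void tce_iommu_remove_device(struct device *dev)
{
	del_device(dev);
}

static struct iommu_ops tce_iommu_ops = {
	.add_device	= tce_iommu_add_device,
	.remove_device	= tce_iommu_remove_device,
};

static int __init tce_iommu_ops_init(void)
{
	/* iommu_bus_init() registers the core notifier and replays
	 * already-probed devices, as Sethi notes above. */
	return bus_set_iommu(&pci_bus_type, &tce_iommu_ops);
}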

^ permalink raw reply	[flat|nested] 122+ messages in thread

* Re: [PATCH] vfio powerpc: enabled and supported on powernv platform
  2012-11-26 15:18           ` Alex Williamson
@ 2012-11-26 18:04             ` Alex Williamson
  -1 siblings, 0 replies; 122+ messages in thread
From: Alex Williamson @ 2012-11-26 18:04 UTC (permalink / raw)
  To: Alexey Kardashevskiy
  Cc: Sethi Varun-B16395, Benjamin Herrenschmidt, Paul Mackerras,
	linuxppc-dev, linux-kernel, kvm, David Gibson

On Mon, 2012-11-26 at 08:18 -0700, Alex Williamson wrote:
> On Fri, 2012-11-23 at 13:02 +1100, Alexey Kardashevskiy wrote:
> > On 22/11/12 22:56, Sethi Varun-B16395 wrote:
> > >
> > >
> > >> -----Original Message-----
> > >> From: linux-kernel-owner@vger.kernel.org [mailto:linux-kernel-
> > >> owner@vger.kernel.org] On Behalf Of Alex Williamson
> > >> Sent: Tuesday, November 20, 2012 11:50 PM
> > >> To: Alexey Kardashevskiy
> > >> Cc: Benjamin Herrenschmidt; Paul Mackerras; linuxppc-
> > >> dev@lists.ozlabs.org; linux-kernel@vger.kernel.org; kvm@vger.kernel.org;
> > >> David Gibson
> > >> Subject: Re: [PATCH] vfio powerpc: enabled and supported on powernv
> > >> platform
> > >>
> > >> On Tue, 2012-11-20 at 11:48 +1100, Alexey Kardashevskiy wrote:
> > >>> VFIO implements platform independent stuff such as a PCI driver, BAR
> > >>> access (via read/write on a file descriptor or direct mapping when
> > >>> possible) and IRQ signaling.
> > >>> The platform dependent part includes IOMMU initialization and
> > >>> handling.
> > >>>
> > >>> This patch initializes IOMMU groups based on the IOMMU configuration
> > >>> discovered during the PCI scan, only POWERNV platform is supported at
> > >>> the moment.
> > >>>
> > >>> Also the patch implements an VFIO-IOMMU driver which manages DMA
> > >>> mapping/unmapping requests coming from the client (now QEMU). It also
> > >>> returns a DMA window information to let the guest initialize the
> > >>> device tree for a guest OS properly. Although this driver has been
> > >>> tested only on POWERNV, it should work on any platform supporting TCE
> > >>> tables.
> > >>>
> > >>> To enable VFIO on POWER, enable SPAPR_TCE_IOMMU config option.
> > >>>
> > >>> Cc: David Gibson <david@gibson.dropbear.id.au>
> > >>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> > >>> ---
> > >>>   arch/powerpc/include/asm/iommu.h     |    6 +
> > >>>   arch/powerpc/kernel/iommu.c          |  140 +++++++++++++++++++
> > >>>   arch/powerpc/platforms/powernv/pci.c |  135 +++++++++++++++++++
> > >>>   drivers/iommu/Kconfig                |    8 ++
> > >>>   drivers/vfio/Kconfig                 |    6 +
> > >>>   drivers/vfio/Makefile                |    1 +
> > >>>   drivers/vfio/vfio_iommu_spapr_tce.c  |  247
> > >> ++++++++++++++++++++++++++++++++++
> > >>>   include/linux/vfio.h                 |   20 +++
> > >>>   8 files changed, 563 insertions(+)
> > >>>   create mode 100644 drivers/vfio/vfio_iommu_spapr_tce.c
> > >>>
> > >>> diff --git a/arch/powerpc/include/asm/iommu.h
> > >>> b/arch/powerpc/include/asm/iommu.h
> > >>> index cbfe678..5ba66cb 100644
> > >>> --- a/arch/powerpc/include/asm/iommu.h
> > >>> +++ b/arch/powerpc/include/asm/iommu.h
> > >>> @@ -64,30 +64,33 @@ struct iommu_pool {  }
> > >>> ____cacheline_aligned_in_smp;
> > >>>
> > >>>   struct iommu_table {
> > >>>   	unsigned long  it_busno;     /* Bus number this table belongs to */
> > >>>   	unsigned long  it_size;      /* Size of iommu table in entries */
> > >>>   	unsigned long  it_offset;    /* Offset into global table */
> > >>>   	unsigned long  it_base;      /* mapped address of tce table */
> > >>>   	unsigned long  it_index;     /* which iommu table this is */
> > >>>   	unsigned long  it_type;      /* type: PCI or Virtual Bus */
> > >>>   	unsigned long  it_blocksize; /* Entries in each block (cacheline)
> > >> */
> > >>>   	unsigned long  poolsize;
> > >>>   	unsigned long  nr_pools;
> > >>>   	struct iommu_pool large_pool;
> > >>>   	struct iommu_pool pools[IOMMU_NR_POOLS];
> > >>>   	unsigned long *it_map;       /* A simple allocation bitmap for now
> > >> */
> > >>> +#ifdef CONFIG_IOMMU_API
> > >>> +	struct iommu_group *it_group;
> > >>> +#endif
> > >>>   };
> > >>>
> > >>>   struct scatterlist;
> > >>>
> > >>>   static inline void set_iommu_table_base(struct device *dev, void
> > >>> *base)  {
> > >>>   	dev->archdata.dma_data.iommu_table_base = base;  }
> > >>>
> > >>>   static inline void *get_iommu_table_base(struct device *dev)  {
> > >>>   	return dev->archdata.dma_data.iommu_table_base;
> > >>>   }
> > >>>
> > >>>   /* Frees table for an individual device node */ @@ -135,17 +138,20 @@
> > >>> static inline void pci_iommu_init(void) { }  extern void
> > >>> alloc_dart_table(void);  #if defined(CONFIG_PPC64) &&
> > >>> defined(CONFIG_PM)  static inline void iommu_save(void)  {
> > >>>   	if (ppc_md.iommu_save)
> > >>>   		ppc_md.iommu_save();
> > >>>   }
> > >>>
> > >>>   static inline void iommu_restore(void)  {
> > >>>   	if (ppc_md.iommu_restore)
> > >>>   		ppc_md.iommu_restore();
> > >>>   }
> > >>>   #endif
> > >>>
> > >>> +extern long iommu_put_tces(struct iommu_table *tbl, unsigned long
> > >> entry, uint64_t tce,
> > >>> +		enum dma_data_direction direction, unsigned long pages);
> > >>> +
> > >>>   #endif /* __KERNEL__ */
> > >>>   #endif /* _ASM_IOMMU_H */
> > >>> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
> > >>> index ff5a6ce..94f614b 100644
> > >>> --- a/arch/powerpc/kernel/iommu.c
> > >>> +++ b/arch/powerpc/kernel/iommu.c
> > >>> @@ -32,30 +32,31 @@
> > >>>   #include <linux/dma-mapping.h>
> > >>>   #include <linux/bitmap.h>
> > >>>   #include <linux/iommu-helper.h>
> > >>>   #include <linux/crash_dump.h>
> > >>>   #include <linux/hash.h>
> > >>>   #include <linux/fault-inject.h>
> > >>>   #include <linux/pci.h>
> > >>>   #include <asm/io.h>
> > >>>   #include <asm/prom.h>
> > >>>   #include <asm/iommu.h>
> > >>>   #include <asm/pci-bridge.h>
> > >>>   #include <asm/machdep.h>
> > >>>   #include <asm/kdump.h>
> > >>>   #include <asm/fadump.h>
> > >>>   #include <asm/vio.h>
> > >>> +#include <asm/tce.h>
> > >>>
> > >>>   #define DBG(...)
> > >>>
> > >>>   static int novmerge;
> > >>>
> > >>>   static void __iommu_free(struct iommu_table *, dma_addr_t, unsigned
> > >>> int);
> > >>>
> > >>>   static int __init setup_iommu(char *str)  {
> > >>>   	if (!strcmp(str, "novmerge"))
> > >>>   		novmerge = 1;
> > >>>   	else if (!strcmp(str, "vmerge"))
> > >>>   		novmerge = 0;
> > >>>   	return 1;
> > >>>   }
> > >>> @@ -844,15 +845,154 @@ void *iommu_alloc_coherent(struct device *dev,
> > >>> struct iommu_table *tbl,  }
> > >>>
> > >>>   void iommu_free_coherent(struct iommu_table *tbl, size_t size,
> > >>>   			 void *vaddr, dma_addr_t dma_handle)  {
> > >>>   	if (tbl) {
> > >>>   		unsigned int nio_pages;
> > >>>
> > >>>   		size = PAGE_ALIGN(size);
> > >>>   		nio_pages = size >> IOMMU_PAGE_SHIFT;
> > >>>   		iommu_free(tbl, dma_handle, nio_pages);
> > >>>   		size = PAGE_ALIGN(size);
> > >>>   		free_pages((unsigned long)vaddr, get_order(size));
> > >>>   	}
> > >>>   }
> > >>> +
> > >>> +#ifdef CONFIG_IOMMU_API
> > >>> +/*
> > >>> + * SPAPR TCE API
> > >>> + */
> > >>> +static struct page *free_tce(struct iommu_table *tbl, unsigned long
> > >>> +entry) {
> > >>> +	struct page *page = NULL;
> > >>
> > >> NULL initialization doesn't appear to be necessary
> > >>
> > >>> +	unsigned long oldtce;
> > >>> +
> > >>> +	oldtce = ppc_md.tce_get(tbl, entry);
> > >>> +
> > >>> +	if (!(oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)))
> > >>> +		return NULL;
> > >>> +
> > >>> +	page = pfn_to_page(oldtce >> PAGE_SHIFT);
> > >>> +
> > >>> +	WARN_ON(!page);
> > >>> +	if (page && (oldtce & TCE_PCI_WRITE))
> > >>> +		SetPageDirty(page);
> > >>> +	ppc_md.tce_free(tbl, entry, 1);
> > >>> +
> > >>> +	return page;
> > >>> +}
> > >>> +
> > >>> +static int put_tce(struct iommu_table *tbl, unsigned long entry,
> > >>> +		uint64_t tce, enum dma_data_direction direction) {
> > >>> +	int ret;
> > >>> +	struct page *page = NULL;
> > >>> +	unsigned long kva, offset;
> > >>> +
> > >>> +	/* Map new TCE */
> > >>> +	offset = (tce & IOMMU_PAGE_MASK) - (tce & PAGE_MASK);
> > >>> +	ret = get_user_pages_fast(tce & PAGE_MASK, 1,
> > >>> +			direction != DMA_TO_DEVICE, &page);
> > >>> +	if (ret < 1) {
> > >>> +		printk(KERN_ERR "tce_vfio: get_user_pages_fast failed
> > >> tce=%llx ioba=%lx ret=%d\n",
> > >>> +				tce, entry << IOMMU_PAGE_SHIFT, ret);
> > >>> +		if (!ret)
> > >>> +			ret = -EFAULT;
> > >>
> > >> Missing return ret?  Otherwise we've got some bogus uses of page below
> > >> and we're setting ret for no reason here.
> > >>
> > >>> +	}
> > >>> +
> > >>> +	kva = (unsigned long) page_address(page);
> > >>> +	kva += offset;
> > >>> +
> > >>> +	/* tce_build receives a virtual address */
> > >>> +	entry += tbl->it_offset; /* Offset into real TCE table */
> > >>> +	ret = ppc_md.tce_build(tbl, entry, 1, kva, direction, NULL);
> > >>> +
> > >>> +	/* tce_build() only returns non-zero for transient errors */
> > >>> +	if (unlikely(ret)) {
> > >>> +		printk(KERN_ERR "tce_vfio: tce_put failed on tce=%llx
> > >> ioba=%lx kva=%lx ret=%d\n",
> > >>> +				tce, entry << IOMMU_PAGE_SHIFT, kva, ret);
> > >>> +		put_page(page);
> > >>> +		return -EIO;
> > >>> +	}
> > >>> +
> > >>> +	return 0;
> > >>> +}
> > >>> +
> > >>> +static void tce_flush(struct iommu_table *tbl) {
> > >>> +	/* Flush/invalidate TLB caches if necessary */
> > >>> +	if (ppc_md.tce_flush)
> > >>> +		ppc_md.tce_flush(tbl);
> > >>> +
> > >>> +	/* Make sure updates are seen by hardware */
> > >>> +	mb();
> > >>> +}
> > >>> +
> > >>> +long iommu_put_tces(struct iommu_table *tbl, unsigned long entry,
> > >> uint64_t tce,
> > >>> +		enum dma_data_direction direction, unsigned long pages) {
> > >>> +	int i, ret = 0, pages_to_put = 0;
> > >>> +	struct page *page;
> > >>> +	struct iommu_pool *pool = get_pool(tbl, entry);
> > >>> +	struct page **oldpages;
> > >>> +	const int oldpagesnum = PAGE_SIZE/sizeof(*oldpages);
> > >>> +
> > >>> +	BUILD_BUG_ON(PAGE_SIZE < IOMMU_PAGE_SIZE);
> > >>> +
> > >>> +	/* Handle a single page request without allocation
> > >>> +	   of pages-to-release array */
> > >>> +	if (pages == 1) {
> > >>> +		spin_lock(&(pool->lock));
> > >>> +		page = free_tce(tbl, entry);
> > >>> +
> > >>> +		if (direction != DMA_NONE)
> > >>> +			ret = put_tce(tbl, entry, tce, direction);
> > >>> +
> > >>> +		tce_flush(tbl);
> > >>> +
> > >>> +		if (page)
> > >>> +			put_page(page);
> > >>> +
> > >>> +		spin_unlock(&(pool->lock));
> > >>> +		return ret;
> > >>> +	}
> > >>> +
> > >>> +	/* Releasing multiple pages */
> > >>> +	/* Allocate an array for pages to be released after TCE table
> > >>> +	   is updated */
> > >>> +	oldpages = kmalloc(PAGE_SIZE, GFP_KERNEL);
> > >>> +	if (!oldpages)
> > >>> +		return -ENOMEM;
> > >>> +
> > >>> +	spin_lock(&(pool->lock));
> > >>> +
> > >>> +	for (i = 0; (i < pages) && !ret; ++i, ++entry, tce +=
> > >> IOMMU_PAGE_SIZE) {
> > >>> +		page = free_tce(tbl, entry);
> > >>> +		if (page) {
> > >>> +			oldpages[pages_to_put] = page;
> > >>> +			++pages_to_put;
> > >>> +		}
> > >>> +
> > >>> +		if (direction != DMA_NONE)
> > >>> +			ret = put_tce(tbl, entry, tce, direction);
> > >>> +
> > >>> +		/* Release old pages if we reached the end of oldpages[] or
> > >>> +		   it is the last page or we are about to exit the loop */
> > >>> +		if ((pages_to_put == oldpagesnum) || (i == pages - 1) || ret)
> > >> {
> > >>> +			tce_flush(tbl);
> > >>
> > >> Avoiding tce_flush() is the reason for all this extra overhead, right?
> > >> I wonder if it'd be cleaner separating map vs unmap, where the map case
> > >> can avoid the oldpages array... but that means inserting new mappings on
> > >> top of old ones wouldn't put the pages.
> > 
> > 
> > Yes, we do not want to lose pages if the guest forgot to unmap them.
> 
> Hmm, does that mean we're not actively clearing tce entries or somehow
> disabling the iommu window when the iommu is released through vfio?

Ok, I see tces are put on shutdown via tce_iommu_detach_group, so you're
more concerned about the guest simply mapping over top of its own
mappings.  Is that common?  Is it common enough for every multi-page
mapping to assume it will happen?  I know this is a performance
sensitive path for you and it seems like a map-only w/ fallback to
unmap, remap would be better in the general case.

On x86 we do exactly that, but we do the unmap, remap from userspace
when we get an EBUSY.  Thanks,

Alex
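
(The userspace pattern described above would look roughly like the
sketch below.  This is not code from this thread; it assumes a map-only
kernel path that fails an overlapping VFIO_IOMMU_MAP_DMA with EBUSY, as
type1 does on x86 — the spapr driver posted here does not do that.)

/* Userspace sketch: map-only fast path with unmap+remap fallback.
 * Assumes <sys/ioctl.h>, <string.h>, <errno.h>, <linux/vfio.h>. */
static int dma_map_with_fallback(int container,
		struct vfio_iommu_type1_dma_map *map)
{
	struct vfio_iommu_type1_dma_unmap unmap;

	if (ioctl(container, VFIO_IOMMU_MAP_DMA, map) == 0)
		return 0;
	if (errno != EBUSY)
		return -errno;

	/* The range was already mapped: drop the old entries first. */
	memset(&unmap, 0, sizeof(unmap));
	unmap.argsz = sizeof(unmap);
	unmap.iova = map->iova;
	unmap.size = map->size;
	if (ioctl(container, VFIO_IOMMU_UNMAP_DMA, &unmap) != 0)
		return -errno;

	return ioctl(container, VFIO_IOMMU_MAP_DMA, map) == 0 ? 0 : -errno;
}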


^ permalink raw reply	[flat|nested] 122+ messages in thread

* Re: [PATCH 1/2] vfio powerpc: implemented IOMMU driver for VFIO
  2012-11-23  9:03         ` Alexey Kardashevskiy
@ 2012-11-26 18:20           ` Alex Williamson
  -1 siblings, 0 replies; 122+ messages in thread
From: Alex Williamson @ 2012-11-26 18:20 UTC (permalink / raw)
  To: Alexey Kardashevskiy
  Cc: Benjamin Herrenschmidt, Paul Mackerras, David Gibson,
	linuxppc-dev, linux-kernel, kvm

On Fri, 2012-11-23 at 20:03 +1100, Alexey Kardashevskiy wrote:
> VFIO implements platform independent stuff such as
> a PCI driver, BAR access (via read/write on a file descriptor
> or direct mapping when possible) and IRQ signaling.
> 
> The platform dependent part includes IOMMU initialization
> and handling. This patch implements an IOMMU driver for VFIO
> which does mapping/unmapping pages for the guest IO and
> provides information about DMA window (required by a POWERPC
> guest).
> 
> The counterpart in QEMU is required to support this functionality.
> 
> Cc: David Gibson <david@gibson.dropbear.id.au>
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> ---
>  drivers/vfio/Kconfig                |    6 +
>  drivers/vfio/Makefile               |    1 +
>  drivers/vfio/vfio_iommu_spapr_tce.c |  247 +++++++++++++++++++++++++++++++++++
>  include/linux/vfio.h                |   20 +++
>  4 files changed, 274 insertions(+)
>  create mode 100644 drivers/vfio/vfio_iommu_spapr_tce.c
> 
> diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
> index 7cd5dec..b464687 100644
> --- a/drivers/vfio/Kconfig
> +++ b/drivers/vfio/Kconfig
> @@ -3,10 +3,16 @@ config VFIO_IOMMU_TYPE1
>  	depends on VFIO
>  	default n
>  
> +config VFIO_IOMMU_SPAPR_TCE
> +	tristate
> +	depends on VFIO && SPAPR_TCE_IOMMU
> +	default n
> +
>  menuconfig VFIO
>  	tristate "VFIO Non-Privileged userspace driver framework"
>  	depends on IOMMU_API
>  	select VFIO_IOMMU_TYPE1 if X86
> +	select VFIO_IOMMU_SPAPR_TCE if PPC_POWERNV
>  	help
>  	  VFIO provides a framework for secure userspace device drivers.
>  	  See Documentation/vfio.txt for more details.
> diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
> index 2398d4a..72bfabc 100644
> --- a/drivers/vfio/Makefile
> +++ b/drivers/vfio/Makefile
> @@ -1,3 +1,4 @@
>  obj-$(CONFIG_VFIO) += vfio.o
>  obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o
> +obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o
>  obj-$(CONFIG_VFIO_PCI) += pci/
> diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
> new file mode 100644
> index 0000000..46a6298
> --- /dev/null
> +++ b/drivers/vfio/vfio_iommu_spapr_tce.c
> @@ -0,0 +1,247 @@
> +/*
> + * VFIO: IOMMU DMA mapping support for TCE on POWER
> + *
> + * Copyright (C) 2012 IBM Corp.  All rights reserved.
> + *     Author: Alexey Kardashevskiy <aik@ozlabs.ru>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + *
> + * Derived from original vfio_iommu_type1.c:
> + * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
> + *     Author: Alex Williamson <alex.williamson@redhat.com>
> + */
> +
> +#include <linux/module.h>
> +#include <linux/pci.h>
> +#include <linux/slab.h>
> +#include <linux/uaccess.h>
> +#include <linux/err.h>
> +#include <linux/vfio.h>
> +#include <asm/iommu.h>
> +
> +#define DRIVER_VERSION  "0.1"
> +#define DRIVER_AUTHOR   "aik@ozlabs.ru"
> +#define DRIVER_DESC     "VFIO IOMMU SPAPR TCE"
> +
> +static void tce_iommu_detach_group(void *iommu_data,
> +		struct iommu_group *iommu_group);
> +
> +/*
> + * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
> + */
> +
> +/*
> + * The container descriptor supports only a single group per container.
> + * Required by the API as the container is not supplied with the IOMMU group
> + * at the moment of initialization.
> + */
> +struct tce_container {
> +	struct mutex lock;
> +	struct iommu_table *tbl;
> +};
> +
> +static void *tce_iommu_open(unsigned long arg)
> +{
> +	struct tce_container *container;
> +
> +	if (arg != VFIO_SPAPR_TCE_IOMMU) {
> +		printk(KERN_ERR "tce_vfio: Wrong IOMMU type\n");
> +		return ERR_PTR(-EINVAL);
> +	}
> +
> +	container = kzalloc(sizeof(*container), GFP_KERNEL);
> +	if (!container)
> +		return ERR_PTR(-ENOMEM);
> +
> +	mutex_init(&container->lock);
> +
> +	return container;
> +}
> +
> +static void tce_iommu_release(void *iommu_data)
> +{
> +	struct tce_container *container = iommu_data;
> +
> +	WARN_ON(container->tbl && !container->tbl->it_group);

I think your patch ordering is backwards here.  it_group isn't added
until 2/2.  I'd really like to see the arch/powerpc code approved and
merged by the powerpc maintainer before we add the code that makes use
of it into vfio.  Otherwise we just get lots of churn if interfaces
change or they disapprove of it altogether.

> +	if (container->tbl && container->tbl->it_group)
> +		tce_iommu_detach_group(iommu_data, container->tbl->it_group);
> +
> +	mutex_destroy(&container->lock);
> +
> +	kfree(container);
> +}
> +
> +static long tce_iommu_ioctl(void *iommu_data,
> +				 unsigned int cmd, unsigned long arg)
> +{
> +	struct tce_container *container = iommu_data;
> +	unsigned long minsz;
> +
> +	switch (cmd) {
> +	case VFIO_CHECK_EXTENSION: {
> +		return (arg == VFIO_SPAPR_TCE_IOMMU) ? 1 : 0;
> +	}
> +	case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
> +		struct vfio_iommu_spapr_tce_info info;
> +		struct iommu_table *tbl = container->tbl;
> +
> +		if (WARN_ON(!tbl))
> +			return -ENXIO;
> +
> +		minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
> +				dma64_window_size);
> +
> +		if (copy_from_user(&info, (void __user *)arg, minsz))
> +			return -EFAULT;
> +
> +		if (info.argsz < minsz)
> +			return -EINVAL;
> +
> +		info.dma32_window_start = tbl->it_offset << IOMMU_PAGE_SHIFT;
> +		info.dma32_window_size = tbl->it_size << IOMMU_PAGE_SHIFT;
> +		info.dma64_window_start = 0;
> +		info.dma64_window_size = 0;
> +		info.flags = 0;
> +
> +		if (copy_to_user((void __user *)arg, &info, minsz))
> +			return -EFAULT;
> +
> +		return 0;
> +	}
> +	case VFIO_IOMMU_MAP_DMA: {
> +		vfio_iommu_spapr_tce_dma_map param;
> +		struct iommu_table *tbl = container->tbl;
> +		enum dma_data_direction direction = DMA_NONE;
> +
> +		if (WARN_ON(!tbl))
> +			return -ENXIO;
> +
> +		minsz = offsetofend(vfio_iommu_spapr_tce_dma_map, size);
> +
> +		if (copy_from_user(&param, (void __user *)arg, minsz))
> +			return -EFAULT;
> +
> +		if (param.argsz < minsz)
> +			return -EINVAL;
> +
> +		if ((param.flags & VFIO_DMA_MAP_FLAG_READ) &&
> +				(param.flags & VFIO_DMA_MAP_FLAG_WRITE)) {
> +			direction = DMA_BIDIRECTIONAL;
> +		} else if (param.flags & VFIO_DMA_MAP_FLAG_READ) {
> +			direction = DMA_TO_DEVICE;
> +		} else if (param.flags & VFIO_DMA_MAP_FLAG_WRITE) {
> +			direction = DMA_FROM_DEVICE;
> +		}
> +
> +		param.size += param.iova & ~IOMMU_PAGE_MASK;
> +		param.size = _ALIGN_UP(param.size, IOMMU_PAGE_SIZE);

On x86 we force iova, vaddr, and size to all be aligned to the smallest
page granularity of the iommu and return -EINVAL if it doesn't fit.
What does it imply to the user if we silently align them to make it work here?
Won't this interface happily map overlapping entries with no indication
to the user that the previous mapping is no longer valid?

Maybe another reason why a combined unmap/map makes me nervous: we have
to assume the user knows what they're doing.
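
The type1-style check would be roughly this (a sketch against the
variables in the quoted ioctl handler):

		/* Sketch: reject unaligned requests with -EINVAL instead
		 * of silently rounding them up, as type1 does on x86. */
		if ((param.iova | param.vaddr | param.size) & ~IOMMU_PAGE_MASK)
			return -EINVAL;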

> +
> +		return iommu_put_tces(tbl, param.iova >> IOMMU_PAGE_SHIFT,
> +				param.vaddr & IOMMU_PAGE_MASK, direction,
> +				param.size >> IOMMU_PAGE_SHIFT);
> +	}
> +	case VFIO_IOMMU_UNMAP_DMA: {
> +		vfio_iommu_spapr_tce_dma_unmap param;
> +		struct iommu_table *tbl = container->tbl;
> +
> +		if (WARN_ON(!tbl))
> +			return -ENXIO;
> +
> +		minsz = offsetofend(vfio_iommu_spapr_tce_dma_unmap, size);
> +
> +		if (copy_from_user(&param, (void __user *)arg, minsz))
> +			return -EFAULT;
> +
> +		if (param.argsz < minsz)
> +			return -EINVAL;
> +
> +		param.size += param.iova & ~IOMMU_PAGE_MASK;
> +		param.size = _ALIGN_UP(param.size, IOMMU_PAGE_SIZE);
> +
> +		return iommu_put_tces(tbl, param.iova >> IOMMU_PAGE_SHIFT,
> +				0, DMA_NONE, param.size >> IOMMU_PAGE_SHIFT);
> +	}
> +	default:
> +		printk(KERN_WARNING "tce_vfio: unexpected cmd %x\n", cmd);

pr_warn
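
That is, a sketch of the suggested change:

		pr_warn("tce_vfio: unexpected cmd %x\n", cmd);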

> +	}
> +
> +	return -ENOTTY;
> +}
> +
> +static int tce_iommu_attach_group(void *iommu_data,
> +		struct iommu_group *iommu_group)
> +{
> +	struct tce_container *container = iommu_data;
> +	struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
> +
> +	BUG_ON(!tbl);
> +	mutex_lock(&container->lock);
> +	pr_debug("tce_vfio: Attaching group #%u to iommu %p\n",
> +			iommu_group_id(iommu_group), iommu_group);
> +	if (container->tbl) {
> +		printk(KERN_WARNING "tce_vfio: Only one group per IOMMU container is allowed, existing id=%d, attaching id=%d\n",

pr_warn

> +				iommu_group_id(container->tbl->it_group),
> +				iommu_group_id(iommu_group));
> +		mutex_unlock(&container->lock);
> +		return -EBUSY;
> +	}
> +
> +	container->tbl = tbl;

Would it be too much paranoia to clear all the tces here as you do below
on detach?  i.e. is there any risk that there's leftover programming?
x86 allocates a new domain on open of the iommu, so we always start out
clean.
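
A sketch of that clearing, reusing the call the detach path below
already makes:

	/* Sketch: flush any leftover programming before taking
	 * ownership, mirroring tce_iommu_detach_group() below. */
	iommu_put_tces(tbl, tbl->it_offset, 0, DMA_NONE, tbl->it_size);
	container->tbl = tbl;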

> +	mutex_unlock(&container->lock);
> +
> +	return 0;
> +}
> +
> +static void tce_iommu_detach_group(void *iommu_data,
> +		struct iommu_group *iommu_group)
> +{
> +	struct tce_container *container = iommu_data;
> +	struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
> +
> +	BUG_ON(!tbl);
> +	mutex_lock(&container->lock);
> +	if (tbl != container->tbl) {
> +		printk(KERN_WARNING "tce_vfio: detaching group #%u, expected group is #%u\n",

pr_warn

> +				iommu_group_id(iommu_group),
> +				iommu_group_id(tbl->it_group));
> +	} else {
> +
> +		pr_debug("tce_vfio: detaching group #%u from iommu %p\n",
> +				iommu_group_id(iommu_group), iommu_group);
> +
> +		iommu_put_tces(tbl, tbl->it_offset, 0, DMA_NONE, tbl->it_size);

So this cleans out any mappings when vfio is closed, good.

> +		container->tbl = NULL;
> +	}
> +	mutex_unlock(&container->lock);
> +}
> +
> +const struct vfio_iommu_driver_ops tce_iommu_driver_ops = {
> +	.name		= "iommu-vfio-powerpc",
> +	.owner		= THIS_MODULE,
> +	.open		= tce_iommu_open,
> +	.release	= tce_iommu_release,
> +	.ioctl		= tce_iommu_ioctl,
> +	.attach_group	= tce_iommu_attach_group,
> +	.detach_group	= tce_iommu_detach_group,
> +};
> +
> +static int __init tce_iommu_init(void)
> +{
> +	return vfio_register_iommu_driver(&tce_iommu_driver_ops);
> +}
> +
> +static void __exit tce_iommu_cleanup(void)
> +{
> +	vfio_unregister_iommu_driver(&tce_iommu_driver_ops);
> +}
> +
> +module_init(tce_iommu_init);
> +module_exit(tce_iommu_cleanup);
> +
> +MODULE_VERSION(DRIVER_VERSION);
> +MODULE_LICENSE("GPL v2");
> +MODULE_AUTHOR(DRIVER_AUTHOR);
> +MODULE_DESCRIPTION(DRIVER_DESC);
> +
> diff --git a/include/linux/vfio.h b/include/linux/vfio.h
> index 0a4f180..3ecd65c 100644
> --- a/include/linux/vfio.h
> +++ b/include/linux/vfio.h
> @@ -99,6 +99,7 @@ extern void vfio_unregister_iommu_driver(
>  /* Extensions */
>  
>  #define VFIO_TYPE1_IOMMU		1
> +#define VFIO_SPAPR_TCE_IOMMU		2
>  
>  /*
>   * The IOCTL interface is designed for extensibility by embedding the
> @@ -442,4 +443,23 @@ struct vfio_iommu_type1_dma_unmap {
>  
>  #define VFIO_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14)
>  
> +/* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */
> +
> +struct vfio_iommu_spapr_tce_info {
> +	__u32 argsz;
> +	__u32 flags;
> +	__u32 dma32_window_start;
> +	__u32 dma32_window_size;
> +	__u64 dma64_window_start;
> +	__u64 dma64_window_size;
> +};

Is there anything we can document about this?  It should probably list
that size is in bytes.  Is there any need to communicate the IOMMU page
size here?
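
A documented variant might read as follows (sketch only; the page-size
question is left as a comment, since adding a field would change the
uAPI layout):

/* Sketch: same layout as the patch, with the units spelled out. */
struct vfio_iommu_spapr_tce_info {
	__u32 argsz;
	__u32 flags;			/* reserved for future use */
	__u32 dma32_window_start;	/* bus address, in bytes */
	__u32 dma32_window_size;	/* window size, in bytes */
	__u64 dma64_window_start;
	__u64 dma64_window_size;
	/* open question: report the IOMMU page size here too,
	 * e.g. a hypothetical __u32 iommu_page_size, in bytes? */
};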

> +
> +#define VFIO_IOMMU_SPAPR_TCE_GET_INFO	_IO(VFIO_TYPE, VFIO_BASE + 12)
> +
> +/* Reuse type1 map/unmap structs as they are the same at the moment */
> +typedef struct vfio_iommu_type1_dma_map vfio_iommu_spapr_tce_dma_map;
> +typedef struct vfio_iommu_type1_dma_unmap vfio_iommu_spapr_tce_dma_unmap;
> +
> +/* ***************************************************************** */
> +
>  #endif /* VFIO_H */

Thanks,

Alex




^ permalink raw reply	[flat|nested] 122+ messages in thread

* Re: [PATCH 1/2] vfio powerpc: implemented IOMMU driver for VFIO
@ 2012-11-26 18:20           ` Alex Williamson
  0 siblings, 0 replies; 122+ messages in thread
From: Alex Williamson @ 2012-11-26 18:20 UTC (permalink / raw)
  To: Alexey Kardashevskiy
  Cc: kvm, linux-kernel, Paul Mackerras, linuxppc-dev, David Gibson

On Fri, 2012-11-23 at 20:03 +1100, Alexey Kardashevskiy wrote:
> VFIO implements platform independent stuff such as
> a PCI driver, BAR access (via read/write on a file descriptor
> or direct mapping when possible) and IRQ signaling.
> 
> The platform dependent part includes IOMMU initialization
> and handling. This patch implements an IOMMU driver for VFIO
> which does mapping/unmapping pages for the guest IO and
> provides information about DMA window (required by a POWERPC
> guest).
> 
> The counterpart in QEMU is required to support this functionality.
> 
> Cc: David Gibson <david@gibson.dropbear.id.au>
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> ---
>  drivers/vfio/Kconfig                |    6 +
>  drivers/vfio/Makefile               |    1 +
>  drivers/vfio/vfio_iommu_spapr_tce.c |  247 +++++++++++++++++++++++++++++++++++
>  include/linux/vfio.h                |   20 +++
>  4 files changed, 274 insertions(+)
>  create mode 100644 drivers/vfio/vfio_iommu_spapr_tce.c
> 
> diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
> index 7cd5dec..b464687 100644
> --- a/drivers/vfio/Kconfig
> +++ b/drivers/vfio/Kconfig
> @@ -3,10 +3,16 @@ config VFIO_IOMMU_TYPE1
>  	depends on VFIO
>  	default n
>  
> +config VFIO_IOMMU_SPAPR_TCE
> +	tristate
> +	depends on VFIO && SPAPR_TCE_IOMMU
> +	default n
> +
>  menuconfig VFIO
>  	tristate "VFIO Non-Privileged userspace driver framework"
>  	depends on IOMMU_API
>  	select VFIO_IOMMU_TYPE1 if X86
> +	select VFIO_IOMMU_SPAPR_TCE if PPC_POWERNV
>  	help
>  	  VFIO provides a framework for secure userspace device drivers.
>  	  See Documentation/vfio.txt for more details.
> diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
> index 2398d4a..72bfabc 100644
> --- a/drivers/vfio/Makefile
> +++ b/drivers/vfio/Makefile
> @@ -1,3 +1,4 @@
>  obj-$(CONFIG_VFIO) += vfio.o
>  obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o
> +obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o
>  obj-$(CONFIG_VFIO_PCI) += pci/
> diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
> new file mode 100644
> index 0000000..46a6298
> --- /dev/null
> +++ b/drivers/vfio/vfio_iommu_spapr_tce.c
> @@ -0,0 +1,247 @@
> +/*
> + * VFIO: IOMMU DMA mapping support for TCE on POWER
> + *
> + * Copyright (C) 2012 IBM Corp.  All rights reserved.
> + *     Author: Alexey Kardashevskiy <aik@ozlabs.ru>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + *
> + * Derived from original vfio_iommu_type1.c:
> + * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
> + *     Author: Alex Williamson <alex.williamson@redhat.com>
> + */
> +
> +#include <linux/module.h>
> +#include <linux/pci.h>
> +#include <linux/slab.h>
> +#include <linux/uaccess.h>
> +#include <linux/err.h>
> +#include <linux/vfio.h>
> +#include <asm/iommu.h>
> +
> +#define DRIVER_VERSION  "0.1"
> +#define DRIVER_AUTHOR   "aik@ozlabs.ru"
> +#define DRIVER_DESC     "VFIO IOMMU SPAPR TCE"
> +
> +static void tce_iommu_detach_group(void *iommu_data,
> +		struct iommu_group *iommu_group);
> +
> +/*
> + * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
> + */
> +
> +/*
> + * The container descriptor supports only a single group per container.
> + * Required by the API as the container is not supplied with the IOMMU group
> + * at the moment of initialization.
> + */
> +struct tce_container {
> +	struct mutex lock;
> +	struct iommu_table *tbl;
> +};
> +
> +static void *tce_iommu_open(unsigned long arg)
> +{
> +	struct tce_container *container;
> +
> +	if (arg != VFIO_SPAPR_TCE_IOMMU) {
> +		printk(KERN_ERR "tce_vfio: Wrong IOMMU type\n");
> +		return ERR_PTR(-EINVAL);
> +	}
> +
> +	container = kzalloc(sizeof(*container), GFP_KERNEL);
> +	if (!container)
> +		return ERR_PTR(-ENOMEM);
> +
> +	mutex_init(&container->lock);
> +
> +	return container;
> +}
> +
> +static void tce_iommu_release(void *iommu_data)
> +{
> +	struct tce_container *container = iommu_data;
> +
> +	WARN_ON(container->tbl && !container->tbl->it_group);

I think your patch ordering is backwards here.  it_group isn't added
until 2/2.  I'd really like to see the arch/powerpc code approved and
merged by the powerpc maintainer before we add the code that makes use
of it into vfio.  Otherwise we just get lots of churn if interfaces
change or they disapprove of it altogether.

> +	if (container->tbl && container->tbl->it_group)
> +		tce_iommu_detach_group(iommu_data, container->tbl->it_group);
> +
> +	mutex_destroy(&container->lock);
> +
> +	kfree(container);
> +}
> +
> +static long tce_iommu_ioctl(void *iommu_data,
> +				 unsigned int cmd, unsigned long arg)
> +{
> +	struct tce_container *container = iommu_data;
> +	unsigned long minsz;
> +
> +	switch (cmd) {
> +	case VFIO_CHECK_EXTENSION: {
> +		return (arg == VFIO_SPAPR_TCE_IOMMU) ? 1 : 0;
> +	}
> +	case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
> +		struct vfio_iommu_spapr_tce_info info;
> +		struct iommu_table *tbl = container->tbl;
> +
> +		if (WARN_ON(!tbl))
> +			return -ENXIO;
> +
> +		minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
> +				dma64_window_size);
> +
> +		if (copy_from_user(&info, (void __user *)arg, minsz))
> +			return -EFAULT;
> +
> +		if (info.argsz < minsz)
> +			return -EINVAL;
> +
> +		info.dma32_window_start = tbl->it_offset << IOMMU_PAGE_SHIFT;
> +		info.dma32_window_size = tbl->it_size << IOMMU_PAGE_SHIFT;
> +		info.dma64_window_start = 0;
> +		info.dma64_window_size = 0;
> +		info.flags = 0;
> +
> +		if (copy_to_user((void __user *)arg, &info, minsz))
> +			return -EFAULT;
> +
> +		return 0;
> +	}
> +	case VFIO_IOMMU_MAP_DMA: {
> +		vfio_iommu_spapr_tce_dma_map param;
> +		struct iommu_table *tbl = container->tbl;
> +		enum dma_data_direction direction = DMA_NONE;
> +
> +		if (WARN_ON(!tbl))
> +			return -ENXIO;
> +
> +		minsz = offsetofend(vfio_iommu_spapr_tce_dma_map, size);
> +
> +		if (copy_from_user(&param, (void __user *)arg, minsz))
> +			return -EFAULT;
> +
> +		if (param.argsz < minsz)
> +			return -EINVAL;
> +
> +		if ((param.flags & VFIO_DMA_MAP_FLAG_READ) &&
> +				(param.flags & VFIO_DMA_MAP_FLAG_WRITE)) {
> +			direction = DMA_BIDIRECTIONAL;
> +		} else if (param.flags & VFIO_DMA_MAP_FLAG_READ) {
> +			direction = DMA_TO_DEVICE;
> +		} else if (param.flags & VFIO_DMA_MAP_FLAG_WRITE) {
> +			direction = DMA_FROM_DEVICE;
> +		}
> +
> +		param.size += param.iova & ~IOMMU_PAGE_MASK;
> +		param.size = _ALIGN_UP(param.size, IOMMU_PAGE_SIZE);

On x86 we force iova, vaddr, and size to all be aligned to the smallest
page granularity of the iommu and return -EINVAL if it doesn't fit.
What does it imply to the user if their requests are always silently aligned to work here?
Won't this interface happily map overlapping entries with no indication
to the user that the previous mapping is no longer valid?

Maybe another reason why a combined unmap/map makes me nervous: we have
to assume the user knows what they're doing.
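
For illustration, the type1-style strictness would amount to a check like
this before doing any work (hypothetical, not part of this patch):

	if ((param.iova | param.vaddr | param.size) & ~IOMMU_PAGE_MASK)
		return -EINVAL;	/* reject unaligned requests outright */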

> +
> +		return iommu_put_tces(tbl, param.iova >> IOMMU_PAGE_SHIFT,
> +				param.vaddr & IOMMU_PAGE_MASK, direction,
> +				param.size >> IOMMU_PAGE_SHIFT);
> +	}
> +	case VFIO_IOMMU_UNMAP_DMA: {
> +		vfio_iommu_spapr_tce_dma_unmap param;
> +		struct iommu_table *tbl = container->tbl;
> +
> +		if (WARN_ON(!tbl))
> +			return -ENXIO;
> +
> +		minsz = offsetofend(vfio_iommu_spapr_tce_dma_unmap, size);
> +
> +		if (copy_from_user(&param, (void __user *)arg, minsz))
> +			return -EFAULT;
> +
> +		if (param.argsz < minsz)
> +			return -EINVAL;
> +
> +		param.size += param.iova & ~IOMMU_PAGE_MASK;
> +		param.size = _ALIGN_UP(param.size, IOMMU_PAGE_SIZE);
> +
> +		return iommu_put_tces(tbl, param.iova >> IOMMU_PAGE_SHIFT,
> +				0, DMA_NONE, param.size >> IOMMU_PAGE_SHIFT);
> +	}
> +	default:
> +		printk(KERN_WARNING "tce_vfio: unexpected cmd %x\n", cmd);

pr_warn

> +	}
> +
> +	return -ENOTTY;
> +}
> +
> +static int tce_iommu_attach_group(void *iommu_data,
> +		struct iommu_group *iommu_group)
> +{
> +	struct tce_container *container = iommu_data;
> +	struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
> +
> +	BUG_ON(!tbl);
> +	mutex_lock(&container->lock);
> +	pr_debug("tce_vfio: Attaching group #%u to iommu %p\n",
> +			iommu_group_id(iommu_group), iommu_group);
> +	if (container->tbl) {
> +		printk(KERN_WARNING "tce_vfio: Only one group per IOMMU container is allowed, existing id=%d, attaching id=%d\n",

pr_warn

> +				iommu_group_id(container->tbl->it_group),
> +				iommu_group_id(iommu_group));
> +		mutex_unlock(&container->lock);
> +		return -EBUSY;
> +	}
> +
> +	container->tbl = tbl;

Would it be too much paranoia to clear all the TCEs here as you do below
on detach?  I.e., is there any risk that there's leftover programming?
x86 allocates a new domain on open of the iommu, so we always start out
clean.
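
Concretely, that could reuse the detach-path cleanup (a sketch only,
relying on iommu_put_tces() exactly as defined in this patch):

	/* hypothetical: start from a clean table on attach, mirroring
	 * the cleanup done in tce_iommu_detach_group() */
	iommu_put_tces(tbl, tbl->it_offset, 0, DMA_NONE, tbl->it_size);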

> +	mutex_unlock(&container->lock);
> +
> +	return 0;
> +}
> +
> +static void tce_iommu_detach_group(void *iommu_data,
> +		struct iommu_group *iommu_group)
> +{
> +	struct tce_container *container = iommu_data;
> +	struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
> +
> +	BUG_ON(!tbl);
> +	mutex_lock(&container->lock);
> +	if (tbl != container->tbl) {
> +		printk(KERN_WARNING "tce_vfio: detaching group #%u, expected group is #%u\n",

pr_warn

> +				iommu_group_id(iommu_group),
> +				iommu_group_id(tbl->it_group));
> +	} else {
> +
> +		pr_debug("tce_vfio: detaching group #%u from iommu %p\n",
> +				iommu_group_id(iommu_group), iommu_group);
> +
> +		iommu_put_tces(tbl, tbl->it_offset, 0, DMA_NONE, tbl->it_size);

So this cleans out any mappings when vfio is closed, good.

> +		container->tbl = NULL;
> +	}
> +	mutex_unlock(&container->lock);
> +}
> +
> +const struct vfio_iommu_driver_ops tce_iommu_driver_ops = {
> +	.name		= "iommu-vfio-powerpc",
> +	.owner		= THIS_MODULE,
> +	.open		= tce_iommu_open,
> +	.release	= tce_iommu_release,
> +	.ioctl		= tce_iommu_ioctl,
> +	.attach_group	= tce_iommu_attach_group,
> +	.detach_group	= tce_iommu_detach_group,
> +};
> +
> +static int __init tce_iommu_init(void)
> +{
> +	return vfio_register_iommu_driver(&tce_iommu_driver_ops);
> +}
> +
> +static void __exit tce_iommu_cleanup(void)
> +{
> +	vfio_unregister_iommu_driver(&tce_iommu_driver_ops);
> +}
> +
> +module_init(tce_iommu_init);
> +module_exit(tce_iommu_cleanup);
> +
> +MODULE_VERSION(DRIVER_VERSION);
> +MODULE_LICENSE("GPL v2");
> +MODULE_AUTHOR(DRIVER_AUTHOR);
> +MODULE_DESCRIPTION(DRIVER_DESC);
> +
> diff --git a/include/linux/vfio.h b/include/linux/vfio.h
> index 0a4f180..3ecd65c 100644
> --- a/include/linux/vfio.h
> +++ b/include/linux/vfio.h
> @@ -99,6 +99,7 @@ extern void vfio_unregister_iommu_driver(
>  /* Extensions */
>  
>  #define VFIO_TYPE1_IOMMU		1
> +#define VFIO_SPAPR_TCE_IOMMU		2
>  
>  /*
>   * The IOCTL interface is designed for extensibility by embedding the
> @@ -442,4 +443,23 @@ struct vfio_iommu_type1_dma_unmap {
>  
>  #define VFIO_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14)
>  
> +/* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */
> +
> +struct vfio_iommu_spapr_tce_info {
> +	__u32 argsz;
> +	__u32 flags;
> +	__u32 dma32_window_start;
> +	__u32 dma32_window_size;
> +	__u64 dma64_window_start;
> +	__u64 dma64_window_size;
> +};

Is there anything we can document about this?  It should probably list
that size is in bytes.  Is there any need to communicate the IOMMU page
size here?

> +
> +#define VFIO_IOMMU_SPAPR_TCE_GET_INFO	_IO(VFIO_TYPE, VFIO_BASE + 12)
> +
> +/* Reuse type1 map/unmap structs as they are the same at the moment */
> +typedef struct vfio_iommu_type1_dma_map vfio_iommu_spapr_tce_dma_map;
> +typedef struct vfio_iommu_type1_dma_unmap vfio_iommu_spapr_tce_dma_unmap;
> +
> +/* ***************************************************************** */
> +
>  #endif /* VFIO_H */

Thanks,

Alex

^ permalink raw reply	[flat|nested] 122+ messages in thread

* Re: [PATCH] vfio powerpc: enabled and supported on powernv platform
  2012-11-26 18:04             ` Alex Williamson
@ 2012-11-27  0:21               ` Benjamin Herrenschmidt
  -1 siblings, 0 replies; 122+ messages in thread
From: Benjamin Herrenschmidt @ 2012-11-27  0:21 UTC (permalink / raw)
  To: Alex Williamson
  Cc: Alexey Kardashevskiy, Sethi Varun-B16395, Paul Mackerras,
	linuxppc-dev, linux-kernel, kvm, David Gibson

On Mon, 2012-11-26 at 11:04 -0700, Alex Williamson wrote:
> Ok, I see TCEs are put on shutdown via tce_iommu_detach_group, so you're
> more concerned about the guest simply mapping over top of its own
> mappings.  Is that common?  Is it common enough for every multi-page
> mapping to assume it will happen?  I know this is a performance
> sensitive path for you and it seems like a map-only w/ fallback to
> unmap, remap would be better in the general case.
> 
> On x86 we do exactly that, but we do the unmap, remap from userspace
> when we get an EBUSY.  Thanks, 

Right, Linux as a guest at least will never map "over" an existing
mapping; it will always unmap first. I.e., the only transitions we do on
H_PUT_TCE are 0 -> valid and valid -> 0.

So it would be fine to simplify the code and keep "map over map" as
a slow fallback. I can't speak for other operating systems, but we don't
care about those at this point :-)
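
A map-only fast path with a slow unmap+remap fallback could then look
roughly like this (hypothetical sketch; it assumes a map-only variant of
iommu_put_tces() that returns -EBUSY on a live entry, which the current
patch does not have):

	ret = iommu_put_tces(tbl, entry, tce, direction, npages);
	if (ret == -EBUSY) {
		/* slow path: the guest mapped over a live entry */
		iommu_put_tces(tbl, entry, 0, DMA_NONE, npages);
		ret = iommu_put_tces(tbl, entry, tce, direction, npages);
	}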

Cheers,
Ben.



^ permalink raw reply	[flat|nested] 122+ messages in thread

* Re: [PATCH] vfio powerpc: enabled and supported on powernv platform
  2012-11-26 18:04             ` Alex Williamson
@ 2012-11-27  3:28               ` Alexey Kardashevskiy
  -1 siblings, 0 replies; 122+ messages in thread
From: Alexey Kardashevskiy @ 2012-11-27  3:28 UTC (permalink / raw)
  To: Alex Williamson
  Cc: Sethi Varun-B16395, Benjamin Herrenschmidt, Paul Mackerras,
	linuxppc-dev, linux-kernel, kvm, David Gibson

On 27/11/12 05:04, Alex Williamson wrote:
> On Mon, 2012-11-26 at 08:18 -0700, Alex Williamson wrote:
>> On Fri, 2012-11-23 at 13:02 +1100, Alexey Kardashevskiy wrote:
>>> On 22/11/12 22:56, Sethi Varun-B16395 wrote:
>>>>
>>>>
>>>>> -----Original Message-----
>>>>> From: linux-kernel-owner@vger.kernel.org [mailto:linux-kernel-
>>>>> owner@vger.kernel.org] On Behalf Of Alex Williamson
>>>>> Sent: Tuesday, November 20, 2012 11:50 PM
>>>>> To: Alexey Kardashevskiy
>>>>> Cc: Benjamin Herrenschmidt; Paul Mackerras; linuxppc-
>>>>> dev@lists.ozlabs.org; linux-kernel@vger.kernel.org; kvm@vger.kernel.org;
>>>>> David Gibson
>>>>> Subject: Re: [PATCH] vfio powerpc: enabled and supported on powernv
>>>>> platform
>>>>>
>>>>> On Tue, 2012-11-20 at 11:48 +1100, Alexey Kardashevskiy wrote:
>>>>>> VFIO implements platform independent stuff such as a PCI driver, BAR
>>>>>> access (via read/write on a file descriptor or direct mapping when
>>>>>> possible) and IRQ signaling.
>>>>>> The platform dependent part includes IOMMU initialization and
>>>>>> handling.
>>>>>>
>>>>>> This patch initializes IOMMU groups based on the IOMMU configuration
>>>>>> discovered during the PCI scan, only POWERNV platform is supported at
>>>>>> the moment.
>>>>>>
>>>>>> Also the patch implements an VFIO-IOMMU driver which manages DMA
>>>>>> mapping/unmapping requests coming from the client (now QEMU). It also
>>>>>> returns a DMA window information to let the guest initialize the
>>>>>> device tree for a guest OS properly. Although this driver has been
>>>>>> tested only on POWERNV, it should work on any platform supporting TCE
>>>>>> tables.
>>>>>>
>>>>>> To enable VFIO on POWER, enable SPAPR_TCE_IOMMU config option.
>>>>>>
>>>>>> Cc: David Gibson <david@gibson.dropbear.id.au>
>>>>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>>>>>> ---
>>>>>>    arch/powerpc/include/asm/iommu.h     |    6 +
>>>>>>    arch/powerpc/kernel/iommu.c          |  140 +++++++++++++++++++
>>>>>>    arch/powerpc/platforms/powernv/pci.c |  135 +++++++++++++++++++
>>>>>>    drivers/iommu/Kconfig                |    8 ++
>>>>>>    drivers/vfio/Kconfig                 |    6 +
>>>>>>    drivers/vfio/Makefile                |    1 +
>>>>>>    drivers/vfio/vfio_iommu_spapr_tce.c  |  247
>>>>> ++++++++++++++++++++++++++++++++++
>>>>>>    include/linux/vfio.h                 |   20 +++
>>>>>>    8 files changed, 563 insertions(+)
>>>>>>    create mode 100644 drivers/vfio/vfio_iommu_spapr_tce.c
>>>>>>
>>>>>> diff --git a/arch/powerpc/include/asm/iommu.h
>>>>>> b/arch/powerpc/include/asm/iommu.h
>>>>>> index cbfe678..5ba66cb 100644
>>>>>> --- a/arch/powerpc/include/asm/iommu.h
>>>>>> +++ b/arch/powerpc/include/asm/iommu.h
>>>>>> @@ -64,30 +64,33 @@ struct iommu_pool {  }
>>>>>> ____cacheline_aligned_in_smp;
>>>>>>
>>>>>>    struct iommu_table {
>>>>>>    	unsigned long  it_busno;     /* Bus number this table belongs to */
>>>>>>    	unsigned long  it_size;      /* Size of iommu table in entries */
>>>>>>    	unsigned long  it_offset;    /* Offset into global table */
>>>>>>    	unsigned long  it_base;      /* mapped address of tce table */
>>>>>>    	unsigned long  it_index;     /* which iommu table this is */
>>>>>>    	unsigned long  it_type;      /* type: PCI or Virtual Bus */
>>>>>>    	unsigned long  it_blocksize; /* Entries in each block (cacheline)
>>>>> */
>>>>>>    	unsigned long  poolsize;
>>>>>>    	unsigned long  nr_pools;
>>>>>>    	struct iommu_pool large_pool;
>>>>>>    	struct iommu_pool pools[IOMMU_NR_POOLS];
>>>>>>    	unsigned long *it_map;       /* A simple allocation bitmap for now
>>>>> */
>>>>>> +#ifdef CONFIG_IOMMU_API
>>>>>> +	struct iommu_group *it_group;
>>>>>> +#endif
>>>>>>    };
>>>>>>
>>>>>>    struct scatterlist;
>>>>>>
>>>>>>    static inline void set_iommu_table_base(struct device *dev, void
>>>>>> *base)  {
>>>>>>    	dev->archdata.dma_data.iommu_table_base = base;  }
>>>>>>
>>>>>>    static inline void *get_iommu_table_base(struct device *dev)  {
>>>>>>    	return dev->archdata.dma_data.iommu_table_base;
>>>>>>    }
>>>>>>
>>>>>>    /* Frees table for an individual device node */ @@ -135,17 +138,20 @@
>>>>>> static inline void pci_iommu_init(void) { }  extern void
>>>>>> alloc_dart_table(void);  #if defined(CONFIG_PPC64) &&
>>>>>> defined(CONFIG_PM)  static inline void iommu_save(void)  {
>>>>>>    	if (ppc_md.iommu_save)
>>>>>>    		ppc_md.iommu_save();
>>>>>>    }
>>>>>>
>>>>>>    static inline void iommu_restore(void)  {
>>>>>>    	if (ppc_md.iommu_restore)
>>>>>>    		ppc_md.iommu_restore();
>>>>>>    }
>>>>>>    #endif
>>>>>>
>>>>>> +extern long iommu_put_tces(struct iommu_table *tbl, unsigned long
>>>>> entry, uint64_t tce,
>>>>>> +		enum dma_data_direction direction, unsigned long pages);
>>>>>> +
>>>>>>    #endif /* __KERNEL__ */
>>>>>>    #endif /* _ASM_IOMMU_H */
>>>>>> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
>>>>>> index ff5a6ce..94f614b 100644
>>>>>> --- a/arch/powerpc/kernel/iommu.c
>>>>>> +++ b/arch/powerpc/kernel/iommu.c
>>>>>> @@ -32,30 +32,31 @@
>>>>>>    #include <linux/dma-mapping.h>
>>>>>>    #include <linux/bitmap.h>
>>>>>>    #include <linux/iommu-helper.h>
>>>>>>    #include <linux/crash_dump.h>
>>>>>>    #include <linux/hash.h>
>>>>>>    #include <linux/fault-inject.h>
>>>>>>    #include <linux/pci.h>
>>>>>>    #include <asm/io.h>
>>>>>>    #include <asm/prom.h>
>>>>>>    #include <asm/iommu.h>
>>>>>>    #include <asm/pci-bridge.h>
>>>>>>    #include <asm/machdep.h>
>>>>>>    #include <asm/kdump.h>
>>>>>>    #include <asm/fadump.h>
>>>>>>    #include <asm/vio.h>
>>>>>> +#include <asm/tce.h>
>>>>>>
>>>>>>    #define DBG(...)
>>>>>>
>>>>>>    static int novmerge;
>>>>>>
>>>>>>    static void __iommu_free(struct iommu_table *, dma_addr_t, unsigned
>>>>>> int);
>>>>>>
>>>>>>    static int __init setup_iommu(char *str)  {
>>>>>>    	if (!strcmp(str, "novmerge"))
>>>>>>    		novmerge = 1;
>>>>>>    	else if (!strcmp(str, "vmerge"))
>>>>>>    		novmerge = 0;
>>>>>>    	return 1;
>>>>>>    }
>>>>>> @@ -844,15 +845,154 @@ void *iommu_alloc_coherent(struct device *dev,
>>>>>> struct iommu_table *tbl,  }
>>>>>>
>>>>>>    void iommu_free_coherent(struct iommu_table *tbl, size_t size,
>>>>>>    			 void *vaddr, dma_addr_t dma_handle)  {
>>>>>>    	if (tbl) {
>>>>>>    		unsigned int nio_pages;
>>>>>>
>>>>>>    		size = PAGE_ALIGN(size);
>>>>>>    		nio_pages = size >> IOMMU_PAGE_SHIFT;
>>>>>>    		iommu_free(tbl, dma_handle, nio_pages);
>>>>>>    		size = PAGE_ALIGN(size);
>>>>>>    		free_pages((unsigned long)vaddr, get_order(size));
>>>>>>    	}
>>>>>>    }
>>>>>> +
>>>>>> +#ifdef CONFIG_IOMMU_API
>>>>>> +/*
>>>>>> + * SPAPR TCE API
>>>>>> + */
>>>>>> +static struct page *free_tce(struct iommu_table *tbl, unsigned long
>>>>>> +entry) {
>>>>>> +	struct page *page = NULL;
>>>>>
>>>>> NULL initialization doesn't appear to be necessary
>>>>>
>>>>>> +	unsigned long oldtce;
>>>>>> +
>>>>>> +	oldtce = ppc_md.tce_get(tbl, entry);
>>>>>> +
>>>>>> +	if (!(oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)))
>>>>>> +		return NULL;
>>>>>> +
>>>>>> +	page = pfn_to_page(oldtce >> PAGE_SHIFT);
>>>>>> +
>>>>>> +	WARN_ON(!page);
>>>>>> +	if (page && (oldtce & TCE_PCI_WRITE))
>>>>>> +		SetPageDirty(page);
>>>>>> +	ppc_md.tce_free(tbl, entry, 1);
>>>>>> +
>>>>>> +	return page;
>>>>>> +}
>>>>>> +
>>>>>> +static int put_tce(struct iommu_table *tbl, unsigned long entry,
>>>>>> +		uint64_t tce, enum dma_data_direction direction) {
>>>>>> +	int ret;
>>>>>> +	struct page *page = NULL;
>>>>>> +	unsigned long kva, offset;
>>>>>> +
>>>>>> +	/* Map new TCE */
>>>>>> +	offset = (tce & IOMMU_PAGE_MASK) - (tce & PAGE_MASK);
>>>>>> +	ret = get_user_pages_fast(tce & PAGE_MASK, 1,
>>>>>> +			direction != DMA_TO_DEVICE, &page);
>>>>>> +	if (ret < 1) {
>>>>>> +		printk(KERN_ERR "tce_vfio: get_user_pages_fast failed
>>>>> tce=%llx ioba=%lx ret=%d\n",
>>>>>> +				tce, entry << IOMMU_PAGE_SHIFT, ret);
>>>>>> +		if (!ret)
>>>>>> +			ret = -EFAULT;
>>>>>
>>>>> Missing return ret?  Otherwise we've got some bogus uses of page below
>>>>> and we're setting ret for no reason here.
>>>>>
>>>>>> +	}
>>>>>> +
>>>>>> +	kva = (unsigned long) page_address(page);
>>>>>> +	kva += offset;
>>>>>> +
>>>>>> +	/* tce_build receives a virtual address */
>>>>>> +	entry += tbl->it_offset; /* Offset into real TCE table */
>>>>>> +	ret = ppc_md.tce_build(tbl, entry, 1, kva, direction, NULL);
>>>>>> +
>>>>>> +	/* tce_build() only returns non-zero for transient errors */
>>>>>> +	if (unlikely(ret)) {
>>>>>> +		printk(KERN_ERR "tce_vfio: tce_put failed on tce=%llx
>>>>> ioba=%lx kva=%lx ret=%d\n",
>>>>>> +				tce, entry << IOMMU_PAGE_SHIFT, kva, ret);
>>>>>> +		put_page(page);
>>>>>> +		return -EIO;
>>>>>> +	}
>>>>>> +
>>>>>> +	return 0;
>>>>>> +}
>>>>>> +
>>>>>> +static void tce_flush(struct iommu_table *tbl) {
>>>>>> +	/* Flush/invalidate TLB caches if necessary */
>>>>>> +	if (ppc_md.tce_flush)
>>>>>> +		ppc_md.tce_flush(tbl);
>>>>>> +
>>>>>> +	/* Make sure updates are seen by hardware */
>>>>>> +	mb();
>>>>>> +}
>>>>>> +
>>>>>> +long iommu_put_tces(struct iommu_table *tbl, unsigned long entry,
>>>>> uint64_t tce,
>>>>>> +		enum dma_data_direction direction, unsigned long pages) {
>>>>>> +	int i, ret = 0, pages_to_put = 0;
>>>>>> +	struct page *page;
>>>>>> +	struct iommu_pool *pool = get_pool(tbl, entry);
>>>>>> +	struct page **oldpages;
>>>>>> +	const int oldpagesnum = PAGE_SIZE/sizeof(*oldpages);
>>>>>> +
>>>>>> +	BUILD_BUG_ON(PAGE_SIZE < IOMMU_PAGE_SIZE);
>>>>>> +
>>>>>> +	/* Handle a single page request without allocation
>>>>>> +	   of pages-to-release array */
>>>>>> +	if (pages == 1) {
>>>>>> +		spin_lock(&(pool->lock));
>>>>>> +		page = free_tce(tbl, entry);
>>>>>> +
>>>>>> +		if (direction != DMA_NONE)
>>>>>> +			ret = put_tce(tbl, entry, tce, direction);
>>>>>> +
>>>>>> +		tce_flush(tbl);
>>>>>> +
>>>>>> +		if (page)
>>>>>> +			put_page(page);
>>>>>> +
>>>>>> +		spin_unlock(&(pool->lock));
>>>>>> +		return ret;
>>>>>> +	}
>>>>>> +
>>>>>> +	/* Releasing multiple pages */
>>>>>> +	/* Allocate an array for pages to be released after TCE table
>>>>>> +	   is updated */
>>>>>> +	oldpages = kmalloc(PAGE_SIZE, GFP_KERNEL);
>>>>>> +	if (!oldpages)
>>>>>> +		return -ENOMEM;
>>>>>> +
>>>>>> +	spin_lock(&(pool->lock));
>>>>>> +
>>>>>> +	for (i = 0; (i < pages) && !ret; ++i, ++entry, tce +=
>>>>> IOMMU_PAGE_SIZE) {
>>>>>> +		page = free_tce(tbl, entry);
>>>>>> +		if (page) {
>>>>>> +			oldpages[pages_to_put] = page;
>>>>>> +			++pages_to_put;
>>>>>> +		}
>>>>>> +
>>>>>> +		if (direction != DMA_NONE)
>>>>>> +			ret = put_tce(tbl, entry, tce, direction);
>>>>>> +
>>>>>> +		/* Release old pages if we reached the end of oldpages[] or
>>>>>> +		   it is the last page or we are about to exit the loop */
>>>>>> +		if ((pages_to_put == oldpagesnum) || (i == pages - 1) || ret)
>>>>> {
>>>>>> +			tce_flush(tbl);
>>>>>
>>>>> Avoiding tce_flush() is the reason for all this extra overhead, right?
>>>>> I wonder if it'd be cleaner separating map vs unmap, where the map case
>>>>> can avoid the oldpages array... but that means inserting new mappings on
>>>>> top of old ones wouldn't put the pages.
>>>
>>>
>>> Yes, we do not want to lose pages if the guest forgot to unmap them.
>>
>> Hmm, does that mean we're not actively clearing tce entries or somehow
>> disabling the iommu window when the iommu is released through vfio?
>
> Ok, I see TCEs are put on shutdown via tce_iommu_detach_group, so you're
> more concerned about the guest simply mapping over top of its own
> mappings.  Is that common?  Is it common enough for every multi-page
> mapping to assume it will happen?  I know this is a performance
> sensitive path for you and it seems like a map-only w/ fallback to
> unmap, remap would be better in the general case.


I do not get it. Where exactly does the performance suffer?
iommu_put_tces() with a non-zero "tce" (i.e. "map") has to check that the
entry is not already in use, at least to return EBUSY when it is, and that
check is performed. If the entry is zero, there is no overhead at all. And
that is going to be the 99.(9)% case, as the guest (un)maps one page per
call.
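
For clarity, the busy test is the same flag check free_tce() already does
in the patch; returning EBUSY for a live entry would be (condensed sketch):

	oldtce = ppc_md.tce_get(tbl, entry);
	if (oldtce & (TCE_PCI_WRITE | TCE_PCI_READ))
		return -EBUSY;	/* entry already in use */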

Generally speaking, we want to move "put tce" handling completely into the
kernel for (much) better performance, and then VFIO won't be dealing with
it at all.

We already agreed that the SPAPR TCE driver uses the x86 (aka type1) API,
but I do not see why the powerpc implementation should look x86-alike: it
still operates through powerpc machine-dependent callbacks, so the reader
has to have some powerpc knowledge anyway.


> On x86 we do exactly that, but we do the unmap, remap from userspace
> when we get an EBUSY.  Thanks,



-- 
Alexey

^ permalink raw reply	[flat|nested] 122+ messages in thread

* Re: [PATCH 1/2] vfio powerpc: implemented IOMMU driver for VFIO
  2012-11-26 18:20           ` Alex Williamson
@ 2012-11-27  4:06             ` Alexey Kardashevskiy
  -1 siblings, 0 replies; 122+ messages in thread
From: Alexey Kardashevskiy @ 2012-11-27  4:06 UTC (permalink / raw)
  To: Alex Williamson
  Cc: Benjamin Herrenschmidt, Paul Mackerras, David Gibson,
	linuxppc-dev, linux-kernel, kvm

On 27/11/12 05:20, Alex Williamson wrote:
> On Fri, 2012-11-23 at 20:03 +1100, Alexey Kardashevskiy wrote:
>> VFIO implements platform independent stuff such as
>> a PCI driver, BAR access (via read/write on a file descriptor
>> or direct mapping when possible) and IRQ signaling.
>>
>> The platform dependent part includes IOMMU initialization
>> and handling. This patch implements an IOMMU driver for VFIO
>> which does mapping/unmapping pages for the guest IO and
>> provides information about DMA window (required by a POWERPC
>> guest).
>>
>> The counterpart in QEMU is required to support this functionality.
>>
>> Cc: David Gibson <david@gibson.dropbear.id.au>
>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>> ---
>>   drivers/vfio/Kconfig                |    6 +
>>   drivers/vfio/Makefile               |    1 +
>>   drivers/vfio/vfio_iommu_spapr_tce.c |  247 +++++++++++++++++++++++++++++++++++
>>   include/linux/vfio.h                |   20 +++
>>   4 files changed, 274 insertions(+)
>>   create mode 100644 drivers/vfio/vfio_iommu_spapr_tce.c
>>
>> diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
>> index 7cd5dec..b464687 100644
>> --- a/drivers/vfio/Kconfig
>> +++ b/drivers/vfio/Kconfig
>> @@ -3,10 +3,16 @@ config VFIO_IOMMU_TYPE1
>>   	depends on VFIO
>>   	default n
>>
>> +config VFIO_IOMMU_SPAPR_TCE
>> +	tristate
>> +	depends on VFIO && SPAPR_TCE_IOMMU
>> +	default n
>> +
>>   menuconfig VFIO
>>   	tristate "VFIO Non-Privileged userspace driver framework"
>>   	depends on IOMMU_API
>>   	select VFIO_IOMMU_TYPE1 if X86
>> +	select VFIO_IOMMU_SPAPR_TCE if PPC_POWERNV
>>   	help
>>   	  VFIO provides a framework for secure userspace device drivers.
>>   	  See Documentation/vfio.txt for more details.
>> diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
>> index 2398d4a..72bfabc 100644
>> --- a/drivers/vfio/Makefile
>> +++ b/drivers/vfio/Makefile
>> @@ -1,3 +1,4 @@
>>   obj-$(CONFIG_VFIO) += vfio.o
>>   obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o
>> +obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o
>>   obj-$(CONFIG_VFIO_PCI) += pci/
>> diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
>> new file mode 100644
>> index 0000000..46a6298
>> --- /dev/null
>> +++ b/drivers/vfio/vfio_iommu_spapr_tce.c
>> @@ -0,0 +1,247 @@
>> +/*
>> + * VFIO: IOMMU DMA mapping support for TCE on POWER
>> + *
>> + * Copyright (C) 2012 IBM Corp.  All rights reserved.
>> + *     Author: Alexey Kardashevskiy <aik@ozlabs.ru>
>> + *
>> + * This program is free software; you can redistribute it and/or modify
>> + * it under the terms of the GNU General Public License version 2 as
>> + * published by the Free Software Foundation.
>> + *
>> + * Derived from original vfio_iommu_type1.c:
>> + * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
>> + *     Author: Alex Williamson <alex.williamson@redhat.com>
>> + */
>> +
>> +#include <linux/module.h>
>> +#include <linux/pci.h>
>> +#include <linux/slab.h>
>> +#include <linux/uaccess.h>
>> +#include <linux/err.h>
>> +#include <linux/vfio.h>
>> +#include <asm/iommu.h>
>> +
>> +#define DRIVER_VERSION  "0.1"
>> +#define DRIVER_AUTHOR   "aik@ozlabs.ru"
>> +#define DRIVER_DESC     "VFIO IOMMU SPAPR TCE"
>> +
>> +static void tce_iommu_detach_group(void *iommu_data,
>> +		struct iommu_group *iommu_group);
>> +
>> +/*
>> + * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
>> + */
>> +
>> +/*
>> + * The container descriptor supports only a single group per container.
>> + * Required by the API as the container is not supplied with the IOMMU group
>> + * at the moment of initialization.
>> + */
>> +struct tce_container {
>> +	struct mutex lock;
>> +	struct iommu_table *tbl;
>> +};
>> +
>> +static void *tce_iommu_open(unsigned long arg)
>> +{
>> +	struct tce_container *container;
>> +
>> +	if (arg != VFIO_SPAPR_TCE_IOMMU) {
>> +		printk(KERN_ERR "tce_vfio: Wrong IOMMU type\n");
>> +		return ERR_PTR(-EINVAL);
>> +	}
>> +
>> +	container = kzalloc(sizeof(*container), GFP_KERNEL);
>> +	if (!container)
>> +		return ERR_PTR(-ENOMEM);
>> +
>> +	mutex_init(&container->lock);
>> +
>> +	return container;
>> +}
>> +
>> +static void tce_iommu_release(void *iommu_data)
>> +{
>> +	struct tce_container *container = iommu_data;
>> +
>> +	WARN_ON(container->tbl && !container->tbl->it_group);
>
> I think your patch ordering is backwards here.  it_group isn't added
> until 2/2.  I'd really like to see the arch/powerpc code approved and
> merged by the powerpc maintainer before we add the code that makes use
> of it into vfio.  Otherwise we just get lots of churn if interfaces
> change or they disapprove of it altogether.


Makes sense, thanks.


>> +	if (container->tbl && container->tbl->it_group)
>> +		tce_iommu_detach_group(iommu_data, container->tbl->it_group);
>> +
>> +	mutex_destroy(&container->lock);
>> +
>> +	kfree(container);
>> +}
>> +
>> +static long tce_iommu_ioctl(void *iommu_data,
>> +				 unsigned int cmd, unsigned long arg)
>> +{
>> +	struct tce_container *container = iommu_data;
>> +	unsigned long minsz;
>> +
>> +	switch (cmd) {
>> +	case VFIO_CHECK_EXTENSION: {
>> +		return (arg == VFIO_SPAPR_TCE_IOMMU) ? 1 : 0;
>> +	}
>> +	case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
>> +		struct vfio_iommu_spapr_tce_info info;
>> +		struct iommu_table *tbl = container->tbl;
>> +
>> +		if (WARN_ON(!tbl))
>> +			return -ENXIO;
>> +
>> +		minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
>> +				dma64_window_size);
>> +
>> +		if (copy_from_user(&info, (void __user *)arg, minsz))
>> +			return -EFAULT;
>> +
>> +		if (info.argsz < minsz)
>> +			return -EINVAL;
>> +
>> +		info.dma32_window_start = tbl->it_offset << IOMMU_PAGE_SHIFT;
>> +		info.dma32_window_size = tbl->it_size << IOMMU_PAGE_SHIFT;
>> +		info.dma64_window_start = 0;
>> +		info.dma64_window_size = 0;
>> +		info.flags = 0;
>> +
>> +		if (copy_to_user((void __user *)arg, &info, minsz))
>> +			return -EFAULT;
>> +
>> +		return 0;
>> +	}
>> +	case VFIO_IOMMU_MAP_DMA: {
>> +		vfio_iommu_spapr_tce_dma_map param;
>> +		struct iommu_table *tbl = container->tbl;
>> +		enum dma_data_direction direction = DMA_NONE;
>> +
>> +		if (WARN_ON(!tbl))
>> +			return -ENXIO;
>> +
>> +		minsz = offsetofend(vfio_iommu_spapr_tce_dma_map, size);
>> +
>> +		if (copy_from_user(&param, (void __user *)arg, minsz))
>> +			return -EFAULT;
>> +
>> +		if (param.argsz < minsz)
>> +			return -EINVAL;
>> +
>> +		if ((param.flags & VFIO_DMA_MAP_FLAG_READ) &&
>> +				(param.flags & VFIO_DMA_MAP_FLAG_WRITE)) {
>> +			direction = DMA_BIDIRECTIONAL;
>> +		} else if (param.flags & VFIO_DMA_MAP_FLAG_READ) {
>> +			direction = DMA_TO_DEVICE;
>> +		} else if (param.flags & VFIO_DMA_MAP_FLAG_WRITE) {
>> +			direction = DMA_FROM_DEVICE;
>> +		}
>> +
>> +		param.size += param.iova & ~IOMMU_PAGE_MASK;
>> +		param.size = _ALIGN_UP(param.size, IOMMU_PAGE_SIZE);
>
> On x86 we force iova, vaddr, and size to all be aligned to the smallest
> page granularity of the iommu and return -EINVAL if it doesn't fit.
> What does it imply to the user if their requests are always silently
> aligned to work here?
> Won't this interface happily map overlapping entries with no indication
> to the user that the previous mapping is no longer valid?
> Maybe another reason why a combined unmap/map makes me nervous: we have
> to assume the user knows what they're doing.


I am used to guests which do know what they are doing, so I am pretty
calm :) But OK, I'll move the alignment to QEMU; it makes sense.
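
The userspace side of that would be something like the following
(hypothetical QEMU-style sketch; it assumes a fixed 4K IOMMU page size
and that iova and vaddr share the same offset within the page):

	offset = iova & (IOMMU_PAGE_SIZE - 1);
	map.iova  = iova - offset;
	map.vaddr = vaddr - offset;
	map.size  = (size + offset + IOMMU_PAGE_SIZE - 1) &
			~(IOMMU_PAGE_SIZE - 1);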


>> +
>> +		return iommu_put_tces(tbl, param.iova >> IOMMU_PAGE_SHIFT,
>> +				param.vaddr & IOMMU_PAGE_MASK, direction,
>> +				param.size >> IOMMU_PAGE_SHIFT);
>> +	}
>> +	case VFIO_IOMMU_UNMAP_DMA: {
>> +		vfio_iommu_spapr_tce_dma_unmap param;
>> +		struct iommu_table *tbl = container->tbl;
>> +
>> +		if (WARN_ON(!tbl))
>> +			return -ENXIO;
>> +
>> +		minsz = offsetofend(vfio_iommu_spapr_tce_dma_unmap, size);
>> +
>> +		if (copy_from_user(&param, (void __user *)arg, minsz))
>> +			return -EFAULT;
>> +
>> +		if (param.argsz < minsz)
>> +			return -EINVAL;
>> +
>> +		param.size += param.iova & ~IOMMU_PAGE_MASK;
>> +		param.size = _ALIGN_UP(param.size, IOMMU_PAGE_SIZE);
>> +
>> +		return iommu_put_tces(tbl, param.iova >> IOMMU_PAGE_SHIFT,
>> +				0, DMA_NONE, param.size >> IOMMU_PAGE_SHIFT);
>> +	}
>> +	default:
>> +		printk(KERN_WARNING "tce_vfio: unexpected cmd %x\n", cmd);
>
> pr_warn
>
>> +	}
>> +
>> +	return -ENOTTY;
>> +}
>> +
>> +static int tce_iommu_attach_group(void *iommu_data,
>> +		struct iommu_group *iommu_group)
>> +{
>> +	struct tce_container *container = iommu_data;
>> +	struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
>> +
>> +	BUG_ON(!tbl);
>> +	mutex_lock(&container->lock);
>> +	pr_debug("tce_vfio: Attaching group #%u to iommu %p\n",
>> +			iommu_group_id(iommu_group), iommu_group);
>> +	if (container->tbl) {
>> +		printk(KERN_WARNING "tce_vfio: Only one group per IOMMU container is allowed, existing id=%d, attaching id=%d\n",
>
> pr_warn
>
>> +				iommu_group_id(container->tbl->it_group),
>> +				iommu_group_id(iommu_group));
>> +		mutex_unlock(&container->lock);
>> +		return -EBUSY;
>> +	}
>> +
>> +	container->tbl = tbl;
>
> Would it be too much paranoia to clear all the TCEs here as you do below
> on detach?

Guess so. I do unmap on detach(), and the guest calls put_tce(0) (i.e.
unmaps) across the whole DMA window at boot time.
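
(For reference, the guest-side clear is just the PAPR hcall issued once
per IOMMU page of the window; a rough sketch, with liobn and window_size
being illustrative names:)

	for (ioba = 0; ioba < window_size; ioba += IOMMU_PAGE_SIZE)
		plpar_hcall_norets(H_PUT_TCE, liobn, ioba, 0 /* clear */);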


> I.e., is there any risk that there's leftover programming?
> x86 allocates a new domain on open of the iommu, so we always start out
> clean.


>> +	mutex_unlock(&container->lock);
>> +
>> +	return 0;
>> +}
>> +
>> +static void tce_iommu_detach_group(void *iommu_data,
>> +		struct iommu_group *iommu_group)
>> +{
>> +	struct tce_container *container = iommu_data;
>> +	struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
>> +
>> +	BUG_ON(!tbl);
>> +	mutex_lock(&container->lock);
>> +	if (tbl != container->tbl) {
>> +		printk(KERN_WARNING "tce_vfio: detaching group #%u, expected group is #%u\n",
>
> pr_warn
>
>> +				iommu_group_id(iommu_group),
>> +				iommu_group_id(tbl->it_group));
>> +	} else {
>> +
>> +		pr_debug("tce_vfio: detaching group #%u from iommu %p\n",
>> +				iommu_group_id(iommu_group), iommu_group);
>> +
>> +		iommu_put_tces(tbl, tbl->it_offset, 0, DMA_NONE, tbl->it_size);
>
> So this cleans out any mappings when vfio is closed, good.
>
>> +		container->tbl = NULL;
>> +	}
>> +	mutex_unlock(&container->lock);
>> +}
>> +
>> +const struct vfio_iommu_driver_ops tce_iommu_driver_ops = {
>> +	.name		= "iommu-vfio-powerpc",
>> +	.owner		= THIS_MODULE,
>> +	.open		= tce_iommu_open,
>> +	.release	= tce_iommu_release,
>> +	.ioctl		= tce_iommu_ioctl,
>> +	.attach_group	= tce_iommu_attach_group,
>> +	.detach_group	= tce_iommu_detach_group,
>> +};
>> +
>> +static int __init tce_iommu_init(void)
>> +{
>> +	return vfio_register_iommu_driver(&tce_iommu_driver_ops);
>> +}
>> +
>> +static void __exit tce_iommu_cleanup(void)
>> +{
>> +	vfio_unregister_iommu_driver(&tce_iommu_driver_ops);
>> +}
>> +
>> +module_init(tce_iommu_init);
>> +module_exit(tce_iommu_cleanup);
>> +
>> +MODULE_VERSION(DRIVER_VERSION);
>> +MODULE_LICENSE("GPL v2");
>> +MODULE_AUTHOR(DRIVER_AUTHOR);
>> +MODULE_DESCRIPTION(DRIVER_DESC);
>> +
>> diff --git a/include/linux/vfio.h b/include/linux/vfio.h
>> index 0a4f180..3ecd65c 100644
>> --- a/include/linux/vfio.h
>> +++ b/include/linux/vfio.h
>> @@ -99,6 +99,7 @@ extern void vfio_unregister_iommu_driver(
>>   /* Extensions */
>>
>>   #define VFIO_TYPE1_IOMMU		1
>> +#define VFIO_SPAPR_TCE_IOMMU		2
>>
>>   /*
>>    * The IOCTL interface is designed for extensibility by embedding the
>> @@ -442,4 +443,23 @@ struct vfio_iommu_type1_dma_unmap {
>>
>>   #define VFIO_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14)
>>
>> +/* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */
>> +
>> +struct vfio_iommu_spapr_tce_info {
>> +	__u32 argsz;
>> +	__u32 flags;
>> +	__u32 dma32_window_start;
>> +	__u32 dma32_window_size;
>> +	__u64 dma64_window_start;
>> +	__u64 dma64_window_size;
>> +};
>
> Is there anything we can document about this?

I'll put some.

> It should probably list that size is in bytes.  Is there any need to communicate the IOMMU page
> size here?

It is always 4K. I'll put it in the comments.
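
Roughly what the documented struct would look like (a sketch of the
comments to add; the field semantics follow the code above, sizes are in
bytes):

	struct vfio_iommu_spapr_tce_info {
		__u32 argsz;
		__u32 flags;			/* reserved for future use, 0 now */
		__u32 dma32_window_start;	/* 32 bit window start (bytes) */
		__u32 dma32_window_size;	/* 32 bit window size (bytes), 4K IOMMU pages */
		__u64 dma64_window_start;	/* unused, returned as 0 */
		__u64 dma64_window_size;	/* unused, returned as 0 */
	};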

>> +
>> +#define VFIO_IOMMU_SPAPR_TCE_GET_INFO	_IO(VFIO_TYPE, VFIO_BASE + 12)
>> +
>> +/* Reuse type1 map/unmap structs as they are the same at the moment */
>> +typedef struct vfio_iommu_type1_dma_map vfio_iommu_spapr_tce_dma_map;
>> +typedef struct vfio_iommu_type1_dma_unmap vfio_iommu_spapr_tce_dma_unmap;
>> +
>> +/* ***************************************************************** */
>> +
>>   #endif /* VFIO_H */
>
> Thanks,
>
> Alex
>
>
>


-- 
Alexey

^ permalink raw reply	[flat|nested] 122+ messages in thread

* Re: [PATCH 1/2] vfio powerpc: implemented IOMMU driver for VFIO
@ 2012-11-27  4:06             ` Alexey Kardashevskiy
  0 siblings, 0 replies; 122+ messages in thread
From: Alexey Kardashevskiy @ 2012-11-27  4:06 UTC (permalink / raw)
  To: Alex Williamson
  Cc: kvm, linux-kernel, Paul Mackerras, linuxppc-dev, David Gibson

On 27/11/12 05:20, Alex Williamson wrote:
> On Fri, 2012-11-23 at 20:03 +1100, Alexey Kardashevskiy wrote:
>> VFIO implements platform independent stuff such as
>> a PCI driver, BAR access (via read/write on a file descriptor
>> or direct mapping when possible) and IRQ signaling.
>>
>> The platform dependent part includes IOMMU initialization
>> and handling. This patch implements an IOMMU driver for VFIO
>> which does mapping/unmapping pages for the guest IO and
>> provides information about DMA window (required by a POWERPC
>> guest).
>>
>> The counterpart in QEMU is required to support this functionality.
>>
>> Cc: David Gibson <david@gibson.dropbear.id.au>
>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>> ---
>>   drivers/vfio/Kconfig                |    6 +
>>   drivers/vfio/Makefile               |    1 +
>>   drivers/vfio/vfio_iommu_spapr_tce.c |  247 +++++++++++++++++++++++++++++++++++
>>   include/linux/vfio.h                |   20 +++
>>   4 files changed, 274 insertions(+)
>>   create mode 100644 drivers/vfio/vfio_iommu_spapr_tce.c
>>
>> diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
>> index 7cd5dec..b464687 100644
>> --- a/drivers/vfio/Kconfig
>> +++ b/drivers/vfio/Kconfig
>> @@ -3,10 +3,16 @@ config VFIO_IOMMU_TYPE1
>>   	depends on VFIO
>>   	default n
>>
>> +config VFIO_IOMMU_SPAPR_TCE
>> +	tristate
>> +	depends on VFIO && SPAPR_TCE_IOMMU
>> +	default n
>> +
>>   menuconfig VFIO
>>   	tristate "VFIO Non-Privileged userspace driver framework"
>>   	depends on IOMMU_API
>>   	select VFIO_IOMMU_TYPE1 if X86
>> +	select VFIO_IOMMU_SPAPR_TCE if PPC_POWERNV
>>   	help
>>   	  VFIO provides a framework for secure userspace device drivers.
>>   	  See Documentation/vfio.txt for more details.
>> diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
>> index 2398d4a..72bfabc 100644
>> --- a/drivers/vfio/Makefile
>> +++ b/drivers/vfio/Makefile
>> @@ -1,3 +1,4 @@
>>   obj-$(CONFIG_VFIO) += vfio.o
>>   obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o
>> +obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o
>>   obj-$(CONFIG_VFIO_PCI) += pci/
>> diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
>> new file mode 100644
>> index 0000000..46a6298
>> --- /dev/null
>> +++ b/drivers/vfio/vfio_iommu_spapr_tce.c
>> @@ -0,0 +1,247 @@
>> +/*
>> + * VFIO: IOMMU DMA mapping support for TCE on POWER
>> + *
>> + * Copyright (C) 2012 IBM Corp.  All rights reserved.
>> + *     Author: Alexey Kardashevskiy <aik@ozlabs.ru>
>> + *
>> + * This program is free software; you can redistribute it and/or modify
>> + * it under the terms of the GNU General Public License version 2 as
>> + * published by the Free Software Foundation.
>> + *
>> + * Derived from original vfio_iommu_type1.c:
>> + * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
>> + *     Author: Alex Williamson <alex.williamson@redhat.com>
>> + */
>> +
>> +#include <linux/module.h>
>> +#include <linux/pci.h>
>> +#include <linux/slab.h>
>> +#include <linux/uaccess.h>
>> +#include <linux/err.h>
>> +#include <linux/vfio.h>
>> +#include <asm/iommu.h>
>> +
>> +#define DRIVER_VERSION  "0.1"
>> +#define DRIVER_AUTHOR   "aik@ozlabs.ru"
>> +#define DRIVER_DESC     "VFIO IOMMU SPAPR TCE"
>> +
>> +static void tce_iommu_detach_group(void *iommu_data,
>> +		struct iommu_group *iommu_group);
>> +
>> +/*
>> + * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
>> + */
>> +
>> +/*
>> + * The container descriptor supports only a single group per container.
>> + * Required by the API as the container is not supplied with the IOMMU group
>> + * at the moment of initialization.
>> + */
>> +struct tce_container {
>> +	struct mutex lock;
>> +	struct iommu_table *tbl;
>> +};
>> +
>> +static void *tce_iommu_open(unsigned long arg)
>> +{
>> +	struct tce_container *container;
>> +
>> +	if (arg != VFIO_SPAPR_TCE_IOMMU) {
>> +		printk(KERN_ERR "tce_vfio: Wrong IOMMU type\n");
>> +		return ERR_PTR(-EINVAL);
>> +	}
>> +
>> +	container = kzalloc(sizeof(*container), GFP_KERNEL);
>> +	if (!container)
>> +		return ERR_PTR(-ENOMEM);
>> +
>> +	mutex_init(&container->lock);
>> +
>> +	return container;
>> +}
>> +
>> +static void tce_iommu_release(void *iommu_data)
>> +{
>> +	struct tce_container *container = iommu_data;
>> +
>> +	WARN_ON(container->tbl && !container->tbl->it_group);
>
> I think your patch ordering is backwards here.  it_group isn't added
> until 2/2.  I'd really like to see the arch/powerpc code approved and
> merged by the powerpc maintainer before we add the code that makes use
> of it into vfio.  Otherwise we just get lots of churn if interfaces
> change or they disapprove of it altogether.


Makes sense, thanks.


>> +	if (container->tbl && container->tbl->it_group)
>> +		tce_iommu_detach_group(iommu_data, container->tbl->it_group);
>> +
>> +	mutex_destroy(&container->lock);
>> +
>> +	kfree(container);
>> +}
>> +
>> +static long tce_iommu_ioctl(void *iommu_data,
>> +				 unsigned int cmd, unsigned long arg)
>> +{
>> +	struct tce_container *container = iommu_data;
>> +	unsigned long minsz;
>> +
>> +	switch (cmd) {
>> +	case VFIO_CHECK_EXTENSION: {
>> +		return (arg == VFIO_SPAPR_TCE_IOMMU) ? 1 : 0;
>> +	}
>> +	case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
>> +		struct vfio_iommu_spapr_tce_info info;
>> +		struct iommu_table *tbl = container->tbl;
>> +
>> +		if (WARN_ON(!tbl))
>> +			return -ENXIO;
>> +
>> +		minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
>> +				dma64_window_size);
>> +
>> +		if (copy_from_user(&info, (void __user *)arg, minsz))
>> +			return -EFAULT;
>> +
>> +		if (info.argsz < minsz)
>> +			return -EINVAL;
>> +
>> +		info.dma32_window_start = tbl->it_offset << IOMMU_PAGE_SHIFT;
>> +		info.dma32_window_size = tbl->it_size << IOMMU_PAGE_SHIFT;
>> +		info.dma64_window_start = 0;
>> +		info.dma64_window_size = 0;
>> +		info.flags = 0;
>> +
>> +		if (copy_to_user((void __user *)arg, &info, minsz))
>> +			return -EFAULT;
>> +
>> +		return 0;
>> +	}
>> +	case VFIO_IOMMU_MAP_DMA: {
>> +		vfio_iommu_spapr_tce_dma_map param;
>> +		struct iommu_table *tbl = container->tbl;
>> +		enum dma_data_direction direction = DMA_NONE;
>> +
>> +		if (WARN_ON(!tbl))
>> +			return -ENXIO;
>> +
>> +		minsz = offsetofend(vfio_iommu_spapr_tce_dma_map, size);
>> +
>> +		if (copy_from_user(&param, (void __user *)arg, minsz))
>> +			return -EFAULT;
>> +
>> +		if (param.argsz < minsz)
>> +			return -EINVAL;
>> +
>> +		if ((param.flags & VFIO_DMA_MAP_FLAG_READ) &&
>> +				(param.flags & VFIO_DMA_MAP_FLAG_WRITE)) {
>> +			direction = DMA_BIDIRECTIONAL;
>> +		} else if (param.flags & VFIO_DMA_MAP_FLAG_READ) {
>> +			direction = DMA_TO_DEVICE;
>> +		} else if (param.flags & VFIO_DMA_MAP_FLAG_WRITE) {
>> +			direction = DMA_FROM_DEVICE;
>> +		}
>> +
>> +		param.size += param.iova & ~IOMMU_PAGE_MASK;
>> +		param.size = _ALIGN_UP(param.size, IOMMU_PAGE_SIZE);
>
> On x86 we force iova, vaddr, and size to all be aligned to the smallest
> page granularity of the iommu and return -EINVAL if it doesn't fit.
> What does it imply to the user if they're always aligned to work here?
> Won't this interface happily map overlapping entries with no indication
> to the user that the previous mapping is no longer valid?
> Maybe another reason why a combined unmap/map makes me nervous, we have
> to assume the user knows what they're doing.


I got used to guests which do know what they are doing so I am pretty calm :)
but ok, I'll move alignment to the QEMU, it makes sense.


>> +
>> +		return iommu_put_tces(tbl, param.iova >> IOMMU_PAGE_SHIFT,
>> +				param.vaddr & IOMMU_PAGE_MASK, direction,
>> +				param.size >> IOMMU_PAGE_SHIFT);
>> +	}
>> +	case VFIO_IOMMU_UNMAP_DMA: {
>> +		vfio_iommu_spapr_tce_dma_unmap param;
>> +		struct iommu_table *tbl = container->tbl;
>> +
>> +		if (WARN_ON(!tbl))
>> +			return -ENXIO;
>> +
>> +		minsz = offsetofend(vfio_iommu_spapr_tce_dma_unmap, size);
>> +
>> +		if (copy_from_user(&param, (void __user *)arg, minsz))
>> +			return -EFAULT;
>> +
>> +		if (param.argsz < minsz)
>> +			return -EINVAL;
>> +
>> +		param.size += param.iova & ~IOMMU_PAGE_MASK;
>> +		param.size = _ALIGN_UP(param.size, IOMMU_PAGE_SIZE);
>> +
>> +		return iommu_put_tces(tbl, param.iova >> IOMMU_PAGE_SHIFT,
>> +				0, DMA_NONE, param.size >> IOMMU_PAGE_SHIFT);
>> +	}
>> +	default:
>> +		printk(KERN_WARNING "tce_vfio: unexpected cmd %x\n", cmd);
>
> pr_warn
>
>> +	}
>> +
>> +	return -ENOTTY;
>> +}
>> +
>> +static int tce_iommu_attach_group(void *iommu_data,
>> +		struct iommu_group *iommu_group)
>> +{
>> +	struct tce_container *container = iommu_data;
>> +	struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
>> +
>> +	BUG_ON(!tbl);
>> +	mutex_lock(&container->lock);
>> +	pr_debug("tce_vfio: Attaching group #%u to iommu %p\n",
>> +			iommu_group_id(iommu_group), iommu_group);
>> +	if (container->tbl) {
>> +		printk(KERN_WARNING "tce_vfio: Only one group per IOMMU container is allowed, existing id=%d, attaching id=%d\n",
>
> pr_warn
>
>> +				iommu_group_id(container->tbl->it_group),
>> +				iommu_group_id(iommu_group));
>> +		mutex_unlock(&container->lock);
>> +		return -EBUSY;
>> +	}
>> +
>> +	container->tbl = tbl;
>
> Would it be too much paranoia to clear all the tce here as you do below
> on detach?

Guess so. I do the unmap on detach(), and the guest calls put_tce(0) on
(i.e. unmaps) the whole DMA window at boot time.


> ie. is there any risk that there's leftover programming?
> x86 allocates a new domain on open of the iommu, so we always start out
> clean.
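
If we decide to be paranoid here, a minimal sketch could reuse the
detach-path cleanup before publishing the table (assuming that clearing
an already clean table is cheap enough to do on every attach):

	/* hypothetical: drop any leftover programming before first use */
	iommu_put_tces(tbl, tbl->it_offset, 0, DMA_NONE, tbl->it_size);
	container->tbl = tbl;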


>> +	mutex_unlock(&container->lock);
>> +
>> +	return 0;
>> +}
>> +
>> +static void tce_iommu_detach_group(void *iommu_data,
>> +		struct iommu_group *iommu_group)
>> +{
>> +	struct tce_container *container = iommu_data;
>> +	struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
>> +
>> +	BUG_ON(!tbl);
>> +	mutex_lock(&container->lock);
>> +	if (tbl != container->tbl) {
>> +		printk(KERN_WARNING "tce_vfio: detaching group #%u, expected group is #%u\n",
>
> pr_warn
>
>> +				iommu_group_id(iommu_group),
>> +				iommu_group_id(tbl->it_group));
>> +	} else {
>> +
>> +		pr_debug("tce_vfio: detaching group #%u from iommu %p\n",
>> +				iommu_group_id(iommu_group), iommu_group);
>> +
>> +		iommu_put_tces(tbl, tbl->it_offset, 0, DMA_NONE, tbl->it_size);
>
> So this cleans out any mappings when vfio is closed, good.
>
>> +		container->tbl = NULL;
>> +	}
>> +	mutex_unlock(&container->lock);
>> +}
>> +
>> +const struct vfio_iommu_driver_ops tce_iommu_driver_ops = {
>> +	.name		= "iommu-vfio-powerpc",
>> +	.owner		= THIS_MODULE,
>> +	.open		= tce_iommu_open,
>> +	.release	= tce_iommu_release,
>> +	.ioctl		= tce_iommu_ioctl,
>> +	.attach_group	= tce_iommu_attach_group,
>> +	.detach_group	= tce_iommu_detach_group,
>> +};
>> +
>> +static int __init tce_iommu_init(void)
>> +{
>> +	return vfio_register_iommu_driver(&tce_iommu_driver_ops);
>> +}
>> +
>> +static void __exit tce_iommu_cleanup(void)
>> +{
>> +	vfio_unregister_iommu_driver(&tce_iommu_driver_ops);
>> +}
>> +
>> +module_init(tce_iommu_init);
>> +module_exit(tce_iommu_cleanup);
>> +
>> +MODULE_VERSION(DRIVER_VERSION);
>> +MODULE_LICENSE("GPL v2");
>> +MODULE_AUTHOR(DRIVER_AUTHOR);
>> +MODULE_DESCRIPTION(DRIVER_DESC);
>> +
>> diff --git a/include/linux/vfio.h b/include/linux/vfio.h
>> index 0a4f180..3ecd65c 100644
>> --- a/include/linux/vfio.h
>> +++ b/include/linux/vfio.h
>> @@ -99,6 +99,7 @@ extern void vfio_unregister_iommu_driver(
>>   /* Extensions */
>>
>>   #define VFIO_TYPE1_IOMMU		1
>> +#define VFIO_SPAPR_TCE_IOMMU		2
>>
>>   /*
>>    * The IOCTL interface is designed for extensibility by embedding the
>> @@ -442,4 +443,23 @@ struct vfio_iommu_type1_dma_unmap {
>>
>>   #define VFIO_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14)
>>
>> +/* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */
>> +
>> +struct vfio_iommu_spapr_tce_info {
>> +	__u32 argsz;
>> +	__u32 flags;
>> +	__u32 dma32_window_start;
>> +	__u32 dma32_window_size;
>> +	__u64 dma64_window_start;
>> +	__u64 dma64_window_size;
>> +};
>
> Is there anything we can document about this?

I'll add some.

> It should probably list that size is in bytes.  Is there any need to communicate the IOMMU page
> size here?

It is always 4K. I'll put that in the comments.
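
For example, a sketch of the commented version (same fields; all sizes
in bytes, IOMMU page size fixed at 4K):

struct vfio_iommu_spapr_tce_info {
	__u32 argsz;
	__u32 flags;			/* reserved for future use, 0 now */
	__u32 dma32_window_start;	/* 32 bit window start (bytes) */
	__u32 dma32_window_size;	/* 32 bit window size (bytes) */
	__u64 dma64_window_start;	/* no 64 bit window yet, returns 0 */
	__u64 dma64_window_size;	/* no 64 bit window yet, returns 0 */
};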

>> +
>> +#define VFIO_IOMMU_SPAPR_TCE_GET_INFO	_IO(VFIO_TYPE, VFIO_BASE + 12)
>> +
>> +/* Reuse type1 map/unmap structs as they are the same at the moment */
>> +typedef struct vfio_iommu_type1_dma_map vfio_iommu_spapr_tce_dma_map;
>> +typedef struct vfio_iommu_type1_dma_unmap vfio_iommu_spapr_tce_dma_unmap;
>> +
>> +/* ***************************************************************** */
>> +
>>   #endif /* VFIO_H */
>
> Thanks,
>
> Alex
>
>
>


-- 
Alexey

^ permalink raw reply	[flat|nested] 122+ messages in thread

* Re: [PATCH] vfio powerpc: enabled and supported on powernv platform
  2012-11-27  3:28               ` Alexey Kardashevskiy
@ 2012-11-27  4:23                 ` Alex Williamson
  -1 siblings, 0 replies; 122+ messages in thread
From: Alex Williamson @ 2012-11-27  4:23 UTC (permalink / raw)
  To: Alexey Kardashevskiy
  Cc: Sethi Varun-B16395, Benjamin Herrenschmidt, Paul Mackerras,
	linuxppc-dev, linux-kernel, kvm, David Gibson

On Tue, 2012-11-27 at 14:28 +1100, Alexey Kardashevskiy wrote:
> On 27/11/12 05:04, Alex Williamson wrote:
> > On Mon, 2012-11-26 at 08:18 -0700, Alex Williamson wrote:
> >> On Fri, 2012-11-23 at 13:02 +1100, Alexey Kardashevskiy wrote:
> >>> On 22/11/12 22:56, Sethi Varun-B16395 wrote:
> >>>>
> >>>>
> >>>>> -----Original Message-----
> >>>>> From: linux-kernel-owner@vger.kernel.org [mailto:linux-kernel-
> >>>>> owner@vger.kernel.org] On Behalf Of Alex Williamson
> >>>>> Sent: Tuesday, November 20, 2012 11:50 PM
> >>>>> To: Alexey Kardashevskiy
> >>>>> Cc: Benjamin Herrenschmidt; Paul Mackerras; linuxppc-
> >>>>> dev@lists.ozlabs.org; linux-kernel@vger.kernel.org; kvm@vger.kernel.org;
> >>>>> David Gibson
> >>>>> Subject: Re: [PATCH] vfio powerpc: enabled and supported on powernv
> >>>>> platform
> >>>>>
> >>>>> On Tue, 2012-11-20 at 11:48 +1100, Alexey Kardashevskiy wrote:
> >>>>>> VFIO implements platform independent stuff such as a PCI driver, BAR
> >>>>>> access (via read/write on a file descriptor or direct mapping when
> >>>>>> possible) and IRQ signaling.
> >>>>>> The platform dependent part includes IOMMU initialization and
> >>>>>> handling.
> >>>>>>
> >>>>>> This patch initializes IOMMU groups based on the IOMMU configuration
> >>>>>> discovered during the PCI scan, only POWERNV platform is supported at
> >>>>>> the moment.
> >>>>>>
> >>>>>> Also the patch implements an VFIO-IOMMU driver which manages DMA
> >>>>>> mapping/unmapping requests coming from the client (now QEMU). It also
> >>>>>> returns a DMA window information to let the guest initialize the
> >>>>>> device tree for a guest OS properly. Although this driver has been
> >>>>>> tested only on POWERNV, it should work on any platform supporting TCE
> >>>>>> tables.
> >>>>>>
> >>>>>> To enable VFIO on POWER, enable SPAPR_TCE_IOMMU config option.
> >>>>>>
> >>>>>> Cc: David Gibson <david@gibson.dropbear.id.au>
> >>>>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> >>>>>> ---
> >>>>>>    arch/powerpc/include/asm/iommu.h     |    6 +
> >>>>>>    arch/powerpc/kernel/iommu.c          |  140 +++++++++++++++++++
> >>>>>>    arch/powerpc/platforms/powernv/pci.c |  135 +++++++++++++++++++
> >>>>>>    drivers/iommu/Kconfig                |    8 ++
> >>>>>>    drivers/vfio/Kconfig                 |    6 +
> >>>>>>    drivers/vfio/Makefile                |    1 +
> >>>>>>    drivers/vfio/vfio_iommu_spapr_tce.c  |  247
> >>>>> ++++++++++++++++++++++++++++++++++
> >>>>>>    include/linux/vfio.h                 |   20 +++
> >>>>>>    8 files changed, 563 insertions(+)
> >>>>>>    create mode 100644 drivers/vfio/vfio_iommu_spapr_tce.c
> >>>>>>
> >>>>>> diff --git a/arch/powerpc/include/asm/iommu.h
> >>>>>> b/arch/powerpc/include/asm/iommu.h
> >>>>>> index cbfe678..5ba66cb 100644
> >>>>>> --- a/arch/powerpc/include/asm/iommu.h
> >>>>>> +++ b/arch/powerpc/include/asm/iommu.h
> >>>>>> @@ -64,30 +64,33 @@ struct iommu_pool {  }
> >>>>>> ____cacheline_aligned_in_smp;
> >>>>>>
> >>>>>>    struct iommu_table {
> >>>>>>    	unsigned long  it_busno;     /* Bus number this table belongs to */
> >>>>>>    	unsigned long  it_size;      /* Size of iommu table in entries */
> >>>>>>    	unsigned long  it_offset;    /* Offset into global table */
> >>>>>>    	unsigned long  it_base;      /* mapped address of tce table */
> >>>>>>    	unsigned long  it_index;     /* which iommu table this is */
> >>>>>>    	unsigned long  it_type;      /* type: PCI or Virtual Bus */
> >>>>>>    	unsigned long  it_blocksize; /* Entries in each block (cacheline)
> >>>>> */
> >>>>>>    	unsigned long  poolsize;
> >>>>>>    	unsigned long  nr_pools;
> >>>>>>    	struct iommu_pool large_pool;
> >>>>>>    	struct iommu_pool pools[IOMMU_NR_POOLS];
> >>>>>>    	unsigned long *it_map;       /* A simple allocation bitmap for now
> >>>>> */
> >>>>>> +#ifdef CONFIG_IOMMU_API
> >>>>>> +	struct iommu_group *it_group;
> >>>>>> +#endif
> >>>>>>    };
> >>>>>>
> >>>>>>    struct scatterlist;
> >>>>>>
> >>>>>>    static inline void set_iommu_table_base(struct device *dev, void
> >>>>>> *base)  {
> >>>>>>    	dev->archdata.dma_data.iommu_table_base = base;  }
> >>>>>>
> >>>>>>    static inline void *get_iommu_table_base(struct device *dev)  {
> >>>>>>    	return dev->archdata.dma_data.iommu_table_base;
> >>>>>>    }
> >>>>>>
> >>>>>>    /* Frees table for an individual device node */ @@ -135,17 +138,20 @@
> >>>>>> static inline void pci_iommu_init(void) { }  extern void
> >>>>>> alloc_dart_table(void);  #if defined(CONFIG_PPC64) &&
> >>>>>> defined(CONFIG_PM)  static inline void iommu_save(void)  {
> >>>>>>    	if (ppc_md.iommu_save)
> >>>>>>    		ppc_md.iommu_save();
> >>>>>>    }
> >>>>>>
> >>>>>>    static inline void iommu_restore(void)  {
> >>>>>>    	if (ppc_md.iommu_restore)
> >>>>>>    		ppc_md.iommu_restore();
> >>>>>>    }
> >>>>>>    #endif
> >>>>>>
> >>>>>> +extern long iommu_put_tces(struct iommu_table *tbl, unsigned long
> >>>>> entry, uint64_t tce,
> >>>>>> +		enum dma_data_direction direction, unsigned long pages);
> >>>>>> +
> >>>>>>    #endif /* __KERNEL__ */
> >>>>>>    #endif /* _ASM_IOMMU_H */
> >>>>>> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
> >>>>>> index ff5a6ce..94f614b 100644
> >>>>>> --- a/arch/powerpc/kernel/iommu.c
> >>>>>> +++ b/arch/powerpc/kernel/iommu.c
> >>>>>> @@ -32,30 +32,31 @@
> >>>>>>    #include <linux/dma-mapping.h>
> >>>>>>    #include <linux/bitmap.h>
> >>>>>>    #include <linux/iommu-helper.h>
> >>>>>>    #include <linux/crash_dump.h>
> >>>>>>    #include <linux/hash.h>
> >>>>>>    #include <linux/fault-inject.h>
> >>>>>>    #include <linux/pci.h>
> >>>>>>    #include <asm/io.h>
> >>>>>>    #include <asm/prom.h>
> >>>>>>    #include <asm/iommu.h>
> >>>>>>    #include <asm/pci-bridge.h>
> >>>>>>    #include <asm/machdep.h>
> >>>>>>    #include <asm/kdump.h>
> >>>>>>    #include <asm/fadump.h>
> >>>>>>    #include <asm/vio.h>
> >>>>>> +#include <asm/tce.h>
> >>>>>>
> >>>>>>    #define DBG(...)
> >>>>>>
> >>>>>>    static int novmerge;
> >>>>>>
> >>>>>>    static void __iommu_free(struct iommu_table *, dma_addr_t, unsigned
> >>>>>> int);
> >>>>>>
> >>>>>>    static int __init setup_iommu(char *str)  {
> >>>>>>    	if (!strcmp(str, "novmerge"))
> >>>>>>    		novmerge = 1;
> >>>>>>    	else if (!strcmp(str, "vmerge"))
> >>>>>>    		novmerge = 0;
> >>>>>>    	return 1;
> >>>>>>    }
> >>>>>> @@ -844,15 +845,154 @@ void *iommu_alloc_coherent(struct device *dev,
> >>>>>> struct iommu_table *tbl,  }
> >>>>>>
> >>>>>>    void iommu_free_coherent(struct iommu_table *tbl, size_t size,
> >>>>>>    			 void *vaddr, dma_addr_t dma_handle)  {
> >>>>>>    	if (tbl) {
> >>>>>>    		unsigned int nio_pages;
> >>>>>>
> >>>>>>    		size = PAGE_ALIGN(size);
> >>>>>>    		nio_pages = size >> IOMMU_PAGE_SHIFT;
> >>>>>>    		iommu_free(tbl, dma_handle, nio_pages);
> >>>>>>    		size = PAGE_ALIGN(size);
> >>>>>>    		free_pages((unsigned long)vaddr, get_order(size));
> >>>>>>    	}
> >>>>>>    }
> >>>>>> +
> >>>>>> +#ifdef CONFIG_IOMMU_API
> >>>>>> +/*
> >>>>>> + * SPAPR TCE API
> >>>>>> + */
> >>>>>> +static struct page *free_tce(struct iommu_table *tbl, unsigned long
> >>>>>> +entry) {
> >>>>>> +	struct page *page = NULL;
> >>>>>
> >>>>> NULL initialization doesn't appear to be necessary
> >>>>>
> >>>>>> +	unsigned long oldtce;
> >>>>>> +
> >>>>>> +	oldtce = ppc_md.tce_get(tbl, entry);
> >>>>>> +
> >>>>>> +	if (!(oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)))
> >>>>>> +		return NULL;
> >>>>>> +
> >>>>>> +	page = pfn_to_page(oldtce >> PAGE_SHIFT);
> >>>>>> +
> >>>>>> +	WARN_ON(!page);
> >>>>>> +	if (page && (oldtce & TCE_PCI_WRITE))
> >>>>>> +		SetPageDirty(page);
> >>>>>> +	ppc_md.tce_free(tbl, entry, 1);
> >>>>>> +
> >>>>>> +	return page;
> >>>>>> +}
> >>>>>> +
> >>>>>> +static int put_tce(struct iommu_table *tbl, unsigned long entry,
> >>>>>> +		uint64_t tce, enum dma_data_direction direction) {
> >>>>>> +	int ret;
> >>>>>> +	struct page *page = NULL;
> >>>>>> +	unsigned long kva, offset;
> >>>>>> +
> >>>>>> +	/* Map new TCE */
> >>>>>> +	offset = (tce & IOMMU_PAGE_MASK) - (tce & PAGE_MASK);
> >>>>>> +	ret = get_user_pages_fast(tce & PAGE_MASK, 1,
> >>>>>> +			direction != DMA_TO_DEVICE, &page);
> >>>>>> +	if (ret < 1) {
> >>>>>> +		printk(KERN_ERR "tce_vfio: get_user_pages_fast failed
> >>>>> tce=%llx ioba=%lx ret=%d\n",
> >>>>>> +				tce, entry << IOMMU_PAGE_SHIFT, ret);
> >>>>>> +		if (!ret)
> >>>>>> +			ret = -EFAULT;
> >>>>>
> >>>>> Missing return ret?  Otherwise we've got some bogus uses of page below
> >>>>> and we're setting ret for no reason here.
> >>>>>
> >>>>>> +	}
> >>>>>> +
> >>>>>> +	kva = (unsigned long) page_address(page);
> >>>>>> +	kva += offset;
> >>>>>> +
> >>>>>> +	/* tce_build receives a virtual address */
> >>>>>> +	entry += tbl->it_offset; /* Offset into real TCE table */
> >>>>>> +	ret = ppc_md.tce_build(tbl, entry, 1, kva, direction, NULL);
> >>>>>> +
> >>>>>> +	/* tce_build() only returns non-zero for transient errors */
> >>>>>> +	if (unlikely(ret)) {
> >>>>>> +		printk(KERN_ERR "tce_vfio: tce_put failed on tce=%llx
> >>>>> ioba=%lx kva=%lx ret=%d\n",
> >>>>>> +				tce, entry << IOMMU_PAGE_SHIFT, kva, ret);
> >>>>>> +		put_page(page);
> >>>>>> +		return -EIO;
> >>>>>> +	}
> >>>>>> +
> >>>>>> +	return 0;
> >>>>>> +}
> >>>>>> +
> >>>>>> +static void tce_flush(struct iommu_table *tbl) {
> >>>>>> +	/* Flush/invalidate TLB caches if necessary */
> >>>>>> +	if (ppc_md.tce_flush)
> >>>>>> +		ppc_md.tce_flush(tbl);
> >>>>>> +
> >>>>>> +	/* Make sure updates are seen by hardware */
> >>>>>> +	mb();
> >>>>>> +}
> >>>>>> +
> >>>>>> +long iommu_put_tces(struct iommu_table *tbl, unsigned long entry,
> >>>>> uint64_t tce,
> >>>>>> +		enum dma_data_direction direction, unsigned long pages) {
> >>>>>> +	int i, ret = 0, pages_to_put = 0;
> >>>>>> +	struct page *page;
> >>>>>> +	struct iommu_pool *pool = get_pool(tbl, entry);
> >>>>>> +	struct page **oldpages;
> >>>>>> +	const int oldpagesnum = PAGE_SIZE/sizeof(*oldpages);
> >>>>>> +
> >>>>>> +	BUILD_BUG_ON(PAGE_SIZE < IOMMU_PAGE_SIZE);
> >>>>>> +
> >>>>>> +	/* Handle a single page request without allocation
> >>>>>> +	   of pages-to-release array */
> >>>>>> +	if (pages == 1) {
> >>>>>> +		spin_lock(&(pool->lock));
> >>>>>> +		page = free_tce(tbl, entry);
> >>>>>> +
> >>>>>> +		if (direction != DMA_NONE)
> >>>>>> +			ret = put_tce(tbl, entry, tce, direction);
> >>>>>> +
> >>>>>> +		tce_flush(tbl);
> >>>>>> +
> >>>>>> +		if (page)
> >>>>>> +			put_page(page);
> >>>>>> +
> >>>>>> +		spin_unlock(&(pool->lock));
> >>>>>> +		return ret;
> >>>>>> +	}
> >>>>>> +
> >>>>>> +	/* Releasing multiple pages */
> >>>>>> +	/* Allocate an array for pages to be released after TCE table
> >>>>>> +	   is updated */
> >>>>>> +	oldpages = kmalloc(PAGE_SIZE, GFP_KERNEL);
> >>>>>> +	if (!oldpages)
> >>>>>> +		return -ENOMEM;
> >>>>>> +
> >>>>>> +	spin_lock(&(pool->lock));
> >>>>>> +
> >>>>>> +	for (i = 0; (i < pages) && !ret; ++i, ++entry, tce +=
> >>>>> IOMMU_PAGE_SIZE) {
> >>>>>> +		page = free_tce(tbl, entry);
> >>>>>> +		if (page) {
> >>>>>> +			oldpages[pages_to_put] = page;
> >>>>>> +			++pages_to_put;
> >>>>>> +		}
> >>>>>> +
> >>>>>> +		if (direction != DMA_NONE)
> >>>>>> +			ret = put_tce(tbl, entry, tce, direction);
> >>>>>> +
> >>>>>> +		/* Release old pages if we reached the end of oldpages[] or
> >>>>>> +		   it is the last page or we are about to exit the loop */
> >>>>>> +		if ((pages_to_put == oldpagesnum) || (i == pages - 1) || ret)
> >>>>> {
> >>>>>> +			tce_flush(tbl);
> >>>>>
> >>>>> Avoiding tce_flush() is the reason for all this extra overhead, right?
> >>>>> I wonder if it'd be cleaner separating map vs unmap, where the map case
> >>>>> can avoid the oldpages array... but that means inserting new mappings on
> >>>>> top of old ones wouldn't put the pages.
> >>>
> >>>
> >>> Yes, we do not want to lose pages if the guest forgot to unmap them.
> >>
> >> Hmm, does that mean we're not actively clearing tce entries or somehow
> >> disabling the iommu window when the iommu is released through vfio?
> >
> > Ok, I see tces are put on shutdown via tce_iommu_detach_group, so you're
> > more concerned about the guest simply mapping over the top of its own
> > mappings.  Is that common?  Is it common enough for every multi-page
> > mapping to assume it will happen?  I know this is a performance
> > sensitive path for you and it seems like a map-only w/ fallback to
> > unmap, remap would be better in the general case.
> 
> 
> I do not get it. Where exactly does the performance suffer?
> iommu_put_tces() with a non-zero "tce" (i.e. "map") has to check that the
> entry is not already in use, at least to return EBUSY when it is, and this
> check is performed. If it is zero, there is no overhead at all. And that is
> going to be the 99.(9)% case, as the guest (un)maps one page per call.

I was mostly concerned about the kmalloc in your mapping path.  If you
had a map-only path it could scan the whole range to verify it's clear
and return EBUSY w/o allocating a buffer.  A second pass could do the
actual mappings.  Maybe it's not much of a win if you expect 99% of the
mappings to be single pages but since you effectively have a pv iommu
interface I wouldn't be surprised if they get batched in the guest.
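
Roughly this shape, I mean (hypothetical sketch; declarations, locking
and it_offset handling elided, untested):

	/* pass 1: verify the whole range is clear, no buffer needed */
	for (i = 0; i < pages; ++i)
		if (ppc_md.tce_get(tbl, entry + i) &
				(TCE_PCI_READ | TCE_PCI_WRITE))
			return -EBUSY;

	/* pass 2: do the actual mappings */
	for (i = 0; (i < pages) && !ret; ++i, tce += IOMMU_PAGE_SIZE)
		ret = put_tce(tbl, entry + i, tce, direction);

	tce_flush(tbl);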

> Generally speaking, we want to move "put tce" completely into the kernel
> for (much) better performance, and then vfio won't be dealing with it at all.

Right, but let's not use that as an excuse to be lazy and not ask if we
can do better here.

> We already agreed that the SPAPR TCE driver uses the x86 (aka type1) API,
> but I do not see why the powerpc implementation should look x86-like, as it
> still operates with powerpc machine-dependent callbacks, so the reader has
> to have some powerpc knowledge anyway.

I'm only using x86 as an example because it's the only one we have.  I
don't think anything we're talking about here is x86-ish or powerpc-ish.
There's a kmalloc in a performance path and I'm asking if we can get rid
of it.  I'm also nervous that we're silently doing fixups on user
parameters to adjust mapping sizes and clear overlaps without any
warning to the user.  Thanks,

Alex


^ permalink raw reply	[flat|nested] 122+ messages in thread

* Re: [PATCH 1/2] vfio powerpc: implemented IOMMU driver for VFIO
  2012-11-27  4:06             ` Alexey Kardashevskiy
@ 2012-11-27  4:29               ` Alex Williamson
  -1 siblings, 0 replies; 122+ messages in thread
From: Alex Williamson @ 2012-11-27  4:29 UTC (permalink / raw)
  To: Alexey Kardashevskiy
  Cc: Benjamin Herrenschmidt, Paul Mackerras, David Gibson,
	linuxppc-dev, linux-kernel, kvm

On Tue, 2012-11-27 at 15:06 +1100, Alexey Kardashevskiy wrote:
> On 27/11/12 05:20, Alex Williamson wrote:
> > On Fri, 2012-11-23 at 20:03 +1100, Alexey Kardashevskiy wrote:
> >> VFIO implements platform independent stuff such as
> >> a PCI driver, BAR access (via read/write on a file descriptor
> >> or direct mapping when possible) and IRQ signaling.
> >>
> >> The platform dependent part includes IOMMU initialization
> >> and handling. This patch implements an IOMMU driver for VFIO
> >> which does mapping/unmapping pages for the guest IO and
> >> provides information about DMA window (required by a POWERPC
> >> guest).
> >>
> >> The counterpart in QEMU is required to support this functionality.
> >>
> >> Cc: David Gibson <david@gibson.dropbear.id.au>
> >> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> >> ---
> >>   drivers/vfio/Kconfig                |    6 +
> >>   drivers/vfio/Makefile               |    1 +
> >>   drivers/vfio/vfio_iommu_spapr_tce.c |  247 +++++++++++++++++++++++++++++++++++
> >>   include/linux/vfio.h                |   20 +++
> >>   4 files changed, 274 insertions(+)
> >>   create mode 100644 drivers/vfio/vfio_iommu_spapr_tce.c
> >>
> >> diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
> >> index 7cd5dec..b464687 100644
> >> --- a/drivers/vfio/Kconfig
> >> +++ b/drivers/vfio/Kconfig
> >> @@ -3,10 +3,16 @@ config VFIO_IOMMU_TYPE1
> >>   	depends on VFIO
> >>   	default n
> >>
> >> +config VFIO_IOMMU_SPAPR_TCE
> >> +	tristate
> >> +	depends on VFIO && SPAPR_TCE_IOMMU
> >> +	default n
> >> +
> >>   menuconfig VFIO
> >>   	tristate "VFIO Non-Privileged userspace driver framework"
> >>   	depends on IOMMU_API
> >>   	select VFIO_IOMMU_TYPE1 if X86
> >> +	select VFIO_IOMMU_SPAPR_TCE if PPC_POWERNV
> >>   	help
> >>   	  VFIO provides a framework for secure userspace device drivers.
> >>   	  See Documentation/vfio.txt for more details.
> >> diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
> >> index 2398d4a..72bfabc 100644
> >> --- a/drivers/vfio/Makefile
> >> +++ b/drivers/vfio/Makefile
> >> @@ -1,3 +1,4 @@
> >>   obj-$(CONFIG_VFIO) += vfio.o
> >>   obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o
> >> +obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o
> >>   obj-$(CONFIG_VFIO_PCI) += pci/
> >> diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
> >> new file mode 100644
> >> index 0000000..46a6298
> >> --- /dev/null
> >> +++ b/drivers/vfio/vfio_iommu_spapr_tce.c
> >> @@ -0,0 +1,247 @@
> >> +/*
> >> + * VFIO: IOMMU DMA mapping support for TCE on POWER
> >> + *
> >> + * Copyright (C) 2012 IBM Corp.  All rights reserved.
> >> + *     Author: Alexey Kardashevskiy <aik@ozlabs.ru>
> >> + *
> >> + * This program is free software; you can redistribute it and/or modify
> >> + * it under the terms of the GNU General Public License version 2 as
> >> + * published by the Free Software Foundation.
> >> + *
> >> + * Derived from original vfio_iommu_type1.c:
> >> + * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
> >> + *     Author: Alex Williamson <alex.williamson@redhat.com>
> >> + */
> >> +
> >> +#include <linux/module.h>
> >> +#include <linux/pci.h>
> >> +#include <linux/slab.h>
> >> +#include <linux/uaccess.h>
> >> +#include <linux/err.h>
> >> +#include <linux/vfio.h>
> >> +#include <asm/iommu.h>
> >> +
> >> +#define DRIVER_VERSION  "0.1"
> >> +#define DRIVER_AUTHOR   "aik@ozlabs.ru"
> >> +#define DRIVER_DESC     "VFIO IOMMU SPAPR TCE"
> >> +
> >> +static void tce_iommu_detach_group(void *iommu_data,
> >> +		struct iommu_group *iommu_group);
> >> +
> >> +/*
> >> + * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
> >> + */
> >> +
> >> +/*
> >> + * The container descriptor supports only a single group per container.
> >> + * Required by the API as the container is not supplied with the IOMMU group
> >> + * at the moment of initialization.
> >> + */
> >> +struct tce_container {
> >> +	struct mutex lock;
> >> +	struct iommu_table *tbl;
> >> +};
> >> +
> >> +static void *tce_iommu_open(unsigned long arg)
> >> +{
> >> +	struct tce_container *container;
> >> +
> >> +	if (arg != VFIO_SPAPR_TCE_IOMMU) {
> >> +		printk(KERN_ERR "tce_vfio: Wrong IOMMU type\n");
> >> +		return ERR_PTR(-EINVAL);
> >> +	}
> >> +
> >> +	container = kzalloc(sizeof(*container), GFP_KERNEL);
> >> +	if (!container)
> >> +		return ERR_PTR(-ENOMEM);
> >> +
> >> +	mutex_init(&container->lock);
> >> +
> >> +	return container;
> >> +}
> >> +
> >> +static void tce_iommu_release(void *iommu_data)
> >> +{
> >> +	struct tce_container *container = iommu_data;
> >> +
> >> +	WARN_ON(container->tbl && !container->tbl->it_group);
> >
> > I think your patch ordering is backwards here.  it_group isn't added
> > until 2/2.  I'd really like to see the arch/powerpc code approved and
> > merged by the powerpc maintainer before we add the code that makes use
> > of it into vfio.  Otherwise we just get lots of churn if interfaces
> > change or they disapprove of it altogether.
> 
> 
> Makes sense, thanks.
> 
> 
> >> +	if (container->tbl && container->tbl->it_group)
> >> +		tce_iommu_detach_group(iommu_data, container->tbl->it_group);
> >> +
> >> +	mutex_destroy(&container->lock);
> >> +
> >> +	kfree(container);
> >> +}
> >> +
> >> +static long tce_iommu_ioctl(void *iommu_data,
> >> +				 unsigned int cmd, unsigned long arg)
> >> +{
> >> +	struct tce_container *container = iommu_data;
> >> +	unsigned long minsz;
> >> +
> >> +	switch (cmd) {
> >> +	case VFIO_CHECK_EXTENSION: {
> >> +		return (arg == VFIO_SPAPR_TCE_IOMMU) ? 1 : 0;
> >> +	}
> >> +	case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
> >> +		struct vfio_iommu_spapr_tce_info info;
> >> +		struct iommu_table *tbl = container->tbl;
> >> +
> >> +		if (WARN_ON(!tbl))
> >> +			return -ENXIO;
> >> +
> >> +		minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
> >> +				dma64_window_size);
> >> +
> >> +		if (copy_from_user(&info, (void __user *)arg, minsz))
> >> +			return -EFAULT;
> >> +
> >> +		if (info.argsz < minsz)
> >> +			return -EINVAL;
> >> +
> >> +		info.dma32_window_start = tbl->it_offset << IOMMU_PAGE_SHIFT;
> >> +		info.dma32_window_size = tbl->it_size << IOMMU_PAGE_SHIFT;
> >> +		info.dma64_window_start = 0;
> >> +		info.dma64_window_size = 0;
> >> +		info.flags = 0;
> >> +
> >> +		if (copy_to_user((void __user *)arg, &info, minsz))
> >> +			return -EFAULT;
> >> +
> >> +		return 0;
> >> +	}
> >> +	case VFIO_IOMMU_MAP_DMA: {
> >> +		vfio_iommu_spapr_tce_dma_map param;
> >> +		struct iommu_table *tbl = container->tbl;
> >> +		enum dma_data_direction direction = DMA_NONE;
> >> +
> >> +		if (WARN_ON(!tbl))
> >> +			return -ENXIO;
> >> +
> >> +		minsz = offsetofend(vfio_iommu_spapr_tce_dma_map, size);
> >> +
> >> +		if (copy_from_user(&param, (void __user *)arg, minsz))
> >> +			return -EFAULT;
> >> +
> >> +		if (param.argsz < minsz)
> >> +			return -EINVAL;
> >> +
> >> +		if ((param.flags & VFIO_DMA_MAP_FLAG_READ) &&
> >> +				(param.flags & VFIO_DMA_MAP_FLAG_WRITE)) {
> >> +			direction = DMA_BIDIRECTIONAL;
> >> +		} else if (param.flags & VFIO_DMA_MAP_FLAG_READ) {
> >> +			direction = DMA_TO_DEVICE;
> >> +		} else if (param.flags & VFIO_DMA_MAP_FLAG_WRITE) {
> >> +			direction = DMA_FROM_DEVICE;
> >> +		}
> >> +
> >> +		param.size += param.iova & ~IOMMU_PAGE_MASK;
> >> +		param.size = _ALIGN_UP(param.size, IOMMU_PAGE_SIZE);
> >
> > On x86 we force iova, vaddr, and size to all be aligned to the smallest
> > page granularity of the iommu and return -EINVAL if it doesn't fit.
> > What does it imply to the user if they're always aligned to work here?
> > Won't this interface happily map overlapping entries with no indication
> > to the user that the previous mapping is no longer valid?
> > Maybe another reason why a combined unmap/map makes me nervous, we have
> > to assume the user knows what they're doing.
> 
> 
> I got used to guests which do know what they are doing, so I am pretty calm :)
> But OK, I'll move the alignment into QEMU; it makes sense.
> 
> 
> >> +
> >> +		return iommu_put_tces(tbl, param.iova >> IOMMU_PAGE_SHIFT,
> >> +				param.vaddr & IOMMU_PAGE_MASK, direction,
> >> +				param.size >> IOMMU_PAGE_SHIFT);
> >> +	}
> >> +	case VFIO_IOMMU_UNMAP_DMA: {
> >> +		vfio_iommu_spapr_tce_dma_unmap param;
> >> +		struct iommu_table *tbl = container->tbl;
> >> +
> >> +		if (WARN_ON(!tbl))
> >> +			return -ENXIO;
> >> +
> >> +		minsz = offsetofend(vfio_iommu_spapr_tce_dma_unmap, size);
> >> +
> >> +		if (copy_from_user(&param, (void __user *)arg, minsz))
> >> +			return -EFAULT;
> >> +
> >> +		if (param.argsz < minsz)
> >> +			return -EINVAL;
> >> +
> >> +		param.size += param.iova & ~IOMMU_PAGE_MASK;
> >> +		param.size = _ALIGN_UP(param.size, IOMMU_PAGE_SIZE);
> >> +
> >> +		return iommu_put_tces(tbl, param.iova >> IOMMU_PAGE_SHIFT,
> >> +				0, DMA_NONE, param.size >> IOMMU_PAGE_SHIFT);
> >> +	}
> >> +	default:
> >> +		printk(KERN_WARNING "tce_vfio: unexpected cmd %x\n", cmd);
> >
> > pr_warn
> >
> >> +	}
> >> +
> >> +	return -ENOTTY;
> >> +}
> >> +
> >> +static int tce_iommu_attach_group(void *iommu_data,
> >> +		struct iommu_group *iommu_group)
> >> +{
> >> +	struct tce_container *container = iommu_data;
> >> +	struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
> >> +
> >> +	BUG_ON(!tbl);
> >> +	mutex_lock(&container->lock);
> >> +	pr_debug("tce_vfio: Attaching group #%u to iommu %p\n",
> >> +			iommu_group_id(iommu_group), iommu_group);
> >> +	if (container->tbl) {
> >> +		printk(KERN_WARNING "tce_vfio: Only one group per IOMMU container is allowed, existing id=%d, attaching id=%d\n",
> >
> > pr_warn
> >
> >> +				iommu_group_id(container->tbl->it_group),
> >> +				iommu_group_id(iommu_group));
> >> +		mutex_unlock(&container->lock);
> >> +		return -EBUSY;
> >> +	}
> >> +
> >> +	container->tbl = tbl;
> >
> > Would it be too much paranoia to clear all the tce here as you do below
> > on detach?
> 
> Guess so. I do the unmap on detach(), and the guest calls put_tce(0) on
> (i.e. unmaps) the whole DMA window at boot time.

But that's just one user of this interface; we can't assume they'll all
be so agreeable.  If any TCEs were enabled here, a malicious user would
have a window into host memory, right?  Thanks,

Alex


^ permalink raw reply	[flat|nested] 122+ messages in thread

* Re: [PATCH 2/2] vfio powerpc: enabled on powernv platform
  2012-11-23  9:03         ` Alexey Kardashevskiy
@ 2012-11-27  4:41           ` Alex Williamson
  -1 siblings, 0 replies; 122+ messages in thread
From: Alex Williamson @ 2012-11-27  4:41 UTC (permalink / raw)
  To: Alexey Kardashevskiy
  Cc: Benjamin Herrenschmidt, Paul Mackerras, David Gibson,
	linuxppc-dev, linux-kernel, kvm

On Fri, 2012-11-23 at 20:03 +1100, Alexey Kardashevskiy wrote:
> This patch initializes IOMMU groups based on the IOMMU
> configuration discovered during the PCI scan on POWERNV
> (POWER non virtualized) platform. The IOMMU groups are
> to be used later by VFIO driver (PCI pass through).
> 
> It also implements an API for mapping/unmapping pages for
> guest PCI drivers and providing DMA window properties.
> This API is going to be used later by QEMU-VFIO to handle
> h_put_tce hypercalls from the KVM guest.
> 
> Although this driver has been tested only on the POWERNV
> platform, it should work on any platform which supports
> TCE tables.
> 
> To enable VFIO on POWER, enable SPAPR_TCE_IOMMU config
> option and configure VFIO as required.
> 
> Cc: David Gibson <david@gibson.dropbear.id.au>
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> ---
>  arch/powerpc/include/asm/iommu.h     |    6 ++
>  arch/powerpc/kernel/iommu.c          |  141 ++++++++++++++++++++++++++++++++++
>  arch/powerpc/platforms/powernv/pci.c |  135 ++++++++++++++++++++++++++++++++
>  drivers/iommu/Kconfig                |    8 ++
>  4 files changed, 290 insertions(+)
> 
> diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
> index cbfe678..5ba66cb 100644
> --- a/arch/powerpc/include/asm/iommu.h
> +++ b/arch/powerpc/include/asm/iommu.h
> @@ -76,6 +76,9 @@ struct iommu_table {
>  	struct iommu_pool large_pool;
>  	struct iommu_pool pools[IOMMU_NR_POOLS];
>  	unsigned long *it_map;       /* A simple allocation bitmap for now */
> +#ifdef CONFIG_IOMMU_API
> +	struct iommu_group *it_group;
> +#endif
>  };
>  
>  struct scatterlist;
> @@ -147,5 +150,8 @@ static inline void iommu_restore(void)
>  }
>  #endif
>  
> +extern long iommu_put_tces(struct iommu_table *tbl, unsigned long entry, uint64_t tce,
> +		enum dma_data_direction direction, unsigned long pages);
> +
>  #endif /* __KERNEL__ */
>  #endif /* _ASM_IOMMU_H */
> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
> index ff5a6ce..c8dad1f 100644
> --- a/arch/powerpc/kernel/iommu.c
> +++ b/arch/powerpc/kernel/iommu.c
> @@ -44,6 +44,7 @@
>  #include <asm/kdump.h>
>  #include <asm/fadump.h>
>  #include <asm/vio.h>
> +#include <asm/tce.h>
>  
>  #define DBG(...)
>  
> @@ -856,3 +857,143 @@ void iommu_free_coherent(struct iommu_table *tbl, size_t size,
>  		free_pages((unsigned long)vaddr, get_order(size));
>  	}
>  }
> +
> +#ifdef CONFIG_IOMMU_API
> +/*
> + * SPAPR TCE API
> + */
> +static struct page *free_tce(struct iommu_table *tbl, unsigned long entry)
> +{
> +	struct page *page;
> +	unsigned long oldtce;
> +
> +	oldtce = ppc_md.tce_get(tbl, entry);
> +
> +	if (!(oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)))
> +		return NULL;
> +
> +	page = pfn_to_page(oldtce >> PAGE_SHIFT);
> +
> +	WARN_ON(!page);
> +	if (page && (oldtce & TCE_PCI_WRITE))
> +		SetPageDirty(page);
> +	ppc_md.tce_free(tbl, entry, 1);
> +
> +	return page;
> +}
> +
> +static int put_tce(struct iommu_table *tbl, unsigned long entry,
> +		uint64_t tce, enum dma_data_direction direction)
> +{
> +	int ret;
> +	struct page *page = NULL;
> +	unsigned long kva, offset;
> +
> +	/* Map new TCE */
> +	offset = (tce & IOMMU_PAGE_MASK) - (tce & PAGE_MASK);
> +	ret = get_user_pages_fast(tce & PAGE_MASK, 1,
> +			direction != DMA_TO_DEVICE, &page);

We're locking memory here on behalf of the user, but I don't see where
rlimit gets checked to verify the user has privileges to lock the pages.
I know you're locking a much smaller set of memory than x86 does, but
are we just foregoing that added security?
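
A sketch of the kind of accounting in question, loosely modeled on the
type1 driver (the helper name and call site here are hypothetical):

	static int tce_account_locked_pages(unsigned long npages)
	{
		unsigned long locked, limit;
		int ret = 0;

		/* Charge the newly pinned pages against RLIMIT_MEMLOCK */
		down_write(&current->mm->mmap_sem);
		locked = current->mm->locked_vm + npages;
		limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
		if (locked > limit && !capable(CAP_IPC_LOCK))
			ret = -ENOMEM;
		else
			current->mm->locked_vm = locked;
		up_write(&current->mm->mmap_sem);

		return ret;
	}

The unpin path would subtract the same count under the same lock.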

> +	if (ret < 1) {
> +		printk(KERN_ERR "tce_vfio: get_user_pages_fast failed tce=%llx ioba=%lx ret=%d\n",
> +				tce, entry << IOMMU_PAGE_SHIFT, ret);
> +		if (!ret)
> +			ret = -EFAULT;
> +		return ret;
> +	}
> +
> +	kva = (unsigned long) page_address(page);
> +	kva += offset;
> +
> +	/* tce_build receives a virtual address */
> +	entry += tbl->it_offset; /* Offset into real TCE table */
> +	ret = ppc_md.tce_build(tbl, entry, 1, kva, direction, NULL);
> +
> +	/* tce_build() only returns non-zero for transient errors */
> +	if (unlikely(ret)) {
> +		printk(KERN_ERR "tce_vfio: tce_put failed on tce=%llx ioba=%lx kva=%lx ret=%d\n",
> +				tce, entry << IOMMU_PAGE_SHIFT, kva, ret);
> +		put_page(page);
> +		return -EIO;
> +	}
> +
> +	return 0;
> +}
> +
> +static void tce_flush(struct iommu_table *tbl)
> +{
> +	/* Flush/invalidate TLB caches if necessary */
> +	if (ppc_md.tce_flush)
> +		ppc_md.tce_flush(tbl);
> +
> +	/* Make sure updates are seen by hardware */
> +	mb();
> +}
> +
> +long iommu_put_tces(struct iommu_table *tbl, unsigned long entry, uint64_t tce,
> +		enum dma_data_direction direction, unsigned long pages)
> +{
> +	int i, ret = 0, pages_to_put = 0;
> +	struct page *page;
> +	struct iommu_pool *pool = get_pool(tbl, entry);
> +	struct page **oldpages;
> +	const int oldpagesnum = PAGE_SIZE/sizeof(*oldpages);
> +
> +	BUILD_BUG_ON(PAGE_SIZE < IOMMU_PAGE_SIZE);
> +
> +	/* Handle a single page request without allocation
> +	   of pages-to-release array */

nit, this comment style doesn't seem to match anything existing in this
file.  I'd also be tempted to use pr_err/warn in this file, but I'll
leave that for the maintainers.  Thanks,

Alex
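
For reference, the usual kernel form for that multi-line comment would be:

	/*
	 * Handle a single page request without allocating
	 * the pages-to-release array.
	 */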

> +	if (pages == 1) {
> +		spin_lock(&(pool->lock));
> +		page = free_tce(tbl, entry);
> +
> +		if (direction != DMA_NONE)
> +			ret = put_tce(tbl, entry, tce, direction);
> +
> +		tce_flush(tbl);
> +
> +		if (page)
> +			put_page(page);
> +
> +		spin_unlock(&(pool->lock));
> +		return ret;
> +	}
> +
> +	/* Releasing multiple pages */
> +	/* Allocate an array for pages to be released after TCE table
> +	   is updated */
> +	oldpages = kmalloc(PAGE_SIZE, GFP_KERNEL);
> +	if (!oldpages)
> +		return -ENOMEM;
> +
> +	spin_lock(&(pool->lock));
> +
> +	for (i = 0; (i < pages) && !ret; ++i, ++entry, tce += IOMMU_PAGE_SIZE) {
> +		page = free_tce(tbl, entry);
> +		if (page) {
> +			oldpages[pages_to_put] = page;
> +			++pages_to_put;
> +		}
> +
> +		if (direction != DMA_NONE)
> +			ret = put_tce(tbl, entry, tce, direction);
> +
> +		/* Release old pages if we reached the end of oldpages[] or
> +		   it is the last page or we are about to exit the loop */
> +		if ((pages_to_put == oldpagesnum) || (i == pages - 1) || ret) {
> +			tce_flush(tbl);
> +
> +			/* Release pages after removing them from TCE table */
> +			while (pages_to_put) {
> +				--pages_to_put;
> +				put_page(oldpages[pages_to_put]);
> +			}
> +		}
> +	}
> +
> +	spin_unlock(&(pool->lock));
> +	kfree(oldpages);
> +
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(iommu_put_tces);
> +#endif /* CONFIG_IOMMU_API */
> diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
> index 05205cf..660dcc6 100644
> --- a/arch/powerpc/platforms/powernv/pci.c
> +++ b/arch/powerpc/platforms/powernv/pci.c
> @@ -20,6 +20,7 @@
>  #include <linux/irq.h>
>  #include <linux/io.h>
>  #include <linux/msi.h>
> +#include <linux/iommu.h>
>  
>  #include <asm/sections.h>
>  #include <asm/io.h>
> @@ -613,3 +614,137 @@ void __init pnv_pci_init(void)
>  	ppc_md.teardown_msi_irqs = pnv_teardown_msi_irqs;
>  #endif
>  }
> +
> +#ifdef CONFIG_IOMMU_API
> +/*
> + * IOMMU groups support required by VFIO
> + */
> +static int add_device(struct device *dev)
> +{
> +	struct iommu_table *tbl;
> +	int ret = 0;
> +
> +	if (WARN_ON(dev->iommu_group)) {
> +		printk(KERN_WARNING "tce_vfio: device %s is already in iommu group %d, skipping\n",
> +				dev_name(dev),
> +				iommu_group_id(dev->iommu_group));
> +		return -EBUSY;
> +	}
> +
> +	tbl = get_iommu_table_base(dev);
> +	if (!tbl) {
> +		pr_debug("tce_vfio: skipping device %s with no tbl\n",
> +				dev_name(dev));
> +		return 0;
> +	}
> +
> +	pr_debug("tce_vfio: adding %s to iommu group %d\n",
> +			dev_name(dev), iommu_group_id(tbl->it_group));
> +
> +	ret = iommu_group_add_device(tbl->it_group, dev);
> +	if (ret < 0)
> +		printk(KERN_ERR "tce_vfio: %s has not been added, ret=%d\n",
> +				dev_name(dev), ret);
> +
> +	return ret;
> +}
> +
> +static void del_device(struct device *dev)
> +{
> +	iommu_group_remove_device(dev);
> +}
> +
> +static int iommu_bus_notifier(struct notifier_block *nb,
> +			      unsigned long action, void *data)
> +{
> +	struct device *dev = data;
> +
> +	switch (action) {
> +	case BUS_NOTIFY_ADD_DEVICE:
> +		return add_device(dev);
> +	case BUS_NOTIFY_DEL_DEVICE:
> +		del_device(dev);
> +		return 0;
> +	default:
> +		return 0;
> +	}
> +}
> +
> +static struct notifier_block tce_iommu_bus_nb = {
> +	.notifier_call = iommu_bus_notifier,
> +};
> +
> +static void group_release(void *iommu_data)
> +{
> +	struct iommu_table *tbl = iommu_data;
> +	tbl->it_group = NULL;
> +}
> +
> +static int __init tce_iommu_init(void)
> +{
> +	struct pci_dev *pdev = NULL;
> +	struct iommu_table *tbl;
> +	struct iommu_group *grp;
> +
> +	bus_register_notifier(&pci_bus_type, &tce_iommu_bus_nb);
> +
> +	/* Allocate and initialize IOMMU groups */
> +	for_each_pci_dev(pdev) {
> +		tbl = get_iommu_table_base(&pdev->dev);
> +		if (!tbl)
> +			continue;
> +
> +		/* Skip already initialized */
> +		if (tbl->it_group)
> +			continue;
> +
> +		grp = iommu_group_alloc();
> +		if (IS_ERR(grp)) {
> +			printk(KERN_INFO "tce_vfio: cannot create "
> +					"new IOMMU group, ret=%ld\n",
> +					PTR_ERR(grp));
> +			return PTR_ERR(grp);
> +		}
> +		tbl->it_group = grp;
> +		iommu_group_set_iommudata(grp, tbl, group_release);
> +	}
> +
> +	/* Add PCI devices to VFIO groups */
> +	for_each_pci_dev(pdev)
> +		add_device(&pdev->dev);
> +
> +	return 0;
> +}
> +
> +static void __exit tce_iommu_cleanup(void)
> +{
> +	struct pci_dev *pdev = NULL;
> +	struct iommu_table *tbl;
> +	struct iommu_group *grp = NULL;
> +
> +	bus_unregister_notifier(&pci_bus_type, &tce_iommu_bus_nb);
> +
> +	/* Delete PCI devices from VFIO groups */
> +	for_each_pci_dev(pdev)
> +		del_device(&pdev->dev);
> +
> +	/* Release VFIO groups */
> +	for_each_pci_dev(pdev) {
> +		tbl = get_iommu_table_base(&pdev->dev);
> +		if (!tbl)
> +			continue;
> +		grp = tbl->it_group;
> +
> +		/* Skip (already) uninitialized */
> +		if (!grp)
> +			continue;
> +
> +		/* Do actual release, group_release() is expected to work */
> +		iommu_group_put(grp);
> +		BUG_ON(tbl->it_group);
> +	}
> +}
> +
> +module_init(tce_iommu_init);
> +module_exit(tce_iommu_cleanup);
> +#endif /* CONFIG_IOMMU_API */
> diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
> index 9f69b56..29d11dc 100644
> --- a/drivers/iommu/Kconfig
> +++ b/drivers/iommu/Kconfig
> @@ -187,4 +187,12 @@ config EXYNOS_IOMMU_DEBUG
>  
>  	  Say N unless you need kernel log message for IOMMU debugging
>  
> +config SPAPR_TCE_IOMMU
> +	bool "sPAPR TCE IOMMU Support"
> +	depends on PPC_POWERNV
> +	select IOMMU_API
> +	help
> +	  Enables the bits of the IOMMU API required by VFIO. The
> +	  iommu_ops callbacks are not implemented yet.
> +
>  endif # IOMMU_SUPPORT




^ permalink raw reply	[flat|nested] 122+ messages in thread

* Re: [PATCH 1/2] vfio powerpc: implemented IOMMU driver for VFIO
  2012-11-27  4:29               ` Alex Williamson
@ 2012-11-27  4:58                 ` Alexey Kardashevskiy
  -1 siblings, 0 replies; 122+ messages in thread
From: Alexey Kardashevskiy @ 2012-11-27  4:58 UTC (permalink / raw)
  To: Alex Williamson
  Cc: Benjamin Herrenschmidt, Paul Mackerras, David Gibson,
	linuxppc-dev, linux-kernel, kvm

On 27/11/12 15:29, Alex Williamson wrote:
> On Tue, 2012-11-27 at 15:06 +1100, Alexey Kardashevskiy wrote:
>> On 27/11/12 05:20, Alex Williamson wrote:
>>> On Fri, 2012-11-23 at 20:03 +1100, Alexey Kardashevskiy wrote:
>>>> VFIO implements platform independent stuff such as
>>>> a PCI driver, BAR access (via read/write on a file descriptor
>>>> or direct mapping when possible) and IRQ signaling.
>>>>
>>>> The platform dependent part includes IOMMU initialization
>>>> and handling. This patch implements an IOMMU driver for VFIO
>>>> which does mapping/unmapping pages for the guest IO and
>>>> provides information about DMA window (required by a POWERPC
>>>> guest).
>>>>
>>>> The counterpart in QEMU is required to support this functionality.
>>>>
>>>> Cc: David Gibson <david@gibson.dropbear.id.au>
>>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>>>> ---
>>>>    drivers/vfio/Kconfig                |    6 +
>>>>    drivers/vfio/Makefile               |    1 +
>>>>    drivers/vfio/vfio_iommu_spapr_tce.c |  247 +++++++++++++++++++++++++++++++++++
>>>>    include/linux/vfio.h                |   20 +++
>>>>    4 files changed, 274 insertions(+)
>>>>    create mode 100644 drivers/vfio/vfio_iommu_spapr_tce.c
>>>>
>>>> diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
>>>> index 7cd5dec..b464687 100644
>>>> --- a/drivers/vfio/Kconfig
>>>> +++ b/drivers/vfio/Kconfig
>>>> @@ -3,10 +3,16 @@ config VFIO_IOMMU_TYPE1
>>>>    	depends on VFIO
>>>>    	default n
>>>>
>>>> +config VFIO_IOMMU_SPAPR_TCE
>>>> +	tristate
>>>> +	depends on VFIO && SPAPR_TCE_IOMMU
>>>> +	default n
>>>> +
>>>>    menuconfig VFIO
>>>>    	tristate "VFIO Non-Privileged userspace driver framework"
>>>>    	depends on IOMMU_API
>>>>    	select VFIO_IOMMU_TYPE1 if X86
>>>> +	select VFIO_IOMMU_SPAPR_TCE if PPC_POWERNV
>>>>    	help
>>>>    	  VFIO provides a framework for secure userspace device drivers.
>>>>    	  See Documentation/vfio.txt for more details.
>>>> diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
>>>> index 2398d4a..72bfabc 100644
>>>> --- a/drivers/vfio/Makefile
>>>> +++ b/drivers/vfio/Makefile
>>>> @@ -1,3 +1,4 @@
>>>>    obj-$(CONFIG_VFIO) += vfio.o
>>>>    obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o
>>>> +obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o
>>>>    obj-$(CONFIG_VFIO_PCI) += pci/
>>>> diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
>>>> new file mode 100644
>>>> index 0000000..46a6298
>>>> --- /dev/null
>>>> +++ b/drivers/vfio/vfio_iommu_spapr_tce.c
>>>> @@ -0,0 +1,247 @@
>>>> +/*
>>>> + * VFIO: IOMMU DMA mapping support for TCE on POWER
>>>> + *
>>>> + * Copyright (C) 2012 IBM Corp.  All rights reserved.
>>>> + *     Author: Alexey Kardashevskiy <aik@ozlabs.ru>
>>>> + *
>>>> + * This program is free software; you can redistribute it and/or modify
>>>> + * it under the terms of the GNU General Public License version 2 as
>>>> + * published by the Free Software Foundation.
>>>> + *
>>>> + * Derived from original vfio_iommu_type1.c:
>>>> + * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
>>>> + *     Author: Alex Williamson <alex.williamson@redhat.com>
>>>> + */
>>>> +
>>>> +#include <linux/module.h>
>>>> +#include <linux/pci.h>
>>>> +#include <linux/slab.h>
>>>> +#include <linux/uaccess.h>
>>>> +#include <linux/err.h>
>>>> +#include <linux/vfio.h>
>>>> +#include <asm/iommu.h>
>>>> +
>>>> +#define DRIVER_VERSION  "0.1"
>>>> +#define DRIVER_AUTHOR   "aik@ozlabs.ru"
>>>> +#define DRIVER_DESC     "VFIO IOMMU SPAPR TCE"
>>>> +
>>>> +static void tce_iommu_detach_group(void *iommu_data,
>>>> +		struct iommu_group *iommu_group);
>>>> +
>>>> +/*
>>>> + * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
>>>> + */
>>>> +
>>>> +/*
>>>> + * The container descriptor supports only a single group per container.
>>>> + * Required by the API as the container is not supplied with the IOMMU group
>>>> + * at the moment of initialization.
>>>> + */
>>>> +struct tce_container {
>>>> +	struct mutex lock;
>>>> +	struct iommu_table *tbl;
>>>> +};
>>>> +
>>>> +static void *tce_iommu_open(unsigned long arg)
>>>> +{
>>>> +	struct tce_container *container;
>>>> +
>>>> +	if (arg != VFIO_SPAPR_TCE_IOMMU) {
>>>> +		printk(KERN_ERR "tce_vfio: Wrong IOMMU type\n");
>>>> +		return ERR_PTR(-EINVAL);
>>>> +	}
>>>> +
>>>> +	container = kzalloc(sizeof(*container), GFP_KERNEL);
>>>> +	if (!container)
>>>> +		return ERR_PTR(-ENOMEM);
>>>> +
>>>> +	mutex_init(&container->lock);
>>>> +
>>>> +	return container;
>>>> +}
>>>> +
>>>> +static void tce_iommu_release(void *iommu_data)
>>>> +{
>>>> +	struct tce_container *container = iommu_data;
>>>> +
>>>> +	WARN_ON(container->tbl && !container->tbl->it_group);
>>>
>>> I think your patch ordering is backwards here.  it_group isn't added
>>> until 2/2.  I'd really like to see the arch/powerpc code approved and
>>> merged by the powerpc maintainer before we add the code that makes use
>>> of it into vfio.  Otherwise we just get lots of churn if interfaces
>>> change or they disapprove of it altogether.
>>
>>
>> Makes sense, thanks.
>>
>>
>>>> +	if (container->tbl && container->tbl->it_group)
>>>> +		tce_iommu_detach_group(iommu_data, container->tbl->it_group);
>>>> +
>>>> +	mutex_destroy(&container->lock);
>>>> +
>>>> +	kfree(container);
>>>> +}
>>>> +
>>>> +static long tce_iommu_ioctl(void *iommu_data,
>>>> +				 unsigned int cmd, unsigned long arg)
>>>> +{
>>>> +	struct tce_container *container = iommu_data;
>>>> +	unsigned long minsz;
>>>> +
>>>> +	switch (cmd) {
>>>> +	case VFIO_CHECK_EXTENSION: {
>>>> +		return (arg == VFIO_SPAPR_TCE_IOMMU) ? 1 : 0;
>>>> +	}
>>>> +	case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
>>>> +		struct vfio_iommu_spapr_tce_info info;
>>>> +		struct iommu_table *tbl = container->tbl;
>>>> +
>>>> +		if (WARN_ON(!tbl))
>>>> +			return -ENXIO;
>>>> +
>>>> +		minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
>>>> +				dma64_window_size);
>>>> +
>>>> +		if (copy_from_user(&info, (void __user *)arg, minsz))
>>>> +			return -EFAULT;
>>>> +
>>>> +		if (info.argsz < minsz)
>>>> +			return -EINVAL;
>>>> +
>>>> +		info.dma32_window_start = tbl->it_offset << IOMMU_PAGE_SHIFT;
>>>> +		info.dma32_window_size = tbl->it_size << IOMMU_PAGE_SHIFT;
>>>> +		info.dma64_window_start = 0;
>>>> +		info.dma64_window_size = 0;
>>>> +		info.flags = 0;
>>>> +
>>>> +		if (copy_to_user((void __user *)arg, &info, minsz))
>>>> +			return -EFAULT;
>>>> +
>>>> +		return 0;
>>>> +	}
>>>> +	case VFIO_IOMMU_MAP_DMA: {
>>>> +		vfio_iommu_spapr_tce_dma_map param;
>>>> +		struct iommu_table *tbl = container->tbl;
>>>> +		enum dma_data_direction direction = DMA_NONE;
>>>> +
>>>> +		if (WARN_ON(!tbl))
>>>> +			return -ENXIO;
>>>> +
>>>> +		minsz = offsetofend(vfio_iommu_spapr_tce_dma_map, size);
>>>> +
>>>> +		if (copy_from_user(&param, (void __user *)arg, minsz))
>>>> +			return -EFAULT;
>>>> +
>>>> +		if (param.argsz < minsz)
>>>> +			return -EINVAL;
>>>> +
>>>> +		if ((param.flags & VFIO_DMA_MAP_FLAG_READ) &&
>>>> +				(param.flags & VFIO_DMA_MAP_FLAG_WRITE)) {
>>>> +			direction = DMA_BIDIRECTIONAL;
>>>> +		} else if (param.flags & VFIO_DMA_MAP_FLAG_READ) {
>>>> +			direction = DMA_TO_DEVICE;
>>>> +		} else if (param.flags & VFIO_DMA_MAP_FLAG_WRITE) {
>>>> +			direction = DMA_FROM_DEVICE;
>>>> +		}
>>>> +
>>>> +		param.size += param.iova & ~IOMMU_PAGE_MASK;
>>>> +		param.size = _ALIGN_UP(param.size, IOMMU_PAGE_SIZE);
>>>
>>> On x86 we force iova, vaddr, and size to all be aligned to the smallest
>>> page granularity of the iommu and return -EINVAL if it doesn't fit.
>>> What does it imply to the user if they're always aligned to work here?
>>> Won't this interface happily map overlapping entries with no indication
>>> to the user that the previous mapping is no longer valid?
>>> Maybe another reason why a combined unmap/map makes me nervous, we have
>>> to assume the user knows what they're doing.
>>
>>
>> I got used to guests that do know what they are doing, so I am pretty calm :)
>> but OK, I'll move the alignment to QEMU; it makes sense.
>>
>>
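
The stricter check would look something like this (a sketch that rejects
unaligned requests instead of rounding them up, as type1 does):

	if ((param.iova & ~IOMMU_PAGE_MASK) ||
			(param.vaddr & ~IOMMU_PAGE_MASK) ||
			(param.size & ~IOMMU_PAGE_MASK))
		return -EINVAL;
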
>>>> +
>>>> +		return iommu_put_tces(tbl, param.iova >> IOMMU_PAGE_SHIFT,
>>>> +				param.vaddr & IOMMU_PAGE_MASK, direction,
>>>> +				param.size >> IOMMU_PAGE_SHIFT);
>>>> +	}
>>>> +	case VFIO_IOMMU_UNMAP_DMA: {
>>>> +		vfio_iommu_spapr_tce_dma_unmap param;
>>>> +		struct iommu_table *tbl = container->tbl;
>>>> +
>>>> +		if (WARN_ON(!tbl))
>>>> +			return -ENXIO;
>>>> +
>>>> +		minsz = offsetofend(vfio_iommu_spapr_tce_dma_unmap, size);
>>>> +
>>>> +		if (copy_from_user(&param, (void __user *)arg, minsz))
>>>> +			return -EFAULT;
>>>> +
>>>> +		if (param.argsz < minsz)
>>>> +			return -EINVAL;
>>>> +
>>>> +		param.size += param.iova & ~IOMMU_PAGE_MASK;
>>>> +		param.size = _ALIGN_UP(param.size, IOMMU_PAGE_SIZE);
>>>> +
>>>> +		return iommu_put_tces(tbl, param.iova >> IOMMU_PAGE_SHIFT,
>>>> +				0, DMA_NONE, param.size >> IOMMU_PAGE_SHIFT);
>>>> +	}
>>>> +	default:
>>>> +		printk(KERN_WARNING "tce_vfio: unexpected cmd %x\n", cmd);
>>>
>>> pr_warn
>>>
>>>> +	}
>>>> +
>>>> +	return -ENOTTY;
>>>> +}
>>>> +
>>>> +static int tce_iommu_attach_group(void *iommu_data,
>>>> +		struct iommu_group *iommu_group)
>>>> +{
>>>> +	struct tce_container *container = iommu_data;
>>>> +	struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
>>>> +
>>>> +	BUG_ON(!tbl);
>>>> +	mutex_lock(&container->lock);
>>>> +	pr_debug("tce_vfio: Attaching group #%u to iommu %p\n",
>>>> +			iommu_group_id(iommu_group), iommu_group);
>>>> +	if (container->tbl) {
>>>> +		printk(KERN_WARNING "tce_vfio: Only one group per IOMMU container is allowed, existing id=%d, attaching id=%d\n",
>>>
>>> pr_warn
>>>
>>>> +				iommu_group_id(container->tbl->it_group),
>>>> +				iommu_group_id(iommu_group));
>>>> +		mutex_unlock(&container->lock);
>>>> +		return -EBUSY;
>>>> +	}
>>>> +
>>>> +	container->tbl = tbl;
>>>
>>> Would it be too much paranoia to clear all the tce here as you do below
>>> on detach?
>>
>> Guess so. I do unmap on detach(), and the guest calls put_tce(0) (i.e.
>> unmaps) the whole DMA window at boot time.
>
> But that's just one user of this interface, we can't assume they'll all
> be so agreeable.  If any tces were enabled here, a malicious user would
> have a window to host memory, right?  Thanks,


But I still release pages on detach(); how can this code not be called on
guest exit (normal or crashed)?



>
> Alex
>


-- 
Alexey

^ permalink raw reply	[flat|nested] 122+ messages in thread

* Re: [PATCH 1/2] vfio powerpc: implemented IOMMU driver for VFIO
  2012-11-27  4:58                 ` Alexey Kardashevskiy
@ 2012-11-27  5:06                   ` David Gibson
  -1 siblings, 0 replies; 122+ messages in thread
From: David Gibson @ 2012-11-27  5:06 UTC (permalink / raw)
  To: Alexey Kardashevskiy
  Cc: Alex Williamson, kvm, linux-kernel, Paul Mackerras, linuxppc-dev

On Tue, Nov 27, 2012 at 03:58:14PM +1100, Alexey Kardashevskiy wrote:
> On 27/11/12 15:29, Alex Williamson wrote:
> >On Tue, 2012-11-27 at 15:06 +1100, Alexey Kardashevskiy wrote:
> >>On 27/11/12 05:20, Alex Williamson wrote:
> >>>On Fri, 2012-11-23 at 20:03 +1100, Alexey Kardashevskiy wrote:
> >>>>VFIO implements platform independent stuff such as
> >>>>a PCI driver, BAR access (via read/write on a file descriptor
> >>>>or direct mapping when possible) and IRQ signaling.
> >>>>
> >>>>The platform dependent part includes IOMMU initialization
> >>>>and handling. This patch implements an IOMMU driver for VFIO
> >>>>which does mapping/unmapping pages for the guest IO and
> >>>>provides information about DMA window (required by a POWERPC
> >>>>guest).
> >>>>
> >>>>The counterpart in QEMU is required to support this functionality.
> >>>>
> >>>>Cc: David Gibson <david@gibson.dropbear.id.au>
> >>>>Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> >>>>---
> >>>>   drivers/vfio/Kconfig                |    6 +
> >>>>   drivers/vfio/Makefile               |    1 +
> >>>>   drivers/vfio/vfio_iommu_spapr_tce.c |  247 +++++++++++++++++++++++++++++++++++
> >>>>   include/linux/vfio.h                |   20 +++
> >>>>   4 files changed, 274 insertions(+)
> >>>>   create mode 100644 drivers/vfio/vfio_iommu_spapr_tce.c
> >>>>
> >>>>diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
> >>>>index 7cd5dec..b464687 100644
> >>>>--- a/drivers/vfio/Kconfig
> >>>>+++ b/drivers/vfio/Kconfig
> >>>>@@ -3,10 +3,16 @@ config VFIO_IOMMU_TYPE1
> >>>>   	depends on VFIO
> >>>>   	default n
> >>>>
> >>>>+config VFIO_IOMMU_SPAPR_TCE
> >>>>+	tristate
> >>>>+	depends on VFIO && SPAPR_TCE_IOMMU
> >>>>+	default n
> >>>>+
> >>>>   menuconfig VFIO
> >>>>   	tristate "VFIO Non-Privileged userspace driver framework"
> >>>>   	depends on IOMMU_API
> >>>>   	select VFIO_IOMMU_TYPE1 if X86
> >>>>+	select VFIO_IOMMU_SPAPR_TCE if PPC_POWERNV
> >>>>   	help
> >>>>   	  VFIO provides a framework for secure userspace device drivers.
> >>>>   	  See Documentation/vfio.txt for more details.
> >>>>diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
> >>>>index 2398d4a..72bfabc 100644
> >>>>--- a/drivers/vfio/Makefile
> >>>>+++ b/drivers/vfio/Makefile
> >>>>@@ -1,3 +1,4 @@
> >>>>   obj-$(CONFIG_VFIO) += vfio.o
> >>>>   obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o
> >>>>+obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o
> >>>>   obj-$(CONFIG_VFIO_PCI) += pci/
> >>>>diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
> >>>>new file mode 100644
> >>>>index 0000000..46a6298
> >>>>--- /dev/null
> >>>>+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
> >>>>@@ -0,0 +1,247 @@
> >>>>+/*
> >>>>+ * VFIO: IOMMU DMA mapping support for TCE on POWER
> >>>>+ *
> >>>>+ * Copyright (C) 2012 IBM Corp.  All rights reserved.
> >>>>+ *     Author: Alexey Kardashevskiy <aik@ozlabs.ru>
> >>>>+ *
> >>>>+ * This program is free software; you can redistribute it and/or modify
> >>>>+ * it under the terms of the GNU General Public License version 2 as
> >>>>+ * published by the Free Software Foundation.
> >>>>+ *
> >>>>+ * Derived from original vfio_iommu_type1.c:
> >>>>+ * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
> >>>>+ *     Author: Alex Williamson <alex.williamson@redhat.com>
> >>>>+ */
> >>>>+
> >>>>+#include <linux/module.h>
> >>>>+#include <linux/pci.h>
> >>>>+#include <linux/slab.h>
> >>>>+#include <linux/uaccess.h>
> >>>>+#include <linux/err.h>
> >>>>+#include <linux/vfio.h>
> >>>>+#include <asm/iommu.h>
> >>>>+
> >>>>+#define DRIVER_VERSION  "0.1"
> >>>>+#define DRIVER_AUTHOR   "aik@ozlabs.ru"
> >>>>+#define DRIVER_DESC     "VFIO IOMMU SPAPR TCE"
> >>>>+
> >>>>+static void tce_iommu_detach_group(void *iommu_data,
> >>>>+		struct iommu_group *iommu_group);
> >>>>+
> >>>>+/*
> >>>>+ * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
> >>>>+ */
> >>>>+
> >>>>+/*
> >>>>+ * The container descriptor supports only a single group per container.
> >>>>+ * Required by the API as the container is not supplied with the IOMMU group
> >>>>+ * at the moment of initialization.
> >>>>+ */
> >>>>+struct tce_container {
> >>>>+	struct mutex lock;
> >>>>+	struct iommu_table *tbl;
> >>>>+};
> >>>>+
> >>>>+static void *tce_iommu_open(unsigned long arg)
> >>>>+{
> >>>>+	struct tce_container *container;
> >>>>+
> >>>>+	if (arg != VFIO_SPAPR_TCE_IOMMU) {
> >>>>+		printk(KERN_ERR "tce_vfio: Wrong IOMMU type\n");
> >>>>+		return ERR_PTR(-EINVAL);
> >>>>+	}
> >>>>+
> >>>>+	container = kzalloc(sizeof(*container), GFP_KERNEL);
> >>>>+	if (!container)
> >>>>+		return ERR_PTR(-ENOMEM);
> >>>>+
> >>>>+	mutex_init(&container->lock);
> >>>>+
> >>>>+	return container;
> >>>>+}
> >>>>+
> >>>>+static void tce_iommu_release(void *iommu_data)
> >>>>+{
> >>>>+	struct tce_container *container = iommu_data;
> >>>>+
> >>>>+	WARN_ON(container->tbl && !container->tbl->it_group);
> >>>
> >>>I think your patch ordering is backwards here.  it_group isn't added
> >>>until 2/2.  I'd really like to see the arch/powerpc code approved and
> >>>merged by the powerpc maintainer before we add the code that makes use
> >>>of it into vfio.  Otherwise we just get lots of churn if interfaces
> >>>change or they disapprove of it altogether.
> >>
> >>
> >>Makes sense, thanks.
> >>
> >>
> >>>>+	if (container->tbl && container->tbl->it_group)
> >>>>+		tce_iommu_detach_group(iommu_data, container->tbl->it_group);
> >>>>+
> >>>>+	mutex_destroy(&container->lock);
> >>>>+
> >>>>+	kfree(container);
> >>>>+}
> >>>>+
> >>>>+static long tce_iommu_ioctl(void *iommu_data,
> >>>>+				 unsigned int cmd, unsigned long arg)
> >>>>+{
> >>>>+	struct tce_container *container = iommu_data;
> >>>>+	unsigned long minsz;
> >>>>+
> >>>>+	switch (cmd) {
> >>>>+	case VFIO_CHECK_EXTENSION: {
> >>>>+		return (arg == VFIO_SPAPR_TCE_IOMMU) ? 1 : 0;
> >>>>+	}
> >>>>+	case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
> >>>>+		struct vfio_iommu_spapr_tce_info info;
> >>>>+		struct iommu_table *tbl = container->tbl;
> >>>>+
> >>>>+		if (WARN_ON(!tbl))
> >>>>+			return -ENXIO;
> >>>>+
> >>>>+		minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
> >>>>+				dma64_window_size);
> >>>>+
> >>>>+		if (copy_from_user(&info, (void __user *)arg, minsz))
> >>>>+			return -EFAULT;
> >>>>+
> >>>>+		if (info.argsz < minsz)
> >>>>+			return -EINVAL;
> >>>>+
> >>>>+		info.dma32_window_start = tbl->it_offset << IOMMU_PAGE_SHIFT;
> >>>>+		info.dma32_window_size = tbl->it_size << IOMMU_PAGE_SHIFT;
> >>>>+		info.dma64_window_start = 0;
> >>>>+		info.dma64_window_size = 0;
> >>>>+		info.flags = 0;
> >>>>+
> >>>>+		if (copy_to_user((void __user *)arg, &info, minsz))
> >>>>+			return -EFAULT;
> >>>>+
> >>>>+		return 0;
> >>>>+	}
> >>>>+	case VFIO_IOMMU_MAP_DMA: {
> >>>>+		vfio_iommu_spapr_tce_dma_map param;
> >>>>+		struct iommu_table *tbl = container->tbl;
> >>>>+		enum dma_data_direction direction = DMA_NONE;
> >>>>+
> >>>>+		if (WARN_ON(!tbl))
> >>>>+			return -ENXIO;
> >>>>+
> >>>>+		minsz = offsetofend(vfio_iommu_spapr_tce_dma_map, size);
> >>>>+
> >>>>+		if (copy_from_user(&param, (void __user *)arg, minsz))
> >>>>+			return -EFAULT;
> >>>>+
> >>>>+		if (param.argsz < minsz)
> >>>>+			return -EINVAL;
> >>>>+
> >>>>+		if ((param.flags & VFIO_DMA_MAP_FLAG_READ) &&
> >>>>+				(param.flags & VFIO_DMA_MAP_FLAG_WRITE)) {
> >>>>+			direction = DMA_BIDIRECTIONAL;
> >>>>+		} else if (param.flags & VFIO_DMA_MAP_FLAG_READ) {
> >>>>+			direction = DMA_TO_DEVICE;
> >>>>+		} else if (param.flags & VFIO_DMA_MAP_FLAG_WRITE) {
> >>>>+			direction = DMA_FROM_DEVICE;
> >>>>+		}
> >>>>+
> >>>>+		param.size += param.iova & ~IOMMU_PAGE_MASK;
> >>>>+		param.size = _ALIGN_UP(param.size, IOMMU_PAGE_SIZE);
> >>>
> >>>On x86 we force iova, vaddr, and size to all be aligned to the smallest
> >>>page granularity of the iommu and return -EINVAL if it doesn't fit.
> >>>What does it imply to the user if they're always aligned to work here?
> >>>Won't this interface happily map overlapping entries with no indication
> >>>to the user that the previous mapping is no longer valid?
> >>>Maybe another reason why a combined unmap/map makes me nervous, we have
> >>>to assume the user knows what they're doing.
> >>
> >>
> >>I got used to guests that do know what they are doing, so I am pretty calm :)
> >>but OK, I'll move the alignment to QEMU; it makes sense.
> >>
> >>
> >>>>+
> >>>>+		return iommu_put_tces(tbl, param.iova >> IOMMU_PAGE_SHIFT,
> >>>>+				param.vaddr & IOMMU_PAGE_MASK, direction,
> >>>>+				param.size >> IOMMU_PAGE_SHIFT);
> >>>>+	}
> >>>>+	case VFIO_IOMMU_UNMAP_DMA: {
> >>>>+		vfio_iommu_spapr_tce_dma_unmap param;
> >>>>+		struct iommu_table *tbl = container->tbl;
> >>>>+
> >>>>+		if (WARN_ON(!tbl))
> >>>>+			return -ENXIO;
> >>>>+
> >>>>+		minsz = offsetofend(vfio_iommu_spapr_tce_dma_unmap, size);
> >>>>+
> >>>>+		if (copy_from_user(&param, (void __user *)arg, minsz))
> >>>>+			return -EFAULT;
> >>>>+
> >>>>+		if (param.argsz < minsz)
> >>>>+			return -EINVAL;
> >>>>+
> >>>>+		param.size += param.iova & ~IOMMU_PAGE_MASK;
> >>>>+		param.size = _ALIGN_UP(param.size, IOMMU_PAGE_SIZE);
> >>>>+
> >>>>+		return iommu_put_tces(tbl, param.iova >> IOMMU_PAGE_SHIFT,
> >>>>+				0, DMA_NONE, param.size >> IOMMU_PAGE_SHIFT);
> >>>>+	}
> >>>>+	default:
> >>>>+		printk(KERN_WARNING "tce_vfio: unexpected cmd %x\n", cmd);
> >>>
> >>>pr_warn
> >>>
> >>>>+	}
> >>>>+
> >>>>+	return -ENOTTY;
> >>>>+}
> >>>>+
> >>>>+static int tce_iommu_attach_group(void *iommu_data,
> >>>>+		struct iommu_group *iommu_group)
> >>>>+{
> >>>>+	struct tce_container *container = iommu_data;
> >>>>+	struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
> >>>>+
> >>>>+	BUG_ON(!tbl);
> >>>>+	mutex_lock(&container->lock);
> >>>>+	pr_debug("tce_vfio: Attaching group #%u to iommu %p\n",
> >>>>+			iommu_group_id(iommu_group), iommu_group);
> >>>>+	if (container->tbl) {
> >>>>+		printk(KERN_WARNING "tce_vfio: Only one group per IOMMU container is allowed, existing id=%d, attaching id=%d\n",
> >>>
> >>>pr_warn
> >>>
> >>>>+				iommu_group_id(container->tbl->it_group),
> >>>>+				iommu_group_id(iommu_group));
> >>>>+		mutex_unlock(&container->lock);
> >>>>+		return -EBUSY;
> >>>>+	}
> >>>>+
> >>>>+	container->tbl = tbl;
> >>>
> >>>Would it be too much paranoia to clear all the tce here as you do below
> >>>on detach?
> >>
> >>Guess so. I do unmap on detach(), and the guest calls put_tce(0) (i.e.
> >>unmaps) the whole DMA window at boot time.
> >
> >But that's just one user of this interface, we can't assume they'll all
> >be so agreeable.  If any tces were enabled here, a malicious user would
> >have a window to host memory, right?  Thanks,
> 
> 
> But I still release pages on detach(); how can this code not be
> called on guest exit (normal or crashed)?

I think the concern is about robustness if some bug elsewhere in the
kernel left some TCE entries in place before the table was handed over
to VFIO.
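
One way to get that robustness (a sketch; it assumes iommu_put_tces()
takes the same window-relative entry index as the unmap ioctl) would be
to clear the whole 32-bit DMA window at attach time:

	container->tbl = tbl;
	/* Drop any stale entries before handing the window to user space */
	iommu_put_tces(tbl, tbl->it_offset, 0, DMA_NONE, tbl->it_size);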

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

^ permalink raw reply	[flat|nested] 122+ messages in thread

* Re: [PATCH 1/2] vfio powerpc: implemented IOMMU driver for VFIO
  2012-11-27  4:58                 ` Alexey Kardashevskiy
@ 2012-11-27  5:07                   ` Alex Williamson
  -1 siblings, 0 replies; 122+ messages in thread
From: Alex Williamson @ 2012-11-27  5:07 UTC (permalink / raw)
  To: Alexey Kardashevskiy
  Cc: Benjamin Herrenschmidt, Paul Mackerras, David Gibson,
	linuxppc-dev, linux-kernel, kvm

On Tue, 2012-11-27 at 15:58 +1100, Alexey Kardashevskiy wrote:
> On 27/11/12 15:29, Alex Williamson wrote:
> > On Tue, 2012-11-27 at 15:06 +1100, Alexey Kardashevskiy wrote:
> >> On 27/11/12 05:20, Alex Williamson wrote:
> >>> On Fri, 2012-11-23 at 20:03 +1100, Alexey Kardashevskiy wrote:
> >>>> VFIO implements platform independent stuff such as
> >>>> a PCI driver, BAR access (via read/write on a file descriptor
> >>>> or direct mapping when possible) and IRQ signaling.
> >>>>
> >>>> The platform dependent part includes IOMMU initialization
> >>>> and handling. This patch implements an IOMMU driver for VFIO
> >>>> which does mapping/unmapping pages for the guest IO and
> >>>> provides information about DMA window (required by a POWERPC
> >>>> guest).
> >>>>
> >>>> The counterpart in QEMU is required to support this functionality.
> >>>>
> >>>> Cc: David Gibson <david@gibson.dropbear.id.au>
> >>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> >>>> ---
> >>>>    drivers/vfio/Kconfig                |    6 +
> >>>>    drivers/vfio/Makefile               |    1 +
> >>>>    drivers/vfio/vfio_iommu_spapr_tce.c |  247 +++++++++++++++++++++++++++++++++++
> >>>>    include/linux/vfio.h                |   20 +++
> >>>>    4 files changed, 274 insertions(+)
> >>>>    create mode 100644 drivers/vfio/vfio_iommu_spapr_tce.c
> >>>>
> >>>> diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
> >>>> index 7cd5dec..b464687 100644
> >>>> --- a/drivers/vfio/Kconfig
> >>>> +++ b/drivers/vfio/Kconfig
> >>>> @@ -3,10 +3,16 @@ config VFIO_IOMMU_TYPE1
> >>>>    	depends on VFIO
> >>>>    	default n
> >>>>
> >>>> +config VFIO_IOMMU_SPAPR_TCE
> >>>> +	tristate
> >>>> +	depends on VFIO && SPAPR_TCE_IOMMU
> >>>> +	default n
> >>>> +
> >>>>    menuconfig VFIO
> >>>>    	tristate "VFIO Non-Privileged userspace driver framework"
> >>>>    	depends on IOMMU_API
> >>>>    	select VFIO_IOMMU_TYPE1 if X86
> >>>> +	select VFIO_IOMMU_SPAPR_TCE if PPC_POWERNV
> >>>>    	help
> >>>>    	  VFIO provides a framework for secure userspace device drivers.
> >>>>    	  See Documentation/vfio.txt for more details.
> >>>> diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
> >>>> index 2398d4a..72bfabc 100644
> >>>> --- a/drivers/vfio/Makefile
> >>>> +++ b/drivers/vfio/Makefile
> >>>> @@ -1,3 +1,4 @@
> >>>>    obj-$(CONFIG_VFIO) += vfio.o
> >>>>    obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o
> >>>> +obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o
> >>>>    obj-$(CONFIG_VFIO_PCI) += pci/
> >>>> diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
> >>>> new file mode 100644
> >>>> index 0000000..46a6298
> >>>> --- /dev/null
> >>>> +++ b/drivers/vfio/vfio_iommu_spapr_tce.c
> >>>> @@ -0,0 +1,247 @@
> >>>> +/*
> >>>> + * VFIO: IOMMU DMA mapping support for TCE on POWER
> >>>> + *
> >>>> + * Copyright (C) 2012 IBM Corp.  All rights reserved.
> >>>> + *     Author: Alexey Kardashevskiy <aik@ozlabs.ru>
> >>>> + *
> >>>> + * This program is free software; you can redistribute it and/or modify
> >>>> + * it under the terms of the GNU General Public License version 2 as
> >>>> + * published by the Free Software Foundation.
> >>>> + *
> >>>> + * Derived from original vfio_iommu_type1.c:
> >>>> + * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
> >>>> + *     Author: Alex Williamson <alex.williamson@redhat.com>
> >>>> + */
> >>>> +
> >>>> +#include <linux/module.h>
> >>>> +#include <linux/pci.h>
> >>>> +#include <linux/slab.h>
> >>>> +#include <linux/uaccess.h>
> >>>> +#include <linux/err.h>
> >>>> +#include <linux/vfio.h>
> >>>> +#include <asm/iommu.h>
> >>>> +
> >>>> +#define DRIVER_VERSION  "0.1"
> >>>> +#define DRIVER_AUTHOR   "aik@ozlabs.ru"
> >>>> +#define DRIVER_DESC     "VFIO IOMMU SPAPR TCE"
> >>>> +
> >>>> +static void tce_iommu_detach_group(void *iommu_data,
> >>>> +		struct iommu_group *iommu_group);
> >>>> +
> >>>> +/*
> >>>> + * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
> >>>> + */
> >>>> +
> >>>> +/*
> >>>> + * The container descriptor supports only a single group per container.
> >>>> + * Required by the API as the container is not supplied with the IOMMU group
> >>>> + * at the moment of initialization.
> >>>> + */
> >>>> +struct tce_container {
> >>>> +	struct mutex lock;
> >>>> +	struct iommu_table *tbl;
> >>>> +};
> >>>> +
> >>>> +static void *tce_iommu_open(unsigned long arg)
> >>>> +{
> >>>> +	struct tce_container *container;
> >>>> +
> >>>> +	if (arg != VFIO_SPAPR_TCE_IOMMU) {
> >>>> +		printk(KERN_ERR "tce_vfio: Wrong IOMMU type\n");
> >>>> +		return ERR_PTR(-EINVAL);
> >>>> +	}
> >>>> +
> >>>> +	container = kzalloc(sizeof(*container), GFP_KERNEL);
> >>>> +	if (!container)
> >>>> +		return ERR_PTR(-ENOMEM);
> >>>> +
> >>>> +	mutex_init(&container->lock);
> >>>> +
> >>>> +	return container;
> >>>> +}
> >>>> +
> >>>> +static void tce_iommu_release(void *iommu_data)
> >>>> +{
> >>>> +	struct tce_container *container = iommu_data;
> >>>> +
> >>>> +	WARN_ON(container->tbl && !container->tbl->it_group);
> >>>
> >>> I think your patch ordering is backwards here.  it_group isn't added
> >>> until 2/2.  I'd really like to see the arch/powerpc code approved and
> >>> merged by the powerpc maintainer before we add the code that makes use
> >>> of it into vfio.  Otherwise we just get lots of churn if interfaces
> >>> change or they disapprove of it altogether.
> >>
> >>
> >> Makes sense, thanks.
> >>
> >>
> >>>> +	if (container->tbl && container->tbl->it_group)
> >>>> +		tce_iommu_detach_group(iommu_data, container->tbl->it_group);
> >>>> +
> >>>> +	mutex_destroy(&container->lock);
> >>>> +
> >>>> +	kfree(container);
> >>>> +}
> >>>> +
> >>>> +static long tce_iommu_ioctl(void *iommu_data,
> >>>> +				 unsigned int cmd, unsigned long arg)
> >>>> +{
> >>>> +	struct tce_container *container = iommu_data;
> >>>> +	unsigned long minsz;
> >>>> +
> >>>> +	switch (cmd) {
> >>>> +	case VFIO_CHECK_EXTENSION: {
> >>>> +		return (arg == VFIO_SPAPR_TCE_IOMMU) ? 1 : 0;
> >>>> +	}
> >>>> +	case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
> >>>> +		struct vfio_iommu_spapr_tce_info info;
> >>>> +		struct iommu_table *tbl = container->tbl;
> >>>> +
> >>>> +		if (WARN_ON(!tbl))
> >>>> +			return -ENXIO;
> >>>> +
> >>>> +		minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
> >>>> +				dma64_window_size);
> >>>> +
> >>>> +		if (copy_from_user(&info, (void __user *)arg, minsz))
> >>>> +			return -EFAULT;
> >>>> +
> >>>> +		if (info.argsz < minsz)
> >>>> +			return -EINVAL;
> >>>> +
> >>>> +		info.dma32_window_start = tbl->it_offset << IOMMU_PAGE_SHIFT;
> >>>> +		info.dma32_window_size = tbl->it_size << IOMMU_PAGE_SHIFT;
> >>>> +		info.dma64_window_start = 0;
> >>>> +		info.dma64_window_size = 0;
> >>>> +		info.flags = 0;
> >>>> +
> >>>> +		if (copy_to_user((void __user *)arg, &info, minsz))
> >>>> +			return -EFAULT;
> >>>> +
> >>>> +		return 0;
> >>>> +	}
> >>>> +	case VFIO_IOMMU_MAP_DMA: {
> >>>> +		vfio_iommu_spapr_tce_dma_map param;
> >>>> +		struct iommu_table *tbl = container->tbl;
> >>>> +		enum dma_data_direction direction = DMA_NONE;
> >>>> +
> >>>> +		if (WARN_ON(!tbl))
> >>>> +			return -ENXIO;
> >>>> +
> >>>> +		minsz = offsetofend(vfio_iommu_spapr_tce_dma_map, size);
> >>>> +
> >>>> +		if (copy_from_user(&param, (void __user *)arg, minsz))
> >>>> +			return -EFAULT;
> >>>> +
> >>>> +		if (param.argsz < minsz)
> >>>> +			return -EINVAL;
> >>>> +
> >>>> +		if ((param.flags & VFIO_DMA_MAP_FLAG_READ) &&
> >>>> +				(param.flags & VFIO_DMA_MAP_FLAG_WRITE)) {
> >>>> +			direction = DMA_BIDIRECTIONAL;
> >>>> +		} else if (param.flags & VFIO_DMA_MAP_FLAG_READ) {
> >>>> +			direction = DMA_TO_DEVICE;
> >>>> +		} else if (param.flags & VFIO_DMA_MAP_FLAG_WRITE) {
> >>>> +			direction = DMA_FROM_DEVICE;
> >>>> +		}
> >>>> +
> >>>> +		param.size += param.iova & ~IOMMU_PAGE_MASK;
> >>>> +		param.size = _ALIGN_UP(param.size, IOMMU_PAGE_SIZE);
> >>>
> >>> On x86 we force iova, vaddr, and size to all be aligned to the smallest
> >>> page granularity of the iommu and return -EINVAL if it doesn't fit.
> >>> What does it imply to the user if they're always aligned to work here?
> >>> Won't this interface happily map overlapping entries with no indication
> >>> to the user that the previous mapping is no longer valid?
> >>> Maybe another reason why a combined unmap/map makes me nervous, we have
> >>> to assume the user knows what they're doing.
> >>
> >>
> >> I got used to guests which do know what they are doing so I am pretty calm :)
> >> but ok, I'll move alignment to the QEMU, it makes sense.
> >>
> >>
> >>>> +
> >>>> +		return iommu_put_tces(tbl, param.iova >> IOMMU_PAGE_SHIFT,
> >>>> +				param.vaddr & IOMMU_PAGE_MASK, direction,
> >>>> +				param.size >> IOMMU_PAGE_SHIFT);
> >>>> +	}
> >>>> +	case VFIO_IOMMU_UNMAP_DMA: {
> >>>> +		vfio_iommu_spapr_tce_dma_unmap param;
> >>>> +		struct iommu_table *tbl = container->tbl;
> >>>> +
> >>>> +		if (WARN_ON(!tbl))
> >>>> +			return -ENXIO;
> >>>> +
> >>>> +		minsz = offsetofend(vfio_iommu_spapr_tce_dma_unmap, size);
> >>>> +
> >>>> +		if (copy_from_user(&param, (void __user *)arg, minsz))
> >>>> +			return -EFAULT;
> >>>> +
> >>>> +		if (param.argsz < minsz)
> >>>> +			return -EINVAL;
> >>>> +
> >>>> +		param.size += param.iova & ~IOMMU_PAGE_MASK;
> >>>> +		param.size = _ALIGN_UP(param.size, IOMMU_PAGE_SIZE);
> >>>> +
> >>>> +		return iommu_put_tces(tbl, param.iova >> IOMMU_PAGE_SHIFT,
> >>>> +				0, DMA_NONE, param.size >> IOMMU_PAGE_SHIFT);
> >>>> +	}
> >>>> +	default:
> >>>> +		printk(KERN_WARNING "tce_vfio: unexpected cmd %x\n", cmd);
> >>>
> >>> pr_warn
> >>>
> >>>> +	}
> >>>> +
> >>>> +	return -ENOTTY;
> >>>> +}
> >>>> +
> >>>> +static int tce_iommu_attach_group(void *iommu_data,
> >>>> +		struct iommu_group *iommu_group)
> >>>> +{
> >>>> +	struct tce_container *container = iommu_data;
> >>>> +	struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
> >>>> +
> >>>> +	BUG_ON(!tbl);
> >>>> +	mutex_lock(&container->lock);
> >>>> +	pr_debug("tce_vfio: Attaching group #%u to iommu %p\n",
> >>>> +			iommu_group_id(iommu_group), iommu_group);
> >>>> +	if (container->tbl) {
> >>>> +		printk(KERN_WARNING "tce_vfio: Only one group per IOMMU container is allowed, existing id=%d, attaching id=%d\n",
> >>>
> >>> pr_warn
> >>>
> >>>> +				iommu_group_id(container->tbl->it_group),
> >>>> +				iommu_group_id(iommu_group));
> >>>> +		mutex_unlock(&container->lock);
> >>>> +		return -EBUSY;
> >>>> +	}
> >>>> +
> >>>> +	container->tbl = tbl;
> >>>
> >>> Would it be too much paranoia to clear all the tce here as you do below
> >>> on detach?
> >>
> >> Guess so. I do unmap on detach(), and the guest calls put_tce(0) on (i.e.
> >> unmaps) the whole DMA window at boot time.
> >
> > But that's just one user of this interface, we can't assume they'll all
> > be so agreeable.  If any tces were enabled here, a malicious user would
> > have a window to host memory, right?  Thanks,
> 
> 
> But I still release pages on detach(); how could this code not be called on
> guest exit (normal or crashed)?

What's the initial state?  You leave it clean, but who came before you?
Thanks,

Alex


^ permalink raw reply	[flat|nested] 122+ messages in thread

* [PATCH] vfio powerpc: enabled on powernv platform
  2012-11-27  4:41           ` Alex Williamson
@ 2012-11-28  7:18             ` Alexey Kardashevskiy
  -1 siblings, 0 replies; 122+ messages in thread
From: Alexey Kardashevskiy @ 2012-11-28  7:18 UTC (permalink / raw)
  To: Alex Williamson
  Cc: Alexey Kardashevskiy, Benjamin Herrenschmidt, Paul Mackerras,
	linuxppc-dev, linux-kernel, David Gibson

This patch initializes IOMMU groups based on the IOMMU
configuration discovered during the PCI scan on the POWERNV
(POWER non-virtualized) platform. The IOMMU groups are
to be used later by the VFIO driver (PCI pass-through).

It also implements an API for mapping/unmapping pages for
guest PCI drivers and for providing DMA window properties.
This API is going to be used later by QEMU-VFIO to handle
h_put_tce hypercalls from the KVM guest.

Although this driver has been tested only on the POWERNV
platform, it should work on any platform which supports
TCE tables.

To enable VFIO on POWER, enable SPAPR_TCE_IOMMU config
option and configure VFIO as required.
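
For reference, the VFIO driver posted separately is expected to drive
these helpers roughly as follows (illustrative sketch only, not part
of this patch; "ioba" is an IO bus address inside the DMA window and
"uaddr" a page-aligned user address, both hypothetical names):

	/* Map npages IOMMU pages of user memory at ioba */
	ret = iommu_put_tces(tbl, ioba >> IOMMU_PAGE_SHIFT,
			(uint64_t) uaddr, DMA_BIDIRECTIONAL, npages);
	/* returns npages on success, a negative errno on failure */

	/* Later, unmap and unpin the same range */
	iommu_clear_tces(tbl, ioba >> IOMMU_PAGE_SHIFT, npages);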

Cc: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
 arch/powerpc/include/asm/iommu.h     |    9 +++
 arch/powerpc/kernel/iommu.c          |  147 ++++++++++++++++++++++++++++++++++
 arch/powerpc/platforms/powernv/pci.c |  135 +++++++++++++++++++++++++++++++
 drivers/iommu/Kconfig                |    8 ++
 4 files changed, 299 insertions(+)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index cbfe678..5c7087a 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -76,6 +76,9 @@ struct iommu_table {
 	struct iommu_pool large_pool;
 	struct iommu_pool pools[IOMMU_NR_POOLS];
 	unsigned long *it_map;       /* A simple allocation bitmap for now */
+#ifdef CONFIG_IOMMU_API
+	struct iommu_group *it_group;
+#endif
 };
 
 struct scatterlist;
@@ -147,5 +150,11 @@ static inline void iommu_restore(void)
 }
 #endif
 
+extern long iommu_clear_tces(struct iommu_table *tbl, unsigned long entry,
+		unsigned long pages);
+extern long iommu_put_tces(struct iommu_table *tbl, unsigned long entry,
+		uint64_t tce, enum dma_data_direction direction,
+		unsigned long pages);
+
 #endif /* __KERNEL__ */
 #endif /* _ASM_IOMMU_H */
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index ff5a6ce..1456b6e 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -44,6 +44,7 @@
 #include <asm/kdump.h>
 #include <asm/fadump.h>
 #include <asm/vio.h>
+#include <asm/tce.h>
 
 #define DBG(...)
 
@@ -856,3 +857,149 @@ void iommu_free_coherent(struct iommu_table *tbl, size_t size,
 		free_pages((unsigned long)vaddr, get_order(size));
 	}
 }
+
+#ifdef CONFIG_IOMMU_API
+/*
+ * SPAPR TCE API
+ */
+static void tce_flush(struct iommu_table *tbl)
+{
+	/* Flush/invalidate TLB caches if necessary */
+	if (ppc_md.tce_flush)
+		ppc_md.tce_flush(tbl);
+
+	/* Make sure updates are seen by hardware */
+	mb();
+}
+
+/*
+ * clear_tces_nolock clears TCEs and returns the number of pages
+ * it called put_page() on.
+ */
+static long clear_tces_nolock(struct iommu_table *tbl, unsigned long entry,
+		unsigned long pages)
+{
+	int i, pages_put = 0;
+	unsigned long oldtce;
+	struct page *page;
+
+	for (i = 0; i < pages; ++i) {
+		oldtce = ppc_md.tce_get(tbl, entry + i);
+		ppc_md.tce_free(tbl, entry + i, 1);
+
+		if (!(oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)))
+			continue;
+
+		page = pfn_to_page(oldtce >> PAGE_SHIFT);
+
+		WARN_ON(!page);
+		if (!page)
+			continue;
+
+		if (oldtce & TCE_PCI_WRITE)
+			SetPageDirty(page);
+
+		++pages_put;
+		put_page(page);
+	}
+
+	return pages_put;
+}
+
+/*
+ * iommu_clear_tces clears TCEs and returns the number of released pages
+ */
+long iommu_clear_tces(struct iommu_table *tbl, unsigned long entry,
+		unsigned long pages)
+{
+	int ret;
+	struct iommu_pool *pool = get_pool(tbl, entry);
+
+	spin_lock(&(pool->lock));
+	ret = clear_tces_nolock(tbl, entry, pages);
+	tce_flush(tbl);
+	spin_unlock(&(pool->lock));
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_clear_tces);
+
+static int put_tce(struct iommu_table *tbl, unsigned long entry,
+		uint64_t tce, enum dma_data_direction direction)
+{
+	int ret;
+	struct page *page = NULL;
+	unsigned long kva, offset;
+
+	/* Map new TCE */
+	offset = (tce & IOMMU_PAGE_MASK) - (tce & PAGE_MASK);
+
+	ret = get_user_pages_fast(tce & PAGE_MASK, 1,
+			direction != DMA_TO_DEVICE, &page);
+	if (ret < 1) {
+		printk(KERN_ERR "tce_vfio: get_user_pages_fast failed tce=%llx ioba=%lx ret=%d\n",
+				tce, entry << IOMMU_PAGE_SHIFT, ret);
+		if (!ret)
+			ret = -EFAULT;
+		return ret;
+	}
+
+	kva = (unsigned long) page_address(page);
+	kva += offset;
+
+	/* tce_build receives a virtual address */
+	entry += tbl->it_offset; /* Offset into real TCE table */
+	ret = ppc_md.tce_build(tbl, entry, 1, kva, direction, NULL);
+
+	/* tce_build() only returns non-zero for transient errors */
+	if (unlikely(ret)) {
+		printk(KERN_ERR "tce_vfio: tce_put failed on tce=%llx ioba=%lx kva=%lx ret=%d\n",
+				tce, entry << IOMMU_PAGE_SHIFT, kva, ret);
+		put_page(page);
+		return -EIO;
+	}
+
+	return 0;
+}
+
+/*
+ * iommu_put_tces builds TCEs and returns the number of pages actually locked
+ */
+long iommu_put_tces(struct iommu_table *tbl, unsigned long entry,
+		uint64_t tce, enum dma_data_direction direction,
+		unsigned long pages)
+{
+	int i, ret = 0;
+	struct iommu_pool *pool = get_pool(tbl, entry);
+
+	BUILD_BUG_ON(PAGE_SIZE < IOMMU_PAGE_SIZE);
+	BUG_ON(direction == DMA_NONE);
+
+	spin_lock(&(pool->lock));
+
+	/* Check if any is in use */
+	for (i = 0; i < pages; ++i) {
+		unsigned long oldtce = ppc_md.tce_get(tbl, entry + i);
+		if (oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)) {
+			spin_unlock(&(pool->lock));
+			return -EBUSY;
+		}
+	}
+
+	/* Put tces to the table */
+	for (i = 0; (i < pages) && !ret; ++i, tce += IOMMU_PAGE_SIZE)
+		ret = put_tce(tbl, entry + i, tce, direction);
+
+	/* On failure release the locked pages, otherwise return the number of pages */
+	if (ret)
+		clear_tces_nolock(tbl, entry, i);
+	else
+		ret = pages;
+
+	tce_flush(tbl);
+	spin_unlock(&(pool->lock));
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_put_tces);
+#endif /* CONFIG_IOMMU_API */
diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index 05205cf..21250ef 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -20,6 +20,7 @@
 #include <linux/irq.h>
 #include <linux/io.h>
 #include <linux/msi.h>
+#include <linux/iommu.h>
 
 #include <asm/sections.h>
 #include <asm/io.h>
@@ -613,3 +614,137 @@ void __init pnv_pci_init(void)
 	ppc_md.teardown_msi_irqs = pnv_teardown_msi_irqs;
 #endif
 }
+
+#ifdef CONFIG_IOMMU_API
+/*
+ * IOMMU groups support required by VFIO
+ */
+static int add_device(struct device *dev)
+{
+	struct iommu_table *tbl;
+	int ret = 0;
+
+	if (WARN_ON(dev->iommu_group)) {
+		printk(KERN_WARNING "tce_vfio: device %s is already in iommu group %d, skipping\n",
+				dev_name(dev),
+				iommu_group_id(dev->iommu_group));
+		return -EBUSY;
+	}
+
+	tbl = get_iommu_table_base(dev);
+	if (!tbl) {
+		pr_debug("tce_vfio: skipping device %s with no tbl\n",
+				dev_name(dev));
+		return 0;
+	}
+
+	pr_debug("tce_vfio: adding %s to iommu group %d\n",
+			dev_name(dev), iommu_group_id(tbl->it_group));
+
+	ret = iommu_group_add_device(tbl->it_group, dev);
+	if (ret < 0)
+		printk(KERN_ERR "tce_vfio: %s has not been added, ret=%d\n",
+				dev_name(dev), ret);
+
+	return ret;
+}
+
+static void del_device(struct device *dev)
+{
+	iommu_group_remove_device(dev);
+}
+
+static int iommu_bus_notifier(struct notifier_block *nb,
+			      unsigned long action, void *data)
+{
+	struct device *dev = data;
+
+	switch (action) {
+	case BUS_NOTIFY_ADD_DEVICE:
+		return add_device(dev);
+	case BUS_NOTIFY_DEL_DEVICE:
+		del_device(dev);
+		return 0;
+	default:
+		return 0;
+	}
+}
+
+static struct notifier_block tce_iommu_bus_nb = {
+	.notifier_call = iommu_bus_notifier,
+};
+
+static void group_release(void *iommu_data)
+{
+	struct iommu_table *tbl = iommu_data;
+	tbl->it_group = NULL;
+}
+
+static int __init tce_iommu_init(void)
+{
+	struct pci_dev *pdev = NULL;
+	struct iommu_table *tbl;
+	struct iommu_group *grp;
+
+	/* Allocate and initialize IOMMU groups */
+	for_each_pci_dev(pdev) {
+		tbl = get_iommu_table_base(&pdev->dev);
+		if (!tbl)
+			continue;
+
+		/* Skip already initialized */
+		if (tbl->it_group)
+			continue;
+
+		grp = iommu_group_alloc();
+		if (IS_ERR(grp)) {
+			printk(KERN_INFO "tce_vfio: cannot create "
+					"new IOMMU group, ret=%ld\n",
+					PTR_ERR(grp));
+			return PTR_ERR(grp);
+		}
+		tbl->it_group = grp;
+		iommu_group_set_iommudata(grp, tbl, group_release);
+	}
+
+	bus_register_notifier(&pci_bus_type, &tce_iommu_bus_nb);
+
+	/* Add PCI devices to VFIO groups */
+	for_each_pci_dev(pdev)
+		add_device(&pdev->dev);
+
+	return 0;
+}
+
+static void __exit tce_iommu_cleanup(void)
+{
+	struct pci_dev *pdev = NULL;
+	struct iommu_table *tbl;
+	struct iommu_group *grp = NULL;
+
+	bus_unregister_notifier(&pci_bus_type, &tce_iommu_bus_nb);
+
+	/* Delete PCI devices from VFIO groups */
+	for_each_pci_dev(pdev)
+		del_device(&pdev->dev);
+
+	/* Release VFIO groups */
+	for_each_pci_dev(pdev) {
+		tbl = get_iommu_table_base(&pdev->dev);
+		if (!tbl)
+			continue;
+		grp = tbl->it_group;
+
+		/* Skip (already) uninitialized */
+		if (!grp)
+			continue;
+
+		/* Do actual release, group_release() is expected to work */
+		iommu_group_put(grp);
+		BUG_ON(tbl->it_group);
+	}
+}
+
+module_init(tce_iommu_init);
+module_exit(tce_iommu_cleanup);
+#endif /* CONFIG_IOMMU_API */
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index 9f69b56..29d11dc 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -187,4 +187,12 @@ config EXYNOS_IOMMU_DEBUG
 
 	  Say N unless you need kernel log message for IOMMU debugging
 
+config SPAPR_TCE_IOMMU
+	bool "sPAPR TCE IOMMU Support"
+	depends on PPC_POWERNV
+	select IOMMU_API
+	help
+	  Enables the bits of the IOMMU API required by VFIO. The iommu_ops
+	  callbacks are not implemented yet.
+
 endif # IOMMU_SUPPORT
-- 
1.7.10.4


^ permalink raw reply related	[flat|nested] 122+ messages in thread

* [PATCH] vfio powerpc: implemented IOMMU driver for VFIO
  2012-11-27  5:07                   ` Alex Williamson
@ 2012-11-28  7:21                     ` Alexey Kardashevskiy
  -1 siblings, 0 replies; 122+ messages in thread
From: Alexey Kardashevskiy @ 2012-11-28  7:21 UTC (permalink / raw)
  To: Alex Williamson
  Cc: Alexey Kardashevskiy, Benjamin Herrenschmidt, Paul Mackerras,
	linuxppc-dev, linux-kernel, kvm, David Gibson

VFIO implements platform-independent functionality such as
a PCI driver, BAR access (via read/write on a file descriptor
or direct mapping when possible) and IRQ signaling.

The platform-dependent part includes IOMMU initialization
and handling. This patch implements an IOMMU driver for VFIO
which maps and unmaps pages for guest IO and provides
information about the DMA window (required by a POWERPC
guest).

The counterpart in QEMU is required to support this functionality.
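
For illustration, a minimal userspace consumer would use the new
ioctls roughly like this (sketch only: error handling is omitted,
"container_fd" is an open VFIO container fd already set to the
VFIO_SPAPR_TCE_IOMMU type, and "buf" is assumed to be aligned to the
4K IOMMU page size):

	struct vfio_iommu_spapr_tce_info info = { .argsz = sizeof(info) };
	vfio_iommu_spapr_tce_dma_map map = { .argsz = sizeof(map) };

	/* Discover the 32bit DMA window of the group's TCE table */
	ioctl(container_fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);

	/* Map one IOMMU page of buf at the start of the window */
	map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
	map.iova  = info.dma32_window_start;
	map.vaddr = (uint64_t) (unsigned long) buf;
	map.size  = 4096;
	ioctl(container_fd, VFIO_IOMMU_MAP_DMA, &map);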

Cc: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
 drivers/vfio/Kconfig                |    6 +
 drivers/vfio/Makefile               |    1 +
 drivers/vfio/vfio_iommu_spapr_tce.c |  332 +++++++++++++++++++++++++++++++++++
 include/linux/vfio.h                |   33 ++++
 4 files changed, 372 insertions(+)
 create mode 100644 drivers/vfio/vfio_iommu_spapr_tce.c

diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
index 7cd5dec..b464687 100644
--- a/drivers/vfio/Kconfig
+++ b/drivers/vfio/Kconfig
@@ -3,10 +3,16 @@ config VFIO_IOMMU_TYPE1
 	depends on VFIO
 	default n
 
+config VFIO_IOMMU_SPAPR_TCE
+	tristate
+	depends on VFIO && SPAPR_TCE_IOMMU
+	default n
+
 menuconfig VFIO
 	tristate "VFIO Non-Privileged userspace driver framework"
 	depends on IOMMU_API
 	select VFIO_IOMMU_TYPE1 if X86
+	select VFIO_IOMMU_SPAPR_TCE if PPC_POWERNV
 	help
 	  VFIO provides a framework for secure userspace device drivers.
 	  See Documentation/vfio.txt for more details.
diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
index 2398d4a..72bfabc 100644
--- a/drivers/vfio/Makefile
+++ b/drivers/vfio/Makefile
@@ -1,3 +1,4 @@
 obj-$(CONFIG_VFIO) += vfio.o
 obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o
+obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o
 obj-$(CONFIG_VFIO_PCI) += pci/
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
new file mode 100644
index 0000000..b98770e
--- /dev/null
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -0,0 +1,332 @@
+/*
+ * VFIO: IOMMU DMA mapping support for TCE on POWER
+ *
+ * Copyright (C) 2012 IBM Corp.  All rights reserved.
+ *     Author: Alexey Kardashevskiy <aik@ozlabs.ru>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Derived from original vfio_iommu_type1.c:
+ * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
+ *     Author: Alex Williamson <alex.williamson@redhat.com>
+ */
+
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/err.h>
+#include <linux/vfio.h>
+#include <asm/iommu.h>
+
+#define DRIVER_VERSION  "0.1"
+#define DRIVER_AUTHOR   "aik@ozlabs.ru"
+#define DRIVER_DESC     "VFIO IOMMU SPAPR TCE"
+
+static void tce_iommu_detach_group(void *iommu_data,
+		struct iommu_group *iommu_group);
+
+/*
+ * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
+ */
+
+/*
+ * This code handles mapping and unmapping of user data buffers
+ * into DMA'ble space using the IOMMU
+ */
+
+#define NPAGE_TO_SIZE(npage)	((size_t)(npage) << PAGE_SHIFT)
+
+struct vwork {
+	struct mm_struct	*mm;
+	long			npage;
+	struct work_struct	work;
+};
+
+/* delayed decrement/increment for locked_vm */
+static void lock_acct_bg(struct work_struct *work)
+{
+	struct vwork *vwork = container_of(work, struct vwork, work);
+	struct mm_struct *mm;
+
+	mm = vwork->mm;
+	down_write(&mm->mmap_sem);
+	mm->locked_vm += vwork->npage;
+	up_write(&mm->mmap_sem);
+	mmput(mm);
+	kfree(vwork);
+}
+
+static void lock_acct(long npage)
+{
+	struct vwork *vwork;
+	struct mm_struct *mm;
+
+	if (!current->mm)
+		return; /* process exited */
+
+	if (down_write_trylock(&current->mm->mmap_sem)) {
+		current->mm->locked_vm += npage;
+		up_write(&current->mm->mmap_sem);
+		return;
+	}
+
+	/*
+	 * Couldn't get the mmap_sem lock, so we must set up to update
+	 * mm->locked_vm later. If locked_vm were atomic, we
+	 * wouldn't need this silliness.
+	 */
+	vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL);
+	if (!vwork)
+		return;
+	mm = get_task_mm(current);
+	if (!mm) {
+		kfree(vwork);
+		return;
+	}
+	INIT_WORK(&vwork->work, lock_acct_bg);
+	vwork->mm = mm;
+	vwork->npage = npage;
+	schedule_work(&vwork->work);
+}
+
+/*
+ * The container descriptor supports only a single group per container.
+ * This is required by the API as the container is not supplied with the
+ * IOMMU group at the moment of initialization.
+ */
+struct tce_container {
+	struct mutex lock;
+	struct iommu_table *tbl;
+};
+
+static void *tce_iommu_open(unsigned long arg)
+{
+	struct tce_container *container;
+
+	if (arg != VFIO_SPAPR_TCE_IOMMU) {
+		pr_err("tce_vfio: Wrong IOMMU type\n");
+		return ERR_PTR(-EINVAL);
+	}
+
+	container = kzalloc(sizeof(*container), GFP_KERNEL);
+	if (!container)
+		return ERR_PTR(-ENOMEM);
+
+	mutex_init(&container->lock);
+
+	return container;
+}
+
+static void tce_iommu_release(void *iommu_data)
+{
+	struct tce_container *container = iommu_data;
+
+	WARN_ON(container->tbl && !container->tbl->it_group);
+	if (container->tbl && container->tbl->it_group)
+		tce_iommu_detach_group(iommu_data, container->tbl->it_group);
+
+	mutex_destroy(&container->lock);
+
+	kfree(container);
+}
+
+static long tce_iommu_ioctl(void *iommu_data,
+				 unsigned int cmd, unsigned long arg)
+{
+	struct tce_container *container = iommu_data;
+	unsigned long minsz;
+	long ret;
+
+	switch (cmd) {
+	case VFIO_CHECK_EXTENSION: {
+		return (arg == VFIO_SPAPR_TCE_IOMMU) ? 1 : 0;
+	}
+	case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
+		struct vfio_iommu_spapr_tce_info info;
+		struct iommu_table *tbl = container->tbl;
+
+		if (WARN_ON(!tbl))
+			return -ENXIO;
+
+		minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
+				dma64_window_size);
+
+		if (copy_from_user(&info, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (info.argsz < minsz)
+			return -EINVAL;
+
+		info.dma32_window_start = tbl->it_offset << IOMMU_PAGE_SHIFT;
+		info.dma32_window_size = tbl->it_size << IOMMU_PAGE_SHIFT;
+		info.dma64_window_start = 0;
+		info.dma64_window_size = 0;
+		info.flags = 0;
+
+		if (copy_to_user((void __user *)arg, &info, minsz))
+			return -EFAULT;
+
+		return 0;
+	}
+	case VFIO_IOMMU_MAP_DMA: {
+		vfio_iommu_spapr_tce_dma_map param;
+		struct iommu_table *tbl = container->tbl;
+		enum dma_data_direction direction;
+		unsigned long locked, lock_limit;
+
+		if (WARN_ON(!tbl))
+			return -ENXIO;
+
+		minsz = offsetofend(vfio_iommu_spapr_tce_dma_map, size);
+
+		if (copy_from_user(&param, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (param.argsz < minsz)
+			return -EINVAL;
+
+		if ((param.flags & VFIO_DMA_MAP_FLAG_READ) &&
+				(param.flags & VFIO_DMA_MAP_FLAG_WRITE))
+			direction = DMA_BIDIRECTIONAL;
+		else if (param.flags & VFIO_DMA_MAP_FLAG_READ)
+			direction = DMA_TO_DEVICE;
+		else if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
+			direction = DMA_FROM_DEVICE;
+		else
+			return -EINVAL;
+
+		if ((param.size & ~IOMMU_PAGE_MASK) ||
+				(param.iova & ~IOMMU_PAGE_MASK) ||
+				(param.vaddr & ~IOMMU_PAGE_MASK))
+			return -EINVAL;
+
+		/* Account for locked pages */
+		locked = current->mm->locked_vm +
+				(param.size >> IOMMU_PAGE_SHIFT);
+		lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+		if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
+			pr_warn("RLIMIT_MEMLOCK (%ld) exceeded\n",
+					rlimit(RLIMIT_MEMLOCK));
+			return -ENOMEM;
+		}
+
+		ret = iommu_put_tces(tbl, param.iova >> IOMMU_PAGE_SHIFT,
+				param.vaddr, direction,
+				param.size >> IOMMU_PAGE_SHIFT);
+		if (ret > 0)
+			lock_acct(ret);
+
+		return ret;
+	}
+	case VFIO_IOMMU_UNMAP_DMA: {
+		vfio_iommu_spapr_tce_dma_unmap param;
+		struct iommu_table *tbl = container->tbl;
+
+		if (WARN_ON(!tbl))
+			return -ENXIO;
+
+		minsz = offsetofend(vfio_iommu_spapr_tce_dma_unmap, size);
+
+		if (copy_from_user(&param, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (param.argsz < minsz)
+			return -EINVAL;
+
+		if ((param.size & ~IOMMU_PAGE_MASK) ||
+				(param.iova & ~IOMMU_PAGE_MASK))
+			return -EINVAL;
+
+		ret = iommu_clear_tces(tbl, param.iova >> IOMMU_PAGE_SHIFT,
+				param.size >> IOMMU_PAGE_SHIFT);
+		if (ret > 0)
+			lock_acct(-ret);
+
+		return ret;
+	}
+	default:
+		pr_warn("tce_vfio: unexpected cmd %x\n", cmd);
+	}
+
+	return -ENOTTY;
+}
+
+static int tce_iommu_attach_group(void *iommu_data,
+		struct iommu_group *iommu_group)
+{
+	struct tce_container *container = iommu_data;
+	struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
+
+	BUG_ON(!tbl);
+	mutex_lock(&container->lock);
+	pr_debug("tce_vfio: Attaching group #%u to iommu %p\n",
+			iommu_group_id(iommu_group), iommu_group);
+	if (container->tbl) {
+		pr_warn("tce_vfio: Only one group per IOMMU container is allowed, existing id=%d, attaching id=%d\n",
+				iommu_group_id(container->tbl->it_group),
+				iommu_group_id(iommu_group));
+		mutex_unlock(&container->lock);
+		return -EBUSY;
+	}
+
+	container->tbl = tbl;
+	iommu_clear_tces(tbl, tbl->it_offset, tbl->it_size);
+	mutex_unlock(&container->lock);
+
+	return 0;
+}
+
+static void tce_iommu_detach_group(void *iommu_data,
+		struct iommu_group *iommu_group)
+{
+	struct tce_container *container = iommu_data;
+	struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
+
+	BUG_ON(!tbl);
+	mutex_lock(&container->lock);
+	if (tbl != container->tbl) {
+		pr_warn("tce_vfio: detaching group #%u, expected group is #%u\n",
+				iommu_group_id(iommu_group),
+				iommu_group_id(tbl->it_group));
+	} else {
+
+		pr_debug("tce_vfio: detaching group #%u from iommu %p\n",
+				iommu_group_id(iommu_group), iommu_group);
+
+		iommu_clear_tces(tbl, tbl->it_offset, tbl->it_size);
+		container->tbl = NULL;
+	}
+	mutex_unlock(&container->lock);
+}
+
+const struct vfio_iommu_driver_ops tce_iommu_driver_ops = {
+	.name		= "iommu-vfio-powerpc",
+	.owner		= THIS_MODULE,
+	.open		= tce_iommu_open,
+	.release	= tce_iommu_release,
+	.ioctl		= tce_iommu_ioctl,
+	.attach_group	= tce_iommu_attach_group,
+	.detach_group	= tce_iommu_detach_group,
+};
+
+static int __init tce_iommu_init(void)
+{
+	return vfio_register_iommu_driver(&tce_iommu_driver_ops);
+}
+
+static void __exit tce_iommu_cleanup(void)
+{
+	vfio_unregister_iommu_driver(&tce_iommu_driver_ops);
+}
+
+module_init(tce_iommu_init);
+module_exit(tce_iommu_cleanup);
+
+MODULE_VERSION(DRIVER_VERSION);
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR(DRIVER_AUTHOR);
+MODULE_DESCRIPTION(DRIVER_DESC);
+
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index 0a4f180..820af1e 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -99,6 +99,7 @@ extern void vfio_unregister_iommu_driver(
 /* Extensions */
 
 #define VFIO_TYPE1_IOMMU		1
+#define VFIO_SPAPR_TCE_IOMMU		2
 
 /*
  * The IOCTL interface is designed for extensibility by embedding the
@@ -442,4 +443,36 @@ struct vfio_iommu_type1_dma_unmap {
 
 #define VFIO_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14)
 
+/* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */
+
+/*
+ * The SPAPR TCE info struct provides information about the PCI bus
+ * address ranges available for DMA. These values are programmed into
+ * the hardware, so the guest has to know them.
+ *
+ * Pages within the 32 bit window must be explicitly mapped/unmapped via ioctls.
+ * The 64 bit window (not yet supported for the guest) is supposed to be
+ * mapped completely to guest memory so that devices capable of 64 bit
+ * DMA will not have to use the map/unmap ioctls.
+ *
+ * The IOMMU page size is always 4K.
+ */
+
+struct vfio_iommu_spapr_tce_info {
+	__u32 argsz;
+	__u32 flags;			/* reserved for future use */
+	__u32 dma32_window_start;	/* 32 bit window start (bytes) */
+	__u32 dma32_window_size;	/* 32 bit window size (bytes) */
+	__u64 dma64_window_start;	/* 64 bit window start (bytes) */
+	__u64 dma64_window_size;	/* 64 bit window size (bytes) */
+};
+
+#define VFIO_IOMMU_SPAPR_TCE_GET_INFO	_IO(VFIO_TYPE, VFIO_BASE + 12)
+
+/* Reuse type1 map/unmap structs as they are the same at the moment */
+typedef struct vfio_iommu_type1_dma_map vfio_iommu_spapr_tce_dma_map;
+typedef struct vfio_iommu_type1_dma_unmap vfio_iommu_spapr_tce_dma_unmap;
+
+/* ***************************************************************** */
+
 #endif /* VFIO_H */
-- 
1.7.10.4


^ permalink raw reply related	[flat|nested] 122+ messages in thread

* [PATCH] vfio powerpc: implemented IOMMU driver for VFIO
@ 2012-11-28  7:21                     ` Alexey Kardashevskiy
  0 siblings, 0 replies; 122+ messages in thread
From: Alexey Kardashevskiy @ 2012-11-28  7:21 UTC (permalink / raw)
  To: Alex Williamson
  Cc: kvm, Alexey Kardashevskiy, linux-kernel, Paul Mackerras,
	linuxppc-dev, David Gibson

VFIO implements the platform-independent parts such as
a PCI driver, BAR access (via read/write on a file descriptor
or direct mapping when possible) and IRQ signaling.

The platform-dependent part covers IOMMU initialization
and handling. This patch implements an IOMMU driver for VFIO
which maps and unmaps pages for guest IO and provides
information about the DMA window (required by a POWERPC
guest).

The counterpart in QEMU is required to support this functionality.
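
For illustration (not part of the patch), a hypothetical map/unmap
sequence from userspace could look like the following; the container fd
setup is omitted and buf is assumed to be a page aligned, page sized
buffer:

	/* Map one 4K IOMMU page of buf at IOVA 0 for reads and writes */
	vfio_iommu_spapr_tce_dma_map map = {
		.argsz = sizeof(map),
		.vaddr = (__u64)(unsigned long)buf,
		.iova = 0,
		.size = 4096,
		.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
	};
	ioctl(container, VFIO_IOMMU_MAP_DMA, &map);

	/* ... device DMA happens here ... */

	vfio_iommu_spapr_tce_dma_unmap unmap = {
		.argsz = sizeof(unmap),
		.iova = 0,
		.size = 4096,
	};
	ioctl(container, VFIO_IOMMU_UNMAP_DMA, &unmap);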

Cc: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
 drivers/vfio/Kconfig                |    6 +
 drivers/vfio/Makefile               |    1 +
 drivers/vfio/vfio_iommu_spapr_tce.c |  332 +++++++++++++++++++++++++++++++++++
 include/linux/vfio.h                |   33 ++++
 4 files changed, 372 insertions(+)
 create mode 100644 drivers/vfio/vfio_iommu_spapr_tce.c

diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
index 7cd5dec..b464687 100644
--- a/drivers/vfio/Kconfig
+++ b/drivers/vfio/Kconfig
@@ -3,10 +3,16 @@ config VFIO_IOMMU_TYPE1
 	depends on VFIO
 	default n
 
+config VFIO_IOMMU_SPAPR_TCE
+	tristate
+	depends on VFIO && SPAPR_TCE_IOMMU
+	default n
+
 menuconfig VFIO
 	tristate "VFIO Non-Privileged userspace driver framework"
 	depends on IOMMU_API
 	select VFIO_IOMMU_TYPE1 if X86
+	select VFIO_IOMMU_SPAPR_TCE if PPC_POWERNV
 	help
 	  VFIO provides a framework for secure userspace device drivers.
 	  See Documentation/vfio.txt for more details.
diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
index 2398d4a..72bfabc 100644
--- a/drivers/vfio/Makefile
+++ b/drivers/vfio/Makefile
@@ -1,3 +1,4 @@
 obj-$(CONFIG_VFIO) += vfio.o
 obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o
+obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o
 obj-$(CONFIG_VFIO_PCI) += pci/
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
new file mode 100644
index 0000000..b98770e
--- /dev/null
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -0,0 +1,332 @@
+/*
+ * VFIO: IOMMU DMA mapping support for TCE on POWER
+ *
+ * Copyright (C) 2012 IBM Corp.  All rights reserved.
+ *     Author: Alexey Kardashevskiy <aik@ozlabs.ru>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Derived from original vfio_iommu_type1.c:
+ * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
+ *     Author: Alex Williamson <alex.williamson@redhat.com>
+ */
+
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/err.h>
+#include <linux/vfio.h>
+#include <asm/iommu.h>
+
+#define DRIVER_VERSION  "0.1"
+#define DRIVER_AUTHOR   "aik@ozlabs.ru"
+#define DRIVER_DESC     "VFIO IOMMU SPAPR TCE"
+
+static void tce_iommu_detach_group(void *iommu_data,
+		struct iommu_group *iommu_group);
+
+/*
+ * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
+ */
+
+/*
+ * This code handles mapping and unmapping of user data buffers
+ * into DMA'ble space using the IOMMU.
+ */
+
+#define NPAGE_TO_SIZE(npage)	((size_t)(npage) << PAGE_SHIFT)
+
+struct vwork {
+	struct mm_struct	*mm;
+	long			npage;
+	struct work_struct	work;
+};
+
+/* delayed decrement/increment for locked_vm */
+static void lock_acct_bg(struct work_struct *work)
+{
+	struct vwork *vwork = container_of(work, struct vwork, work);
+	struct mm_struct *mm;
+
+	mm = vwork->mm;
+	down_write(&mm->mmap_sem);
+	mm->locked_vm += vwork->npage;
+	up_write(&mm->mmap_sem);
+	mmput(mm);
+	kfree(vwork);
+}
+
+static void lock_acct(long npage)
+{
+	struct vwork *vwork;
+	struct mm_struct *mm;
+
+	if (!current->mm)
+		return; /* process exited */
+
+	if (down_write_trylock(&current->mm->mmap_sem)) {
+		current->mm->locked_vm += npage;
+		up_write(&current->mm->mmap_sem);
+		return;
+	}
+
+	/*
+	 * Couldn't get the mmap_sem lock, so we must set up to update
+	 * mm->locked_vm later. If locked_vm were atomic, we
+	 * wouldn't need this silliness.
+	 */
+	vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL);
+	if (!vwork)
+		return;
+	mm = get_task_mm(current);
+	if (!mm) {
+		kfree(vwork);
+		return;
+	}
+	INIT_WORK(&vwork->work, lock_acct_bg);
+	vwork->mm = mm;
+	vwork->npage = npage;
+	schedule_work(&vwork->work);
+}
+
+/*
+ * The container descriptor supports only a single group per container.
+ * This is required by the API as the container is not supplied with the
+ * IOMMU group at the moment of initialization.
+ */
+struct tce_container {
+	struct mutex lock;
+	struct iommu_table *tbl;
+};
+
+static void *tce_iommu_open(unsigned long arg)
+{
+	struct tce_container *container;
+
+	if (arg != VFIO_SPAPR_TCE_IOMMU) {
+		pr_err("tce_vfio: Wrong IOMMU type\n");
+		return ERR_PTR(-EINVAL);
+	}
+
+	container = kzalloc(sizeof(*container), GFP_KERNEL);
+	if (!container)
+		return ERR_PTR(-ENOMEM);
+
+	mutex_init(&container->lock);
+
+	return container;
+}
+
+static void tce_iommu_release(void *iommu_data)
+{
+	struct tce_container *container = iommu_data;
+
+	WARN_ON(container->tbl && !container->tbl->it_group);
+	if (container->tbl && container->tbl->it_group)
+		tce_iommu_detach_group(iommu_data, container->tbl->it_group);
+
+	mutex_destroy(&container->lock);
+
+	kfree(container);
+}
+
+static long tce_iommu_ioctl(void *iommu_data,
+				 unsigned int cmd, unsigned long arg)
+{
+	struct tce_container *container = iommu_data;
+	unsigned long minsz;
+	long ret;
+
+	switch (cmd) {
+	case VFIO_CHECK_EXTENSION: {
+		return (arg == VFIO_SPAPR_TCE_IOMMU) ? 1 : 0;
+	}
+	case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
+		struct vfio_iommu_spapr_tce_info info;
+		struct iommu_table *tbl = container->tbl;
+
+		if (WARN_ON(!tbl))
+			return -ENXIO;
+
+		minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
+				dma64_window_size);
+
+		if (copy_from_user(&info, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (info.argsz < minsz)
+			return -EINVAL;
+
+		info.dma32_window_start = tbl->it_offset << IOMMU_PAGE_SHIFT;
+		info.dma32_window_size = tbl->it_size << IOMMU_PAGE_SHIFT;
+		info.dma64_window_start = 0;
+		info.dma64_window_size = 0;
+		info.flags = 0;
+
+		if (copy_to_user((void __user *)arg, &info, minsz))
+			return -EFAULT;
+
+		return 0;
+	}
+	case VFIO_IOMMU_MAP_DMA: {
+		vfio_iommu_spapr_tce_dma_map param;
+		struct iommu_table *tbl = container->tbl;
+		enum dma_data_direction direction;
+		unsigned long locked, lock_limit;
+
+		if (WARN_ON(!tbl))
+			return -ENXIO;
+
+		minsz = offsetofend(vfio_iommu_spapr_tce_dma_map, size);
+
+		if (copy_from_user(&param, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (param.argsz < minsz)
+			return -EINVAL;
+
+		if ((param.flags & VFIO_DMA_MAP_FLAG_READ) &&
+				(param.flags & VFIO_DMA_MAP_FLAG_WRITE))
+			direction = DMA_BIDIRECTIONAL;
+		else if (param.flags & VFIO_DMA_MAP_FLAG_READ)
+			direction = DMA_TO_DEVICE;
+		else if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
+			direction = DMA_FROM_DEVICE;
+		else
+			return -EINVAL;
+
+		if ((param.size & ~IOMMU_PAGE_MASK) ||
+				(param.iova & ~IOMMU_PAGE_MASK) ||
+				(param.vaddr & ~IOMMU_PAGE_MASK))
+			return -EINVAL;
+
+		/* Account for locked pages */
+		locked = current->mm->locked_vm +
+				(param.size >> IOMMU_PAGE_SHIFT);
+		lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+		if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
+			pr_warn("RLIMIT_MEMLOCK (%ld) exceeded\n",
+					rlimit(RLIMIT_MEMLOCK));
+			return -ENOMEM;
+		}
+
+		ret = iommu_put_tces(tbl, param.iova >> IOMMU_PAGE_SHIFT,
+				param.vaddr, direction,
+				param.size >> IOMMU_PAGE_SHIFT);
+		if (ret > 0)
+			lock_acct(ret);
+
+		return ret;
+	}
+	case VFIO_IOMMU_UNMAP_DMA: {
+		vfio_iommu_spapr_tce_dma_unmap param;
+		struct iommu_table *tbl = container->tbl;
+
+		if (WARN_ON(!tbl))
+			return -ENXIO;
+
+		minsz = offsetofend(vfio_iommu_spapr_tce_dma_unmap, size);
+
+		if (copy_from_user(&param, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (param.argsz < minsz)
+			return -EINVAL;
+
+		if ((param.size & ~IOMMU_PAGE_MASK) ||
+				(param.iova & ~IOMMU_PAGE_MASK))
+			return -EINVAL;
+
+		ret = iommu_clear_tces(tbl, param.iova >> IOMMU_PAGE_SHIFT,
+				param.size >> IOMMU_PAGE_SHIFT);
+		if (ret > 0)
+			lock_acct(-ret);
+
+		return ret;
+	}
+	default:
+		pr_warn("tce_vfio: unexpected cmd %x\n", cmd);
+	}
+
+	return -ENOTTY;
+}
+
+static int tce_iommu_attach_group(void *iommu_data,
+		struct iommu_group *iommu_group)
+{
+	struct tce_container *container = iommu_data;
+	struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
+
+	BUG_ON(!tbl);
+	mutex_lock(&container->lock);
+	pr_debug("tce_vfio: Attaching group #%u to iommu %p\n",
+			iommu_group_id(iommu_group), iommu_group);
+	if (container->tbl) {
+		pr_warn("tce_vfio: Only one group per IOMMU container is allowed, existing id=%d, attaching id=%d\n",
+				iommu_group_id(container->tbl->it_group),
+				iommu_group_id(iommu_group));
+		mutex_unlock(&container->lock);
+		return -EBUSY;
+	}
+
+	container->tbl = tbl;
+	iommu_clear_tces(tbl, tbl->it_offset, tbl->it_size);
+	mutex_unlock(&container->lock);
+
+	return 0;
+}
+
+static void tce_iommu_detach_group(void *iommu_data,
+		struct iommu_group *iommu_group)
+{
+	struct tce_container *container = iommu_data;
+	struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
+
+	BUG_ON(!tbl);
+	mutex_lock(&container->lock);
+	if (tbl != container->tbl) {
+		pr_warn("tce_vfio: detaching group #%u, expected group is #%u\n",
+				iommu_group_id(iommu_group),
+				iommu_group_id(tbl->it_group));
+	} else {
+
+		pr_debug("tce_vfio: detaching group #%u from iommu %p\n",
+				iommu_group_id(iommu_group), iommu_group);
+
+		iommu_clear_tces(tbl, tbl->it_offset, tbl->it_size);
+		container->tbl = NULL;
+	}
+	mutex_unlock(&container->lock);
+}
+
+const struct vfio_iommu_driver_ops tce_iommu_driver_ops = {
+	.name		= "iommu-vfio-powerpc",
+	.owner		= THIS_MODULE,
+	.open		= tce_iommu_open,
+	.release	= tce_iommu_release,
+	.ioctl		= tce_iommu_ioctl,
+	.attach_group	= tce_iommu_attach_group,
+	.detach_group	= tce_iommu_detach_group,
+};
+
+static int __init tce_iommu_init(void)
+{
+	return vfio_register_iommu_driver(&tce_iommu_driver_ops);
+}
+
+static void __exit tce_iommu_cleanup(void)
+{
+	vfio_unregister_iommu_driver(&tce_iommu_driver_ops);
+}
+
+module_init(tce_iommu_init);
+module_exit(tce_iommu_cleanup);
+
+MODULE_VERSION(DRIVER_VERSION);
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR(DRIVER_AUTHOR);
+MODULE_DESCRIPTION(DRIVER_DESC);
+
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index 0a4f180..820af1e 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -99,6 +99,7 @@ extern void vfio_unregister_iommu_driver(
 /* Extensions */
 
 #define VFIO_TYPE1_IOMMU		1
+#define VFIO_SPAPR_TCE_IOMMU		2
 
 /*
  * The IOCTL interface is designed for extensibility by embedding the
@@ -442,4 +443,36 @@ struct vfio_iommu_type1_dma_unmap {
 
 #define VFIO_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14)
 
+/* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */
+
+/*
+ * The SPAPR TCE info struct provides information about the PCI bus
+ * address ranges available for DMA. These values are programmed into
+ * the hardware, so the guest has to know them.
+ *
+ * Pages within the 32 bit window must be explicitly mapped/unmapped via ioctls.
+ * The 64 bit window (not yet supported for the guest) is supposed to be
+ * mapped completely to guest memory so that devices capable of 64 bit
+ * DMA will not have to use the map/unmap ioctls.
+ *
+ * The IOMMU page size is always 4K.
+ */
+
+struct vfio_iommu_spapr_tce_info {
+	__u32 argsz;
+	__u32 flags;			/* reserved for future use */
+	__u32 dma32_window_start;	/* 32 bit window start (bytes) */
+	__u32 dma32_window_size;	/* 32 bit window size (bytes) */
+	__u64 dma64_window_start;	/* 64 bit window start (bytes) */
+	__u64 dma64_window_size;	/* 64 bit window size (bytes) */
+};
+
+#define VFIO_IOMMU_SPAPR_TCE_GET_INFO	_IO(VFIO_TYPE, VFIO_BASE + 12)
+
+/* Reuse type1 map/unmap structs as they are the same at the moment */
+typedef struct vfio_iommu_type1_dma_map vfio_iommu_spapr_tce_dma_map;
+typedef struct vfio_iommu_type1_dma_unmap vfio_iommu_spapr_tce_dma_unmap;
+
+/* ***************************************************************** */
+
 #endif /* VFIO_H */
-- 
1.7.10.4

^ permalink raw reply related	[flat|nested] 122+ messages in thread

* Re: [PATCH] vfio powerpc: implemented IOMMU driver for VFIO
  2012-11-28  7:21                     ` Alexey Kardashevskiy
@ 2012-11-28 21:01                       ` Alex Williamson
  -1 siblings, 0 replies; 122+ messages in thread
From: Alex Williamson @ 2012-11-28 21:01 UTC (permalink / raw)
  To: Alexey Kardashevskiy
  Cc: Benjamin Herrenschmidt, Paul Mackerras, linuxppc-dev,
	linux-kernel, kvm, David Gibson

On Wed, 2012-11-28 at 18:21 +1100, Alexey Kardashevskiy wrote:
> VFIO implements platform independent stuff such as
> a PCI driver, BAR access (via read/write on a file descriptor
> or direct mapping when possible) and IRQ signaling.
> 
> The platform dependent part includes IOMMU initialization
> and handling. This patch implements an IOMMU driver for VFIO
> which does mapping/unmapping pages for the guest IO and
> provides information about DMA window (required by a POWERPC
> guest).
> 
> The counterpart in QEMU is required to support this functionality.
> 
> Cc: David Gibson <david@gibson.dropbear.id.au>
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> ---
>  drivers/vfio/Kconfig                |    6 +
>  drivers/vfio/Makefile               |    1 +
>  drivers/vfio/vfio_iommu_spapr_tce.c |  332 +++++++++++++++++++++++++++++++++++
>  include/linux/vfio.h                |   33 ++++
>  4 files changed, 372 insertions(+)
>  create mode 100644 drivers/vfio/vfio_iommu_spapr_tce.c
> 
> diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
> index 7cd5dec..b464687 100644
> --- a/drivers/vfio/Kconfig
> +++ b/drivers/vfio/Kconfig
> @@ -3,10 +3,16 @@ config VFIO_IOMMU_TYPE1
>  	depends on VFIO
>  	default n
>  
> +config VFIO_IOMMU_SPAPR_TCE
> +	tristate
> +	depends on VFIO && SPAPR_TCE_IOMMU
> +	default n
> +
>  menuconfig VFIO
>  	tristate "VFIO Non-Privileged userspace driver framework"
>  	depends on IOMMU_API
>  	select VFIO_IOMMU_TYPE1 if X86
> +	select VFIO_IOMMU_SPAPR_TCE if PPC_POWERNV
>  	help
>  	  VFIO provides a framework for secure userspace device drivers.
>  	  See Documentation/vfio.txt for more details.
> diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
> index 2398d4a..72bfabc 100644
> --- a/drivers/vfio/Makefile
> +++ b/drivers/vfio/Makefile
> @@ -1,3 +1,4 @@
>  obj-$(CONFIG_VFIO) += vfio.o
>  obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o
> +obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o
>  obj-$(CONFIG_VFIO_PCI) += pci/
> diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
> new file mode 100644
> index 0000000..b98770e
> --- /dev/null
> +++ b/drivers/vfio/vfio_iommu_spapr_tce.c
> @@ -0,0 +1,332 @@
> +/*
> + * VFIO: IOMMU DMA mapping support for TCE on POWER
> + *
> + * Copyright (C) 2012 IBM Corp.  All rights reserved.
> + *     Author: Alexey Kardashevskiy <aik@ozlabs.ru>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + *
> + * Derived from original vfio_iommu_type1.c:
> + * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
> + *     Author: Alex Williamson <alex.williamson@redhat.com>
> + */
> +
> +#include <linux/module.h>
> +#include <linux/pci.h>
> +#include <linux/slab.h>
> +#include <linux/uaccess.h>
> +#include <linux/err.h>
> +#include <linux/vfio.h>
> +#include <asm/iommu.h>
> +
> +#define DRIVER_VERSION  "0.1"
> +#define DRIVER_AUTHOR   "aik@ozlabs.ru"
> +#define DRIVER_DESC     "VFIO IOMMU SPAPR TCE"
> +
> +static void tce_iommu_detach_group(void *iommu_data,
> +		struct iommu_group *iommu_group);
> +
> +/*
> + * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
> + */
> +
> +/*
> + * This code handles mapping and unmapping of user data buffers
> + * into DMA'ble space using the IOMMU
> + */
> +
> +#define NPAGE_TO_SIZE(npage)	((size_t)(npage) << PAGE_SHIFT)
> +
> +struct vwork {
> +	struct mm_struct	*mm;
> +	long			npage;
> +	struct work_struct	work;
> +};
> +
> +/* delayed decrement/increment for locked_vm */
> +static void lock_acct_bg(struct work_struct *work)
> +{
> +	struct vwork *vwork = container_of(work, struct vwork, work);
> +	struct mm_struct *mm;
> +
> +	mm = vwork->mm;
> +	down_write(&mm->mmap_sem);
> +	mm->locked_vm += vwork->npage;
> +	up_write(&mm->mmap_sem);
> +	mmput(mm);
> +	kfree(vwork);
> +}
> +
> +static void lock_acct(long npage)
> +{
> +	struct vwork *vwork;
> +	struct mm_struct *mm;
> +
> +	if (!current->mm)
> +		return; /* process exited */
> +
> +	if (down_write_trylock(&current->mm->mmap_sem)) {
> +		current->mm->locked_vm += npage;
> +		up_write(&current->mm->mmap_sem);
> +		return;
> +	}
> +
> +	/*
> +	 * Couldn't get mmap_sem lock, so must setup to update
> +	 * mm->locked_vm later. If locked_vm were atomic, we
> +	 * wouldn't need this silliness
> +	 */
> +	vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL);
> +	if (!vwork)
> +		return;
> +	mm = get_task_mm(current);
> +	if (!mm) {
> +		kfree(vwork);
> +		return;
> +	}
> +	INIT_WORK(&vwork->work, lock_acct_bg);
> +	vwork->mm = mm;
> +	vwork->npage = npage;
> +	schedule_work(&vwork->work);
> +}

This looks familiar; should we split it out into a common file instead of
duplicating it?
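
(Roughly, both drivers could then share something like the following in a
common object; the helper name and location below are illustrative only:

	/* hypothetical shared helper, e.g. in drivers/vfio/vfio.c */
	void vfio_lock_acct(long npage);	/* adjust current->mm->locked_vm */

with vfio_iommu_type1.c and vfio_iommu_spapr_tce.c both calling it instead
of carrying private copies of lock_acct()/lock_acct_bg().)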

> +
> +/*
> + * The container descriptor supports only a single group per container.
> + * Required by the API as the container is not supplied with the IOMMU group
> + * at the moment of initialization.
> + */
> +struct tce_container {
> +	struct mutex lock;
> +	struct iommu_table *tbl;
> +};
> +
> +static void *tce_iommu_open(unsigned long arg)
> +{
> +	struct tce_container *container;
> +
> +	if (arg != VFIO_SPAPR_TCE_IOMMU) {
> +		pr_err("tce_vfio: Wrong IOMMU type\n");
> +		return ERR_PTR(-EINVAL);
> +	}
> +
> +	container = kzalloc(sizeof(*container), GFP_KERNEL);
> +	if (!container)
> +		return ERR_PTR(-ENOMEM);
> +
> +	mutex_init(&container->lock);
> +
> +	return container;
> +}
> +
> +static void tce_iommu_release(void *iommu_data)
> +{
> +	struct tce_container *container = iommu_data;
> +
> +	WARN_ON(container->tbl && !container->tbl->it_group);
> +	if (container->tbl && container->tbl->it_group)
> +		tce_iommu_detach_group(iommu_data, container->tbl->it_group);
> +
> +	mutex_destroy(&container->lock);
> +
> +	kfree(container);
> +}
> +
> +static long tce_iommu_ioctl(void *iommu_data,
> +				 unsigned int cmd, unsigned long arg)
> +{
> +	struct tce_container *container = iommu_data;
> +	unsigned long minsz;
> +	long ret;
> +
> +	switch (cmd) {
> +	case VFIO_CHECK_EXTENSION: {
> +		return (arg == VFIO_SPAPR_TCE_IOMMU) ? 1 : 0;
> +	}
> +	case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
> +		struct vfio_iommu_spapr_tce_info info;
> +		struct iommu_table *tbl = container->tbl;
> +
> +		if (WARN_ON(!tbl))
> +			return -ENXIO;
> +
> +		minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
> +				dma64_window_size);
> +
> +		if (copy_from_user(&info, (void __user *)arg, minsz))
> +			return -EFAULT;
> +
> +		if (info.argsz < minsz)
> +			return -EINVAL;
> +
> +		info.dma32_window_start = tbl->it_offset << IOMMU_PAGE_SHIFT;
> +		info.dma32_window_size = tbl->it_size << IOMMU_PAGE_SHIFT;
> +		info.dma64_window_start = 0;
> +		info.dma64_window_size = 0;
> +		info.flags = 0;
> +
> +		if (copy_to_user((void __user *)arg, &info, minsz))
> +			return -EFAULT;
> +
> +		return 0;
> +	}
> +	case VFIO_IOMMU_MAP_DMA: {
> +		vfio_iommu_spapr_tce_dma_map param;
> +		struct iommu_table *tbl = container->tbl;
> +		enum dma_data_direction direction;
> +		unsigned long locked, lock_limit;
> +
> +		if (WARN_ON(!tbl))
> +			return -ENXIO;
> +
> +		minsz = offsetofend(vfio_iommu_spapr_tce_dma_map, size);
> +
> +		if (copy_from_user(&param, (void __user *)arg, minsz))
> +			return -EFAULT;
> +
> +		if (param.argsz < minsz)
> +			return -EINVAL;
> +
> +		if ((param.flags & VFIO_DMA_MAP_FLAG_READ) &&
> +				(param.flags & VFIO_DMA_MAP_FLAG_WRITE))
> +			direction = DMA_BIDIRECTIONAL;
> +		else if (param.flags & VFIO_DMA_MAP_FLAG_READ)
> +			direction = DMA_TO_DEVICE;
> +		else if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
> +			direction = DMA_FROM_DEVICE;
> +		else
> +			return -EINVAL;
> +
> +		if ((param.size & ~IOMMU_PAGE_MASK) ||
> +				(param.iova & ~IOMMU_PAGE_MASK) ||
> +				(param.vaddr & ~IOMMU_PAGE_MASK))
> +			return -EINVAL;
> +
> +		/* Account for locked pages */
> +		locked = current->mm->locked_vm +
> +				(param.size >> IOMMU_PAGE_SHIFT);
> +		lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;

This page accounting doesn't look right.  PAGE_SIZE is several times
bigger than IOMMU_PAGE_SIZE (right?), but we mix them here, which seems
like it will over-penalize the user.  For example, if a user maps 4 x 4k
(assume aligned and contiguous) IOMMU pages, isn't that only a single
pinned system page (assuming >= 16k pages)?
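
(Something along the following lines would charge the user in system pages
rather than IOMMU pages -- an untested sketch reusing the names from the
patch:

	/* Each system page covers 1 << (PAGE_SHIFT - IOMMU_PAGE_SHIFT)
	 * IOMMU pages, so convert the request to system pages, rounding
	 * up, before checking against the rlimit. */
	unsigned long npages = (param.size + PAGE_SIZE - 1) >> PAGE_SHIFT;

	locked = current->mm->locked_vm + npages;
	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;

and the value returned by iommu_put_tces() would similarly need scaling
before being passed to lock_acct().)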

> +		if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
> +			pr_warn("RLIMIT_MEMLOCK (%ld) exceeded\n",
> +					rlimit(RLIMIT_MEMLOCK));
> +			return -ENOMEM;
> +		}
> +
> +		ret = iommu_put_tces(tbl, param.iova >> IOMMU_PAGE_SHIFT,
> +				param.vaddr, direction,
> +				param.size >> IOMMU_PAGE_SHIFT);
> +		if (ret > 0)
> +			lock_acct(ret);
> +
> +		return ret;
> +	}
> +	case VFIO_IOMMU_UNMAP_DMA: {
> +		vfio_iommu_spapr_tce_dma_unmap param;
> +		struct iommu_table *tbl = container->tbl;
> +
> +		if (WARN_ON(!tbl))
> +			return -ENXIO;
> +
> +		minsz = offsetofend(vfio_iommu_spapr_tce_dma_unmap, size);
> +
> +		if (copy_from_user(&param, (void __user *)arg, minsz))
> +			return -EFAULT;
> +
> +		if (param.argsz < minsz)
> +			return -EINVAL;
> +
> +		if ((param.size & ~IOMMU_PAGE_MASK) ||
> +				(param.iova & ~IOMMU_PAGE_MASK))
> +			return -EINVAL;
> +
> +		ret = iommu_clear_tces(tbl, param.iova >> IOMMU_PAGE_SHIFT,
> +				param.size >> IOMMU_PAGE_SHIFT);
> +		if (ret > 0)
> +			lock_acct(-ret);
> +
> +		return ret;
> +	}
> +	default:
> +		pr_warn("tce_vfio: unexpected cmd %x\n", cmd);
> +	}
> +
> +	return -ENOTTY;
> +}
> +
> +static int tce_iommu_attach_group(void *iommu_data,
> +		struct iommu_group *iommu_group)
> +{
> +	struct tce_container *container = iommu_data;
> +	struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
> +
> +	BUG_ON(!tbl);
> +	mutex_lock(&container->lock);
> +	pr_debug("tce_vfio: Attaching group #%u to iommu %p\n",
> +			iommu_group_id(iommu_group), iommu_group);
> +	if (container->tbl) {
> +		pr_warn("tce_vfio: Only one group per IOMMU container is allowed, existing id=%d, attaching id=%d\n",
> +				iommu_group_id(container->tbl->it_group),
> +				iommu_group_id(iommu_group));
> +		mutex_unlock(&container->lock);
> +		return -EBUSY;
> +	}
> +
> +	container->tbl = tbl;
> +	iommu_clear_tces(tbl, tbl->it_offset, tbl->it_size);
> +	mutex_unlock(&container->lock);
> +
> +	return 0;
> +}
> +
> +static void tce_iommu_detach_group(void *iommu_data,
> +		struct iommu_group *iommu_group)
> +{
> +	struct tce_container *container = iommu_data;
> +	struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
> +
> +	BUG_ON(!tbl);
> +	mutex_lock(&container->lock);
> +	if (tbl != container->tbl) {
> +		pr_warn("tce_vfio: detaching group #%u, expected group is #%u\n",
> +				iommu_group_id(iommu_group),
> +				iommu_group_id(tbl->it_group));
> +	} else {
> +
> +		pr_debug("tce_vfio: detaching group #%u from iommu %p\n",
> +				iommu_group_id(iommu_group), iommu_group);
> +
> +		iommu_clear_tces(tbl, tbl->it_offset, tbl->it_size);
> +		container->tbl = NULL;
> +	}
> +	mutex_unlock(&container->lock);
> +}
> +
> +const struct vfio_iommu_driver_ops tce_iommu_driver_ops = {
> +	.name		= "iommu-vfio-powerpc",
> +	.owner		= THIS_MODULE,
> +	.open		= tce_iommu_open,
> +	.release	= tce_iommu_release,
> +	.ioctl		= tce_iommu_ioctl,
> +	.attach_group	= tce_iommu_attach_group,
> +	.detach_group	= tce_iommu_detach_group,
> +};
> +
> +static int __init tce_iommu_init(void)
> +{
> +	return vfio_register_iommu_driver(&tce_iommu_driver_ops);
> +}
> +
> +static void __exit tce_iommu_cleanup(void)
> +{
> +	vfio_unregister_iommu_driver(&tce_iommu_driver_ops);
> +}
> +
> +module_init(tce_iommu_init);
> +module_exit(tce_iommu_cleanup);
> +
> +MODULE_VERSION(DRIVER_VERSION);
> +MODULE_LICENSE("GPL v2");
> +MODULE_AUTHOR(DRIVER_AUTHOR);
> +MODULE_DESCRIPTION(DRIVER_DESC);
> +
> diff --git a/include/linux/vfio.h b/include/linux/vfio.h
> index 0a4f180..820af1e 100644
> --- a/include/linux/vfio.h
> +++ b/include/linux/vfio.h
> @@ -99,6 +99,7 @@ extern void vfio_unregister_iommu_driver(
>  /* Extensions */
>  
>  #define VFIO_TYPE1_IOMMU		1
> +#define VFIO_SPAPR_TCE_IOMMU		2
>  
>  /*
>   * The IOCTL interface is designed for extensibility by embedding the
> @@ -442,4 +443,36 @@ struct vfio_iommu_type1_dma_unmap {
>  
>  #define VFIO_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14)
>  
> +/* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */
> +
> +/*
> + * The SPAPR TCE info struct provides the information about the PCI bus
> + * address ranges available for DMA, these values are programmed into
> + * the hardware so the guest has to know that information.
> + *
> + * Pages within 32 bit window should be explicitely mapped/unmapped via ioctls.
                                          ^^^^^^^^^^^
explicitly

> + * 64 bit window (not supported at the moment for the guest) is supposed to
> + * be mapped completely to the guest memory so the devices capable of 64bit
> + * DMA will not have to use map/unmap ioctls.
> + *
> + * The IOMMU page size is always 4K.
> + */

Thanks,

Alex

> +
> +struct vfio_iommu_spapr_tce_info {
> +	__u32 argsz;
> +	__u32 flags;			/* reserved for future use */
> +	__u32 dma32_window_start;	/* 32 bit window start (bytes) */
> +	__u32 dma32_window_size;	/* 32 bit window size (bytes) */
> +	__u64 dma64_window_start;	/* 64 bit window start (bytes) */
> +	__u64 dma64_window_size;	/* 64 bit window size (bytes) */
> +};
> +
> +#define VFIO_IOMMU_SPAPR_TCE_GET_INFO	_IO(VFIO_TYPE, VFIO_BASE + 12)
> +
> +/* Reuse type1 map/unmap structs as they are the same at the moment */
> +typedef struct vfio_iommu_type1_dma_map vfio_iommu_spapr_tce_dma_map;
> +typedef struct vfio_iommu_type1_dma_unmap vfio_iommu_spapr_tce_dma_unmap;
> +
> +/* ***************************************************************** */
> +
>  #endif /* VFIO_H */




^ permalink raw reply	[flat|nested] 122+ messages in thread


* Re: [PATCH] vfio powerpc: enabled on powernv platform
  2012-11-28  7:18             ` Alexey Kardashevskiy
@ 2012-11-28 21:30               ` Alex Williamson
  -1 siblings, 0 replies; 122+ messages in thread
From: Alex Williamson @ 2012-11-28 21:30 UTC (permalink / raw)
  To: Alexey Kardashevskiy
  Cc: Benjamin Herrenschmidt, Paul Mackerras, linuxppc-dev,
	linux-kernel, David Gibson

On Wed, 2012-11-28 at 18:18 +1100, Alexey Kardashevskiy wrote:
> This patch initializes IOMMU groups based on the IOMMU
> configuration discovered during the PCI scan on POWERNV
> (POWER non virtualized) platform. The IOMMU groups are
> to be used later by VFIO driver (PCI pass through).
> 
> It also implements an API for mapping/unmapping pages for
> guest PCI drivers and providing DMA window properties.
> This API is going to be used later by QEMU-VFIO to handle
> h_put_tce hypercalls from the KVM guest.
> 
> Although this driver has been tested only on the POWERNV
> platform, it should work on any platform which supports
> TCE tables.
> 
> To enable VFIO on POWER, enable SPAPR_TCE_IOMMU config
> option and configure VFIO as required.
> 
> Cc: David Gibson <david@gibson.dropbear.id.au>
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> ---
>  arch/powerpc/include/asm/iommu.h     |    9 +++
>  arch/powerpc/kernel/iommu.c          |  147 ++++++++++++++++++++++++++++++++++
>  arch/powerpc/platforms/powernv/pci.c |  135 +++++++++++++++++++++++++++++++
>  drivers/iommu/Kconfig                |    8 ++
>  4 files changed, 299 insertions(+)
> 
> diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
> index cbfe678..5c7087a 100644
> --- a/arch/powerpc/include/asm/iommu.h
> +++ b/arch/powerpc/include/asm/iommu.h
> @@ -76,6 +76,9 @@ struct iommu_table {
>  	struct iommu_pool large_pool;
>  	struct iommu_pool pools[IOMMU_NR_POOLS];
>  	unsigned long *it_map;       /* A simple allocation bitmap for now */
> +#ifdef CONFIG_IOMMU_API
> +	struct iommu_group *it_group;
> +#endif
>  };
>  
>  struct scatterlist;
> @@ -147,5 +150,11 @@ static inline void iommu_restore(void)
>  }
>  #endif
>  
> +extern long iommu_clear_tces(struct iommu_table *tbl, unsigned long entry,
> +		unsigned long pages);
> +extern long iommu_put_tces(struct iommu_table *tbl, unsigned long entry,
> +		uint64_t tce, enum dma_data_direction direction,
> +		unsigned long pages);
> +
>  #endif /* __KERNEL__ */
>  #endif /* _ASM_IOMMU_H */
> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
> index ff5a6ce..1456b6e 100644
> --- a/arch/powerpc/kernel/iommu.c
> +++ b/arch/powerpc/kernel/iommu.c
> @@ -44,6 +44,7 @@
>  #include <asm/kdump.h>
>  #include <asm/fadump.h>
>  #include <asm/vio.h>
> +#include <asm/tce.h>
>  
>  #define DBG(...)
>  
> @@ -856,3 +857,149 @@ void iommu_free_coherent(struct iommu_table *tbl, size_t size,
>  		free_pages((unsigned long)vaddr, get_order(size));
>  	}
>  }
> +
> +#ifdef CONFIG_IOMMU_API
> +/*
> + * SPAPR TCE API
> + */
> +static void tce_flush(struct iommu_table *tbl)
> +{
> +	/* Flush/invalidate TLB caches if necessary */
> +	if (ppc_md.tce_flush)
> +		ppc_md.tce_flush(tbl);
> +
> +	/* Make sure updates are seen by hardware */
> +	mb();
> +}
> +
> +/*
> + * clear_tces_nolock clears tces and returns the number of pages
> + * on which it called put_page().
> + */
> +static long clear_tces_nolock(struct iommu_table *tbl, unsigned long entry,
> +		unsigned long pages)
> +{
> +	int i, pages_put = 0;
> +	unsigned long oldtce;
> +	struct page *page;
> +
> +	for (i = 0; i < pages; ++i) {
> +		oldtce = ppc_md.tce_get(tbl, entry + i);
> +		ppc_md.tce_free(tbl, entry + i, 1);
> +
> +		if (!(oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)))
> +			continue;
> +
> +		page = pfn_to_page(oldtce >> PAGE_SHIFT);
> +
> +		WARN_ON(!page);
> +		if (!page)
> +			continue;
> +
> +		if (oldtce & TCE_PCI_WRITE)
> +			SetPageDirty(page);
> +
> +		++pages_put;
> +		put_page(page);
> +	}
> +
> +	return pages_put;
> +}
> +
> +/*
> + * iommu_clear_tces clears tces and returns the number of released pages
> + */
> +long iommu_clear_tces(struct iommu_table *tbl, unsigned long entry,
> +		unsigned long pages)
> +{
> +	int ret;
> +	struct iommu_pool *pool = get_pool(tbl, entry);
> +
> +	spin_lock(&(pool->lock));
> +	ret = clear_tces_nolock(tbl, entry, pages);
> +	tce_flush(tbl);
> +	spin_unlock(&(pool->lock));
> +
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(iommu_clear_tces);
> +
> +static int put_tce(struct iommu_table *tbl, unsigned long entry,
> +		uint64_t tce, enum dma_data_direction direction)
> +{
> +	int ret;
> +	struct page *page = NULL;
> +	unsigned long kva, offset;
> +
> +	/* Map new TCE */
> +	offset = (tce & IOMMU_PAGE_MASK) - (tce & PAGE_MASK);
> +
> +	ret = get_user_pages_fast(tce & PAGE_MASK, 1,
> +			direction != DMA_TO_DEVICE, &page);
> +	if (ret < 1) {
> +		printk(KERN_ERR "tce_vfio: get_user_pages_fast failed tce=%llx ioba=%lx ret=%d\n",
> +				tce, entry << IOMMU_PAGE_SHIFT, ret);
> +		if (!ret)
> +			ret = -EFAULT;
> +		return ret;
> +	}
> +
> +	kva = (unsigned long) page_address(page);
> +	kva += offset;
> +
> +	/* tce_build receives a virtual address */
> +	entry += tbl->it_offset; /* Offset into real TCE table */
> +	ret = ppc_md.tce_build(tbl, entry, 1, kva, direction, NULL);
> +
> +	/* tce_build() only returns non-zero for transient errors */
> +	if (unlikely(ret)) {
> +		printk(KERN_ERR "tce_vfio: tce_put failed on tce=%llx ioba=%lx kva=%lx ret=%d\n",
> +				tce, entry << IOMMU_PAGE_SHIFT, kva, ret);
> +		put_page(page);
> +		return -EIO;
> +	}
> +
> +	return 0;
> +}
> +
> +/*
> + * iommu_put_tces builds tces and returns the number of actually locked pages
> + */
> +long iommu_put_tces(struct iommu_table *tbl, unsigned long entry,
> +		uint64_t tce, enum dma_data_direction direction,
> +		unsigned long pages)
> +{
> +	int i, ret = 0;
> +	struct iommu_pool *pool = get_pool(tbl, entry);
> +
> +	BUILD_BUG_ON(PAGE_SIZE < IOMMU_PAGE_SIZE);
> +	BUG_ON(direction == DMA_NONE);
> +
> +	spin_lock(&(pool->lock));
> +
> +	/* Check if any is in use */
> +	for (i = 0; i < pages; ++i) {
> +		unsigned long oldtce = ppc_md.tce_get(tbl, entry + i);
> +		if (oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)) {
> +			spin_unlock(&(pool->lock));
> +			return -EBUSY;
> +		}
> +	}
> +
> +	/* Put tces to the table */
> +	for (i = 0; (i < pages) && !ret; ++i, tce += IOMMU_PAGE_SIZE)
> +		ret = put_tce(tbl, entry + i, tce, direction);
> +
> +	/* On failure release the locked pages, otherwise return the number of pages */
> +	if (ret)
> +		clear_tces_nolock(tbl, entry, i);
> +	else
> +		ret = pages;
> +
> +	tce_flush(tbl);
> +	spin_unlock(&(pool->lock));
> +
> +	return ret;
> +}

Nice, no more kmalloc!  I'm still concerned about the IOMMU page size
mismatch here.  If nothing else, the comment is misleading since we're
locking system pages, but returning tce pages.  The user would therefore
need to scale their locked memory limit by 1 << (PAGE_SHIFT -
IOMMU_PAGE_SHIFT), i.e. 16x with 64K system pages and 4K IOMMU pages.  Thanks,

Alex

> +EXPORT_SYMBOL_GPL(iommu_put_tces);
> +#endif /* CONFIG_IOMMU_API */
> diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
> index 05205cf..21250ef 100644
> --- a/arch/powerpc/platforms/powernv/pci.c
> +++ b/arch/powerpc/platforms/powernv/pci.c
> @@ -20,6 +20,7 @@
>  #include <linux/irq.h>
>  #include <linux/io.h>
>  #include <linux/msi.h>
> +#include <linux/iommu.h>
>  
>  #include <asm/sections.h>
>  #include <asm/io.h>
> @@ -613,3 +614,137 @@ void __init pnv_pci_init(void)
>  	ppc_md.teardown_msi_irqs = pnv_teardown_msi_irqs;
>  #endif
>  }
> +
> +#ifdef CONFIG_IOMMU_API
> +/*
> + * IOMMU groups support required by VFIO
> + */
> +static int add_device(struct device *dev)
> +{
> +	struct iommu_table *tbl;
> +	int ret = 0;
> +
> +	if (WARN_ON(dev->iommu_group)) {
> +		printk(KERN_WARNING "tce_vfio: device %s is already in iommu group %d, skipping\n",
> +				dev_name(dev),
> +				iommu_group_id(dev->iommu_group));
> +		return -EBUSY;
> +	}
> +
> +	tbl = get_iommu_table_base(dev);
> +	if (!tbl) {
> +		pr_debug("tce_vfio: skipping device %s with no tbl\n",
> +				dev_name(dev));
> +		return 0;
> +	}
> +
> +	pr_debug("tce_vfio: adding %s to iommu group %d\n",
> +			dev_name(dev), iommu_group_id(tbl->it_group));
> +
> +	ret = iommu_group_add_device(tbl->it_group, dev);
> +	if (ret < 0)
> +		printk(KERN_ERR "tce_vfio: %s has not been added, ret=%d\n",
> +				dev_name(dev), ret);
> +
> +	return ret;
> +}
> +
> +static void del_device(struct device *dev)
> +{
> +	iommu_group_remove_device(dev);
> +}
> +
> +static int iommu_bus_notifier(struct notifier_block *nb,
> +			      unsigned long action, void *data)
> +{
> +	struct device *dev = data;
> +
> +	switch (action) {
> +	case BUS_NOTIFY_ADD_DEVICE:
> +		return add_device(dev);
> +	case BUS_NOTIFY_DEL_DEVICE:
> +		del_device(dev);
> +		return 0;
> +	default:
> +		return 0;
> +	}
> +}
> +
> +static struct notifier_block tce_iommu_bus_nb = {
> +	.notifier_call = iommu_bus_notifier,
> +};
> +
> +static void group_release(void *iommu_data)
> +{
> +	struct iommu_table *tbl = iommu_data;
> +	tbl->it_group = NULL;
> +}
> +
> +static int __init tce_iommu_init(void)
> +{
> +	struct pci_dev *pdev = NULL;
> +	struct iommu_table *tbl;
> +	struct iommu_group *grp;
> +
> +	/* Allocate and initialize IOMMU groups */
> +	for_each_pci_dev(pdev) {
> +		tbl = get_iommu_table_base(&pdev->dev);
> +		if (!tbl)
> +			continue;
> +
> +		/* Skip already initialized */
> +		if (tbl->it_group)
> +			continue;
> +
> +		grp = iommu_group_alloc();
> +		if (IS_ERR(grp)) {
> +			printk(KERN_INFO "tce_vfio: cannot create "
> +					"new IOMMU group, ret=%ld\n",
> +					PTR_ERR(grp));
> +			return PTR_ERR(grp);
> +		}
> +		tbl->it_group = grp;
> +		iommu_group_set_iommudata(grp, tbl, group_release);
> +	}
> +
> +	bus_register_notifier(&pci_bus_type, &tce_iommu_bus_nb);
> +
> +	/* Add PCI devices to VFIO groups */
> +	for_each_pci_dev(pdev)
> +		add_device(&pdev->dev);
> +
> +	return 0;
> +}
> +
> +static void __exit tce_iommu_cleanup(void)
> +{
> +	struct pci_dev *pdev = NULL;
> +	struct iommu_table *tbl;
> +	struct iommu_group *grp = NULL;
> +
> +	bus_unregister_notifier(&pci_bus_type, &tce_iommu_bus_nb);
> +
> +	/* Delete PCI devices from VFIO groups */
> +	for_each_pci_dev(pdev)
> +		del_device(&pdev->dev);
> +
> +	/* Release VFIO groups */
> +	for_each_pci_dev(pdev) {
> +		tbl = get_iommu_table_base(&pdev->dev);
> +		if (!tbl)
> +			continue;
> +		grp = tbl->it_group;
> +
> +		/* Skip (already) uninitialized */
> +		if (!grp)
> +			continue;
> +
> +		/* Do actual release, group_release() is expected to work */
> +		iommu_group_put(grp);
> +		BUG_ON(tbl->it_group);
> +	}
> +}
> +
> +module_init(tce_iommu_init);
> +module_exit(tce_iommu_cleanup);
> +#endif /* CONFIG_IOMMU_API */
> diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
> index 9f69b56..29d11dc 100644
> --- a/drivers/iommu/Kconfig
> +++ b/drivers/iommu/Kconfig
> @@ -187,4 +187,12 @@ config EXYNOS_IOMMU_DEBUG
>  
>  	  Say N unless you need kernel log message for IOMMU debugging
>  
> +config SPAPR_TCE_IOMMU
> +	bool "sPAPR TCE IOMMU Support"
> +	depends on PPC_POWERNV
> +	select IOMMU_API
> +	help
> +	  Enables the bits of the IOMMU API required by VFIO. The iommu_ops
> +	  callbacks are not implemented yet.
> +
>  endif # IOMMU_SUPPORT





* Re: [PATCH] vfio powerpc: enabled on powernv platform
@ 2012-11-28 21:30               ` Alex Williamson
  0 siblings, 0 replies; 122+ messages in thread
From: Alex Williamson @ 2012-11-28 21:30 UTC (permalink / raw)
  To: Alexey Kardashevskiy
  Cc: David Gibson, Paul Mackerras, linuxppc-dev, linux-kernel

On Wed, 2012-11-28 at 18:18 +1100, Alexey Kardashevskiy wrote:
> This patch initializes IOMMU groups based on the IOMMU
> configuration discovered during the PCI scan on POWERNV
> (POWER non virtualized) platform. The IOMMU groups are
> to be used later by VFIO driver (PCI pass through).
> 
> It also implements an API for mapping/unmapping pages for
> guest PCI drivers and providing DMA window properties.
> This API is going to be used later by QEMU-VFIO to handle
> h_put_tce hypercalls from the KVM guest.
> 
> Although this driver has been tested only on the POWERNV
> platform, it should work on any platform which supports
> TCE tables.
> 
> To enable VFIO on POWER, enable SPAPR_TCE_IOMMU config
> option and configure VFIO as required.
> 
> Cc: David Gibson <david@gibson.dropbear.id.au>
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> ---
>  arch/powerpc/include/asm/iommu.h     |    9 +++
>  arch/powerpc/kernel/iommu.c          |  147 ++++++++++++++++++++++++++++++++++
>  arch/powerpc/platforms/powernv/pci.c |  135 +++++++++++++++++++++++++++++++
>  drivers/iommu/Kconfig                |    8 ++
>  4 files changed, 299 insertions(+)
> 
> diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
> index cbfe678..5c7087a 100644
> --- a/arch/powerpc/include/asm/iommu.h
> +++ b/arch/powerpc/include/asm/iommu.h
> @@ -76,6 +76,9 @@ struct iommu_table {
>  	struct iommu_pool large_pool;
>  	struct iommu_pool pools[IOMMU_NR_POOLS];
>  	unsigned long *it_map;       /* A simple allocation bitmap for now */
> +#ifdef CONFIG_IOMMU_API
> +	struct iommu_group *it_group;
> +#endif
>  };
>  
>  struct scatterlist;
> @@ -147,5 +150,11 @@ static inline void iommu_restore(void)
>  }
>  #endif
>  
> +extern long iommu_clear_tces(struct iommu_table *tbl, unsigned long entry,
> +		unsigned long pages);
> +extern long iommu_put_tces(struct iommu_table *tbl, unsigned long entry,
> +		uint64_t tce, enum dma_data_direction direction,
> +		unsigned long pages);
> +
>  #endif /* __KERNEL__ */
>  #endif /* _ASM_IOMMU_H */
> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
> index ff5a6ce..1456b6e 100644
> --- a/arch/powerpc/kernel/iommu.c
> +++ b/arch/powerpc/kernel/iommu.c
> @@ -44,6 +44,7 @@
>  #include <asm/kdump.h>
>  #include <asm/fadump.h>
>  #include <asm/vio.h>
> +#include <asm/tce.h>
>  
>  #define DBG(...)
>  
> @@ -856,3 +857,149 @@ void iommu_free_coherent(struct iommu_table *tbl, size_t size,
>  		free_pages((unsigned long)vaddr, get_order(size));
>  	}
>  }
> +
> +#ifdef CONFIG_IOMMU_API
> +/*
> + * SPAPR TCE API
> + */
> +static void tce_flush(struct iommu_table *tbl)
> +{
> +	/* Flush/invalidate TLB caches if necessary */
> +	if (ppc_md.tce_flush)
> +		ppc_md.tce_flush(tbl);
> +
> +	/* Make sure updates are seen by hardware */
> +	mb();
> +}
> +
> +/*
> + * clear_tces_nolock clears TCEs and returns the number of pages
> + * on which it called put_page().
> + */
> +static long clear_tces_nolock(struct iommu_table *tbl, unsigned long entry,
> +		unsigned long pages)
> +{
> +	int i, pages_put = 0;
> +	unsigned long oldtce;
> +	struct page *page;
> +
> +	for (i = 0; i < pages; ++i) {
> +		oldtce = ppc_md.tce_get(tbl, entry + i);
> +		ppc_md.tce_free(tbl, entry + i, 1);
> +
> +		if (!(oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)))
> +			continue;
> +
> +		page = pfn_to_page(oldtce >> PAGE_SHIFT);
> +
> +		WARN_ON(!page);
> +		if (!page)
> +			continue;
> +
> +		if (oldtce & TCE_PCI_WRITE)
> +			SetPageDirty(page);
> +
> +		++pages_put;
> +		put_page(page);
> +	}
> +
> +	return pages_put;
> +}
> +
> +/*
> + * iommu_clear_tces clears TCEs and returns the number of released pages
> + */
> +long iommu_clear_tces(struct iommu_table *tbl, unsigned long entry,
> +		unsigned long pages)
> +{
> +	int ret;
> +	struct iommu_pool *pool = get_pool(tbl, entry);
> +
> +	spin_lock(&(pool->lock));
> +	ret = clear_tces_nolock(tbl, entry, pages);
> +	tce_flush(tbl);
> +	spin_unlock(&(pool->lock));
> +
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(iommu_clear_tces);
> +
> +static int put_tce(struct iommu_table *tbl, unsigned long entry,
> +		uint64_t tce, enum dma_data_direction direction)
> +{
> +	int ret;
> +	struct page *page = NULL;
> +	unsigned long kva, offset;
> +
> +	/* Map new TCE */
> +	offset = (tce & IOMMU_PAGE_MASK) - (tce & PAGE_MASK);
> +
> +	ret = get_user_pages_fast(tce & PAGE_MASK, 1,
> +			direction != DMA_TO_DEVICE, &page);
> +	if (ret < 1) {
> +		printk(KERN_ERR "tce_vfio: get_user_pages_fast failed tce=%llx ioba=%lx ret=%d\n",
> +				tce, entry << IOMMU_PAGE_SHIFT, ret);
> +		if (!ret)
> +			ret = -EFAULT;
> +		return ret;
> +	}
> +
> +	kva = (unsigned long) page_address(page);
> +	kva += offset;
> +
> +	/* tce_build receives a virtual address */
> +	entry += tbl->it_offset; /* Offset into real TCE table */
> +	ret = ppc_md.tce_build(tbl, entry, 1, kva, direction, NULL);
> +
> +	/* tce_build() only returns non-zero for transient errors */
> +	if (unlikely(ret)) {
> +		printk(KERN_ERR "tce_vfio: tce_put failed on tce=%llx ioba=%lx kva=%lx ret=%d\n",
> +				tce, entry << IOMMU_PAGE_SHIFT, kva, ret);
> +		put_page(page);
> +		return -EIO;
> +	}
> +
> +	return 0;
> +}
> +
> +/*
> + * iommu_put_tces builds TCEs and returns the number of actually locked pages
> + */
> +long iommu_put_tces(struct iommu_table *tbl, unsigned long entry,
> +		uint64_t tce, enum dma_data_direction direction,
> +		unsigned long pages)
> +{
> +	int i, ret = 0;
> +	struct iommu_pool *pool = get_pool(tbl, entry);
> +
> +	BUILD_BUG_ON(PAGE_SIZE < IOMMU_PAGE_SIZE);
> +	BUG_ON(direction == DMA_NONE);
> +
> +	spin_lock(&(pool->lock));
> +
> +	/* Check if any entry is already in use */
> +	for (i = 0; i < pages; ++i) {
> +		unsigned long oldtce = ppc_md.tce_get(tbl, entry + i);
> +		if (oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)) {
> +			spin_unlock(&(pool->lock));
> +			return -EBUSY;
> +		}
> +	}
> +
> +	/* Put tces to the table */
> +	for (i = 0; (i < pages) && !ret; ++i, tce += IOMMU_PAGE_SIZE)
> +		ret = put_tce(tbl, entry + i, tce, direction);
> +
> +	/* On failure, release the locked pages; otherwise return the number of pages */
> +	if (ret)
> +		clear_tces_nolock(tbl, entry, i);
> +	else
> +		ret = pages;
> +
> +	tce_flush(tbl);
> +	spin_unlock(&(pool->lock));
> +
> +	return ret;
> +}

Nice, no more kmalloc!  I'm still concerned about the IOMMU page size
mismatch here.  If nothing else, the comment is misleading since we're
locking system pages, but returning tce pages.  The user would therefore
need to multiply their locked memory limit by (PAGE_SHIFT -
IOMMU_PAGE_SHIFT).  Thanks,

Alex

> +EXPORT_SYMBOL_GPL(iommu_put_tces);
> +#endif /* CONFIG_IOMMU_API */
> diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
> index 05205cf..21250ef 100644
> --- a/arch/powerpc/platforms/powernv/pci.c
> +++ b/arch/powerpc/platforms/powernv/pci.c
> @@ -20,6 +20,7 @@
>  #include <linux/irq.h>
>  #include <linux/io.h>
>  #include <linux/msi.h>
> +#include <linux/iommu.h>
>  
>  #include <asm/sections.h>
>  #include <asm/io.h>
> @@ -613,3 +614,137 @@ void __init pnv_pci_init(void)
>  	ppc_md.teardown_msi_irqs = pnv_teardown_msi_irqs;
>  #endif
>  }
> +
> +#ifdef CONFIG_IOMMU_API
> +/*
> + * IOMMU groups support required by VFIO
> + */
> +static int add_device(struct device *dev)
> +{
> +	struct iommu_table *tbl;
> +	int ret = 0;
> +
> +	if (WARN_ON(dev->iommu_group)) {
> +		printk(KERN_WARNING "tce_vfio: device %s is already in iommu group %d, skipping\n",
> +				dev_name(dev),
> +				iommu_group_id(dev->iommu_group));
> +		return -EBUSY;
> +	}
> +
> +	tbl = get_iommu_table_base(dev);
> +	if (!tbl) {
> +		pr_debug("tce_vfio: skipping device %s with no tbl\n",
> +				dev_name(dev));
> +		return 0;
> +	}
> +
> +	pr_debug("tce_vfio: adding %s to iommu group %d\n",
> +			dev_name(dev), iommu_group_id(tbl->it_group));
> +
> +	ret = iommu_group_add_device(tbl->it_group, dev);
> +	if (ret < 0)
> +		printk(KERN_ERR "tce_vfio: %s has not been added, ret=%d\n",
> +				dev_name(dev), ret);
> +
> +	return ret;
> +}
> +
> +static void del_device(struct device *dev)
> +{
> +	iommu_group_remove_device(dev);
> +}
> +
> +static int iommu_bus_notifier(struct notifier_block *nb,
> +			      unsigned long action, void *data)
> +{
> +	struct device *dev = data;
> +
> +	switch (action) {
> +	case BUS_NOTIFY_ADD_DEVICE:
> +		return add_device(dev);
> +	case BUS_NOTIFY_DEL_DEVICE:
> +		del_device(dev);
> +		return 0;
> +	default:
> +		return 0;
> +	}
> +}
> +
> +static struct notifier_block tce_iommu_bus_nb = {
> +	.notifier_call = iommu_bus_notifier,
> +};
> +
> +static void group_release(void *iommu_data)
> +{
> +	struct iommu_table *tbl = iommu_data;
> +	tbl->it_group = NULL;
> +}
> +
> +static int __init tce_iommu_init(void)
> +{
> +	struct pci_dev *pdev = NULL;
> +	struct iommu_table *tbl;
> +	struct iommu_group *grp;
> +
> +	/* Allocate and initialize IOMMU groups */
> +	for_each_pci_dev(pdev) {
> +		tbl = get_iommu_table_base(&pdev->dev);
> +		if (!tbl)
> +			continue;
> +
> +		/* Skip already initialized */
> +		if (tbl->it_group)
> +			continue;
> +
> +		grp = iommu_group_alloc();
> +		if (IS_ERR(grp)) {
> +			printk(KERN_INFO "tce_vfio: cannot create "
> +					"new IOMMU group, ret=%ld\n",
> +					PTR_ERR(grp));
> +			return PTR_ERR(grp);
> +		}
> +		tbl->it_group = grp;
> +		iommu_group_set_iommudata(grp, tbl, group_release);
> +	}
> +
> +	bus_register_notifier(&pci_bus_type, &tce_iommu_bus_nb);
> +
> +	/* Add PCI devices to VFIO groups */
> +	for_each_pci_dev(pdev)
> +		add_device(&pdev->dev);
> +
> +	return 0;
> +}
> +
> +static void __exit tce_iommu_cleanup(void)
> +{
> +	struct pci_dev *pdev = NULL;
> +	struct iommu_table *tbl;
> +	struct iommu_group *grp = NULL;
> +
> +	bus_unregister_notifier(&pci_bus_type, &tce_iommu_bus_nb);
> +
> +	/* Delete PCI devices from VFIO groups */
> +	for_each_pci_dev(pdev)
> +		del_device(&pdev->dev);
> +
> +	/* Release VFIO groups */
> +	for_each_pci_dev(pdev) {
> +		tbl = get_iommu_table_base(&pdev->dev);
> +		if (!tbl)
> +			continue;
> +		grp = tbl->it_group;
> +
> +		/* Skip (already) uninitialized */
> +		if (!grp)
> +			continue;
> +
> +		/* Do actual release, group_release() is expected to work */
> +		iommu_group_put(grp);
> +		BUG_ON(tbl->it_group);
> +	}
> +}
> +
> +module_init(tce_iommu_init);
> +module_exit(tce_iommu_cleanup);
> +#endif /* CONFIG_IOMMU_API */
> diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
> index 9f69b56..29d11dc 100644
> --- a/drivers/iommu/Kconfig
> +++ b/drivers/iommu/Kconfig
> @@ -187,4 +187,12 @@ config EXYNOS_IOMMU_DEBUG
>  
>  	  Say N unless you need kernel log message for IOMMU debugging
>  
> +config SPAPR_TCE_IOMMU
> +	bool "sPAPR TCE IOMMU Support"
> +	depends on PPC_POWERNV
> +	select IOMMU_API
> +	help
> +	  Enables the bits of the IOMMU API required by VFIO. The iommu_ops
> +	  callbacks are not implemented yet.
> +
>  endif # IOMMU_SUPPORT


* Re: [PATCH] vfio powerpc: implemented IOMMU driver for VFIO
  2012-11-28 21:01                       ` Alex Williamson
@ 2012-11-29  3:51                         ` Alexey Kardashevskiy
  -1 siblings, 0 replies; 122+ messages in thread
From: Alexey Kardashevskiy @ 2012-11-29  3:51 UTC (permalink / raw)
  To: Alex Williamson
  Cc: Benjamin Herrenschmidt, Paul Mackerras, linuxppc-dev,
	linux-kernel, kvm, David Gibson

On 29/11/12 08:01, Alex Williamson wrote:
> On Wed, 2012-11-28 at 18:21 +1100, Alexey Kardashevskiy wrote:
>> VFIO implements platform independent stuff such as
>> a PCI driver, BAR access (via read/write on a file descriptor
>> or direct mapping when possible) and IRQ signaling.
>>
>> The platform dependent part includes IOMMU initialization
>> and handling. This patch implements an IOMMU driver for VFIO
>> which does mapping/unmapping pages for the guest IO and
>> provides information about DMA window (required by a POWERPC
>> guest).
>>
>> The counterpart in QEMU is required to support this functionality.
>>
>> Cc: David Gibson <david@gibson.dropbear.id.au>
>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>> ---
>>   drivers/vfio/Kconfig                |    6 +
>>   drivers/vfio/Makefile               |    1 +
>>   drivers/vfio/vfio_iommu_spapr_tce.c |  332 +++++++++++++++++++++++++++++++++++
>>   include/linux/vfio.h                |   33 ++++
>>   4 files changed, 372 insertions(+)
>>   create mode 100644 drivers/vfio/vfio_iommu_spapr_tce.c
>>
>> diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
>> index 7cd5dec..b464687 100644
>> --- a/drivers/vfio/Kconfig
>> +++ b/drivers/vfio/Kconfig
>> @@ -3,10 +3,16 @@ config VFIO_IOMMU_TYPE1
>>   	depends on VFIO
>>   	default n
>>
>> +config VFIO_IOMMU_SPAPR_TCE
>> +	tristate
>> +	depends on VFIO && SPAPR_TCE_IOMMU
>> +	default n
>> +
>>   menuconfig VFIO
>>   	tristate "VFIO Non-Privileged userspace driver framework"
>>   	depends on IOMMU_API
>>   	select VFIO_IOMMU_TYPE1 if X86
>> +	select VFIO_IOMMU_SPAPR_TCE if PPC_POWERNV
>>   	help
>>   	  VFIO provides a framework for secure userspace device drivers.
>>   	  See Documentation/vfio.txt for more details.
>> diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
>> index 2398d4a..72bfabc 100644
>> --- a/drivers/vfio/Makefile
>> +++ b/drivers/vfio/Makefile
>> @@ -1,3 +1,4 @@
>>   obj-$(CONFIG_VFIO) += vfio.o
>>   obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o
>> +obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o
>>   obj-$(CONFIG_VFIO_PCI) += pci/
>> diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
>> new file mode 100644
>> index 0000000..b98770e
>> --- /dev/null
>> +++ b/drivers/vfio/vfio_iommu_spapr_tce.c
>> @@ -0,0 +1,332 @@
>> +/*
>> + * VFIO: IOMMU DMA mapping support for TCE on POWER
>> + *
>> + * Copyright (C) 2012 IBM Corp.  All rights reserved.
>> + *     Author: Alexey Kardashevskiy <aik@ozlabs.ru>
>> + *
>> + * This program is free software; you can redistribute it and/or modify
>> + * it under the terms of the GNU General Public License version 2 as
>> + * published by the Free Software Foundation.
>> + *
>> + * Derived from original vfio_iommu_type1.c:
>> + * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
>> + *     Author: Alex Williamson <alex.williamson@redhat.com>
>> + */
>> +
>> +#include <linux/module.h>
>> +#include <linux/pci.h>
>> +#include <linux/slab.h>
>> +#include <linux/uaccess.h>
>> +#include <linux/err.h>
>> +#include <linux/vfio.h>
>> +#include <asm/iommu.h>
>> +
>> +#define DRIVER_VERSION  "0.1"
>> +#define DRIVER_AUTHOR   "aik@ozlabs.ru"
>> +#define DRIVER_DESC     "VFIO IOMMU SPAPR TCE"
>> +
>> +static void tce_iommu_detach_group(void *iommu_data,
>> +		struct iommu_group *iommu_group);
>> +
>> +/*
>> + * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
>> + */
>> +
>> +/*
>> + * This code handles mapping and unmapping of user data buffers
>> + * into DMA'ble space using the IOMMU
>> + */
>> +
>> +#define NPAGE_TO_SIZE(npage)	((size_t)(npage) << PAGE_SHIFT)
>> +
>> +struct vwork {
>> +	struct mm_struct	*mm;
>> +	long			npage;
>> +	struct work_struct	work;
>> +};
>> +
>> +/* delayed decrement/increment for locked_vm */
>> +static void lock_acct_bg(struct work_struct *work)
>> +{
>> +	struct vwork *vwork = container_of(work, struct vwork, work);
>> +	struct mm_struct *mm;
>> +
>> +	mm = vwork->mm;
>> +	down_write(&mm->mmap_sem);
>> +	mm->locked_vm += vwork->npage;
>> +	up_write(&mm->mmap_sem);
>> +	mmput(mm);
>> +	kfree(vwork);
>> +}
>> +
>> +static void lock_acct(long npage)
>> +{
>> +	struct vwork *vwork;
>> +	struct mm_struct *mm;
>> +
>> +	if (!current->mm)
>> +		return; /* process exited */
>> +
>> +	if (down_write_trylock(&current->mm->mmap_sem)) {
>> +		current->mm->locked_vm += npage;
>> +		up_write(&current->mm->mmap_sem);
>> +		return;
>> +	}
>> +
>> +	/*
>> +	 * Couldn't get mmap_sem lock, so must setup to update
>> +	 * mm->locked_vm later. If locked_vm were atomic, we
>> +	 * wouldn't need this silliness
>> +	 */
>> +	vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL);
>> +	if (!vwork)
>> +		return;
>> +	mm = get_task_mm(current);
>> +	if (!mm) {
>> +		kfree(vwork);
>> +		return;
>> +	}
>> +	INIT_WORK(&vwork->work, lock_acct_bg);
>> +	vwork->mm = mm;
>> +	vwork->npage = npage;
>> +	schedule_work(&vwork->work);
>> +}
>
> This looks familiar, should we split it out to a common file instead of
> duplicating it?

It is a simple cut-and-paste from the type1 driver :)
Moving it to a separate file is up to you, but it is quite a small piece of 
code to move anywhere, and I have not fixed the rlimit handling yet, so 
please wait a bit.


>> +
>> +/*
>> + * The container descriptor supports only a single group per container.
>> + * Required by the API as the container is not supplied with the IOMMU group
>> + * at the moment of initialization.
>> + */
>> +struct tce_container {
>> +	struct mutex lock;
>> +	struct iommu_table *tbl;
>> +};
>> +
>> +static void *tce_iommu_open(unsigned long arg)
>> +{
>> +	struct tce_container *container;
>> +
>> +	if (arg != VFIO_SPAPR_TCE_IOMMU) {
>> +		pr_err("tce_vfio: Wrong IOMMU type\n");
>> +		return ERR_PTR(-EINVAL);
>> +	}
>> +
>> +	container = kzalloc(sizeof(*container), GFP_KERNEL);
>> +	if (!container)
>> +		return ERR_PTR(-ENOMEM);
>> +
>> +	mutex_init(&container->lock);
>> +
>> +	return container;
>> +}
>> +
>> +static void tce_iommu_release(void *iommu_data)
>> +{
>> +	struct tce_container *container = iommu_data;
>> +
>> +	WARN_ON(container->tbl && !container->tbl->it_group);
>> +	if (container->tbl && container->tbl->it_group)
>> +		tce_iommu_detach_group(iommu_data, container->tbl->it_group);
>> +
>> +	mutex_destroy(&container->lock);
>> +
>> +	kfree(container);
>> +}
>> +
>> +static long tce_iommu_ioctl(void *iommu_data,
>> +				 unsigned int cmd, unsigned long arg)
>> +{
>> +	struct tce_container *container = iommu_data;
>> +	unsigned long minsz;
>> +	long ret;
>> +
>> +	switch (cmd) {
>> +	case VFIO_CHECK_EXTENSION: {
>> +		return (arg == VFIO_SPAPR_TCE_IOMMU) ? 1 : 0;
>> +	}
>> +	case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
>> +		struct vfio_iommu_spapr_tce_info info;
>> +		struct iommu_table *tbl = container->tbl;
>> +
>> +		if (WARN_ON(!tbl))
>> +			return -ENXIO;
>> +
>> +		minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
>> +				dma64_window_size);
>> +
>> +		if (copy_from_user(&info, (void __user *)arg, minsz))
>> +			return -EFAULT;
>> +
>> +		if (info.argsz < minsz)
>> +			return -EINVAL;
>> +
>> +		info.dma32_window_start = tbl->it_offset << IOMMU_PAGE_SHIFT;
>> +		info.dma32_window_size = tbl->it_size << IOMMU_PAGE_SHIFT;
>> +		info.dma64_window_start = 0;
>> +		info.dma64_window_size = 0;
>> +		info.flags = 0;
>> +
>> +		if (copy_to_user((void __user *)arg, &info, minsz))
>> +			return -EFAULT;
>> +
>> +		return 0;
>> +	}
>> +	case VFIO_IOMMU_MAP_DMA: {
>> +		vfio_iommu_spapr_tce_dma_map param;
>> +		struct iommu_table *tbl = container->tbl;
>> +		enum dma_data_direction direction;
>> +		unsigned long locked, lock_limit;
>> +
>> +		if (WARN_ON(!tbl))
>> +			return -ENXIO;
>> +
>> +		minsz = offsetofend(vfio_iommu_spapr_tce_dma_map, size);
>> +
>> +		if (copy_from_user(&param, (void __user *)arg, minsz))
>> +			return -EFAULT;
>> +
>> +		if (param.argsz < minsz)
>> +			return -EINVAL;
>> +
>> +		if ((param.flags & VFIO_DMA_MAP_FLAG_READ) &&
>> +				(param.flags & VFIO_DMA_MAP_FLAG_WRITE))
>> +			direction = DMA_BIDIRECTIONAL;
>> +		else if (param.flags & VFIO_DMA_MAP_FLAG_READ)
>> +			direction = DMA_TO_DEVICE;
>> +		else if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
>> +			direction = DMA_FROM_DEVICE;
>> +		else
>> +			return -EINVAL;
>> +
>> +		if ((param.size & ~IOMMU_PAGE_MASK) ||
>> +				(param.iova & ~IOMMU_PAGE_MASK) ||
>> +				(param.vaddr & ~IOMMU_PAGE_MASK))
>> +			return -EINVAL;
>> +
>> +		/* Account for locked pages */
>> +		locked = current->mm->locked_vm +
>> +				(param.size >> IOMMU_PAGE_SHIFT);
>> +		lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
>
> This page accounting doesn't look right.  PAGE_SIZE is several orders
> bigger than IOMMU_PAGE_SIZE (right?), but we mix them here, which seems
> like it will over penalize the user.  For example, if a user maps 4x4k
> (assume aligned and contiguous) IOMMU pages, isn't that only a single
> pinned system page (assuming >=16k pages).

Oops, my bad. IOMMU_PAGE_SHIFT should be PAGE_SHIFT, and it should return 
the number of system pages.

But we do not track 4K pages, so I do not see any easy solution here, except 
fixing iommu_put_tces/iommu_clear_tces (*) to count only the very first 
4K IOMMU page within each 64K system page.
This won't be too accurate, but it should work, no?

I'll post it as a patch in reply to "vfio powerpc: enabled on powernv 
platform".
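
[Editor's sketch of the counting rule described above; this is an
illustration, not the posted patch (the real hunk appears in the v2 patch
later in the thread), and it assumes kernel context where IOMMU_PAGE_SIZE
and PAGE_MASK are defined as in this thread:]

/*
 * Count a system page only when a TCE is aligned to a system-page
 * boundary, i.e. when it is the first 4K IOMMU page within a 64K page.
 */
static unsigned long count_system_pages(uint64_t tce, unsigned long npages)
{
	unsigned long i, syspages = 0;

	for (i = 0; i < npages; ++i, tce += IOMMU_PAGE_SIZE)
		if (!(tce & ~PAGE_MASK))
			++syspages;

	return syspages;
}

[As noted above, this undercounts when the mapped range starts in the
middle of a system page, which is the inaccuracy being accepted.]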



>> +		if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
>> +			pr_warn("RLIMIT_MEMLOCK (%ld) exceeded\n",
>> +					rlimit(RLIMIT_MEMLOCK));
>> +			return -ENOMEM;
>> +		}
>> +
>> +		ret = iommu_put_tces(tbl, param.iova >> IOMMU_PAGE_SHIFT,
>> +				param.vaddr, direction,
>> +				param.size >> IOMMU_PAGE_SHIFT);
>> +		if (ret > 0)
>> +			lock_acct(ret);
>> +
>> +		return ret;
>> +	}
>> +	case VFIO_IOMMU_UNMAP_DMA: {
>> +		vfio_iommu_spapr_tce_dma_unmap param;
>> +		struct iommu_table *tbl = container->tbl;
>> +
>> +		if (WARN_ON(!tbl))
>> +			return -ENXIO;
>> +
>> +		minsz = offsetofend(vfio_iommu_spapr_tce_dma_unmap, size);
>> +
>> +		if (copy_from_user(&param, (void __user *)arg, minsz))
>> +			return -EFAULT;
>> +
>> +		if (param.argsz < minsz)
>> +			return -EINVAL;
>> +
>> +		if ((param.size & ~IOMMU_PAGE_MASK) ||
>> +				(param.iova & ~IOMMU_PAGE_MASK))
>> +			return -EINVAL;
>> +
>> +		ret = iommu_clear_tces(tbl, param.iova >> IOMMU_PAGE_SHIFT,
>> +				param.size >> IOMMU_PAGE_SHIFT);
>> +		if (ret > 0)
>> +			lock_acct(-ret);
>> +
>> +		return ret;
>> +	}
>> +	default:
>> +		pr_warn("tce_vfio: unexpected cmd %x\n", cmd);
>> +	}
>> +
>> +	return -ENOTTY;
>> +}
>> +
>> +static int tce_iommu_attach_group(void *iommu_data,
>> +		struct iommu_group *iommu_group)
>> +{
>> +	struct tce_container *container = iommu_data;
>> +	struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
>> +
>> +	BUG_ON(!tbl);
>> +	mutex_lock(&container->lock);
>> +	pr_debug("tce_vfio: Attaching group #%u to iommu %p\n",
>> +			iommu_group_id(iommu_group), iommu_group);
>> +	if (container->tbl) {
>> +		pr_warn("tce_vfio: Only one group per IOMMU container is allowed, existing id=%d, attaching id=%d\n",
>> +				iommu_group_id(container->tbl->it_group),
>> +				iommu_group_id(iommu_group));
>> +		mutex_unlock(&container->lock);
>> +		return -EBUSY;
>> +	}
>> +
>> +	container->tbl = tbl;
>> +	iommu_clear_tces(tbl, tbl->it_offset, tbl->it_size);
>> +	mutex_unlock(&container->lock);
>> +
>> +	return 0;
>> +}
>> +
>> +static void tce_iommu_detach_group(void *iommu_data,
>> +		struct iommu_group *iommu_group)
>> +{
>> +	struct tce_container *container = iommu_data;
>> +	struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
>> +
>> +	BUG_ON(!tbl);
>> +	mutex_lock(&container->lock);
>> +	if (tbl != container->tbl) {
>> +		pr_warn("tce_vfio: detaching group #%u, expected group is #%u\n",
>> +				iommu_group_id(iommu_group),
>> +				iommu_group_id(tbl->it_group));
>> +	} else {
>> +
>> +		pr_debug("tce_vfio: detaching group #%u from iommu %p\n",
>> +				iommu_group_id(iommu_group), iommu_group);
>> +
>> +		iommu_clear_tces(tbl, tbl->it_offset, tbl->it_size);
>> +		container->tbl = NULL;
>> +	}
>> +	mutex_unlock(&container->lock);
>> +}
>> +
>> +const struct vfio_iommu_driver_ops tce_iommu_driver_ops = {
>> +	.name		= "iommu-vfio-powerpc",
>> +	.owner		= THIS_MODULE,
>> +	.open		= tce_iommu_open,
>> +	.release	= tce_iommu_release,
>> +	.ioctl		= tce_iommu_ioctl,
>> +	.attach_group	= tce_iommu_attach_group,
>> +	.detach_group	= tce_iommu_detach_group,
>> +};
>> +
>> +static int __init tce_iommu_init(void)
>> +{
>> +	return vfio_register_iommu_driver(&tce_iommu_driver_ops);
>> +}
>> +
>> +static void __exit tce_iommu_cleanup(void)
>> +{
>> +	vfio_unregister_iommu_driver(&tce_iommu_driver_ops);
>> +}
>> +
>> +module_init(tce_iommu_init);
>> +module_exit(tce_iommu_cleanup);
>> +
>> +MODULE_VERSION(DRIVER_VERSION);
>> +MODULE_LICENSE("GPL v2");
>> +MODULE_AUTHOR(DRIVER_AUTHOR);
>> +MODULE_DESCRIPTION(DRIVER_DESC);
>> +
>> diff --git a/include/linux/vfio.h b/include/linux/vfio.h
>> index 0a4f180..820af1e 100644
>> --- a/include/linux/vfio.h
>> +++ b/include/linux/vfio.h
>> @@ -99,6 +99,7 @@ extern void vfio_unregister_iommu_driver(
>>   /* Extensions */
>>
>>   #define VFIO_TYPE1_IOMMU		1
>> +#define VFIO_SPAPR_TCE_IOMMU		2
>>
>>   /*
>>    * The IOCTL interface is designed for extensibility by embedding the
>> @@ -442,4 +443,36 @@ struct vfio_iommu_type1_dma_unmap {
>>
>>   #define VFIO_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14)
>>
>> +/* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */
>> +
>> +/*
>> + * The SPAPR TCE info struct provides the information about the PCI bus
>> + * address ranges available for DMA, these values are programmed into
>> + * the hardware so the guest has to know that information.
>> + *
>> + * Pages within 32 bit window should be explicitely mapped/unmapped via ioctls.
>                                            ^^^^^^^^^^^
> explicitly
>
>> + * 64 bit window (not supported at the moment for the guest) is supposed to
>> + * be mapped completely to the guest memory so the devices capable of 64bit
>> + * DMA will not have to use map/unmap ioctls.
>> + *
>> + * The IOMMU page size is always 4K.
>> + */
>
> Thanks,
>
> Alex
>
>> +
>> +struct vfio_iommu_spapr_tce_info {
>> +	__u32 argsz;
>> +	__u32 flags;			/* reserved for future use */
>> +	__u32 dma32_window_start;	/* 32 bit window start (bytes) */
>> +	__u32 dma32_window_size;	/* 32 bit window size (bytes) */
>> +	__u64 dma64_window_start;	/* 64 bit window start (bytes) */
>> +	__u64 dma64_window_size;	/* 64 bit window size (bytes) */
>> +};
>> +
>> +#define VFIO_IOMMU_SPAPR_TCE_GET_INFO	_IO(VFIO_TYPE, VFIO_BASE + 12)
>> +
>> +/* Reuse type1 map/unmap structs as they are the same at the moment */
>> +typedef struct vfio_iommu_type1_dma_map vfio_iommu_spapr_tce_dma_map;
>> +typedef struct vfio_iommu_type1_dma_unmap vfio_iommu_spapr_tce_dma_unmap;
>> +
>> +/* ***************************************************************** */
>> +
>>   #endif /* VFIO_H */
>
>
>
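
[Editor's note: a rough userspace sketch of the intended calling sequence
for this API; this is an illustration only, with a made-up group number,
no error handling, and it assumes the standard VFIO container/group
chardevs described in Documentation/vfio.txt:]

#include <fcntl.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

void spapr_tce_map_example(void)
{
	int container = open("/dev/vfio/vfio", O_RDWR);
	int group = open("/dev/vfio/26", O_RDWR);	/* "26" is hypothetical */
	struct vfio_iommu_spapr_tce_info info = { .argsz = sizeof(info) };
	vfio_iommu_spapr_tce_dma_map map = { .argsz = sizeof(map) };
	void *buf;

	/* Bind the group to the container, select the SPAPR TCE backend */
	ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
	ioctl(container, VFIO_SET_IOMMU, VFIO_SPAPR_TCE_IOMMU);

	/* Discover the 32-bit DMA window to advertise to the guest */
	ioctl(container, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);

	/* Map one 4K IOMMU page at the start of the window */
	posix_memalign(&buf, 4096, 4096);
	map.vaddr = (unsigned long)buf;
	map.iova = info.dma32_window_start;
	map.size = 4096;	/* the IOMMU page size is always 4K */
	map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
	ioctl(container, VFIO_IOMMU_MAP_DMA, &map);
}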


-- 
Alexey

* [PATCH] vfio powerpc: enabled on powernv platform
  2012-11-28 21:30               ` Alex Williamson
@ 2012-11-29  3:53                 ` Alexey Kardashevskiy
  -1 siblings, 0 replies; 122+ messages in thread
From: Alexey Kardashevskiy @ 2012-11-29  3:53 UTC (permalink / raw)
  To: Alex Williamson
  Cc: Alexey Kardashevskiy, Benjamin Herrenschmidt, Paul Mackerras,
	linuxppc-dev, linux-kernel, David Gibson

This patch initializes IOMMU groups based on the IOMMU
configuration discovered during the PCI scan on POWERNV
(POWER non virtualized) platform. The IOMMU groups are
to be used later by VFIO driver (PCI pass through).

It also implements an API for mapping/unmapping pages for
guest PCI drivers and providing DMA window properties.
This API is going to be used later by QEMU-VFIO to handle
h_put_tce hypercalls from the KVM guest.

Although this driver has been tested only on the POWERNV
platform, it should work on any platform which supports
TCE tables.

To enable VFIO on POWER, enable SPAPR_TCE_IOMMU config
option and configure VFIO as required.

Cc: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
 arch/powerpc/include/asm/iommu.h     |    9 ++
 arch/powerpc/kernel/iommu.c          |  159 ++++++++++++++++++++++++++++++++++
 arch/powerpc/platforms/powernv/pci.c |  135 +++++++++++++++++++++++++++++
 drivers/iommu/Kconfig                |    8 ++
 4 files changed, 311 insertions(+)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index cbfe678..5c7087a 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -76,6 +76,9 @@ struct iommu_table {
 	struct iommu_pool large_pool;
 	struct iommu_pool pools[IOMMU_NR_POOLS];
 	unsigned long *it_map;       /* A simple allocation bitmap for now */
+#ifdef CONFIG_IOMMU_API
+	struct iommu_group *it_group;
+#endif
 };
 
 struct scatterlist;
@@ -147,5 +150,11 @@ static inline void iommu_restore(void)
 }
 #endif
 
+extern long iommu_clear_tces(struct iommu_table *tbl, unsigned long entry,
+		unsigned long pages);
+extern long iommu_put_tces(struct iommu_table *tbl, unsigned long entry,
+		uint64_t tce, enum dma_data_direction direction,
+		unsigned long pages);
+
 #endif /* __KERNEL__ */
 #endif /* _ASM_IOMMU_H */
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index ff5a6ce..1225fbb 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -44,6 +44,7 @@
 #include <asm/kdump.h>
 #include <asm/fadump.h>
 #include <asm/vio.h>
+#include <asm/tce.h>
 
 #define DBG(...)
 
@@ -856,3 +857,161 @@ void iommu_free_coherent(struct iommu_table *tbl, size_t size,
 		free_pages((unsigned long)vaddr, get_order(size));
 	}
 }
+
+#ifdef CONFIG_IOMMU_API
+/*
+ * SPAPR TCE API
+ */
+static void tce_flush(struct iommu_table *tbl)
+{
+	/* Flush/invalidate TLB caches if necessary */
+	if (ppc_md.tce_flush)
+		ppc_md.tce_flush(tbl);
+
+	/* Make sure updates are seen by hardware */
+	mb();
+}
+
+/*
+ * clear_tces_nolock clears TCEs and returns the number of pages
+ * on which it called put_page().
+ */
+static long clear_tces_nolock(struct iommu_table *tbl, unsigned long entry,
+		unsigned long pages)
+{
+	int i, retpages = 0;
+	unsigned long oldtce;
+	struct page *page;
+
+	for (i = 0; i < pages; ++i) {
+		oldtce = ppc_md.tce_get(tbl, entry + i);
+		ppc_md.tce_free(tbl, entry + i, 1);
+
+		if (!(oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)))
+			continue;
+
+		page = pfn_to_page(oldtce >> PAGE_SHIFT);
+
+		WARN_ON(!page);
+		if (!page)
+			continue;
+
+		if (oldtce & TCE_PCI_WRITE)
+			SetPageDirty(page);
+
+		if (!(oldtce & ~PAGE_MASK))
+			++retpages;
+
+		put_page(page);
+	}
+
+	return retpages;
+}
+
+/*
+ * iommu_clear_tces clears TCEs and returns the number of released pages
+ */
+long iommu_clear_tces(struct iommu_table *tbl, unsigned long entry,
+		unsigned long pages)
+{
+	int ret;
+	struct iommu_pool *pool = get_pool(tbl, entry);
+
+	spin_lock(&(pool->lock));
+	ret = clear_tces_nolock(tbl, entry, pages);
+	tce_flush(tbl);
+	spin_unlock(&(pool->lock));
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_clear_tces);
+
+static int put_tce(struct iommu_table *tbl, unsigned long entry,
+		uint64_t tce, enum dma_data_direction direction)
+{
+	int ret;
+	struct page *page = NULL;
+	unsigned long kva, offset;
+
+	/* Map new TCE */
+	offset = (tce & IOMMU_PAGE_MASK) - (tce & PAGE_MASK);
+
+	ret = get_user_pages_fast(tce & PAGE_MASK, 1,
+			direction != DMA_TO_DEVICE, &page);
+	if (ret < 1) {
+		printk(KERN_ERR "tce_vfio: get_user_pages_fast failed tce=%llx ioba=%lx ret=%d\n",
+				tce, entry << IOMMU_PAGE_SHIFT, ret);
+		if (!ret)
+			ret = -EFAULT;
+		return ret;
+	}
+
+	kva = (unsigned long) page_address(page);
+	kva += offset;
+
+	/* tce_build receives a virtual address */
+	entry += tbl->it_offset; /* Offset into real TCE table */
+	ret = ppc_md.tce_build(tbl, entry, 1, kva, direction, NULL);
+
+	/* tce_build() only returns non-zero for transient errors */
+	if (unlikely(ret)) {
+		printk(KERN_ERR "tce_vfio: tce_put failed on tce=%llx ioba=%lx kva=%lx ret=%d\n",
+				tce, entry << IOMMU_PAGE_SHIFT, kva, ret);
+		put_page(page);
+		return -EIO;
+	}
+
+	return 0;
+}
+
+/*
+ * iommu_put_tces builds TCEs and returns the number of actually
+ * locked system pages
+ */
+long iommu_put_tces(struct iommu_table *tbl, unsigned long entry,
+		uint64_t tce, enum dma_data_direction direction,
+		unsigned long pages)
+{
+	int i, ret = 0, retpages = 0;
+	struct iommu_pool *pool = get_pool(tbl, entry);
+
+	BUILD_BUG_ON(PAGE_SIZE < IOMMU_PAGE_SIZE);
+	BUG_ON(direction == DMA_NONE);
+
+	spin_lock(&(pool->lock));
+
+	/* Check if any entry is already in use */
+	for (i = 0; i < pages; ++i) {
+		unsigned long oldtce = ppc_md.tce_get(tbl, entry + i);
+		if (oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)) {
+			spin_unlock(&(pool->lock));
+			return -EBUSY;
+		}
+	}
+
+	/* Put tces to the table */
+	for (i = 0; (i < pages) && !ret; ++i, tce += IOMMU_PAGE_SIZE) {
+		ret = put_tce(tbl, entry + i, tce, direction);
+		/*
+		 * As the IOMMU page size is always 4K while the system page
+		 * size can be 64K, and there is no special tracking for
+		 * IOMMU pages, we only do the rlimit check/update for the
+		 * very first 4K IOMMU page within each 64K system page.
+		 */
+		if (!(tce & ~PAGE_MASK))
+			++retpages;
+	}
+
+	/* On failure, release the locked pages; otherwise return the number of pages */
+	if (ret)
+		clear_tces_nolock(tbl, entry, i);
+	else
+		ret = retpages;
+
+	tce_flush(tbl);
+	spin_unlock(&(pool->lock));
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_put_tces);
+#endif /* CONFIG_IOMMU_API */
diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index 05205cf..21250ef 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -20,6 +20,7 @@
 #include <linux/irq.h>
 #include <linux/io.h>
 #include <linux/msi.h>
+#include <linux/iommu.h>
 
 #include <asm/sections.h>
 #include <asm/io.h>
@@ -613,3 +614,137 @@ void __init pnv_pci_init(void)
 	ppc_md.teardown_msi_irqs = pnv_teardown_msi_irqs;
 #endif
 }
+
+#ifdef CONFIG_IOMMU_API
+/*
+ * IOMMU groups support required by VFIO
+ */
+static int add_device(struct device *dev)
+{
+	struct iommu_table *tbl;
+	int ret = 0;
+
+	if (WARN_ON(dev->iommu_group)) {
+		printk(KERN_WARNING "tce_vfio: device %s is already in iommu group %d, skipping\n",
+				dev_name(dev),
+				iommu_group_id(dev->iommu_group));
+		return -EBUSY;
+	}
+
+	tbl = get_iommu_table_base(dev);
+	if (!tbl) {
+		pr_debug("tce_vfio: skipping device %s with no tbl\n",
+				dev_name(dev));
+		return 0;
+	}
+
+	pr_debug("tce_vfio: adding %s to iommu group %d\n",
+			dev_name(dev), iommu_group_id(tbl->it_group));
+
+	ret = iommu_group_add_device(tbl->it_group, dev);
+	if (ret < 0)
+		printk(KERN_ERR "tce_vfio: %s has not been added, ret=%d\n",
+				dev_name(dev), ret);
+
+	return ret;
+}
+
+static void del_device(struct device *dev)
+{
+	iommu_group_remove_device(dev);
+}
+
+static int iommu_bus_notifier(struct notifier_block *nb,
+			      unsigned long action, void *data)
+{
+	struct device *dev = data;
+
+	switch (action) {
+	case BUS_NOTIFY_ADD_DEVICE:
+		return add_device(dev);
+	case BUS_NOTIFY_DEL_DEVICE:
+		del_device(dev);
+		return 0;
+	default:
+		return 0;
+	}
+}
+
+static struct notifier_block tce_iommu_bus_nb = {
+	.notifier_call = iommu_bus_notifier,
+};
+
+static void group_release(void *iommu_data)
+{
+	struct iommu_table *tbl = iommu_data;
+	tbl->it_group = NULL;
+}
+
+static int __init tce_iommu_init(void)
+{
+	struct pci_dev *pdev = NULL;
+	struct iommu_table *tbl;
+	struct iommu_group *grp;
+
+	/* Allocate and initialize IOMMU groups */
+	for_each_pci_dev(pdev) {
+		tbl = get_iommu_table_base(&pdev->dev);
+		if (!tbl)
+			continue;
+
+		/* Skip already initialized */
+		if (tbl->it_group)
+			continue;
+
+		grp = iommu_group_alloc();
+		if (IS_ERR(grp)) {
+			printk(KERN_INFO "tce_vfio: cannot create "
+					"new IOMMU group, ret=%ld\n",
+					PTR_ERR(grp));
+			return PTR_ERR(grp);
+		}
+		tbl->it_group = grp;
+		iommu_group_set_iommudata(grp, tbl, group_release);
+	}
+
+	bus_register_notifier(&pci_bus_type, &tce_iommu_bus_nb);
+
+	/* Add PCI devices to VFIO groups */
+	for_each_pci_dev(pdev)
+		add_device(&pdev->dev);
+
+	return 0;
+}
+
+static void __exit tce_iommu_cleanup(void)
+{
+	struct pci_dev *pdev = NULL;
+	struct iommu_table *tbl;
+	struct iommu_group *grp = NULL;
+
+	bus_unregister_notifier(&pci_bus_type, &tce_iommu_bus_nb);
+
+	/* Delete PCI devices from VFIO groups */
+	for_each_pci_dev(pdev)
+		del_device(&pdev->dev);
+
+	/* Release VFIO groups */
+	for_each_pci_dev(pdev) {
+		tbl = get_iommu_table_base(&pdev->dev);
+		if (!tbl)
+			continue;
+		grp = tbl->it_group;
+
+		/* Skip (already) uninitialized */
+		if (!grp)
+			continue;
+
+		/* Do actual release, group_release() is expected to work */
+		iommu_group_put(grp);
+		BUG_ON(tbl->it_group);
+	}
+}
+
+module_init(tce_iommu_init);
+module_exit(tce_iommu_cleanup);
+#endif /* CONFIG_IOMMU_API */
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index 9f69b56..29d11dc 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -187,4 +187,12 @@ config EXYNOS_IOMMU_DEBUG
 
 	  Say N unless you need kernel log message for IOMMU debugging
 
+config SPAPR_TCE_IOMMU
+	bool "sPAPR TCE IOMMU Support"
+	depends on PPC_POWERNV
+	select IOMMU_API
+	help
+	  Enables the bits of the IOMMU API required by VFIO.
+	  iommu_ops itself is not implemented yet.
+
 endif # IOMMU_SUPPORT
-- 
1.7.10.4


^ permalink raw reply related	[flat|nested] 122+ messages in thread

* [PATCH] vfio powerpc: enabled on powernv platform
@ 2012-11-29  3:53                 ` Alexey Kardashevskiy
  0 siblings, 0 replies; 122+ messages in thread
From: Alexey Kardashevskiy @ 2012-11-29  3:53 UTC (permalink / raw)
  To: Alex Williamson
  Cc: Alexey Kardashevskiy, linux-kernel, Paul Mackerras, linuxppc-dev,
	David Gibson

This patch initializes IOMMU groups based on the IOMMU
configuration discovered during the PCI scan on the POWERNV
(POWER non-virtualized) platform. The IOMMU groups are
to be used later by the VFIO driver (PCI passthrough).

It also implements an API for mapping/unmapping pages for
guest PCI drivers and for providing DMA window properties.
This API is going to be used later by QEMU-VFIO to handle
h_put_tce hypercalls from the KVM guest.

Although this driver has been tested only on the POWERNV
platform, it should work on any platform which supports
TCE tables.

To enable VFIO on POWER, enable SPAPR_TCE_IOMMU config
option and configure VFIO as required.
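
For illustration, here is a rough sketch of how a VFIO backend might
drive the new API; map_guest_range() is hypothetical and the rlimit
accounting is left to the caller, only iommu_put_tces() and
iommu_clear_tces() come from this patch:

	/* Hypothetical VFIO DMA-map handler built on the new calls */
	static long map_guest_range(struct iommu_table *tbl,
			unsigned long ioba, uint64_t tce,
			unsigned long size)
	{
		unsigned long entry = ioba >> IOMMU_PAGE_SHIFT;
		unsigned long npages = size >> IOMMU_PAGE_SHIFT;
		long locked;

		/* Builds TCEs, returns locked system pages or -errno */
		locked = iommu_put_tces(tbl, entry, tce,
				DMA_BIDIRECTIONAL, npages);
		if (locked < 0)
			return locked;

		/* The caller charges 'locked' to RLIMIT_MEMLOCK */
		return locked;
	}

The unmap path would call iommu_clear_tces(tbl, entry, npages) and
credit the returned count back against the same limit.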

Cc: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
 arch/powerpc/include/asm/iommu.h     |    9 ++
 arch/powerpc/kernel/iommu.c          |  159 ++++++++++++++++++++++++++++++++++
 arch/powerpc/platforms/powernv/pci.c |  135 +++++++++++++++++++++++++++++
 drivers/iommu/Kconfig                |    8 ++
 4 files changed, 311 insertions(+)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index cbfe678..5c7087a 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -76,6 +76,9 @@ struct iommu_table {
 	struct iommu_pool large_pool;
 	struct iommu_pool pools[IOMMU_NR_POOLS];
 	unsigned long *it_map;       /* A simple allocation bitmap for now */
+#ifdef CONFIG_IOMMU_API
+	struct iommu_group *it_group;
+#endif
 };
 
 struct scatterlist;
@@ -147,5 +150,11 @@ static inline void iommu_restore(void)
 }
 #endif
 
+extern long iommu_clear_tces(struct iommu_table *tbl, unsigned long entry,
+		unsigned long pages);
+extern long iommu_put_tces(struct iommu_table *tbl, unsigned long entry,
+		uint64_t tce, enum dma_data_direction direction,
+		unsigned long pages);
+
 #endif /* __KERNEL__ */
 #endif /* _ASM_IOMMU_H */
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index ff5a6ce..1225fbb 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -44,6 +44,7 @@
 #include <asm/kdump.h>
 #include <asm/fadump.h>
 #include <asm/vio.h>
+#include <asm/tce.h>
 
 #define DBG(...)
 
@@ -856,3 +857,161 @@ void iommu_free_coherent(struct iommu_table *tbl, size_t size,
 		free_pages((unsigned long)vaddr, get_order(size));
 	}
 }
+
+#ifdef CONFIG_IOMMU_API
+/*
+ * SPAPR TCE API
+ */
+static void tce_flush(struct iommu_table *tbl)
+{
+	/* Flush/invalidate TLB caches if necessary */
+	if (ppc_md.tce_flush)
+		ppc_md.tce_flush(tbl);
+
+	/* Make sure updates are seen by hardware */
+	mb();
+}
+
+/*
+ * clear_tces_nolock clears TCEs and returns the number of pages
+ * on which it called put_page().
+ */
+static long clear_tces_nolock(struct iommu_table *tbl, unsigned long entry,
+		unsigned long pages)
+{
+	int i, retpages = 0;
+	unsigned long oldtce;
+	struct page *page;
+
+	for (i = 0; i < pages; ++i) {
+		oldtce = ppc_md.tce_get(tbl, entry + i);
+		ppc_md.tce_free(tbl, entry + i, 1);
+
+		if (!(oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)))
+			continue;
+
+		page = pfn_to_page(oldtce >> PAGE_SHIFT);
+
+		WARN_ON(!page);
+		if (!page)
+			continue;
+
+		if (oldtce & TCE_PCI_WRITE)
+			SetPageDirty(page);
+
+		if (!(oldtce & ~PAGE_MASK))
+			++retpages;
+
+		put_page(page);
+	}
+
+	return retpages;
+}
+
+/*
+ * iommu_clear_tces clears TCEs and returns the number of released pages
+ */
+long iommu_clear_tces(struct iommu_table *tbl, unsigned long entry,
+		unsigned long pages)
+{
+	int ret;
+	struct iommu_pool *pool = get_pool(tbl, entry);
+
+	spin_lock(&(pool->lock));
+	ret = clear_tces_nolock(tbl, entry, pages);
+	tce_flush(tbl);
+	spin_unlock(&(pool->lock));
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_clear_tces);
+
+static int put_tce(struct iommu_table *tbl, unsigned long entry,
+		uint64_t tce, enum dma_data_direction direction)
+{
+	int ret;
+	struct page *page = NULL;
+	unsigned long kva, offset;
+
+	/* Map new TCE */
+	offset = (tce & IOMMU_PAGE_MASK) - (tce & PAGE_MASK);
+
+	ret = get_user_pages_fast(tce & PAGE_MASK, 1,
+			direction != DMA_TO_DEVICE, &page);
+	if (ret < 1) {
+		printk(KERN_ERR "tce_vfio: get_user_pages_fast failed tce=%llx ioba=%lx ret=%d\n",
+				tce, entry << IOMMU_PAGE_SHIFT, ret);
+		if (!ret)
+			ret = -EFAULT;
+		return ret;
+	}
+
+	kva = (unsigned long) page_address(page);
+	kva += offset;
+
+	/* tce_build receives a virtual address */
+	entry += tbl->it_offset; /* Offset into real TCE table */
+	ret = ppc_md.tce_build(tbl, entry, 1, kva, direction, NULL);
+
+	/* tce_build() only returns non-zero for transient errors */
+	if (unlikely(ret)) {
+		printk(KERN_ERR "tce_vfio: tce_put failed on tce=%llx ioba=%lx kva=%lx ret=%d\n",
+				tce, entry << IOMMU_PAGE_SHIFT, kva, ret);
+		put_page(page);
+		return -EIO;
+	}
+
+	return 0;
+}
+
+/*
+ * iommu_put_tces builds TCEs and returns the number of actually
+ * locked system pages
+ */
+long iommu_put_tces(struct iommu_table *tbl, unsigned long entry,
+		uint64_t tce, enum dma_data_direction direction,
+		unsigned long pages)
+{
+	int i, ret = 0, retpages = 0;
+	struct iommu_pool *pool = get_pool(tbl, entry);
+
+	BUILD_BUG_ON(PAGE_SIZE < IOMMU_PAGE_SIZE);
+	BUG_ON(direction == DMA_NONE);
+
+	spin_lock(&(pool->lock));
+
+	/* Check if any entry is already in use */
+	for (i = 0; i < pages; ++i) {
+		unsigned long oldtce = ppc_md.tce_get(tbl, entry + i);
+		if (oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)) {
+			spin_unlock(&(pool->lock));
+			return -EBUSY;
+		}
+	}
+
+	/* Put the TCEs into the table */
+	for (i = 0; (i < pages) && !ret; ++i, tce += IOMMU_PAGE_SIZE) {
+		ret = put_tce(tbl, entry + i, tce, direction);
+		/*
+		 * As the IOMMU page size is always 4K while the system page
+		 * size can be 64K, and there is no special tracking for IOMMU
+		 * pages, we only do the rlimit check/update for the very first
+		 * 4K IOMMU page within each 64K system page.
+		 */
+		if (!(tce & ~PAGE_MASK))
+			++retpages;
+	}
+
+	/* On failure release the locked pages, otherwise return the page count */
+	if (ret)
+		clear_tces_nolock(tbl, entry, i);
+	else
+		ret = retpages;
+
+	tce_flush(tbl);
+	spin_unlock(&(pool->lock));
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_put_tces);
+#endif /* CONFIG_IOMMU_API */
diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index 05205cf..21250ef 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -20,6 +20,7 @@
 #include <linux/irq.h>
 #include <linux/io.h>
 #include <linux/msi.h>
+#include <linux/iommu.h>
 
 #include <asm/sections.h>
 #include <asm/io.h>
@@ -613,3 +614,137 @@ void __init pnv_pci_init(void)
 	ppc_md.teardown_msi_irqs = pnv_teardown_msi_irqs;
 #endif
 }
+
+#ifdef CONFIG_IOMMU_API
+/*
+ * IOMMU groups support required by VFIO
+ */
+static int add_device(struct device *dev)
+{
+	struct iommu_table *tbl;
+	int ret = 0;
+
+	if (WARN_ON(dev->iommu_group)) {
+		printk(KERN_WARNING "tce_vfio: device %s is already in iommu group %d, skipping\n",
+				dev_name(dev),
+				iommu_group_id(dev->iommu_group));
+		return -EBUSY;
+	}
+
+	tbl = get_iommu_table_base(dev);
+	if (!tbl) {
+		pr_debug("tce_vfio: skipping device %s with no tbl\n",
+				dev_name(dev));
+		return 0;
+	}
+
+	pr_debug("tce_vfio: adding %s to iommu group %d\n",
+			dev_name(dev), iommu_group_id(tbl->it_group));
+
+	ret = iommu_group_add_device(tbl->it_group, dev);
+	if (ret < 0)
+		printk(KERN_ERR "tce_vfio: %s has not been added, ret=%d\n",
+				dev_name(dev), ret);
+
+	return ret;
+}
+
+static void del_device(struct device *dev)
+{
+	iommu_group_remove_device(dev);
+}
+
+static int iommu_bus_notifier(struct notifier_block *nb,
+			      unsigned long action, void *data)
+{
+	struct device *dev = data;
+
+	switch (action) {
+	case BUS_NOTIFY_ADD_DEVICE:
+		return add_device(dev);
+	case BUS_NOTIFY_DEL_DEVICE:
+		del_device(dev);
+		return 0;
+	default:
+		return 0;
+	}
+}
+
+static struct notifier_block tce_iommu_bus_nb = {
+	.notifier_call = iommu_bus_notifier,
+};
+
+static void group_release(void *iommu_data)
+{
+	struct iommu_table *tbl = iommu_data;
+	tbl->it_group = NULL;
+}
+
+static int __init tce_iommu_init(void)
+{
+	struct pci_dev *pdev = NULL;
+	struct iommu_table *tbl;
+	struct iommu_group *grp;
+
+	/* Allocate and initialize IOMMU groups */
+	for_each_pci_dev(pdev) {
+		tbl = get_iommu_table_base(&pdev->dev);
+		if (!tbl)
+			continue;
+
+		/* Skip already initialized */
+		if (tbl->it_group)
+			continue;
+
+		grp = iommu_group_alloc();
+		if (IS_ERR(grp)) {
+			printk(KERN_INFO "tce_vfio: cannot create "
+					"new IOMMU group, ret=%ld\n",
+					PTR_ERR(grp));
+			return PTR_ERR(grp);
+		}
+		tbl->it_group = grp;
+		iommu_group_set_iommudata(grp, tbl, group_release);
+	}
+
+	bus_register_notifier(&pci_bus_type, &tce_iommu_bus_nb);
+
+	/* Add PCI devices to VFIO groups */
+	for_each_pci_dev(pdev)
+		add_device(&pdev->dev);
+
+	return 0;
+}
+
+static void __exit tce_iommu_cleanup(void)
+{
+	struct pci_dev *pdev = NULL;
+	struct iommu_table *tbl;
+	struct iommu_group *grp = NULL;
+
+	bus_unregister_notifier(&pci_bus_type, &tce_iommu_bus_nb);
+
+	/* Delete PCI devices from VFIO groups */
+	for_each_pci_dev(pdev)
+		del_device(&pdev->dev);
+
+	/* Release VFIO groups */
+	for_each_pci_dev(pdev) {
+		tbl = get_iommu_table_base(&pdev->dev);
+		if (!tbl)
+			continue;
+		grp = tbl->it_group;
+
+		/* Skip (already) uninitialized */
+		if (!grp)
+			continue;
+
+	/* Do the actual release; group_release() is expected to clear it_group */
+		iommu_group_put(grp);
+		BUG_ON(tbl->it_group);
+	}
+}
+
+module_init(tce_iommu_init);
+module_exit(tce_iommu_cleanup);
+#endif /* CONFIG_IOMMU_API */
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index 9f69b56..29d11dc 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -187,4 +187,12 @@ config EXYNOS_IOMMU_DEBUG
 
 	  Say N unless you need kernel log message for IOMMU debugging
 
+config SPAPR_TCE_IOMMU
+	bool "sPAPR TCE IOMMU Support"
+	depends on PPC_POWERNV
+	select IOMMU_API
+	help
+	  Enables the bits of the IOMMU API required by VFIO.
+	  iommu_ops itself is not implemented yet.
+
 endif # IOMMU_SUPPORT
-- 
1.7.10.4

^ permalink raw reply related	[flat|nested] 122+ messages in thread

* Re: [PATCH] vfio powerpc: enabled on powernv platform
  2012-11-29  3:53                 ` Alexey Kardashevskiy
@ 2012-11-29  4:20                   ` Alex Williamson
  -1 siblings, 0 replies; 122+ messages in thread
From: Alex Williamson @ 2012-11-29  4:20 UTC (permalink / raw)
  To: Alexey Kardashevskiy
  Cc: Benjamin Herrenschmidt, Paul Mackerras, linuxppc-dev,
	linux-kernel, David Gibson

On Thu, 2012-11-29 at 14:53 +1100, Alexey Kardashevskiy wrote:
> This patch initializes IOMMU groups based on the IOMMU
> configuration discovered during the PCI scan on the POWERNV
> (POWER non-virtualized) platform. The IOMMU groups are
> to be used later by the VFIO driver (PCI passthrough).
> 
> It also implements an API for mapping/unmapping pages for
> guest PCI drivers and for providing DMA window properties.
> This API is going to be used later by QEMU-VFIO to handle
> h_put_tce hypercalls from the KVM guest.
> 
> Although this driver has been tested only on the POWERNV
> platform, it should work on any platform which supports
> TCE tables.
> 
> To enable VFIO on POWER, enable SPAPR_TCE_IOMMU config
> option and configure VFIO as required.
> 
> Cc: David Gibson <david@gibson.dropbear.id.au>
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> ---
>  arch/powerpc/include/asm/iommu.h     |    9 ++
>  arch/powerpc/kernel/iommu.c          |  159 ++++++++++++++++++++++++++++++++++
>  arch/powerpc/platforms/powernv/pci.c |  135 +++++++++++++++++++++++++++++
>  drivers/iommu/Kconfig                |    8 ++
>  4 files changed, 311 insertions(+)
> 
> diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
> index cbfe678..5c7087a 100644
> --- a/arch/powerpc/include/asm/iommu.h
> +++ b/arch/powerpc/include/asm/iommu.h
> @@ -76,6 +76,9 @@ struct iommu_table {
>  	struct iommu_pool large_pool;
>  	struct iommu_pool pools[IOMMU_NR_POOLS];
>  	unsigned long *it_map;       /* A simple allocation bitmap for now */
> +#ifdef CONFIG_IOMMU_API
> +	struct iommu_group *it_group;
> +#endif
>  };
>  
>  struct scatterlist;
> @@ -147,5 +150,11 @@ static inline void iommu_restore(void)
>  }
>  #endif
>  
> +extern long iommu_clear_tces(struct iommu_table *tbl, unsigned long entry,
> +		unsigned long pages);
> +extern long iommu_put_tces(struct iommu_table *tbl, unsigned long entry,
> +		uint64_t tce, enum dma_data_direction direction,
> +		unsigned long pages);
> +
>  #endif /* __KERNEL__ */
>  #endif /* _ASM_IOMMU_H */
> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
> index ff5a6ce..1225fbb 100644
> --- a/arch/powerpc/kernel/iommu.c
> +++ b/arch/powerpc/kernel/iommu.c
> @@ -44,6 +44,7 @@
>  #include <asm/kdump.h>
>  #include <asm/fadump.h>
>  #include <asm/vio.h>
> +#include <asm/tce.h>
>  
>  #define DBG(...)
>  
> @@ -856,3 +857,161 @@ void iommu_free_coherent(struct iommu_table *tbl, size_t size,
>  		free_pages((unsigned long)vaddr, get_order(size));
>  	}
>  }
> +
> +#ifdef CONFIG_IOMMU_API
> +/*
> + * SPAPR TCE API
> + */
> +static void tce_flush(struct iommu_table *tbl)
> +{
> +	/* Flush/invalidate TLB caches if necessary */
> +	if (ppc_md.tce_flush)
> +		ppc_md.tce_flush(tbl);
> +
> +	/* Make sure updates are seen by hardware */
> +	mb();
> +}
> +
> +/*
> + * clear_tces_nolock clears TCEs and returns the number of pages
> + * on which it called put_page().
> + */
> +static long clear_tces_nolock(struct iommu_table *tbl, unsigned long entry,
> +		unsigned long pages)
> +{
> +	int i, retpages = 0;
> +	unsigned long oldtce;
> +	struct page *page;
> +
> +	for (i = 0; i < pages; ++i) {
> +		oldtce = ppc_md.tce_get(tbl, entry + i);
> +		ppc_md.tce_free(tbl, entry + i, 1);
> +
> +		if (!(oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)))
> +			continue;
> +
> +		page = pfn_to_page(oldtce >> PAGE_SHIFT);
> +
> +		WARN_ON(!page);
> +		if (!page)
> +			continue;
> +
> +		if (oldtce & TCE_PCI_WRITE)
> +			SetPageDirty(page);
> +
> +		if (!(oldtce & ~PAGE_MASK))
> +			++retpages;

I'm confused, it looks like you're trying to only increment the counter
for tce pages aligned at the start of a page, but don't we need to mask
out the read/write and valid bits?  Trickiness like this demands a
comment.
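
Something along these lines ahead of the pfn lookup and the alignment
test would make the intent explicit (an untested sketch):

		/* Strip the permission bits, keeping only the address */
		unsigned long addr = oldtce & ~(TCE_PCI_READ | TCE_PCI_WRITE);

		page = pfn_to_page(addr >> PAGE_SHIFT);
		if (oldtce & TCE_PCI_WRITE)
			SetPageDirty(page);
		/* The first-4K-of-a-system-page test now works */
		if (!(addr & ~PAGE_MASK))
			++retpages;
		put_page(page);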

> +
> +		put_page(page);
> +	}
> +
> +	return retpages;
> +}
> +
> +/*
> + * iommu_clear_tces clears TCEs and returns the number of released pages
> + */
> +long iommu_clear_tces(struct iommu_table *tbl, unsigned long entry,
> +		unsigned long pages)
> +{
> +	int ret;
> +	struct iommu_pool *pool = get_pool(tbl, entry);
> +
> +	spin_lock(&(pool->lock));
> +	ret = clear_tces_nolock(tbl, entry, pages);
> +	tce_flush(tbl);
> +	spin_unlock(&(pool->lock));
> +
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(iommu_clear_tces);
> +
> +static int put_tce(struct iommu_table *tbl, unsigned long entry,
> +		uint64_t tce, enum dma_data_direction direction)
> +{
> +	int ret;
> +	struct page *page = NULL;
> +	unsigned long kva, offset;
> +
> +	/* Map new TCE */
> +	offset = (tce & IOMMU_PAGE_MASK) - (tce & PAGE_MASK);
> +
> +	ret = get_user_pages_fast(tce & PAGE_MASK, 1,
> +			direction != DMA_TO_DEVICE, &page);
> +	if (ret < 1) {
> +		printk(KERN_ERR "tce_vfio: get_user_pages_fast failed tce=%llx ioba=%lx ret=%d\n",
> +				tce, entry << IOMMU_PAGE_SHIFT, ret);
> +		if (!ret)
> +			ret = -EFAULT;
> +		return ret;
> +	}
> +
> +	kva = (unsigned long) page_address(page);
> +	kva += offset;
> +
> +	/* tce_build receives a virtual address */
> +	entry += tbl->it_offset; /* Offset into real TCE table */
> +	ret = ppc_md.tce_build(tbl, entry, 1, kva, direction, NULL);
> +
> +	/* tce_build() only returns non-zero for transient errors */
> +	if (unlikely(ret)) {
> +		printk(KERN_ERR "tce_vfio: tce_put failed on tce=%llx ioba=%lx kva=%lx ret=%d\n",
> +				tce, entry << IOMMU_PAGE_SHIFT, kva, ret);
> +		put_page(page);
> +		return -EIO;
> +	}
> +
> +	return 0;
> +}
> +
> +/*
> + * iommu_put_tces builds TCEs and returns the number of actually
> + * locked system pages
> + */
> +long iommu_put_tces(struct iommu_table *tbl, unsigned long entry,
> +		uint64_t tce, enum dma_data_direction direction,
> +		unsigned long pages)
> +{
> +	int i, ret = 0, retpages = 0;
> +	struct iommu_pool *pool = get_pool(tbl, entry);
> +
> +	BUILD_BUG_ON(PAGE_SIZE < IOMMU_PAGE_SIZE);
> +	BUG_ON(direction == DMA_NONE);
> +
> +	spin_lock(&(pool->lock));
> +
> +	/* Check if any entry is already in use */
> +	for (i = 0; i < pages; ++i) {
> +		unsigned long oldtce = ppc_md.tce_get(tbl, entry + i);
> +		if (oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)) {
> +			spin_unlock(&(pool->lock));
> +			return -EBUSY;
> +		}
> +	}
> +
> +	/* Put the TCEs into the table */
> +	for (i = 0; (i < pages) && !ret; ++i, tce += IOMMU_PAGE_SIZE) {
> +		ret = put_tce(tbl, entry + i, tce, direction);
> +		/*
> +		 * As the IOMMU page size is always 4K while the system page
> +		 * size can be 64K, and there is no special tracking for IOMMU
> +		 * pages, we only do the rlimit check/update for the very first
> +		 * 4K IOMMU page within each 64K system page.
> +		 */
> +		if (!(tce & ~PAGE_MASK))
> +			++retpages;

Ah, here's the comment I was looking for, though I'm still not sure
about the read/write bits.

Isn't there an exploit here that a user can lock pages beyond their
limits if they just skip mapping the first 4k of each page?  Thanks,
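
For example, with 64K system pages, each TCE that skips the first 4K
chunk of its page locks a page the counter never sees (hypothetical
pattern, with map_tce() standing in for the eventual map ioctl):

	/* tce & ~PAGE_MASK is never 0, so retpages stays 0 */
	for (i = 0; i < npages; i++)
		map_tce(tbl, entry + i,
			uaddr + i * PAGE_SIZE + IOMMU_PAGE_SIZE);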

Alex

> +	}
> +
> +	/* On failure release the locked pages, otherwise return the page count */
> +	if (ret)
> +		clear_tces_nolock(tbl, entry, i);
> +	else
> +		ret = retpages;
> +
> +	tce_flush(tbl);
> +	spin_unlock(&(pool->lock));
> +
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(iommu_put_tces);
> +#endif /* CONFIG_IOMMU_API */
> diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
> index 05205cf..21250ef 100644
> --- a/arch/powerpc/platforms/powernv/pci.c
> +++ b/arch/powerpc/platforms/powernv/pci.c
> @@ -20,6 +20,7 @@
>  #include <linux/irq.h>
>  #include <linux/io.h>
>  #include <linux/msi.h>
> +#include <linux/iommu.h>
>  
>  #include <asm/sections.h>
>  #include <asm/io.h>
> @@ -613,3 +614,137 @@ void __init pnv_pci_init(void)
>  	ppc_md.teardown_msi_irqs = pnv_teardown_msi_irqs;
>  #endif
>  }
> +
> +#ifdef CONFIG_IOMMU_API
> +/*
> + * IOMMU groups support required by VFIO
> + */
> +static int add_device(struct device *dev)
> +{
> +	struct iommu_table *tbl;
> +	int ret = 0;
> +
> +	if (WARN_ON(dev->iommu_group)) {
> +		printk(KERN_WARNING "tce_vfio: device %s is already in iommu group %d, skipping\n",
> +				dev_name(dev),
> +				iommu_group_id(dev->iommu_group));
> +		return -EBUSY;
> +	}
> +
> +	tbl = get_iommu_table_base(dev);
> +	if (!tbl) {
> +		pr_debug("tce_vfio: skipping device %s with no tbl\n",
> +				dev_name(dev));
> +		return 0;
> +	}
> +
> +	pr_debug("tce_vfio: adding %s to iommu group %d\n",
> +			dev_name(dev), iommu_group_id(tbl->it_group));
> +
> +	ret = iommu_group_add_device(tbl->it_group, dev);
> +	if (ret < 0)
> +		printk(KERN_ERR "tce_vfio: %s has not been added, ret=%d\n",
> +				dev_name(dev), ret);
> +
> +	return ret;
> +}
> +
> +static void del_device(struct device *dev)
> +{
> +	iommu_group_remove_device(dev);
> +}
> +
> +static int iommu_bus_notifier(struct notifier_block *nb,
> +			      unsigned long action, void *data)
> +{
> +	struct device *dev = data;
> +
> +	switch (action) {
> +	case BUS_NOTIFY_ADD_DEVICE:
> +		return add_device(dev);
> +	case BUS_NOTIFY_DEL_DEVICE:
> +		del_device(dev);
> +		return 0;
> +	default:
> +		return 0;
> +	}
> +}
> +
> +static struct notifier_block tce_iommu_bus_nb = {
> +	.notifier_call = iommu_bus_notifier,
> +};
> +
> +static void group_release(void *iommu_data)
> +{
> +	struct iommu_table *tbl = iommu_data;
> +	tbl->it_group = NULL;
> +}
> +
> +static int __init tce_iommu_init(void)
> +{
> +	struct pci_dev *pdev = NULL;
> +	struct iommu_table *tbl;
> +	struct iommu_group *grp;
> +
> +	/* Allocate and initialize IOMMU groups */
> +	for_each_pci_dev(pdev) {
> +		tbl = get_iommu_table_base(&pdev->dev);
> +		if (!tbl)
> +			continue;
> +
> +		/* Skip already initialized */
> +		if (tbl->it_group)
> +			continue;
> +
> +		grp = iommu_group_alloc();
> +		if (IS_ERR(grp)) {
> +			printk(KERN_INFO "tce_vfio: cannot create "
> +					"new IOMMU group, ret=%ld\n",
> +					PTR_ERR(grp));
> +			return PTR_ERR(grp);
> +		}
> +		tbl->it_group = grp;
> +		iommu_group_set_iommudata(grp, tbl, group_release);
> +	}
> +
> +	bus_register_notifier(&pci_bus_type, &tce_iommu_bus_nb);
> +
> +	/* Add PCI devices to VFIO groups */
> +	for_each_pci_dev(pdev)
> +		add_device(&pdev->dev);
> +
> +	return 0;
> +}
> +
> +static void __exit tce_iommu_cleanup(void)
> +{
> +	struct pci_dev *pdev = NULL;
> +	struct iommu_table *tbl;
> +	struct iommu_group *grp = NULL;
> +
> +	bus_unregister_notifier(&pci_bus_type, &tce_iommu_bus_nb);
> +
> +	/* Delete PCI devices from VFIO groups */
> +	for_each_pci_dev(pdev)
> +		del_device(&pdev->dev);
> +
> +	/* Release VFIO groups */
> +	for_each_pci_dev(pdev) {
> +		tbl = get_iommu_table_base(&pdev->dev);
> +		if (!tbl)
> +			continue;
> +		grp = tbl->it_group;
> +
> +		/* Skip (already) uninitialized */
> +		if (!grp)
> +			continue;
> +
> +	/* Do the actual release; group_release() is expected to clear it_group */
> +		iommu_group_put(grp);
> +		BUG_ON(tbl->it_group);
> +	}
> +}
> +
> +module_init(tce_iommu_init);
> +module_exit(tce_iommu_cleanup);
> +#endif /* CONFIG_IOMMU_API */
> diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
> index 9f69b56..29d11dc 100644
> --- a/drivers/iommu/Kconfig
> +++ b/drivers/iommu/Kconfig
> @@ -187,4 +187,12 @@ config EXYNOS_IOMMU_DEBUG
>  
>  	  Say N unless you need kernel log message for IOMMU debugging
>  
> +config SPAPR_TCE_IOMMU
> +	bool "sPAPR TCE IOMMU Support"
> +	depends on PPC_POWERNV
> +	select IOMMU_API
> +	help
> +	  Enables the bits of the IOMMU API required by VFIO.
> +	  iommu_ops itself is not implemented yet.
> +
>  endif # IOMMU_SUPPORT




^ permalink raw reply	[flat|nested] 122+ messages in thread

* [PATCH] vfio powerpc: enabled on powernv platform
  2012-11-29  4:20                   ` Alex Williamson
@ 2012-11-30  6:14                     ` Alexey Kardashevskiy
  -1 siblings, 0 replies; 122+ messages in thread
From: Alexey Kardashevskiy @ 2012-11-30  6:14 UTC (permalink / raw)
  To: Alex Williamson
  Cc: Alexey Kardashevskiy, Benjamin Herrenschmidt, Paul Mackerras,
	linuxppc-dev, linux-kernel, David Gibson

This patch initializes IOMMU groups based on the IOMMU
configuration discovered during the PCI scan on the POWERNV
(POWER non-virtualized) platform. The IOMMU groups are
to be used later by the VFIO driver (PCI passthrough).

It also implements an API for mapping/unmapping pages for
guest PCI drivers and for providing DMA window properties.
This API is going to be used later by QEMU-VFIO to handle
h_put_tce hypercalls from the KVM guest.

Although this driver has been tested only on the POWERNV
platform, it should work on any platform which supports
TCE tables.

To enable VFIO on POWER, enable SPAPR_TCE_IOMMU config
option and configure VFIO as required.
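
A minimal config fragment for such a setup might look like this (the
VFIO options come from the separately posted VFIO patches and are an
assumption here):

	CONFIG_PPC_POWERNV=y
	CONFIG_IOMMU_SUPPORT=y
	CONFIG_SPAPR_TCE_IOMMU=y
	CONFIG_VFIO=y
	CONFIG_VFIO_PCI=m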

Cc: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
 arch/powerpc/include/asm/iommu.h     |    9 ++
 arch/powerpc/kernel/iommu.c          |  186 ++++++++++++++++++++++++++++++++++
 arch/powerpc/platforms/powernv/pci.c |  135 ++++++++++++++++++++++++
 drivers/iommu/Kconfig                |    8 ++
 4 files changed, 338 insertions(+)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index cbfe678..5c7087a 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -76,6 +76,9 @@ struct iommu_table {
 	struct iommu_pool large_pool;
 	struct iommu_pool pools[IOMMU_NR_POOLS];
 	unsigned long *it_map;       /* A simple allocation bitmap for now */
+#ifdef CONFIG_IOMMU_API
+	struct iommu_group *it_group;
+#endif
 };
 
 struct scatterlist;
@@ -147,5 +150,11 @@ static inline void iommu_restore(void)
 }
 #endif
 
+extern long iommu_clear_tces(struct iommu_table *tbl, unsigned long entry,
+		unsigned long pages);
+extern long iommu_put_tces(struct iommu_table *tbl, unsigned long entry,
+		uint64_t tce, enum dma_data_direction direction,
+		unsigned long pages);
+
 #endif /* __KERNEL__ */
 #endif /* _ASM_IOMMU_H */
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index ff5a6ce..0646c50 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -44,6 +44,7 @@
 #include <asm/kdump.h>
 #include <asm/fadump.h>
 #include <asm/vio.h>
+#include <asm/tce.h>
 
 #define DBG(...)
 
@@ -856,3 +857,188 @@ void iommu_free_coherent(struct iommu_table *tbl, size_t size,
 		free_pages((unsigned long)vaddr, get_order(size));
 	}
 }
+
+#ifdef CONFIG_IOMMU_API
+/*
+ * SPAPR TCE API
+ */
+
+/*
+ * Returns the number of used IOMMU pages (4K) within
+ * the same system page (4K or 64K).
+ * bitmap_weight is not used as it does not support big-endian maps.
+ */
+static int syspage_weight(unsigned long *map, unsigned long entry)
+{
+	int ret = 0, nbits = PAGE_SIZE/IOMMU_PAGE_SIZE;
+
+	/* Aligns TCE entry number to system page boundary */
+	entry &= PAGE_MASK >> IOMMU_PAGE_SHIFT;
+
+	/* Count used 4K pages */
+	while (nbits--)
+		ret += (test_bit(entry++, map) == 0) ? 0 : 1;
+
+	return ret;
+}
+
+static void tce_flush(struct iommu_table *tbl)
+{
+	/* Flush/invalidate TLB caches if necessary */
+	if (ppc_md.tce_flush)
+		ppc_md.tce_flush(tbl);
+
+	/* Make sure updates are seen by hardware */
+	mb();
+}
+
+/*
+ * clear_tces_nolock clears TCEs and returns the number of system pages
+ * on which it called put_page()
+ */
+static long clear_tces_nolock(struct iommu_table *tbl, unsigned long entry,
+		unsigned long pages)
+{
+	int i, retpages = 0;
+	unsigned long oldtce, oldweight;
+	struct page *page;
+
+	for (i = 0; i < pages; ++i) {
+		oldtce = ppc_md.tce_get(tbl, entry + i);
+		ppc_md.tce_free(tbl, entry + i, 1);
+
+		oldweight = syspage_weight(tbl->it_map, entry);
+		__clear_bit(entry, tbl->it_map);
+
+		if (!(oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)))
+			continue;
+
+		page = pfn_to_page(oldtce >> PAGE_SHIFT);
+
+		WARN_ON(!page);
+		if (!page)
+			continue;
+
+		if (oldtce & TCE_PCI_WRITE)
+			SetPageDirty(page);
+
+		put_page(page);
+
+		/* That was the last IOMMU page within the system page */
+		if ((oldweight == 1) && !syspage_weight(tbl->it_map, entry))
+			++retpages;
+	}
+
+	return retpages;
+}
+
+/*
+ * iommu_clear_tces clears TCEs and returns the number
+ * of released system pages
+ */
+long iommu_clear_tces(struct iommu_table *tbl, unsigned long entry,
+		unsigned long pages)
+{
+	int ret;
+	struct iommu_pool *pool = get_pool(tbl, entry);
+
+	spin_lock(&(pool->lock));
+	ret = clear_tces_nolock(tbl, entry, pages);
+	tce_flush(tbl);
+	spin_unlock(&(pool->lock));
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_clear_tces);
+
+static int put_tce(struct iommu_table *tbl, unsigned long entry,
+		uint64_t tce, enum dma_data_direction direction)
+{
+	int ret;
+	struct page *page = NULL;
+	unsigned long kva, offset, oldweight;
+
+	/* Map new TCE */
+	offset = (tce & IOMMU_PAGE_MASK) - (tce & PAGE_MASK);
+	ret = get_user_pages_fast(tce & PAGE_MASK, 1,
+			direction != DMA_TO_DEVICE, &page);
+	if (ret < 1) {
+		printk(KERN_ERR "tce_vfio: get_user_pages_fast failed tce=%llx ioba=%lx ret=%d\n",
+				tce, entry << IOMMU_PAGE_SHIFT, ret);
+		if (!ret || (ret > 1))
+			ret = -EFAULT;
+		return ret;
+	}
+
+	kva = (unsigned long) page_address(page);
+	kva += offset;
+
+	/* tce_build receives a virtual address */
+	entry += tbl->it_offset; /* Offset into real TCE table */
+	ret = ppc_md.tce_build(tbl, entry, 1, kva, direction, NULL);
+
+	/* tce_build() only returns non-zero for transient errors */
+	if (unlikely(ret)) {
+		printk(KERN_ERR "tce_vfio: tce_put failed on tce=%llx ioba=%lx kva=%lx ret=%d\n",
+				tce, entry << IOMMU_PAGE_SHIFT, kva, ret);
+		put_page(page);
+		return -EIO;
+	}
+
+	/* Check whether this locked a previously unlocked system page */
+	oldweight = syspage_weight(tbl->it_map, entry);
+	__set_bit(entry, tbl->it_map);
+
+	return (oldweight == 0) ? 1 : 0;
+}
+
+/*
+ * iommu_put_tces builds TCEs and returns the number of actually
+ * locked system pages
+ */
+long iommu_put_tces(struct iommu_table *tbl, unsigned long entry,
+		uint64_t tce, enum dma_data_direction direction,
+		unsigned long pages)
+{
+	int i, ret = 0, retpages = 0;
+	struct iommu_pool *pool = get_pool(tbl, entry);
+
+	BUILD_BUG_ON(PAGE_SIZE < IOMMU_PAGE_SIZE);
+	BUG_ON(direction == DMA_NONE);
+
+	spin_lock(&(pool->lock));
+
+	/* Check if any entry is already in use */
+	for (i = 0; i < pages; ++i) {
+		unsigned long oldtce = ppc_md.tce_get(tbl, entry + i);
+		if ((oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)) ||
+				test_bit(entry + i, tbl->it_map)) {
+			WARN_ON(test_bit(entry + i, tbl->it_map));
+			spin_unlock(&(pool->lock));
+			return -EBUSY;
+		}
+	}
+
+	/* Put the TCEs into the table */
+	for (i = 0; (i < pages) && (ret >= 0); ++i, tce += IOMMU_PAGE_SIZE) {
+		ret = put_tce(tbl, entry + i, tce, direction);
+		if (ret == 1)
+			++retpages;
+	}
+
+	/*
+	 * If we failed, release the locked pages, otherwise return the
+	 * number of locked system pages
+	 */
+	if (ret < 0)
+		clear_tces_nolock(tbl, entry, i);
+	else
+		ret = retpages;
+
+	tce_flush(tbl);
+	spin_unlock(&(pool->lock));
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_put_tces);
+#endif /* CONFIG_IOMMU_API */
diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index 05205cf..21250ef 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -20,6 +20,7 @@
 #include <linux/irq.h>
 #include <linux/io.h>
 #include <linux/msi.h>
+#include <linux/iommu.h>
 
 #include <asm/sections.h>
 #include <asm/io.h>
@@ -613,3 +614,137 @@ void __init pnv_pci_init(void)
 	ppc_md.teardown_msi_irqs = pnv_teardown_msi_irqs;
 #endif
 }
+
+#ifdef CONFIG_IOMMU_API
+/*
+ * IOMMU groups support required by VFIO
+ */
+static int add_device(struct device *dev)
+{
+	struct iommu_table *tbl;
+	int ret = 0;
+
+	if (WARN_ON(dev->iommu_group)) {
+		printk(KERN_WARNING "tce_vfio: device %s is already in iommu group %d, skipping\n",
+				dev_name(dev),
+				iommu_group_id(dev->iommu_group));
+		return -EBUSY;
+	}
+
+	tbl = get_iommu_table_base(dev);
+	if (!tbl) {
+		pr_debug("tce_vfio: skipping device %s with no tbl\n",
+				dev_name(dev));
+		return 0;
+	}
+
+	pr_debug("tce_vfio: adding %s to iommu group %d\n",
+			dev_name(dev), iommu_group_id(tbl->it_group));
+
+	ret = iommu_group_add_device(tbl->it_group, dev);
+	if (ret < 0)
+		printk(KERN_ERR "tce_vfio: %s has not been added, ret=%d\n",
+				dev_name(dev), ret);
+
+	return ret;
+}
+
+static void del_device(struct device *dev)
+{
+	iommu_group_remove_device(dev);
+}
+
+static int iommu_bus_notifier(struct notifier_block *nb,
+			      unsigned long action, void *data)
+{
+	struct device *dev = data;
+
+	switch (action) {
+	case BUS_NOTIFY_ADD_DEVICE:
+		return add_device(dev);
+	case BUS_NOTIFY_DEL_DEVICE:
+		del_device(dev);
+		return 0;
+	default:
+		return 0;
+	}
+}
+
+static struct notifier_block tce_iommu_bus_nb = {
+	.notifier_call = iommu_bus_notifier,
+};
+
+static void group_release(void *iommu_data)
+{
+	struct iommu_table *tbl = iommu_data;
+	tbl->it_group = NULL;
+}
+
+static int __init tce_iommu_init(void)
+{
+	struct pci_dev *pdev = NULL;
+	struct iommu_table *tbl;
+	struct iommu_group *grp;
+
+	/* Allocate and initialize IOMMU groups */
+	for_each_pci_dev(pdev) {
+		tbl = get_iommu_table_base(&pdev->dev);
+		if (!tbl)
+			continue;
+
+		/* Skip already initialized */
+		if (tbl->it_group)
+			continue;
+
+		grp = iommu_group_alloc();
+		if (IS_ERR(grp)) {
+			printk(KERN_INFO "tce_vfio: cannot create "
+					"new IOMMU group, ret=%ld\n",
+					PTR_ERR(grp));
+			return PTR_ERR(grp);
+		}
+		tbl->it_group = grp;
+		iommu_group_set_iommudata(grp, tbl, group_release);
+	}
+
+	bus_register_notifier(&pci_bus_type, &tce_iommu_bus_nb);
+
+	/* Add PCI devices to VFIO groups */
+	for_each_pci_dev(pdev)
+		add_device(&pdev->dev);
+
+	return 0;
+}
+
+static void __exit tce_iommu_cleanup(void)
+{
+	struct pci_dev *pdev = NULL;
+	struct iommu_table *tbl;
+	struct iommu_group *grp = NULL;
+
+	bus_unregister_notifier(&pci_bus_type, &tce_iommu_bus_nb);
+
+	/* Delete PCI devices from VFIO groups */
+	for_each_pci_dev(pdev)
+		del_device(&pdev->dev);
+
+	/* Release VFIO groups */
+	for_each_pci_dev(pdev) {
+		tbl = get_iommu_table_base(&pdev->dev);
+		if (!tbl)
+			continue;
+		grp = tbl->it_group;
+
+		/* Skip (already) uninitialized */
+		if (!grp)
+			continue;
+
+	/* Do the actual release; group_release() is expected to clear it_group */
+		iommu_group_put(grp);
+		BUG_ON(tbl->it_group);
+	}
+}
+
+module_init(tce_iommu_init);
+module_exit(tce_iommu_cleanup);
+#endif /* CONFIG_IOMMU_API */
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index 9f69b56..29d11dc 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -187,4 +187,12 @@ config EXYNOS_IOMMU_DEBUG
 
 	  Say N unless you need kernel log message for IOMMU debugging
 
+config SPAPR_TCE_IOMMU
+	bool "sPAPR TCE IOMMU Support"
+	depends on PPC_POWERNV
+	select IOMMU_API
+	help
+	  Enables the bits of the IOMMU API required by VFIO.
+	  iommu_ops itself is not implemented yet.
+
 endif # IOMMU_SUPPORT
-- 
1.7.10.4


^ permalink raw reply related	[flat|nested] 122+ messages in thread

* [PATCH] vfio powerpc: enabled on powernv platform
@ 2012-11-30  6:14                     ` Alexey Kardashevskiy
  0 siblings, 0 replies; 122+ messages in thread
From: Alexey Kardashevskiy @ 2012-11-30  6:14 UTC (permalink / raw)
  To: Alex Williamson
  Cc: Alexey Kardashevskiy, linux-kernel, Paul Mackerras, linuxppc-dev,
	David Gibson

This patch initializes IOMMU groups based on the IOMMU
configuration discovered during the PCI scan on POWERNV
(POWER non virtualized) platform. The IOMMU groups are
to be used later by VFIO driver (PCI pass through).

It also implements an API for mapping/unmapping pages for
guest PCI drivers and providing DMA window properties.
This API is going to be used later by QEMU-VFIO to handle
h_put_tce hypercalls from the KVM guest.

Although this driver has been tested only on the POWERNV
platform, it should work on any platform which supports
TCE tables.

To enable VFIO on POWER, enable SPAPR_TCE_IOMMU config
option and configure VFIO as required.

Cc: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
 arch/powerpc/include/asm/iommu.h     |    9 ++
 arch/powerpc/kernel/iommu.c          |  186 ++++++++++++++++++++++++++++++++++
 arch/powerpc/platforms/powernv/pci.c |  135 ++++++++++++++++++++++++
 drivers/iommu/Kconfig                |    8 ++
 4 files changed, 338 insertions(+)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index cbfe678..5c7087a 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -76,6 +76,9 @@ struct iommu_table {
 	struct iommu_pool large_pool;
 	struct iommu_pool pools[IOMMU_NR_POOLS];
 	unsigned long *it_map;       /* A simple allocation bitmap for now */
+#ifdef CONFIG_IOMMU_API
+	struct iommu_group *it_group;
+#endif
 };
 
 struct scatterlist;
@@ -147,5 +150,11 @@ static inline void iommu_restore(void)
 }
 #endif
 
+extern long iommu_clear_tces(struct iommu_table *tbl, unsigned long entry,
+		unsigned long pages);
+extern long iommu_put_tces(struct iommu_table *tbl, unsigned long entry,
+		uint64_t tce, enum dma_data_direction direction,
+		unsigned long pages);
+
 #endif /* __KERNEL__ */
 #endif /* _ASM_IOMMU_H */
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index ff5a6ce..0646c50 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -44,6 +44,7 @@
 #include <asm/kdump.h>
 #include <asm/fadump.h>
 #include <asm/vio.h>
+#include <asm/tce.h>
 
 #define DBG(...)
 
@@ -856,3 +857,188 @@ void iommu_free_coherent(struct iommu_table *tbl, size_t size,
 		free_pages((unsigned long)vaddr, get_order(size));
 	}
 }
+
+#ifdef CONFIG_IOMMU_API
+/*
+ * SPAPR TCE API
+ */
+
+/*
+ * Returns the number of used IOMMU pages (4K) within
+ * the same system page (4K or 64K).
+ * bitmap_weight is not used as it does not support bigendian maps.
+ */
+static int syspage_weight(unsigned long *map, unsigned long entry)
+{
+	int ret = 0, nbits = PAGE_SIZE/IOMMU_PAGE_SIZE;
+
+	/* Aligns TCE entry number to system page boundary */
+	entry &= PAGE_MASK >> IOMMU_PAGE_SHIFT;
+
+	/* Count used 4K pages */
+	while (nbits--)
+		ret += (test_bit(entry++, map) == 0) ? 0 : 1;
+
+	return ret;
+}
+
+static void tce_flush(struct iommu_table *tbl)
+{
+	/* Flush/invalidate TLB caches if necessary */
+	if (ppc_md.tce_flush)
+		ppc_md.tce_flush(tbl);
+
+	/* Make sure updates are seen by hardware */
+	mb();
+}
+
+/*
+ * iommu_clear_tces clears tces and returned the number of system pages
+ * which it called put_page() on
+ */
+static long clear_tces_nolock(struct iommu_table *tbl, unsigned long entry,
+		unsigned long pages)
+{
+	int i, retpages = 0;
+	unsigned long oldtce, oldweight;
+	struct page *page;
+
+	for (i = 0; i < pages; ++i) {
+		oldtce = ppc_md.tce_get(tbl, entry + i);
+		ppc_md.tce_free(tbl, entry + i, 1);
+
+		oldweight = syspage_weight(tbl->it_map, entry);
+		__clear_bit(entry, tbl->it_map);
+
+		if (!(oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)))
+			continue;
+
+		page = pfn_to_page(oldtce >> PAGE_SHIFT);
+
+		WARN_ON(!page);
+		if (!page)
+			continue;
+
+		if (oldtce & TCE_PCI_WRITE)
+			SetPageDirty(page);
+
+		put_page(page);
+
+		/* That was the last IOMMU page within the system page */
+		if ((oldweight == 1) && !syspage_weight(tbl->it_map, entry))
+			++retpages;
+	}
+
+	return retpages;
+}
+
+/*
+ * iommu_clear_tces clears tces and returned the number
+ / of released system pages
+ */
+long iommu_clear_tces(struct iommu_table *tbl, unsigned long entry,
+		unsigned long pages)
+{
+	int ret;
+	struct iommu_pool *pool = get_pool(tbl, entry);
+
+	spin_lock(&(pool->lock));
+	ret = clear_tces_nolock(tbl, entry, pages);
+	tce_flush(tbl);
+	spin_unlock(&(pool->lock));
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_clear_tces);
+
+static int put_tce(struct iommu_table *tbl, unsigned long entry,
+		uint64_t tce, enum dma_data_direction direction)
+{
+	int ret;
+	struct page *page = NULL;
+	unsigned long kva, offset, oldweight;
+
+	/* Map new TCE */
+	offset = (tce & IOMMU_PAGE_MASK) - (tce & PAGE_MASK);
+	ret = get_user_pages_fast(tce & PAGE_MASK, 1,
+			direction != DMA_TO_DEVICE, &page);
+	if (ret < 1) {
+		printk(KERN_ERR "tce_vfio: get_user_pages_fast failed tce=%llx ioba=%lx ret=%d\n",
+				tce, entry << IOMMU_PAGE_SHIFT, ret);
+		if (!ret || (ret > 1))
+			ret = -EFAULT;
+		return ret;
+	}
+
+	kva = (unsigned long) page_address(page);
+	kva += offset;
+
+	/* tce_build receives a virtual address */
+	entry += tbl->it_offset; /* Offset into real TCE table */
+	ret = ppc_md.tce_build(tbl, entry, 1, kva, direction, NULL);
+
+	/* tce_build() only returns non-zero for transient errors */
+	if (unlikely(ret)) {
+		printk(KERN_ERR "tce_vfio: tce_put failed on tce=%llx ioba=%lx kva=%lx ret=%d\n",
+				tce, entry << IOMMU_PAGE_SHIFT, kva, ret);
+		put_page(page);
+		return -EIO;
+	}
+
+	/* Calculate if new system page has been locked */
+	oldweight = syspage_weight(tbl->it_map, entry);
+	__set_bit(entry, tbl->it_map);
+
+	return (oldweight == 0) ? 1 : 0;
+}
+
+/*
+ * iommu_put_tces builds tces and returned the number of actually
+ * locked system pages
+ */
+long iommu_put_tces(struct iommu_table *tbl, unsigned long entry,
+		uint64_t tce, enum dma_data_direction direction,
+		unsigned long pages)
+{
+	int i, ret = 0, retpages = 0;
+	struct iommu_pool *pool = get_pool(tbl, entry);
+
+	BUILD_BUG_ON(PAGE_SIZE < IOMMU_PAGE_SIZE);
+	BUG_ON(direction == DMA_NONE);
+
+	spin_lock(&(pool->lock));
+
+	/* Check if any is in use */
+	for (i = 0; i < pages; ++i) {
+		unsigned long oldtce = ppc_md.tce_get(tbl, entry + i);
+		if ((oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)) ||
+				test_bit(entry + i, tbl->it_map)) {
+			WARN_ON(test_bit(entry + i, tbl->it_map));
+			spin_unlock(&(pool->lock));
+			return -EBUSY;
+		}
+	}
+
+	/* Put tces to the table */
+	for (i = 0; (i < pages) && (ret >= 0); ++i, tce += IOMMU_PAGE_SIZE) {
+		ret = put_tce(tbl, entry + i, tce, direction);
+		if (ret == 1)
+			++retpages;
+	}
+
+	/*
+	 * If failed, release locked pages, otherwise return the number
+	 * of locked system pages
+	 */
+	if (ret < 0)
+		clear_tces_nolock(tbl, entry, i);
+	else
+		ret = retpages;
+
+	tce_flush(tbl);
+	spin_unlock(&(pool->lock));
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_put_tces);
+#endif /* CONFIG_IOMMU_API */
diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index 05205cf..21250ef 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -20,6 +20,7 @@
 #include <linux/irq.h>
 #include <linux/io.h>
 #include <linux/msi.h>
+#include <linux/iommu.h>
 
 #include <asm/sections.h>
 #include <asm/io.h>
@@ -613,3 +614,137 @@ void __init pnv_pci_init(void)
 	ppc_md.teardown_msi_irqs = pnv_teardown_msi_irqs;
 #endif
 }
+
+#ifdef CONFIG_IOMMU_API
+/*
+ * IOMMU groups support required by VFIO
+ */
+static int add_device(struct device *dev)
+{
+	struct iommu_table *tbl;
+	int ret = 0;
+
+	if (WARN_ON(dev->iommu_group)) {
+		printk(KERN_WARNING "tce_vfio: device %s is already in iommu group %d, skipping\n",
+				dev_name(dev),
+				iommu_group_id(dev->iommu_group));
+		return -EBUSY;
+	}
+
+	tbl = get_iommu_table_base(dev);
+	if (!tbl) {
+		pr_debug("tce_vfio: skipping device %s with no tbl\n",
+				dev_name(dev));
+		return 0;
+	}
+
+	pr_debug("tce_vfio: adding %s to iommu group %d\n",
+			dev_name(dev), iommu_group_id(tbl->it_group));
+
+	ret = iommu_group_add_device(tbl->it_group, dev);
+	if (ret < 0)
+		printk(KERN_ERR "tce_vfio: %s has not been added, ret=%d\n",
+				dev_name(dev), ret);
+
+	return ret;
+}
+
+static void del_device(struct device *dev)
+{
+	iommu_group_remove_device(dev);
+}
+
+static int iommu_bus_notifier(struct notifier_block *nb,
+			      unsigned long action, void *data)
+{
+	struct device *dev = data;
+
+	switch (action) {
+	case BUS_NOTIFY_ADD_DEVICE:
+		return add_device(dev);
+	case BUS_NOTIFY_DEL_DEVICE:
+		del_device(dev);
+		return 0;
+	default:
+		return 0;
+	}
+}
+
+static struct notifier_block tce_iommu_bus_nb = {
+	.notifier_call = iommu_bus_notifier,
+};
+
+static void group_release(void *iommu_data)
+{
+	struct iommu_table *tbl = iommu_data;
+	tbl->it_group = NULL;
+}
+
+static int __init tce_iommu_init(void)
+{
+	struct pci_dev *pdev = NULL;
+	struct iommu_table *tbl;
+	struct iommu_group *grp;
+
+	/* Allocate and initialize IOMMU groups */
+	for_each_pci_dev(pdev) {
+		tbl = get_iommu_table_base(&pdev->dev);
+		if (!tbl)
+			continue;
+
+		/* Skip already initialized */
+		if (tbl->it_group)
+			continue;
+
+		grp = iommu_group_alloc();
+		if (IS_ERR(grp)) {
+			printk(KERN_INFO "tce_vfio: cannot create "
+					"new IOMMU group, ret=%ld\n",
+					PTR_ERR(grp));
+			return PTR_ERR(grp);
+		}
+		tbl->it_group = grp;
+		iommu_group_set_iommudata(grp, tbl, group_release);
+	}
+
+	bus_register_notifier(&pci_bus_type, &tce_iommu_bus_nb);
+
+	/* Add PCI devices to VFIO groups */
+	for_each_pci_dev(pdev)
+		add_device(&pdev->dev);
+
+	return 0;
+}
+
+static void __exit tce_iommu_cleanup(void)
+{
+	struct pci_dev *pdev = NULL;
+	struct iommu_table *tbl;
+	struct iommu_group *grp = NULL;
+
+	bus_unregister_notifier(&pci_bus_type, &tce_iommu_bus_nb);
+
+	/* Delete PCI devices from VFIO groups */
+	for_each_pci_dev(pdev)
+		del_device(&pdev->dev);
+
+	/* Release VFIO groups */
+	for_each_pci_dev(pdev) {
+		tbl = get_iommu_table_base(&pdev->dev);
+		if (!tbl)
+			continue;
+		grp = tbl->it_group;
+
+		/* Skip (already) uninitialized */
+		if (!grp)
+			continue;
+
+		/* Do actual release, group_release() is expected to work */
+		iommu_group_put(grp);
+		BUG_ON(tbl->it_group);
+	}
+}
+
+module_init(tce_iommu_init);
+module_exit(tce_iommu_cleanup);
+#endif /* CONFIG_IOMMU_API */
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index 9f69b56..29d11dc 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -187,4 +187,12 @@ config EXYNOS_IOMMU_DEBUG
 
 	  Say N unless you need kernel log message for IOMMU debugging
 
+config SPAPR_TCE_IOMMU
+	bool "sPAPR TCE IOMMU Support"
+	depends on PPC_POWERNV
+	select IOMMU_API
+	help
+	  Enables the bits of the IOMMU API required by VFIO. The iommu_ops
+	  callbacks are not implemented yet.
+
 endif # IOMMU_SUPPORT
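
For reference, a kernel configuration enabling this ends up looking
roughly like the fragment below; CONFIG_VFIO_IOMMU_SPAPR_TCE is the
option added by the companion VFIO driver patch and the exact set of
options is an assumption, not part of this diff:

	CONFIG_IOMMU_SUPPORT=y
	CONFIG_SPAPR_TCE_IOMMU=y
	CONFIG_VFIO=m
	CONFIG_VFIO_IOMMU_SPAPR_TCE=m
	CONFIG_VFIO_PCI=m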
-- 
1.7.10.4

^ permalink raw reply related	[flat|nested] 122+ messages in thread

* Re: [PATCH] vfio powerpc: enabled on powernv platform
  2012-11-29  4:20                   ` Alex Williamson
@ 2012-11-30  6:16                     ` Alexey Kardashevskiy
  -1 siblings, 0 replies; 122+ messages in thread
From: Alexey Kardashevskiy @ 2012-11-30  6:16 UTC (permalink / raw)
  To: Alex Williamson
  Cc: Benjamin Herrenschmidt, Paul Mackerras, linuxppc-dev,
	linux-kernel, David Gibson

On 29/11/12 15:20, Alex Williamson wrote:

>> +	/* Put tces to the table */
>> +	for (i = 0; (i < pages) && !ret; ++i, tce += IOMMU_PAGE_SIZE) {
>> +		ret = put_tce(tbl, entry + i, tce, direction);
>> +		/*
>> +		 * As the IOMMU page size is always 4K while the system page
>> +		 * size can be 64K, and there is no special tracking for
>> +		 * IOMMU pages, we only do the rlimit check/update for the
>> +		 * very first 4K IOMMU page within a 64K system page.
>> +		 */
>> +		if (!(tce & ~PAGE_MASK))
>> +			++retpages;
>
> Ah, here's the comment I was looking for, though I'm still not sure
> about the read/write bits.
>
> Isn't there an exploit here that a user can lock pages beyond their
> limits if they just skip mapping the first 4k of each page?  Thanks,


Heh. True. I have posted another patch with per-system-page tracking of 4K 
IOMMU page usage.
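
(For reference, that follow-up tracking boils down to a helper like the
sketch below -- essentially the syspage_weight() from the newer posting:
count the mapped 4K IOMMU pages within a system page and charge/uncharge
the rlimit only on the 0 -> 1 and 1 -> 0 transitions, so skipping the
first 4K no longer bypasses the accounting. Helper name is hypothetical.)

/* Count how many 4K IOMMU pages within one system page are mapped */
static int syspage_used(unsigned long *map, unsigned long entry)
{
	int used = 0, nbits = PAGE_SIZE / IOMMU_PAGE_SIZE;

	/* Align the TCE entry number down to a system page boundary */
	entry &= PAGE_MASK >> IOMMU_PAGE_SHIFT;

	/* Count used 4K pages */
	while (nbits--)
		used += test_bit(entry++, map) ? 1 : 0;

	return used;
}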



-- 
Alexey

^ permalink raw reply	[flat|nested] 122+ messages in thread

* Re: [PATCH] vfio powerpc: enabled on powernv platform
  2012-11-30  6:14                     ` Alexey Kardashevskiy
@ 2012-11-30 16:48                       ` Alex Williamson
  -1 siblings, 0 replies; 122+ messages in thread
From: Alex Williamson @ 2012-11-30 16:48 UTC (permalink / raw)
  To: Alexey Kardashevskiy
  Cc: Benjamin Herrenschmidt, Paul Mackerras, linuxppc-dev,
	linux-kernel, David Gibson

On Fri, 2012-11-30 at 17:14 +1100, Alexey Kardashevskiy wrote:
> This patch initializes IOMMU groups based on the IOMMU
> configuration discovered during the PCI scan on POWERNV
> (POWER non virtualized) platform. The IOMMU groups are
> to be used later by VFIO driver (PCI pass through).
> 
> It also implements an API for mapping/unmapping pages for
> guest PCI drivers and providing DMA window properties.
> This API is going to be used later by QEMU-VFIO to handle
> h_put_tce hypercalls from the KVM guest.
> 
> Although this driver has been tested only on the POWERNV
> platform, it should work on any platform which supports
> TCE tables.
> 
> To enable VFIO on POWER, enable SPAPR_TCE_IOMMU config
> option and configure VFIO as required.
> 
> Cc: David Gibson <david@gibson.dropbear.id.au>
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> ---
>  arch/powerpc/include/asm/iommu.h     |    9 ++
>  arch/powerpc/kernel/iommu.c          |  186 ++++++++++++++++++++++++++++++++++
>  arch/powerpc/platforms/powernv/pci.c |  135 ++++++++++++++++++++++++
>  drivers/iommu/Kconfig                |    8 ++
>  4 files changed, 338 insertions(+)
> 
> diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
> index cbfe678..5c7087a 100644
> --- a/arch/powerpc/include/asm/iommu.h
> +++ b/arch/powerpc/include/asm/iommu.h
> @@ -76,6 +76,9 @@ struct iommu_table {
>  	struct iommu_pool large_pool;
>  	struct iommu_pool pools[IOMMU_NR_POOLS];
>  	unsigned long *it_map;       /* A simple allocation bitmap for now */
> +#ifdef CONFIG_IOMMU_API
> +	struct iommu_group *it_group;
> +#endif
>  };
>  
>  struct scatterlist;
> @@ -147,5 +150,11 @@ static inline void iommu_restore(void)
>  }
>  #endif
>  
> +extern long iommu_clear_tces(struct iommu_table *tbl, unsigned long entry,
> +		unsigned long pages);
> +extern long iommu_put_tces(struct iommu_table *tbl, unsigned long entry,
> +		uint64_t tce, enum dma_data_direction direction,
> +		unsigned long pages);
> +
>  #endif /* __KERNEL__ */
>  #endif /* _ASM_IOMMU_H */
> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
> index ff5a6ce..0646c50 100644
> --- a/arch/powerpc/kernel/iommu.c
> +++ b/arch/powerpc/kernel/iommu.c
> @@ -44,6 +44,7 @@
>  #include <asm/kdump.h>
>  #include <asm/fadump.h>
>  #include <asm/vio.h>
> +#include <asm/tce.h>
>  
>  #define DBG(...)
>  
> @@ -856,3 +857,188 @@ void iommu_free_coherent(struct iommu_table *tbl, size_t size,
>  		free_pages((unsigned long)vaddr, get_order(size));
>  	}
>  }
> +
> +#ifdef CONFIG_IOMMU_API
> +/*
> + * SPAPR TCE API
> + */
> +
> +/*
> + * Returns the number of used IOMMU pages (4K) within
> + * the same system page (4K or 64K).
> + * bitmap_weight is not used as it does not support big-endian maps.
> + */
> +static int syspage_weight(unsigned long *map, unsigned long entry)
> +{
> +	int ret = 0, nbits = PAGE_SIZE/IOMMU_PAGE_SIZE;
> +
> +	/* Aligns TCE entry number to system page boundary */
> +	entry &= PAGE_MASK >> IOMMU_PAGE_SHIFT;
> +
> +	/* Count used 4K pages */
> +	while (nbits--)
> +		ret += (test_bit(entry++, map) == 0) ? 0 : 1;

Ok, entry is the iova page number.  So presumably it's relative to the
start of dma32_window_start since you're unlikely to have a bitmap that
covers all of memory.  I hadn't realized that previously.  Doesn't that
mean that it's actually impossible to create an ioctl based interface to
the dma64_window since we're not going to know which window is the
target?  I know you're not planning on one, but it seems limiting.  We
at least need some documentation here, but I'm wondering if iova
shouldn't be zero based so we can determine which window it hits.  Also,
now that I look at it, I can't find any range checking on the iova.
Thanks,

Alex
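
(For illustration only, the window selection being alluded to might look
like the sketch below if the iova were absolute rather than relative;
every identifier here is hypothetical, none of this is in the patch:)

/* Hypothetical sketch: pick the TCE table an absolute iova falls into */
static struct iommu_table *pick_window(struct pnv_pe *pe, uint64_t iova)
{
	if (iova >= pe->dma64_window_start)
		return pe->tce64_table;	/* 64-bit DMA window */
	return pe->tce32_table;		/* default 32-bit DMA window */
}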

> +
> +	return ret;
> +}
> +
> +static void tce_flush(struct iommu_table *tbl)
> +{
> +	/* Flush/invalidate TLB caches if necessary */
> +	if (ppc_md.tce_flush)
> +		ppc_md.tce_flush(tbl);
> +
> +	/* Make sure updates are seen by hardware */
> +	mb();
> +}
> +
> +/*
> + * clear_tces_nolock clears TCEs and returns the number of system pages
> + * on which it called put_page()
> + */
> +static long clear_tces_nolock(struct iommu_table *tbl, unsigned long entry,
> +		unsigned long pages)
> +{
> +	int i, retpages = 0;
> +	unsigned long oldtce, oldweight;
> +	struct page *page;
> +
> +	for (i = 0; i < pages; ++i) {
> +		oldtce = ppc_md.tce_get(tbl, entry + i);
> +		ppc_md.tce_free(tbl, entry + i, 1);
> +
> +		oldweight = syspage_weight(tbl->it_map, entry);
> +		__clear_bit(entry, tbl->it_map);
> +
> +		if (!(oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)))
> +			continue;
> +
> +		page = pfn_to_page(oldtce >> PAGE_SHIFT);
> +
> +		WARN_ON(!page);
> +		if (!page)
> +			continue;
> +
> +		if (oldtce & TCE_PCI_WRITE)
> +			SetPageDirty(page);
> +
> +		put_page(page);
> +
> +		/* That was the last IOMMU page within the system page */
> +		if ((oldweight == 1) && !syspage_weight(tbl->it_map, entry))
> +			++retpages;
> +	}
> +
> +	return retpages;
> +}
> +
> +/*
> + * iommu_clear_tces clears TCEs and returns the number
> + * of released system pages
> + */
> +long iommu_clear_tces(struct iommu_table *tbl, unsigned long entry,
> +		unsigned long pages)
> +{
> +	int ret;
> +	struct iommu_pool *pool = get_pool(tbl, entry);
> +
> +	spin_lock(&(pool->lock));
> +	ret = clear_tces_nolock(tbl, entry, pages);
> +	tce_flush(tbl);
> +	spin_unlock(&(pool->lock));
> +
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(iommu_clear_tces);
> +
> +static int put_tce(struct iommu_table *tbl, unsigned long entry,
> +		uint64_t tce, enum dma_data_direction direction)
> +{
> +	int ret;
> +	struct page *page = NULL;
> +	unsigned long kva, offset, oldweight;
> +
> +	/* Map new TCE */
> +	offset = (tce & IOMMU_PAGE_MASK) - (tce & PAGE_MASK);
> +	ret = get_user_pages_fast(tce & PAGE_MASK, 1,
> +			direction != DMA_TO_DEVICE, &page);
> +	if (ret < 1) {
> +		printk(KERN_ERR "tce_vfio: get_user_pages_fast failed tce=%llx ioba=%lx ret=%d\n",
> +				tce, entry << IOMMU_PAGE_SHIFT, ret);
> +		if (!ret || (ret > 1))
> +			ret = -EFAULT;
> +		return ret;
> +	}
> +
> +	kva = (unsigned long) page_address(page);
> +	kva += offset;
> +
> +	/* tce_build receives a virtual address */
> +	entry += tbl->it_offset; /* Offset into real TCE table */
> +	ret = ppc_md.tce_build(tbl, entry, 1, kva, direction, NULL);
> +
> +	/* tce_build() only returns non-zero for transient errors */
> +	if (unlikely(ret)) {
> +		printk(KERN_ERR "tce_vfio: tce_put failed on tce=%llx ioba=%lx kva=%lx ret=%d\n",
> +				tce, entry << IOMMU_PAGE_SHIFT, kva, ret);
> +		put_page(page);
> +		return -EIO;
> +	}
> +
> +	/* Calculate if new system page has been locked */
> +	oldweight = syspage_weight(tbl->it_map, entry);
> +	__set_bit(entry, tbl->it_map);
> +
> +	return (oldweight == 0) ? 1 : 0;
> +}
> +
> +/*
> + * iommu_put_tces builds TCEs and returns the number of actually
> + * locked system pages
> + */
> +long iommu_put_tces(struct iommu_table *tbl, unsigned long entry,
> +		uint64_t tce, enum dma_data_direction direction,
> +		unsigned long pages)
> +{
> +	int i, ret = 0, retpages = 0;
> +	struct iommu_pool *pool = get_pool(tbl, entry);
> +
> +	BUILD_BUG_ON(PAGE_SIZE < IOMMU_PAGE_SIZE);
> +	BUG_ON(direction == DMA_NONE);
> +
> +	spin_lock(&(pool->lock));
> +
> +	/* Check if any is in use */
> +	for (i = 0; i < pages; ++i) {
> +		unsigned long oldtce = ppc_md.tce_get(tbl, entry + i);
> +		if ((oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)) ||
> +				test_bit(entry + i, tbl->it_map)) {
> +			WARN_ON(test_bit(entry + i, tbl->it_map));
> +			spin_unlock(&(pool->lock));
> +			return -EBUSY;
> +		}
> +	}
> +
> +	/* Put tces to the table */
> +	for (i = 0; (i < pages) && (ret >= 0); ++i, tce += IOMMU_PAGE_SIZE) {
> +		ret = put_tce(tbl, entry + i, tce, direction);
> +		if (ret == 1)
> +			++retpages;
> +	}
> +
> +	/*
> +	 * If failed, release locked pages, otherwise return the number
> +	 * of locked system pages
> +	 */
> +	if (ret < 0)
> +		clear_tces_nolock(tbl, entry, i);
> +	else
> +		ret = retpages;
> +
> +	tce_flush(tbl);
> +	spin_unlock(&(pool->lock));
> +
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(iommu_put_tces);
> +#endif /* CONFIG_IOMMU_API */
> diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
> index 05205cf..21250ef 100644
> --- a/arch/powerpc/platforms/powernv/pci.c
> +++ b/arch/powerpc/platforms/powernv/pci.c
> @@ -20,6 +20,7 @@
>  #include <linux/irq.h>
>  #include <linux/io.h>
>  #include <linux/msi.h>
> +#include <linux/iommu.h>
>  
>  #include <asm/sections.h>
>  #include <asm/io.h>
> @@ -613,3 +614,137 @@ void __init pnv_pci_init(void)
>  	ppc_md.teardown_msi_irqs = pnv_teardown_msi_irqs;
>  #endif
>  }
> +
> +#ifdef CONFIG_IOMMU_API
> +/*
> + * IOMMU groups support required by VFIO
> + */
> +static int add_device(struct device *dev)
> +{
> +	struct iommu_table *tbl;
> +	int ret = 0;
> +
> +	if (WARN_ON(dev->iommu_group)) {
> +		printk(KERN_WARNING "tce_vfio: device %s is already in iommu group %d, skipping\n",
> +				dev_name(dev),
> +				iommu_group_id(dev->iommu_group));
> +		return -EBUSY;
> +	}
> +
> +	tbl = get_iommu_table_base(dev);
> +	if (!tbl) {
> +		pr_debug("tce_vfio: skipping device %s with no tbl\n",
> +				dev_name(dev));
> +		return 0;
> +	}
> +
> +	pr_debug("tce_vfio: adding %s to iommu group %d\n",
> +			dev_name(dev), iommu_group_id(tbl->it_group));
> +
> +	ret = iommu_group_add_device(tbl->it_group, dev);
> +	if (ret < 0)
> +		printk(KERN_ERR "tce_vfio: %s has not been added, ret=%d\n",
> +				dev_name(dev), ret);
> +
> +	return ret;
> +}
> +
> +static void del_device(struct device *dev)
> +{
> +	iommu_group_remove_device(dev);
> +}
> +
> +static int iommu_bus_notifier(struct notifier_block *nb,
> +			      unsigned long action, void *data)
> +{
> +	struct device *dev = data;
> +
> +	switch (action) {
> +	case BUS_NOTIFY_ADD_DEVICE:
> +		return add_device(dev);
> +	case BUS_NOTIFY_DEL_DEVICE:
> +		del_device(dev);
> +		return 0;
> +	default:
> +		return 0;
> +	}
> +}
> +
> +static struct notifier_block tce_iommu_bus_nb = {
> +	.notifier_call = iommu_bus_notifier,
> +};
> +
> +static void group_release(void *iommu_data)
> +{
> +	struct iommu_table *tbl = iommu_data;
> +	tbl->it_group = NULL;
> +}
> +
> +static int __init tce_iommu_init(void)
> +{
> +	struct pci_dev *pdev = NULL;
> +	struct iommu_table *tbl;
> +	struct iommu_group *grp;
> +
> +	/* Allocate and initialize IOMMU groups */
> +	for_each_pci_dev(pdev) {
> +		tbl = get_iommu_table_base(&pdev->dev);
> +		if (!tbl)
> +			continue;
> +
> +		/* Skip already initialized */
> +		if (tbl->it_group)
> +			continue;
> +
> +		grp = iommu_group_alloc();
> +		if (IS_ERR(grp)) {
> +			printk(KERN_INFO "tce_vfio: cannot create "
> +					"new IOMMU group, ret=%ld\n",
> +					PTR_ERR(grp));
> +			return PTR_ERR(grp);
> +		}
> +		tbl->it_group = grp;
> +		iommu_group_set_iommudata(grp, tbl, group_release);
> +	}
> +
> +	bus_register_notifier(&pci_bus_type, &tce_iommu_bus_nb);
> +
> +	/* Add PCI devices to VFIO groups */
> +	for_each_pci_dev(pdev)
> +		add_device(&pdev->dev);
> +
> +	return 0;
> +}
> +
> +static void __exit tce_iommu_cleanup(void)
> +{
> +	struct pci_dev *pdev = NULL;
> +	struct iommu_table *tbl;
> +	struct iommu_group *grp = NULL;
> +
> +	bus_unregister_notifier(&pci_bus_type, &tce_iommu_bus_nb);
> +
> +	/* Delete PCI devices from VFIO groups */
> +	for_each_pci_dev(pdev)
> +		del_device(&pdev->dev);
> +
> +	/* Release VFIO groups */
> +	for_each_pci_dev(pdev) {
> +		tbl = get_iommu_table_base(&pdev->dev);
> +		if (!tbl)
> +			continue;
> +		grp = tbl->it_group;
> +
> +		/* Skip (already) uninitialized */
> +		if (!grp)
> +			continue;
> +
> +		/* Do actual release, group_release() is expected to work */
> +		iommu_group_put(grp);
> +		BUG_ON(tbl->it_group);
> +	}
> +}
> +
> +module_init(tce_iommu_init);
> +module_exit(tce_iommu_cleanup);
> +#endif /* CONFIG_IOMMU_API */
> diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
> index 9f69b56..29d11dc 100644
> --- a/drivers/iommu/Kconfig
> +++ b/drivers/iommu/Kconfig
> @@ -187,4 +187,12 @@ config EXYNOS_IOMMU_DEBUG
>  
>  	  Say N unless you need kernel log message for IOMMU debugging
>  
> +config SPAPR_TCE_IOMMU
> +	bool "sPAPR TCE IOMMU Support"
> +	depends on PPC_POWERNV
> +	select IOMMU_API
> +	help
> +	  Enables the bits of the IOMMU API required by VFIO. The iommu_ops
> +	  callbacks are not implemented yet.
> +
>  endif # IOMMU_SUPPORT




^ permalink raw reply	[flat|nested] 122+ messages in thread

* Re: [PATCH] vfio powerpc: enabled on powernv platform
  2012-11-30 16:48                       ` Alex Williamson
@ 2012-12-01  0:14                         ` Alexey Kardashevskiy
  -1 siblings, 0 replies; 122+ messages in thread
From: Alexey Kardashevskiy @ 2012-12-01  0:14 UTC (permalink / raw)
  To: Alex Williamson
  Cc: Benjamin Herrenschmidt, Paul Mackerras, linuxppc-dev,
	linux-kernel, David Gibson

On 01/12/12 03:48, Alex Williamson wrote:
> On Fri, 2012-11-30 at 17:14 +1100, Alexey Kardashevskiy wrote:
>> This patch initializes IOMMU groups based on the IOMMU
>> configuration discovered during the PCI scan on POWERNV
>> (POWER non virtualized) platform. The IOMMU groups are
>> to be used later by VFIO driver (PCI pass through).
>>
>> It also implements an API for mapping/unmapping pages for
>> guest PCI drivers and providing DMA window properties.
>> This API is going to be used later by QEMU-VFIO to handle
>> h_put_tce hypercalls from the KVM guest.
>>
>> Although this driver has been tested only on the POWERNV
>> platform, it should work on any platform which supports
>> TCE tables.
>>
>> To enable VFIO on POWER, enable SPAPR_TCE_IOMMU config
>> option and configure VFIO as required.
>>
>> Cc: David Gibson <david@gibson.dropbear.id.au>
>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>> ---
>>   arch/powerpc/include/asm/iommu.h     |    9 ++
>>   arch/powerpc/kernel/iommu.c          |  186 ++++++++++++++++++++++++++++++++++
>>   arch/powerpc/platforms/powernv/pci.c |  135 ++++++++++++++++++++++++
>>   drivers/iommu/Kconfig                |    8 ++
>>   4 files changed, 338 insertions(+)
>>
>> diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
>> index cbfe678..5c7087a 100644
>> --- a/arch/powerpc/include/asm/iommu.h
>> +++ b/arch/powerpc/include/asm/iommu.h
>> @@ -76,6 +76,9 @@ struct iommu_table {
>>   	struct iommu_pool large_pool;
>>   	struct iommu_pool pools[IOMMU_NR_POOLS];
>>   	unsigned long *it_map;       /* A simple allocation bitmap for now */
>> +#ifdef CONFIG_IOMMU_API
>> +	struct iommu_group *it_group;
>> +#endif
>>   };
>>
>>   struct scatterlist;
>> @@ -147,5 +150,11 @@ static inline void iommu_restore(void)
>>   }
>>   #endif
>>
>> +extern long iommu_clear_tces(struct iommu_table *tbl, unsigned long entry,
>> +		unsigned long pages);
>> +extern long iommu_put_tces(struct iommu_table *tbl, unsigned long entry,
>> +		uint64_t tce, enum dma_data_direction direction,
>> +		unsigned long pages);
>> +
>>   #endif /* __KERNEL__ */
>>   #endif /* _ASM_IOMMU_H */
>> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
>> index ff5a6ce..0646c50 100644
>> --- a/arch/powerpc/kernel/iommu.c
>> +++ b/arch/powerpc/kernel/iommu.c
>> @@ -44,6 +44,7 @@
>>   #include <asm/kdump.h>
>>   #include <asm/fadump.h>
>>   #include <asm/vio.h>
>> +#include <asm/tce.h>
>>
>>   #define DBG(...)
>>
>> @@ -856,3 +857,188 @@ void iommu_free_coherent(struct iommu_table *tbl, size_t size,
>>   		free_pages((unsigned long)vaddr, get_order(size));
>>   	}
>>   }
>> +
>> +#ifdef CONFIG_IOMMU_API
>> +/*
>> + * SPAPR TCE API
>> + */
>> +
>> +/*
>> + * Returns the number of used IOMMU pages (4K) within
>> + * the same system page (4K or 64K).
>> + * bitmap_weight is not used as it does not support big-endian maps.
>> + */
>> +static int syspage_weight(unsigned long *map, unsigned long entry)
>> +{
>> +	int ret = 0, nbits = PAGE_SIZE/IOMMU_PAGE_SIZE;
>> +
>> +	/* Aligns TCE entry number to system page boundary */
>> +	entry &= PAGE_MASK >> IOMMU_PAGE_SHIFT;
>> +
>> +	/* Count used 4K pages */
>> +	while (nbits--)
>> +		ret += (test_bit(entry++, map) == 0) ? 0 : 1;
>
> Ok, entry is the iova page number.  So presumably it's relative to the
> start of dma32_window_start since you're unlikely to have a bitmap that
> covers all of memory.  I hadn't realized that previously.

No, it is zero based. The DMA window acts as a filter, not an offset. But 
you are right, the it_map does not cover the whole global table (one per 
PHB, roughly); I will fix it, thanks for pointing that out. On my test 
system an IOMMU group is a whole PHB and the DMA window always starts 
from 0, so the tests do not show everything :)

> Doesn't that
> mean that it's actually impossible to create an ioctl based interface to
> the dma64_window since we're not going to know which window is the
> target?  I know you're not planning on one, but it seems limiting.

No, it is not limiting as the iova is zero based. Even if it were, there 
are flags in the map/unmap ioctls which we could use, no?

> We
> at least need some documentation here, but I'm wondering if iova
> shouldn't be zero based so we can determine which window it hits.  Also,
> now that I look at it, I can't find any range checking on the iova.

True... I have not hit this problem yet :) Good point, will fix, thanks.
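
(For what it is worth, with a zero based entry the missing check could be
as small as the sketch below, assuming it_size is the window size counted
in 4K IOMMU pages; this is an illustration, not the eventual fix:)

	/* Sketch: reject ranges running past the end of the DMA window */
	if ((entry + pages) > tbl->it_size)
		return -EINVAL;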



-- 
Alexey

^ permalink raw reply	[flat|nested] 122+ messages in thread

* [PATCH 0/2] vfio on power: yet another try
  2012-11-23  9:03       ` Alexey Kardashevskiy
@ 2012-12-03  2:52         ` Alexey Kardashevskiy
  -1 siblings, 0 replies; 122+ messages in thread
From: Alexey Kardashevskiy @ 2012-12-03  2:52 UTC (permalink / raw)
  To: Alex Williamson
  Cc: Alexey Kardashevskiy, Benjamin Herrenschmidt, Paul Mackerras,
	David Gibson, linuxppc-dev, linux-kernel, kvm

The set includes 2 patches.

The first one adds the IOMMU support necessary for VFIO,
the second one adds a SPAPR TCE IOMMU driver to VFIO.

At the moment we have decided to drop the DMA64 window
properties because the API needs more than just 2 properties
(such as dynamic window allocation), but we have not decided
on its actual design yet.

Alexey Kardashevskiy (2):
  vfio powerpc: enabled on powernv platform
  vfio powerpc: implemented IOMMU driver for VFIO

 arch/powerpc/include/asm/iommu.h     |    9 +
 arch/powerpc/kernel/iommu.c          |  186 ++++++++++++++++++
 arch/powerpc/platforms/powernv/pci.c |  135 +++++++++++++
 drivers/iommu/Kconfig                |    8 +
 drivers/vfio/Kconfig                 |    6 +
 drivers/vfio/Makefile                |    1 +
 drivers/vfio/vfio_iommu_spapr_tce.c  |  350 ++++++++++++++++++++++++++++++++++
 include/linux/vfio.h                 |   26 +++
 8 files changed, 721 insertions(+)
 create mode 100644 drivers/vfio/vfio_iommu_spapr_tce.c

-- 
1.7.10.4


^ permalink raw reply	[flat|nested] 122+ messages in thread

* [PATCH 1/2] vfio powerpc: enabled on powernv platform
  2012-12-03  2:52         ` Alexey Kardashevskiy
@ 2012-12-03  2:52           ` Alexey Kardashevskiy
  -1 siblings, 0 replies; 122+ messages in thread
From: Alexey Kardashevskiy @ 2012-12-03  2:52 UTC (permalink / raw)
  To: Alex Williamson
  Cc: Alexey Kardashevskiy, Benjamin Herrenschmidt, Paul Mackerras,
	David Gibson, linuxppc-dev, linux-kernel, kvm

This patch initializes IOMMU groups based on the IOMMU
configuration discovered during the PCI scan on the POWERNV
(POWER non-virtualized) platform. The IOMMU groups are
to be used later by the VFIO driver (PCI pass-through).

It also implements an API for mapping/unmapping pages for
guest PCI drivers and for providing DMA window properties.
This API is going to be used later by QEMU-VFIO to handle
h_put_tce hypercalls from the KVM guest.

Although this driver has been tested only on the POWERNV
platform, it should work on any platform which supports
TCE tables.

To enable VFIO on POWER, enable SPAPR_TCE_IOMMU config
option and configure VFIO as required.

Cc: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
 arch/powerpc/include/asm/iommu.h     |    9 ++
 arch/powerpc/kernel/iommu.c          |  186 ++++++++++++++++++++++++++++++++++
 arch/powerpc/platforms/powernv/pci.c |  135 ++++++++++++++++++++++++
 drivers/iommu/Kconfig                |    8 ++
 4 files changed, 338 insertions(+)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index cbfe678..5c7087a 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -76,6 +76,9 @@ struct iommu_table {
 	struct iommu_pool large_pool;
 	struct iommu_pool pools[IOMMU_NR_POOLS];
 	unsigned long *it_map;       /* A simple allocation bitmap for now */
+#ifdef CONFIG_IOMMU_API
+	struct iommu_group *it_group;
+#endif
 };
 
 struct scatterlist;
@@ -147,5 +150,11 @@ static inline void iommu_restore(void)
 }
 #endif
 
+extern long iommu_clear_tces(struct iommu_table *tbl, unsigned long entry,
+		unsigned long pages);
+extern long iommu_put_tces(struct iommu_table *tbl, unsigned long entry,
+		uint64_t tce, enum dma_data_direction direction,
+		unsigned long pages);
+
 #endif /* __KERNEL__ */
 #endif /* _ASM_IOMMU_H */
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index ff5a6ce..2738aa4 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -44,6 +44,7 @@
 #include <asm/kdump.h>
 #include <asm/fadump.h>
 #include <asm/vio.h>
+#include <asm/tce.h>
 
 #define DBG(...)
 
@@ -856,3 +857,188 @@ void iommu_free_coherent(struct iommu_table *tbl, size_t size,
 		free_pages((unsigned long)vaddr, get_order(size));
 	}
 }
+
+#ifdef CONFIG_IOMMU_API
+/*
+ * SPAPR TCE API
+ */
+
+/*
+ * Returns the number of used IOMMU pages (4K) within
+ * the same system page (4K or 64K).
+ * bitmap_weight is not used as it does not support big-endian maps.
+ */
+static int syspage_weight(unsigned long *map, unsigned long entry)
+{
+	int ret = 0, nbits = PAGE_SIZE/IOMMU_PAGE_SIZE;
+
+	/* Aligns TCE entry number to system page boundary */
+	entry &= PAGE_MASK >> IOMMU_PAGE_SHIFT;
+
+	/* Count used 4K pages */
+	while (nbits--)
+		ret += (test_bit(entry++, map) == 0) ? 0 : 1;
+
+	return ret;
+}
+
+static void tce_flush(struct iommu_table *tbl)
+{
+	/* Flush/invalidate TLB caches if necessary */
+	if (ppc_md.tce_flush)
+		ppc_md.tce_flush(tbl);
+
+	/* Make sure updates are seen by hardware */
+	mb();
+}
+
+/*
+ * clear_tces_nolock clears TCEs and returns the number of system pages
+ * on which it called put_page()
+ */
+static long clear_tces_nolock(struct iommu_table *tbl, unsigned long entry,
+		unsigned long pages)
+{
+	int i, retpages = 0;
+	unsigned long oldtce, oldweight;
+	struct page *page;
+
+	for (i = 0; i < pages; ++i) {
+		oldtce = ppc_md.tce_get(tbl, entry + i);
+		ppc_md.tce_free(tbl, entry + i, 1);
+
+		oldweight = syspage_weight(tbl->it_map, entry);
+		__clear_bit(entry - tbl->it_offset, tbl->it_map);
+
+		if (!(oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)))
+			continue;
+
+		page = pfn_to_page(oldtce >> PAGE_SHIFT);
+
+		WARN_ON(!page);
+		if (!page)
+			continue;
+
+		if (oldtce & TCE_PCI_WRITE)
+			SetPageDirty(page);
+
+		put_page(page);
+
+		/* That was the last IOMMU page within the system page */
+		if ((oldweight == 1) && !syspage_weight(tbl->it_map, entry))
+			++retpages;
+	}
+
+	return retpages;
+}
+
+/*
+ * iommu_clear_tces clears TCEs and returns the number
+ * of released system pages
+ */
+long iommu_clear_tces(struct iommu_table *tbl, unsigned long entry,
+		unsigned long pages)
+{
+	int ret;
+	struct iommu_pool *pool = get_pool(tbl, entry);
+
+	spin_lock(&(pool->lock));
+	ret = clear_tces_nolock(tbl, entry, pages);
+	tce_flush(tbl);
+	spin_unlock(&(pool->lock));
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_clear_tces);
+
+static int put_tce(struct iommu_table *tbl, unsigned long entry,
+		uint64_t tce, enum dma_data_direction direction)
+{
+	int ret;
+	struct page *page = NULL;
+	unsigned long kva, offset, oldweight;
+
+	/* Map new TCE */
+	offset = (tce & IOMMU_PAGE_MASK) - (tce & PAGE_MASK);
+	ret = get_user_pages_fast(tce & PAGE_MASK, 1,
+			direction != DMA_TO_DEVICE, &page);
+	if (ret < 1) {
+		printk(KERN_ERR "tce_vfio: get_user_pages_fast failed tce=%llx ioba=%lx ret=%d\n",
+				tce, entry << IOMMU_PAGE_SHIFT, ret);
+		if (!ret || (ret > 1))
+			ret = -EFAULT;
+		return ret;
+	}
+
+	kva = (unsigned long) page_address(page);
+	kva += offset;
+
+	/* tce_build receives a virtual address */
+	entry += tbl->it_offset; /* Offset into real TCE table */
+	ret = ppc_md.tce_build(tbl, entry, 1, kva, direction, NULL);
+
+	/* tce_build() only returns non-zero for transient errors */
+	if (unlikely(ret)) {
+		printk(KERN_ERR "tce_vfio: tce_put failed on tce=%llx ioba=%lx kva=%lx ret=%d\n",
+				tce, entry << IOMMU_PAGE_SHIFT, kva, ret);
+		put_page(page);
+		return -EIO;
+	}
+
+	/* Calculate if new system page has been locked */
+	oldweight = syspage_weight(tbl->it_map, entry);
+	__set_bit(entry - tbl->it_offset, tbl->it_map);
+
+	return (oldweight == 0) ? 1 : 0;
+}
+
+/*
+ * iommu_put_tces builds TCEs and returns the number of actually
+ * locked system pages
+ */
+long iommu_put_tces(struct iommu_table *tbl, unsigned long entry,
+		uint64_t tce, enum dma_data_direction direction,
+		unsigned long pages)
+{
+	int i, ret = 0, retpages = 0;
+	struct iommu_pool *pool = get_pool(tbl, entry);
+
+	BUILD_BUG_ON(PAGE_SIZE < IOMMU_PAGE_SIZE);
+	BUG_ON(direction == DMA_NONE);
+
+	spin_lock(&(pool->lock));
+
+	/* Check if any is in use */
+	for (i = 0; i < pages; ++i) {
+		unsigned long oldtce = ppc_md.tce_get(tbl, entry + i);
+		if ((oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)) ||
+				test_bit(entry + i, tbl->it_map)) {
+			WARN_ON(test_bit(entry + i, tbl->it_map));
+			spin_unlock(&(pool->lock));
+			return -EBUSY;
+		}
+	}
+
+	/* Put tces to the table */
+	for (i = 0; (i < pages) && (ret >= 0); ++i, tce += IOMMU_PAGE_SIZE) {
+		ret = put_tce(tbl, entry + i, tce, direction);
+		if (ret == 1)
+			++retpages;
+	}
+
+	/*
+	 * If failed, release locked pages, otherwise return the number
+	 * of locked system pages
+	 */
+	if (ret < 0)
+		clear_tces_nolock(tbl, entry, i);
+	else
+		ret = retpages;
+
+	tce_flush(tbl);
+	spin_unlock(&(pool->lock));
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_put_tces);
+#endif /* CONFIG_IOMMU_API */
diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index 05205cf..21250ef 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -20,6 +20,7 @@
 #include <linux/irq.h>
 #include <linux/io.h>
 #include <linux/msi.h>
+#include <linux/iommu.h>
 
 #include <asm/sections.h>
 #include <asm/io.h>
@@ -613,3 +614,137 @@ void __init pnv_pci_init(void)
 	ppc_md.teardown_msi_irqs = pnv_teardown_msi_irqs;
 #endif
 }
+
+#ifdef CONFIG_IOMMU_API
+/*
+ * IOMMU groups support required by VFIO
+ */
+static int add_device(struct device *dev)
+{
+	struct iommu_table *tbl;
+	int ret = 0;
+
+	if (WARN_ON(dev->iommu_group)) {
+		printk(KERN_WARNING "tce_vfio: device %s is already in iommu group %d, skipping\n",
+				dev_name(dev),
+				iommu_group_id(dev->iommu_group));
+		return -EBUSY;
+	}
+
+	tbl = get_iommu_table_base(dev);
+	if (!tbl) {
+		pr_debug("tce_vfio: skipping device %s with no tbl\n",
+				dev_name(dev));
+		return 0;
+	}
+
+	pr_debug("tce_vfio: adding %s to iommu group %d\n",
+			dev_name(dev), iommu_group_id(tbl->it_group));
+
+	ret = iommu_group_add_device(tbl->it_group, dev);
+	if (ret < 0)
+		printk(KERN_ERR "tce_vfio: %s has not been added, ret=%d\n",
+				dev_name(dev), ret);
+
+	return ret;
+}
+
+static void del_device(struct device *dev)
+{
+	iommu_group_remove_device(dev);
+}
+
+static int iommu_bus_notifier(struct notifier_block *nb,
+			      unsigned long action, void *data)
+{
+	struct device *dev = data;
+
+	switch (action) {
+	case BUS_NOTIFY_ADD_DEVICE:
+		return add_device(dev);
+	case BUS_NOTIFY_DEL_DEVICE:
+		del_device(dev);
+		return 0;
+	default:
+		return 0;
+	}
+}
+
+static struct notifier_block tce_iommu_bus_nb = {
+	.notifier_call = iommu_bus_notifier,
+};
+
+static void group_release(void *iommu_data)
+{
+	struct iommu_table *tbl = iommu_data;
+	tbl->it_group = NULL;
+}
+
+static int __init tce_iommu_init(void)
+{
+	struct pci_dev *pdev = NULL;
+	struct iommu_table *tbl;
+	struct iommu_group *grp;
+
+	/* Allocate and initialize IOMMU groups */
+	for_each_pci_dev(pdev) {
+		tbl = get_iommu_table_base(&pdev->dev);
+		if (!tbl)
+			continue;
+
+		/* Skip already initialized */
+		if (tbl->it_group)
+			continue;
+
+		grp = iommu_group_alloc();
+		if (IS_ERR(grp)) {
+			printk(KERN_INFO "tce_vfio: cannot create "
+					"new IOMMU group, ret=%ld\n",
+					PTR_ERR(grp));
+			return PTR_ERR(grp);
+		}
+		tbl->it_group = grp;
+		iommu_group_set_iommudata(grp, tbl, group_release);
+	}
+
+	bus_register_notifier(&pci_bus_type, &tce_iommu_bus_nb);
+
+	/* Add PCI devices to VFIO groups */
+	for_each_pci_dev(pdev)
+		add_device(&pdev->dev);
+
+	return 0;
+}
+
+static void __exit tce_iommu_cleanup(void)
+{
+	struct pci_dev *pdev = NULL;
+	struct iommu_table *tbl;
+	struct iommu_group *grp = NULL;
+
+	bus_unregister_notifier(&pci_bus_type, &tce_iommu_bus_nb);
+
+	/* Delete PCI devices from VFIO groups */
+	for_each_pci_dev(pdev)
+		del_device(&pdev->dev);
+
+	/* Release VFIO groups */
+	for_each_pci_dev(pdev) {
+		tbl = get_iommu_table_base(&pdev->dev);
+		if (!tbl)
+			continue;
+		grp = tbl->it_group;
+
+		/* Skip (already) uninitialized */
+		if (!grp)
+			continue;
+
+		/* Do actual release, group_release() is expected to work */
+		iommu_group_put(grp);
+		BUG_ON(tbl->it_group);
+	}
+}
+
+module_init(tce_iommu_init);
+module_exit(tce_iommu_cleanup);
+#endif /* CONFIG_IOMMU_API */
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index 9f69b56..29d11dc 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -187,4 +187,12 @@ config EXYNOS_IOMMU_DEBUG
 
 	  Say N unless you need kernel log message for IOMMU debugging
 
+config SPAPR_TCE_IOMMU
+	bool "sPAPR TCE IOMMU Support"
+	depends on PPC_POWERNV
+	select IOMMU_API
+	help
+	  Enables the bits of the IOMMU API required by VFIO. The iommu_ops
+	  callbacks are not implemented yet.
+
 endif # IOMMU_SUPPORT
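
To illustrate how the two exported helpers are meant to be driven, here
is a hypothetical caller, roughly what the VFIO SPAPR TCE driver's
map/unmap paths would do; only iommu_put_tces() and iommu_clear_tces()
come from this patch, everything else below is made up:

/* Hypothetical: map then unmap a range via the exported TCE API */
static long example_map_unmap(struct iommu_table *tbl, unsigned long ioba,
		uint64_t tce, unsigned long size)
{
	unsigned long entry = ioba >> IOMMU_PAGE_SHIFT;
	unsigned long pages = size >> IOMMU_PAGE_SHIFT;
	long locked;

	/* Returns the number of newly locked system pages or -errno */
	locked = iommu_put_tces(tbl, entry, tce, DMA_BIDIRECTIONAL, pages);
	if (locked < 0)
		return locked;

	/* Returns the number of system pages released */
	return iommu_clear_tces(tbl, entry, pages);
}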
-- 
1.7.10.4


^ permalink raw reply related	[flat|nested] 122+ messages in thread

* [PATCH 1/2] vfio powerpc: enabled on powernv platform
@ 2012-12-03  2:52           ` Alexey Kardashevskiy
  0 siblings, 0 replies; 122+ messages in thread
From: Alexey Kardashevskiy @ 2012-12-03  2:52 UTC (permalink / raw)
  To: Alex Williamson
  Cc: kvm, Alexey Kardashevskiy, linux-kernel, Paul Mackerras,
	linuxppc-dev, David Gibson

This patch initializes IOMMU groups based on the IOMMU
configuration discovered during the PCI scan on POWERNV
(POWER non virtualized) platform. The IOMMU groups are
to be used later by VFIO driver (PCI pass through).

It also implements an API for mapping/unmapping pages for
guest PCI drivers and providing DMA window properties.
This API is going to be used later by QEMU-VFIO to handle
h_put_tce hypercalls from the KVM guest.

Although this driver has been tested only on the POWERNV
platform, it should work on any platform which supports
TCE tables.

To enable VFIO on POWER, enable SPAPR_TCE_IOMMU config
option and configure VFIO as required.

Cc: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
 arch/powerpc/include/asm/iommu.h     |    9 ++
 arch/powerpc/kernel/iommu.c          |  186 ++++++++++++++++++++++++++++++++++
 arch/powerpc/platforms/powernv/pci.c |  135 ++++++++++++++++++++++++
 drivers/iommu/Kconfig                |    8 ++
 4 files changed, 338 insertions(+)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index cbfe678..5c7087a 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -76,6 +76,9 @@ struct iommu_table {
 	struct iommu_pool large_pool;
 	struct iommu_pool pools[IOMMU_NR_POOLS];
 	unsigned long *it_map;       /* A simple allocation bitmap for now */
+#ifdef CONFIG_IOMMU_API
+	struct iommu_group *it_group;
+#endif
 };
 
 struct scatterlist;
@@ -147,5 +150,11 @@ static inline void iommu_restore(void)
 }
 #endif
 
+extern long iommu_clear_tces(struct iommu_table *tbl, unsigned long entry,
+		unsigned long pages);
+extern long iommu_put_tces(struct iommu_table *tbl, unsigned long entry,
+		uint64_t tce, enum dma_data_direction direction,
+		unsigned long pages);
+
 #endif /* __KERNEL__ */
 #endif /* _ASM_IOMMU_H */
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index ff5a6ce..2738aa4 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -44,6 +44,7 @@
 #include <asm/kdump.h>
 #include <asm/fadump.h>
 #include <asm/vio.h>
+#include <asm/tce.h>
 
 #define DBG(...)
 
@@ -856,3 +857,188 @@ void iommu_free_coherent(struct iommu_table *tbl, size_t size,
 		free_pages((unsigned long)vaddr, get_order(size));
 	}
 }
+
+#ifdef CONFIG_IOMMU_API
+/*
+ * SPAPR TCE API
+ */
+
+/*
+ * Returns the number of used IOMMU pages (4K) within
+ * the same system page (4K or 64K).
+ * bitmap_weight is not used as it does not support bigendian maps.
+ */
+static int syspage_weight(unsigned long *map, unsigned long entry)
+{
+	int ret = 0, nbits = PAGE_SIZE/IOMMU_PAGE_SIZE;
+
+	/* Aligns TCE entry number to system page boundary */
+	entry &= PAGE_MASK >> IOMMU_PAGE_SHIFT;
+
+	/* Count used 4K pages */
+	while (nbits--)
+		ret += (test_bit(entry++, map) == 0) ? 0 : 1;
+
+	return ret;
+}
+
+static void tce_flush(struct iommu_table *tbl)
+{
+	/* Flush/invalidate TLB caches if necessary */
+	if (ppc_md.tce_flush)
+		ppc_md.tce_flush(tbl);
+
+	/* Make sure updates are seen by hardware */
+	mb();
+}
+
+/*
+ * iommu_clear_tces clears tces and returned the number of system pages
+ * which it called put_page() on
+ */
+static long clear_tces_nolock(struct iommu_table *tbl, unsigned long entry,
+		unsigned long pages)
+{
+	int i, retpages = 0;
+	unsigned long oldtce, oldweight;
+	struct page *page;
+
+	for (i = 0; i < pages; ++i) {
+		oldtce = ppc_md.tce_get(tbl, entry + i);
+		ppc_md.tce_free(tbl, entry + i, 1);
+
+		oldweight = syspage_weight(tbl->it_map, entry);
+		__clear_bit(entry - tbl->it_offset, tbl->it_map);
+
+		if (!(oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)))
+			continue;
+
+		page = pfn_to_page(oldtce >> PAGE_SHIFT);
+
+		WARN_ON(!page);
+		if (!page)
+			continue;
+
+		if (oldtce & TCE_PCI_WRITE)
+			SetPageDirty(page);
+
+		put_page(page);
+
+		/* That was the last IOMMU page within the system page */
+		if ((oldweight == 1) && !syspage_weight(tbl->it_map, entry))
+			++retpages;
+	}
+
+	return retpages;
+}
+
+/*
+ * iommu_clear_tces clears TCEs and returns the number
+ * of released system pages
+ */
+long iommu_clear_tces(struct iommu_table *tbl, unsigned long entry,
+		unsigned long pages)
+{
+	int ret;
+	struct iommu_pool *pool = get_pool(tbl, entry);
+
+	spin_lock(&(pool->lock));
+	ret = clear_tces_nolock(tbl, entry, pages);
+	tce_flush(tbl);
+	spin_unlock(&(pool->lock));
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_clear_tces);
+
+static int put_tce(struct iommu_table *tbl, unsigned long entry,
+		uint64_t tce, enum dma_data_direction direction)
+{
+	int ret;
+	struct page *page = NULL;
+	unsigned long kva, offset, oldweight;
+
+	/* Map new TCE */
+	offset = (tce & IOMMU_PAGE_MASK) - (tce & PAGE_MASK);
+	ret = get_user_pages_fast(tce & PAGE_MASK, 1,
+			direction != DMA_TO_DEVICE, &page);
+	if (ret < 1) {
+		printk(KERN_ERR "tce_vfio: get_user_pages_fast failed tce=%llx ioba=%lx ret=%d\n",
+				tce, entry << IOMMU_PAGE_SHIFT, ret);
+		if (!ret || (ret > 1))
+			ret = -EFAULT;
+		return ret;
+	}
+
+	kva = (unsigned long) page_address(page);
+	kva += offset;
+
+	/* tce_build receives a virtual address */
+	entry += tbl->it_offset; /* Offset into real TCE table */
+	ret = ppc_md.tce_build(tbl, entry, 1, kva, direction, NULL);
+
+	/* tce_build() only returns non-zero for transient errors */
+	if (unlikely(ret)) {
+		printk(KERN_ERR "tce_vfio: tce_put failed on tce=%llx ioba=%lx kva=%lx ret=%d\n",
+				tce, entry << IOMMU_PAGE_SHIFT, kva, ret);
+		put_page(page);
+		return -EIO;
+	}
+
+	/* Calculate if new system page has been locked */
+	oldweight = syspage_weight(tbl->it_map, entry);
+	__set_bit(entry - tbl->it_offset, tbl->it_map);
+
+	return (oldweight == 0) ? 1 : 0;
+}
+
+/*
+ * iommu_put_tces builds TCEs and returns the number of system pages
+ * it actually locked
+ */
+long iommu_put_tces(struct iommu_table *tbl, unsigned long entry,
+		uint64_t tce, enum dma_data_direction direction,
+		unsigned long pages)
+{
+	int i, ret = 0, retpages = 0;
+	struct iommu_pool *pool = get_pool(tbl, entry);
+
+	BUILD_BUG_ON(PAGE_SIZE < IOMMU_PAGE_SIZE);
+	BUG_ON(direction == DMA_NONE);
+
+	spin_lock(&(pool->lock));
+
+	/* Check if any is in use */
+	for (i = 0; i < pages; ++i) {
+		unsigned long oldtce = ppc_md.tce_get(tbl, entry + i);
+		if ((oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)) ||
+				test_bit(entry + i, tbl->it_map)) {
+			WARN_ON(test_bit(entry + i, tbl->it_map));
+			spin_unlock(&(pool->lock));
+			return -EBUSY;
+		}
+	}
+
+	/* Put tces to the table */
+	for (i = 0; (i < pages) && (ret >= 0); ++i, tce += IOMMU_PAGE_SIZE) {
+		ret = put_tce(tbl, entry + i, tce, direction);
+		if (ret == 1)
+			++retpages;
+	}
+
+	/*
+	 * If failed, release locked pages, otherwise return the number
+	 * of locked system pages
+	 */
+	if (ret < 0)
+		clear_tces_nolock(tbl, entry, i);
+	else
+		ret = retpages;
+
+	tce_flush(tbl);
+	spin_unlock(&(pool->lock));
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_put_tces);
+#endif /* CONFIG_IOMMU_API */
diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index 05205cf..21250ef 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -20,6 +20,7 @@
 #include <linux/irq.h>
 #include <linux/io.h>
 #include <linux/msi.h>
+#include <linux/iommu.h>
 
 #include <asm/sections.h>
 #include <asm/io.h>
@@ -613,3 +614,137 @@ void __init pnv_pci_init(void)
 	ppc_md.teardown_msi_irqs = pnv_teardown_msi_irqs;
 #endif
 }
+
+#ifdef CONFIG_IOMMU_API
+/*
+ * IOMMU groups support required by VFIO
+ */
+static int add_device(struct device *dev)
+{
+	struct iommu_table *tbl;
+	int ret = 0;
+
+	if (WARN_ON(dev->iommu_group)) {
+		printk(KERN_WARNING "tce_vfio: device %s is already in iommu group %d, skipping\n",
+				dev_name(dev),
+				iommu_group_id(dev->iommu_group));
+		return -EBUSY;
+	}
+
+	tbl = get_iommu_table_base(dev);
+	if (!tbl) {
+		pr_debug("tce_vfio: skipping device %s with no tbl\n",
+				dev_name(dev));
+		return 0;
+	}
+
+	pr_debug("tce_vfio: adding %s to iommu group %d\n",
+			dev_name(dev), iommu_group_id(tbl->it_group));
+
+	ret = iommu_group_add_device(tbl->it_group, dev);
+	if (ret < 0)
+		printk(KERN_ERR "tce_vfio: %s has not been added, ret=%d\n",
+				dev_name(dev), ret);
+
+	return ret;
+}
+
+static void del_device(struct device *dev)
+{
+	iommu_group_remove_device(dev);
+}
+
+static int iommu_bus_notifier(struct notifier_block *nb,
+			      unsigned long action, void *data)
+{
+	struct device *dev = data;
+
+	switch (action) {
+	case BUS_NOTIFY_ADD_DEVICE:
+		return add_device(dev);
+	case BUS_NOTIFY_DEL_DEVICE:
+		del_device(dev);
+		return 0;
+	default:
+		return 0;
+	}
+}
+
+static struct notifier_block tce_iommu_bus_nb = {
+	.notifier_call = iommu_bus_notifier,
+};
+
+static void group_release(void *iommu_data)
+{
+	struct iommu_table *tbl = iommu_data;
+	tbl->it_group = NULL;
+}
+
+static int __init tce_iommu_init(void)
+{
+	struct pci_dev *pdev = NULL;
+	struct iommu_table *tbl;
+	struct iommu_group *grp;
+
+	/* Allocate and initialize IOMMU groups */
+	for_each_pci_dev(pdev) {
+		tbl = get_iommu_table_base(&pdev->dev);
+		if (!tbl)
+			continue;
+
+		/* Skip already initialized */
+		if (tbl->it_group)
+			continue;
+
+		grp = iommu_group_alloc();
+		if (IS_ERR(grp)) {
+			printk(KERN_INFO "tce_vfio: cannot create "
+					"new IOMMU group, ret=%ld\n",
+					PTR_ERR(grp));
+			return PTR_ERR(grp);
+		}
+		tbl->it_group = grp;
+		iommu_group_set_iommudata(grp, tbl, group_release);
+	}
+
+	bus_register_notifier(&pci_bus_type, &tce_iommu_bus_nb);
+
+	/* Add PCI devices to VFIO groups */
+	for_each_pci_dev(pdev)
+		add_device(&pdev->dev);
+
+	return 0;
+}
+
+static void __exit tce_iommu_cleanup(void)
+{
+	struct pci_dev *pdev = NULL;
+	struct iommu_table *tbl;
+	struct iommu_group *grp = NULL;
+
+	bus_unregister_notifier(&pci_bus_type, &tce_iommu_bus_nb);
+
+	/* Delete PCI devices from VFIO groups */
+	for_each_pci_dev(pdev)
+		del_device(&pdev->dev);
+
+	/* Release VFIO groups */
+	for_each_pci_dev(pdev) {
+		tbl = get_iommu_table_base(&pdev->dev);
+		if (!tbl)
+			continue;
+		grp = tbl->it_group;
+
+		/* Skip (already) uninitialized */
+		if (!grp)
+			continue;
+
+		/* Do actual release, group_release() is expected to work */
+		iommu_group_put(grp);
+		BUG_ON(tbl->it_group);
+	}
+}
+
+module_init(tce_iommu_init);
+module_exit(tce_iommu_cleanup);
+#endif /* CONFIG_IOMMU_API */
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index 9f69b56..29d11dc 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -187,4 +187,12 @@ config EXYNOS_IOMMU_DEBUG
 
 	  Say N unless you need kernel log message for IOMMU debugging
 
+config SPAPR_TCE_IOMMU
+	bool "sPAPR TCE IOMMU Support"
+	depends on PPC_POWERNV
+	select IOMMU_API
+	help
+	  Enables the parts of the IOMMU API required by VFIO. The
+	  iommu_ops callbacks are not implemented yet.
+
 endif # IOMMU_SUPPORT
-- 
1.7.10.4


* [PATCH 2/2] vfio powerpc: implemented IOMMU driver for VFIO
  2012-12-03  2:52         ` Alexey Kardashevskiy
@ 2012-12-03  2:52           ` Alexey Kardashevskiy
  -1 siblings, 0 replies; 122+ messages in thread
From: Alexey Kardashevskiy @ 2012-12-03  2:52 UTC (permalink / raw)
  To: Alex Williamson
  Cc: Alexey Kardashevskiy, Benjamin Herrenschmidt, Paul Mackerras,
	David Gibson, linuxppc-dev, linux-kernel, kvm

VFIO implements platform independent stuff such as
a PCI driver, BAR access (via read/write on a file descriptor
or direct mapping when possible) and IRQ signaling.

The platform dependent part includes IOMMU initialization
and handling. This patch implements an IOMMU driver for VFIO
which maps and unmaps pages for guest IO and provides
information about the DMA window (required by a POWERPC
guest).

The counterpart in QEMU is required to support this functionality.
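
For illustration, userspace is expected to drive the new interface
roughly as follows (a sketch only: "container" is the usual VFIO
container fd with the group already attached, "buf" is a page-aligned
buffer, and error handling is omitted):

	struct vfio_iommu_spapr_tce_info info = { .argsz = sizeof(info) };
	vfio_iommu_spapr_tce_dma_map map = { .argsz = sizeof(map) };

	ioctl(container, VFIO_CHECK_EXTENSION, VFIO_SPAPR_TCE_IOMMU);
	ioctl(container, VFIO_SET_IOMMU, VFIO_SPAPR_TCE_IOMMU);

	/* Learn the DMA window to present to the guest device tree */
	ioctl(container, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);

	/* Map one 4K IOMMU page at the start of the window */
	map.vaddr = (__u64) buf;
	map.iova = info.dma32_window_start;
	map.size = 4096;
	map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
	ioctl(container, VFIO_IOMMU_MAP_DMA, &map);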

Cc: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
 drivers/vfio/Kconfig                |    6 +
 drivers/vfio/Makefile               |    1 +
 drivers/vfio/vfio_iommu_spapr_tce.c |  350 +++++++++++++++++++++++++++++++++++
 include/linux/vfio.h                |   26 +++
 4 files changed, 383 insertions(+)
 create mode 100644 drivers/vfio/vfio_iommu_spapr_tce.c

diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
index 7cd5dec..b464687 100644
--- a/drivers/vfio/Kconfig
+++ b/drivers/vfio/Kconfig
@@ -3,10 +3,16 @@ config VFIO_IOMMU_TYPE1
 	depends on VFIO
 	default n
 
+config VFIO_IOMMU_SPAPR_TCE
+	tristate
+	depends on VFIO && SPAPR_TCE_IOMMU
+	default n
+
 menuconfig VFIO
 	tristate "VFIO Non-Privileged userspace driver framework"
 	depends on IOMMU_API
 	select VFIO_IOMMU_TYPE1 if X86
+	select VFIO_IOMMU_SPAPR_TCE if PPC_POWERNV
 	help
 	  VFIO provides a framework for secure userspace device drivers.
 	  See Documentation/vfio.txt for more details.
diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
index 2398d4a..72bfabc 100644
--- a/drivers/vfio/Makefile
+++ b/drivers/vfio/Makefile
@@ -1,3 +1,4 @@
 obj-$(CONFIG_VFIO) += vfio.o
 obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o
+obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o
 obj-$(CONFIG_VFIO_PCI) += pci/
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
new file mode 100644
index 0000000..806ad9f
--- /dev/null
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -0,0 +1,350 @@
+/*
+ * VFIO: IOMMU DMA mapping support for TCE on POWER
+ *
+ * Copyright (C) 2012 IBM Corp.  All rights reserved.
+ *     Author: Alexey Kardashevskiy <aik@ozlabs.ru>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Derived from original vfio_iommu_type1.c:
+ * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
+ *     Author: Alex Williamson <alex.williamson@redhat.com>
+ */
+
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/err.h>
+#include <linux/vfio.h>
+#include <asm/iommu.h>
+
+#define DRIVER_VERSION  "0.1"
+#define DRIVER_AUTHOR   "aik@ozlabs.ru"
+#define DRIVER_DESC     "VFIO IOMMU SPAPR TCE"
+
+static void tce_iommu_detach_group(void *iommu_data,
+		struct iommu_group *iommu_group);
+
+/*
+ * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
+ */
+
+/*
+ * This code handles mapping and unmapping of user data buffers
+ * into DMA'ble space using the IOMMU
+ */
+
+#define NPAGE_TO_SIZE(npage)	((size_t)(npage) << PAGE_SHIFT)
+
+struct vwork {
+	struct mm_struct	*mm;
+	long			npage;
+	struct work_struct	work;
+};
+
+/* delayed decrement/increment for locked_vm */
+static void lock_acct_bg(struct work_struct *work)
+{
+	struct vwork *vwork = container_of(work, struct vwork, work);
+	struct mm_struct *mm;
+
+	mm = vwork->mm;
+	down_write(&mm->mmap_sem);
+	mm->locked_vm += vwork->npage;
+	up_write(&mm->mmap_sem);
+	mmput(mm);
+	kfree(vwork);
+}
+
+static void lock_acct(long npage)
+{
+	struct vwork *vwork;
+	struct mm_struct *mm;
+
+	if (!current->mm)
+		return; /* process exited */
+
+	if (down_write_trylock(&current->mm->mmap_sem)) {
+		current->mm->locked_vm += npage;
+		up_write(&current->mm->mmap_sem);
+		return;
+	}
+
+	/*
+	 * Couldn't get mmap_sem lock, so must set up to update
+	 * mm->locked_vm later. If locked_vm were atomic, we
+	 * wouldn't need this silliness
+	 */
+	vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL);
+	if (!vwork)
+		return;
+	mm = get_task_mm(current);
+	if (!mm) {
+		kfree(vwork);
+		return;
+	}
+	INIT_WORK(&vwork->work, lock_acct_bg);
+	vwork->mm = mm;
+	vwork->npage = npage;
+	schedule_work(&vwork->work);
+}
+
+/*
+ * The container descriptor supports only a single group per container.
+ * This is required by the API as the container is not supplied with
+ * an IOMMU group at initialization time.
+ */
+struct tce_container {
+	struct mutex lock;
+	struct iommu_table *tbl;
+};
+
+static void *tce_iommu_open(unsigned long arg)
+{
+	struct tce_container *container;
+
+	if (arg != VFIO_SPAPR_TCE_IOMMU) {
+		pr_err("tce_vfio: Wrong IOMMU type\n");
+		return ERR_PTR(-EINVAL);
+	}
+
+	container = kzalloc(sizeof(*container), GFP_KERNEL);
+	if (!container)
+		return ERR_PTR(-ENOMEM);
+
+	mutex_init(&container->lock);
+
+	return container;
+}
+
+static void tce_iommu_release(void *iommu_data)
+{
+	struct tce_container *container = iommu_data;
+
+	WARN_ON(container->tbl && !container->tbl->it_group);
+	if (container->tbl && container->tbl->it_group)
+		tce_iommu_detach_group(iommu_data, container->tbl->it_group);
+
+	mutex_destroy(&container->lock);
+
+	kfree(container);
+}
+
+static long tce_iommu_ioctl(void *iommu_data,
+				 unsigned int cmd, unsigned long arg)
+{
+	struct tce_container *container = iommu_data;
+	unsigned long minsz;
+	long ret;
+
+	switch (cmd) {
+	case VFIO_CHECK_EXTENSION: {
+		return (arg == VFIO_SPAPR_TCE_IOMMU) ? 1 : 0;
+	}
+	case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
+		struct vfio_iommu_spapr_tce_info info;
+		struct iommu_table *tbl = container->tbl;
+
+		if (WARN_ON(!tbl))
+			return -ENXIO;
+
+		minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
+				dma32_window_size);
+
+		if (copy_from_user(&info, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (info.argsz < minsz)
+			return -EINVAL;
+
+		info.dma32_window_start = tbl->it_offset << IOMMU_PAGE_SHIFT;
+		info.dma32_window_size = tbl->it_size << IOMMU_PAGE_SHIFT;
+		info.flags = 0;
+
+		if (copy_to_user((void __user *)arg, &info, minsz))
+			return -EFAULT;
+
+		return 0;
+	}
+	case VFIO_IOMMU_MAP_DMA: {
+		vfio_iommu_spapr_tce_dma_map param;
+		struct iommu_table *tbl = container->tbl;
+		enum dma_data_direction direction;
+		unsigned long locked, lock_limit;
+
+		if (WARN_ON(!tbl))
+			return -ENXIO;
+
+		minsz = offsetofend(vfio_iommu_spapr_tce_dma_map, size);
+
+		if (copy_from_user(&param, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (param.argsz < minsz)
+			return -EINVAL;
+
+		if ((param.flags & VFIO_DMA_MAP_FLAG_READ) &&
+				(param.flags & VFIO_DMA_MAP_FLAG_WRITE))
+			direction = DMA_BIDIRECTIONAL;
+		else if (param.flags & VFIO_DMA_MAP_FLAG_READ)
+			direction = DMA_TO_DEVICE;
+		else if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
+			direction = DMA_FROM_DEVICE;
+		else
+			return -EINVAL;
+
+		if ((param.size & ~IOMMU_PAGE_MASK) ||
+				(param.iova & ~IOMMU_PAGE_MASK) ||
+				(param.vaddr & ~IOMMU_PAGE_MASK))
+			return -EINVAL;
+
+		if ((param.iova + param.size) >
+				(tbl->it_size << IOMMU_PAGE_SHIFT))
+			return -EINVAL;
+
+		if (param.iova < (tbl->it_offset << IOMMU_PAGE_SHIFT))
+			return -EINVAL;
+
+		/* Account for locked pages */
+		locked = current->mm->locked_vm +
+			(param.size >> PAGE_SHIFT);
+		lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+		if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
+			pr_warn("RLIMIT_MEMLOCK (%ld) exceeded\n",
+					rlimit(RLIMIT_MEMLOCK));
+			return -ENOMEM;
+		}
+
+		ret = iommu_put_tces(tbl, param.iova >> IOMMU_PAGE_SHIFT,
+				param.vaddr, direction,
+				param.size >> IOMMU_PAGE_SHIFT);
+
+		if (ret > 0)
+			lock_acct(ret);
+
+		return ret;
+	}
+	case VFIO_IOMMU_UNMAP_DMA: {
+		vfio_iommu_spapr_tce_dma_unmap param;
+		struct iommu_table *tbl = container->tbl;
+
+		if (WARN_ON(!tbl))
+			return -ENXIO;
+
+		minsz = offsetofend(vfio_iommu_spapr_tce_dma_unmap, size);
+
+		if (copy_from_user(&param, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (param.argsz < minsz)
+			return -EINVAL;
+
+		if ((param.size & ~IOMMU_PAGE_MASK) ||
+				(param.iova & ~IOMMU_PAGE_MASK))
+			return -EINVAL;
+
+		if ((param.iova + param.size) >
+				(tbl->it_size << IOMMU_PAGE_SHIFT))
+			return -EINVAL;
+
+		if (param.iova < (tbl->it_offset << IOMMU_PAGE_SHIFT))
+			return -EINVAL;
+
+		ret = iommu_clear_tces(tbl, param.iova >> IOMMU_PAGE_SHIFT,
+				param.size >> IOMMU_PAGE_SHIFT);
+
+		if (ret > 0)
+			lock_acct(-ret);
+
+		return ret;
+	}
+	default:
+		pr_warn("tce_vfio: unexpected cmd %x\n", cmd);
+	}
+
+	return -ENOTTY;
+}
+
+static int tce_iommu_attach_group(void *iommu_data,
+		struct iommu_group *iommu_group)
+{
+	struct tce_container *container = iommu_data;
+	struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
+
+	BUG_ON(!tbl);
+	mutex_lock(&container->lock);
+	pr_debug("tce_vfio: Attaching group #%u to iommu %p\n",
+			iommu_group_id(iommu_group), iommu_group);
+	if (container->tbl) {
+		pr_warn("tce_vfio: Only one group per IOMMU container is allowed, existing id=%d, attaching id=%d\n",
+				iommu_group_id(container->tbl->it_group),
+				iommu_group_id(iommu_group));
+		mutex_unlock(&container->lock);
+		return -EBUSY;
+	}
+
+	container->tbl = tbl;
+	iommu_clear_tces(tbl, tbl->it_offset, tbl->it_size);
+	mutex_unlock(&container->lock);
+
+	return 0;
+}
+
+static void tce_iommu_detach_group(void *iommu_data,
+		struct iommu_group *iommu_group)
+{
+	struct tce_container *container = iommu_data;
+	struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
+
+	BUG_ON(!tbl);
+	mutex_lock(&container->lock);
+	if (tbl != container->tbl) {
+		pr_warn("tce_vfio: detaching group #%u, expected group is #%u\n",
+				iommu_group_id(iommu_group),
+				iommu_group_id(tbl->it_group));
+	} else {
+
+		pr_debug("tce_vfio: detaching group #%u from iommu %p\n",
+				iommu_group_id(iommu_group), iommu_group);
+
+		iommu_clear_tces(tbl, tbl->it_offset, tbl->it_size);
+		container->tbl = NULL;
+		/* Restore reserve for page 0 */
+		if (tbl->it_offset == 0)
+			set_bit(0, tbl->it_map);
+
+	}
+	mutex_unlock(&container->lock);
+}
+
+const struct vfio_iommu_driver_ops tce_iommu_driver_ops = {
+	.name		= "iommu-vfio-powerpc",
+	.owner		= THIS_MODULE,
+	.open		= tce_iommu_open,
+	.release	= tce_iommu_release,
+	.ioctl		= tce_iommu_ioctl,
+	.attach_group	= tce_iommu_attach_group,
+	.detach_group	= tce_iommu_detach_group,
+};
+
+static int __init tce_iommu_init(void)
+{
+	return vfio_register_iommu_driver(&tce_iommu_driver_ops);
+}
+
+static void __exit tce_iommu_cleanup(void)
+{
+	vfio_unregister_iommu_driver(&tce_iommu_driver_ops);
+}
+
+module_init(tce_iommu_init);
+module_exit(tce_iommu_cleanup);
+
+MODULE_VERSION(DRIVER_VERSION);
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR(DRIVER_AUTHOR);
+MODULE_DESCRIPTION(DRIVER_DESC);
+
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index 0a4f180..a12295c 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -99,6 +99,7 @@ extern void vfio_unregister_iommu_driver(
 /* Extensions */
 
 #define VFIO_TYPE1_IOMMU		1
+#define VFIO_SPAPR_TCE_IOMMU		2
 
 /*
  * The IOCTL interface is designed for extensibility by embedding the
@@ -442,4 +443,29 @@ struct vfio_iommu_type1_dma_unmap {
 
 #define VFIO_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14)
 
+/* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */
+
+/*
+ * The SPAPR TCE info struct provides information about the PCI bus
+ * address ranges available for DMA. These values are programmed into
+ * the hardware, so the guest has to know them.
+ *
+ * The IOMMU page size is always 4K.
+ */
+
+struct vfio_iommu_spapr_tce_info {
+	__u32 argsz;
+	__u32 flags;			/* reserved for future use */
+	__u32 dma32_window_start;	/* 32 bit window start (bytes) */
+	__u32 dma32_window_size;	/* 32 bit window size (bytes) */
+};
+
+#define VFIO_IOMMU_SPAPR_TCE_GET_INFO	_IO(VFIO_TYPE, VFIO_BASE + 12)
+
+/* Reuse type1 map/unmap structs as they are the same at the moment */
+typedef struct vfio_iommu_type1_dma_map vfio_iommu_spapr_tce_dma_map;
+typedef struct vfio_iommu_type1_dma_unmap vfio_iommu_spapr_tce_dma_unmap;
+
+/* ***************************************************************** */
+
 #endif /* VFIO_H */
-- 
1.7.10.4


* Re: [PATCH 1/2] vfio powerpc: enabled on powernv platform
  2012-12-03  2:52           ` Alexey Kardashevskiy
@ 2012-12-03 17:35             ` Alex Williamson
  -1 siblings, 0 replies; 122+ messages in thread
From: Alex Williamson @ 2012-12-03 17:35 UTC (permalink / raw)
  To: Alexey Kardashevskiy
  Cc: Benjamin Herrenschmidt, Paul Mackerras, David Gibson,
	linuxppc-dev, linux-kernel, kvm

On Mon, 2012-12-03 at 13:52 +1100, Alexey Kardashevskiy wrote:
> This patch initializes IOMMU groups based on the IOMMU
> configuration discovered during the PCI scan on POWERNV
> (POWER non virtualized) platform. The IOMMU groups are
> to be used later by VFIO driver (PCI pass through).
> 
> It also implements an API for mapping/unmapping pages for
> guest PCI drivers and providing DMA window properties.
> This API is going to be used later by QEMU-VFIO to handle
> h_put_tce hypercalls from the KVM guest.
> 
> Although this driver has been tested only on the POWERNV
> platform, it should work on any platform which supports
> TCE tables.
> 
> To enable VFIO on POWER, enable SPAPR_TCE_IOMMU config
> option and configure VFIO as required.
> 
> Cc: David Gibson <david@gibson.dropbear.id.au>
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> ---
>  arch/powerpc/include/asm/iommu.h     |    9 ++
>  arch/powerpc/kernel/iommu.c          |  186 ++++++++++++++++++++++++++++++++++
>  arch/powerpc/platforms/powernv/pci.c |  135 ++++++++++++++++++++++++
>  drivers/iommu/Kconfig                |    8 ++
>  4 files changed, 338 insertions(+)
> 
> diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
> index cbfe678..5c7087a 100644
> --- a/arch/powerpc/include/asm/iommu.h
> +++ b/arch/powerpc/include/asm/iommu.h
> @@ -76,6 +76,9 @@ struct iommu_table {
>  	struct iommu_pool large_pool;
>  	struct iommu_pool pools[IOMMU_NR_POOLS];
>  	unsigned long *it_map;       /* A simple allocation bitmap for now */
> +#ifdef CONFIG_IOMMU_API
> +	struct iommu_group *it_group;
> +#endif
>  };
>  
>  struct scatterlist;
> @@ -147,5 +150,11 @@ static inline void iommu_restore(void)
>  }
>  #endif
>  
> +extern long iommu_clear_tces(struct iommu_table *tbl, unsigned long entry,
> +		unsigned long pages);
> +extern long iommu_put_tces(struct iommu_table *tbl, unsigned long entry,
> +		uint64_t tce, enum dma_data_direction direction,
> +		unsigned long pages);
> +
>  #endif /* __KERNEL__ */
>  #endif /* _ASM_IOMMU_H */
> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
> index ff5a6ce..2738aa4 100644
> --- a/arch/powerpc/kernel/iommu.c
> +++ b/arch/powerpc/kernel/iommu.c
> @@ -44,6 +44,7 @@
>  #include <asm/kdump.h>
>  #include <asm/fadump.h>
>  #include <asm/vio.h>
> +#include <asm/tce.h>
>  
>  #define DBG(...)
>  
> @@ -856,3 +857,188 @@ void iommu_free_coherent(struct iommu_table *tbl, size_t size,
>  		free_pages((unsigned long)vaddr, get_order(size));
>  	}
>  }
> +
> +#ifdef CONFIG_IOMMU_API
> +/*
> + * SPAPR TCE API
> + */
> +
> +/*
> + * Returns the number of used IOMMU pages (4K) within
> + * the same system page (4K or 64K).
> + * bitmap_weight is not used as it does not support bigendian maps.
> + */
> +static int syspage_weight(unsigned long *map, unsigned long entry)
> +{
> +	int ret = 0, nbits = PAGE_SIZE/IOMMU_PAGE_SIZE;
> +
> +	/* Aligns TCE entry number to system page boundary */
> +	entry &= PAGE_MASK >> IOMMU_PAGE_SHIFT;
> +
> +	/* Count used 4K pages */
> +	while (nbits--)
> +		ret += (test_bit(entry++, map) == 0) ? 0 : 1;
> +
> +	return ret;
> +}
> +
> +static void tce_flush(struct iommu_table *tbl)
> +{
> +	/* Flush/invalidate TLB caches if necessary */
> +	if (ppc_md.tce_flush)
> +		ppc_md.tce_flush(tbl);
> +
> +	/* Make sure updates are seen by hardware */
> +	mb();
> +}
> +
> +/*
> + * iommu_clear_tces clears tces and returned the number of system pages
> + * which it called put_page() on
> + */
> +static long clear_tces_nolock(struct iommu_table *tbl, unsigned long entry,
> +		unsigned long pages)
> +{
> +	int i, retpages = 0;
> +	unsigned long oldtce, oldweight;
> +	struct page *page;
> +
> +	for (i = 0; i < pages; ++i) {
> +		oldtce = ppc_md.tce_get(tbl, entry + i);
> +		ppc_md.tce_free(tbl, entry + i, 1);
> +
> +		oldweight = syspage_weight(tbl->it_map, entry);
> +		__clear_bit(entry - tbl->it_offset, tbl->it_map);
> +
> +		if (!(oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)))
> +			continue;

Could this happen earlier, above syspage_weight() and __clear_bit()?
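
Something like this (untested) would avoid the bitmap bookkeeping for
entries that were never mapped:

		oldtce = ppc_md.tce_get(tbl, entry + i);
		ppc_md.tce_free(tbl, entry + i, 1);

		if (!(oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)))
			continue;

		oldweight = syspage_weight(tbl->it_map, entry);
		__clear_bit(entry - tbl->it_offset, tbl->it_map);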

> +
> +		page = pfn_to_page(oldtce >> PAGE_SHIFT);
> +
> +		WARN_ON(!page);
> +		if (!page)
> +			continue;
> +
> +		if (oldtce & TCE_PCI_WRITE)
> +			SetPageDirty(page);
> +
> +		put_page(page);
> +
> +		/* That was the last IOMMU page within the system page */
> +		if ((oldweight == 1) && !syspage_weight(tbl->it_map, entry))
> +			++retpages;

If you used __test_and_clear_bit() above I think you could avoid this
2nd call to syspage_weight.  A minor optimization though.
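
Roughly (untested, with a new bool "last" local):

		oldweight = syspage_weight(tbl->it_map, entry);
		last = __test_and_clear_bit(entry - tbl->it_offset,
				tbl->it_map) && (oldweight == 1);
		...
		/* That was the last IOMMU page within the system page */
		if (last)
			++retpages;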

> +	}
> +
> +	return retpages;
> +}
> +
> +/*
> + * iommu_clear_tces clears tces and returned the number
> + / of released system pages
> + */

Something bad happened to your comments here.
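
Presumably it was meant to read:

/*
 * iommu_clear_tces clears TCEs and returns the number
 * of released system pages
 */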

> +long iommu_clear_tces(struct iommu_table *tbl, unsigned long entry,
> +		unsigned long pages)
> +{
> +	int ret;
> +	struct iommu_pool *pool = get_pool(tbl, entry);
> +
> +	spin_lock(&(pool->lock));
> +	ret = clear_tces_nolock(tbl, entry, pages);
> +	tce_flush(tbl);
> +	spin_unlock(&(pool->lock));
> +
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(iommu_clear_tces);
> +
> +static int put_tce(struct iommu_table *tbl, unsigned long entry,
> +		uint64_t tce, enum dma_data_direction direction)
> +{
> +	int ret;
> +	struct page *page = NULL;
> +	unsigned long kva, offset, oldweight;
> +
> +	/* Map new TCE */
> +	offset = (tce & IOMMU_PAGE_MASK) - (tce & PAGE_MASK);

Maybe the compiler will figure this out, but isn't this the same as tce
& (IOMMU_PAGE_MASK & ~PAGE_MASK)?
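
For example with 64K system pages and 4K IOMMU pages
(PAGE_MASK = ~0xffffUL, IOMMU_PAGE_MASK = ~0xfffUL) and
tce = 0x12345678:

	(tce & IOMMU_PAGE_MASK) - (tce & PAGE_MASK)
		= 0x12345000 - 0x12340000 = 0x5000
	tce & (IOMMU_PAGE_MASK & ~PAGE_MASK)
		= 0x12345678 & 0xf000 = 0x5000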

> +	ret = get_user_pages_fast(tce & PAGE_MASK, 1,
> +			direction != DMA_TO_DEVICE, &page);
> +	if (ret < 1) {

Probably (ret != 1) here or else we never get to your >1 case below.

> +		printk(KERN_ERR "tce_vfio: get_user_pages_fast failed tce=%llx ioba=%lx ret=%d\n",
> +				tce, entry << IOMMU_PAGE_SHIFT, ret);

Use pr_err

> +		if (!ret || (ret > 1))

Then (ret >= 0) here.  Or return (ret >= 0) ? -EFAULT : ret

> +			ret = -EFAULT;
> +		return ret;
> +	}

You're missing the code from x86 that handles mapping mmap'd ranges.
This is intended to allow peer-to-peer DMA between devices.  Is that
intentional?
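
For reference, when get_user_pages_fast() fails on such a range, type1
falls back to walking the vma, roughly like this (simplified from
vfio_iommu_type1.c):

	down_read(&current->mm->mmap_sem);
	vma = find_vma_intersection(current->mm, vaddr, vaddr + 1);
	if (vma && (vma->vm_flags & VM_PFNMAP))
		pfn = ((vaddr - vma->vm_start) >> PAGE_SHIFT) +
				vma->vm_pgoff;
	up_read(&current->mm->mmap_sem);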

> +
> +	kva = (unsigned long) page_address(page);
> +	kva += offset;
> +
> +	/* tce_build receives a virtual address */
> +	entry += tbl->it_offset; /* Offset into real TCE table */

Here's what makes me call the entry "relative" rather than zero-based.
The iova is relative to the start of dma32_window_start, i.e. if the
window starts at bus address 512MB and I want to create a translation at
bus address 512MB, I pass in an iova of 0, right?  The above adds the
window offset.  So you've removed the dma64 window, but we really need
to define iova better.
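
I.e. with IOMMU_PAGE_SHIFT = 12 and the window starting at 512MB,
under that reading:

	tbl->it_offset = 512MB >> IOMMU_PAGE_SHIFT = 0x20000
	iova  = 0 (as passed from userspace)
	entry = iova >> IOMMU_PAGE_SHIFT = 0
	entry + tbl->it_offset = 0x20000, i.e. bus address 512MB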

> +	ret = ppc_md.tce_build(tbl, entry, 1, kva, direction, NULL);
> +
> +	/* tce_build() only returns non-zero for transient errors */
> +	if (unlikely(ret)) {
> +		printk(KERN_ERR "tce_vfio: tce_put failed on tce=%llx ioba=%lx kva=%lx ret=%d\n",
> +				tce, entry << IOMMU_PAGE_SHIFT, kva, ret);

Use pr_err

> +		put_page(page);
> +		return -EIO;
> +	}
> +
> +	/* Calculate if new system page has been locked */
> +	oldweight = syspage_weight(tbl->it_map, entry);
> +	__set_bit(entry - tbl->it_offset, tbl->it_map);
> +
> +	return (oldweight == 0) ? 1 : 0;
> +}
> +
> +/*
> + * iommu_put_tces builds tces and returned the number of actually
> + * locked system pages
> + */
> +long iommu_put_tces(struct iommu_table *tbl, unsigned long entry,
> +		uint64_t tce, enum dma_data_direction direction,
> +		unsigned long pages)
> +{
> +	int i, ret = 0, retpages = 0;
> +	struct iommu_pool *pool = get_pool(tbl, entry);
> +
> +	BUILD_BUG_ON(PAGE_SIZE < IOMMU_PAGE_SIZE);
> +	BUG_ON(direction == DMA_NONE);

This doesn't seem BUG worthy, -EINVAL?  We can't assume tce_iommu_ioctl
will always be the only caller of this function.
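
I.e. simply:

	if (direction == DMA_NONE)
		return -EINVAL;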

> +
> +	spin_lock(&(pool->lock));
> +
> +	/* Check if any is in use */
> +	for (i = 0; i < pages; ++i) {
> +		unsigned long oldtce = ppc_md.tce_get(tbl, entry + i);
> +		if ((oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)) ||
> +				test_bit(entry + i, tbl->it_map)) {
> +			WARN_ON(test_bit(entry + i, tbl->it_map));

The WARN_ON seems to confirm that these are redundant tests, does that
imply we don't trust it_map?  It would be a lot faster if we could rely
on it_map exclusively here.
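
E.g. (untested, keeping the existing it_map indexing):

	if (find_next_bit(tbl->it_map, entry + pages, entry) !=
			entry + pages) {
		spin_unlock(&(pool->lock));
		return -EBUSY;
	}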

> +			spin_unlock(&(pool->lock));
> +			return -EBUSY;
> +		}
> +	}
> +
> +	/* Put tces to the table */
> +	for (i = 0; (i < pages) && (ret >= 0); ++i, tce += IOMMU_PAGE_SIZE) {
> +		ret = put_tce(tbl, entry + i, tce, direction);
> +		if (ret == 1)
> +			++retpages;
> +	}
> +
> +	/*
> +	 * If failed, release locked pages, otherwise return the number
> +	 * of locked system pages
> +	 */
> +	if (ret < 0)
> +		clear_tces_nolock(tbl, entry, i);
> +	else
> +		ret = retpages;
> +
> +	tce_flush(tbl);
> +	spin_unlock(&(pool->lock));
> +
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(iommu_put_tces);
> +#endif /* CONFIG_IOMMU_API */
> diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
> index 05205cf..21250ef 100644
> --- a/arch/powerpc/platforms/powernv/pci.c
> +++ b/arch/powerpc/platforms/powernv/pci.c
> @@ -20,6 +20,7 @@
>  #include <linux/irq.h>
>  #include <linux/io.h>
>  #include <linux/msi.h>
> +#include <linux/iommu.h>
>  
>  #include <asm/sections.h>
>  #include <asm/io.h>
> @@ -613,3 +614,137 @@ void __init pnv_pci_init(void)
>  	ppc_md.teardown_msi_irqs = pnv_teardown_msi_irqs;
>  #endif
>  }
> +
> +#ifdef CONFIG_IOMMU_API
> +/*
> + * IOMMU groups support required by VFIO
> + */
> +static int add_device(struct device *dev)
> +{
> +	struct iommu_table *tbl;
> +	int ret = 0;
> +
> +	if (WARN_ON(dev->iommu_group)) {
> +		printk(KERN_WARNING "tce_vfio: device %s is already in iommu group %d, skipping\n",
> +				dev_name(dev),
> +				iommu_group_id(dev->iommu_group));

Use pr_warn

> +		return -EBUSY;
> +	}
> +
> +	tbl = get_iommu_table_base(dev);
> +	if (!tbl) {
> +		pr_debug("tce_vfio: skipping device %s with no tbl\n",
> +				dev_name(dev));
> +		return 0;
> +	}
> +
> +	pr_debug("tce_vfio: adding %s to iommu group %d\n",
> +			dev_name(dev), iommu_group_id(tbl->it_group));
> +
> +	ret = iommu_group_add_device(tbl->it_group, dev);
> +	if (ret < 0)
> +		printk(KERN_ERR "tce_vfio: %s has not been added, ret=%d\n",
> +				dev_name(dev), ret);

Use pr_err

> +
> +	return ret;
> +}
> +
> +static void del_device(struct device *dev)
> +{
> +	iommu_group_remove_device(dev);
> +}
> +
> +static int iommu_bus_notifier(struct notifier_block *nb,
> +			      unsigned long action, void *data)
> +{
> +	struct device *dev = data;
> +
> +	switch (action) {
> +	case BUS_NOTIFY_ADD_DEVICE:
> +		return add_device(dev);
> +	case BUS_NOTIFY_DEL_DEVICE:
> +		del_device(dev);
> +		return 0;
> +	default:
> +		return 0;
> +	}
> +}
> +
> +static struct notifier_block tce_iommu_bus_nb = {
> +	.notifier_call = iommu_bus_notifier,
> +};
> +
> +static void group_release(void *iommu_data)
> +{
> +	struct iommu_table *tbl = iommu_data;
> +	tbl->it_group = NULL;
> +}
> +
> +static int __init tce_iommu_init(void)
> +{
> +	struct pci_dev *pdev = NULL;
> +	struct iommu_table *tbl;
> +	struct iommu_group *grp;
> +
> +	/* Allocate and initialize IOMMU groups */
> +	for_each_pci_dev(pdev) {
> +		tbl = get_iommu_table_base(&pdev->dev);
> +		if (!tbl)
> +			continue;
> +
> +		/* Skip already initialized */
> +		if (tbl->it_group)
> +			continue;
> +
> +		grp = iommu_group_alloc();
> +		if (IS_ERR(grp)) {
> +			printk(KERN_INFO "tce_vfio: cannot create "
> +					"new IOMMU group, ret=%ld\n",
> +					PTR_ERR(grp));

Use pr_info

> +			return PTR_ERR(grp);
> +		}
> +		tbl->it_group = grp;
> +		iommu_group_set_iommudata(grp, tbl, group_release);
> +	}
> +
> +	bus_register_notifier(&pci_bus_type, &tce_iommu_bus_nb);
> +
> +	/* Add PCI devices to VFIO groups */
> +	for_each_pci_dev(pdev)
> +		add_device(&pdev->dev);
> +
> +	return 0;
> +}
> +
> +static void __exit tce_iommu_cleanup(void)
> +{
> +	struct pci_dev *pdev = NULL;
> +	struct iommu_table *tbl;
> +	struct iommu_group *grp = NULL;
> +
> +	bus_unregister_notifier(&pci_bus_type, &tce_iommu_bus_nb);
> +
> +	/* Delete PCI devices from VFIO groups */
> +	for_each_pci_dev(pdev)
> +		del_device(&pdev->dev);
> +
> +	/* Release VFIO groups */
> +	for_each_pci_dev(pdev) {
> +		tbl = get_iommu_table_base(&pdev->dev);
> +		if (!tbl)
> +			continue;
> +		grp = tbl->it_group;
> +
> +		/* Skip (already) uninitialized */
> +		if (!grp)
> +			continue;
> +
> +		/* Do actual release, group_release() is expected to work */
> +		iommu_group_put(grp);
> +		BUG_ON(tbl->it_group);
> +	}
> +}
> +
> +module_init(tce_iommu_init);
> +module_exit(tce_iommu_cleanup);
> +#endif /* CONFIG_IOMMU_API */
> diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
> index 9f69b56..29d11dc 100644
> --- a/drivers/iommu/Kconfig
> +++ b/drivers/iommu/Kconfig
> @@ -187,4 +187,12 @@ config EXYNOS_IOMMU_DEBUG
>  
>  	  Say N unless you need kernel log message for IOMMU debugging
>  
> +config SPAPR_TCE_IOMMU
> +	bool "sPAPR TCE IOMMU Support"
> +	depends on PPC_POWERNV
> +	select IOMMU_API
> +	help
> +	  Enables bits of IOMMU API required by VFIO. The iommu_ops is
> +	  still not implemented.
> +
>  endif # IOMMU_SUPPORT

Thanks,

Alex


* Re: [PATCH 2/2] vfio powerpc: implemented IOMMU driver for VFIO
  2012-12-03  2:52           ` Alexey Kardashevskiy
@ 2012-12-03 17:53             ` Alex Williamson
  -1 siblings, 0 replies; 122+ messages in thread
From: Alex Williamson @ 2012-12-03 17:53 UTC (permalink / raw)
  To: Alexey Kardashevskiy
  Cc: Benjamin Herrenschmidt, Paul Mackerras, David Gibson,
	linuxppc-dev, linux-kernel, kvm

On Mon, 2012-12-03 at 13:52 +1100, Alexey Kardashevskiy wrote:
> VFIO implements platform independent stuff such as
> a PCI driver, BAR access (via read/write on a file descriptor
> or direct mapping when possible) and IRQ signaling.
> 
> The platform dependent part includes IOMMU initialization
> and handling. This patch implements an IOMMU driver for VFIO
> which does mapping/unmapping pages for the guest IO and
> provides information about DMA window (required by a POWERPC
> guest).
> 
> The counterpart in QEMU is required to support this functionality.
> 
> Cc: David Gibson <david@gibson.dropbear.id.au>
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> ---
>  drivers/vfio/Kconfig                |    6 +
>  drivers/vfio/Makefile               |    1 +
>  drivers/vfio/vfio_iommu_spapr_tce.c |  350 +++++++++++++++++++++++++++++++++++
>  include/linux/vfio.h                |   26 +++
>  4 files changed, 383 insertions(+)
>  create mode 100644 drivers/vfio/vfio_iommu_spapr_tce.c
> 
> diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
> index 7cd5dec..b464687 100644
> --- a/drivers/vfio/Kconfig
> +++ b/drivers/vfio/Kconfig
> @@ -3,10 +3,16 @@ config VFIO_IOMMU_TYPE1
>  	depends on VFIO
>  	default n
>  
> +config VFIO_IOMMU_SPAPR_TCE
> +	tristate
> +	depends on VFIO && SPAPR_TCE_IOMMU
> +	default n
> +
>  menuconfig VFIO
>  	tristate "VFIO Non-Privileged userspace driver framework"
>  	depends on IOMMU_API
>  	select VFIO_IOMMU_TYPE1 if X86
> +	select VFIO_IOMMU_SPAPR_TCE if PPC_POWERNV
>  	help
>  	  VFIO provides a framework for secure userspace device drivers.
>  	  See Documentation/vfio.txt for more details.
> diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
> index 2398d4a..72bfabc 100644
> --- a/drivers/vfio/Makefile
> +++ b/drivers/vfio/Makefile
> @@ -1,3 +1,4 @@
>  obj-$(CONFIG_VFIO) += vfio.o
>  obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o
> +obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o
>  obj-$(CONFIG_VFIO_PCI) += pci/
> diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
> new file mode 100644
> index 0000000..806ad9f
> --- /dev/null
> +++ b/drivers/vfio/vfio_iommu_spapr_tce.c
> @@ -0,0 +1,350 @@
> +/*
> + * VFIO: IOMMU DMA mapping support for TCE on POWER
> + *
> + * Copyright (C) 2012 IBM Corp.  All rights reserved.
> + *     Author: Alexey Kardashevskiy <aik@ozlabs.ru>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + *
> + * Derived from original vfio_iommu_type1.c:
> + * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
> + *     Author: Alex Williamson <alex.williamson@redhat.com>
> + */
> +
> +#include <linux/module.h>
> +#include <linux/pci.h>
> +#include <linux/slab.h>
> +#include <linux/uaccess.h>
> +#include <linux/err.h>
> +#include <linux/vfio.h>
> +#include <asm/iommu.h>
> +
> +#define DRIVER_VERSION  "0.1"
> +#define DRIVER_AUTHOR   "aik@ozlabs.ru"
> +#define DRIVER_DESC     "VFIO IOMMU SPAPR TCE"
> +
> +static void tce_iommu_detach_group(void *iommu_data,
> +		struct iommu_group *iommu_group);
> +
> +/*
> + * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
> + */
> +
> +/*
> + * This code handles mapping and unmapping of user data buffers
> + * into DMA'ble space using the IOMMU
> + */
> +
> +#define NPAGE_TO_SIZE(npage)	((size_t)(npage) << PAGE_SHIFT)
> +
> +struct vwork {
> +	struct mm_struct	*mm;
> +	long			npage;
> +	struct work_struct	work;
> +};
> +
> +/* delayed decrement/increment for locked_vm */
> +static void lock_acct_bg(struct work_struct *work)
> +{
> +	struct vwork *vwork = container_of(work, struct vwork, work);
> +	struct mm_struct *mm;
> +
> +	mm = vwork->mm;
> +	down_write(&mm->mmap_sem);
> +	mm->locked_vm += vwork->npage;
> +	up_write(&mm->mmap_sem);
> +	mmput(mm);
> +	kfree(vwork);
> +}
> +
> +static void lock_acct(long npage)
> +{
> +	struct vwork *vwork;
> +	struct mm_struct *mm;
> +
> +	if (!current->mm)
> +		return; /* process exited */
> +
> +	if (down_write_trylock(&current->mm->mmap_sem)) {
> +		current->mm->locked_vm += npage;
> +		up_write(&current->mm->mmap_sem);
> +		return;
> +	}
> +
> +	/*
> +	 * Couldn't get mmap_sem lock, so must setup to update
> +	 * mm->locked_vm later. If locked_vm were atomic, we
> +	 * wouldn't need this silliness
> +	 */
> +	vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL);
> +	if (!vwork)
> +		return;
> +	mm = get_task_mm(current);
> +	if (!mm) {
> +		kfree(vwork);
> +		return;
> +	}
> +	INIT_WORK(&vwork->work, lock_acct_bg);
> +	vwork->mm = mm;
> +	vwork->npage = npage;
> +	schedule_work(&vwork->work);
> +}
> +
> +/*
> + * The container descriptor supports only a single group per container.
> + * Required by the API as the container is not supplied with the IOMMU group
> + * at the moment of initialization.
> + */
> +struct tce_container {
> +	struct mutex lock;
> +	struct iommu_table *tbl;
> +};
> +
> +static void *tce_iommu_open(unsigned long arg)
> +{
> +	struct tce_container *container;
> +
> +	if (arg != VFIO_SPAPR_TCE_IOMMU) {
> +		pr_err("tce_vfio: Wrong IOMMU type\n");
> +		return ERR_PTR(-EINVAL);
> +	}
> +
> +	container = kzalloc(sizeof(*container), GFP_KERNEL);
> +	if (!container)
> +		return ERR_PTR(-ENOMEM);
> +
> +	mutex_init(&container->lock);
> +
> +	return container;
> +}
> +
> +static void tce_iommu_release(void *iommu_data)
> +{
> +	struct tce_container *container = iommu_data;
> +
> +	WARN_ON(container->tbl && !container->tbl->it_group);
> +	if (container->tbl && container->tbl->it_group)
> +		tce_iommu_detach_group(iommu_data, container->tbl->it_group);
> +
> +	mutex_destroy(&container->lock);
> +
> +	kfree(container);
> +}
> +
> +static long tce_iommu_ioctl(void *iommu_data,
> +				 unsigned int cmd, unsigned long arg)
> +{
> +	struct tce_container *container = iommu_data;
> +	unsigned long minsz;
> +	long ret;
> +
> +	switch (cmd) {
> +	case VFIO_CHECK_EXTENSION: {
> +		return (arg == VFIO_SPAPR_TCE_IOMMU) ? 1 : 0;
> +	}

nit, {}s are unnecessary for this case
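
i.e. simply:

	case VFIO_CHECK_EXTENSION:
		return (arg == VFIO_SPAPR_TCE_IOMMU) ? 1 : 0;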

> +	case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
> +		struct vfio_iommu_spapr_tce_info info;
> +		struct iommu_table *tbl = container->tbl;
> +
> +		if (WARN_ON(!tbl))
> +			return -ENXIO;
> +
> +		minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
> +				dma32_window_size);
> +
> +		if (copy_from_user(&info, (void __user *)arg, minsz))
> +			return -EFAULT;
> +
> +		if (info.argsz < minsz)
> +			return -EINVAL;
> +
> +		info.dma32_window_start = tbl->it_offset << IOMMU_PAGE_SHIFT;
> +		info.dma32_window_size = tbl->it_size << IOMMU_PAGE_SHIFT;
> +		info.flags = 0;
> +
> +		if (copy_to_user((void __user *)arg, &info, minsz))
> +			return -EFAULT;
> +
> +		return 0;
> +	}
> +	case VFIO_IOMMU_MAP_DMA: {
> +		vfio_iommu_spapr_tce_dma_map param;
> +		struct iommu_table *tbl = container->tbl;
> +		enum dma_data_direction direction;
> +		unsigned long locked, lock_limit;
> +
> +		if (WARN_ON(!tbl))
> +			return -ENXIO;
> +
> +		minsz = offsetofend(vfio_iommu_spapr_tce_dma_map, size);
> +
> +		if (copy_from_user(&param, (void __user *)arg, minsz))
> +			return -EFAULT;
> +
> +		if (param.argsz < minsz)
> +			return -EINVAL;
> +
> +		if ((param.flags & VFIO_DMA_MAP_FLAG_READ) &&
> +				(param.flags & VFIO_DMA_MAP_FLAG_WRITE))
> +			direction = DMA_BIDIRECTIONAL;
> +		else if (param.flags & VFIO_DMA_MAP_FLAG_READ)
> +			direction = DMA_TO_DEVICE;
> +		else if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
> +			direction = DMA_FROM_DEVICE;
> +		else
> +			return -EINVAL;
> +
> +		if ((param.size & ~IOMMU_PAGE_MASK) ||
> +				(param.iova & ~IOMMU_PAGE_MASK) ||
> +				(param.vaddr & ~IOMMU_PAGE_MASK))
> +			return -EINVAL;
> +
> +		if ((param.iova + param.size) >
> +				(tbl->it_size << IOMMU_PAGE_SHIFT))
> +			return -EINVAL;
> +
> +		if (param.iova < (tbl->it_offset << IOMMU_PAGE_SHIFT))
> +			return -EINVAL;

This is confusing me, in 1/2 we had:

entry += tbl->it_offset; /* Offset into real TCE table */

Which implies to me that entry is relative to it_offset.  So I'm not
sure how iova can be less than it_offset (it's u64, so it can't be
negative).  If iova is not relative then the iova + size > it_size
doesn't make any sense.  Looks like you have a few extra ()s in these
too.  Same for unmap case below.
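
Spelling the two readings out against these checks (a sketch):

	/* Relative iova: the valid range is
	 * [0, it_size << IOMMU_PAGE_SHIFT), so the size check above is
	 * right but the it_offset comparison can never trigger.
	 */

	/* Absolute iova: the valid range is
	 * [it_offset << IOMMU_PAGE_SHIFT,
	 *  (it_offset + it_size) << IOMMU_PAGE_SHIFT), so the it_offset
	 * check makes sense but the size check needs the offset added:
	 */
	if (param.iova + param.size >
			((tbl->it_offset + tbl->it_size) << IOMMU_PAGE_SHIFT))
		return -EINVAL;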

> +
> +		/* Account for locked pages */
> +		locked = current->mm->locked_vm +
> +			(param.size >> PAGE_SHIFT);
> +		lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> +		if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
> +			pr_warn("RLIMIT_MEMLOCK (%ld) exceeded\n",
> +					rlimit(RLIMIT_MEMLOCK));
> +			return -ENOMEM;
> +		}
> +
> +		ret = iommu_put_tces(tbl, param.iova >> IOMMU_PAGE_SHIFT,
> +				param.vaddr, direction,
> +				param.size >> IOMMU_PAGE_SHIFT);
> +
> +		if (ret > 0)
> +			lock_acct(ret);
> +
> +		return ret;
> +	}
> +	case VFIO_IOMMU_UNMAP_DMA: {
> +		vfio_iommu_spapr_tce_dma_unmap param;
> +		struct iommu_table *tbl = container->tbl;
> +
> +		if (WARN_ON(!tbl))
> +			return -ENXIO;
> +
> +		minsz = offsetofend(vfio_iommu_spapr_tce_dma_unmap, size);
> +
> +		if (copy_from_user(&param, (void __user *)arg, minsz))
> +			return -EFAULT;
> +
> +		if (param.argsz < minsz)
> +			return -EINVAL;
> +
> +		if ((param.size & ~IOMMU_PAGE_MASK) ||
> +				(param.iova & ~IOMMU_PAGE_MASK))
> +			return -EINVAL;
> +
> +		if ((param.iova + param.size) >
> +				(tbl->it_size << IOMMU_PAGE_SHIFT))
> +			return -EINVAL;
> +
> +		if (param.iova < (tbl->it_offset << IOMMU_PAGE_SHIFT))
> +			return -EINVAL;
> +
> +		ret = iommu_clear_tces(tbl, param.iova >> IOMMU_PAGE_SHIFT,
> +				param.size >> IOMMU_PAGE_SHIFT);
> +
> +		if (ret > 0)
> +			lock_acct(-ret);
> +
> +		return ret;
> +	}
> +	default:
> +		pr_warn("tce_vfio: unexpected cmd %x\n", cmd);
> +	}
> +
> +	return -ENOTTY;
> +}
> +
> +static int tce_iommu_attach_group(void *iommu_data,
> +		struct iommu_group *iommu_group)
> +{
> +	struct tce_container *container = iommu_data;
> +	struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
> +
> +	BUG_ON(!tbl);
> +	mutex_lock(&container->lock);
> +	pr_debug("tce_vfio: Attaching group #%u to iommu %p\n",
> +			iommu_group_id(iommu_group), iommu_group);
> +	if (container->tbl) {
> +		pr_warn("tce_vfio: Only one group per IOMMU container is allowed, existing id=%d, attaching id=%d\n",
> +				iommu_group_id(container->tbl->it_group),
> +				iommu_group_id(iommu_group));
> +		mutex_unlock(&container->lock);
> +		return -EBUSY;
> +	}
> +
> +	container->tbl = tbl;
> +	iommu_clear_tces(tbl, tbl->it_offset, tbl->it_size);
> +	mutex_unlock(&container->lock);
> +
> +	return 0;
> +}
> +
> +static void tce_iommu_detach_group(void *iommu_data,
> +		struct iommu_group *iommu_group)
> +{
> +	struct tce_container *container = iommu_data;
> +	struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
> +
> +	BUG_ON(!tbl);
> +	mutex_lock(&container->lock);
> +	if (tbl != container->tbl) {
> +		pr_warn("tce_vfio: detaching group #%u, expected group is #%u\n",
> +				iommu_group_id(iommu_group),
> +				iommu_group_id(tbl->it_group));
> +	} else {
> +
> +		pr_debug("tce_vfio: detaching group #%u from iommu %p\n",
> +				iommu_group_id(iommu_group), iommu_group);
> +
> +		iommu_clear_tces(tbl, tbl->it_offset, tbl->it_size);
> +		container->tbl = NULL;
> +		/* Restore reserve for page 0 */
> +		if (tbl->it_offset == 0)
> +			set_bit(0, tbl->it_map);
> +
> +	}
> +	mutex_unlock(&container->lock);
> +}
> +
> +const struct vfio_iommu_driver_ops tce_iommu_driver_ops = {
> +	.name		= "iommu-vfio-powerpc",
> +	.owner		= THIS_MODULE,
> +	.open		= tce_iommu_open,
> +	.release	= tce_iommu_release,
> +	.ioctl		= tce_iommu_ioctl,
> +	.attach_group	= tce_iommu_attach_group,
> +	.detach_group	= tce_iommu_detach_group,
> +};
> +
> +static int __init tce_iommu_init(void)
> +{
> +	return vfio_register_iommu_driver(&tce_iommu_driver_ops);
> +}
> +
> +static void __exit tce_iommu_cleanup(void)
> +{
> +	vfio_unregister_iommu_driver(&tce_iommu_driver_ops);
> +}
> +
> +module_init(tce_iommu_init);
> +module_exit(tce_iommu_cleanup);
> +
> +MODULE_VERSION(DRIVER_VERSION);
> +MODULE_LICENSE("GPL v2");
> +MODULE_AUTHOR(DRIVER_AUTHOR);
> +MODULE_DESCRIPTION(DRIVER_DESC);
> +
> diff --git a/include/linux/vfio.h b/include/linux/vfio.h
> index 0a4f180..a12295c 100644
> --- a/include/linux/vfio.h
> +++ b/include/linux/vfio.h
> @@ -99,6 +99,7 @@ extern void vfio_unregister_iommu_driver(
>  /* Extensions */
>  
>  #define VFIO_TYPE1_IOMMU		1
> +#define VFIO_SPAPR_TCE_IOMMU		2
>  
>  /*
>   * The IOCTL interface is designed for extensibility by embedding the
> @@ -442,4 +443,29 @@ struct vfio_iommu_type1_dma_unmap {
>  
>  #define VFIO_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14)
>  
> +/* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */
> +
> +/*
> + * The SPAPR TCE info struct provides the information about the PCI bus
> + * address ranges available for DMA, these values are programmed into
> + * the hardware so the guest has to know that information.
> + *
> + * The IOMMU page size is always 4K.
> + */
> +
> +struct vfio_iommu_spapr_tce_info {
> +	__u32 argsz;
> +	__u32 flags;			/* reserved for future use */
> +	__u32 dma32_window_start;	/* 32 bit window start (bytes) */
> +	__u32 dma32_window_size;	/* 32 bit window size (bytes) */
> +};
> +
> +#define VFIO_IOMMU_SPAPR_TCE_GET_INFO	_IO(VFIO_TYPE, VFIO_BASE + 12)
> +
> +/* Reuse type1 map/unmap structs as they are the same at the moment */
> +typedef struct vfio_iommu_type1_dma_map vfio_iommu_spapr_tce_dma_map;
> +typedef struct vfio_iommu_type1_dma_unmap vfio_iommu_spapr_tce_dma_unmap;

One or both of us are still confused whether iova passed via these
structures are absolute or relative, so let's add some documentation for
that.  Thanks,

Alex
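
For readers following the thread, a minimal userspace sketch of driving
this API (assumptions: "container" is an open VFIO container fd with a
group already attached, linux/vfio.h carries this patch's definitions,
and whether iova is window-relative or absolute is exactly the open
question above):

	#include <stdio.h>
	#include <string.h>
	#include <sys/ioctl.h>
	#include <linux/vfio.h>

	static int map_one_page(int container, unsigned long iova,
				unsigned long vaddr)
	{
		struct vfio_iommu_spapr_tce_info info;
		struct vfio_iommu_type1_dma_map map;	/* reused by SPAPR TCE */

		memset(&info, 0, sizeof(info));
		info.argsz = sizeof(info);
		if (ioctl(container, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info))
			return -1;
		printf("window: start=0x%x size=0x%x\n",
				info.dma32_window_start, info.dma32_window_size);

		memset(&map, 0, sizeof(map));
		map.argsz = sizeof(map);
		map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
		map.iova = iova;	/* relative or absolute -- see above */
		map.vaddr = vaddr;	/* IOMMU_PAGE_SIZE aligned address */
		map.size = 4096;	/* one 4K IOMMU page */
		return ioctl(container, VFIO_IOMMU_MAP_DMA, &map);
	}

Note that with this patch VFIO_IOMMU_MAP_DMA returns the number of newly
locked system pages on success, not 0.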

> +
> +/* ***************************************************************** */
> +
>  #endif /* VFIO_H */




^ permalink raw reply	[flat|nested] 122+ messages in thread

* Re: [PATCH 1/2] vfio powerpc: enabled on powernv platform
  2012-12-03 17:35             ` Alex Williamson
@ 2012-12-04  8:12               ` Alexey Kardashevskiy
  -1 siblings, 0 replies; 122+ messages in thread
From: Alexey Kardashevskiy @ 2012-12-04  8:12 UTC (permalink / raw)
  To: Alex Williamson
  Cc: Benjamin Herrenschmidt, Paul Mackerras, David Gibson,
	linuxppc-dev, linux-kernel, kvm

On 04/12/12 04:35, Alex Williamson wrote:
> On Mon, 2012-12-03 at 13:52 +1100, Alexey Kardashevskiy wrote:
>> This patch initializes IOMMU groups based on the IOMMU
>> configuration discovered during the PCI scan on POWERNV
>> (POWER non virtualized) platform. The IOMMU groups are
>> to be used later by VFIO driver (PCI pass through).
>>
>> It also implements an API for mapping/unmapping pages for
>> guest PCI drivers and providing DMA window properties.
>> This API is going to be used later by QEMU-VFIO to handle
>> h_put_tce hypercalls from the KVM guest.
>>
>> Although this driver has been tested only on the POWERNV
>> platform, it should work on any platform which supports
>> TCE tables.
>>
>> To enable VFIO on POWER, enable SPAPR_TCE_IOMMU config
>> option and configure VFIO as required.
>>
>> Cc: David Gibson <david@gibson.dropbear.id.au>
>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>> ---
>>   arch/powerpc/include/asm/iommu.h     |    9 ++
>>   arch/powerpc/kernel/iommu.c          |  186 ++++++++++++++++++++++++++++++++++
>>   arch/powerpc/platforms/powernv/pci.c |  135 ++++++++++++++++++++++++
>>   drivers/iommu/Kconfig                |    8 ++
>>   4 files changed, 338 insertions(+)
>>
>> diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
>> index cbfe678..5c7087a 100644
>> --- a/arch/powerpc/include/asm/iommu.h
>> +++ b/arch/powerpc/include/asm/iommu.h
>> @@ -76,6 +76,9 @@ struct iommu_table {
>>   	struct iommu_pool large_pool;
>>   	struct iommu_pool pools[IOMMU_NR_POOLS];
>>   	unsigned long *it_map;       /* A simple allocation bitmap for now */
>> +#ifdef CONFIG_IOMMU_API
>> +	struct iommu_group *it_group;
>> +#endif
>>   };
>>
>>   struct scatterlist;
>> @@ -147,5 +150,11 @@ static inline void iommu_restore(void)
>>   }
>>   #endif
>>
>> +extern long iommu_clear_tces(struct iommu_table *tbl, unsigned long entry,
>> +		unsigned long pages);
>> +extern long iommu_put_tces(struct iommu_table *tbl, unsigned long entry,
>> +		uint64_t tce, enum dma_data_direction direction,
>> +		unsigned long pages);
>> +
>>   #endif /* __KERNEL__ */
>>   #endif /* _ASM_IOMMU_H */
>> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
>> index ff5a6ce..2738aa4 100644
>> --- a/arch/powerpc/kernel/iommu.c
>> +++ b/arch/powerpc/kernel/iommu.c
>> @@ -44,6 +44,7 @@
>>   #include <asm/kdump.h>
>>   #include <asm/fadump.h>
>>   #include <asm/vio.h>
>> +#include <asm/tce.h>
>>
>>   #define DBG(...)
>>
>> @@ -856,3 +857,188 @@ void iommu_free_coherent(struct iommu_table *tbl, size_t size,
>>   		free_pages((unsigned long)vaddr, get_order(size));
>>   	}
>>   }
>> +
>> +#ifdef CONFIG_IOMMU_API
>> +/*
>> + * SPAPR TCE API
>> + */
>> +
>> +/*
>> + * Returns the number of used IOMMU pages (4K) within
>> + * the same system page (4K or 64K).
>> + * bitmap_weight is not used as it does not support bigendian maps.
>> + */
>> +static int syspage_weight(unsigned long *map, unsigned long entry)
>> +{
>> +	int ret = 0, nbits = PAGE_SIZE/IOMMU_PAGE_SIZE;
>> +
>> +	/* Aligns TCE entry number to system page boundary */
>> +	entry &= PAGE_MASK >> IOMMU_PAGE_SHIFT;
>> +
>> +	/* Count used 4K pages */
>> +	while (nbits--)
>> +		ret += (test_bit(entry++, map) == 0) ? 0 : 1;
>> +
>> +	return ret;
>> +}
>> +
>> +static void tce_flush(struct iommu_table *tbl)
>> +{
>> +	/* Flush/invalidate TLB caches if necessary */
>> +	if (ppc_md.tce_flush)
>> +		ppc_md.tce_flush(tbl);
>> +
>> +	/* Make sure updates are seen by hardware */
>> +	mb();
>> +}
>> +
>> +/*
>> + * iommu_clear_tces clears tces and returned the number of system pages
>> + * which it called put_page() on
>> + */
>> +static long clear_tces_nolock(struct iommu_table *tbl, unsigned long entry,
>> +		unsigned long pages)
>> +{
>> +	int i, retpages = 0;
>> +	unsigned long oldtce, oldweight;
>> +	struct page *page;
>> +
>> +	for (i = 0; i < pages; ++i) {
>> +		oldtce = ppc_md.tce_get(tbl, entry + i);
>> +		ppc_md.tce_free(tbl, entry + i, 1);
>> +
>> +		oldweight = syspage_weight(tbl->it_map, entry);
>> +		__clear_bit(entry - tbl->it_offset, tbl->it_map);
>> +
>> +		if (!(oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)))
>> +			continue;
>
> Could this happen earlier, above syspage_weight() and __clear_bit()?


I want to clear it anyway in case it was not cleared for some reason. Added a WARN_ON.
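
Presumably (my reading of the fix, not the posted v2) along the lines of:

	oldweight = syspage_weight(tbl->it_map, entry);
	__clear_bit(entry - tbl->it_offset, tbl->it_map);

	/* the bit is cleared regardless; complain if the TCE was unused */
	if (WARN_ON(!(oldtce & (TCE_PCI_WRITE | TCE_PCI_READ))))
		continue;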


>> +
>> +		page = pfn_to_page(oldtce >> PAGE_SHIFT);
>> +
>> +		WARN_ON(!page);
>> +		if (!page)
>> +			continue;
>> +
>> +		if (oldtce & TCE_PCI_WRITE)
>> +			SetPageDirty(page);
>> +
>> +		put_page(page);
>> +
>> +		/* That was the last IOMMU page within the system page */
>> +		if ((oldweight == 1) && !syspage_weight(tbl->it_map, entry))
>> +			++retpages;
>
> If you used __test_and_clear_bit() above I think you could avoid this
> 2nd call to syspage_weight.  A minor optimization though.
>
>> +	}
>> +
>> +	return retpages;
>> +}
>> +
>> +/*
>> + * iommu_clear_tces clears tces and returned the number
>> + / of released system pages
>> + */
>
> Something bad happened to your comments here.
>
>> +long iommu_clear_tces(struct iommu_table *tbl, unsigned long entry,
>> +		unsigned long pages)
>> +{
>> +	int ret;
>> +	struct iommu_pool *pool = get_pool(tbl, entry);
>> +
>> +	spin_lock(&(pool->lock));
>> +	ret = clear_tces_nolock(tbl, entry, pages);
>> +	tce_flush(tbl);
>> +	spin_unlock(&(pool->lock));
>> +
>> +	return ret;
>> +}
>> +EXPORT_SYMBOL_GPL(iommu_clear_tces);
>> +
>> +static int put_tce(struct iommu_table *tbl, unsigned long entry,
>> +		uint64_t tce, enum dma_data_direction direction)
>> +{
>> +	int ret;
>> +	struct page *page = NULL;
>> +	unsigned long kva, offset, oldweight;
>> +
>> +	/* Map new TCE */
>> +	offset = (tce & IOMMU_PAGE_MASK) - (tce & PAGE_MASK);
>
> Maybe the compiler will figure this out, but isn't this the same as tce
> & (IOMMU_PAGE_MASK & PAGE_MASK)?


It is rather (tce & (IOMMU_PAGE_MASK & ~PAGE_MASK)), but I cannot see how 
that is simpler, and I doubt it is enough faster to notice anyhow :)
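
For the curious, the two expressions are indeed equivalent; a quick
standalone check (sketch assuming 4K IOMMU pages on a 64K-page kernel):

	#include <stdio.h>
	#include <stdint.h>

	#define IOMMU_PAGE_SHIFT 12
	#define PAGE_SHIFT       16
	#define IOMMU_PAGE_MASK  (~((1UL << IOMMU_PAGE_SHIFT) - 1))
	#define PAGE_MASK        (~((1UL << PAGE_SHIFT) - 1))

	int main(void)
	{
		uint64_t tce = 0x12345678;
		uint64_t a = (tce & IOMMU_PAGE_MASK) - (tce & PAGE_MASK);
		uint64_t b = tce & (IOMMU_PAGE_MASK & ~PAGE_MASK);

		/* both print 0x5000: the 4K offset within the 64K page */
		printf("a=0x%llx b=0x%llx\n",
				(unsigned long long) a, (unsigned long long) b);
		return 0;
	}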


>> +	ret = get_user_pages_fast(tce & PAGE_MASK, 1,
>> +			direction != DMA_TO_DEVICE, &page);
>> +	if (ret < 1) {
>
> Probably (ret != 1) here or else we never get to your >1 case below.
>
>> +		printk(KERN_ERR "tce_vfio: get_user_pages_fast failed tce=%llx ioba=%lx ret=%d\n",
>> +				tce, entry << IOMMU_PAGE_SHIFT, ret);
>
> Use pr_err
 >
>> +		if (!ret || (ret > 1))
>
> Then (ret >= 0) here.  Or return (ret >= 0) ? -EFAULT : ret
>
>> +			ret = -EFAULT;
>> +		return ret;
>> +	}
>
> You're missing the code from x86 that handles mapping mmap'd ranges.
> This is intended to allow peer-to-peer DMA between devices.  Is that
> intentional?

I am not following you here. What code exactly are you talking about? We do 
not track ranges at all and I do not see how that helps with p2p DMA.


>> +
>> +	kva = (unsigned long) page_address(page);
>> +	kva += offset;
>> +
>> +	/* tce_build receives a virtual address */
>> +	entry += tbl->it_offset; /* Offset into real TCE table */
>
> Here's what makes me call the entry "relative" rather than zero-based.

This is actually a bug; I overlooked it and have removed it now. Thanks for 
being so picky :)


> The iova is relative to the start of dma32_window_start, ie. if the
> window starts at bus address 512MB and I want to create a translation at
> bus address 512MB, I pass in an iova of 0, right?  The above adds the
> window offset.  So you've removed dma64 window, but we really need to
> define iova better.




>> +	ret = ppc_md.tce_build(tbl, entry, 1, kva, direction, NULL);
>> +
>> +	/* tce_build() only returns non-zero for transient errors */
>> +	if (unlikely(ret)) {
>> +		printk(KERN_ERR "tce_vfio: tce_put failed on tce=%llx ioba=%lx kva=%lx ret=%d\n",
>> +				tce, entry << IOMMU_PAGE_SHIFT, kva, ret);
>
> Use pr_err
>
>> +		put_page(page);
>> +		return -EIO;
>> +	}
>> +
>> +	/* Calculate if new system page has been locked */
>> +	oldweight = syspage_weight(tbl->it_map, entry);
>> +	__set_bit(entry - tbl->it_offset, tbl->it_map);
>> +
>> +	return (oldweight == 0) ? 1 : 0;
>> +}
>> +
>> +/*
>> + * iommu_put_tces builds tces and returned the number of actually
>> + * locked system pages
>> + */
>> +long iommu_put_tces(struct iommu_table *tbl, unsigned long entry,
>> +		uint64_t tce, enum dma_data_direction direction,
>> +		unsigned long pages)
>> +{
>> +	int i, ret = 0, retpages = 0;
>> +	struct iommu_pool *pool = get_pool(tbl, entry);
>> +
>> +	BUILD_BUG_ON(PAGE_SIZE < IOMMU_PAGE_SIZE);
>> +	BUG_ON(direction == DMA_NONE);
>
> This doesn't seem BUG worthy, -EINVAL?  We can't assume tce_iommu_ioctl
> will always be the only caller of this function.


This is what the other functions in this file do.


>> +
>> +	spin_lock(&(pool->lock));
>> +
>> +	/* Check if any is in use */
>> +	for (i = 0; i < pages; ++i) {
>> +		unsigned long oldtce = ppc_md.tce_get(tbl, entry + i);
>> +		if ((oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)) ||
>> +				test_bit(entry + i, tbl->it_map)) {
>> +			WARN_ON(test_bit(entry + i, tbl->it_map));
>
> The WARN_ON seems to confirm that these are redundant tests, does that
> imply we don't trust it_map?  It would be a lot faster if we could rely
> on it_map exclusively here.


To me it is a pretty minor optimization. I'm testing it now to make sure I 
am not missing any bits.



>> +			spin_unlock(&(pool->lock));
>> +			return -EBUSY;
>> +		}
>> +	}
>> +
>> +	/* Put tces to the table */
>> +	for (i = 0; (i < pages) && (ret >= 0); ++i, tce += IOMMU_PAGE_SIZE) {
>> +		ret = put_tce(tbl, entry + i, tce, direction);
>> +		if (ret == 1)
>> +			++retpages;
>> +	}
>> +
>> +	/*
>> +	 * If failed, release locked pages, otherwise return the number
>> +	 * of locked system pages
>> +	 */
>> +	if (ret < 0)
>> +		clear_tces_nolock(tbl, entry, i);
>> +	else
>> +		ret = retpages;
>> +
>> +	tce_flush(tbl);
>> +	spin_unlock(&(pool->lock));
>> +
>> +	return ret;
>> +}
>> +EXPORT_SYMBOL_GPL(iommu_put_tces);
>> +#endif /* CONFIG_IOMMU_API */
>> diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
>> index 05205cf..21250ef 100644
>> --- a/arch/powerpc/platforms/powernv/pci.c
>> +++ b/arch/powerpc/platforms/powernv/pci.c
>> @@ -20,6 +20,7 @@
>>   #include <linux/irq.h>
>>   #include <linux/io.h>
>>   #include <linux/msi.h>
>> +#include <linux/iommu.h>
>>
>>   #include <asm/sections.h>
>>   #include <asm/io.h>
>> @@ -613,3 +614,137 @@ void __init pnv_pci_init(void)
>>   	ppc_md.teardown_msi_irqs = pnv_teardown_msi_irqs;
>>   #endif
>>   }
>> +
>> +#ifdef CONFIG_IOMMU_API
>> +/*
>> + * IOMMU groups support required by VFIO
>> + */
>> +static int add_device(struct device *dev)
>> +{
>> +	struct iommu_table *tbl;
>> +	int ret = 0;
>> +
>> +	if (WARN_ON(dev->iommu_group)) {
>> +		printk(KERN_WARNING "tce_vfio: device %s is already in iommu group %d, skipping\n",
>> +				dev_name(dev),
>> +				iommu_group_id(dev->iommu_group));
>
> Use pr_warn
>
>> +		return -EBUSY;
>> +	}
>> +
>> +	tbl = get_iommu_table_base(dev);
>> +	if (!tbl) {
>> +		pr_debug("tce_vfio: skipping device %s with no tbl\n",
>> +				dev_name(dev));
>> +		return 0;
>> +	}
>> +
>> +	pr_debug("tce_vfio: adding %s to iommu group %d\n",
>> +			dev_name(dev), iommu_group_id(tbl->it_group));
>> +
>> +	ret = iommu_group_add_device(tbl->it_group, dev);
>> +	if (ret < 0)
>> +		printk(KERN_ERR "tce_vfio: %s has not been added, ret=%d\n",
>> +				dev_name(dev), ret);
>
> Use pr_err
>
>> +
>> +	return ret;
>> +}
>> +
>> +static void del_device(struct device *dev)
>> +{
>> +	iommu_group_remove_device(dev);
>> +}
>> +
>> +static int iommu_bus_notifier(struct notifier_block *nb,
>> +			      unsigned long action, void *data)
>> +{
>> +	struct device *dev = data;
>> +
>> +	switch (action) {
>> +	case BUS_NOTIFY_ADD_DEVICE:
>> +		return add_device(dev);
>> +	case BUS_NOTIFY_DEL_DEVICE:
>> +		del_device(dev);
>> +		return 0;
>> +	default:
>> +		return 0;
>> +	}
>> +}
>> +
>> +static struct notifier_block tce_iommu_bus_nb = {
>> +	.notifier_call = iommu_bus_notifier,
>> +};
>> +
>> +static void group_release(void *iommu_data)
>> +{
>> +	struct iommu_table *tbl = iommu_data;
>> +	tbl->it_group = NULL;
>> +}
>> +
>> +static int __init tce_iommu_init(void)
>> +{
>> +	struct pci_dev *pdev = NULL;
>> +	struct iommu_table *tbl;
>> +	struct iommu_group *grp;
>> +
>> +	/* Allocate and initialize IOMMU groups */
>> +	for_each_pci_dev(pdev) {
>> +		tbl = get_iommu_table_base(&pdev->dev);
>> +		if (!tbl)
>> +			continue;
>> +
>> +		/* Skip already initialized */
>> +		if (tbl->it_group)
>> +			continue;
>> +
>> +		grp = iommu_group_alloc();
>> +		if (IS_ERR(grp)) {
>> +			printk(KERN_INFO "tce_vfio: cannot create "
>> +					"new IOMMU group, ret=%ld\n",
>> +					PTR_ERR(grp));
>
> Use pr_info
>
>> +			return PTR_ERR(grp);
>> +		}
>> +		tbl->it_group = grp;
>> +		iommu_group_set_iommudata(grp, tbl, group_release);
>> +	}
>> +
>> +	bus_register_notifier(&pci_bus_type, &tce_iommu_bus_nb);
>> +
>> +	/* Add PCI devices to VFIO groups */
>> +	for_each_pci_dev(pdev)
>> +		add_device(&pdev->dev);
>> +
>> +	return 0;
>> +}
>> +
>> +static void __exit tce_iommu_cleanup(void)
>> +{
>> +	struct pci_dev *pdev = NULL;
>> +	struct iommu_table *tbl;
>> +	struct iommu_group *grp = NULL;
>> +
>> +	bus_unregister_notifier(&pci_bus_type, &tce_iommu_bus_nb);
>> +
>> +	/* Delete PCI devices from VFIO groups */
>> +	for_each_pci_dev(pdev)
>> +		del_device(&pdev->dev);
>> +
>> +	/* Release VFIO groups */
>> +	for_each_pci_dev(pdev) {
>> +		tbl = get_iommu_table_base(&pdev->dev);
>> +		if (!tbl)
>> +			continue;
>> +		grp = tbl->it_group;
>> +
>> +		/* Skip (already) uninitialized */
>> +		if (!grp)
>> +			continue;
>> +
>> +		/* Do actual release, group_release() is expected to work */
>> +		iommu_group_put(grp);
>> +		BUG_ON(tbl->it_group);
>> +	}
>> +}
>> +
>> +module_init(tce_iommu_init);
>> +module_exit(tce_iommu_cleanup);
>> +#endif /* CONFIG_IOMMU_API */
>> diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
>> index 9f69b56..29d11dc 100644
>> --- a/drivers/iommu/Kconfig
>> +++ b/drivers/iommu/Kconfig
>> @@ -187,4 +187,12 @@ config EXYNOS_IOMMU_DEBUG
>>
>>   	  Say N unless you need kernel log message for IOMMU debugging
>>
>> +config SPAPR_TCE_IOMMU
>> +	bool "sPAPR TCE IOMMU Support"
>> +	depends on PPC_POWERNV
>> +	select IOMMU_API
>> +	help
>> +	  Enables bits of IOMMU API required by VFIO. The iommu_ops is
>> +	  still not implemented.
>> +
>>   endif # IOMMU_SUPPORT
>
> Thanks,
>
> Alex
>


-- 
Alexey

^ permalink raw reply	[flat|nested] 122+ messages in thread

>> +
>> +	/* Release VFIO groups */
>> +	for_each_pci_dev(pdev) {
>> +		tbl = get_iommu_table_base(&pdev->dev);
>> +		if (!tbl)
>> +			continue;
>> +		grp = tbl->it_group;
>> +
>> +		/* Skip (already) uninitialized */
>> +		if (!grp)
>> +			continue;
>> +
>> +		/* Do actual release, group_release() is expected to work */
>> +		iommu_group_put(grp);
>> +		BUG_ON(tbl->it_group);
>> +	}
>> +}
>> +
>> +module_init(tce_iommu_init);
>> +module_exit(tce_iommu_cleanup);
>> +#endif /* CONFIG_IOMMU_API */
>> diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
>> index 9f69b56..29d11dc 100644
>> --- a/drivers/iommu/Kconfig
>> +++ b/drivers/iommu/Kconfig
>> @@ -187,4 +187,12 @@ config EXYNOS_IOMMU_DEBUG
>>
>>   	  Say N unless you need kernel log message for IOMMU debugging
>>
>> +config SPAPR_TCE_IOMMU
>> +	bool "sPAPR TCE IOMMU Support"
>> +	depends on PPC_POWERNV
>> +	select IOMMU_API
>> +	help
>> +	  Enables bits of IOMMU API required by VFIO. The iommu_ops is
>> +	  still not implemented.
>> +
>>   endif # IOMMU_SUPPORT
>
> Thanks,
>
> Alex
>


-- 
Alexey

^ permalink raw reply	[flat|nested] 122+ messages in thread

* Re: [PATCH 1/2] vfio powerpc: enabled on powernv platform
  2012-12-04  8:12               ` Alexey Kardashevskiy
@ 2012-12-04 15:51                 ` Alex Williamson
  -1 siblings, 0 replies; 122+ messages in thread
From: Alex Williamson @ 2012-12-04 15:51 UTC (permalink / raw)
  To: Alexey Kardashevskiy
  Cc: Benjamin Herrenschmidt, Paul Mackerras, David Gibson,
	linuxppc-dev, linux-kernel, kvm

On Tue, 2012-12-04 at 19:12 +1100, Alexey Kardashevskiy wrote:
> On 04/12/12 04:35, Alex Williamson wrote:
> > On Mon, 2012-12-03 at 13:52 +1100, Alexey Kardashevskiy wrote:
> >> This patch initializes IOMMU groups based on the IOMMU
> >> configuration discovered during the PCI scan on POWERNV
> >> (POWER non virtualized) platform. The IOMMU groups are
> >> to be used later by VFIO driver (PCI pass through).
> >>
> >> It also implements an API for mapping/unmapping pages for
> >> guest PCI drivers and providing DMA window properties.
> >> This API is going to be used later by QEMU-VFIO to handle
> >> h_put_tce hypercalls from the KVM guest.
> >>
> >> Although this driver has been tested only on the POWERNV
> >> platform, it should work on any platform which supports
> >> TCE tables.
> >>
> >> To enable VFIO on POWER, enable SPAPR_TCE_IOMMU config
> >> option and configure VFIO as required.
> >>
> >> Cc: David Gibson <david@gibson.dropbear.id.au>
> >> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> >> ---
> >>   arch/powerpc/include/asm/iommu.h     |    9 ++
> >>   arch/powerpc/kernel/iommu.c          |  186 ++++++++++++++++++++++++++++++++++
> >>   arch/powerpc/platforms/powernv/pci.c |  135 ++++++++++++++++++++++++
> >>   drivers/iommu/Kconfig                |    8 ++
> >>   4 files changed, 338 insertions(+)
> >>
> >> diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
> >> index cbfe678..5c7087a 100644
> >> --- a/arch/powerpc/include/asm/iommu.h
> >> +++ b/arch/powerpc/include/asm/iommu.h
> >> @@ -76,6 +76,9 @@ struct iommu_table {
> >>   	struct iommu_pool large_pool;
> >>   	struct iommu_pool pools[IOMMU_NR_POOLS];
> >>   	unsigned long *it_map;       /* A simple allocation bitmap for now */
> >> +#ifdef CONFIG_IOMMU_API
> >> +	struct iommu_group *it_group;
> >> +#endif
> >>   };
> >>
> >>   struct scatterlist;
> >> @@ -147,5 +150,11 @@ static inline void iommu_restore(void)
> >>   }
> >>   #endif
> >>
> >> +extern long iommu_clear_tces(struct iommu_table *tbl, unsigned long entry,
> >> +		unsigned long pages);
> >> +extern long iommu_put_tces(struct iommu_table *tbl, unsigned long entry,
> >> +		uint64_t tce, enum dma_data_direction direction,
> >> +		unsigned long pages);
> >> +
> >>   #endif /* __KERNEL__ */
> >>   #endif /* _ASM_IOMMU_H */
> >> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
> >> index ff5a6ce..2738aa4 100644
> >> --- a/arch/powerpc/kernel/iommu.c
> >> +++ b/arch/powerpc/kernel/iommu.c
> >> @@ -44,6 +44,7 @@
> >>   #include <asm/kdump.h>
> >>   #include <asm/fadump.h>
> >>   #include <asm/vio.h>
> >> +#include <asm/tce.h>
> >>
> >>   #define DBG(...)
> >>
> >> @@ -856,3 +857,188 @@ void iommu_free_coherent(struct iommu_table *tbl, size_t size,
> >>   		free_pages((unsigned long)vaddr, get_order(size));
> >>   	}
> >>   }
> >> +
> >> +#ifdef CONFIG_IOMMU_API
> >> +/*
> >> + * SPAPR TCE API
> >> + */
> >> +
> >> +/*
> >> + * Returns the number of used IOMMU pages (4K) within
> >> + * the same system page (4K or 64K).
> >> + * bitmap_weight is not used as it does not support bigendian maps.
> >> + */
> >> +static int syspage_weight(unsigned long *map, unsigned long entry)
> >> +{
> >> +	int ret = 0, nbits = PAGE_SIZE/IOMMU_PAGE_SIZE;
> >> +
> >> +	/* Aligns TCE entry number to system page boundary */
> >> +	entry &= PAGE_MASK >> IOMMU_PAGE_SHIFT;
> >> +
> >> +	/* Count used 4K pages */
> >> +	while (nbits--)
> >> +		ret += (test_bit(entry++, map) == 0) ? 0 : 1;
> >> +
> >> +	return ret;
> >> +}
> >> +
> >> +static void tce_flush(struct iommu_table *tbl)
> >> +{
> >> +	/* Flush/invalidate TLB caches if necessary */
> >> +	if (ppc_md.tce_flush)
> >> +		ppc_md.tce_flush(tbl);
> >> +
> >> +	/* Make sure updates are seen by hardware */
> >> +	mb();
> >> +}
> >> +
> >> +/*
> >> + * iommu_clear_tces clears tces and returned the number of system pages
> >> + * which it called put_page() on
> >> + */
> >> +static long clear_tces_nolock(struct iommu_table *tbl, unsigned long entry,
> >> +		unsigned long pages)
> >> +{
> >> +	int i, retpages = 0;
> >> +	unsigned long oldtce, oldweight;
> >> +	struct page *page;
> >> +
> >> +	for (i = 0; i < pages; ++i) {
> >> +		oldtce = ppc_md.tce_get(tbl, entry + i);
> >> +		ppc_md.tce_free(tbl, entry + i, 1);
> >> +
> >> +		oldweight = syspage_weight(tbl->it_map, entry);
> >> +		__clear_bit(entry - tbl->it_offset, tbl->it_map);
> >> +
> >> +		if (!(oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)))
> >> +			continue;
> >
> > Could this happen earlier, above syspage_weight() and __clear_bit()?
> 
> 
> Want to clear it anyway if it is not cleared by some reason. Added WARN_ON.

The map shouldn't be set unless read/write is enabled, right?  It seems
like we don't have a lot of trust in this bitmap.

> >> +
> >> +		page = pfn_to_page(oldtce >> PAGE_SHIFT);
> >> +
> >> +		WARN_ON(!page);
> >> +		if (!page)
> >> +			continue;
> >> +
> >> +		if (oldtce & TCE_PCI_WRITE)
> >> +			SetPageDirty(page);
> >> +
> >> +		put_page(page);
> >> +
> >> +		/* That was the last IOMMU page within the system page */
> >> +		if ((oldweight == 1) && !syspage_weight(tbl->it_map, entry))
> >> +			++retpages;
> >
> > If you used __test_and_clear_bit() above I think you could avoid this
> > 2nd call to syspage_weight.  A minor optimization though.
> >
> >> +	}
> >> +
> >> +	return retpages;
> >> +}
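
Following the __test_and_clear_bit() suggestion, the accounting would
need only one weight calculation, roughly (a sketch that ignores the
read/write-flag check):

	if (__test_and_clear_bit(entry - tbl->it_offset, tbl->it_map) &&
			!syspage_weight(tbl->it_map, entry))
		++retpages; /* last IOMMU page within the system page */
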
> >> +
> >> +/*
> >> + * iommu_clear_tces clears tces and returned the number
> >> + / of released system pages
> >> + */
> >
> > Something bad happened to your comments here.
> >
> >> +long iommu_clear_tces(struct iommu_table *tbl, unsigned long entry,
> >> +		unsigned long pages)
> >> +{
> >> +	int ret;
> >> +	struct iommu_pool *pool = get_pool(tbl, entry);
> >> +
> >> +	spin_lock(&(pool->lock));
> >> +	ret = clear_tces_nolock(tbl, entry, pages);
> >> +	tce_flush(tbl);
> >> +	spin_unlock(&(pool->lock));
> >> +
> >> +	return ret;
> >> +}
> >> +EXPORT_SYMBOL_GPL(iommu_clear_tces);
> >> +
> >> +static int put_tce(struct iommu_table *tbl, unsigned long entry,
> >> +		uint64_t tce, enum dma_data_direction direction)
> >> +{
> >> +	int ret;
> >> +	struct page *page = NULL;
> >> +	unsigned long kva, offset, oldweight;
> >> +
> >> +	/* Map new TCE */
> >> +	offset = (tce & IOMMU_PAGE_MASK) - (tce & PAGE_MASK);
> >
> > Maybe the compiler will figure this out, but isn't this the same as tce
> > & (IOMMU_PAGE_MASK & PAGE_MASK)?
> 
> 
it is rather (tce & (IOMMU_PAGE_MASK & ~PAGE_MASK)), but I cannot see how it
is simpler, and I doubt that it is enough faster to notice anyhow :)

Yes, ~PAGE_MASK.  IMHO, it's more intuitive.
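
A quick check of the equivalence, with 64K system pages
(PAGE_MASK == ~0xffff) and 4K IOMMU pages (IOMMU_PAGE_MASK == ~0xfff),
for example tce = 0x12345678:

	(tce & IOMMU_PAGE_MASK) - (tce & PAGE_MASK)
		= 0x12345000 - 0x12340000 = 0x5000
	tce & (IOMMU_PAGE_MASK & ~PAGE_MASK)
		= 0x12345678 & 0xf000 = 0x5000

Both forms extract the 4K-aligned offset of the IOMMU page within its
64K system page.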

> >> +	ret = get_user_pages_fast(tce & PAGE_MASK, 1,
> >> +			direction != DMA_TO_DEVICE, &page);
> >> +	if (ret < 1) {
> >
> > Probably (ret != 1) here or else we never get to your >1 case below.
> >
> >> +		printk(KERN_ERR "tce_vfio: get_user_pages_fast failed tce=%llx ioba=%lx ret=%d\n",
> >> +				tce, entry << IOMMU_PAGE_SHIFT, ret);
> >
> > Use pr_err
>  >
> >> +		if (!ret || (ret > 1))
> >
> > Then (ret >= 0) here.  Or return (ret >= 0) ? -EFAULT : ret
> >
> >> +			ret = -EFAULT;
> >> +		return ret;
> >> +	}
> >
> > You're missing the code from x86 that handles mapping mmap'd ranges.
> > This is intended to allow peer-to-peer DMA between devices.  Is that
> > intentional?
> 
> I am not following you here. What code exactly are you talking about? We do
> not track ranges at all, and I do not see how it helps with p2p DMA.

The code in type1 that checks PFNMAP and reserved pages; I only sometimes
remember how it works ;)  The idea there is to allow p2p DMA
by inserting iommu translations for non-page backed memory, ie. the
mmap'd BARs of other devices.  It may be that the POWER topology is not
amenable to this since you have a whole PCI bus in your group and
intra-group p2p isn't iommu translated.  I'm not sure how useful it is
even on x86, but KVM device assignment does it, so I added it to type1.
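
For reference, the type1 fallback being described looks roughly like
this (written from memory, so treat the details as approximate):

	/* vaddr is not backed by a struct page (e.g. it is an mmap'd
	 * BAR of another device); with mmap_sem held, derive the pfn
	 * from the VM_PFNMAP vma instead */
	struct vm_area_struct *vma = find_vma(current->mm, vaddr);

	if (vma && (vma->vm_flags & VM_PFNMAP)) {
		pfn = ((vaddr - vma->vm_start) >> PAGE_SHIFT) +
			vma->vm_pgoff;
		/* such reserved pfns are neither pinned nor accounted */
	}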

> >> +
> >> +	kva = (unsigned long) page_address(page);
> >> +	kva += offset;
> >> +
> >> +	/* tce_build receives a virtual address */
> >> +	entry += tbl->it_offset; /* Offset into real TCE table */
> >
> > Here's what makes me call the entry "relative" rather than zero-based.
> 
> This is actually a bug; I overlooked it and have removed it now. Thanks for
> being so picky :)

Ah, ok.  I'll look for it on the next round and make sure I understand
it.

> > The iova is relative to the start of dma32_window_start, ie. if the
> > window starts at bus address 512MB and I want to create a translation at
> > bus address 512MB, I pass in an iova of 0, right?  The above adds the
> > window offset.  So you've removed dma64 window, but we really need to
> > define iova better.
> 
> 
> 
> 
> >> +	ret = ppc_md.tce_build(tbl, entry, 1, kva, direction, NULL);
> >> +
> >> +	/* tce_build() only returns non-zero for transient errors */
> >> +	if (unlikely(ret)) {
> >> +		printk(KERN_ERR "tce_vfio: tce_put failed on tce=%llx ioba=%lx kva=%lx ret=%d\n",
> >> +				tce, entry << IOMMU_PAGE_SHIFT, kva, ret);
> >
> > Use pr_err
> >
> >> +		put_page(page);
> >> +		return -EIO;
> >> +	}
> >> +
> >> +	/* Calculate if new system page has been locked */
> >> +	oldweight = syspage_weight(tbl->it_map, entry);
> >> +	__set_bit(entry - tbl->it_offset, tbl->it_map);
> >> +
> >> +	return (oldweight == 0) ? 1 : 0;
> >> +}
> >> +
> >> +/*
> >> + * iommu_put_tces builds tces and returned the number of actually
> >> + * locked system pages
> >> + */
> >> +long iommu_put_tces(struct iommu_table *tbl, unsigned long entry,
> >> +		uint64_t tce, enum dma_data_direction direction,
> >> +		unsigned long pages)
> >> +{
> >> +	int i, ret = 0, retpages = 0;
> >> +	struct iommu_pool *pool = get_pool(tbl, entry);
> >> +
> >> +	BUILD_BUG_ON(PAGE_SIZE < IOMMU_PAGE_SIZE);
> >> +	BUG_ON(direction == DMA_NONE);
> >
> > This doesn't seem BUG worthy, -EINVAL?  We can't assume tce_iommu_ioctl
> > will always be the only caller of this function.
> 
> 
> This is what the other functions in this file do.

Blech, ok.

> >> +
> >> +	spin_lock(&(pool->lock));
> >> +
> >> +	/* Check if any is in use */
> >> +	for (i = 0; i < pages; ++i) {
> >> +		unsigned long oldtce = ppc_md.tce_get(tbl, entry + i);
> >> +		if ((oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)) ||
> >> +				test_bit(entry + i, tbl->it_map)) {
> >> +			WARN_ON(test_bit(entry + i, tbl->it_map));
> >
> > The WARN_ON seems to confirm that these are redundant tests, does that
> > imply we don't trust it_map?  It would be a lot faster if we could rely
> > on it_map exclusively here.
> 
> 
> To me, this is a pretty minor optimization. I'm testing it now to make sure
> I do not miss any bits.

It would be a lot more reassuring if we didn't need it ;)  Thanks,

Alex

> >> +			spin_unlock(&(pool->lock));
> >> +			return -EBUSY;
> >> +		}
> >> +	}
> >> +
> >> +	/* Put tces to the table */
> >> +	for (i = 0; (i < pages) && (ret >= 0); ++i, tce += IOMMU_PAGE_SIZE) {
> >> +		ret = put_tce(tbl, entry + i, tce, direction);
> >> +		if (ret == 1)
> >> +			++retpages;
> >> +	}
> >> +
> >> +	/*
> >> +	 * If failed, release locked pages, otherwise return the number
> >> +	 * of locked system pages
> >> +	 */
> >> +	if (ret < 0)
> >> +		clear_tces_nolock(tbl, entry, i);
> >> +	else
> >> +		ret = retpages;
> >> +
> >> +	tce_flush(tbl);
> >> +	spin_unlock(&(pool->lock));
> >> +
> >> +	return ret;
> >> +}
> >> +EXPORT_SYMBOL_GPL(iommu_put_tces);
> >> +#endif /* CONFIG_IOMMU_API */
> >> diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
> >> index 05205cf..21250ef 100644
> >> --- a/arch/powerpc/platforms/powernv/pci.c
> >> +++ b/arch/powerpc/platforms/powernv/pci.c
> >> @@ -20,6 +20,7 @@
> >>   #include <linux/irq.h>
> >>   #include <linux/io.h>
> >>   #include <linux/msi.h>
> >> +#include <linux/iommu.h>
> >>
> >>   #include <asm/sections.h>
> >>   #include <asm/io.h>
> >> @@ -613,3 +614,137 @@ void __init pnv_pci_init(void)
> >>   	ppc_md.teardown_msi_irqs = pnv_teardown_msi_irqs;
> >>   #endif
> >>   }
> >> +
> >> +#ifdef CONFIG_IOMMU_API
> >> +/*
> >> + * IOMMU groups support required by VFIO
> >> + */
> >> +static int add_device(struct device *dev)
> >> +{
> >> +	struct iommu_table *tbl;
> >> +	int ret = 0;
> >> +
> >> +	if (WARN_ON(dev->iommu_group)) {
> >> +		printk(KERN_WARNING "tce_vfio: device %s is already in iommu group %d, skipping\n",
> >> +				dev_name(dev),
> >> +				iommu_group_id(dev->iommu_group));
> >
> > Use pr_warn
> >
> >> +		return -EBUSY;
> >> +	}
> >> +
> >> +	tbl = get_iommu_table_base(dev);
> >> +	if (!tbl) {
> >> +		pr_debug("tce_vfio: skipping device %s with no tbl\n",
> >> +				dev_name(dev));
> >> +		return 0;
> >> +	}
> >> +
> >> +	pr_debug("tce_vfio: adding %s to iommu group %d\n",
> >> +			dev_name(dev), iommu_group_id(tbl->it_group));
> >> +
> >> +	ret = iommu_group_add_device(tbl->it_group, dev);
> >> +	if (ret < 0)
> >> +		printk(KERN_ERR "tce_vfio: %s has not been added, ret=%d\n",
> >> +				dev_name(dev), ret);
> >
> > Use pr_err
> >
> >> +
> >> +	return ret;
> >> +}
> >> +
> >> +static void del_device(struct device *dev)
> >> +{
> >> +	iommu_group_remove_device(dev);
> >> +}
> >> +
> >> +static int iommu_bus_notifier(struct notifier_block *nb,
> >> +			      unsigned long action, void *data)
> >> +{
> >> +	struct device *dev = data;
> >> +
> >> +	switch (action) {
> >> +	case BUS_NOTIFY_ADD_DEVICE:
> >> +		return add_device(dev);
> >> +	case BUS_NOTIFY_DEL_DEVICE:
> >> +		del_device(dev);
> >> +		return 0;
> >> +	default:
> >> +		return 0;
> >> +	}
> >> +}
> >> +
> >> +static struct notifier_block tce_iommu_bus_nb = {
> >> +	.notifier_call = iommu_bus_notifier,
> >> +};
> >> +
> >> +static void group_release(void *iommu_data)
> >> +{
> >> +	struct iommu_table *tbl = iommu_data;
> >> +	tbl->it_group = NULL;
> >> +}
> >> +
> >> +static int __init tce_iommu_init(void)
> >> +{
> >> +	struct pci_dev *pdev = NULL;
> >> +	struct iommu_table *tbl;
> >> +	struct iommu_group *grp;
> >> +
> >> +	/* Allocate and initialize IOMMU groups */
> >> +	for_each_pci_dev(pdev) {
> >> +		tbl = get_iommu_table_base(&pdev->dev);
> >> +		if (!tbl)
> >> +			continue;
> >> +
> >> +		/* Skip already initialized */
> >> +		if (tbl->it_group)
> >> +			continue;
> >> +
> >> +		grp = iommu_group_alloc();
> >> +		if (IS_ERR(grp)) {
> >> +			printk(KERN_INFO "tce_vfio: cannot create "
> >> +					"new IOMMU group, ret=%ld\n",
> >> +					PTR_ERR(grp));
> >
> > Use pr_info
> >
> >> +			return PTR_ERR(grp);
> >> +		}
> >> +		tbl->it_group = grp;
> >> +		iommu_group_set_iommudata(grp, tbl, group_release);
> >> +	}
> >> +
> >> +	bus_register_notifier(&pci_bus_type, &tce_iommu_bus_nb);
> >> +
> >> +	/* Add PCI devices to VFIO groups */
> >> +	for_each_pci_dev(pdev)
> >> +		add_device(&pdev->dev);
> >> +
> >> +	return 0;
> >> +}
> >> +
> >> +static void __exit tce_iommu_cleanup(void)
> >> +{
> >> +	struct pci_dev *pdev = NULL;
> >> +	struct iommu_table *tbl;
> >> +	struct iommu_group *grp = NULL;
> >> +
> >> +	bus_unregister_notifier(&pci_bus_type, &tce_iommu_bus_nb);
> >> +
> >> +	/* Delete PCI devices from VFIO groups */
> >> +	for_each_pci_dev(pdev)
> >> +		del_device(&pdev->dev);
> >> +
> >> +	/* Release VFIO groups */
> >> +	for_each_pci_dev(pdev) {
> >> +		tbl = get_iommu_table_base(&pdev->dev);
> >> +		if (!tbl)
> >> +			continue;
> >> +		grp = tbl->it_group;
> >> +
> >> +		/* Skip (already) uninitialized */
> >> +		if (!grp)
> >> +			continue;
> >> +
> >> +		/* Do actual release, group_release() is expected to work */
> >> +		iommu_group_put(grp);
> >> +		BUG_ON(tbl->it_group);
> >> +	}
> >> +}
> >> +
> >> +module_init(tce_iommu_init);
> >> +module_exit(tce_iommu_cleanup);
> >> +#endif /* CONFIG_IOMMU_API */
> >> diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
> >> index 9f69b56..29d11dc 100644
> >> --- a/drivers/iommu/Kconfig
> >> +++ b/drivers/iommu/Kconfig
> >> @@ -187,4 +187,12 @@ config EXYNOS_IOMMU_DEBUG
> >>
> >>   	  Say N unless you need kernel log message for IOMMU debugging
> >>
> >> +config SPAPR_TCE_IOMMU
> >> +	bool "sPAPR TCE IOMMU Support"
> >> +	depends on PPC_POWERNV
> >> +	select IOMMU_API
> >> +	help
> >> +	  Enables bits of IOMMU API required by VFIO. The iommu_ops is
> >> +	  still not implemented.
> >> +
> >>   endif # IOMMU_SUPPORT
> >
> > Thanks,
> >
> > Alex
> >
> 
> 




^ permalink raw reply	[flat|nested] 122+ messages in thread


* [PATCH] vfio powerpc: implemented IOMMU driver for VFIO
  2012-12-03 17:53             ` Alex Williamson
@ 2012-12-07  7:34               ` Alexey Kardashevskiy
  -1 siblings, 0 replies; 122+ messages in thread
From: Alexey Kardashevskiy @ 2012-12-07  7:34 UTC (permalink / raw)
  To: Alex Williamson
  Cc: Alexey Kardashevskiy, Benjamin Herrenschmidt, Paul Mackerras,
	linuxppc-dev, linux-kernel, kvm, David Gibson

VFIO implements platform independent stuff such as
a PCI driver, BAR access (via read/write on a file descriptor
or direct mapping when possible) and IRQ signaling.

The platform dependent part includes IOMMU initialization
and handling. This patch implements an IOMMU driver for VFIO
which does mapping/unmapping pages for the guest IO and
provides information about DMA window (required by a POWERPC
guest).

The counterpart in QEMU is required to support this functionality.
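
To illustrate the expected flow, here is a minimal sketch of how a
client could drive the new interface (error handling and the usual
container/group setup from Documentation/vfio.txt are omitted;
container_fd and buf are placeholders):

	struct vfio_iommu_spapr_tce_info info = { .argsz = sizeof(info) };
	vfio_iommu_spapr_tce_dma_map map = { .argsz = sizeof(map) };

	ioctl(container_fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);

	/* Map one IOMMU page (4K) at the start of the DMA32 window;
	 * the iova is an absolute PCI bus address */
	map.vaddr = (__u64)(unsigned long)buf;
	map.iova  = info.dma32_window_start;
	map.size  = 4096;
	map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
	ioctl(container_fd, VFIO_IOMMU_MAP_DMA, &map);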

Cc: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
 drivers/vfio/Kconfig                |    6 +
 drivers/vfio/Makefile               |    1 +
 drivers/vfio/vfio_iommu_spapr_tce.c |  348 +++++++++++++++++++++++++++++++++++
 include/linux/vfio.h                |   30 +++
 4 files changed, 385 insertions(+)
 create mode 100644 drivers/vfio/vfio_iommu_spapr_tce.c

diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
index 7cd5dec..b464687 100644
--- a/drivers/vfio/Kconfig
+++ b/drivers/vfio/Kconfig
@@ -3,10 +3,16 @@ config VFIO_IOMMU_TYPE1
 	depends on VFIO
 	default n
 
+config VFIO_IOMMU_SPAPR_TCE
+	tristate
+	depends on VFIO && SPAPR_TCE_IOMMU
+	default n
+
 menuconfig VFIO
 	tristate "VFIO Non-Privileged userspace driver framework"
 	depends on IOMMU_API
 	select VFIO_IOMMU_TYPE1 if X86
+	select VFIO_IOMMU_SPAPR_TCE if PPC_POWERNV
 	help
 	  VFIO provides a framework for secure userspace device drivers.
 	  See Documentation/vfio.txt for more details.
diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
index 2398d4a..72bfabc 100644
--- a/drivers/vfio/Makefile
+++ b/drivers/vfio/Makefile
@@ -1,3 +1,4 @@
 obj-$(CONFIG_VFIO) += vfio.o
 obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o
+obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o
 obj-$(CONFIG_VFIO_PCI) += pci/
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
new file mode 100644
index 0000000..b0f81fe
--- /dev/null
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -0,0 +1,348 @@
+/*
+ * VFIO: IOMMU DMA mapping support for TCE on POWER
+ *
+ * Copyright (C) 2012 IBM Corp.  All rights reserved.
+ *     Author: Alexey Kardashevskiy <aik@ozlabs.ru>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Derived from original vfio_iommu_type1.c:
+ * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
+ *     Author: Alex Williamson <alex.williamson@redhat.com>
+ */
+
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/err.h>
+#include <linux/vfio.h>
+#include <asm/iommu.h>
+
+#define DRIVER_VERSION  "0.1"
+#define DRIVER_AUTHOR   "aik@ozlabs.ru"
+#define DRIVER_DESC     "VFIO IOMMU SPAPR TCE"
+
+static void tce_iommu_detach_group(void *iommu_data,
+		struct iommu_group *iommu_group);
+
+/*
+ * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
+ */
+
+/*
+ * This code handles mapping and unmapping of user data buffers
+ * into DMA'ble space using the IOMMU
+ */
+
+#define NPAGE_TO_SIZE(npage)	((size_t)(npage) << PAGE_SHIFT)
+
+struct vwork {
+	struct mm_struct	*mm;
+	long			npage;
+	struct work_struct	work;
+};
+
+/* delayed decrement/increment for locked_vm */
+static void lock_acct_bg(struct work_struct *work)
+{
+	struct vwork *vwork = container_of(work, struct vwork, work);
+	struct mm_struct *mm;
+
+	mm = vwork->mm;
+	down_write(&mm->mmap_sem);
+	mm->locked_vm += vwork->npage;
+	up_write(&mm->mmap_sem);
+	mmput(mm);
+	kfree(vwork);
+}
+
+static void lock_acct(long npage)
+{
+	struct vwork *vwork;
+	struct mm_struct *mm;
+
+	if (!current->mm)
+		return; /* process exited */
+
+	if (down_write_trylock(&current->mm->mmap_sem)) {
+		current->mm->locked_vm += npage;
+		up_write(&current->mm->mmap_sem);
+		return;
+	}
+
+	/*
+	 * Couldn't get mmap_sem lock, so must setup to update
+	 * mm->locked_vm later. If locked_vm were atomic, we
+	 * wouldn't need this silliness
+	 */
+	vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL);
+	if (!vwork)
+		return;
+	mm = get_task_mm(current);
+	if (!mm) {
+		kfree(vwork);
+		return;
+	}
+	INIT_WORK(&vwork->work, lock_acct_bg);
+	vwork->mm = mm;
+	vwork->npage = npage;
+	schedule_work(&vwork->work);
+}
+
+/*
+ * The container descriptor supports only a single group per container.
+ * Required by the API as the container is not supplied with the IOMMU group
+ * at the moment of initialization.
+ */
+struct tce_container {
+	struct mutex lock;
+	struct iommu_table *tbl;
+};
+
+static void *tce_iommu_open(unsigned long arg)
+{
+	struct tce_container *container;
+
+	if (arg != VFIO_SPAPR_TCE_IOMMU) {
+		pr_err("tce_vfio: Wrong IOMMU type\n");
+		return ERR_PTR(-EINVAL);
+	}
+
+	container = kzalloc(sizeof(*container), GFP_KERNEL);
+	if (!container)
+		return ERR_PTR(-ENOMEM);
+
+	mutex_init(&container->lock);
+
+	return container;
+}
+
+static void tce_iommu_release(void *iommu_data)
+{
+	struct tce_container *container = iommu_data;
+
+	WARN_ON(container->tbl && !container->tbl->it_group);
+	if (container->tbl && container->tbl->it_group)
+		tce_iommu_detach_group(iommu_data, container->tbl->it_group);
+
+	mutex_destroy(&container->lock);
+
+	kfree(container);
+}
+
+static long tce_iommu_ioctl(void *iommu_data,
+				 unsigned int cmd, unsigned long arg)
+{
+	struct tce_container *container = iommu_data;
+	unsigned long minsz;
+	long ret;
+
+	switch (cmd) {
+	case VFIO_CHECK_EXTENSION:
+		return (arg == VFIO_SPAPR_TCE_IOMMU) ? 1 : 0;
+
+ 	case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
+		struct vfio_iommu_spapr_tce_info info;
+		struct iommu_table *tbl = container->tbl;
+
+		if (WARN_ON(!tbl))
+			return -ENXIO;
+
+		minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
+				dma32_window_size);
+
+		if (copy_from_user(&info, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (info.argsz < minsz)
+			return -EINVAL;
+
+		info.dma32_window_start = tbl->it_offset << IOMMU_PAGE_SHIFT;
+		info.dma32_window_size = tbl->it_size << IOMMU_PAGE_SHIFT;
+		info.flags = 0;
+
+		if (copy_to_user((void __user *)arg, &info, minsz))
+			return -EFAULT;
+
+		return 0;
+	}
+	case VFIO_IOMMU_MAP_DMA: {
+		vfio_iommu_spapr_tce_dma_map param;
+		struct iommu_table *tbl = container->tbl;
+		enum dma_data_direction direction;
+		unsigned long locked, lock_limit;
+
+		if (WARN_ON(!tbl))
+			return -ENXIO;
+
+		minsz = offsetofend(vfio_iommu_spapr_tce_dma_map, size);
+
+		if (copy_from_user(&param, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (param.argsz < minsz)
+			return -EINVAL;
+
+		if ((param.flags & VFIO_DMA_MAP_FLAG_READ) &&
+				(param.flags & VFIO_DMA_MAP_FLAG_WRITE))
+			direction = DMA_BIDIRECTIONAL;
+		else if (param.flags & VFIO_DMA_MAP_FLAG_READ)
+			direction = DMA_TO_DEVICE;
+		else if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
+			direction = DMA_FROM_DEVICE;
+		else
+			return -EINVAL;
+
+		if ((param.size & ~IOMMU_PAGE_MASK) ||
+				(param.iova & ~IOMMU_PAGE_MASK) ||
+				(param.vaddr & ~IOMMU_PAGE_MASK))
+			return -EINVAL;
+
+		if ((param.iova + param.size) >
+				((tbl->it_offset + tbl->it_size)
+				<< IOMMU_PAGE_SHIFT))
+			return -EINVAL;
+
+		if (param.iova < (tbl->it_offset << IOMMU_PAGE_SHIFT))
+			return -EINVAL;
+
+		/* Account for locked pages */
+		locked = current->mm->locked_vm +
+			(param.size >> PAGE_SHIFT);
+		lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+		if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
+			pr_warn("RLIMIT_MEMLOCK (%ld) exceeded\n",
+					rlimit(RLIMIT_MEMLOCK));
+			return -ENOMEM;
+		}
+
+		ret = iommu_put_tces(tbl, param.iova >> IOMMU_PAGE_SHIFT,
+				param.vaddr, direction,
+				param.size >> IOMMU_PAGE_SHIFT);
+
+		if (ret > 0)
+			lock_acct(ret);
+
+		return ret;
+	}
+	case VFIO_IOMMU_UNMAP_DMA: {
+		vfio_iommu_spapr_tce_dma_unmap param;
+		struct iommu_table *tbl = container->tbl;
+
+		if (WARN_ON(!tbl))
+			return -ENXIO;
+
+		minsz = offsetofend(vfio_iommu_spapr_tce_dma_unmap, size);
+
+		if (copy_from_user(&param, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (param.argsz < minsz)
+			return -EINVAL;
+
+		if ((param.size & ~IOMMU_PAGE_MASK) ||
+				(param.iova & ~IOMMU_PAGE_MASK))
+			return -EINVAL;
+
+		if ((param.iova + param.size) >
+				((tbl->it_offset + tbl->it_size)
+				<< IOMMU_PAGE_SHIFT))
+			return -EINVAL;
+
+		if (param.iova < (tbl->it_offset << IOMMU_PAGE_SHIFT))
+			return -EINVAL;
+
+		ret = iommu_clear_tces(tbl, param.iova >> IOMMU_PAGE_SHIFT,
+				param.size >> IOMMU_PAGE_SHIFT);
+
+		if (ret > 0)
+			lock_acct(-ret);
+
+		return ret;
+	}
+	default:
+		pr_warn("tce_vfio: unexpected cmd %x\n", cmd);
+	}
+
+	return -ENOTTY;
+}
+
+static int tce_iommu_attach_group(void *iommu_data,
+		struct iommu_group *iommu_group)
+{
+	struct tce_container *container = iommu_data;
+	struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
+
+	BUG_ON(!tbl);
+	mutex_lock(&container->lock);
+	pr_debug("tce_vfio: Attaching group #%u to iommu %p\n",
+			iommu_group_id(iommu_group), iommu_group);
+	if (container->tbl) {
+		pr_warn("tce_vfio: Only one group per IOMMU container is allowed, existing id=%d, attaching id=%d\n",
+				iommu_group_id(container->tbl->it_group),
+				iommu_group_id(iommu_group));
+		mutex_unlock(&container->lock);
+		return -EBUSY;
+	}
+
+	container->tbl = tbl;
+	iommu_reset_table(tbl, false);
+	mutex_unlock(&container->lock);
+
+	return 0;
+}
+
+static void tce_iommu_detach_group(void *iommu_data,
+		struct iommu_group *iommu_group)
+{
+	struct tce_container *container = iommu_data;
+	struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
+
+	BUG_ON(!tbl);
+	mutex_lock(&container->lock);
+	if (tbl != container->tbl) {
+		pr_warn("tce_vfio: detaching group #%u, expected group is #%u\n",
+				iommu_group_id(iommu_group),
+				iommu_group_id(tbl->it_group));
+	} else {
+
+		pr_debug("tce_vfio: detaching group #%u from iommu %p\n",
+				iommu_group_id(iommu_group), iommu_group);
+
+		container->tbl = NULL;
+		iommu_reset_table(tbl, true);
+	}
+	mutex_unlock(&container->lock);
+}
+
+const struct vfio_iommu_driver_ops tce_iommu_driver_ops = {
+	.name		= "iommu-vfio-powerpc",
+	.owner		= THIS_MODULE,
+	.open		= tce_iommu_open,
+	.release	= tce_iommu_release,
+	.ioctl		= tce_iommu_ioctl,
+	.attach_group	= tce_iommu_attach_group,
+	.detach_group	= tce_iommu_detach_group,
+};
+
+static int __init tce_iommu_init(void)
+{
+	return vfio_register_iommu_driver(&tce_iommu_driver_ops);
+}
+
+static void __exit tce_iommu_cleanup(void)
+{
+	vfio_unregister_iommu_driver(&tce_iommu_driver_ops);
+}
+
+module_init(tce_iommu_init);
+module_exit(tce_iommu_cleanup);
+
+MODULE_VERSION(DRIVER_VERSION);
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR(DRIVER_AUTHOR);
+MODULE_DESCRIPTION(DRIVER_DESC);
+
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index 0a4f180..23cfc79 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -99,6 +99,7 @@ extern void vfio_unregister_iommu_driver(
 /* Extensions */
 
 #define VFIO_TYPE1_IOMMU		1
+#define VFIO_SPAPR_TCE_IOMMU		2
 
 /*
  * The IOCTL interface is designed for extensibility by embedding the
@@ -442,4 +443,33 @@ struct vfio_iommu_type1_dma_unmap {
 
 #define VFIO_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14)
 
+/* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */
+
+/*
+ * The SPAPR TCE info struct provides information about the PCI bus
+ * address ranges available for DMA. These values are programmed into
+ * the hardware, so the guest has to know them.
+ *
+ * The DMA 32 bit window start is an absolute PCI bus address.
+ * The IOVA addresses passed via the map/unmap ioctls are absolute PCI
+ * bus addresses too, so the window works as a filter rather than an offset
+ * for IOVA addresses.
+ *
+ * At the moment the IOMMU page size is 4K.
+ */
+struct vfio_iommu_spapr_tce_info {
+	__u32 argsz;
+	__u32 flags;			/* reserved for future use */
+	__u32 dma32_window_start;	/* 32 bit window start (bytes) */
+	__u32 dma32_window_size;	/* 32 bit window size (bytes) */
+};
+
+#define VFIO_IOMMU_SPAPR_TCE_GET_INFO	_IO(VFIO_TYPE, VFIO_BASE + 12)
+
+/* Reuse type1 map/unmap structs as they are the same at the moment */
+typedef struct vfio_iommu_type1_dma_map vfio_iommu_spapr_tce_dma_map;
+typedef struct vfio_iommu_type1_dma_unmap vfio_iommu_spapr_tce_dma_unmap;
+
+/* ***************************************************************** */
+
 #endif /* VFIO_H */
-- 
1.7.10.4


^ permalink raw reply related	[flat|nested] 122+ messages in thread

* [PATCH] vfio powerpc: implemented IOMMU driver for VFIO
@ 2012-12-07  7:34               ` Alexey Kardashevskiy
  0 siblings, 0 replies; 122+ messages in thread
From: Alexey Kardashevskiy @ 2012-12-07  7:34 UTC (permalink / raw)
  To: Alex Williamson
  Cc: kvm, Alexey Kardashevskiy, linux-kernel, Paul Mackerras,
	linuxppc-dev, David Gibson

VFIO implements platform independent stuff such as
a PCI driver, BAR access (via read/write on a file descriptor
or direct mapping when possible) and IRQ signaling.

The platform dependent part includes IOMMU initialization
and handling. This patch implements an IOMMU driver for VFIO
which does mapping/unmapping pages for the guest IO and
provides information about DMA window (required by a POWERPC
guest).

The counterpart in QEMU is required to support this functionality.

Cc: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
 drivers/vfio/Kconfig                |    6 +
 drivers/vfio/Makefile               |    1 +
 drivers/vfio/vfio_iommu_spapr_tce.c |  348 +++++++++++++++++++++++++++++++++++
 include/linux/vfio.h                |   30 +++
 4 files changed, 385 insertions(+)
 create mode 100644 drivers/vfio/vfio_iommu_spapr_tce.c

diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
index 7cd5dec..b464687 100644
--- a/drivers/vfio/Kconfig
+++ b/drivers/vfio/Kconfig
@@ -3,10 +3,16 @@ config VFIO_IOMMU_TYPE1
 	depends on VFIO
 	default n
 
+config VFIO_IOMMU_SPAPR_TCE
+	tristate
+	depends on VFIO && SPAPR_TCE_IOMMU
+	default n
+
 menuconfig VFIO
 	tristate "VFIO Non-Privileged userspace driver framework"
 	depends on IOMMU_API
 	select VFIO_IOMMU_TYPE1 if X86
+	select VFIO_IOMMU_SPAPR_TCE if PPC_POWERNV
 	help
 	  VFIO provides a framework for secure userspace device drivers.
 	  See Documentation/vfio.txt for more details.
diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
index 2398d4a..72bfabc 100644
--- a/drivers/vfio/Makefile
+++ b/drivers/vfio/Makefile
@@ -1,3 +1,4 @@
 obj-$(CONFIG_VFIO) += vfio.o
 obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o
+obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o
 obj-$(CONFIG_VFIO_PCI) += pci/
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
new file mode 100644
index 0000000..b0f81fe
--- /dev/null
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -0,0 +1,348 @@
+/*
+ * VFIO: IOMMU DMA mapping support for TCE on POWER
+ *
+ * Copyright (C) 2012 IBM Corp.  All rights reserved.
+ *     Author: Alexey Kardashevskiy <aik@ozlabs.ru>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Derived from original vfio_iommu_type1.c:
+ * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
+ *     Author: Alex Williamson <alex.williamson@redhat.com>
+ */
+
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/err.h>
+#include <linux/vfio.h>
+#include <asm/iommu.h>
+
+#define DRIVER_VERSION  "0.1"
+#define DRIVER_AUTHOR   "aik@ozlabs.ru"
+#define DRIVER_DESC     "VFIO IOMMU SPAPR TCE"
+
+static void tce_iommu_detach_group(void *iommu_data,
+		struct iommu_group *iommu_group);
+
+/*
+ * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
+ */
+
+/*
+ * This code handles mapping and unmapping of user data buffers
+ * into DMA'ble space using the IOMMU
+ */
+
+#define NPAGE_TO_SIZE(npage)	((size_t)(npage) << PAGE_SHIFT)
+
+struct vwork {
+	struct mm_struct	*mm;
+	long			npage;
+	struct work_struct	work;
+};
+
+/* delayed decrement/increment for locked_vm */
+static void lock_acct_bg(struct work_struct *work)
+{
+	struct vwork *vwork = container_of(work, struct vwork, work);
+	struct mm_struct *mm;
+
+	mm = vwork->mm;
+	down_write(&mm->mmap_sem);
+	mm->locked_vm += vwork->npage;
+	up_write(&mm->mmap_sem);
+	mmput(mm);
+	kfree(vwork);
+}
+
+static void lock_acct(long npage)
+{
+	struct vwork *vwork;
+	struct mm_struct *mm;
+
+	if (!current->mm)
+		return; /* process exited */
+
+	if (down_write_trylock(&current->mm->mmap_sem)) {
+		current->mm->locked_vm += npage;
+		up_write(&current->mm->mmap_sem);
+		return;
+	}
+
+	/*
+	 * Couldn't get mmap_sem lock, so must setup to update
+	 * mm->locked_vm later. If locked_vm were atomic, we
+	 * wouldn't need this silliness
+	 */
+	vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL);
+	if (!vwork)
+		return;
+	mm = get_task_mm(current);
+	if (!mm) {
+		kfree(vwork);
+		return;
+	}
+	INIT_WORK(&vwork->work, lock_acct_bg);
+	vwork->mm = mm;
+	vwork->npage = npage;
+	schedule_work(&vwork->work);
+}
+
+/*
+ * The container descriptor supports only a single group per container.
+ * Required by the API as the container is not supplied with the IOMMU group
+ * at the moment of initialization.
+ */
+struct tce_container {
+	struct mutex lock;
+	struct iommu_table *tbl;
+};
+
+static void *tce_iommu_open(unsigned long arg)
+{
+	struct tce_container *container;
+
+	if (arg != VFIO_SPAPR_TCE_IOMMU) {
+		pr_err("tce_vfio: Wrong IOMMU type\n");
+		return ERR_PTR(-EINVAL);
+	}
+
+	container = kzalloc(sizeof(*container), GFP_KERNEL);
+	if (!container)
+		return ERR_PTR(-ENOMEM);
+
+	mutex_init(&container->lock);
+
+	return container;
+}
+
+static void tce_iommu_release(void *iommu_data)
+{
+	struct tce_container *container = iommu_data;
+
+	WARN_ON(container->tbl && !container->tbl->it_group);
+	if (container->tbl && container->tbl->it_group)
+		tce_iommu_detach_group(iommu_data, container->tbl->it_group);
+
+	mutex_destroy(&container->lock);
+
+	kfree(container);
+}
+
+static long tce_iommu_ioctl(void *iommu_data,
+				 unsigned int cmd, unsigned long arg)
+{
+	struct tce_container *container = iommu_data;
+	unsigned long minsz;
+	long ret;
+
+	switch (cmd) {
+	case VFIO_CHECK_EXTENSION:
+		return (arg == VFIO_SPAPR_TCE_IOMMU) ? 1 : 0;
+
+	case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
+		struct vfio_iommu_spapr_tce_info info;
+		struct iommu_table *tbl = container->tbl;
+
+		if (WARN_ON(!tbl))
+			return -ENXIO;
+
+		minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
+				dma32_window_size);
+
+		if (copy_from_user(&info, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (info.argsz < minsz)
+			return -EINVAL;
+
+		info.dma32_window_start = tbl->it_offset << IOMMU_PAGE_SHIFT;
+		info.dma32_window_size = tbl->it_size << IOMMU_PAGE_SHIFT;
+		info.flags = 0;
+
+		if (copy_to_user((void __user *)arg, &info, minsz))
+			return -EFAULT;
+
+		return 0;
+	}
+	case VFIO_IOMMU_MAP_DMA: {
+		vfio_iommu_spapr_tce_dma_map param;
+		struct iommu_table *tbl = container->tbl;
+		enum dma_data_direction direction;
+		unsigned long locked, lock_limit;
+
+		if (WARN_ON(!tbl))
+			return -ENXIO;
+
+		minsz = offsetofend(vfio_iommu_spapr_tce_dma_map, size);
+
+		if (copy_from_user(&param, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (param.argsz < minsz)
+			return -EINVAL;
+
+		if ((param.flags & VFIO_DMA_MAP_FLAG_READ) &&
+				(param.flags & VFIO_DMA_MAP_FLAG_WRITE))
+			direction = DMA_BIDIRECTIONAL;
+		else if (param.flags & VFIO_DMA_MAP_FLAG_READ)
+			direction = DMA_TO_DEVICE;
+		else if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
+			direction = DMA_FROM_DEVICE;
+		else
+			return -EINVAL;
+
+		if ((param.size & ~IOMMU_PAGE_MASK) ||
+				(param.iova & ~IOMMU_PAGE_MASK) ||
+				(param.vaddr & ~IOMMU_PAGE_MASK))
+			return -EINVAL;
+
+		if ((param.iova + param.size) >
+				((tbl->it_offset + tbl->it_size)
+				<< IOMMU_PAGE_SHIFT))
+			return -EINVAL;
+
+		if (param.iova < (tbl->it_offset << IOMMU_PAGE_SHIFT))
+			return -EINVAL;
+
+		/* Account for locked pages */
+		locked = current->mm->locked_vm +
+			(param.size >> PAGE_SHIFT);
+		lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+		if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
+			pr_warn("RLIMIT_MEMLOCK (%ld) exceeded\n",
+					rlimit(RLIMIT_MEMLOCK));
+			return -ENOMEM;
+		}
+
+		ret = iommu_put_tces(tbl, param.iova >> IOMMU_PAGE_SHIFT,
+				param.vaddr, direction,
+				param.size >> IOMMU_PAGE_SHIFT);
+
+		if (ret > 0)
+			lock_acct(ret);
+
+		return ret;
+	}
+	case VFIO_IOMMU_UNMAP_DMA: {
+		vfio_iommu_spapr_tce_dma_unmap param;
+		struct iommu_table *tbl = container->tbl;
+
+		if (WARN_ON(!tbl))
+			return -ENXIO;
+
+		minsz = offsetofend(vfio_iommu_spapr_tce_dma_unmap, size);
+
+		if (copy_from_user(&param, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (param.argsz < minsz)
+			return -EINVAL;
+
+		if ((param.size & ~IOMMU_PAGE_MASK) ||
+				(param.iova & ~IOMMU_PAGE_MASK))
+			return -EINVAL;
+
+		if ((param.iova + param.size) >
+				((tbl->it_offset + tbl->it_size)
+				<< IOMMU_PAGE_SHIFT))
+			return -EINVAL;
+
+		if (param.iova < (tbl->it_offset << IOMMU_PAGE_SHIFT))
+			return -EINVAL;
+
+		ret = iommu_clear_tces(tbl, param.iova >> IOMMU_PAGE_SHIFT,
+				param.size >> IOMMU_PAGE_SHIFT);
+
+		if (ret > 0)
+			lock_acct(-ret);
+
+		return ret;
+	}
+	default:
+		pr_warn("tce_vfio: unexpected cmd %x\n", cmd);
+	}
+
+	return -ENOTTY;
+}
+
+static int tce_iommu_attach_group(void *iommu_data,
+		struct iommu_group *iommu_group)
+{
+	struct tce_container *container = iommu_data;
+	struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
+
+	BUG_ON(!tbl);
+	mutex_lock(&container->lock);
+	pr_debug("tce_vfio: Attaching group #%u to iommu %p\n",
+			iommu_group_id(iommu_group), iommu_group);
+	if (container->tbl) {
+		pr_warn("tce_vfio: Only one group per IOMMU container is allowed, existing id=%d, attaching id=%d\n",
+				iommu_group_id(container->tbl->it_group),
+				iommu_group_id(iommu_group));
+		mutex_unlock(&container->lock);
+		return -EBUSY;
+	}
+
+	container->tbl = tbl;
+	iommu_reset_table(tbl, false);
+	mutex_unlock(&container->lock);
+
+	return 0;
+}
+
+static void tce_iommu_detach_group(void *iommu_data,
+		struct iommu_group *iommu_group)
+{
+	struct tce_container *container = iommu_data;
+	struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
+
+	BUG_ON(!tbl);
+	mutex_lock(&container->lock);
+	if (tbl != container->tbl) {
+		pr_warn("tce_vfio: detaching group #%u, expected group is #%u\n",
+				iommu_group_id(iommu_group),
+				iommu_group_id(tbl->it_group));
+	} else {
+
+		pr_debug("tce_vfio: detaching group #%u from iommu %p\n",
+				iommu_group_id(iommu_group), iommu_group);
+
+		container->tbl = NULL;
+		iommu_reset_table(tbl, true);
+	}
+	mutex_unlock(&container->lock);
+}
+
+const struct vfio_iommu_driver_ops tce_iommu_driver_ops = {
+	.name		= "iommu-vfio-powerpc",
+	.owner		= THIS_MODULE,
+	.open		= tce_iommu_open,
+	.release	= tce_iommu_release,
+	.ioctl		= tce_iommu_ioctl,
+	.attach_group	= tce_iommu_attach_group,
+	.detach_group	= tce_iommu_detach_group,
+};
+
+static int __init tce_iommu_init(void)
+{
+	return vfio_register_iommu_driver(&tce_iommu_driver_ops);
+}
+
+static void __exit tce_iommu_cleanup(void)
+{
+	vfio_unregister_iommu_driver(&tce_iommu_driver_ops);
+}
+
+module_init(tce_iommu_init);
+module_exit(tce_iommu_cleanup);
+
+MODULE_VERSION(DRIVER_VERSION);
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR(DRIVER_AUTHOR);
+MODULE_DESCRIPTION(DRIVER_DESC);
+
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index 0a4f180..23cfc79 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -99,6 +99,7 @@ extern void vfio_unregister_iommu_driver(
 /* Extensions */
 
 #define VFIO_TYPE1_IOMMU		1
+#define VFIO_SPAPR_TCE_IOMMU		2
 
 /*
  * The IOCTL interface is designed for extensibility by embedding the
@@ -442,4 +443,33 @@ struct vfio_iommu_type1_dma_unmap {
 
 #define VFIO_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14)
 
+/* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */
+
+/*
+ * The SPAPR TCE info struct provides information about the PCI bus
+ * address ranges available for DMA. These values are programmed into
+ * the hardware, so the guest has to know them.
+ *
+ * The DMA 32 bit window start is an absolute PCI bus address.
+ * The IOVA addresses passed via the map/unmap ioctls are absolute PCI
+ * bus addresses too, so the window works as a filter rather than an
+ * offset for IOVA addresses.
+ *
+ * At the moment the IOMMU page size is 4K.
+ */
+struct vfio_iommu_spapr_tce_info {
+	__u32 argsz;
+	__u32 flags;			/* reserved for future use */
+	__u32 dma32_window_start;	/* 32 bit window start (bytes) */
+	__u32 dma32_window_size;	/* 32 bit window size (bytes) */
+};
+
+#define VFIO_IOMMU_SPAPR_TCE_GET_INFO	_IO(VFIO_TYPE, VFIO_BASE + 12)
+
+/* Reuse type1 map/unmap structs as they are the same at the moment */
+typedef struct vfio_iommu_type1_dma_map vfio_iommu_spapr_tce_dma_map;
+typedef struct vfio_iommu_type1_dma_unmap vfio_iommu_spapr_tce_dma_unmap;
+
+/* ***************************************************************** */
+
 #endif /* VFIO_H */
-- 
1.7.10.4

^ permalink raw reply related	[flat|nested] 122+ messages in thread

* [PATCH] vfio powerpc: enabled on powernv platform
  2012-12-04 15:51                 ` Alex Williamson
@ 2012-12-07  7:35                   ` Alexey Kardashevskiy
  -1 siblings, 0 replies; 122+ messages in thread
From: Alexey Kardashevskiy @ 2012-12-07  7:35 UTC (permalink / raw)
  To: Alex Williamson
  Cc: Alexey Kardashevskiy, Benjamin Herrenschmidt, Paul Mackerras,
	linuxppc-dev, linux-kernel, kvm, David Gibson

This patch initializes IOMMU groups based on the IOMMU
configuration discovered during the PCI scan on the POWERNV
(POWER non-virtualized) platform. The IOMMU groups are
to be used later by the VFIO driver (PCI pass-through).

It also implements an API for mapping/unmapping pages for
guest PCI drivers and for providing DMA window properties.
This API is going to be used later by QEMU-VFIO to handle
h_put_tce hypercalls from the KVM guest.
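
As an illustration only, an in-kernel caller handling an h_put_tce
hypercall could use the new helpers roughly like this ("tbl", "ioba"
and "tce" come from a hypothetical hypercall context, error handling
is simplified):

	/* map one IOMMU page at bus address "ioba" to user address "tce" */
	ret = iommu_put_tces(tbl, ioba >> IOMMU_PAGE_SHIFT,
			tce, DMA_BIDIRECTIONAL, 1);
	if (ret < 0)
		return ret;

	/* ... and undo the mapping later */
	iommu_clear_tces(tbl, ioba >> IOMMU_PAGE_SHIFT, 1);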

Although this driver has been tested only on the POWERNV
platform, it should work on any platform which supports
TCE tables.

To enable VFIO on POWER, enable SPAPR_TCE_IOMMU config
option and configure VFIO as required.
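
For example, an illustrative .config fragment (module vs built-in is
the user's choice):

	CONFIG_SPAPR_TCE_IOMMU=y
	CONFIG_VFIO=m
	CONFIG_VFIO_PCI=m

VFIO_IOMMU_SPAPR_TCE is then selected automatically on PPC_POWERNV
by the VFIO side of this series.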

Cc: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
 arch/powerpc/include/asm/iommu.h     |   10 ++
 arch/powerpc/kernel/iommu.c          |  214 ++++++++++++++++++++++++++++++++++
 arch/powerpc/platforms/powernv/pci.c |  134 +++++++++++++++++++++
 drivers/iommu/Kconfig                |    8 ++
 4 files changed, 366 insertions(+)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index cbfe678..be3b11b 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -76,6 +76,9 @@ struct iommu_table {
 	struct iommu_pool large_pool;
 	struct iommu_pool pools[IOMMU_NR_POOLS];
 	unsigned long *it_map;       /* A simple allocation bitmap for now */
+#ifdef CONFIG_IOMMU_API
+	struct iommu_group *it_group;
+#endif
 };
 
 struct scatterlist;
@@ -147,5 +150,12 @@ static inline void iommu_restore(void)
 }
 #endif
 
+extern void iommu_reset_table(struct iommu_table *tbl, bool release);
+extern long iommu_clear_tces(struct iommu_table *tbl, unsigned long entry,
+		unsigned long pages);
+extern long iommu_put_tces(struct iommu_table *tbl, unsigned long entry,
+		uint64_t tce, enum dma_data_direction direction,
+		unsigned long pages);
+
 #endif /* __KERNEL__ */
 #endif /* _ASM_IOMMU_H */
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index ff5a6ce..123431a 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -44,6 +44,7 @@
 #include <asm/kdump.h>
 #include <asm/fadump.h>
 #include <asm/vio.h>
+#include <asm/tce.h>
 
 #define DBG(...)
 
@@ -856,3 +857,216 @@ void iommu_free_coherent(struct iommu_table *tbl, size_t size,
 		free_pages((unsigned long)vaddr, get_order(size));
 	}
 }
+
+#ifdef CONFIG_IOMMU_API
+/*
+ * SPAPR TCE API
+ */
+
+/*
+ * iommu_reset_table is called when the table starts or stops being used
+ */
+void iommu_reset_table(struct iommu_table *tbl, bool release)
+{
+	/*
+	 * Page at 0 is marked as used in iommu_init_table,
+	 * so here we clear it when called with release=false...
+	 */
+	if (!release && (tbl->it_offset == 0))
+		clear_bit(0, tbl->it_map);
+
+	iommu_clear_tces(tbl, tbl->it_offset, tbl->it_size);
+
+	memset(tbl->it_map, 0, (tbl->it_size + 7) >> 3);
+
+	/*
+	 * ... or restore when release=true
+	 */
+	if (release && (tbl->it_offset == 0))
+		set_bit(0, tbl->it_map);
+}
+EXPORT_SYMBOL_GPL(iommu_reset_table);
+
+/*
+ * Returns the number of used IOMMU pages (4K) within
+ * the same system page (4K or 64K).
+ * bitmap_weight is not used as it does not support bigendian maps.
+ * "offset" is an IOMMU page number relative to DMA window start.
+ */
+static int syspage_weight(unsigned long *map, unsigned long offset)
+{
+	int ret = 0, nbits = PAGE_SIZE/IOMMU_PAGE_SIZE;
+
+	/* Aligns TCE entry number to system page boundary */
+	offset &= PAGE_MASK >> IOMMU_PAGE_SHIFT;
+
+	/* Count used 4K pages */
+	while (nbits) {
+		if (test_bit(offset, map))
+			++ret;
+		--nbits;
+		++offset;
+	}
+
+	return ret;
+}
+
+static void tce_flush(struct iommu_table *tbl)
+{
+	/* Flush/invalidate TLB caches if necessary */
+	if (ppc_md.tce_flush)
+		ppc_md.tce_flush(tbl);
+
+	/* Make sure updates are seen by hardware */
+	mb();
+}
+
+/*
+ * clear_tces_nolock clears TCEs and returns the number of system pages
+ * which it called put_page() on
+ */
+static long clear_tces_nolock(struct iommu_table *tbl, unsigned long entry,
+		unsigned long pages)
+{
+	int i, retpages = 0, clr;
+	unsigned long oldtce, oldweight;
+	struct page *page;
+
+	for (i = 0; i < pages; ++i) {
+		if (!test_bit(entry + i - tbl->it_offset, tbl->it_map))
+			continue;
+
+		oldtce = ppc_md.tce_get(tbl, entry + i);
+		ppc_md.tce_free(tbl, entry + i, 1);
+
+		oldweight = syspage_weight(tbl->it_map,
+				entry + i - tbl->it_offset);
+		clr = __test_and_clear_bit(entry + i - tbl->it_offset,
+				tbl->it_map);
+
+		if (WARN_ON(!(oldtce & (TCE_PCI_WRITE | TCE_PCI_READ))))
+			continue;
+
+		page = pfn_to_page(oldtce >> PAGE_SHIFT);
+
+		if (WARN_ON(!page))
+			continue;
+
+		if (oldtce & TCE_PCI_WRITE)
+			SetPageDirty(page);
+
+		put_page(page);
+
+		/* That was the last IOMMU page within the system page */
+		if ((oldweight == 1) && clr)
+			++retpages;
+	}
+
+	return retpages;
+}
+
+/*
+ * iommu_clear_tces clears TCEs and returns the number
+ * of released system pages
+ */
+long iommu_clear_tces(struct iommu_table *tbl, unsigned long entry,
+		unsigned long pages)
+{
+	int ret;
+	struct iommu_pool *pool = get_pool(tbl, entry);
+
+	spin_lock(&(pool->lock));
+	ret = clear_tces_nolock(tbl, entry, pages);
+	tce_flush(tbl);
+	spin_unlock(&(pool->lock));
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_clear_tces);
+
+static int put_tce(struct iommu_table *tbl, unsigned long entry,
+		uint64_t tce, enum dma_data_direction direction)
+{
+	int ret;
+	struct page *page = NULL;
+	unsigned long kva, offset, oldweight;
+
+	/* Map new TCE */
+	offset = (tce & IOMMU_PAGE_MASK) - (tce & PAGE_MASK);
+	ret = get_user_pages_fast(tce & PAGE_MASK, 1,
+			direction != DMA_TO_DEVICE, &page);
+	if (ret != 1) {
+		pr_err("tce_vfio: get_user_pages_fast failed tce=%llx ioba=%lx ret=%d\n",
+				tce, entry << IOMMU_PAGE_SHIFT, ret);
+		return -EFAULT;
+	}
+
+	kva = (unsigned long) page_address(page);
+	kva += offset;
+
+	/* tce_build receives a virtual address */
+	ret = ppc_md.tce_build(tbl, entry, 1, kva, direction, NULL);
+
+	/* tce_build() only returns non-zero for transient errors */
+	if (unlikely(ret)) {
+		pr_err("tce_vfio: tce_put failed on tce=%llx ioba=%lx kva=%lx ret=%d\n",
+				tce, entry << IOMMU_PAGE_SHIFT, kva, ret);
+		put_page(page);
+		return -EIO;
+	}
+
+	/* Calculate if new system page has been locked */
+	oldweight = syspage_weight(tbl->it_map, entry - tbl->it_offset);
+	__set_bit(entry - tbl->it_offset, tbl->it_map);
+
+	return (oldweight == 0) ? 1 : 0;
+}
+
+/*
+ * iommu_put_tces builds TCEs and returns the number of system
+ * pages it actually locked
+ */
+long iommu_put_tces(struct iommu_table *tbl, unsigned long entry,
+		uint64_t tce, enum dma_data_direction direction,
+		unsigned long pages)
+{
+	int i, ret = 0, retpages = 0;
+	struct iommu_pool *pool = get_pool(tbl, entry);
+
+	BUILD_BUG_ON(PAGE_SIZE < IOMMU_PAGE_SIZE);
+	BUG_ON(direction == DMA_NONE);
+
+	spin_lock(&(pool->lock));
+
+	/* Check if any is in use */
+	for (i = 0; i < pages; ++i) {
+		if (test_bit(entry + i - tbl->it_offset, tbl->it_map)) {
+			spin_unlock(&(pool->lock));
+			return -EBUSY;
+		}
+	}
+
+	/* Put tces to the table */
+	for (i = 0; (i < pages) && (ret >= 0); ++i, tce += IOMMU_PAGE_SIZE) {
+		ret = put_tce(tbl, entry + i, tce, direction);
+		if (ret == 1)
+			++retpages;
+	}
+
+	/*
+	 * If failed, release locked pages, otherwise return the number
+	 * of locked system pages
+	 */
+	if (ret < 0)
+		clear_tces_nolock(tbl, entry, i);
+	else
+		ret = retpages;
+
+	tce_flush(tbl);
+	spin_unlock(&(pool->lock));
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_put_tces);
+
+#endif /* CONFIG_IOMMU_API */
diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index 05205cf..1b970bf 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -20,6 +20,7 @@
 #include <linux/irq.h>
 #include <linux/io.h>
 #include <linux/msi.h>
+#include <linux/iommu.h>
 
 #include <asm/sections.h>
 #include <asm/io.h>
@@ -613,3 +614,136 @@ void __init pnv_pci_init(void)
 	ppc_md.teardown_msi_irqs = pnv_teardown_msi_irqs;
 #endif
 }
+
+#ifdef CONFIG_IOMMU_API
+/*
+ * IOMMU groups support required by VFIO
+ */
+static int add_device(struct device *dev)
+{
+	struct iommu_table *tbl;
+	int ret = 0;
+
+	if (WARN_ON(dev->iommu_group)) {
+		pr_warn("tce_vfio: device %s is already in iommu group %d, skipping\n",
+				dev_name(dev),
+				iommu_group_id(dev->iommu_group));
+		return -EBUSY;
+	}
+
+	tbl = get_iommu_table_base(dev);
+	if (!tbl) {
+		pr_debug("tce_vfio: skipping device %s with no tbl\n",
+				dev_name(dev));
+		return 0;
+	}
+
+	pr_debug("tce_vfio: adding %s to iommu group %d\n",
+			dev_name(dev), iommu_group_id(tbl->it_group));
+
+	ret = iommu_group_add_device(tbl->it_group, dev);
+	if (ret < 0)
+		pr_err("tce_vfio: %s has not been added, ret=%d\n",
+				dev_name(dev), ret);
+
+	return ret;
+}
+
+static void del_device(struct device *dev)
+{
+	iommu_group_remove_device(dev);
+}
+
+static int iommu_bus_notifier(struct notifier_block *nb,
+			      unsigned long action, void *data)
+{
+	struct device *dev = data;
+
+	switch (action) {
+	case BUS_NOTIFY_ADD_DEVICE:
+		return add_device(dev);
+	case BUS_NOTIFY_DEL_DEVICE:
+		del_device(dev);
+		return 0;
+	default:
+		return 0;
+	}
+}
+
+static struct notifier_block tce_iommu_bus_nb = {
+	.notifier_call = iommu_bus_notifier,
+};
+
+static void group_release(void *iommu_data)
+{
+	struct iommu_table *tbl = iommu_data;
+	tbl->it_group = NULL;
+}
+
+static int __init tce_iommu_init(void)
+{
+	struct pci_dev *pdev = NULL;
+	struct iommu_table *tbl;
+	struct iommu_group *grp;
+
+	/* Allocate and initialize IOMMU groups */
+	for_each_pci_dev(pdev) {
+		tbl = get_iommu_table_base(&pdev->dev);
+		if (!tbl)
+			continue;
+
+		/* Skip already initialized */
+		if (tbl->it_group)
+			continue;
+
+		grp = iommu_group_alloc();
+		if (IS_ERR(grp)) {
+			pr_info("tce_vfio: cannot create new IOMMU group, ret=%ld\n",
+					PTR_ERR(grp));
+			return PTR_ERR(grp);
+		}
+		tbl->it_group = grp;
+		iommu_group_set_iommudata(grp, tbl, group_release);
+	}
+
+	bus_register_notifier(&pci_bus_type, &tce_iommu_bus_nb);
+
+	/* Add PCI devices to VFIO groups */
+	for_each_pci_dev(pdev)
+		add_device(&pdev->dev);
+
+	return 0;
+}
+
+static void __exit tce_iommu_cleanup(void)
+{
+	struct pci_dev *pdev = NULL;
+	struct iommu_table *tbl;
+	struct iommu_group *grp = NULL;
+
+	bus_unregister_notifier(&pci_bus_type, &tce_iommu_bus_nb);
+
+	/* Delete PCI devices from VFIO groups */
+	for_each_pci_dev(pdev)
+		del_device(&pdev->dev);
+
+	/* Release VFIO groups */
+	for_each_pci_dev(pdev) {
+		tbl = get_iommu_table_base(&pdev->dev);
+		if (!tbl)
+			continue;
+		grp = tbl->it_group;
+
+		/* Skip (already) uninitialized */
+		if (!grp)
+			continue;
+
+		/* Do actual release, group_release() is expected to work */
+		iommu_group_put(grp);
+		BUG_ON(tbl->it_group);
+	}
+}
+
+module_init(tce_iommu_init);
+module_exit(tce_iommu_cleanup);
+#endif /* CONFIG_IOMMU_API */
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index 9f69b56..29d11dc 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -187,4 +187,12 @@ config EXYNOS_IOMMU_DEBUG
 
 	  Say N unless you need kernel log message for IOMMU debugging
 
+config SPAPR_TCE_IOMMU
+	bool "sPAPR TCE IOMMU Support"
+	depends on PPC_POWERNV
+	select IOMMU_API
+	help
	  Enables the bits of the IOMMU API required by VFIO. The
	  iommu_ops callbacks are not implemented yet.
+
 endif # IOMMU_SUPPORT
-- 
1.7.10.4


^ permalink raw reply related	[flat|nested] 122+ messages in thread

* Re: [PATCH] vfio powerpc: implemented IOMMU driver for VFIO
  2012-12-07  7:34               ` Alexey Kardashevskiy
@ 2012-12-07 17:01                 ` Alex Williamson
  -1 siblings, 0 replies; 122+ messages in thread
From: Alex Williamson @ 2012-12-07 17:01 UTC (permalink / raw)
  To: Alexey Kardashevskiy
  Cc: Benjamin Herrenschmidt, Paul Mackerras, linuxppc-dev,
	linux-kernel, kvm, David Gibson

On Fri, 2012-12-07 at 18:34 +1100, Alexey Kardashevskiy wrote:
> VFIO implements platform independent stuff such as
> a PCI driver, BAR access (via read/write on a file descriptor
> or direct mapping when possible) and IRQ signaling.
> 
> The platform dependent part includes IOMMU initialization
> and handling. This patch implements an IOMMU driver for VFIO
> which does mapping/unmapping pages for the guest IO and
> provides information about DMA window (required by a POWERPC
> guest).
> 
> The counterpart in QEMU is required to support this functionality.
> 
> Cc: David Gibson <david@gibson.dropbear.id.au>
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> ---
>  drivers/vfio/Kconfig                |    6 +
>  drivers/vfio/Makefile               |    1 +
>  drivers/vfio/vfio_iommu_spapr_tce.c |  348 +++++++++++++++++++++++++++++++++++
>  include/linux/vfio.h                |   30 +++
>  4 files changed, 385 insertions(+)
>  create mode 100644 drivers/vfio/vfio_iommu_spapr_tce.c
> 
> diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
> index 7cd5dec..b464687 100644
> --- a/drivers/vfio/Kconfig
> +++ b/drivers/vfio/Kconfig
> @@ -3,10 +3,16 @@ config VFIO_IOMMU_TYPE1
>  	depends on VFIO
>  	default n
>  
> +config VFIO_IOMMU_SPAPR_TCE
> +	tristate
> +	depends on VFIO && SPAPR_TCE_IOMMU
> +	default n
> +
>  menuconfig VFIO
>  	tristate "VFIO Non-Privileged userspace driver framework"
>  	depends on IOMMU_API
>  	select VFIO_IOMMU_TYPE1 if X86
> +	select VFIO_IOMMU_SPAPR_TCE if PPC_POWERNV
>  	help
>  	  VFIO provides a framework for secure userspace device drivers.
>  	  See Documentation/vfio.txt for more details.
> diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
> index 2398d4a..72bfabc 100644
> --- a/drivers/vfio/Makefile
> +++ b/drivers/vfio/Makefile
> @@ -1,3 +1,4 @@
>  obj-$(CONFIG_VFIO) += vfio.o
>  obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o
> +obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o
>  obj-$(CONFIG_VFIO_PCI) += pci/
> diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
> new file mode 100644
> index 0000000..b0f81fe
> --- /dev/null
> +++ b/drivers/vfio/vfio_iommu_spapr_tce.c
> @@ -0,0 +1,348 @@
> +/*
> + * VFIO: IOMMU DMA mapping support for TCE on POWER
> + *
> + * Copyright (C) 2012 IBM Corp.  All rights reserved.
> + *     Author: Alexey Kardashevskiy <aik@ozlabs.ru>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + *
> + * Derived from original vfio_iommu_type1.c:
> + * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
> + *     Author: Alex Williamson <alex.williamson@redhat.com>
> + */
> +
> +#include <linux/module.h>
> +#include <linux/pci.h>
> +#include <linux/slab.h>
> +#include <linux/uaccess.h>
> +#include <linux/err.h>
> +#include <linux/vfio.h>
> +#include <asm/iommu.h>
> +
> +#define DRIVER_VERSION  "0.1"
> +#define DRIVER_AUTHOR   "aik@ozlabs.ru"
> +#define DRIVER_DESC     "VFIO IOMMU SPAPR TCE"
> +
> +static void tce_iommu_detach_group(void *iommu_data,
> +		struct iommu_group *iommu_group);
> +
> +/*
> + * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
> + */
> +
> +/*
> + * This code handles mapping and unmapping of user data buffers
> + * into DMA'ble space using the IOMMU
> + */
> +
> +#define NPAGE_TO_SIZE(npage)	((size_t)(npage) << PAGE_SHIFT)
> +
> +struct vwork {
> +	struct mm_struct	*mm;
> +	long			npage;
> +	struct work_struct	work;
> +};
> +
> +/* delayed decrement/increment for locked_vm */
> +static void lock_acct_bg(struct work_struct *work)
> +{
> +	struct vwork *vwork = container_of(work, struct vwork, work);
> +	struct mm_struct *mm;
> +
> +	mm = vwork->mm;
> +	down_write(&mm->mmap_sem);
> +	mm->locked_vm += vwork->npage;
> +	up_write(&mm->mmap_sem);
> +	mmput(mm);
> +	kfree(vwork);
> +}
> +
> +static void lock_acct(long npage)
> +{
> +	struct vwork *vwork;
> +	struct mm_struct *mm;
> +
> +	if (!current->mm)
> +		return; /* process exited */
> +
> +	if (down_write_trylock(&current->mm->mmap_sem)) {
> +		current->mm->locked_vm += npage;
> +		up_write(&current->mm->mmap_sem);
> +		return;
> +	}
> +
> +	/*
> +	 * Couldn't get mmap_sem lock, so must setup to update
> +	 * mm->locked_vm later. If locked_vm were atomic, we
> +	 * wouldn't need this silliness
> +	 */
> +	vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL);
> +	if (!vwork)
> +		return;
> +	mm = get_task_mm(current);
> +	if (!mm) {
> +		kfree(vwork);
> +		return;
> +	}
> +	INIT_WORK(&vwork->work, lock_acct_bg);
> +	vwork->mm = mm;
> +	vwork->npage = npage;
> +	schedule_work(&vwork->work);
> +}
> +
> +/*
> + * The container descriptor supports only a single group per container.
> + * Required by the API as the container is not supplied with the IOMMU group
> + * at the moment of initialization.
> + */
> +struct tce_container {
> +	struct mutex lock;
> +	struct iommu_table *tbl;
> +};
> +
> +static void *tce_iommu_open(unsigned long arg)
> +{
> +	struct tce_container *container;
> +
> +	if (arg != VFIO_SPAPR_TCE_IOMMU) {
> +		pr_err("tce_vfio: Wrong IOMMU type\n");
> +		return ERR_PTR(-EINVAL);
> +	}
> +
> +	container = kzalloc(sizeof(*container), GFP_KERNEL);
> +	if (!container)
> +		return ERR_PTR(-ENOMEM);
> +
> +	mutex_init(&container->lock);
> +
> +	return container;
> +}
> +
> +static void tce_iommu_release(void *iommu_data)
> +{
> +	struct tce_container *container = iommu_data;
> +
> +	WARN_ON(container->tbl && !container->tbl->it_group);
> +	if (container->tbl && container->tbl->it_group)
> +		tce_iommu_detach_group(iommu_data, container->tbl->it_group);
> +
> +	mutex_destroy(&container->lock);
> +
> +	kfree(container);
> +}
> +
> +static long tce_iommu_ioctl(void *iommu_data,
> +				 unsigned int cmd, unsigned long arg)
> +{
> +	struct tce_container *container = iommu_data;
> +	unsigned long minsz;
> +	long ret;
> +
> +	switch (cmd) {
> +	case VFIO_CHECK_EXTENSION:
> +		return (arg == VFIO_SPAPR_TCE_IOMMU) ? 1 : 0;
> +
> + 	case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
> +		struct vfio_iommu_spapr_tce_info info;
> +		struct iommu_table *tbl = container->tbl;
> +
> +		if (WARN_ON(!tbl))
> +			return -ENXIO;
> +
> +		minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
> +				dma32_window_size);
> +
> +		if (copy_from_user(&info, (void __user *)arg, minsz))
> +			return -EFAULT;
> +
> +		if (info.argsz < minsz)
> +			return -EINVAL;
> +
> +		info.dma32_window_start = tbl->it_offset << IOMMU_PAGE_SHIFT;
> +		info.dma32_window_size = tbl->it_size << IOMMU_PAGE_SHIFT;
> +		info.flags = 0;
> +
> +		if (copy_to_user((void __user *)arg, &info, minsz))
> +			return -EFAULT;
> +
> +		return 0;
> +	}
> +	case VFIO_IOMMU_MAP_DMA: {
> +		vfio_iommu_spapr_tce_dma_map param;
> +		struct iommu_table *tbl = container->tbl;
> +		enum dma_data_direction direction;
> +		unsigned long locked, lock_limit;
> +
> +		if (WARN_ON(!tbl))
> +			return -ENXIO;
> +
> +		minsz = offsetofend(vfio_iommu_spapr_tce_dma_map, size);
> +
> +		if (copy_from_user(&param, (void __user *)arg, minsz))
> +			return -EFAULT;
> +
> +		if (param.argsz < minsz)
> +			return -EINVAL;
> +
> +		if ((param.flags & VFIO_DMA_MAP_FLAG_READ) &&
> +				(param.flags & VFIO_DMA_MAP_FLAG_WRITE))
> +			direction = DMA_BIDIRECTIONAL;
> +		else if (param.flags & VFIO_DMA_MAP_FLAG_READ)
> +			direction = DMA_TO_DEVICE;
> +		else if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
> +			direction = DMA_FROM_DEVICE;
> +		else
> +			return -EINVAL;

flags needs to be sanitized too.  Return EINVAL if any unknown bit is
set or else sloppy users may make it very difficult to make use of those
flag bits later.
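
For instance (sketch, assuming READ and WRITE are the only flags
defined for this ioctl today):

	if (param.flags & ~(VFIO_DMA_MAP_FLAG_READ |
			VFIO_DMA_MAP_FLAG_WRITE))
		return -EINVAL;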

> +
> +		if ((param.size & ~IOMMU_PAGE_MASK) ||
> +				(param.iova & ~IOMMU_PAGE_MASK) ||
> +				(param.vaddr & ~IOMMU_PAGE_MASK))
> +			return -EINVAL;
> +
> +		if ((param.iova + param.size) >
> +				((tbl->it_offset + tbl->it_size)
> +				<< IOMMU_PAGE_SHIFT))
> +			return -EINVAL;
> +
> +		if (param.iova < (tbl->it_offset << IOMMU_PAGE_SHIFT))
> +			return -EINVAL;
> +
> +		/* Account for locked pages */
> +		locked = current->mm->locked_vm +
> +			(param.size >> PAGE_SHIFT);

If a user were to map only IOMMU_PAGE_SIZE entries on a 64k PAGE_SIZE
system, they'd circumvent this test.
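
One possible fix (sketch): account in whole system pages by rounding
the request up before the comparison:

	locked = current->mm->locked_vm +
			(ALIGN(param.size, PAGE_SIZE) >> PAGE_SHIFT);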

> +		lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> +		if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
> +			pr_warn("RLIMIT_MEMLOCK (%ld) exceeded\n",
> +					rlimit(RLIMIT_MEMLOCK));
> +			return -ENOMEM;
> +		}
> +
> +		ret = iommu_put_tces(tbl, param.iova >> IOMMU_PAGE_SHIFT,
> +				param.vaddr, direction,
> +				param.size >> IOMMU_PAGE_SHIFT);
> +
> +		if (ret > 0)
> +			lock_acct(ret);
> +
> +		return ret;
> +	}
> +	case VFIO_IOMMU_UNMAP_DMA: {
> +		vfio_iommu_spapr_tce_dma_unmap param;
> +		struct iommu_table *tbl = container->tbl;
> +
> +		if (WARN_ON(!tbl))
> +			return -ENXIO;
> +
> +		minsz = offsetofend(vfio_iommu_spapr_tce_dma_unmap, size);
> +
> +		if (copy_from_user(&param, (void __user *)arg, minsz))
> +			return -EFAULT;
> +
> +		if (param.argsz < minsz)
> +			return -EINVAL;
> +
> +		if ((param.size & ~IOMMU_PAGE_MASK) ||
> +				(param.iova & ~IOMMU_PAGE_MASK))
> +			return -EINVAL;
> +
> +		if ((param.iova + param.size) >
> +				((tbl->it_offset + tbl->it_size)
> +				<< IOMMU_PAGE_SHIFT))
> +			return -EINVAL;
> +
> +		if (param.iova < (tbl->it_offset << IOMMU_PAGE_SHIFT))
> +			return -EINVAL;

Sanitize flags here too.
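
Since no flags are defined for unmap at all yet, that could be as
simple as (sketch):

	if (param.flags)
		return -EINVAL;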

> +
> +		ret = iommu_clear_tces(tbl, param.iova >> IOMMU_PAGE_SHIFT,
> +				param.size >> IOMMU_PAGE_SHIFT);
> +
> +		if (ret > 0)
> +			lock_acct(-ret);
> +
> +		return ret;
> +	}
> +	default:
> +		pr_warn("tce_vfio: unexpected cmd %x\n", cmd);

This is unnecessary additional future maintenance.  You only support
EXTENSION, INFO, MAP, and UNMAP.  It doesn't matter what else gets added
later.
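
i.e. the sketch of a minimal version is just:

	default:
		return -ENOTTY;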

> +	}
> +
> +	return -ENOTTY;
> +}
> +
> +static int tce_iommu_attach_group(void *iommu_data,
> +		struct iommu_group *iommu_group)
> +{
> +	struct tce_container *container = iommu_data;
> +	struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
> +
> +	BUG_ON(!tbl);
> +	mutex_lock(&container->lock);
> +	pr_debug("tce_vfio: Attaching group #%u to iommu %p\n",
> +			iommu_group_id(iommu_group), iommu_group);
> +	if (container->tbl) {
> +		pr_warn("tce_vfio: Only one group per IOMMU container is allowed, existing id=%d, attaching id=%d\n",
> +				iommu_group_id(container->tbl->it_group),
> +				iommu_group_id(iommu_group));
> +		mutex_unlock(&container->lock);
> +		return -EBUSY;
> +	}
> +
> +	container->tbl = tbl;
> +	iommu_reset_table(tbl, false);
> +	mutex_unlock(&container->lock);
> +
> +	return 0;
> +}
> +
> +static void tce_iommu_detach_group(void *iommu_data,
> +		struct iommu_group *iommu_group)
> +{
> +	struct tce_container *container = iommu_data;
> +	struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
> +
> +	BUG_ON(!tbl);
> +	mutex_lock(&container->lock);
> +	if (tbl != container->tbl) {
> +		pr_warn("tce_vfio: detaching group #%u, expected group is #%u\n",
> +				iommu_group_id(iommu_group),
> +				iommu_group_id(tbl->it_group));
> +	} else {
> +
> +		pr_debug("tce_vfio: detaching group #%u from iommu %p\n",
> +				iommu_group_id(iommu_group), iommu_group);
> +
> +		container->tbl = NULL;
> +		iommu_reset_table(tbl, true);
> +	}
> +	mutex_unlock(&container->lock);
> +}
> +
> +const struct vfio_iommu_driver_ops tce_iommu_driver_ops = {
> +	.name		= "iommu-vfio-powerpc",
> +	.owner		= THIS_MODULE,
> +	.open		= tce_iommu_open,
> +	.release	= tce_iommu_release,
> +	.ioctl		= tce_iommu_ioctl,
> +	.attach_group	= tce_iommu_attach_group,
> +	.detach_group	= tce_iommu_detach_group,
> +};
> +
> +static int __init tce_iommu_init(void)
> +{
> +	return vfio_register_iommu_driver(&tce_iommu_driver_ops);
> +}
> +
> +static void __exit tce_iommu_cleanup(void)
> +{
> +	vfio_unregister_iommu_driver(&tce_iommu_driver_ops);
> +}
> +
> +module_init(tce_iommu_init);
> +module_exit(tce_iommu_cleanup);
> +
> +MODULE_VERSION(DRIVER_VERSION);
> +MODULE_LICENSE("GPL v2");
> +MODULE_AUTHOR(DRIVER_AUTHOR);
> +MODULE_DESCRIPTION(DRIVER_DESC);
> +
> diff --git a/include/linux/vfio.h b/include/linux/vfio.h
> index 0a4f180..23cfc79 100644
> --- a/include/linux/vfio.h
> +++ b/include/linux/vfio.h
> @@ -99,6 +99,7 @@ extern void vfio_unregister_iommu_driver(
>  /* Extensions */
>  
>  #define VFIO_TYPE1_IOMMU		1
> +#define VFIO_SPAPR_TCE_IOMMU		2
>  
>  /*
>   * The IOCTL interface is designed for extensibility by embedding the
> @@ -442,4 +443,33 @@ struct vfio_iommu_type1_dma_unmap {
>  
>  #define VFIO_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14)
>  
> +/* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */
> +
> +/*
> + * The SPAPR TCE info struct provides the information about the PCI bus
> + * address ranges available for DMA, these values are programmed into
> + * the hardware so the guest has to know that information.
> + *
> + * The DMA 32 bit window start is an absolute PCI bus address.
> + * The IOVA address passed via map/unmap ioctls are absolute PCI bus
> + * addresses too so the window works as a filter rather than an offset
> + * for IOVA addresses.
> + *
> + * At the momen the IOMMU page size is 4K.

s/momen/moment/

A flag will need to be added if other page sizes are supported, so as
defined here, it's always 4k.  Thanks,

Alex

> + */
> +struct vfio_iommu_spapr_tce_info {
> +	__u32 argsz;
> +	__u32 flags;			/* reserved for future use */
> +	__u32 dma32_window_start;	/* 32 bit window start (bytes) */
> +	__u32 dma32_window_size;	/* 32 bit window size (bytes) */
> +};
> +
> +#define VFIO_IOMMU_SPAPR_TCE_GET_INFO	_IO(VFIO_TYPE, VFIO_BASE + 12)
> +
> +/* Reuse type1 map/unmap structs as they are the same at the moment */
> +typedef struct vfio_iommu_type1_dma_map vfio_iommu_spapr_tce_dma_map;
> +typedef struct vfio_iommu_type1_dma_unmap vfio_iommu_spapr_tce_dma_unmap;
> +
> +/* ***************************************************************** */
> +
>  #endif /* VFIO_H */




^ permalink raw reply	[flat|nested] 122+ messages in thread

* Re: [PATCH] vfio powerpc: enabled on powernv platform
  2012-12-07  7:35                   ` Alexey Kardashevskiy
@ 2012-12-07 17:38                     ` Alex Williamson
  -1 siblings, 0 replies; 122+ messages in thread
From: Alex Williamson @ 2012-12-07 17:38 UTC (permalink / raw)
  To: Alexey Kardashevskiy
  Cc: Benjamin Herrenschmidt, Paul Mackerras, linuxppc-dev,
	linux-kernel, kvm, David Gibson

On Fri, 2012-12-07 at 18:35 +1100, Alexey Kardashevskiy wrote:
> This patch initializes IOMMU groups based on the IOMMU
> configuration discovered during the PCI scan on POWERNV
> (POWER non virtualized) platform. The IOMMU groups are
> to be used later by VFIO driver (PCI pass through).
> 
> It also implements an API for mapping/unmapping pages for
> guest PCI drivers and providing DMA window properties.
> This API is going to be used later by QEMU-VFIO to handle
> h_put_tce hypercalls from the KVM guest.
> 
> Although this driver has been tested only on the POWERNV
> platform, it should work on any platform which supports
> TCE tables.
> 
> To enable VFIO on POWER, enable SPAPR_TCE_IOMMU config
> option and configure VFIO as required.
> 
> Cc: David Gibson <david@gibson.dropbear.id.au>
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> ---
>  arch/powerpc/include/asm/iommu.h     |   10 ++
>  arch/powerpc/kernel/iommu.c          |  214 ++++++++++++++++++++++++++++++++++
>  arch/powerpc/platforms/powernv/pci.c |  134 +++++++++++++++++++++
>  drivers/iommu/Kconfig                |    8 ++
>  4 files changed, 366 insertions(+)
> 
> diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
> index cbfe678..be3b11b 100644
> --- a/arch/powerpc/include/asm/iommu.h
> +++ b/arch/powerpc/include/asm/iommu.h
> @@ -76,6 +76,9 @@ struct iommu_table {
>  	struct iommu_pool large_pool;
>  	struct iommu_pool pools[IOMMU_NR_POOLS];
>  	unsigned long *it_map;       /* A simple allocation bitmap for now */
> +#ifdef CONFIG_IOMMU_API
> +	struct iommu_group *it_group;
> +#endif
>  };
>  
>  struct scatterlist;
> @@ -147,5 +150,12 @@ static inline void iommu_restore(void)
>  }
>  #endif
>  
> +extern void iommu_reset_table(struct iommu_table *tbl, bool release);
> +extern long iommu_clear_tces(struct iommu_table *tbl, unsigned long entry,
> +		unsigned long pages);
> +extern long iommu_put_tces(struct iommu_table *tbl, unsigned long entry,
> +		uint64_t tce, enum dma_data_direction direction,
> +		unsigned long pages);
> +
>  #endif /* __KERNEL__ */
>  #endif /* _ASM_IOMMU_H */
> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
> index ff5a6ce..123431a 100644
> --- a/arch/powerpc/kernel/iommu.c
> +++ b/arch/powerpc/kernel/iommu.c
> @@ -44,6 +44,7 @@
>  #include <asm/kdump.h>
>  #include <asm/fadump.h>
>  #include <asm/vio.h>
> +#include <asm/tce.h>
>  
>  #define DBG(...)
>  
> @@ -856,3 +857,216 @@ void iommu_free_coherent(struct iommu_table *tbl, size_t size,
>  		free_pages((unsigned long)vaddr, get_order(size));
>  	}
>  }
> +
> +#ifdef CONFIG_IOMMU_API
> +/*
> + * SPAPR TCE API
> + */
> +
> +/*
> + * iommu_reset_table is called when it started/stopped being used
> + */
> +void iommu_reset_table(struct iommu_table *tbl, bool release)
> +{
> +	/*
> +	 * Page at 0 is marked as used in iommu_init_table,
> +	 * so here we clear it when called with release=false...
> +	 */
> +	if (!release && (tbl->it_offset == 0))
> +		clear_bit(0, tbl->it_map);

Isn't this redundant to the memset below?

> +
> +	iommu_clear_tces(tbl, tbl->it_offset, tbl->it_size);
> +
> +	memset(tbl->it_map, 0, (tbl->it_size + 7) >> 3);
> +
> +	/*
> +	 * ... or restore when release=true
> +	 */
> +	if (release && (tbl->it_offset == 0))
> +		set_bit(0, tbl->it_map);

"release" to me implies something is freed, maybe this should just be
called "restore".

> +}
> +EXPORT_SYMBOL_GPL(iommu_reset_table);
> +
> +/*
> + * Returns the number of used IOMMU pages (4K) within
> + * the same system page (4K or 64K).
> + * bitmap_weight is not used as it does not support bigendian maps.
> + * "offset" is an IOMMU page number relative to DMA window start.
> + */
> +static int syspage_weight(unsigned long *map, unsigned long offset)
> +{
> +	int ret = 0, nbits = PAGE_SIZE/IOMMU_PAGE_SIZE;
> +
> +	/* Aligns TCE entry number to system page boundary */
> +	offset &= PAGE_MASK >> IOMMU_PAGE_SHIFT;
> +
> +	/* Count used 4K pages */
> +	while (nbits) {
> +		if (test_bit(offset, map))
> +			++ret;
> +		--nbits;
> +		++offset;
> +	}
> +
> +	return ret;
> +}
> +
> +static void tce_flush(struct iommu_table *tbl)
> +{
> +	/* Flush/invalidate TLB caches if necessary */
> +	if (ppc_md.tce_flush)
> +		ppc_md.tce_flush(tbl);
> +
> +	/* Make sure updates are seen by hardware */
> +	mb();
> +}
> +
> +/*
> + * iommu_clear_tces clears tces and returned the number of system pages
> + * which it called put_page() on
> + */
> +static long clear_tces_nolock(struct iommu_table *tbl, unsigned long entry,
> +		unsigned long pages)
> +{
> +	int i, retpages = 0, clr;
> +	unsigned long oldtce, oldweight;
> +	struct page *page;
> +
> +	for (i = 0; i < pages; ++i) {

Any reason not to increment "entry" and avoid the 5 cases of "entry + i"
below?
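
That is, something along these lines (the next revision below does this):

	for (i = 0; i < pages; ++i, ++entry) {
		if (!test_bit(entry - tbl->it_offset, tbl->it_map))
			continue;
		/* ... the body then uses plain "entry" throughout ... */
	}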

> +		if (!test_bit(entry + i - tbl->it_offset, tbl->it_map))
> +			continue;
> +
> +		oldtce = ppc_md.tce_get(tbl, entry + i);
> +		ppc_md.tce_free(tbl, entry + i, 1);
> +
> +		oldweight = syspage_weight(tbl->it_map,
> +				entry + i - tbl->it_offset);
> +		clr = __test_and_clear_bit(entry + i - tbl->it_offset,
> +				tbl->it_map);
> +
> +		if (WARN_ON(!(oldtce & (TCE_PCI_WRITE | TCE_PCI_READ))))
> +			continue;
> +
> +		page = pfn_to_page(oldtce >> PAGE_SHIFT);
> +
> +		if (WARN_ON(!page))
> +			continue;
> +
> +		if (oldtce & TCE_PCI_WRITE)
> +			SetPageDirty(page);
> +
> +		put_page(page);
> +
> +		/* That was the last IOMMU page within the system page */
> +		if ((oldweight == 1) && clr)
> +			++retpages;
> +	}
> +
> +	return retpages;
> +}
> +
> +/*
> + * iommu_clear_tces clears tces and returned the number
> + * of released system pages
> + */
> +long iommu_clear_tces(struct iommu_table *tbl, unsigned long entry,
> +		unsigned long pages)
> +{
> +	int ret;
> +	struct iommu_pool *pool = get_pool(tbl, entry);
> +
> +	spin_lock(&(pool->lock));
> +	ret = clear_tces_nolock(tbl, entry, pages);
> +	tce_flush(tbl);
> +	spin_unlock(&(pool->lock));
> +
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(iommu_clear_tces);
> +
> +static int put_tce(struct iommu_table *tbl, unsigned long entry,
> +		uint64_t tce, enum dma_data_direction direction)
> +{
> +	int ret;
> +	struct page *page = NULL;
> +	unsigned long kva, offset, oldweight;
> +
> +	/* Map new TCE */
> +	offset = (tce & IOMMU_PAGE_MASK) - (tce & PAGE_MASK);
> +	ret = get_user_pages_fast(tce & PAGE_MASK, 1,
> +			direction != DMA_TO_DEVICE, &page);
> +	if (ret != 1) {
> +		pr_err("tce_vfio: get_user_pages_fast failed tce=%llx ioba=%lx ret=%d\n",
> +				tce, entry << IOMMU_PAGE_SHIFT, ret);
> +		return -EFAULT;
> +	}
> +
> +	kva = (unsigned long) page_address(page);
> +	kva += offset;
> +
> +	/* tce_build receives a virtual address */
> +	ret = ppc_md.tce_build(tbl, entry, 1, kva, direction, NULL);
> +
> +	/* tce_build() only returns non-zero for transient errors */
> +	if (unlikely(ret)) {
> +		pr_err("tce_vfio: tce_put failed on tce=%llx ioba=%lx kva=%lx ret=%d\n",
> +				tce, entry << IOMMU_PAGE_SHIFT, kva, ret);
> +		put_page(page);
> +		return -EIO;
> +	}
> +
> +	/* Calculate if new system page has been locked */
> +	oldweight = syspage_weight(tbl->it_map, entry - tbl->it_offset);
> +	__set_bit(entry - tbl->it_offset, tbl->it_map);
> +
> +	return (oldweight == 0) ? 1 : 0;

It seems like there's an optimization for syspage_weight since you only
care about two cases, i.e. syspage_weight_one and syspage_weight_zero.
The zero test is easy, just mask and return !! the value.  Testing
weight 1 means you don't have to find more than 2 bits set.  I won't
hold you to that optimization, just fyi.
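
For illustration, the zero test could look like this for 64K system pages
(16 IOMMU pages per system page), which is roughly what the next revision
below ends up doing:

	static int syspage_weight_zero(unsigned long *map, unsigned long offset)
	{
		/* Mask of the 16 bits covering one system page; nonzero == used */
		offset &= PAGE_MASK >> IOMMU_PAGE_SHIFT;
		return 0xffffUL & (map[BIT_WORD(offset)] >>
				(offset & (BITS_PER_LONG - 1)));
	}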

> +}
> +
> +/*
> + * iommu_put_tces builds tces and returned the number of actually
> + * locked system pages
> + */
> +long iommu_put_tces(struct iommu_table *tbl, unsigned long entry,
> +		uint64_t tce, enum dma_data_direction direction,
> +		unsigned long pages)
> +{
> +	int i, ret = 0, retpages = 0;
> +	struct iommu_pool *pool = get_pool(tbl, entry);
> +
> +	BUILD_BUG_ON(PAGE_SIZE < IOMMU_PAGE_SIZE);
> +	BUG_ON(direction == DMA_NONE);
> +
> +	spin_lock(&(pool->lock));
> +
> +	/* Check if any is in use */
> +	for (i = 0; i < pages; ++i) {
> +		if (test_bit(entry + i - tbl->it_offset, tbl->it_map)) {

Again, seems like there's an optimization here that avoids individually
testing bits since you only care about zero or non-zero for a sequential
run.
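
For example, find_next_bit() can test the whole run in one call (untested
sketch):

	unsigned long start = entry - tbl->it_offset;

	if (find_next_bit(tbl->it_map, start + pages, start) !=
			start + pages) {
		spin_unlock(&(pool->lock));
		return -EBUSY;
	}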

> +			spin_unlock(&(pool->lock));
> +			return -EBUSY;
> +		}
> +	}
> +
> +	/* Put tces to the table */
> +	for (i = 0; (i < pages) && (ret >= 0); ++i, tce += IOMMU_PAGE_SIZE) {
> +		ret = put_tce(tbl, entry + i, tce, direction);
> +		if (ret == 1)
> +			++retpages;
> +	}
> +
> +	/*
> +	 * If failed, release locked pages, otherwise return the number
> +	 * of locked system pages
> +	 */
> +	if (ret < 0)
> +		clear_tces_nolock(tbl, entry, i);
> +	else
> +		ret = retpages;
> +
> +	tce_flush(tbl);
> +	spin_unlock(&(pool->lock));
> +
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(iommu_put_tces);
> +
> +#endif /* CONFIG_IOMMU_API */
> diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
> index 05205cf..1b970bf 100644
> --- a/arch/powerpc/platforms/powernv/pci.c
> +++ b/arch/powerpc/platforms/powernv/pci.c
> @@ -20,6 +20,7 @@
>  #include <linux/irq.h>
>  #include <linux/io.h>
>  #include <linux/msi.h>
> +#include <linux/iommu.h>
>  
>  #include <asm/sections.h>
>  #include <asm/io.h>
> @@ -613,3 +614,136 @@ void __init pnv_pci_init(void)
>  	ppc_md.teardown_msi_irqs = pnv_teardown_msi_irqs;
>  #endif
>  }
> +
> +#ifdef CONFIG_IOMMU_API
> +/*
> + * IOMMU groups support required by VFIO
> + */
> +static int add_device(struct device *dev)
> +{
> +	struct iommu_table *tbl;
> +	int ret = 0;
> +
> +	if (WARN_ON(dev->iommu_group)) {
> +		pr_warn("tce_vfio: device %s is already in iommu group %d, skipping\n",
> +				dev_name(dev),
> +				iommu_group_id(dev->iommu_group));
> +		return -EBUSY;
> +	}
> +
> +	tbl = get_iommu_table_base(dev);
> +	if (!tbl) {
> +		pr_debug("tce_vfio: skipping device %s with no tbl\n",
> +				dev_name(dev));
> +		return 0;
> +	}
> +
> +	pr_debug("tce_vfio: adding %s to iommu group %d\n",
> +			dev_name(dev), iommu_group_id(tbl->it_group));
> +
> +	ret = iommu_group_add_device(tbl->it_group, dev);
> +	if (ret < 0)
> +		pr_err("tce_vfio: %s has not been added, ret=%d\n",
> +				dev_name(dev), ret);
> +
> +	return ret;
> +}
> +
> +static void del_device(struct device *dev)
> +{
> +	iommu_group_remove_device(dev);
> +}
> +
> +static int iommu_bus_notifier(struct notifier_block *nb,
> +			      unsigned long action, void *data)
> +{
> +	struct device *dev = data;
> +
> +	switch (action) {
> +	case BUS_NOTIFY_ADD_DEVICE:
> +		return add_device(dev);
> +	case BUS_NOTIFY_DEL_DEVICE:
> +		del_device(dev);
> +		return 0;
> +	default:
> +		return 0;
> +	}
> +}
> +
> +static struct notifier_block tce_iommu_bus_nb = {
> +	.notifier_call = iommu_bus_notifier,
> +};
> +
> +static void group_release(void *iommu_data)
> +{
> +	struct iommu_table *tbl = iommu_data;
> +	tbl->it_group = NULL;
> +}
> +
> +static int __init tce_iommu_init(void)
> +{
> +	struct pci_dev *pdev = NULL;
> +	struct iommu_table *tbl;
> +	struct iommu_group *grp;
> +
> +	/* Allocate and initialize IOMMU groups */
> +	for_each_pci_dev(pdev) {
> +		tbl = get_iommu_table_base(&pdev->dev);
> +		if (!tbl)
> +			continue;
> +
> +		/* Skip already initialized */
> +		if (tbl->it_group)
> +			continue;
> +
> +		grp = iommu_group_alloc();
> +		if (IS_ERR(grp)) {
> +			pr_info("tce_vfio: cannot create new IOMMU group, ret=%ld\n",
> +					PTR_ERR(grp));
> +			return PTR_ERR(grp);
> +		}
> +		tbl->it_group = grp;
> +		iommu_group_set_iommudata(grp, tbl, group_release);

BTW, groups have a name property that shows up in sysfs that can be set
with iommu_group_set_name().  IIRC, this was a feature David requested
for PEs.  It'd be nice if it was used for PEs...  Thanks,

Alex
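
For illustration, setting such a name could be as simple as the following
(the "PE#%d" format and pe_number are made up here; the real PE number
would have to come from the PHB code):

	char name[16];

	snprintf(name, sizeof(name), "PE#%d", pe_number);
	iommu_group_set_name(grp, name);	/* the string is duplicated */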

> +	}
> +
> +	bus_register_notifier(&pci_bus_type, &tce_iommu_bus_nb);
> +
> +	/* Add PCI devices to VFIO groups */
> +	for_each_pci_dev(pdev)
> +		add_device(&pdev->dev);
> +
> +	return 0;
> +}
> +
> +static void __exit tce_iommu_cleanup(void)
> +{
> +	struct pci_dev *pdev = NULL;
> +	struct iommu_table *tbl;
> +	struct iommu_group *grp = NULL;
> +
> +	bus_unregister_notifier(&pci_bus_type, &tce_iommu_bus_nb);
> +
> +	/* Delete PCI devices from VFIO groups */
> +	for_each_pci_dev(pdev)
> +		del_device(&pdev->dev);
> +
> +	/* Release VFIO groups */
> +	for_each_pci_dev(pdev) {
> +		tbl = get_iommu_table_base(&pdev->dev);
> +		if (!tbl)
> +			continue;
> +		grp = tbl->it_group;
> +
> +		/* Skip (already) uninitialized */
> +		if (!grp)
> +			continue;
> +
> +		/* Do actual release, group_release() is expected to work */
> +		iommu_group_put(grp);
> +		BUG_ON(tbl->it_group);
> +	}
> +}
> +
> +module_init(tce_iommu_init);
> +module_exit(tce_iommu_cleanup);
> +#endif /* CONFIG_IOMMU_API */
> diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
> index 9f69b56..29d11dc 100644
> --- a/drivers/iommu/Kconfig
> +++ b/drivers/iommu/Kconfig
> @@ -187,4 +187,12 @@ config EXYNOS_IOMMU_DEBUG
>  
>  	  Say N unless you need kernel log message for IOMMU debugging
>  
> +config SPAPR_TCE_IOMMU
> +	bool "sPAPR TCE IOMMU Support"
> +	depends on PPC_POWERNV
> +	select IOMMU_API
> +	help
> +	  Enables bits of the IOMMU API required by VFIO. iommu_ops is
> +	  not implemented yet.
> +
>  endif # IOMMU_SUPPORT




^ permalink raw reply	[flat|nested] 122+ messages in thread

* Re: [PATCH] vfio powerpc: enabled on powernv platform
  2012-12-07 17:38                     ` Alex Williamson
@ 2012-12-12  6:14                       ` Alexey Kardashevskiy
  -1 siblings, 0 replies; 122+ messages in thread
From: Alexey Kardashevskiy @ 2012-12-12  6:14 UTC (permalink / raw)
  To: Alex Williamson
  Cc: Benjamin Herrenschmidt, Paul Mackerras, linuxppc-dev,
	linux-kernel, kvm, David Gibson

On 08/12/12 04:38, Alex Williamson wrote:
>> +static int __init tce_iommu_init(void)
>> +{
>> +	struct pci_dev *pdev = NULL;
>> +	struct iommu_table *tbl;
>> +	struct iommu_group *grp;
>> +
>> +	/* Allocate and initialize IOMMU groups */
>> +	for_each_pci_dev(pdev) {
>> +		tbl = get_iommu_table_base(&pdev->dev);
>> +		if (!tbl)
>> +			continue;
>> +
>> +		/* Skip already initialized */
>> +		if (tbl->it_group)
>> +			continue;
>> +
>> +		grp = iommu_group_alloc();
>> +		if (IS_ERR(grp)) {
>> +			pr_info("tce_vfio: cannot create new IOMMU group, ret=%ld\n",
>> +					PTR_ERR(grp));
>> +			return PTR_ERR(grp);
>> +		}
>> +		tbl->it_group = grp;
>> +		iommu_group_set_iommudata(grp, tbl, group_release);
>
> BTW, groups have a name property that shows up in sysfs that can be set
> with iommu_group_set_name().  IIRC, this was a feature David requested
> for PEs.  It'd be nice if it was used for PEs...  Thanks,



But what would I put there?... The IOMMU ID is more than enough at the
moment, and struct iommu_table does not have anything that would make
sense to show in sysfs...


-- 
Alexey

^ permalink raw reply	[flat|nested] 122+ messages in thread

* Re: [PATCH] vfio powerpc: implemented IOMMU driver for VFIO
  2012-12-07 17:01                 ` Alex Williamson
@ 2012-12-12  6:59                   ` Alexey Kardashevskiy
  -1 siblings, 0 replies; 122+ messages in thread
From: Alexey Kardashevskiy @ 2012-12-12  6:59 UTC (permalink / raw)
  To: Alex Williamson
  Cc: Benjamin Herrenschmidt, Paul Mackerras, linuxppc-dev,
	linux-kernel, kvm, David Gibson

On 08/12/12 04:01, Alex Williamson wrote:
>> +	case VFIO_IOMMU_MAP_DMA: {
>> +		vfio_iommu_spapr_tce_dma_map param;
>> +		struct iommu_table *tbl = container->tbl;
>> +		enum dma_data_direction direction;
>> +		unsigned long locked, lock_limit;
>> +
>> +		if (WARN_ON(!tbl))
>> +			return -ENXIO;
>> +
>> +		minsz = offsetofend(vfio_iommu_spapr_tce_dma_map, size);
>> +
>> +		if (copy_from_user(&param, (void __user *)arg, minsz))
>> +			return -EFAULT;
>> +
>> +		if (param.argsz < minsz)
>> +			return -EINVAL;
>> +
>> +		if ((param.flags & VFIO_DMA_MAP_FLAG_READ) &&
>> +				(param.flags & VFIO_DMA_MAP_FLAG_WRITE))
>> +			direction = DMA_BIDIRECTIONAL;
>> +		else if (param.flags & VFIO_DMA_MAP_FLAG_READ)
>> +			direction = DMA_TO_DEVICE;
>> +		else if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
>> +			direction = DMA_FROM_DEVICE;
>> +		else
>> +			return -EINVAL;
>
> flags needs to be sanitized too.  Return EINVAL if any unknown bit is
> set or else sloppy users may make it very difficult to make use of those
> flag bits later.


It already returns -EINVAL on any bit set except READ/WRITE, no?


-- 
Alexey

^ permalink raw reply	[flat|nested] 122+ messages in thread

* [PATCH] vfio powerpc: enabled on powernv platform
  2012-12-07 17:38                     ` Alex Williamson
@ 2012-12-12 12:34                       ` Alexey Kardashevskiy
  -1 siblings, 0 replies; 122+ messages in thread
From: Alexey Kardashevskiy @ 2012-12-12 12:34 UTC (permalink / raw)
  To: Alex Williamson
  Cc: Alexey Kardashevskiy, Benjamin Herrenschmidt, Paul Mackerras,
	linuxppc-dev, linux-kernel, kvm, David Gibson

This patch initializes IOMMU groups based on the IOMMU
configuration discovered during the PCI scan on POWERNV
(POWER non virtualized) platform. The IOMMU groups are
to be used later by VFIO driver (PCI pass through).

It also implements an API for mapping/unmapping pages for
guest PCI drivers and providing DMA window properties.
This API is going to be used later by QEMU-VFIO to handle
h_put_tce hypercalls from the KVM guest.

Although this driver has been tested only on the POWERNV
platform, it should work on any platform which supports
TCE tables.

To enable VFIO on POWER, enable SPAPR_TCE_IOMMU config
option and configure VFIO as required.

Cc: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
 arch/powerpc/include/asm/iommu.h     |   10 ++
 arch/powerpc/kernel/iommu.c          |  329 ++++++++++++++++++++++++++++++++++
 arch/powerpc/platforms/powernv/pci.c |  134 ++++++++++++++
 drivers/iommu/Kconfig                |    8 +
 4 files changed, 481 insertions(+)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index cbfe678..3c861ae 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -76,6 +76,9 @@ struct iommu_table {
 	struct iommu_pool large_pool;
 	struct iommu_pool pools[IOMMU_NR_POOLS];
 	unsigned long *it_map;       /* A simple allocation bitmap for now */
+#ifdef CONFIG_IOMMU_API
+	struct iommu_group *it_group;
+#endif
 };
 
 struct scatterlist;
@@ -147,5 +150,12 @@ static inline void iommu_restore(void)
 }
 #endif
 
+extern void iommu_reset_table(struct iommu_table *tbl, bool restore);
+extern long iommu_clear_tces(struct iommu_table *tbl, unsigned long ioba,
+		unsigned long size);
+extern long iommu_put_tces(struct iommu_table *tbl, unsigned long ioba,
+		uint64_t tce, enum dma_data_direction direction,
+		unsigned long size);
+
 #endif /* __KERNEL__ */
 #endif /* _ASM_IOMMU_H */
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index ff5a6ce..f3bb2e7 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -36,6 +36,7 @@
 #include <linux/hash.h>
 #include <linux/fault-inject.h>
 #include <linux/pci.h>
+#include <linux/uaccess.h>
 #include <asm/io.h>
 #include <asm/prom.h>
 #include <asm/iommu.h>
@@ -44,6 +45,7 @@
 #include <asm/kdump.h>
 #include <asm/fadump.h>
 #include <asm/vio.h>
+#include <asm/tce.h>
 
 #define DBG(...)
 
@@ -856,3 +858,330 @@ void iommu_free_coherent(struct iommu_table *tbl, size_t size,
 		free_pages((unsigned long)vaddr, get_order(size));
 	}
 }
+
+#ifdef CONFIG_IOMMU_API
+/*
+ * SPAPR TCE API
+ */
+
+struct vwork {
+	struct mm_struct	*mm;
+	long			npage;
+	struct work_struct	work;
+};
+
+/* delayed decrement/increment for locked_vm */
+static void lock_acct_bg(struct work_struct *work)
+{
+	struct vwork *vwork = container_of(work, struct vwork, work);
+	struct mm_struct *mm;
+
+	mm = vwork->mm;
+	down_write(&mm->mmap_sem);
+	mm->locked_vm += vwork->npage;
+	up_write(&mm->mmap_sem);
+	mmput(mm);
+	kfree(vwork);
+}
+
+static void lock_acct(long npage)
+{
+	struct vwork *vwork;
+	struct mm_struct *mm;
+
+	if (!current->mm)
+		return; /* process exited */
+
+	if (down_write_trylock(&current->mm->mmap_sem)) {
+		current->mm->locked_vm += npage;
+		up_write(&current->mm->mmap_sem);
+		return;
+	}
+
+	/*
+	 * Couldn't get mmap_sem lock, so must setup to update
+	 * mm->locked_vm later. If locked_vm were atomic, we
+	 * wouldn't need this silliness
+	 */
+	vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL);
+	if (!vwork)
+		return;
+	mm = get_task_mm(current);
+	if (!mm) {
+		kfree(vwork);
+		return;
+	}
+	INIT_WORK(&vwork->work, lock_acct_bg);
+	vwork->mm = mm;
+	vwork->npage = npage;
+	schedule_work(&vwork->work);
+}
+
+/*
+ * iommu_reset_table is called when the table starts/stops being used.
+ *
+ * restore==true says to bring the iommu_table back into the state it was
+ * in before being used by VFIO.
+ */
+void iommu_reset_table(struct iommu_table *tbl, bool restore)
+{
+	/* Page#0 is marked as used in iommu_init_table, so we clear it... */
+	if (!restore && (tbl->it_offset == 0))
+		clear_bit(0, tbl->it_map);
+
+	iommu_clear_tces(tbl, tbl->it_offset, tbl->it_size);
+
+	/* ... or restore  */
+	if (restore && (tbl->it_offset == 0))
+		set_bit(0, tbl->it_map);
+}
+EXPORT_SYMBOL_GPL(iommu_reset_table);
+
+/*
+ * Returns the number of used IOMMU pages (4K) within
+ * the same system page (4K or 64K).
+ *
+ * syspage_weight_zero is optimized for expected case == 0
+ * syspage_weight_one is optimized for expected case > 1
+ * Other cases are not used in this file.
+ */
+#if PAGE_SIZE == IOMMU_PAGE_SIZE
+
+#define syspage_weight_zero(map, offset)	test_bit((offset), (map))
+#define syspage_weight_one(map, offset)		test_bit((offset), (map))
+
+#elif PAGE_SIZE/IOMMU_PAGE_SIZE == 16
+
+static int syspage_weight_zero(unsigned long *map, unsigned long offset)
+{
+	offset &= PAGE_MASK >> IOMMU_PAGE_SHIFT;
+	return 0xffffUL & (map[BIT_WORD(offset)] >>
+			(offset & (BITS_PER_LONG-1)));
+}
+
+static int syspage_weight_one(unsigned long *map, unsigned long offset)
+{
+	int ret = 0, nbits = PAGE_SIZE/IOMMU_PAGE_SIZE;
+
+	/* Aligns TCE entry number to system page boundary */
+	offset &= PAGE_MASK >> IOMMU_PAGE_SHIFT;
+
+	/* Count used 4K pages */
+	while (nbits && (ret < 2)) {
+		if (test_bit(offset, map))
+			++ret;
+
+		--nbits;
+		++offset;
+	}
+
+	return ret;
+}
+#else
+#error TODO: support other page size
+#endif
+
+static void tce_flush(struct iommu_table *tbl)
+{
+	/* Flush/invalidate TLB caches if necessary */
+	if (ppc_md.tce_flush)
+		ppc_md.tce_flush(tbl);
+
+	/* Make sure updates are seen by hardware */
+	mb();
+}
+
+/*
+ * iommu_clear_tces clears tces and returned the number of system pages
+ * which it called put_page() on
+ */
+static long clear_tces_nolock(struct iommu_table *tbl, unsigned long entry,
+		unsigned long pages)
+{
+	int i, retpages = 0, clr;
+	unsigned long oldtce, oldweight;
+	struct page *page;
+
+	for (i = 0; i < pages; ++i, ++entry) {
+		if (!test_bit(entry - tbl->it_offset, tbl->it_map))
+			continue;
+
+		oldtce = ppc_md.tce_get(tbl, entry);
+		ppc_md.tce_free(tbl, entry, 1);
+
+		oldweight = syspage_weight_one(tbl->it_map,
+				entry - tbl->it_offset);
+		clr = __test_and_clear_bit(entry - tbl->it_offset,
+				tbl->it_map);
+
+		if (WARN_ON(!(oldtce & (TCE_PCI_WRITE | TCE_PCI_READ))))
+			continue;
+
+		page = pfn_to_page(oldtce >> PAGE_SHIFT);
+
+		if (WARN_ON(!page))
+			continue;
+
+		if (oldtce & TCE_PCI_WRITE)
+			SetPageDirty(page);
+
+		put_page(page);
+
+		/* That was the last IOMMU page within the system page */
+		if ((oldweight == 1) && clr)
+			++retpages;
+	}
+
+	return retpages;
+}
+
+/*
+ * iommu_clear_tces clears tces and returned the number
+ * of released system pages
+ */
+long iommu_clear_tces(struct iommu_table *tbl, unsigned long ioba,
+		unsigned long size)
+{
+	int ret;
+	unsigned long entry = ioba >> IOMMU_PAGE_SHIFT;
+	unsigned long npages = size >> IOMMU_PAGE_SHIFT;
+	struct iommu_pool *pool = get_pool(tbl, entry);
+
+	if ((size & ~IOMMU_PAGE_MASK) || (ioba & ~IOMMU_PAGE_MASK))
+		return -EINVAL;
+
+	if ((ioba + size) > ((tbl->it_offset + tbl->it_size)
+			<< IOMMU_PAGE_SHIFT))
+		return -EINVAL;
+
+	if (ioba < (tbl->it_offset << IOMMU_PAGE_SHIFT))
+		return -EINVAL;
+
+	spin_lock(&(pool->lock));
+	ret = clear_tces_nolock(tbl, entry, npages);
+	tce_flush(tbl);
+	spin_unlock(&(pool->lock));
+
+	if (ret > 0) {
+		lock_acct(-ret);
+		return 0;
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_clear_tces);
+
+static int put_tce(struct iommu_table *tbl, unsigned long entry,
+		uint64_t tce, enum dma_data_direction direction)
+{
+	int ret;
+	struct page *page = NULL;
+	unsigned long kva, offset, oldweight;
+
+	/* Map new TCE */
+	offset = tce & IOMMU_PAGE_MASK & ~PAGE_MASK;
+	ret = get_user_pages_fast(tce & PAGE_MASK, 1,
+			direction != DMA_TO_DEVICE, &page);
+	if (ret != 1) {
+		pr_err("tce_vfio: get_user_pages_fast failed tce=%llx ioba=%lx ret=%d\n",
+				tce, entry << IOMMU_PAGE_SHIFT, ret);
+		return -EFAULT;
+	}
+
+	kva = (unsigned long) page_address(page);
+	kva += offset;
+
+	/* tce_build receives a virtual address */
+	ret = ppc_md.tce_build(tbl, entry, 1, kva, direction, NULL);
+
+	/* tce_build() only returns non-zero for transient errors */
+	if (unlikely(ret)) {
+		pr_err("tce_vfio: tce_put failed on tce=%llx ioba=%lx kva=%lx ret=%d\n",
+				tce, entry << IOMMU_PAGE_SHIFT, kva, ret);
+		put_page(page);
+		return -EIO;
+	}
+
+	/* Calculate if new system page has been locked */
+	oldweight = syspage_weight_zero(tbl->it_map, entry - tbl->it_offset);
+	__set_bit(entry - tbl->it_offset, tbl->it_map);
+
+	return (oldweight == 0) ? 1 : 0;
+}
+
+/*
+ * iommu_put_tces builds tces and returned the number of actually
+ * locked system pages
+ */
+long iommu_put_tces(struct iommu_table *tbl, unsigned long ioba,
+		uint64_t tce, enum dma_data_direction direction,
+		unsigned long size)
+{
+	int i, ret = 0, retpages = 0;
+	unsigned long entry = ioba >> IOMMU_PAGE_SHIFT;
+	unsigned long npages = size >> IOMMU_PAGE_SHIFT;
+	struct iommu_pool *pool = get_pool(tbl, entry);
+	unsigned long locked, lock_limit;
+
+	BUILD_BUG_ON(PAGE_SIZE < IOMMU_PAGE_SIZE);
+	BUG_ON(direction == DMA_NONE);
+
+	if ((size & ~IOMMU_PAGE_MASK) ||
+			(ioba & ~IOMMU_PAGE_MASK) ||
+			(tce & ~IOMMU_PAGE_MASK))
+		return -EINVAL;
+
+	if ((ioba + size) > ((tbl->it_offset + tbl->it_size)
+			 << IOMMU_PAGE_SHIFT))
+		return -EINVAL;
+
+	if (ioba < (tbl->it_offset << IOMMU_PAGE_SHIFT))
+		return -EINVAL;
+
+	/* Account for locked pages */
+	locked = current->mm->locked_vm +
+		(_ALIGN_UP(size, PAGE_SIZE) >> PAGE_SHIFT);
+	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+	if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
+		pr_warn("RLIMIT_MEMLOCK (%ld) exceeded\n",
+				rlimit(RLIMIT_MEMLOCK));
+		return -ENOMEM;
+	}
+
+	spin_lock(&(pool->lock));
+
+	/* Check if any is in use */
+	for (i = 0; i < npages; ++i) {
+		if (test_bit(entry + i - tbl->it_offset, tbl->it_map)) {
+			spin_unlock(&(pool->lock));
+			return -EBUSY;
+		}
+	}
+
+	/* Put tces to the table */
+	for (i = 0; (i < npages) && (ret >= 0); ++i, tce += IOMMU_PAGE_SIZE) {
+		ret = put_tce(tbl, entry + i, tce, direction);
+		if (ret == 1)
+			++retpages;
+	}
+
+	/*
+	 * If failed, release locked pages, otherwise return the number
+	 * of locked system pages
+	 */
+	if (ret < 0) {
+		clear_tces_nolock(tbl, entry, i);
+	} else {
+		if (retpages)
+			lock_acct(retpages);
+		ret = 0;
+	}
+
+	tce_flush(tbl);
+	spin_unlock(&(pool->lock));
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_put_tces);
+
+#endif /* CONFIG_IOMMU_API */
diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index 05205cf..1b970bf 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -20,6 +20,7 @@
 #include <linux/irq.h>
 #include <linux/io.h>
 #include <linux/msi.h>
+#include <linux/iommu.h>
 
 #include <asm/sections.h>
 #include <asm/io.h>
@@ -613,3 +614,136 @@ void __init pnv_pci_init(void)
 	ppc_md.teardown_msi_irqs = pnv_teardown_msi_irqs;
 #endif
 }
+
+#ifdef CONFIG_IOMMU_API
+/*
+ * IOMMU groups support required by VFIO
+ */
+static int add_device(struct device *dev)
+{
+	struct iommu_table *tbl;
+	int ret = 0;
+
+	if (WARN_ON(dev->iommu_group)) {
+		pr_warn("tce_vfio: device %s is already in iommu group %d, skipping\n",
+				dev_name(dev),
+				iommu_group_id(dev->iommu_group));
+		return -EBUSY;
+	}
+
+	tbl = get_iommu_table_base(dev);
+	if (!tbl) {
+		pr_debug("tce_vfio: skipping device %s with no tbl\n",
+				dev_name(dev));
+		return 0;
+	}
+
+	pr_debug("tce_vfio: adding %s to iommu group %d\n",
+			dev_name(dev), iommu_group_id(tbl->it_group));
+
+	ret = iommu_group_add_device(tbl->it_group, dev);
+	if (ret < 0)
+		pr_err("tce_vfio: %s has not been added, ret=%d\n",
+				dev_name(dev), ret);
+
+	return ret;
+}
+
+static void del_device(struct device *dev)
+{
+	iommu_group_remove_device(dev);
+}
+
+static int iommu_bus_notifier(struct notifier_block *nb,
+			      unsigned long action, void *data)
+{
+	struct device *dev = data;
+
+	switch (action) {
+	case BUS_NOTIFY_ADD_DEVICE:
+		return add_device(dev);
+	case BUS_NOTIFY_DEL_DEVICE:
+		del_device(dev);
+		return 0;
+	default:
+		return 0;
+	}
+}
+
+static struct notifier_block tce_iommu_bus_nb = {
+	.notifier_call = iommu_bus_notifier,
+};
+
+static void group_release(void *iommu_data)
+{
+	struct iommu_table *tbl = iommu_data;
+	tbl->it_group = NULL;
+}
+
+static int __init tce_iommu_init(void)
+{
+	struct pci_dev *pdev = NULL;
+	struct iommu_table *tbl;
+	struct iommu_group *grp;
+
+	/* Allocate and initialize IOMMU groups */
+	for_each_pci_dev(pdev) {
+		tbl = get_iommu_table_base(&pdev->dev);
+		if (!tbl)
+			continue;
+
+		/* Skip already initialized */
+		if (tbl->it_group)
+			continue;
+
+		grp = iommu_group_alloc();
+		if (IS_ERR(grp)) {
+			pr_info("tce_vfio: cannot create new IOMMU group, ret=%ld\n",
+					PTR_ERR(grp));
+			return PTR_ERR(grp);
+		}
+		tbl->it_group = grp;
+		iommu_group_set_iommudata(grp, tbl, group_release);
+	}
+
+	bus_register_notifier(&pci_bus_type, &tce_iommu_bus_nb);
+
+	/* Add PCI devices to VFIO groups */
+	for_each_pci_dev(pdev)
+		add_device(&pdev->dev);
+
+	return 0;
+}
+
+static void __exit tce_iommu_cleanup(void)
+{
+	struct pci_dev *pdev = NULL;
+	struct iommu_table *tbl;
+	struct iommu_group *grp = NULL;
+
+	bus_unregister_notifier(&pci_bus_type, &tce_iommu_bus_nb);
+
+	/* Delete PCI devices from VFIO groups */
+	for_each_pci_dev(pdev)
+		del_device(&pdev->dev);
+
+	/* Release VFIO groups */
+	for_each_pci_dev(pdev) {
+		tbl = get_iommu_table_base(&pdev->dev);
+		if (!tbl)
+			continue;
+		grp = tbl->it_group;
+
+		/* Skip (already) uninitialized */
+		if (!grp)
+			continue;
+
+		/* Do actual release, group_release() is expected to work */
+		iommu_group_put(grp);
+		BUG_ON(tbl->it_group);
+	}
+}
+
+module_init(tce_iommu_init);
+module_exit(tce_iommu_cleanup);
+#endif /* CONFIG_IOMMU_API */
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index 9f69b56..29d11dc 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -187,4 +187,12 @@ config EXYNOS_IOMMU_DEBUG
 
 	  Say N unless you need kernel log message for IOMMU debugging
 
+config SPAPR_TCE_IOMMU
+	bool "sPAPR TCE IOMMU Support"
+	depends on PPC_POWERNV
+	select IOMMU_API
+	help
+	  Enables bits of the IOMMU API required by VFIO. iommu_ops is
+	  not implemented yet.
+
 endif # IOMMU_SUPPORT
-- 
1.7.10.4


^ permalink raw reply related	[flat|nested] 122+ messages in thread

* [PATCH] vfio powerpc: enabled on powernv platform
@ 2012-12-12 12:34                       ` Alexey Kardashevskiy
  0 siblings, 0 replies; 122+ messages in thread
From: Alexey Kardashevskiy @ 2012-12-12 12:34 UTC (permalink / raw)
  To: Alex Williamson
  Cc: kvm, Alexey Kardashevskiy, linux-kernel, Paul Mackerras,
	linuxppc-dev, David Gibson

This patch initializes IOMMU groups based on the IOMMU
configuration discovered during the PCI scan on POWERNV
(POWER non virtualized) platform. The IOMMU groups are
to be used later by VFIO driver (PCI pass through).

It also implements an API for mapping/unmapping pages for
guest PCI drivers and providing DMA window properties.
This API is going to be used later by QEMU-VFIO to handle
h_put_tce hypercalls from the KVM guest.

Although this driver has been tested only on the POWERNV
platform, it should work on any platform which supports
TCE tables.

To enable VFIO on POWER, enable SPAPR_TCE_IOMMU config
option and configure VFIO as required.

Cc: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
 arch/powerpc/include/asm/iommu.h     |   10 ++
 arch/powerpc/kernel/iommu.c          |  329 ++++++++++++++++++++++++++++++++++
 arch/powerpc/platforms/powernv/pci.c |  134 ++++++++++++++
 drivers/iommu/Kconfig                |    8 +
 4 files changed, 481 insertions(+)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index cbfe678..3c861ae 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -76,6 +76,9 @@ struct iommu_table {
 	struct iommu_pool large_pool;
 	struct iommu_pool pools[IOMMU_NR_POOLS];
 	unsigned long *it_map;       /* A simple allocation bitmap for now */
+#ifdef CONFIG_IOMMU_API
+	struct iommu_group *it_group;
+#endif
 };
 
 struct scatterlist;
@@ -147,5 +150,12 @@ static inline void iommu_restore(void)
 }
 #endif
 
+extern void iommu_reset_table(struct iommu_table *tbl, bool restore);
+extern long iommu_clear_tces(struct iommu_table *tbl, unsigned long ioba,
+		unsigned long size);
+extern long iommu_put_tces(struct iommu_table *tbl, unsigned long ioba,
+		uint64_t tce, enum dma_data_direction direction,
+		unsigned long size);
+
 #endif /* __KERNEL__ */
 #endif /* _ASM_IOMMU_H */
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index ff5a6ce..f3bb2e7 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -36,6 +36,7 @@
 #include <linux/hash.h>
 #include <linux/fault-inject.h>
 #include <linux/pci.h>
+#include <linux/uaccess.h>
 #include <asm/io.h>
 #include <asm/prom.h>
 #include <asm/iommu.h>
@@ -44,6 +45,7 @@
 #include <asm/kdump.h>
 #include <asm/fadump.h>
 #include <asm/vio.h>
+#include <asm/tce.h>
 
 #define DBG(...)
 
@@ -856,3 +858,330 @@ void iommu_free_coherent(struct iommu_table *tbl, size_t size,
 		free_pages((unsigned long)vaddr, get_order(size));
 	}
 }
+
+#ifdef CONFIG_IOMMU_API
+/*
+ * SPAPR TCE API
+ */
+
+struct vwork {
+	struct mm_struct	*mm;
+	long			npage;
+	struct work_struct	work;
+};
+
+/* delayed decrement/increment for locked_vm */
+static void lock_acct_bg(struct work_struct *work)
+{
+	struct vwork *vwork = container_of(work, struct vwork, work);
+	struct mm_struct *mm;
+
+	mm = vwork->mm;
+	down_write(&mm->mmap_sem);
+	mm->locked_vm += vwork->npage;
+	up_write(&mm->mmap_sem);
+	mmput(mm);
+	kfree(vwork);
+}
+
+static void lock_acct(long npage)
+{
+	struct vwork *vwork;
+	struct mm_struct *mm;
+
+	if (!current->mm)
+		return; /* process exited */
+
+	if (down_write_trylock(&current->mm->mmap_sem)) {
+		current->mm->locked_vm += npage;
+		up_write(&current->mm->mmap_sem);
+		return;
+	}
+
+	/*
+	 * Couldn't get mmap_sem lock, so must setup to update
+	 * mm->locked_vm later. If locked_vm were atomic, we
+	 * wouldn't need this silliness
+	 */
+	vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL);
+	if (!vwork)
+		return;
+	mm = get_task_mm(current);
+	if (!mm) {
+		kfree(vwork);
+		return;
+	}
+	INIT_WORK(&vwork->work, lock_acct_bg);
+	vwork->mm = mm;
+	vwork->npage = npage;
+	schedule_work(&vwork->work);
+}
+
+/*
+ * iommu_reset_table is called when it started/stopped being used.
+ *
+ * restore==true says to bring the iommu_table into the state as it was
+ * before being used by VFIO.
+ */
+void iommu_reset_table(struct iommu_table *tbl, bool restore)
+{
+	/* Page#0 is marked as used in iommu_init_table, so we clear it... */
+	if (!restore && (tbl->it_offset == 0))
+		clear_bit(0, tbl->it_map);
+
+	iommu_clear_tces(tbl, tbl->it_offset, tbl->it_size);
+
+	/* ... or restore  */
+	if (restore && (tbl->it_offset == 0))
+		set_bit(0, tbl->it_map);
+}
+EXPORT_SYMBOL_GPL(iommu_reset_table);
+
+/*
+ * Returns the number of used IOMMU pages (4K) within
+ * the same system page (4K or 64K).
+ *
+ * syspage_weight_zero is optimized for expected case == 0
+ * syspage_weight_one is optimized for expected case > 1
+ * Other cases are not used in this file.
+ */
+#if PAGE_SIZE == IOMMU_PAGE_SIZE
+
+#define syspage_weight_zero(map, offset)	test_bit((offset), (map))
+#define syspage_weight_one(map, offset)		test_bit((offset), (map))
+
+#elif PAGE_SIZE/IOMMU_PAGE_SIZE == 16
+
+static int syspage_weight_zero(unsigned long *map, unsigned long offset)
+{
+	offset &= PAGE_MASK >> IOMMU_PAGE_SHIFT;
+	return 0xffffUL & (map[BIT_WORD(offset)] >>
+			(offset & (BITS_PER_LONG-1)));
+}
+
+static int syspage_weight_one(unsigned long *map, unsigned long offset)
+{
+	int ret = 0, nbits = PAGE_SIZE/IOMMU_PAGE_SIZE;
+
+	/* Aligns TCE entry number to system page boundary */
+	offset &= PAGE_MASK >> IOMMU_PAGE_SHIFT;
+
+	/* Count used 4K pages */
+	while (nbits && (ret < 2)) {
+		if (test_bit(offset, map))
+			++ret;
+
+		--nbits;
+		++offset;
+	}
+
+	return ret;
+}
+#else
+#error TODO: support other page size
+#endif
+
+static void tce_flush(struct iommu_table *tbl)
+{
+	/* Flush/invalidate TLB caches if necessary */
+	if (ppc_md.tce_flush)
+		ppc_md.tce_flush(tbl);
+
+	/* Make sure updates are seen by hardware */
+	mb();
+}
+
+/*
+ * clear_tces_nolock clears TCEs and returns the number of system pages
+ * which it called put_page() on
+ */
+static long clear_tces_nolock(struct iommu_table *tbl, unsigned long entry,
+		unsigned long pages)
+{
+	int i, retpages = 0, clr;
+	unsigned long oldtce, oldweight;
+	struct page *page;
+
+	for (i = 0; i < pages; ++i, ++entry) {
+		if (!test_bit(entry - tbl->it_offset, tbl->it_map))
+			continue;
+
+		oldtce = ppc_md.tce_get(tbl, entry);
+		ppc_md.tce_free(tbl, entry, 1);
+
+		oldweight = syspage_weight_one(tbl->it_map,
+				entry - tbl->it_offset);
+		clr = __test_and_clear_bit(entry - tbl->it_offset,
+				tbl->it_map);
+
+		if (WARN_ON(!(oldtce & (TCE_PCI_WRITE | TCE_PCI_READ))))
+			continue;
+
+		page = pfn_to_page(oldtce >> PAGE_SHIFT);
+
+		if (WARN_ON(!page))
+			continue;
+
+		if (oldtce & TCE_PCI_WRITE)
+			SetPageDirty(page);
+
+		put_page(page);
+
+		/* That was the last IOMMU page within the system page */
+		if ((oldweight == 1) && clr)
+			++retpages;
+	}
+
+	return retpages;
+}
+
+/*
+ * iommu_clear_tces clears TCEs, releases the pinned system pages and
+ * updates the locked_vm accounting; returns 0 on success
+ */
+long iommu_clear_tces(struct iommu_table *tbl, unsigned long ioba,
+		unsigned long size)
+{
+	int ret;
+	unsigned long entry = ioba >> IOMMU_PAGE_SHIFT;
+	unsigned long npages = size >> IOMMU_PAGE_SHIFT;
+	struct iommu_pool *pool = get_pool(tbl, entry);
+
+	if ((size & ~IOMMU_PAGE_MASK) || (ioba & ~IOMMU_PAGE_MASK))
+		return -EINVAL;
+
+	if ((ioba + size) > ((tbl->it_offset + tbl->it_size)
+			<< IOMMU_PAGE_SHIFT))
+		return -EINVAL;
+
+	if (ioba < (tbl->it_offset << IOMMU_PAGE_SHIFT))
+		return -EINVAL;
+
+	spin_lock(&(pool->lock));
+	ret = clear_tces_nolock(tbl, entry, npages);
+	tce_flush(tbl);
+	spin_unlock(&(pool->lock));
+
+	if (ret > 0) {
+		lock_acct(-ret);
+		return 0;
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_clear_tces);
+
+static int put_tce(struct iommu_table *tbl, unsigned long entry,
+		uint64_t tce, enum dma_data_direction direction)
+{
+	int ret;
+	struct page *page = NULL;
+	unsigned long kva, offset, oldweight;
+
+	/* Map new TCE */
+	offset = tce & IOMMU_PAGE_MASK & ~PAGE_MASK;
+	ret = get_user_pages_fast(tce & PAGE_MASK, 1,
+			direction != DMA_TO_DEVICE, &page);
+	if (ret != 1) {
+		pr_err("tce_vfio: get_user_pages_fast failed tce=%llx ioba=%lx ret=%d\n",
+				tce, entry << IOMMU_PAGE_SHIFT, ret);
+		return -EFAULT;
+	}
+
+	kva = (unsigned long) page_address(page);
+	kva += offset;
+
+	/* tce_build receives a virtual address */
+	ret = ppc_md.tce_build(tbl, entry, 1, kva, direction, NULL);
+
+	/* tce_build() only returns non-zero for transient errors */
+	if (unlikely(ret)) {
+		pr_err("tce_vfio: tce_put failed on tce=%llx ioba=%lx kva=%lx ret=%d\n",
+				tce, entry << IOMMU_PAGE_SHIFT, kva, ret);
+		put_page(page);
+		return -EIO;
+	}
+
+	/* Check if a new system page has been locked */
+	oldweight = syspage_weight_zero(tbl->it_map, entry - tbl->it_offset);
+	__set_bit(entry - tbl->it_offset, tbl->it_map);
+
+	return (oldweight == 0) ? 1 : 0;
+}
+
+/*
+ * iommu_put_tces builds TCEs, pins the backing user pages and accounts
+ * the newly locked system pages; returns 0 on success
+ */
+long iommu_put_tces(struct iommu_table *tbl, unsigned long ioba,
+		uint64_t tce, enum dma_data_direction direction,
+		unsigned long size)
+{
+	int i, ret = 0, retpages = 0;
+	unsigned long entry = ioba >> IOMMU_PAGE_SHIFT;
+	unsigned long npages = size >> IOMMU_PAGE_SHIFT;
+	struct iommu_pool *pool = get_pool(tbl, entry);
+	unsigned long locked, lock_limit;
+
+	BUILD_BUG_ON(PAGE_SIZE < IOMMU_PAGE_SIZE);
+	BUG_ON(direction == DMA_NONE);
+
+	if ((size & ~IOMMU_PAGE_MASK) ||
+			(ioba & ~IOMMU_PAGE_MASK) ||
+			(tce & ~IOMMU_PAGE_MASK))
+		return -EINVAL;
+
+	if ((ioba + size) > ((tbl->it_offset + tbl->it_size)
+			 << IOMMU_PAGE_SHIFT))
+		return -EINVAL;
+
+	if (ioba < (tbl->it_offset << IOMMU_PAGE_SHIFT))
+		return -EINVAL;
+
+	/* Account for locked pages */
+	locked = current->mm->locked_vm +
+		(_ALIGN_UP(size, PAGE_SIZE) >> PAGE_SHIFT);
+	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+	if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
+		pr_warn("RLIMIT_MEMLOCK (%ld) exceeded\n",
+				rlimit(RLIMIT_MEMLOCK));
+		return -ENOMEM;
+	}
+
+	spin_lock(&(pool->lock));
+
+	/* Check if any entry is in use */
+	for (i = 0; i < npages; ++i) {
+		if (test_bit(entry + i - tbl->it_offset, tbl->it_map)) {
+			spin_unlock(&(pool->lock));
+			return -EBUSY;
+		}
+	}
+
+	/* Put tces to the table */
+	for (i = 0; (i < npages) && (ret >= 0); ++i, tce += IOMMU_PAGE_SIZE) {
+		ret = put_tce(tbl, entry + i, tce, direction);
+		if (ret == 1)
+			++retpages;
+	}
+
+	/*
+	 * If we failed, release the pinned pages; otherwise account
+	 * the newly locked system pages
+	 */
+	if (ret < 0) {
+		clear_tces_nolock(tbl, entry, i);
+	} else {
+		if (retpages)
+			lock_acct(retpages);
+		ret = 0;
+	}
+
+	tce_flush(tbl);
+	spin_unlock(&(pool->lock));
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_put_tces);
+
+#endif /* CONFIG_IOMMU_API */
diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index 05205cf..1b970bf 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -20,6 +20,7 @@
 #include <linux/irq.h>
 #include <linux/io.h>
 #include <linux/msi.h>
+#include <linux/iommu.h>
 
 #include <asm/sections.h>
 #include <asm/io.h>
@@ -613,3 +614,136 @@ void __init pnv_pci_init(void)
 	ppc_md.teardown_msi_irqs = pnv_teardown_msi_irqs;
 #endif
 }
+
+#ifdef CONFIG_IOMMU_API
+/*
+ * IOMMU groups support required by VFIO
+ */
+static int add_device(struct device *dev)
+{
+	struct iommu_table *tbl;
+	int ret = 0;
+
+	if (WARN_ON(dev->iommu_group)) {
+		pr_warn("tce_vfio: device %s is already in iommu group %d, skipping\n",
+				dev_name(dev),
+				iommu_group_id(dev->iommu_group));
+		return -EBUSY;
+	}
+
+	tbl = get_iommu_table_base(dev);
+	if (!tbl) {
+		pr_debug("tce_vfio: skipping device %s with no tbl\n",
+				dev_name(dev));
+		return 0;
+	}
+
+	pr_debug("tce_vfio: adding %s to iommu group %d\n",
+			dev_name(dev), iommu_group_id(tbl->it_group));
+
+	ret = iommu_group_add_device(tbl->it_group, dev);
+	if (ret < 0)
+		pr_err("tce_vfio: %s has not been added, ret=%d\n",
+				dev_name(dev), ret);
+
+	return ret;
+}
+
+static void del_device(struct device *dev)
+{
+	iommu_group_remove_device(dev);
+}
+
+static int iommu_bus_notifier(struct notifier_block *nb,
+			      unsigned long action, void *data)
+{
+	struct device *dev = data;
+
+	switch (action) {
+	case BUS_NOTIFY_ADD_DEVICE:
+		return add_device(dev);
+	case BUS_NOTIFY_DEL_DEVICE:
+		del_device(dev);
+		return 0;
+	default:
+		return 0;
+	}
+}
+
+static struct notifier_block tce_iommu_bus_nb = {
+	.notifier_call = iommu_bus_notifier,
+};
+
+static void group_release(void *iommu_data)
+{
+	struct iommu_table *tbl = iommu_data;
+	tbl->it_group = NULL;
+}
+
+static int __init tce_iommu_init(void)
+{
+	struct pci_dev *pdev = NULL;
+	struct iommu_table *tbl;
+	struct iommu_group *grp;
+
+	/* Allocate and initialize IOMMU groups */
+	for_each_pci_dev(pdev) {
+		tbl = get_iommu_table_base(&pdev->dev);
+		if (!tbl)
+			continue;
+
+		/* Skip already initialized */
+		if (tbl->it_group)
+			continue;
+
+		grp = iommu_group_alloc();
+		if (IS_ERR(grp)) {
+			pr_info("tce_vfio: cannot create new IOMMU group, ret=%ld\n",
+					PTR_ERR(grp));
+			return PTR_ERR(grp);
+		}
+		tbl->it_group = grp;
+		iommu_group_set_iommudata(grp, tbl, group_release);
+	}
+
+	bus_register_notifier(&pci_bus_type, &tce_iommu_bus_nb);
+
+	/* Add PCI devices to VFIO groups */
+	for_each_pci_dev(pdev)
+		add_device(&pdev->dev);
+
+	return 0;
+}
+
+static void __exit tce_iommu_cleanup(void)
+{
+	struct pci_dev *pdev = NULL;
+	struct iommu_table *tbl;
+	struct iommu_group *grp = NULL;
+
+	bus_unregister_notifier(&pci_bus_type, &tce_iommu_bus_nb);
+
+	/* Delete PCI devices from VFIO groups */
+	for_each_pci_dev(pdev)
+		del_device(&pdev->dev);
+
+	/* Release VFIO groups */
+	for_each_pci_dev(pdev) {
+		tbl = get_iommu_table_base(&pdev->dev);
+		if (!tbl)
+			continue;
+		grp = tbl->it_group;
+
+		/* Skip (already) uninitialized */
+		if (!grp)
+			continue;
+
+		/* Do actual release, group_release() is expected to work */
+		iommu_group_put(grp);
+		BUG_ON(tbl->it_group);
+	}
+}
+
+module_init(tce_iommu_init);
+module_exit(tce_iommu_cleanup);
+#endif /* CONFIG_IOMMU_API */
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index 9f69b56..29d11dc 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -187,4 +187,12 @@ config EXYNOS_IOMMU_DEBUG
 
 	  Say N unless you need kernel log message for IOMMU debugging
 
+config SPAPR_TCE_IOMMU
+	bool "sPAPR TCE IOMMU Support"
+	depends on PPC_POWERNV
+	select IOMMU_API
+	help
+	  Enables the bits of the IOMMU API required by VFIO. The iommu_ops
+	  callbacks are not implemented yet.
+
 endif # IOMMU_SUPPORT
-- 
1.7.10.4
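
For illustration, a rough sketch of how a caller could drive the map/unmap
API above (an initialized iommu_table *tbl is assumed; user_addr, which
must be 4K aligned, and the single-page size are made up, not from the
patch):

	/* Map one 4K IOMMU page read/write at the bottom of the DMA window */
	unsigned long ioba = tbl->it_offset << IOMMU_PAGE_SHIFT;
	long ret = iommu_put_tces(tbl, ioba, (uint64_t)user_addr,
			DMA_BIDIRECTIONAL, IOMMU_PAGE_SIZE);

	/* ... and undo the mapping again; both calls return 0 on success */
	if (!ret)
		ret = iommu_clear_tces(tbl, ioba, IOMMU_PAGE_SIZE);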

^ permalink raw reply related	[flat|nested] 122+ messages in thread

* [PATCH] vfio powerpc: implemented IOMMU driver for VFIO
  2012-12-07 17:01                 ` Alex Williamson
@ 2012-12-12 12:35                   ` Alexey Kardashevskiy
  -1 siblings, 0 replies; 122+ messages in thread
From: Alexey Kardashevskiy @ 2012-12-12 12:35 UTC (permalink / raw)
  To: Alex Williamson
  Cc: Alexey Kardashevskiy, Benjamin Herrenschmidt, Paul Mackerras,
	linuxppc-dev, linux-kernel, kvm, David Gibson

VFIO implements platform independent stuff such as
a PCI driver, BAR access (via read/write on a file descriptor
or direct mapping when possible) and IRQ signaling.

The platform dependent part includes IOMMU initialization
and handling. This patch implements an IOMMU driver for VFIO
which maps and unmaps pages for guest IO and provides
information about the DMA window (required by a POWERPC
guest).

The counterpart in QEMU is required to support this functionality.

Cc: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
 drivers/vfio/Kconfig                |    6 +
 drivers/vfio/Makefile               |    1 +
 drivers/vfio/vfio_iommu_spapr_tce.c |  249 +++++++++++++++++++++++++++++++++++
 include/linux/vfio.h                |   31 +++++
 4 files changed, 287 insertions(+)
 create mode 100644 drivers/vfio/vfio_iommu_spapr_tce.c

diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
index 7cd5dec..b464687 100644
--- a/drivers/vfio/Kconfig
+++ b/drivers/vfio/Kconfig
@@ -3,10 +3,16 @@ config VFIO_IOMMU_TYPE1
 	depends on VFIO
 	default n
 
+config VFIO_IOMMU_SPAPR_TCE
+	tristate
+	depends on VFIO && SPAPR_TCE_IOMMU
+	default n
+
 menuconfig VFIO
 	tristate "VFIO Non-Privileged userspace driver framework"
 	depends on IOMMU_API
 	select VFIO_IOMMU_TYPE1 if X86
+	select VFIO_IOMMU_SPAPR_TCE if PPC_POWERNV
 	help
 	  VFIO provides a framework for secure userspace device drivers.
 	  See Documentation/vfio.txt for more details.
diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
index 2398d4a..72bfabc 100644
--- a/drivers/vfio/Makefile
+++ b/drivers/vfio/Makefile
@@ -1,3 +1,4 @@
 obj-$(CONFIG_VFIO) += vfio.o
 obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o
+obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o
 obj-$(CONFIG_VFIO_PCI) += pci/
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
new file mode 100644
index 0000000..714bf57
--- /dev/null
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -0,0 +1,249 @@
+/*
+ * VFIO: IOMMU DMA mapping support for TCE on POWER
+ *
+ * Copyright (C) 2012 IBM Corp.  All rights reserved.
+ *     Author: Alexey Kardashevskiy <aik@ozlabs.ru>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Derived from original vfio_iommu_type1.c:
+ * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
+ *     Author: Alex Williamson <alex.williamson@redhat.com>
+ */
+
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/err.h>
+#include <linux/vfio.h>
+#include <asm/iommu.h>
+
+#define DRIVER_VERSION  "0.1"
+#define DRIVER_AUTHOR   "aik@ozlabs.ru"
+#define DRIVER_DESC     "VFIO IOMMU SPAPR TCE"
+
+static void tce_iommu_detach_group(void *iommu_data,
+		struct iommu_group *iommu_group);
+
+/*
+ * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
+ *
+ * This code handles mapping and unmapping of user data buffers
+ * into DMA'ble space using the IOMMU
+ */
+
+/*
+ * The container descriptor supports only a single group per container.
+ * This is required by the API as the container is not supplied with
+ * an IOMMU group at the moment of initialization.
+ */
+struct tce_container {
+	struct mutex lock;
+	struct iommu_table *tbl;
+};
+
+static void *tce_iommu_open(unsigned long arg)
+{
+	struct tce_container *container;
+
+	if (arg != VFIO_SPAPR_TCE_IOMMU) {
+		pr_err("tce_vfio: Wrong IOMMU type\n");
+		return ERR_PTR(-EINVAL);
+	}
+
+	container = kzalloc(sizeof(*container), GFP_KERNEL);
+	if (!container)
+		return ERR_PTR(-ENOMEM);
+
+	mutex_init(&container->lock);
+
+	return container;
+}
+
+static void tce_iommu_release(void *iommu_data)
+{
+	struct tce_container *container = iommu_data;
+
+	WARN_ON(container->tbl && !container->tbl->it_group);
+	if (container->tbl && container->tbl->it_group)
+		tce_iommu_detach_group(iommu_data, container->tbl->it_group);
+
+	mutex_destroy(&container->lock);
+
+	kfree(container);
+}
+
+static long tce_iommu_ioctl(void *iommu_data,
+				 unsigned int cmd, unsigned long arg)
+{
+	struct tce_container *container = iommu_data;
+	unsigned long minsz;
+	long ret;
+
+	switch (cmd) {
+	case VFIO_CHECK_EXTENSION:
+		return (arg == VFIO_SPAPR_TCE_IOMMU) ? 1 : 0;
+
+	case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
+		struct vfio_iommu_spapr_tce_info info;
+		struct iommu_table *tbl = container->tbl;
+
+		if (WARN_ON(!tbl))
+			return -ENXIO;
+
+		minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
+				dma32_window_size);
+
+		if (copy_from_user(&info, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (info.argsz < minsz)
+			return -EINVAL;
+
+		info.dma32_window_start = tbl->it_offset << IOMMU_PAGE_SHIFT;
+		info.dma32_window_size = tbl->it_size << IOMMU_PAGE_SHIFT;
+		info.flags = 0;
+
+		if (copy_to_user((void __user *)arg, &info, minsz))
+			return -EFAULT;
+
+		return 0;
+	}
+	case VFIO_IOMMU_MAP_DMA: {
+		vfio_iommu_spapr_tce_dma_map param;
+		struct iommu_table *tbl = container->tbl;
+		enum dma_data_direction direction;
+
+		if (WARN_ON(!tbl))
+			return -ENXIO;
+
+		minsz = offsetofend(vfio_iommu_spapr_tce_dma_map, size);
+
+		if (copy_from_user(&param, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (param.argsz < minsz)
+			return -EINVAL;
+
+		if ((param.flags & VFIO_DMA_MAP_FLAG_READ) &&
+				(param.flags & VFIO_DMA_MAP_FLAG_WRITE))
+			direction = DMA_BIDIRECTIONAL;
+		else if (param.flags & VFIO_DMA_MAP_FLAG_READ)
+			direction = DMA_TO_DEVICE;
+		else if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
+			direction = DMA_FROM_DEVICE;
+		else
+			return -EINVAL;
+
+		ret = iommu_put_tces(tbl, param.iova, param.vaddr, direction,
+				param.size);
+
+		return ret;
+	}
+	case VFIO_IOMMU_UNMAP_DMA: {
+		vfio_iommu_spapr_tce_dma_unmap param;
+		struct iommu_table *tbl = container->tbl;
+
+		if (WARN_ON(!tbl))
+			return -ENXIO;
+
+		minsz = offsetofend(vfio_iommu_spapr_tce_dma_unmap, size);
+
+		if (copy_from_user(&param, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (param.argsz < minsz)
+			return -EINVAL;
+
+		/* No flags are supported yet */
+		if (param.flags)
+			return -EINVAL;
+
+		ret = iommu_clear_tces(tbl, param.iova, param.size);
+
+		return ret;
+	}
+	}
+
+	return -ENOTTY;
+}
+
+static int tce_iommu_attach_group(void *iommu_data,
+		struct iommu_group *iommu_group)
+{
+	struct tce_container *container = iommu_data;
+	struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
+
+	BUG_ON(!tbl);
+	mutex_lock(&container->lock);
+	pr_debug("tce_vfio: Attaching group #%u to iommu %p\n",
+			iommu_group_id(iommu_group), iommu_group);
+	if (container->tbl) {
+		pr_warn("tce_vfio: Only one group per IOMMU container is allowed, existing id=%d, attaching id=%d\n",
+				iommu_group_id(container->tbl->it_group),
+				iommu_group_id(iommu_group));
+		mutex_unlock(&container->lock);
+		return -EBUSY;
+	}
+
+	container->tbl = tbl;
+	iommu_reset_table(tbl, false);
+	mutex_unlock(&container->lock);
+
+	return 0;
+}
+
+static void tce_iommu_detach_group(void *iommu_data,
+		struct iommu_group *iommu_group)
+{
+	struct tce_container *container = iommu_data;
+	struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
+
+	BUG_ON(!tbl);
+	mutex_lock(&container->lock);
+	if (tbl != container->tbl) {
+		pr_warn("tce_vfio: detaching group #%u, expected group is #%u\n",
+				iommu_group_id(iommu_group),
+				iommu_group_id(tbl->it_group));
+	} else {
+
+		pr_debug("tce_vfio: detaching group #%u from iommu %p\n",
+				iommu_group_id(iommu_group), iommu_group);
+
+		container->tbl = NULL;
+		iommu_reset_table(tbl, true);
+	}
+	mutex_unlock(&container->lock);
+}
+
+const struct vfio_iommu_driver_ops tce_iommu_driver_ops = {
+	.name		= "iommu-vfio-powerpc",
+	.owner		= THIS_MODULE,
+	.open		= tce_iommu_open,
+	.release	= tce_iommu_release,
+	.ioctl		= tce_iommu_ioctl,
+	.attach_group	= tce_iommu_attach_group,
+	.detach_group	= tce_iommu_detach_group,
+};
+
+static int __init tce_iommu_init(void)
+{
+	return vfio_register_iommu_driver(&tce_iommu_driver_ops);
+}
+
+static void __exit tce_iommu_cleanup(void)
+{
+	vfio_unregister_iommu_driver(&tce_iommu_driver_ops);
+}
+
+module_init(tce_iommu_init);
+module_exit(tce_iommu_cleanup);
+
+MODULE_VERSION(DRIVER_VERSION);
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR(DRIVER_AUTHOR);
+MODULE_DESCRIPTION(DRIVER_DESC);
+
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index 0a4f180..b97697d 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -99,6 +99,7 @@ extern void vfio_unregister_iommu_driver(
 /* Extensions */
 
 #define VFIO_TYPE1_IOMMU		1
+#define VFIO_SPAPR_TCE_IOMMU		2
 
 /*
  * The IOCTL interface is designed for extensibility by embedding the
@@ -442,4 +443,34 @@ struct vfio_iommu_type1_dma_unmap {
 
 #define VFIO_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14)
 
+/* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */
+
+/*
+ * The SPAPR TCE info struct provides information about the PCI bus
+ * address ranges available for DMA; these values are programmed into
+ * the hardware, so the guest has to know them.
+ *
+ * The DMA 32 bit window start is an absolute PCI bus address.
+ * The IOVA addresses passed via the map/unmap ioctls are absolute PCI
+ * bus addresses too, so the window works as a filter rather than an
+ * offset for IOVA addresses.
+ *
+ * A flag will need to be added if other page sizes are supported,
+ * so as defined here, the page size is always 4K.
+ */
+struct vfio_iommu_spapr_tce_info {
+	__u32 argsz;
+	__u32 flags;			/* reserved for future use */
+	__u32 dma32_window_start;	/* 32 bit window start (bytes) */
+	__u32 dma32_window_size;	/* 32 bit window size (bytes) */
+};
+
+#define VFIO_IOMMU_SPAPR_TCE_GET_INFO	_IO(VFIO_TYPE, VFIO_BASE + 12)
+
+/* Reuse type1 map/unmap structs as they are the same at the moment */
+typedef struct vfio_iommu_type1_dma_map vfio_iommu_spapr_tce_dma_map;
+typedef struct vfio_iommu_type1_dma_unmap vfio_iommu_spapr_tce_dma_unmap;
+
+/* ***************************************************************** */
+
 #endif /* VFIO_H */
-- 
1.7.10.4
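
For illustration, a rough sketch of how this interface would be exercised
from userspace (the container fd setup and error handling are elided;
container_fd and buf are assumptions, not part of the patch):

	struct vfio_iommu_spapr_tce_info info = { .argsz = sizeof(info) };

	ioctl(container_fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);

	/* Map one 4K page of buf at the start of the 32 bit DMA window */
	vfio_iommu_spapr_tce_dma_map map = {
		.argsz = sizeof(map),
		.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
		.vaddr = (__u64)(unsigned long)buf,
		.iova  = info.dma32_window_start,
		.size  = 4096,
	};
	ioctl(container_fd, VFIO_IOMMU_MAP_DMA, &map);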


^ permalink raw reply related	[flat|nested] 122+ messages in thread

* Re: [PATCH] vfio powerpc: enabled on powernv platform
  2012-12-12 12:34                       ` Alexey Kardashevskiy
@ 2012-12-12 12:38                         ` Alexey Kardashevskiy
  -1 siblings, 0 replies; 122+ messages in thread
From: Alexey Kardashevskiy @ 2012-12-12 12:38 UTC (permalink / raw)
  To: Alex Williamson
  Cc: Alexey Kardashevskiy, Benjamin Herrenschmidt, Paul Mackerras,
	linuxppc-dev, linux-kernel, kvm, David Gibson

Hi Alex,

I posted another pair of patches. While debugging and testing my stuff I
implemented a rough hack to support IOMMU mappings without passing those
hypercalls to QEMU; this is why I moved pieces of code around - I want to
support both QEMU-VFIO and a kernel-optimized H_PUT_TCE handler.



On 12/12/12 23:34, Alexey Kardashevskiy wrote:
> This patch initializes IOMMU groups based on the IOMMU
> configuration discovered during the PCI scan on POWERNV
> (POWER non virtualized) platform. The IOMMU groups are
> to be used later by VFIO driver (PCI pass through).
>
> It also implements an API for mapping/unmapping pages for
> guest PCI drivers and providing DMA window properties.
> This API is going to be used later by QEMU-VFIO to handle
> h_put_tce hypercalls from the KVM guest.
>
> Although this driver has been tested only on the POWERNV
> platform, it should work on any platform which supports
> TCE tables.
>
> To enable VFIO on POWER, enable SPAPR_TCE_IOMMU config
> option and configure VFIO as required.
>
> Cc: David Gibson <david@gibson.dropbear.id.au>
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> ---


-- 
Alexey

^ permalink raw reply	[flat|nested] 122+ messages in thread

* Re: [PATCH] vfio powerpc: enabled on powernv platform
  2012-12-12  6:14                       ` Alexey Kardashevskiy
@ 2012-12-12 14:34                         ` Alex Williamson
  -1 siblings, 0 replies; 122+ messages in thread
From: Alex Williamson @ 2012-12-12 14:34 UTC (permalink / raw)
  To: Alexey Kardashevskiy
  Cc: Benjamin Herrenschmidt, Paul Mackerras, linuxppc-dev,
	linux-kernel, kvm, David Gibson

On Wed, 2012-12-12 at 17:14 +1100, Alexey Kardashevskiy wrote:
> On 08/12/12 04:38, Alex Williamson wrote:
> >> +static int __init tce_iommu_init(void)
> >> +{
> >> +	struct pci_dev *pdev = NULL;
> >> +	struct iommu_table *tbl;
> >> +	struct iommu_group *grp;
> >> +
> >> +	/* Allocate and initialize IOMMU groups */
> >> +	for_each_pci_dev(pdev) {
> >> +		tbl = get_iommu_table_base(&pdev->dev);
> >> +		if (!tbl)
> >> +			continue;
> >> +
> >> +		/* Skip already initialized */
> >> +		if (tbl->it_group)
> >> +			continue;
> >> +
> >> +		grp = iommu_group_alloc();
> >> +		if (IS_ERR(grp)) {
> >> +			pr_info("tce_vfio: cannot create new IOMMU group, ret=%ld\n",
> >> +					PTR_ERR(grp));
> >> +			return PTR_ERR(grp);
> >> +		}
> >> +		tbl->it_group = grp;
> >> +		iommu_group_set_iommudata(grp, tbl, group_release);
> >
> > BTW, groups have a name property that shows up in sysfs that can be set
> > with iommu_group_set_name().  IIRC, this was a feature David requested
> > for PEs.  It'd be nice if it was used for PEs...  Thanks,
> 
> 
> 
> But what would I put there?... IOMMU ID is more than enough at the moment 
> and struct iommu_table does not have anything that would make sense to 
> show in sysfs...

I believe David mentioned that PEs had user visible names.  Perhaps they
match an enclosure location or something.  Group numbers are rather
arbitrary and really have no guarantee of persistence.  Thanks,

Alex
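
For illustration, a sketch of where such a name could be attached in
tce_iommu_init() above (the pe_name label is hypothetical; the patch does
not provide one, and iommu_group_set_name() simply attaches a string that
shows up in sysfs):

		grp = iommu_group_alloc();
		if (IS_ERR(grp))
			return PTR_ERR(grp);
		/* Hypothetical: a persistent, user visible PE label */
		iommu_group_set_name(grp, pe_name);
		tbl->it_group = grp;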



^ permalink raw reply	[flat|nested] 122+ messages in thread

* Re: [PATCH] vfio powerpc: implemented IOMMU driver for VFIO
  2012-12-12  6:59                   ` Alexey Kardashevskiy
@ 2012-12-12 14:36                     ` Alex Williamson
  -1 siblings, 0 replies; 122+ messages in thread
From: Alex Williamson @ 2012-12-12 14:36 UTC (permalink / raw)
  To: Alexey Kardashevskiy
  Cc: Benjamin Herrenschmidt, Paul Mackerras, linuxppc-dev,
	linux-kernel, kvm, David Gibson

On Wed, 2012-12-12 at 17:59 +1100, Alexey Kardashevskiy wrote:
> On 08/12/12 04:01, Alex Williamson wrote:
> >> +	case VFIO_IOMMU_MAP_DMA: {
> >> +		vfio_iommu_spapr_tce_dma_map param;
> >> +		struct iommu_table *tbl = container->tbl;
> >> +		enum dma_data_direction direction;
> >> +		unsigned long locked, lock_limit;
> >> +
> >> +		if (WARN_ON(!tbl))
> >> +			return -ENXIO;
> >> +
> >> +		minsz = offsetofend(vfio_iommu_spapr_tce_dma_map, size);
> >> +
> >> +		if (copy_from_user(&param, (void __user *)arg, minsz))
> >> +			return -EFAULT;
> >> +
> >> +		if (param.argsz < minsz)
> >> +			return -EINVAL;
> >> +
> >> +		if ((param.flags & VFIO_DMA_MAP_FLAG_READ) &&
> >> +				(param.flags & VFIO_DMA_MAP_FLAG_WRITE))
> >> +			direction = DMA_BIDIRECTIONAL;
> >> +		else if (param.flags & VFIO_DMA_MAP_FLAG_READ)
> >> +			direction = DMA_TO_DEVICE;
> >> +		else if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
> >> +			direction = DMA_FROM_DEVICE;
> >> +		else
> >> +			return -EINVAL;
> >
> > flags needs to be sanitized too.  Return EINVAL if any unknown bit is
> > set or else sloppy users may make it very difficult to make use of those
> > flag bits later.
> 
> 
> It already returns -EINVAL on any bit set except READ/WRITE, no?

No.  I could pass flags ~0 through there to get a read/write mapping and
cause you problems if you later want to define another bit.  Thanks,

Alex
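
For illustration, the kind of check meant here is a sketch along these
lines, placed before the direction is decoded:

	/* Reject any flag bit we do not know about yet */
	if (param.flags &
			~(VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE))
		return -EINVAL;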


^ permalink raw reply	[flat|nested] 122+ messages in thread

* Re: [PATCH] vfio powerpc: enabled on powernv platform
  2012-12-12 12:34                       ` Alexey Kardashevskiy
@ 2012-12-12 23:30                         ` Alex Williamson
  -1 siblings, 0 replies; 122+ messages in thread
From: Alex Williamson @ 2012-12-12 23:30 UTC (permalink / raw)
  To: Alexey Kardashevskiy
  Cc: Benjamin Herrenschmidt, Paul Mackerras, linuxppc-dev,
	linux-kernel, kvm, David Gibson

On Wed, 2012-12-12 at 23:34 +1100, Alexey Kardashevskiy wrote:
> This patch initializes IOMMU groups based on the IOMMU
> configuration discovered during the PCI scan on POWERNV
> (POWER non virtualized) platform. The IOMMU groups are
> to be used later by VFIO driver (PCI pass through).
> 
> It also implements an API for mapping/unmapping pages for
> guest PCI drivers and providing DMA window properties.
> This API is going to be used later by QEMU-VFIO to handle
> h_put_tce hypercalls from the KVM guest.
> 
> Although this driver has been tested only on the POWERNV
> platform, it should work on any platform which supports
> TCE tables.
> 
> To enable VFIO on POWER, enable SPAPR_TCE_IOMMU config
> option and configure VFIO as required.
> 
> Cc: David Gibson <david@gibson.dropbear.id.au>
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> ---
>  arch/powerpc/include/asm/iommu.h     |   10 ++
>  arch/powerpc/kernel/iommu.c          |  329 ++++++++++++++++++++++++++++++++++
>  arch/powerpc/platforms/powernv/pci.c |  134 ++++++++++++++
>  drivers/iommu/Kconfig                |    8 +
>  4 files changed, 481 insertions(+)
> 
> diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
> index cbfe678..3c861ae 100644
> --- a/arch/powerpc/include/asm/iommu.h
> +++ b/arch/powerpc/include/asm/iommu.h
> @@ -76,6 +76,9 @@ struct iommu_table {
>  	struct iommu_pool large_pool;
>  	struct iommu_pool pools[IOMMU_NR_POOLS];
>  	unsigned long *it_map;       /* A simple allocation bitmap for now */
> +#ifdef CONFIG_IOMMU_API
> +	struct iommu_group *it_group;
> +#endif
>  };
>  
>  struct scatterlist;
> @@ -147,5 +150,12 @@ static inline void iommu_restore(void)
>  }
>  #endif
>  
> +extern void iommu_reset_table(struct iommu_table *tbl, bool restore);
> +extern long iommu_clear_tces(struct iommu_table *tbl, unsigned long ioba,
> +		unsigned long size);
> +extern long iommu_put_tces(struct iommu_table *tbl, unsigned long ioba,
> +		uint64_t tce, enum dma_data_direction direction,
> +		unsigned long size);
> +
>  #endif /* __KERNEL__ */
>  #endif /* _ASM_IOMMU_H */
> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
> index ff5a6ce..f3bb2e7 100644
> --- a/arch/powerpc/kernel/iommu.c
> +++ b/arch/powerpc/kernel/iommu.c
> @@ -36,6 +36,7 @@
>  #include <linux/hash.h>
>  #include <linux/fault-inject.h>
>  #include <linux/pci.h>
> +#include <linux/uaccess.h>
>  #include <asm/io.h>
>  #include <asm/prom.h>
>  #include <asm/iommu.h>
> @@ -44,6 +45,7 @@
>  #include <asm/kdump.h>
>  #include <asm/fadump.h>
>  #include <asm/vio.h>
> +#include <asm/tce.h>
>  
>  #define DBG(...)
>  
> @@ -856,3 +858,330 @@ void iommu_free_coherent(struct iommu_table *tbl, size_t size,
>  		free_pages((unsigned long)vaddr, get_order(size));
>  	}
>  }
> +
> +#ifdef CONFIG_IOMMU_API
> +/*
> + * SPAPR TCE API
> + */
> +
> +struct vwork {
> +	struct mm_struct	*mm;
> +	long			npage;
> +	struct work_struct	work;
> +};
> +
> +/* delayed decrement/increment for locked_vm */
> +static void lock_acct_bg(struct work_struct *work)
> +{
> +	struct vwork *vwork = container_of(work, struct vwork, work);
> +	struct mm_struct *mm;
> +
> +	mm = vwork->mm;
> +	down_write(&mm->mmap_sem);
> +	mm->locked_vm += vwork->npage;
> +	up_write(&mm->mmap_sem);
> +	mmput(mm);
> +	kfree(vwork);
> +}
> +
> +static void lock_acct(long npage)
> +{
> +	struct vwork *vwork;
> +	struct mm_struct *mm;
> +
> +	if (!current->mm)
> +		return; /* process exited */
> +
> +	if (down_write_trylock(&current->mm->mmap_sem)) {
> +		current->mm->locked_vm += npage;
> +		up_write(&current->mm->mmap_sem);
> +		return;
> +	}
> +
> +	/*
> +	 * Couldn't get mmap_sem lock, so must set up to update
> +	 * mm->locked_vm later. If locked_vm were atomic, we
> +	 * wouldn't need this silliness
> +	 */
> +	vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL);
> +	if (!vwork)
> +		return;
> +	mm = get_task_mm(current);
> +	if (!mm) {
> +		kfree(vwork);
> +		return;
> +	}
> +	INIT_WORK(&vwork->work, lock_acct_bg);
> +	vwork->mm = mm;
> +	vwork->npage = npage;
> +	schedule_work(&vwork->work);
> +}

Locked page accounting in this version is very, very broken.  How do
powerpc folks feel about seemingly generic kernel iommu interfaces
messing with the current task mm?  Besides that, more problems below...

> +
> +/*
> + * iommu_reset_table is called when it started/stopped being used.
> + *
> + * restore==true says to bring the iommu_table into the state as it was
> + * before being used by VFIO.
> + */
> +void iommu_reset_table(struct iommu_table *tbl, bool restore)
> +{
> +	/* Page#0 is marked as used in iommu_init_table, so we clear it... */
> +	if (!restore && (tbl->it_offset == 0))
> +		clear_bit(0, tbl->it_map);
> +
> +	iommu_clear_tces(tbl, tbl->it_offset, tbl->it_size);

This does locked page accounting and unpins pages, even on startup when
the pages aren't necessarily pinned or accounted against the current
process.

> +
> +	/* ... or restore  */
> +	if (restore && (tbl->it_offset == 0))
> +		set_bit(0, tbl->it_map);
> +}
> +EXPORT_SYMBOL_GPL(iommu_reset_table);
> +
> +/*
> + * Returns the number of used IOMMU pages (4K) within
> + * the same system page (4K or 64K).
> + *
> + * syspage_weight_zero is optimized for expected case == 0
> + * syspage_weight_one is optimized for expected case > 1
> + * Other cases are not used in this file.
> + */
> +#if PAGE_SIZE == IOMMU_PAGE_SIZE
> +
> +#define syspage_weight_zero(map, offset)	test_bit((offset), (map))
> +#define syspage_weight_one(map, offset)		test_bit((offset), (map))
> +
> +#elif PAGE_SIZE/IOMMU_PAGE_SIZE == 16
> +
> +static int syspage_weight_zero(unsigned long *map, unsigned long offset)
> +{
> +	offset &= PAGE_MASK >> IOMMU_PAGE_SHIFT;
> +	return 0xffffUL & (map[BIT_WORD(offset)] >>
> +			(offset & (BITS_PER_LONG-1)));
> +}

I would have expected these to be bools and return true if the weight
matches the value.

If you replaced 0xffff above w/ this, would you need the #error below?

((1UL << (PAGE_SIZE/IOMMU_PAGE_SIZE)) - 1)

> +
> +static int syspage_weight_one(unsigned long *map, unsigned long offset)
> +{
> +	int ret = 0, nbits = PAGE_SIZE/IOMMU_PAGE_SIZE;
> +
> +	/* Aligns TCE entry number to system page boundary */
> +	offset &= PAGE_MASK >> IOMMU_PAGE_SHIFT;
> +
> +	/* Count used 4K pages */
> +	while (nbits && (ret < 2)) {

Don't you have a ffs()?  Could also be used for _zero.  Surely there are
some bitops helpers that could help here even on big endian.  hweight
really doesn't work?
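
For illustration, a sketch of a single helper built from both suggestions
(untested; like the original it assumes the aligned run of IOMMU page bits
never crosses a word boundary):

	static int syspage_weight(unsigned long *map, unsigned long offset)
	{
		const unsigned long mask =
			(1UL << (PAGE_SIZE / IOMMU_PAGE_SIZE)) - 1;

		/* Align the TCE entry number to a system page boundary */
		offset &= PAGE_MASK >> IOMMU_PAGE_SHIFT;

		/* Count used 4K pages within the system page */
		return hweight_long((map[BIT_WORD(offset)] >>
				(offset & (BITS_PER_LONG - 1))) & mask);
	}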

> +		if (test_bit(offset, map))
> +			++ret;
> +
> +		--nbits;
> +		++offset;
> +	}
> +
> +	return ret;
> +}
> +#else
> +#error TODO: support other page size
> +#endif
> +
> +static void tce_flush(struct iommu_table *tbl)
> +{
> +	/* Flush/invalidate TLB caches if necessary */
> +	if (ppc_md.tce_flush)
> +		ppc_md.tce_flush(tbl);
> +
> +	/* Make sure updates are seen by hardware */
> +	mb();
> +}
> +
> +/*
> + * clear_tces_nolock clears TCEs and returns the number of system pages
> + * which it called put_page() on
> + */
> +static long clear_tces_nolock(struct iommu_table *tbl, unsigned long entry,
> +		unsigned long pages)
> +{
> +	int i, retpages = 0, clr;
> +	unsigned long oldtce, oldweight;
> +	struct page *page;
> +
> +	for (i = 0; i < pages; ++i, ++entry) {
> +		if (!test_bit(entry - tbl->it_offset, tbl->it_map))
> +			continue;
> +
> +		oldtce = ppc_md.tce_get(tbl, entry);
> +		ppc_md.tce_free(tbl, entry, 1);
> +
> +		oldweight = syspage_weight_one(tbl->it_map,
> +				entry - tbl->it_offset);
> +		clr = __test_and_clear_bit(entry - tbl->it_offset,
> +				tbl->it_map);
> +
> +		if (WARN_ON(!(oldtce & (TCE_PCI_WRITE | TCE_PCI_READ))))
> +			continue;
> +
> +		page = pfn_to_page(oldtce >> PAGE_SHIFT);
> +
> +		if (WARN_ON(!page))
> +			continue;
> +
> +		if (oldtce & TCE_PCI_WRITE)
> +			SetPageDirty(page);
> +
> +		put_page(page);
> +
> +		/* That was the last IOMMU page within the system page */
> +		if ((oldweight == 1) && clr)
> +			++retpages;
> +	}
> +
> +	return retpages;
> +}
> +
> +/*
> + * iommu_clear_tces clears TCEs, releases the pinned system pages and
> + * updates the locked_vm accounting; returns 0 on success
> + */
> +long iommu_clear_tces(struct iommu_table *tbl, unsigned long ioba,
> +		unsigned long size)
> +{
> +	int ret;
> +	unsigned long entry = ioba >> IOMMU_PAGE_SHIFT;
> +	unsigned long npages = size >> IOMMU_PAGE_SHIFT;
> +	struct iommu_pool *pool = get_pool(tbl, entry);
> +
> +	if ((size & ~IOMMU_PAGE_MASK) || (ioba & ~IOMMU_PAGE_MASK))
> +		return -EINVAL;
> +
> +	if ((ioba + size) > ((tbl->it_offset + tbl->it_size)
> +			<< IOMMU_PAGE_SHIFT))
> +		return -EINVAL;
> +
> +	if (ioba < (tbl->it_offset << IOMMU_PAGE_SHIFT))
> +		return -EINVAL;
> +
> +	spin_lock(&(pool->lock));
> +	ret = clear_tces_nolock(tbl, entry, npages);
> +	tce_flush(tbl);
> +	spin_unlock(&(pool->lock));
> +
> +	if (ret > 0) {
> +		lock_acct(-ret);
> +		return 0;
> +	}
> +
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(iommu_clear_tces);
> +
> +static int put_tce(struct iommu_table *tbl, unsigned long entry,
> +		uint64_t tce, enum dma_data_direction direction)
> +{
> +	int ret;
> +	struct page *page = NULL;
> +	unsigned long kva, offset, oldweight;
> +
> +	/* Map new TCE */
> +	offset = tce & IOMMU_PAGE_MASK & ~PAGE_MASK;
> +	ret = get_user_pages_fast(tce & PAGE_MASK, 1,
> +			direction != DMA_TO_DEVICE, &page);
> +	if (ret != 1) {
> +		pr_err("tce_vfio: get_user_pages_fast failed tce=%llx ioba=%lx ret=%d\n",
> +				tce, entry << IOMMU_PAGE_SHIFT, ret);
> +		return -EFAULT;
> +	}
> +
> +	kva = (unsigned long) page_address(page);
> +	kva += offset;
> +
> +	/* tce_build receives a virtual address */
> +	ret = ppc_md.tce_build(tbl, entry, 1, kva, direction, NULL);
> +
> +	/* tce_build() only returns non-zero for transient errors */
> +	if (unlikely(ret)) {
> +		pr_err("tce_vfio: tce_put failed on tce=%llx ioba=%lx kva=%lx ret=%d\n",
> +				tce, entry << IOMMU_PAGE_SHIFT, kva, ret);
> +		put_page(page);
> +		return -EIO;
> +	}
> +
> +	/* Check if a new system page has been locked */
> +	oldweight = syspage_weight_zero(tbl->it_map, entry - tbl->it_offset);
> +	__set_bit(entry - tbl->it_offset, tbl->it_map);
> +
> +	return (oldweight == 0) ? 1 : 0;
> +}
> +
> +/*
> + * iommu_put_tces builds TCEs, pins the backing user pages and accounts
> + * the newly locked system pages; returns 0 on success
> + */
> +long iommu_put_tces(struct iommu_table *tbl, unsigned long ioba,
> +		uint64_t tce, enum dma_data_direction direction,
> +		unsigned long size)
> +{
> +	int i, ret = 0, retpages = 0;
> +	unsigned long entry = ioba >> IOMMU_PAGE_SHIFT;
> +	unsigned long npages = size >> IOMMU_PAGE_SHIFT;
> +	struct iommu_pool *pool = get_pool(tbl, entry);
> +	unsigned long locked, lock_limit;
> +
> +	BUILD_BUG_ON(PAGE_SIZE < IOMMU_PAGE_SIZE);
> +	BUG_ON(direction == DMA_NONE);
> +
> +	if ((size & ~IOMMU_PAGE_MASK) ||
> +			(ioba & ~IOMMU_PAGE_MASK) ||
> +			(tce & ~IOMMU_PAGE_MASK))
> +		return -EINVAL;
> +
> +	if ((ioba + size) > ((tbl->it_offset + tbl->it_size)
> +			 << IOMMU_PAGE_SHIFT))
> +		return -EINVAL;
> +
> +	if (ioba < (tbl->it_offset << IOMMU_PAGE_SHIFT))
> +		return -EINVAL;
> +
> +	/* Account for locked pages */
> +	locked = current->mm->locked_vm +
> +		(_ALIGN_UP(size, PAGE_SIZE) >> PAGE_SHIFT);

Looks like we just over-penalize up front and correct when mapped; that's
better, but not great.

> +	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> +	if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
> +		pr_warn("RLIMIT_MEMLOCK (%ld) exceeded\n",
> +				rlimit(RLIMIT_MEMLOCK));
> +		return -ENOMEM;
> +	}
> +
> +	spin_lock(&(pool->lock));
> +
> +	/* Check if any entry is in use */
> +	for (i = 0; i < npages; ++i) {
> +		if (test_bit(entry + i - tbl->it_offset, tbl->it_map)) {
> +			spin_unlock(&(pool->lock));
> +			return -EBUSY;
> +		}
> +	}
> +
> +	/* Put tces to the table */
> +	for (i = 0; (i < npages) && (ret >= 0); ++i, tce += IOMMU_PAGE_SIZE) {
> +		ret = put_tce(tbl, entry + i, tce, direction);
> +		if (ret == 1)
> +			++retpages;
> +	}
> +
> +	/*
> +	 * If we failed, release the pinned pages; otherwise account
> +	 * the newly locked system pages
> +	 */
> +	if (ret < 0) {
> +		clear_tces_nolock(tbl, entry, i);
> +	} else {
> +		if (retpages)
> +			lock_acct(retpages);
> +		ret = 0;
> +	}

Bug: if it fails we clear, which decrements our locked pages, but we
haven't incremented them yet.  Thanks,

Alex

> +
> +	tce_flush(tbl);
> +	spin_unlock(&(pool->lock));
> +
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(iommu_put_tces);
> +
> +#endif /* CONFIG_IOMMU_API */
> diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
> index 05205cf..1b970bf 100644
> --- a/arch/powerpc/platforms/powernv/pci.c
> +++ b/arch/powerpc/platforms/powernv/pci.c
> @@ -20,6 +20,7 @@
>  #include <linux/irq.h>
>  #include <linux/io.h>
>  #include <linux/msi.h>
> +#include <linux/iommu.h>
>  
>  #include <asm/sections.h>
>  #include <asm/io.h>
> @@ -613,3 +614,136 @@ void __init pnv_pci_init(void)
>  	ppc_md.teardown_msi_irqs = pnv_teardown_msi_irqs;
>  #endif
>  }
> +
> +#ifdef CONFIG_IOMMU_API
> +/*
> + * IOMMU groups support required by VFIO
> + */
> +static int add_device(struct device *dev)
> +{
> +	struct iommu_table *tbl;
> +	int ret = 0;
> +
> +	if (WARN_ON(dev->iommu_group)) {
> +		pr_warn("tce_vfio: device %s is already in iommu group %d, skipping\n",
> +				dev_name(dev),
> +				iommu_group_id(dev->iommu_group));
> +		return -EBUSY;
> +	}
> +
> +	tbl = get_iommu_table_base(dev);
> +	if (!tbl) {
> +		pr_debug("tce_vfio: skipping device %s with no tbl\n",
> +				dev_name(dev));
> +		return 0;
> +	}
> +
> +	pr_debug("tce_vfio: adding %s to iommu group %d\n",
> +			dev_name(dev), iommu_group_id(tbl->it_group));
> +
> +	ret = iommu_group_add_device(tbl->it_group, dev);
> +	if (ret < 0)
> +		pr_err("tce_vfio: %s has not been added, ret=%d\n",
> +				dev_name(dev), ret);
> +
> +	return ret;
> +}
> +
> +static void del_device(struct device *dev)
> +{
> +	iommu_group_remove_device(dev);
> +}
> +
> +static int iommu_bus_notifier(struct notifier_block *nb,
> +			      unsigned long action, void *data)
> +{
> +	struct device *dev = data;
> +
> +	switch (action) {
> +	case BUS_NOTIFY_ADD_DEVICE:
> +		return add_device(dev);
> +	case BUS_NOTIFY_DEL_DEVICE:
> +		del_device(dev);
> +		return 0;
> +	default:
> +		return 0;
> +	}
> +}
> +
> +static struct notifier_block tce_iommu_bus_nb = {
> +	.notifier_call = iommu_bus_notifier,
> +};
> +
> +static void group_release(void *iommu_data)
> +{
> +	struct iommu_table *tbl = iommu_data;
> +	tbl->it_group = NULL;
> +}
> +
> +static int __init tce_iommu_init(void)
> +{
> +	struct pci_dev *pdev = NULL;
> +	struct iommu_table *tbl;
> +	struct iommu_group *grp;
> +
> +	/* Allocate and initialize IOMMU groups */
> +	for_each_pci_dev(pdev) {
> +		tbl = get_iommu_table_base(&pdev->dev);
> +		if (!tbl)
> +			continue;
> +
> +		/* Skip already initialized */
> +		if (tbl->it_group)
> +			continue;
> +
> +		grp = iommu_group_alloc();
> +		if (IS_ERR(grp)) {
> +			pr_info("tce_vfio: cannot create new IOMMU group, ret=%ld\n",
> +					PTR_ERR(grp));
> +			return PTR_ERR(grp);
> +		}
> +		tbl->it_group = grp;
> +		iommu_group_set_iommudata(grp, tbl, group_release);
> +	}
> +
> +	bus_register_notifier(&pci_bus_type, &tce_iommu_bus_nb);
> +
> +	/* Add PCI devices to VFIO groups */
> +	for_each_pci_dev(pdev)
> +		add_device(&pdev->dev);
> +
> +	return 0;
> +}
> +
> +static void __exit tce_iommu_cleanup(void)
> +{
> +	struct pci_dev *pdev = NULL;
> +	struct iommu_table *tbl;
> +	struct iommu_group *grp = NULL;
> +
> +	bus_unregister_notifier(&pci_bus_type, &tce_iommu_bus_nb);
> +
> +	/* Delete PCI devices from VFIO groups */
> +	for_each_pci_dev(pdev)
> +		del_device(&pdev->dev);
> +
> +	/* Release VFIO groups */
> +	for_each_pci_dev(pdev) {
> +		tbl = get_iommu_table_base(&pdev->dev);
> +		if (!tbl)
> +			continue;
> +		grp = tbl->it_group;
> +
> +		/* Skip (already) uninitialized */
> +		if (!grp)
> +			continue;
> +
> +		/* Do actual release, group_release() is expected to work */
> +		iommu_group_put(grp);
> +		BUG_ON(tbl->it_group);
> +	}
> +}
> +
> +module_init(tce_iommu_init);
> +module_exit(tce_iommu_cleanup);
> +#endif /* CONFIG_IOMMU_API */
> diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
> index 9f69b56..29d11dc 100644
> --- a/drivers/iommu/Kconfig
> +++ b/drivers/iommu/Kconfig
> @@ -187,4 +187,12 @@ config EXYNOS_IOMMU_DEBUG
>  
>  	  Say N unless you need kernel log message for IOMMU debugging
>  
> +config SPAPR_TCE_IOMMU
> +	bool "sPAPR TCE IOMMU Support"
> +	depends on PPC_POWERNV
> +	select IOMMU_API
> +	help
> +	  Enables bits of IOMMU API required by VFIO. The iommu_ops is
> +	  still not implemented.
> +
>  endif # IOMMU_SUPPORT


^ permalink raw reply	[flat|nested] 122+ messages in thread

* Re: [PATCH] vfio powerpc: enabled on powernv platform
  2012-12-12 23:30                         ` Alex Williamson
@ 2012-12-13  2:24                           ` Alexey Kardashevskiy
  0 siblings, 0 replies; 122+ messages in thread
From: Alexey Kardashevskiy @ 2012-12-13  2:24 UTC (permalink / raw)
  To: Alex Williamson
  Cc: Benjamin Herrenschmidt, Paul Mackerras, linuxppc-dev,
	linux-kernel, kvm, David Gibson

On 13/12/12 10:30, Alex Williamson wrote:
> On Wed, 2012-12-12 at 23:34 +1100, Alexey Kardashevskiy wrote:
>> This patch initializes IOMMU groups based on the IOMMU
>> configuration discovered during the PCI scan on POWERNV
>> (POWER non virtualized) platform. The IOMMU groups are
>> to be used later by VFIO driver (PCI pass through).
>>
>> It also implements an API for mapping/unmapping pages for
>> guest PCI drivers and providing DMA window properties.
>> This API is going to be used later by QEMU-VFIO to handle
>> h_put_tce hypercalls from the KVM guest.
>>
>> Although this driver has been tested only on the POWERNV
>> platform, it should work on any platform which supports
>> TCE tables.
>>
>> To enable VFIO on POWER, enable SPAPR_TCE_IOMMU config
>> option and configure VFIO as required.
>>
>> Cc: David Gibson <david@gibson.dropbear.id.au>
>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>> ---
>>   arch/powerpc/include/asm/iommu.h     |   10 ++
>>   arch/powerpc/kernel/iommu.c          |  329 ++++++++++++++++++++++++++++++++++
>>   arch/powerpc/platforms/powernv/pci.c |  134 ++++++++++++++
>>   drivers/iommu/Kconfig                |    8 +
>>   4 files changed, 481 insertions(+)
>>
>> diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
>> index cbfe678..3c861ae 100644
>> --- a/arch/powerpc/include/asm/iommu.h
>> +++ b/arch/powerpc/include/asm/iommu.h
>> @@ -76,6 +76,9 @@ struct iommu_table {
>>   	struct iommu_pool large_pool;
>>   	struct iommu_pool pools[IOMMU_NR_POOLS];
>>   	unsigned long *it_map;       /* A simple allocation bitmap for now */
>> +#ifdef CONFIG_IOMMU_API
>> +	struct iommu_group *it_group;
>> +#endif
>>   };
>>
>>   struct scatterlist;
>> @@ -147,5 +150,12 @@ static inline void iommu_restore(void)
>>   }
>>   #endif
>>
>> +extern void iommu_reset_table(struct iommu_table *tbl, bool restore);
>> +extern long iommu_clear_tces(struct iommu_table *tbl, unsigned long ioba,
>> +		unsigned long size);
>> +extern long iommu_put_tces(struct iommu_table *tbl, unsigned long ioba,
>> +		uint64_t tce, enum dma_data_direction direction,
>> +		unsigned long size);
>> +
>>   #endif /* __KERNEL__ */
>>   #endif /* _ASM_IOMMU_H */
>> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
>> index ff5a6ce..f3bb2e7 100644
>> --- a/arch/powerpc/kernel/iommu.c
>> +++ b/arch/powerpc/kernel/iommu.c
>> @@ -36,6 +36,7 @@
>>   #include <linux/hash.h>
>>   #include <linux/fault-inject.h>
>>   #include <linux/pci.h>
>> +#include <linux/uaccess.h>
>>   #include <asm/io.h>
>>   #include <asm/prom.h>
>>   #include <asm/iommu.h>
>> @@ -44,6 +45,7 @@
>>   #include <asm/kdump.h>
>>   #include <asm/fadump.h>
>>   #include <asm/vio.h>
>> +#include <asm/tce.h>
>>
>>   #define DBG(...)
>>
>> @@ -856,3 +858,330 @@ void iommu_free_coherent(struct iommu_table *tbl, size_t size,
>>   		free_pages((unsigned long)vaddr, get_order(size));
>>   	}
>>   }
>> +
>> +#ifdef CONFIG_IOMMU_API
>> +/*
>> + * SPAPR TCE API
>> + */
>> +
>> +struct vwork {
>> +	struct mm_struct	*mm;
>> +	long			npage;
>> +	struct work_struct	work;
>> +};
>> +
>> +/* delayed decrement/increment for locked_vm */
>> +static void lock_acct_bg(struct work_struct *work)
>> +{
>> +	struct vwork *vwork = container_of(work, struct vwork, work);
>> +	struct mm_struct *mm;
>> +
>> +	mm = vwork->mm;
>> +	down_write(&mm->mmap_sem);
>> +	mm->locked_vm += vwork->npage;
>> +	up_write(&mm->mmap_sem);
>> +	mmput(mm);
>> +	kfree(vwork);
>> +}
>> +
>> +static void lock_acct(long npage)
>> +{
>> +	struct vwork *vwork;
>> +	struct mm_struct *mm;
>> +
>> +	if (!current->mm)
>> +		return; /* process exited */
>> +
>> +	if (down_write_trylock(&current->mm->mmap_sem)) {
>> +		current->mm->locked_vm += npage;
>> +		up_write(&current->mm->mmap_sem);
>> +		return;
>> +	}
>> +
>> +	/*
>> +	 * Couldn't get mmap_sem lock, so must setup to update
>> +	 * mm->locked_vm later. If locked_vm were atomic, we
>> +	 * wouldn't need this silliness
>> +	 */
>> +	vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL);
>> +	if (!vwork)
>> +		return;
>> +	mm = get_task_mm(current);
>> +	if (!mm) {
>> +		kfree(vwork);
>> +		return;
>> +	}
>> +	INIT_WORK(&vwork->work, lock_acct_bg);
>> +	vwork->mm = mm;
>> +	vwork->npage = npage;
>> +	schedule_work(&vwork->work);
>> +}
>
> Locked page accounting in this version is very, very broken.  How do
> powerpc folks feel about seemingly generic kernel iommu interfaces
> messing with the current task mm?  Besides that, more problems below...
>
>> +
>> +/*
>> + * iommu_reset_table is called when it started/stopped being used.
>> + *
>> + * restore==true says to bring the iommu_table into the state as it was
>> + * before being used by VFIO.
>> + */
>> +void iommu_reset_table(struct iommu_table *tbl, bool restore)
>> +{
>> +	/* Page#0 is marked as used in iommu_init_table, so we clear it... */
>> +	if (!restore && (tbl->it_offset == 0))
>> +		clear_bit(0, tbl->it_map);
>> +
>> +	iommu_clear_tces(tbl, tbl->it_offset, tbl->it_size);
>
> This does locked page accounting and unpins pages, even on startup when
> the pages aren't necessarily pinned or accounted against the current
> process.
>
>> +
>> +	/* ... or restore  */
>> +	if (restore && (tbl->it_offset == 0))
>> +		set_bit(0, tbl->it_map);
>> +}
>> +EXPORT_SYMBOL_GPL(iommu_reset_table);
>> +
>> +/*
>> + * Returns the number of used IOMMU pages (4K) within
>> + * the same system page (4K or 64K).
>> + *
>> + * syspage_weight_zero is optimized for expected case == 0
>> + * syspage_weight_one is optimized for expected case > 1
>> + * Other case are not used in this file.
>> + */
>> +#if PAGE_SIZE == IOMMU_PAGE_SIZE
>> +
>> +#define syspage_weight_zero(map, offset)	test_bit((map), (offset))
>> +#define syspage_weight_one(map, offset)		test_bit((map), (offset))
>> +
>> +#elif PAGE_SIZE/IOMMU_PAGE_SIZE == 16
>> +
>> +static int syspage_weight_zero(unsigned long *map, unsigned long offset)
>> +{
>> +	offset &= PAGE_MASK >> IOMMU_PAGE_SHIFT;
>> +	return 0xffffUL & (map[BIT_WORD(offset)] >>
>> +			(offset & (BITS_PER_LONG-1)));
>> +}
>
> I would have expected these to be bools and return true if the weight
> matches the value.

My expectation was different but ok, I'll fix :)
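
Something like this, perhaps (untested; SYSPAGE_MASK is a made-up name,
and it only works while PAGE_SIZE/IOMMU_PAGE_SIZE <= BITS_PER_LONG, so
the #error stays for 16MB pages):

	#define SYSPAGE_MASK	((1UL << (PAGE_SIZE / IOMMU_PAGE_SIZE)) - 1)

	/* true if no IOMMU page within the system page is in use */
	static bool syspage_is_free(unsigned long *map, unsigned long offset)
	{
		offset &= PAGE_MASK >> IOMMU_PAGE_SHIFT;
		return !(SYSPAGE_MASK & (map[BIT_WORD(offset)] >>
				(offset & (BITS_PER_LONG - 1))));
	}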


> If you replaced 0xffff above w/ this, would you need the #error below?
> ((1UL << (PAGE_SIZE/IOMMU_PAGE_SIZE)) - 1)


We have three page sizes on POWER - 4K, 64K and 16MB. We already handle 4K 
and 64K; the 16MB case will require a much different approach, and I am not 
sure how/when we will add it, so I'd keep the #error.


>> +
>> +static int syspage_weight_one(unsigned long *map, unsigned long offset)
>> +{
>> +	int ret = 0, nbits = PAGE_SIZE/IOMMU_PAGE_SIZE;
>> +
>> +	/* Aligns TCE entry number to system page boundary */
>> +	offset &= PAGE_MASK >> IOMMU_PAGE_SHIFT;
>> +
>> +	/* Count used 4K pages */
>> +	while (nbits && (ret < 2)) {
>
> Don't you have a ffs()?  Could also be used for _zero.  Surely there are
> some bitops helpers that could help here even on big endian.  hweight
> really doesn't work?
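
hweight_long() on the masked word would probably do, something like this
(untested; uses the SYSPAGE_MASK from above and assumes a system page's
bits never straddle a word of it_map, which holds for 4K and 64K):

	static int syspage_weight(unsigned long *map, unsigned long offset)
	{
		offset &= PAGE_MASK >> IOMMU_PAGE_SHIFT;
		return hweight_long(SYSPAGE_MASK & (map[BIT_WORD(offset)] >>
				(offset & (BITS_PER_LONG - 1))));
	}
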
>
>> +		if (test_bit(offset, map))
>> +			++ret;
>> +
>> +		--nbits;
>> +		++offset;
>> +	}
>> +
>> +	return ret;
>> +}
>> +#else
>> +#error TODO: support other page size
>> +#endif
>> +
>> +static void tce_flush(struct iommu_table *tbl)
>> +{
>> +	/* Flush/invalidate TLB caches if necessary */
>> +	if (ppc_md.tce_flush)
>> +		ppc_md.tce_flush(tbl);
>> +
>> +	/* Make sure updates are seen by hardware */
>> +	mb();
>> +}
>> +
>> +/*
>> + * iommu_clear_tces clears tces and returned the number of system pages
>> + * which it called put_page() on
>> + */
>> +static long clear_tces_nolock(struct iommu_table *tbl, unsigned long entry,
>> +		unsigned long pages)
>> +{
>> +	int i, retpages = 0, clr;
>> +	unsigned long oldtce, oldweight;
>> +	struct page *page;
>> +
>> +	for (i = 0; i < pages; ++i, ++entry) {
>> +		if (!test_bit(entry - tbl->it_offset, tbl->it_map))
>> +			continue;
>> +
>> +		oldtce = ppc_md.tce_get(tbl, entry);
>> +		ppc_md.tce_free(tbl, entry, 1);
>> +
>> +		oldweight = syspage_weight_one(tbl->it_map,
>> +				entry - tbl->it_offset);
>> +		clr = __test_and_clear_bit(entry - tbl->it_offset,
>> +				tbl->it_map);
>> +
>> +		if (WARN_ON(!(oldtce & (TCE_PCI_WRITE | TCE_PCI_READ))))
>> +			continue;
>> +
>> +		page = pfn_to_page(oldtce >> PAGE_SHIFT);
>> +
>> +		if (WARN_ON(!page))
>> +			continue;
>> +
>> +		if (oldtce & TCE_PCI_WRITE)
>> +			SetPageDirty(page);
>> +
>> +		put_page(page);
>> +
>> +		/* That was the last IOMMU page within the system page */
>> +		if ((oldweight == 1) && clr)
>> +			++retpages;
>> +	}
>> +
>> +	return retpages;
>> +}
>> +
>> +/*
>> + * iommu_clear_tces clears tces and returned the number
>> + * of released system pages
>> + */
>> +long iommu_clear_tces(struct iommu_table *tbl, unsigned long ioba,
>> +		unsigned long size)
>> +{
>> +	int ret;
>> +	unsigned long entry = ioba >> IOMMU_PAGE_SHIFT;
>> +	unsigned long npages = size >> IOMMU_PAGE_SHIFT;
>> +	struct iommu_pool *pool = get_pool(tbl, entry);
>> +
>> +	if ((size & ~IOMMU_PAGE_MASK) || (ioba & ~IOMMU_PAGE_MASK))
>> +		return -EINVAL;
>> +
>> +	if ((ioba + size) > ((tbl->it_offset + tbl->it_size)
>> +			<< IOMMU_PAGE_SHIFT))
>> +		return -EINVAL;
>> +
>> +	if (ioba < (tbl->it_offset << IOMMU_PAGE_SHIFT))
>> +		return -EINVAL;
>> +
>> +	spin_lock(&(pool->lock));
>> +	ret = clear_tces_nolock(tbl, entry, npages);
>> +	tce_flush(tbl);
>> +	spin_unlock(&(pool->lock));
>> +
>> +	if (ret > 0) {
>> +		lock_acct(-ret);
>> +		return 0;
>> +	}
>> +
>> +	return ret;
>> +}
>> +EXPORT_SYMBOL_GPL(iommu_clear_tces);
>> +
>> +static int put_tce(struct iommu_table *tbl, unsigned long entry,
>> +		uint64_t tce, enum dma_data_direction direction)
>> +{
>> +	int ret;
>> +	struct page *page = NULL;
>> +	unsigned long kva, offset, oldweight;
>> +
>> +	/* Map new TCE */
>> +	offset = tce & IOMMU_PAGE_MASK & ~PAGE_MASK;
>> +	ret = get_user_pages_fast(tce & PAGE_MASK, 1,
>> +			direction != DMA_TO_DEVICE, &page);
>> +	if (ret != 1) {
>> +		pr_err("tce_vfio: get_user_pages_fast failed tce=%llx ioba=%lx ret=%d\n",
>> +				tce, entry << IOMMU_PAGE_SHIFT, ret);
>> +		return -EFAULT;
>> +	}
>> +
>> +	kva = (unsigned long) page_address(page);
>> +	kva += offset;
>> +
>> +	/* tce_build receives a virtual address */
>> +	ret = ppc_md.tce_build(tbl, entry, 1, kva, direction, NULL);
>> +
>> +	/* tce_build() only returns non-zero for transient errors */
>> +	if (unlikely(ret)) {
>> +		pr_err("tce_vfio: tce_put failed on tce=%llx ioba=%lx kva=%lx ret=%d\n",
>> +				tce, entry << IOMMU_PAGE_SHIFT, kva, ret);
>> +		put_page(page);
>> +		return -EIO;
>> +	}
>> +
>> +	/* Calculate if new system page has been locked */
>> +	oldweight = syspage_weight_zero(tbl->it_map, entry - tbl->it_offset);
>> +	__set_bit(entry - tbl->it_offset, tbl->it_map);
>> +
>> +	return (oldweight == 0) ? 1 : 0;
>> +}
>> +
>> +/*
>> + * iommu_put_tces builds tces and returned the number of actually
>> + * locked system pages
>> + */
>> +long iommu_put_tces(struct iommu_table *tbl, unsigned long ioba,
>> +		uint64_t tce, enum dma_data_direction direction,
>> +		unsigned long size)
>> +{
>> +	int i, ret = 0, retpages = 0;
>> +	unsigned long entry = ioba >> IOMMU_PAGE_SHIFT;
>> +	unsigned long npages = size >> IOMMU_PAGE_SHIFT;
>> +	struct iommu_pool *pool = get_pool(tbl, entry);
>> +	unsigned long locked, lock_limit;
>> +
>> +	BUILD_BUG_ON(PAGE_SIZE < IOMMU_PAGE_SIZE);
>> +	BUG_ON(direction == DMA_NONE);
>> +
>> +	if ((size & ~IOMMU_PAGE_MASK) ||
>> +			(ioba & ~IOMMU_PAGE_MASK) ||
>> +			(tce & ~IOMMU_PAGE_MASK))
>> +		return -EINVAL;
>> +
>> +	if ((ioba + size) > ((tbl->it_offset + tbl->it_size)
>> +			 << IOMMU_PAGE_SHIFT))
>> +		return -EINVAL;
>> +
>> +	if (ioba < (tbl->it_offset << IOMMU_PAGE_SHIFT))
>> +		return -EINVAL;
>> +
>> +	/* Account for locked pages */
>> +	locked = current->mm->locked_vm +
>> +		(_ALIGN_UP(size, PAGE_SIZE) >> PAGE_SHIFT);
>
> Looks like we just over penalize upfront and correct when mapped, that's
> better, but not great.

What would be great? :)


>> +	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
>> +	if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
>> +		pr_warn("RLIMIT_MEMLOCK (%ld) exceeded\n",
>> +				rlimit(RLIMIT_MEMLOCK));
>> +		return -ENOMEM;
>> +	}
>> +
>> +	spin_lock(&(pool->lock));
>> +
>> +	/* Check if any is in use */
>> +	for (i = 0; i < npages; ++i) {
>> +		if (test_bit(entry + i - tbl->it_offset, tbl->it_map)) {
>> +			spin_unlock(&(pool->lock));
>> +			return -EBUSY;
>> +		}
>> +	}
>> +
>> +	/* Put tces to the table */
>> +	for (i = 0; (i < npages) && (ret >= 0); ++i, tce += IOMMU_PAGE_SIZE) {
>> +		ret = put_tce(tbl, entry + i, tce, direction);
>> +		if (ret == 1)
>> +			++retpages;
>> +	}
>> +
>> +	/*
>> +	 * If failed, release locked pages, otherwise return the number
>> +	 * of locked system pages
>> +	 */
>> +	if (ret < 0) {
>> +		clear_tces_nolock(tbl, entry, i);
>> +	} else {
>> +		if (retpages)
>> +			lock_acct(retpages);
>> +		ret = 0;
>> +	}
>
> Bug, if it fails we clear, which decrements our locked pages, but we
> haven't incremented them yet.  Thanks,


The static clear_tces_nolock() does not touch the counter; the extern 
iommu_clear_tces() does. Or did I miss your point?


-- 
Alexey

^ permalink raw reply	[flat|nested] 122+ messages in thread

* Re: [PATCH] vfio powerpc: enabled on powernv platform
  2012-12-12 14:34                         ` Alex Williamson
@ 2012-12-13  2:29                           ` Benjamin Herrenschmidt
  0 siblings, 0 replies; 122+ messages in thread
From: Benjamin Herrenschmidt @ 2012-12-13  2:29 UTC (permalink / raw)
  To: Alex Williamson
  Cc: Alexey Kardashevskiy, Paul Mackerras, linuxppc-dev, linux-kernel,
	kvm, David Gibson

On Wed, 2012-12-12 at 07:34 -0700, Alex Williamson wrote:
> > But what would I put there?... IOMMU ID is more than enough at the moment 
> > and struct iommu_table does not have anything what would have made sense to 
> > show in the sysfs...
> 
> I believe David mentioned that PEs had user visible names.  Perhaps they
> match an enclosure location or something.  Group numbers are rather
> arbitrary and really have no guarantee of persistence.  Thanks, 

I agree. Make up something, for example domain[PE] or something like
that.
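
Something like this perhaps (rough sketch; assumes the PE number, pe_num
here, is at hand where the group is allocated in tce_iommu_init()):

	char name[24];

	/* pe_num is assumed to be available at this point */
	snprintf(name, sizeof(name), "domain%d", pe_num);
	iommu_group_set_name(grp, name);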

Cheers,
Ben.

^ permalink raw reply	[flat|nested] 122+ messages in thread

* Re: [PATCH] vfio powerpc: enabled on powernv platform
  2012-12-12 23:30                         ` Alex Williamson
@ 2012-12-13  2:39                           ` Benjamin Herrenschmidt
  -1 siblings, 0 replies; 122+ messages in thread
From: Benjamin Herrenschmidt @ 2012-12-13  2:39 UTC (permalink / raw)
  To: Alex Williamson
  Cc: Alexey Kardashevskiy, Paul Mackerras, linuxppc-dev, linux-kernel,
	kvm, David Gibson

On Wed, 2012-12-12 at 16:30 -0700, Alex Williamson wrote:

> Locked page accounting in this version is very, very broken.  How do
> powerpc folks feel about seemingly generic kernel iommu interfaces
> messing with the current task mm?  Besides that, more problems below...

Not good at all :-)

I don't understand tho ... H_PUT_TCE calls should happen in the qemu
context (or the guest), i.e. with it as current at the point of the call,
so everything should be accounted fine on the *current* task when those
calls occur. What's the point of the work queue, Alexey ?

This code looks horribly complicated ... where does it come from ?
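
If current->mm is indeed always valid at that point, the whole thing
could be as simple as (sketch):

	/* Sketch: no deferral needed if H_PUT_TCE always runs with
	 * the owning task as current */
	static void lock_acct(long npage)
	{
		if (!current->mm)
			return;

		down_write(&current->mm->mmap_sem);
		current->mm->locked_vm += npage;
		up_write(&current->mm->mmap_sem);
	}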

> > +/*
> > + * iommu_reset_table is called when it started/stopped being used.
> > + *
> > + * restore==true says to bring the iommu_table into the state as it was
> > + * before being used by VFIO.
> > + */
> > +void iommu_reset_table(struct iommu_table *tbl, bool restore)
> > +{
> > +	/* Page#0 is marked as used in iommu_init_table, so we clear it... */
> > +	if (!restore && (tbl->it_offset == 0))
> > +		clear_bit(0, tbl->it_map);
> > +
> > +	iommu_clear_tces(tbl, tbl->it_offset, tbl->it_size);
> 
> This does locked page accounting and unpins pages, even on startup when
> the pages aren't necessarily pinned or accounted against the current
> process.

Not sure what you mean, Alex, and not sure either what Alexey's
implementation actually does, but indeed, pages inside an iommu table
that was used by the host don't have their refcount elevated just because
they are there.

So when taking ownership of an iommu for vfio, you probably need to FAIL
if any page is already mapped. Only once you know the iommu is clear for
use can you start populating it and accounting for anything you put in it
(and de-accounting anything you remove from it when cleaning things up),
along the lines of the sketch below.
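
Something along these lines (a rough sketch; iommu_take_ownership() is a
made-up name):

	static long iommu_take_ownership(struct iommu_table *tbl)
	{
		unsigned long sz = (tbl->it_size + 7) >> 3;

		/* iommu_init_table() reserves bit 0 on tables at
		 * offset 0, clear it for the emptiness test */
		if (tbl->it_offset == 0)
			clear_bit(0, tbl->it_map);

		if (!bitmap_empty(tbl->it_map, tbl->it_size)) {
			if (tbl->it_offset == 0)
				set_bit(0, tbl->it_map);
			return -EBUSY;	/* host still uses the table */
		}

		/* reserve everything so the host cannot allocate here */
		memset(tbl->it_map, 0xff, sz);
		return 0;
	}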

> > +
> > +	/* ... or restore  */
> > +	if (restore && (tbl->it_offset == 0))
> > +		set_bit(0, tbl->it_map);
> > +}
> > +EXPORT_SYMBOL_GPL(iommu_reset_table);
> > +
> > +/*
> > + * Returns the number of used IOMMU pages (4K) within
> > + * the same system page (4K or 64K).
> > + *
> > + * syspage_weight_zero is optimized for expected case == 0
> > + * syspage_weight_one is optimized for expected case > 1
> > + * Other case are not used in this file.
> > + */
> > +#if PAGE_SIZE == IOMMU_PAGE_SIZE
> > +
> > +#define syspage_weight_zero(map, offset)	test_bit((map), (offset))
> > +#define syspage_weight_one(map, offset)		test_bit((map), (offset))
> > +
> > +#elif PAGE_SIZE/IOMMU_PAGE_SIZE == 16
> > +
> > +static int syspage_weight_zero(unsigned long *map, unsigned long offset)
> > +{
> > +	offset &= PAGE_MASK >> IOMMU_PAGE_SHIFT;
> > +	return 0xffffUL & (map[BIT_WORD(offset)] >>
> > +			(offset & (BITS_PER_LONG-1)));
> > +}
> 
> I would have expected these to be bools and return true if the weight
> matches the value.

What is that business anyway ? It's very obscure.

> If you replaced 0xffff above w/ this, would you need the #error below?
> 
> (1UL << (PAGE_SIZE/IOMMU_PAGE_SIZE)) - 1)
> 
> > +
> > +static int syspage_weight_one(unsigned long *map, unsigned long offset)
> > +{
> > +	int ret = 0, nbits = PAGE_SIZE/IOMMU_PAGE_SIZE;
> > +
> > +	/* Aligns TCE entry number to system page boundary */
> > +	offset &= PAGE_MASK >> IOMMU_PAGE_SHIFT;
> > +
> > +	/* Count used 4K pages */
> > +	while (nbits && (ret < 2)) {
> 
> Don't you have a ffs()?  Could also be used for _zero.  Surely there are
> some bitops helpers that could help here even on big endian.  hweight
> really doesn't work?
> 
> > +		if (test_bit(offset, map))
> > +			++ret;
> > +
> > +		--nbits;
> > +		++offset;
> > +	}
> > +
> > +	return ret;
> > +}
> > +#else
> > +#error TODO: support other page size
> > +#endif

What combinations do you support ?

> > +static void tce_flush(struct iommu_table *tbl)
> > +{
> > +	/* Flush/invalidate TLB caches if necessary */
> > +	if (ppc_md.tce_flush)
> > +		ppc_md.tce_flush(tbl);
> > +
> > +	/* Make sure updates are seen by hardware */
> > +	mb();
> > +}
> > +
> > +/*
> > + * iommu_clear_tces clears tces and returned the number of system pages
> > + * which it called put_page() on
> > + */
> > +static long clear_tces_nolock(struct iommu_table *tbl, unsigned long entry,
> > +		unsigned long pages)
> > +{
> > +	int i, retpages = 0, clr;
> > +	unsigned long oldtce, oldweight;
> > +	struct page *page;
> > +
> > +	for (i = 0; i < pages; ++i, ++entry) {
> > +		if (!test_bit(entry - tbl->it_offset, tbl->it_map))
> > +			continue;
> > +
> > +		oldtce = ppc_md.tce_get(tbl, entry);
> > +		ppc_md.tce_free(tbl, entry, 1);
> > +
> > +		oldweight = syspage_weight_one(tbl->it_map,
> > +				entry - tbl->it_offset);
> > +		clr = __test_and_clear_bit(entry - tbl->it_offset,
> > +				tbl->it_map);
> > +
> > +		if (WARN_ON(!(oldtce & (TCE_PCI_WRITE | TCE_PCI_READ))))
> > +			continue;
> > +
> > +		page = pfn_to_page(oldtce >> PAGE_SHIFT);
> > +
> > +		if (WARN_ON(!page))
> > +			continue;
> > +
> > +		if (oldtce & TCE_PCI_WRITE)
> > +			SetPageDirty(page);
> > +
> > +		put_page(page);
> > +
> > +		/* That was the last IOMMU page within the system page */
> > +		if ((oldweight == 1) && clr)
> > +			++retpages;
> > +	}
> > +
> > +	return retpages;
> > +}
> > +
> > +/*
> > + * iommu_clear_tces clears tces and returned the number
> > + * of released system pages
> > + */
> > +long iommu_clear_tces(struct iommu_table *tbl, unsigned long ioba,
> > +		unsigned long size)
> > +{
> > +	int ret;
> > +	unsigned long entry = ioba >> IOMMU_PAGE_SHIFT;
> > +	unsigned long npages = size >> IOMMU_PAGE_SHIFT;
> > +	struct iommu_pool *pool = get_pool(tbl, entry);
> > +
> > +	if ((size & ~IOMMU_PAGE_MASK) || (ioba & ~IOMMU_PAGE_MASK))
> > +		return -EINVAL;
> > +
> > +	if ((ioba + size) > ((tbl->it_offset + tbl->it_size)
> > +			<< IOMMU_PAGE_SHIFT))
> > +		return -EINVAL;
> > +
> > +	if (ioba < (tbl->it_offset << IOMMU_PAGE_SHIFT))
> > +		return -EINVAL;
> > +
> > +	spin_lock(&(pool->lock));
> > +	ret = clear_tces_nolock(tbl, entry, npages);
> > +	tce_flush(tbl);
> > +	spin_unlock(&(pool->lock));

Why are you messing with the pools and their locks ? These are only
relevant for the in-kernel use of the table. The table should be locked
out of kernel use when given to vfio (we could add a flag to make any
kernel dma mapping attempt to fail).
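
E.g. a rough sketch, with a made-up "it_taken" flag in struct iommu_table,
checked at the top of iommu_range_alloc():

	/* "it_taken" is a made-up flag set while vfio owns the
	 * table; any host-side mapping attempt then fails cleanly */
	if (unlikely(tbl->it_taken))
		return DMA_ERROR_CODE;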

> > +	if (ret > 0) {
> > +		lock_acct(-ret);
> > +		return 0;
> > +	}
> > +
> > +	return ret;
> > +}
> > +EXPORT_SYMBOL_GPL(iommu_clear_tces);
> > +
> > +static int put_tce(struct iommu_table *tbl, unsigned long entry,
> > +		uint64_t tce, enum dma_data_direction direction)
> > +{
> > +	int ret;
> > +	struct page *page = NULL;
> > +	unsigned long kva, offset, oldweight;
> > +
> > +	/* Map new TCE */
> > +	offset = tce & IOMMU_PAGE_MASK & ~PAGE_MASK;
> > +	ret = get_user_pages_fast(tce & PAGE_MASK, 1,
> > +			direction != DMA_TO_DEVICE, &page);
> > +	if (ret != 1) {
> > +		pr_err("tce_vfio: get_user_pages_fast failed tce=%llx ioba=%lx ret=%d\n",
> > +				tce, entry << IOMMU_PAGE_SHIFT, ret);
> > +		return -EFAULT;
> > +	}
> > +
> > +	kva = (unsigned long) page_address(page);
> > +	kva += offset;
> > +
> > +	/* tce_build receives a virtual address */
> > +	ret = ppc_md.tce_build(tbl, entry, 1, kva, direction, NULL);
> > +
> > +	/* tce_build() only returns non-zero for transient errors */
> > +	if (unlikely(ret)) {
> > +		pr_err("tce_vfio: tce_put failed on tce=%llx ioba=%lx kva=%lx ret=%d\n",
> > +				tce, entry << IOMMU_PAGE_SHIFT, kva, ret);
> > +		put_page(page);
> > +		return -EIO;
> > +	}
> > +
> > +	/* Calculate if new system page has been locked */
> > +	oldweight = syspage_weight_zero(tbl->it_map, entry - tbl->it_offset);
> > +	__set_bit(entry - tbl->it_offset, tbl->it_map);
> > +
> > +	return (oldweight == 0) ? 1 : 0;
> > +}
> > +
> > +/*
> > + * iommu_put_tces builds TCEs and returns the number of actually
> > + * locked system pages
> > + */
> > +long iommu_put_tces(struct iommu_table *tbl, unsigned long ioba,
> > +		uint64_t tce, enum dma_data_direction direction,
> > +		unsigned long size)
> > +{
> > +	int i, ret = 0, retpages = 0;
> > +	unsigned long entry = ioba >> IOMMU_PAGE_SHIFT;
> > +	unsigned long npages = size >> IOMMU_PAGE_SHIFT;
> > +	struct iommu_pool *pool = get_pool(tbl, entry);
> > +	unsigned long locked, lock_limit;
> > +
> > +	BUILD_BUG_ON(PAGE_SIZE < IOMMU_PAGE_SIZE);
> > +	BUG_ON(direction == DMA_NONE);
> > +
> > +	if ((size & ~IOMMU_PAGE_MASK) ||
> > +			(ioba & ~IOMMU_PAGE_MASK) ||
> > +			(tce & ~IOMMU_PAGE_MASK))
> > +		return -EINVAL;
> > +
> > +	if ((ioba + size) > ((tbl->it_offset + tbl->it_size)
> > +			 << IOMMU_PAGE_SHIFT))
> > +		return -EINVAL;
> > +
> > +	if (ioba < (tbl->it_offset << IOMMU_PAGE_SHIFT))
> > +		return -EINVAL;
> > +
> > +	/* Account for locked pages */
> > +	locked = current->mm->locked_vm +
> > +		(_ALIGN_UP(size, PAGE_SIZE) >> PAGE_SHIFT);
> 
> Looks like we just over penalize upfront and correct when mapped, that's
> better, but not great.
> 
> > +	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> > +	if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
> > +		pr_warn("RLIMIT_MEMLOCK (%ld) exceeded\n",
> > +				rlimit(RLIMIT_MEMLOCK));
> > +		return -ENOMEM;
> > +	}
> > +
> > +	spin_lock(&(pool->lock));
> > +
> > +	/* Check if any is in use */
> > +	for (i = 0; i < npages; ++i) {
> > +		if (test_bit(entry + i - tbl->it_offset, tbl->it_map)) {
> > +			spin_unlock(&(pool->lock));
> > +			return -EBUSY;
> > +		}
> > +	}
> > +
> > +	/* Put tces to the table */
> > +	for (i = 0; (i < npages) && (ret >= 0); ++i, tce += IOMMU_PAGE_SIZE) {
> > +		ret = put_tce(tbl, entry + i, tce, direction);
> > +		if (ret == 1)
> > +			++retpages;
> > +	}
> > +
> > +	/*
> > +	 * If failed, release locked pages, otherwise return the number
> > +	 * of locked system pages
> > +	 */
> > +	if (ret < 0) {
> > +		clear_tces_nolock(tbl, entry, i);
> > +	} else {
> > +		if (retpages)
> > +			lock_acct(retpages);
> > +		ret = 0;
> > +	}
> 
> Bug, if it fails we clear, which decrements our locked pages, but we
> haven't incremented them yet.  Thanks,
> 
> Alex
> 
> > +
> > +	tce_flush(tbl);
> > +	spin_unlock(&(pool->lock));
> > +
> > +	return ret;
> > +}
> > +EXPORT_SYMBOL_GPL(iommu_put_tces);
> > +
> > +#endif /* CONFIG_IOMMU_API */
> > diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
> > index 05205cf..1b970bf 100644
> > --- a/arch/powerpc/platforms/powernv/pci.c
> > +++ b/arch/powerpc/platforms/powernv/pci.c
> > @@ -20,6 +20,7 @@
> >  #include <linux/irq.h>
> >  #include <linux/io.h>
> >  #include <linux/msi.h>
> > +#include <linux/iommu.h>
> >  
> >  #include <asm/sections.h>
> >  #include <asm/io.h>
> > @@ -613,3 +614,136 @@ void __init pnv_pci_init(void)
> >  	ppc_md.teardown_msi_irqs = pnv_teardown_msi_irqs;
> >  #endif
> >  }
> > +
> > +#ifdef CONFIG_IOMMU_API
> > +/*
> > + * IOMMU groups support required by VFIO
> > + */
> > +static int add_device(struct device *dev)
> > +{
> > +	struct iommu_table *tbl;
> > +	int ret = 0;
> > +
> > +	if (WARN_ON(dev->iommu_group)) {
> > +		pr_warn("tce_vfio: device %s is already in iommu group %d, skipping\n",
> > +				dev_name(dev),
> > +				iommu_group_id(dev->iommu_group));
> > +		return -EBUSY;
> > +	}
> > +
> > +	tbl = get_iommu_table_base(dev);
> > +	if (!tbl) {
> > +		pr_debug("tce_vfio: skipping device %s with no tbl\n",
> > +				dev_name(dev));
> > +		return 0;
> > +	}
> > +
> > +	pr_debug("tce_vfio: adding %s to iommu group %d\n",
> > +			dev_name(dev), iommu_group_id(tbl->it_group));
> > +
> > +	ret = iommu_group_add_device(tbl->it_group, dev);
> > +	if (ret < 0)
> > +		pr_err("tce_vfio: %s has not been added, ret=%d\n",
> > +				dev_name(dev), ret);
> > +
> > +	return ret;
> > +}
> > +
> > +static void del_device(struct device *dev)
> > +{
> > +	iommu_group_remove_device(dev);
> > +}
> > +
> > +static int iommu_bus_notifier(struct notifier_block *nb,
> > +			      unsigned long action, void *data)
> > +{
> > +	struct device *dev = data;
> > +
> > +	switch (action) {
> > +	case BUS_NOTIFY_ADD_DEVICE:
> > +		return add_device(dev);
> > +	case BUS_NOTIFY_DEL_DEVICE:
> > +		del_device(dev);
> > +		return 0;
> > +	default:
> > +		return 0;
> > +	}
> > +}
> > +
> > +static struct notifier_block tce_iommu_bus_nb = {
> > +	.notifier_call = iommu_bus_notifier,
> > +};
> > +
> > +static void group_release(void *iommu_data)
> > +{
> > +	struct iommu_table *tbl = iommu_data;
> > +	tbl->it_group = NULL;
> > +}
> > +
> > +static int __init tce_iommu_init(void)
> > +{
> > +	struct pci_dev *pdev = NULL;
> > +	struct iommu_table *tbl;
> > +	struct iommu_group *grp;
> > +
> > +	/* Allocate and initialize IOMMU groups */
> > +	for_each_pci_dev(pdev) {
> > +		tbl = get_iommu_table_base(&pdev->dev);
> > +		if (!tbl)
> > +			continue;
> > +
> > +		/* Skip already initialized */
> > +		if (tbl->it_group)
> > +			continue;
> > +
> > +		grp = iommu_group_alloc();
> > +		if (IS_ERR(grp)) {
> > +			pr_info("tce_vfio: cannot create new IOMMU group, ret=%ld\n",
> > +					PTR_ERR(grp));
> > +			return PTR_ERR(grp);
> > +		}
> > +		tbl->it_group = grp;
> > +		iommu_group_set_iommudata(grp, tbl, group_release);
> > +	}
> > +
> > +	bus_register_notifier(&pci_bus_type, &tce_iommu_bus_nb);
> > +
> > +	/* Add PCI devices to VFIO groups */
> > +	for_each_pci_dev(pdev)
> > +		add_device(&pdev->dev);
> > +
> > +	return 0;
> > +}
> > +
> > +static void __exit tce_iommu_cleanup(void)
> > +{
> > +	struct pci_dev *pdev = NULL;
> > +	struct iommu_table *tbl;
> > +	struct iommu_group *grp = NULL;
> > +
> > +	bus_unregister_notifier(&pci_bus_type, &tce_iommu_bus_nb);
> > +
> > +	/* Delete PCI devices from VFIO groups */
> > +	for_each_pci_dev(pdev)
> > +		del_device(&pdev->dev);
> > +
> > +	/* Release VFIO groups */
> > +	for_each_pci_dev(pdev) {
> > +		tbl = get_iommu_table_base(&pdev->dev);
> > +		if (!tbl)
> > +			continue;
> > +		grp = tbl->it_group;
> > +
> > +		/* Skip (already) uninitialized */
> > +		if (!grp)
> > +			continue;
> > +
> > +		/* Do actual release, group_release() is expected to work */
> > +		iommu_group_put(grp);
> > +		BUG_ON(tbl->it_group);
> > +	}
> > +}
> > +
> > +module_init(tce_iommu_init);
> > +module_exit(tce_iommu_cleanup);
> > +#endif /* CONFIG_IOMMU_API */
> > diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
> > index 9f69b56..29d11dc 100644
> > --- a/drivers/iommu/Kconfig
> > +++ b/drivers/iommu/Kconfig
> > @@ -187,4 +187,12 @@ config EXYNOS_IOMMU_DEBUG
> >  
> >  	  Say N unless you need kernel log message for IOMMU debugging
> >  
> > +config SPAPR_TCE_IOMMU
> > +	bool "sPAPR TCE IOMMU Support"
> > +	depends on PPC_POWERNV
> > +	select IOMMU_API
> > +	help
> > +	  Enables the bits of the IOMMU API required by VFIO. The iommu_ops
> > +	  callbacks are still not implemented.
> > +
> >  endif # IOMMU_SUPPORT
> 
> 



* Re: [PATCH] vfio powerpc: enabled on powernv platform
  2012-12-12 23:30                         ` Alex Williamson
@ 2012-12-13  2:57                           ` Benjamin Herrenschmidt
  -1 siblings, 0 replies; 122+ messages in thread
From: Benjamin Herrenschmidt @ 2012-12-13  2:57 UTC (permalink / raw)
  To: Alex Williamson
  Cc: Alexey Kardashevskiy, Paul Mackerras, linuxppc-dev, linux-kernel,
	kvm, David Gibson

On Wed, 2012-12-12 at 16:30 -0700, Alex Williamson wrote:
> Locked page accounting in this version is very, very broken.  How do
> powerpc folks feel about seemingly generic kernel iommu interfaces
> messing with the current task mm?  Besides that, more problems
> below...

After a second look & thought...

This whole accounting business is fucked. First, we simply can't just
randomly return errors from H_PUT_TCE because the process reached some
rlimit. This is not a proper failure mode. That means that the guest
will probably panic() ... possibly right in the middle of some disk
writeback or god knows what. Not good.

Also the overhead of doing all that crap on every TCE map/unmap is
ridiculous.

Finally, it's just not going to work for real mode, which we really want,
since we can't take the mmap-sem in real mode anyway, so unless we
convert that counter to an atomic, we can't do it.
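
Roughly, and purely as a sketch -- locked_vm is a plain unsigned long
today, so the atomic field below is hypothetical:

	/* usable without mmap-sem, hence a candidate for real mode */
	if (atomic_long_add_return(npages, &mm->locked_vm) > lock_limit) {
		atomic_long_sub(npages, &mm->locked_vm);
		return -ENOMEM;	/* or punt to virtual mode, ie H_TOO_HARD */
	}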

I'd suggest just not bothering, or if you want to bother, check once
when creating a TCE table that the rlimit is enough to bolt as many
pages as can be populated in that table and fail to create *that*. The
failure mode is much better, i.e., qemu failing to create a PCI bus due to
insufficient rlimits.
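
Roughly (a sketch only; the helper and its call site are made up, but
this is the shape of it):

	/* called once, when userspace takes ownership of the table */
	static int tce_preaccount_window(struct iommu_table *tbl)
	{
		unsigned long npages, locked, lock_limit;

		/* the DMA window is fixed: it_size IOMMU pages */
		npages = (tbl->it_size << IOMMU_PAGE_SHIFT) >> PAGE_SHIFT;

		down_write(&current->mm->mmap_sem);
		locked = current->mm->locked_vm + npages;
		lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
		if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
			up_write(&current->mm->mmap_sem);
			return -ENOMEM;
		}
		current->mm->locked_vm = locked;
		up_write(&current->mm->mmap_sem);
		return 0;
	}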

Cheers,
Ben.



* Re: [PATCH] vfio powerpc: enabled on powernv platform
  2012-12-13  2:57                           ` Benjamin Herrenschmidt
@ 2012-12-13  3:22                             ` Alex Williamson
  -1 siblings, 0 replies; 122+ messages in thread
From: Alex Williamson @ 2012-12-13  3:22 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: Alexey Kardashevskiy, Paul Mackerras, linuxppc-dev, linux-kernel,
	kvm, David Gibson

On Thu, 2012-12-13 at 13:57 +1100, Benjamin Herrenschmidt wrote:
> On Wed, 2012-12-12 at 16:30 -0700, Alex Williamson wrote:
> > Locked page accounting in this version is very, very broken.  How do
> > powerpc folks feel about seemingly generic kernel iommu interfaces
> > messing with the current task mm?  Besides that, more problems
> > below...
> 
> After a second look & thought...
> 
> This whole accounting business is fucked. First, we simply can't just
> randomly return errors from H_PUT_TCE because the process reached some
> rlimit. This is not a proper failure mode. That means that the guest
> will probably panic() ... possibly right in the middle of some disk
> writeback or god knows what. Not good.
> 
> Also the overhead of doing all that crap on every TCE map/unmap is
> ridiculous.
> 
> Finally, it's just not going to work for real mode which we really want,
> since we can't take the mmap-sem in real mode anyway, so unless we
> convert that counter to an atomic, we can't do it.
> 
> I'd suggest just not bothering, or if you want to bother, check once
> when creating a TCE table that the rlimit is enough to bolt as many
> pages as can be populated in that table and fail to create *that*. The
> failure mode is much better, ie, qemu failing to create a PCI bus due to
> insufficient rlimits.

I agree, we don't seem to be headed in the right direction.  x86 needs
to track rlimits or else a user can exploit the interface to pin all the
memory in the system.  On power, only the iova window can be pinned, so
it's a fixed amount.  I could see it as: granting access to a group
implicitly grants access to pinning the iova window.  We can still make
it more explicit by handling the rlimit accounting upfront.  Thanks,

Alex


* Re: [PATCH] vfio powerpc: enabled on powernv platform
  2012-12-13  2:29                           ` Benjamin Herrenschmidt
@ 2012-12-13  6:27                             ` Alexey Kardashevskiy
  -1 siblings, 0 replies; 122+ messages in thread
From: Alexey Kardashevskiy @ 2012-12-13  6:27 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: Alex Williamson, Paul Mackerras, linuxppc-dev, linux-kernel, kvm,
	David Gibson

On 13/12/12 13:29, Benjamin Herrenschmidt wrote:
> On Wed, 2012-12-12 at 07:34 -0700, Alex Williamson wrote:
>>> But what would I put there?... IOMMU ID is more than enough at the moment
>>> and struct iommu_table does not have anything what would have made sense to
>>> show in the sysfs...
>>
>> I believe David mentioned that PEs had user visible names.  Perhaps they
>> match an enclosure location or something.  Group numbers are rather
>> arbitrary and really have no guarantee of persistence.  Thanks,
>
> I agree. Make up something, for example domain[PE] or something like
> that.

To be able to add a PE number, I need to call iommu_group_alloc() in the 
correct place, where I know this number, OR carry it in iommu_table 
until iommu_group_alloc() is called (acceptable but not cool).

I will post a patch which would help as a response to this mail.
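
For the name itself, something along these lines would do (a sketch;
pe_num and the call site are assumed):

	char name[32];

	grp = iommu_group_alloc();
	if (!IS_ERR(grp)) {
		snprintf(name, sizeof(name), "domain%d", pe_num);
		/* iommu_group_set_name() copies the string */
		iommu_group_set_name(grp, name);
	}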


-- 
Alexey

* [PATCH] iommu: moving initialization earlier
  2012-12-13  2:29                           ` Benjamin Herrenschmidt
  (?)
  (?)
@ 2012-12-13  6:28                           ` Alexey Kardashevskiy
  2012-12-13 15:48                               ` Alex Williamson
  -1 siblings, 1 reply; 122+ messages in thread
From: Alexey Kardashevskiy @ 2012-12-13  6:28 UTC (permalink / raw)
  To: Alex Williamson
  Cc: Alexey Kardashevskiy, Benjamin Herrenschmidt, linux-kernel

The iommu_init() call initializes the IOMMU internal structures and data
required for API calls such as iommu_group_alloc() to work.
It is registered as a subsys_initcall.

One of the IOMMU users is the PCI subsystem on POWER, which discovers new
IOMMU tables during the PCI scan, so the most logical place to call
iommu_group_alloc() is when a new group is just discovered. However, the
PCI scan is also done from a subsys_initcall hook, which makes using
the IOMMU API there impossible.

The patch moves IOMMU subsystem initialization one step earlier.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
 drivers/iommu/iommu.c |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index de857bb..b0afd3d 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -865,7 +865,7 @@ printk("%s %u\n", __func__, __LINE__);
 
 	return 0;
 }
-subsys_initcall(iommu_init);
+arch_initcall(iommu_init);
 
 int iommu_domain_get_attr(struct iommu_domain *domain,
 			  enum iommu_attr attr, void *data)
-- 
1.7.10.4
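
For reference, the initcall ordering this patch relies on (paraphrasing
include/linux/init.h, earliest first):

	/*
	 * core_initcall -> postcore_initcall -> arch_initcall ->
	 * subsys_initcall -> fs_initcall -> device_initcall -> late_initcall
	 *
	 * Moving iommu_init() from subsys_initcall to arch_initcall makes it
	 * run before the POWER PCI scan, which is itself a subsys_initcall.
	 */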



* Re: [PATCH] iommu: moving initialization earlier
@ 2012-12-13 15:48                               ` Alex Williamson
  0 siblings, 0 replies; 122+ messages in thread
From: Alex Williamson @ 2012-12-13 15:48 UTC (permalink / raw)
  To: Alexey Kardashevskiy; +Cc: Benjamin Herrenschmidt, linux-kernel, iommu, joro

Probably a good idea to CC the iommu list and maintainer...

On Thu, 2012-12-13 at 17:28 +1100, Alexey Kardashevskiy wrote:
> The iommu_init() call initializes the IOMMU internal structures and data
> required for API calls such as iommu_group_alloc() to work.
> It is registered as a subsys_initcall.
> 
> One of the IOMMU users is the PCI subsystem on POWER, which discovers new
> IOMMU tables during the PCI scan, so the most logical place to call
> iommu_group_alloc() is when a new group is just discovered. However, the
> PCI scan is also done from a subsys_initcall hook, which makes using
> the IOMMU API there impossible.
> 
> The patch moves IOMMU subsystem initialization one step earlier.
> 
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> ---
>  drivers/iommu/iommu.c |    2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
> index de857bb..b0afd3d 100644
> --- a/drivers/iommu/iommu.c
> +++ b/drivers/iommu/iommu.c
> @@ -865,7 +865,7 @@ printk("%s %u\n", __func__, __LINE__);
>  
>  	return 0;
>  }
> -subsys_initcall(iommu_init);
> +arch_initcall(iommu_init);
>  
>  int iommu_domain_get_attr(struct iommu_domain *domain,
>  			  enum iommu_attr attr, void *data)




* Re: [PATCH] iommu: moving initialization earlier
@ 2012-12-16 11:20                                 ` Joerg Roedel
  0 siblings, 0 replies; 122+ messages in thread
From: Joerg Roedel @ 2012-12-16 11:20 UTC (permalink / raw)
  To: Alex Williamson
  Cc: Alexey Kardashevskiy, Benjamin Herrenschmidt, linux-kernel, iommu

Alexey,

On Thu, Dec 13, 2012 at 08:48:55AM -0700, Alex Williamson wrote:
> Probably a good idea to CC the iommu list and maintainer...
> 
> On Thu, 2012-12-13 at 17:28 +1100, Alexey Kardashevskiy wrote:
> > Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>

Please resend the patch when the merge-window is closed.

Thanks,

	Joerg



* Re: [PATCH] iommu: moving initialization earlier
@ 2013-01-04  8:21                                   ` Alexey Kardashevskiy
  0 siblings, 0 replies; 122+ messages in thread
From: Alexey Kardashevskiy @ 2013-01-04  8:21 UTC (permalink / raw)
  To: Joerg Roedel; +Cc: linux-kernel, iommu

On 16/12/12 22:20, Joerg Roedel wrote:
> Alexey,
>
> On Thu, Dec 13, 2012 at 08:48:55AM -0700, Alex Williamson wrote:
>> Probably a good idea to CC the iommu list and maintainer...
>>
>> On Thu, 2012-12-13 at 17:28 +1100, Alexey Kardashevskiy wrote:
>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>
> Please resend the patch when the merge-window is closed.

Is it closed now? Not sure I entirely understand which window you had in 
mind :)


-- 
Alexey

* Re: [PATCH] iommu: moving initialization earlier
@ 2013-01-06  9:49                                     ` Joerg Roedel
  0 siblings, 0 replies; 122+ messages in thread
From: Joerg Roedel @ 2013-01-06  9:49 UTC (permalink / raw)
  To: Alexey Kardashevskiy; +Cc: linux-kernel, iommu

On Fri, Jan 04, 2013 at 07:21:34PM +1100, Alexey Kardashevskiy wrote:
> On 16/12/12 22:20, Joerg Roedel wrote:

> >Please resend the patch when the merge-window is closed.
> 
> is it closed now? not sure I entirely understand what window you
> kept in mind :)

Yes, it is closed now :-) The merge-window is the time between a Linux
kernel release (like v3.7) and the next release candidate (like
v3.8-rc1).


	Joerg



Thread overview: 122+ messages
     [not found] <20121113033832.GW4696@truffula.fritz.box>
2012-11-20  0:48 ` [PATCH] vfio powerpc: enabled and supported on powernv platform Alexey Kardashevskiy
2012-11-20  0:48   ` Alexey Kardashevskiy
2012-11-20 18:19   ` Alex Williamson
2012-11-20 18:19     ` Alex Williamson
2012-11-22 11:56     ` Sethi Varun-B16395
2012-11-22 11:56       ` Sethi Varun-B16395
2012-11-22 11:56       ` Sethi Varun-B16395
2012-11-23  2:02       ` Alexey Kardashevskiy
2012-11-23  2:02         ` Alexey Kardashevskiy
2012-11-26 15:18         ` Alex Williamson
2012-11-26 15:18           ` Alex Williamson
2012-11-26 18:04           ` Alex Williamson
2012-11-26 18:04             ` Alex Williamson
2012-11-27  0:21             ` Benjamin Herrenschmidt
2012-11-27  0:21               ` Benjamin Herrenschmidt
2012-11-27  3:28             ` Alexey Kardashevskiy
2012-11-27  3:28               ` Alexey Kardashevskiy
2012-11-27  4:23               ` Alex Williamson
2012-11-27  4:23                 ` Alex Williamson
2012-11-26 15:08       ` Alex Williamson
2012-11-26 15:08         ` Alex Williamson
2012-11-23  9:03     ` [PATCH 0/2] vfio powerpc: implemented and enabled Alexey Kardashevskiy
2012-11-23  9:03       ` Alexey Kardashevskiy
2012-11-23  9:03       ` [PATCH 1/2] vfio powerpc: implemented IOMMU driver for VFIO Alexey Kardashevskiy
2012-11-23  9:03         ` Alexey Kardashevskiy
2012-11-26 18:20         ` Alex Williamson
2012-11-26 18:20           ` Alex Williamson
2012-11-27  4:06           ` Alexey Kardashevskiy
2012-11-27  4:06             ` Alexey Kardashevskiy
2012-11-27  4:29             ` Alex Williamson
2012-11-27  4:29               ` Alex Williamson
2012-11-27  4:58               ` Alexey Kardashevskiy
2012-11-27  4:58                 ` Alexey Kardashevskiy
2012-11-27  5:06                 ` David Gibson
2012-11-27  5:06                   ` David Gibson
2012-11-27  5:07                 ` Alex Williamson
2012-11-27  5:07                   ` Alex Williamson
2012-11-28  7:21                   ` [PATCH] " Alexey Kardashevskiy
2012-11-28  7:21                     ` Alexey Kardashevskiy
2012-11-28 21:01                     ` Alex Williamson
2012-11-28 21:01                       ` Alex Williamson
2012-11-29  3:51                       ` Alexey Kardashevskiy
2012-11-29  3:51                         ` Alexey Kardashevskiy
2012-11-23  9:03       ` [PATCH 2/2] vfio powerpc: enabled on powernv platform Alexey Kardashevskiy
2012-11-23  9:03         ` Alexey Kardashevskiy
2012-11-27  4:41         ` Alex Williamson
2012-11-27  4:41           ` Alex Williamson
2012-11-28  7:18           ` [PATCH] " Alexey Kardashevskiy
2012-11-28  7:18             ` Alexey Kardashevskiy
2012-11-28 21:30             ` Alex Williamson
2012-11-28 21:30               ` Alex Williamson
2012-11-29  3:53               ` Alexey Kardashevskiy
2012-11-29  3:53                 ` Alexey Kardashevskiy
2012-11-29  4:20                 ` Alex Williamson
2012-11-29  4:20                   ` Alex Williamson
2012-11-30  6:14                   ` Alexey Kardashevskiy
2012-11-30  6:14                     ` Alexey Kardashevskiy
2012-11-30 16:48                     ` Alex Williamson
2012-11-30 16:48                       ` Alex Williamson
2012-12-01  0:14                       ` Alexey Kardashevskiy
2012-12-01  0:14                         ` Alexey Kardashevskiy
2012-11-30  6:16                   ` Alexey Kardashevskiy
2012-11-30  6:16                     ` Alexey Kardashevskiy
2012-12-03  2:52       ` [PATCH 0/2] vfio on power: yet another try Alexey Kardashevskiy
2012-12-03  2:52         ` Alexey Kardashevskiy
2012-12-03  2:52         ` [PATCH 1/2] vfio powerpc: enabled on powernv platform Alexey Kardashevskiy
2012-12-03  2:52           ` Alexey Kardashevskiy
2012-12-03 17:35           ` Alex Williamson
2012-12-03 17:35             ` Alex Williamson
2012-12-04  8:12             ` Alexey Kardashevskiy
2012-12-04  8:12               ` Alexey Kardashevskiy
2012-12-04 15:51               ` Alex Williamson
2012-12-04 15:51                 ` Alex Williamson
2012-12-07  7:35                 ` [PATCH] " Alexey Kardashevskiy
2012-12-07  7:35                   ` Alexey Kardashevskiy
2012-12-07 17:38                   ` Alex Williamson
2012-12-07 17:38                     ` Alex Williamson
2012-12-12  6:14                     ` Alexey Kardashevskiy
2012-12-12  6:14                       ` Alexey Kardashevskiy
2012-12-12 14:34                       ` Alex Williamson
2012-12-12 14:34                         ` Alex Williamson
2012-12-13  2:29                         ` Benjamin Herrenschmidt
2012-12-13  2:29                           ` Benjamin Herrenschmidt
2012-12-13  6:27                           ` Alexey Kardashevskiy
2012-12-13  6:27                             ` Alexey Kardashevskiy
2012-12-13  6:28                           ` [PATCH] iommu: moving initialization earlier Alexey Kardashevskiy
2012-12-13 15:48                             ` Alex Williamson
2012-12-13 15:48                               ` Alex Williamson
2012-12-16 11:20                               ` Joerg Roedel
2012-12-16 11:20                                 ` Joerg Roedel
2013-01-04  8:21                                 ` Alexey Kardashevskiy
2013-01-04  8:21                                   ` Alexey Kardashevskiy
2013-01-06  9:49                                   ` Joerg Roedel
2013-01-06  9:49                                     ` Joerg Roedel
2012-12-12 12:34                     ` [PATCH] vfio powerpc: enabled on powernv platform Alexey Kardashevskiy
2012-12-12 12:34                       ` Alexey Kardashevskiy
2012-12-12 12:38                       ` Alexey Kardashevskiy
2012-12-12 12:38                         ` Alexey Kardashevskiy
2012-12-12 23:30                       ` Alex Williamson
2012-12-12 23:30                         ` Alex Williamson
2012-12-13  2:24                         ` Alexey Kardashevskiy
2012-12-13  2:24                           ` Alexey Kardashevskiy
2012-12-13  2:39                         ` Benjamin Herrenschmidt
2012-12-13  2:39                           ` Benjamin Herrenschmidt
2012-12-13  2:57                         ` Benjamin Herrenschmidt
2012-12-13  2:57                           ` Benjamin Herrenschmidt
2012-12-13  3:22                           ` Alex Williamson
2012-12-13  3:22                             ` Alex Williamson
2012-12-03  2:52         ` [PATCH 2/2] vfio powerpc: implemented IOMMU driver for VFIO Alexey Kardashevskiy
2012-12-03  2:52           ` Alexey Kardashevskiy
2012-12-03 17:53           ` Alex Williamson
2012-12-03 17:53             ` Alex Williamson
2012-12-07  7:34             ` [PATCH] " Alexey Kardashevskiy
2012-12-07  7:34               ` Alexey Kardashevskiy
2012-12-07 17:01               ` Alex Williamson
2012-12-07 17:01                 ` Alex Williamson
2012-12-12  6:59                 ` Alexey Kardashevskiy
2012-12-12  6:59                   ` Alexey Kardashevskiy
2012-12-12 14:36                   ` Alex Williamson
2012-12-12 14:36                     ` Alex Williamson
2012-12-12 12:35                 ` Alexey Kardashevskiy
2012-12-12 12:35                   ` Alexey Kardashevskiy
