All of lore.kernel.org
 help / color / mirror / Atom feed
From: Kirti Wankhede <kwankhede@nvidia.com>
To: <alex.williamson@redhat.com>, <pbonzini@redhat.com>,
	<kraxel@redhat.com>, <cjia@nvidia.com>
Cc: <qemu-devel@nongnu.org>, <kvm@vger.kernel.org>,
	<kevin.tian@intel.com>, <shuai.ruan@intel.com>,
	<jike.song@intel.com>, <zhiyuan.lv@intel.com>,
	<bjsdjshi@linux.vnet.ibm.com>,
	Kirti Wankhede <kwankhede@nvidia.com>
Subject: [RFC PATCH v4 3/3] VFIO Type1 IOMMU: Add support for mediated devices
Date: Wed, 25 May 2016 01:28:17 +0530	[thread overview]
Message-ID: <1464119897-10844-4-git-send-email-kwankhede@nvidia.com> (raw)
In-Reply-To: <1464119897-10844-1-git-send-email-kwankhede@nvidia.com>

VFIO Type1 IOMMU driver is designed for the devices which are IOMMU
capable. Mediated device only uses IOMMU TYPE1 API, the underlying
hardware can be managed by an IOMMU domain.

This change exports functions to pin and unpin pages for mediated devices.
It maintains data of pinned pages for mediated domain. This data is used to
verify unpinning request and to unpin remaining pages from detach_group()
if there are any.

Aim of this change is:
- To use most of the code of IOMMU driver for mediated devices
- To support direct assigned device and mediated device by single module

Updated the change to keep mediated domain structure out of domain_list.

Tested by assigning below combinations of devices to a single VM:
- GPU pass through only
- vGPU device only
- One GPU pass through and one vGPU device
- two GPU pass through

Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
Signed-off-by: Neo Jia <cjia@nvidia.com>
Change-Id: I9c262abc9c68fd6abf52d91a636bf0cc631593a0
---
 drivers/vfio/vfio_iommu_type1.c | 433 +++++++++++++++++++++++++++++++++++++---
 include/linux/vfio.h            |   6 +
 2 files changed, 407 insertions(+), 32 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 75b24e93cedb..5cc7dc0288a3 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -36,6 +36,7 @@
 #include <linux/uaccess.h>
 #include <linux/vfio.h>
 #include <linux/workqueue.h>
+#include <linux/mdev.h>
 
 #define DRIVER_VERSION  "0.2"
 #define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@redhat.com>"
@@ -55,6 +56,7 @@ MODULE_PARM_DESC(disable_hugepages,
 
 struct vfio_iommu {
 	struct list_head	domain_list;
+	struct vfio_domain	*mediated_domain;
 	struct mutex		lock;
 	struct rb_root		dma_list;
 	bool			v2;
@@ -67,6 +69,13 @@ struct vfio_domain {
 	struct list_head	group_list;
 	int			prot;		/* IOMMU_CACHE */
 	bool			fgsp;		/* Fine-grained super pages */
+
+	/* Domain for mediated device which is without physical IOMMU */
+	bool			mediated_device;
+
+	struct mm_struct	*mm;
+	struct rb_root		pfn_list;	/* pinned Host pfn list */
+	struct mutex		pfn_list_lock;	/* mutex for pfn_list */
 };
 
 struct vfio_dma {
@@ -79,10 +88,23 @@ struct vfio_dma {
 
 struct vfio_group {
 	struct iommu_group	*iommu_group;
+	struct mdev_device	*mdevice;
 	struct list_head	next;
 };
 
 /*
+ * Guest RAM pinning working set or DMA target
+ */
+struct vfio_pfn {
+	struct rb_node		node;
+	unsigned long		vaddr;		/* virtual addr */
+	dma_addr_t		iova;		/* IOVA */
+	unsigned long		npage;		/* number of pages */
+	unsigned long		pfn;		/* Host pfn */
+	size_t			prot;
+	atomic_t		ref_count;
+};
+/*
  * This code handles mapping and unmapping of user data buffers
  * into DMA'ble space using the IOMMU
  */
@@ -130,6 +152,64 @@ static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
 	rb_erase(&old->node, &iommu->dma_list);
 }
 
+/*
+ * Helper Functions for host pfn list
+ */
+
+static struct vfio_pfn *vfio_find_pfn(struct vfio_domain *domain,
+				      unsigned long pfn)
+{
+	struct rb_node *node;
+	struct vfio_pfn *vpfn, *ret = NULL;
+
+	mutex_lock(&domain->pfn_list_lock);
+	node = domain->pfn_list.rb_node;
+
+	while (node) {
+		vpfn = rb_entry(node, struct vfio_pfn, node);
+
+		if (pfn < vpfn->pfn)
+			node = node->rb_left;
+		else if (pfn > vpfn->pfn)
+			node = node->rb_right;
+		else {
+			ret = vpfn;
+			break;
+		}
+	}
+
+	mutex_unlock(&domain->pfn_list_lock);
+	return ret;
+}
+
+static void vfio_link_pfn(struct vfio_domain *domain, struct vfio_pfn *new)
+{
+	struct rb_node **link, *parent = NULL;
+	struct vfio_pfn *vpfn;
+
+	mutex_lock(&domain->pfn_list_lock);
+	link = &domain->pfn_list.rb_node;
+	while (*link) {
+		parent = *link;
+		vpfn = rb_entry(parent, struct vfio_pfn, node);
+
+		if (new->pfn < vpfn->pfn)
+			link = &(*link)->rb_left;
+		else
+			link = &(*link)->rb_right;
+	}
+
+	rb_link_node(&new->node, parent, link);
+	rb_insert_color(&new->node, &domain->pfn_list);
+	mutex_unlock(&domain->pfn_list_lock);
+}
+
+/* call by holding domain->pfn_list_lock */
+static void vfio_unlink_pfn(struct vfio_domain *domain, struct vfio_pfn *old)
+{
+	rb_erase(&old->node, &domain->pfn_list);
+}
+
 struct vwork {
 	struct mm_struct	*mm;
 	long			npage;
@@ -228,20 +308,29 @@ static int put_pfn(unsigned long pfn, int prot)
 	return 0;
 }
 
-static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn)
+static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
+			 int prot, unsigned long *pfn)
 {
 	struct page *page[1];
 	struct vm_area_struct *vma;
+	struct mm_struct *local_mm = mm;
 	int ret = -EFAULT;
 
-	if (get_user_pages_fast(vaddr, 1, !!(prot & IOMMU_WRITE), page) == 1) {
+	if (!local_mm && !current->mm)
+		return -ENODEV;
+
+	if (!local_mm)
+		local_mm = current->mm;
+
+	down_read(&local_mm->mmap_sem);
+	if (get_user_pages_remote(NULL, local_mm, vaddr, 1,
+				!!(prot & IOMMU_WRITE), 0, page, NULL) == 1) {
 		*pfn = page_to_pfn(page[0]);
-		return 0;
+		ret = 0;
+		goto done_pfn;
 	}
 
-	down_read(&current->mm->mmap_sem);
-
-	vma = find_vma_intersection(current->mm, vaddr, vaddr + 1);
+	vma = find_vma_intersection(local_mm, vaddr, vaddr + 1);
 
 	if (vma && vma->vm_flags & VM_PFNMAP) {
 		*pfn = ((vaddr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
@@ -249,7 +338,8 @@ static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn)
 			ret = 0;
 	}
 
-	up_read(&current->mm->mmap_sem);
+done_pfn:
+	up_read(&local_mm->mmap_sem);
 
 	return ret;
 }
@@ -259,18 +349,19 @@ static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn)
  * the iommu can only map chunks of consecutive pfns anyway, so get the
  * first page and all consecutive pages with the same locking.
  */
-static long vfio_pin_pages(unsigned long vaddr, long npage,
-			   int prot, unsigned long *pfn_base)
+static long vfio_pin_pages_internal(struct vfio_domain *domain,
+				    unsigned long vaddr, long npage,
+				    int prot, unsigned long *pfn_base)
 {
 	unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
 	bool lock_cap = capable(CAP_IPC_LOCK);
 	long ret, i;
 	bool rsvd;
 
-	if (!current->mm)
+	if (!domain)
 		return -ENODEV;
 
-	ret = vaddr_get_pfn(vaddr, prot, pfn_base);
+	ret = vaddr_get_pfn(domain->mm, vaddr, prot, pfn_base);
 	if (ret)
 		return ret;
 
@@ -293,7 +384,7 @@ static long vfio_pin_pages(unsigned long vaddr, long npage,
 	for (i = 1, vaddr += PAGE_SIZE; i < npage; i++, vaddr += PAGE_SIZE) {
 		unsigned long pfn = 0;
 
-		ret = vaddr_get_pfn(vaddr, prot, &pfn);
+		ret = vaddr_get_pfn(domain->mm, vaddr, prot, &pfn);
 		if (ret)
 			break;
 
@@ -318,20 +409,165 @@ static long vfio_pin_pages(unsigned long vaddr, long npage,
 	return i;
 }
 
-static long vfio_unpin_pages(unsigned long pfn, long npage,
-			     int prot, bool do_accounting)
+static long vfio_unpin_pages_internal(struct vfio_domain *domain,
+				      unsigned long pfn, long npage, int prot,
+				      bool do_accounting)
 {
 	unsigned long unlocked = 0;
 	long i;
 
+	if (!domain)
+		return -ENODEV;
+
 	for (i = 0; i < npage; i++)
 		unlocked += put_pfn(pfn++, prot);
 
 	if (do_accounting)
 		vfio_lock_acct(-unlocked);
+	return unlocked;
+}
+
+/*
+ * Pin a set of guest PFNs and return their associated host PFNs for API
+ * supported domain only.
+ * @vaddr [in]: array of guest PFNs
+ * @npage [in]: count of array elements
+ * @prot [in] : protection flags
+ * @pfn_base[out] : array of host PFNs
+ */
+long vfio_pin_pages(void *iommu_data, dma_addr_t *vaddr, long npage,
+		   int prot, dma_addr_t *pfn_base)
+{
+	struct vfio_iommu *iommu = iommu_data;
+	struct vfio_domain *domain = NULL;
+	int i = 0, ret = 0;
+	long retpage;
+	unsigned long remote_vaddr = 0;
+	dma_addr_t *pfn = pfn_base;
+	struct vfio_dma *dma;
+
+	if (!iommu || !vaddr || !pfn_base)
+		return -EINVAL;
+
+	mutex_lock(&iommu->lock);
+
+	if (!iommu->mediated_domain) {
+		ret = -EINVAL;
+		goto pin_done;
+	}
+
+	domain = iommu->mediated_domain;
+
+	for (i = 0; i < npage; i++) {
+		struct vfio_pfn *p, *lpfn;
+		unsigned long tpfn;
+		dma_addr_t iova;
+		long pg_cnt = 1;
+
+		iova = vaddr[i] << PAGE_SHIFT;
+
+		dma = vfio_find_dma(iommu, iova, 0 /*  size */);
+		if (!dma) {
+			ret = -EINVAL;
+			goto pin_done;
+		}
+
+		remote_vaddr = dma->vaddr + iova - dma->iova;
+
+		retpage = vfio_pin_pages_internal(domain, remote_vaddr,
+						  pg_cnt, prot, &tpfn);
+		if (retpage <= 0) {
+			WARN_ON(!retpage);
+			ret = (int)retpage;
+			goto pin_done;
+		}
+
+		pfn[i] = tpfn;
+
+		/* search if pfn exist */
+		p = vfio_find_pfn(domain, tpfn);
+		if (p) {
+			atomic_inc(&p->ref_count);
+			continue;
+		}
+
+		/* add to pfn_list */
+		lpfn = kzalloc(sizeof(*lpfn), GFP_KERNEL);
+		if (!lpfn) {
+			ret = -ENOMEM;
+			goto pin_done;
+		}
+		lpfn->vaddr = remote_vaddr;
+		lpfn->iova = iova;
+		lpfn->pfn = pfn[i];
+		lpfn->npage = 1;
+		lpfn->prot = prot;
+		atomic_inc(&lpfn->ref_count);
+		vfio_link_pfn(domain, lpfn);
+	}
+
+	ret = i;
+
+pin_done:
+	mutex_unlock(&iommu->lock);
+	return ret;
+}
+EXPORT_SYMBOL(vfio_pin_pages);
+
+static int vfio_unpin_pfn(struct vfio_domain *domain,
+			  struct vfio_pfn *vpfn, bool do_accounting)
+{
+	int ret;
+
+	ret = vfio_unpin_pages_internal(domain, vpfn->pfn, vpfn->npage,
+					vpfn->prot, do_accounting);
+
+	if (ret > 0 && atomic_dec_and_test(&vpfn->ref_count)) {
+		vfio_unlink_pfn(domain, vpfn);
+		kfree(vpfn);
+	}
+
+	return ret;
+}
+
+/*
+ * Unpin set of host PFNs for API supported domain only.
+ * @pfn	[in] : array of host PFNs to be unpinned.
+ * @npage [in] :count of elements in array, that is number of pages.
+ * @prot [in] : protection flags
+ */
+long vfio_unpin_pages(void *iommu_data, dma_addr_t *pfn, long npage,
+		     int prot)
+{
+	struct vfio_iommu *iommu = iommu_data;
+	struct vfio_domain *domain = NULL;
+	long unlocked = 0;
+	int i;
+
+	if (!iommu || !pfn)
+		return -EINVAL;
+
+	if (!iommu->mediated_domain)
+		return -EINVAL;
+
+	domain = iommu->mediated_domain;
+
+	for (i = 0; i < npage; i++) {
+		struct vfio_pfn *p;
+
+		/* verify if pfn exist in pfn_list */
+		p = vfio_find_pfn(domain, *(pfn + i));
+		if (!p)
+			continue;
+
+		mutex_lock(&domain->pfn_list_lock);
+		unlocked += vfio_unpin_pfn(domain, p, true);
+		mutex_unlock(&domain->pfn_list_lock);
+	}
 
 	return unlocked;
 }
+EXPORT_SYMBOL(vfio_unpin_pages);
 
 static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
 {
@@ -341,6 +577,10 @@ static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
 
 	if (!dma->size)
 		return;
+
+	if (list_empty(&iommu->domain_list))
+		return;
+
 	/*
 	 * We use the IOMMU to track the physical addresses, otherwise we'd
 	 * need a much more complicated tracking system.  Unfortunately that
@@ -382,9 +622,10 @@ static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
 		if (WARN_ON(!unmapped))
 			break;
 
-		unlocked += vfio_unpin_pages(phys >> PAGE_SHIFT,
-					     unmapped >> PAGE_SHIFT,
-					     dma->prot, false);
+		unlocked += vfio_unpin_pages_internal(domain,
+						phys >> PAGE_SHIFT,
+						unmapped >> PAGE_SHIFT,
+						dma->prot, false);
 		iova += unmapped;
 
 		cond_resched();
@@ -517,6 +758,9 @@ static int map_try_harder(struct vfio_domain *domain, dma_addr_t iova,
 	long i;
 	int ret;
 
+	if (domain->mediated_device)
+		return -EINVAL;
+
 	for (i = 0; i < npage; i++, pfn++, iova += PAGE_SIZE) {
 		ret = iommu_map(domain->domain, iova,
 				(phys_addr_t)pfn << PAGE_SHIFT,
@@ -537,6 +781,9 @@ static int vfio_iommu_map(struct vfio_iommu *iommu, dma_addr_t iova,
 	struct vfio_domain *d;
 	int ret;
 
+	if (list_empty(&iommu->domain_list))
+		return 0;
+
 	list_for_each_entry(d, &iommu->domain_list, next) {
 		ret = iommu_map(d->domain, iova, (phys_addr_t)pfn << PAGE_SHIFT,
 				npage << PAGE_SHIFT, prot | d->prot);
@@ -569,6 +816,7 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
 	uint64_t mask;
 	struct vfio_dma *dma;
 	unsigned long pfn;
+	struct vfio_domain *domain = NULL;
 
 	/* Verify that none of our __u64 fields overflow */
 	if (map->size != size || map->vaddr != vaddr || map->iova != iova)
@@ -611,10 +859,21 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
 	/* Insert zero-sized and grow as we map chunks of it */
 	vfio_link_dma(iommu, dma);
 
+	/*
+	 * Skip pin and map if and domain list is empty
+	 */
+	if (list_empty(&iommu->domain_list)) {
+		dma->size = size;
+		goto map_done;
+	}
+
+	domain = list_first_entry(&iommu->domain_list,
+				  struct vfio_domain, next);
+
 	while (size) {
 		/* Pin a contiguous chunk of memory */
-		npage = vfio_pin_pages(vaddr + dma->size,
-				       size >> PAGE_SHIFT, prot, &pfn);
+		npage = vfio_pin_pages_internal(domain, vaddr + dma->size,
+						size >> PAGE_SHIFT, prot, &pfn);
 		if (npage <= 0) {
 			WARN_ON(!npage);
 			ret = (int)npage;
@@ -624,7 +883,8 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
 		/* Map it! */
 		ret = vfio_iommu_map(iommu, iova + dma->size, pfn, npage, prot);
 		if (ret) {
-			vfio_unpin_pages(pfn, npage, prot, true);
+			vfio_unpin_pages_internal(domain, pfn, npage,
+						  prot, true);
 			break;
 		}
 
@@ -635,6 +895,7 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
 	if (ret)
 		vfio_remove_dma(iommu, dma);
 
+map_done:
 	mutex_unlock(&iommu->lock);
 	return ret;
 }
@@ -658,6 +919,9 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
 	struct rb_node *n;
 	int ret;
 
+	if (domain->mediated_device)
+		return 0;
+
 	/* Arbitrarily pick the first domain in the list for lookups */
 	d = list_first_entry(&iommu->domain_list, struct vfio_domain, next);
 	n = rb_first(&iommu->dma_list);
@@ -716,6 +980,9 @@ static void vfio_test_domain_fgsp(struct vfio_domain *domain)
 	struct page *pages;
 	int ret, order = get_order(PAGE_SIZE * 2);
 
+	if (domain->mediated_device)
+		return;
+
 	pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, order);
 	if (!pages)
 		return;
@@ -734,11 +1001,25 @@ static void vfio_test_domain_fgsp(struct vfio_domain *domain)
 	__free_pages(pages, order);
 }
 
+static struct vfio_group *is_iommu_group_present(struct vfio_domain *domain,
+				   struct iommu_group *iommu_group)
+{
+	struct vfio_group *g;
+
+	list_for_each_entry(g, &domain->group_list, next) {
+		if (g->iommu_group != iommu_group)
+			continue;
+		return g;
+	}
+
+	return NULL;
+}
+
 static int vfio_iommu_type1_attach_group(void *iommu_data,
 					 struct iommu_group *iommu_group)
 {
 	struct vfio_iommu *iommu = iommu_data;
-	struct vfio_group *group, *g;
+	struct vfio_group *group;
 	struct vfio_domain *domain, *d;
 	struct bus_type *bus = NULL;
 	int ret;
@@ -746,10 +1027,15 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
 	mutex_lock(&iommu->lock);
 
 	list_for_each_entry(d, &iommu->domain_list, next) {
-		list_for_each_entry(g, &d->group_list, next) {
-			if (g->iommu_group != iommu_group)
-				continue;
+		if (is_iommu_group_present(d, iommu_group)) {
+			mutex_unlock(&iommu->lock);
+			return -EINVAL;
+		}
+	}
 
+	if (iommu->mediated_domain) {
+		if (is_iommu_group_present(iommu->mediated_domain,
+					   iommu_group)) {
 			mutex_unlock(&iommu->lock);
 			return -EINVAL;
 		}
@@ -769,6 +1055,32 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
 	if (ret)
 		goto out_free;
 
+	if (!iommu_present(bus) && (bus == &mdev_bus_type)) {
+		struct mdev_device *mdevice = NULL;
+
+		mdevice = mdev_get_device_by_group(iommu_group);
+		if (!mdevice)
+			goto out_free;
+
+		mdevice->iommu_data = iommu;
+		group->mdevice = mdevice;
+
+		if (iommu->mediated_domain) {
+			list_add(&group->next,
+				 &iommu->mediated_domain->group_list);
+			kfree(domain);
+			goto out_success;
+		}
+		domain->mediated_device = true;
+		domain->mm = current->mm;
+		INIT_LIST_HEAD(&domain->group_list);
+		list_add(&group->next, &domain->group_list);
+		domain->pfn_list = RB_ROOT;
+		mutex_init(&domain->pfn_list_lock);
+		iommu->mediated_domain = domain;
+		goto out_success;
+	}
+
 	domain->domain = iommu_domain_alloc(bus);
 	if (!domain->domain) {
 		ret = -EIO;
@@ -836,6 +1148,7 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
 
 	list_add(&domain->next, &iommu->domain_list);
 
+out_success:
 	mutex_unlock(&iommu->lock);
 
 	return 0;
@@ -859,6 +1172,18 @@ static void vfio_iommu_unmap_unpin_all(struct vfio_iommu *iommu)
 		vfio_remove_dma(iommu, rb_entry(node, struct vfio_dma, node));
 }
 
+static void vfio_iommu_unpin_api_domain(struct vfio_domain *domain)
+{
+	struct rb_node *node;
+
+	mutex_lock(&domain->pfn_list_lock);
+	while ((node = rb_first(&domain->pfn_list))) {
+		vfio_unpin_pfn(domain,
+				rb_entry(node, struct vfio_pfn, node), false);
+	}
+	mutex_unlock(&domain->pfn_list_lock);
+}
+
 static void vfio_iommu_type1_detach_group(void *iommu_data,
 					  struct iommu_group *iommu_group)
 {
@@ -868,31 +1193,54 @@ static void vfio_iommu_type1_detach_group(void *iommu_data,
 
 	mutex_lock(&iommu->lock);
 
-	list_for_each_entry(domain, &iommu->domain_list, next) {
-		list_for_each_entry(group, &domain->group_list, next) {
-			if (group->iommu_group != iommu_group)
-				continue;
+	if (iommu->mediated_domain) {
+		domain = iommu->mediated_domain;
+		group = is_iommu_group_present(domain, iommu_group);
+		if (group) {
+			if (group->mdevice) {
+				group->mdevice->iommu_data = NULL;
+				mdev_put_device(group->mdevice);
+			}
+
+			list_del(&group->next);
+			kfree(group);
 
+			if (list_empty(&domain->group_list)) {
+				vfio_iommu_unpin_api_domain(domain);
+
+				if (list_empty(&iommu->domain_list))
+					vfio_iommu_unmap_unpin_all(iommu);
+
+				kfree(domain);
+				iommu->mediated_domain = NULL;
+			}
+		}
+	}
+
+	list_for_each_entry(domain, &iommu->domain_list, next) {
+		group = is_iommu_group_present(domain, iommu_group);
+		if (group) {
 			iommu_detach_group(domain->domain, iommu_group);
 			list_del(&group->next);
 			kfree(group);
 			/*
 			 * Group ownership provides privilege, if the group
 			 * list is empty, the domain goes away.  If it's the
-			 * last domain, then all the mappings go away too.
+			 * last domain with iommu and API-only domain doesn't
+			 * exist, the all the mappings go away too.
 			 */
 			if (list_empty(&domain->group_list)) {
-				if (list_is_singular(&iommu->domain_list))
+				if (list_is_singular(&iommu->domain_list) &&
+				    !iommu->mediated_domain)
 					vfio_iommu_unmap_unpin_all(iommu);
 				iommu_domain_free(domain->domain);
 				list_del(&domain->next);
 				kfree(domain);
 			}
-			goto done;
+			break;
 		}
 	}
 
-done:
 	mutex_unlock(&iommu->lock);
 }
 
@@ -930,8 +1278,28 @@ static void vfio_iommu_type1_release(void *iommu_data)
 	struct vfio_domain *domain, *domain_tmp;
 	struct vfio_group *group, *group_tmp;
 
+	if (iommu->mediated_domain) {
+		domain = iommu->mediated_domain;
+		list_for_each_entry_safe(group, group_tmp,
+					 &domain->group_list, next) {
+			if (group->mdevice) {
+				group->mdevice->iommu_data = NULL;
+				mdev_put_device(group->mdevice);
+			}
+
+			list_del(&group->next);
+			kfree(group);
+		}
+		vfio_iommu_unpin_api_domain(domain);
+		kfree(domain);
+		iommu->mediated_domain = NULL;
+	}
+
 	vfio_iommu_unmap_unpin_all(iommu);
 
+	if (list_empty(&iommu->domain_list))
+		goto release_exit;
+
 	list_for_each_entry_safe(domain, domain_tmp,
 				 &iommu->domain_list, next) {
 		list_for_each_entry_safe(group, group_tmp,
@@ -945,6 +1313,7 @@ static void vfio_iommu_type1_release(void *iommu_data)
 		kfree(domain);
 	}
 
+release_exit:
 	kfree(iommu);
 }
 
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index 431b824b0d3e..0a907bb33426 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -134,6 +134,12 @@ static inline long vfio_spapr_iommu_eeh_ioctl(struct iommu_group *group,
 }
 #endif /* CONFIG_EEH */
 
+extern long vfio_pin_pages(void *iommu_data, dma_addr_t *vaddr, long npage,
+			   int prot, dma_addr_t *pfn_base);
+
+extern long vfio_unpin_pages(void *iommu_data, dma_addr_t *pfn, long npage,
+			     int prot);
+
 /*
  * IRQfd - generic
  */
-- 
2.7.0


WARNING: multiple messages have this Message-ID (diff)
From: Kirti Wankhede <kwankhede@nvidia.com>
To: alex.williamson@redhat.com, pbonzini@redhat.com,
	kraxel@redhat.com, cjia@nvidia.com
Cc: qemu-devel@nongnu.org, kvm@vger.kernel.org, kevin.tian@intel.com,
	shuai.ruan@intel.com, jike.song@intel.com, zhiyuan.lv@intel.com,
	bjsdjshi@linux.vnet.ibm.com,
	Kirti Wankhede <kwankhede@nvidia.com>
Subject: [Qemu-devel] [RFC PATCH v4 3/3] VFIO Type1 IOMMU: Add support for mediated devices
Date: Wed, 25 May 2016 01:28:17 +0530	[thread overview]
Message-ID: <1464119897-10844-4-git-send-email-kwankhede@nvidia.com> (raw)
In-Reply-To: <1464119897-10844-1-git-send-email-kwankhede@nvidia.com>

VFIO Type1 IOMMU driver is designed for the devices which are IOMMU
capable. Mediated device only uses IOMMU TYPE1 API, the underlying
hardware can be managed by an IOMMU domain.

This change exports functions to pin and unpin pages for mediated devices.
It maintains data of pinned pages for mediated domain. This data is used to
verify unpinning request and to unpin remaining pages from detach_group()
if there are any.

Aim of this change is:
- To use most of the code of IOMMU driver for mediated devices
- To support direct assigned device and mediated device by single module

Updated the change to keep mediated domain structure out of domain_list.

Tested by assigning below combinations of devices to a single VM:
- GPU pass through only
- vGPU device only
- One GPU pass through and one vGPU device
- two GPU pass through

Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
Signed-off-by: Neo Jia <cjia@nvidia.com>
Change-Id: I9c262abc9c68fd6abf52d91a636bf0cc631593a0
---
 drivers/vfio/vfio_iommu_type1.c | 433 +++++++++++++++++++++++++++++++++++++---
 include/linux/vfio.h            |   6 +
 2 files changed, 407 insertions(+), 32 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 75b24e93cedb..5cc7dc0288a3 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -36,6 +36,7 @@
 #include <linux/uaccess.h>
 #include <linux/vfio.h>
 #include <linux/workqueue.h>
+#include <linux/mdev.h>
 
 #define DRIVER_VERSION  "0.2"
 #define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@redhat.com>"
@@ -55,6 +56,7 @@ MODULE_PARM_DESC(disable_hugepages,
 
 struct vfio_iommu {
 	struct list_head	domain_list;
+	struct vfio_domain	*mediated_domain;
 	struct mutex		lock;
 	struct rb_root		dma_list;
 	bool			v2;
@@ -67,6 +69,13 @@ struct vfio_domain {
 	struct list_head	group_list;
 	int			prot;		/* IOMMU_CACHE */
 	bool			fgsp;		/* Fine-grained super pages */
+
+	/* Domain for mediated device which is without physical IOMMU */
+	bool			mediated_device;
+
+	struct mm_struct	*mm;
+	struct rb_root		pfn_list;	/* pinned Host pfn list */
+	struct mutex		pfn_list_lock;	/* mutex for pfn_list */
 };
 
 struct vfio_dma {
@@ -79,10 +88,23 @@ struct vfio_dma {
 
 struct vfio_group {
 	struct iommu_group	*iommu_group;
+	struct mdev_device	*mdevice;
 	struct list_head	next;
 };
 
 /*
+ * Guest RAM pinning working set or DMA target
+ */
+struct vfio_pfn {
+	struct rb_node		node;
+	unsigned long		vaddr;		/* virtual addr */
+	dma_addr_t		iova;		/* IOVA */
+	unsigned long		npage;		/* number of pages */
+	unsigned long		pfn;		/* Host pfn */
+	size_t			prot;
+	atomic_t		ref_count;
+};
+/*
  * This code handles mapping and unmapping of user data buffers
  * into DMA'ble space using the IOMMU
  */
@@ -130,6 +152,64 @@ static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
 	rb_erase(&old->node, &iommu->dma_list);
 }
 
+/*
+ * Helper Functions for host pfn list
+ */
+
+static struct vfio_pfn *vfio_find_pfn(struct vfio_domain *domain,
+				      unsigned long pfn)
+{
+	struct rb_node *node;
+	struct vfio_pfn *vpfn, *ret = NULL;
+
+	mutex_lock(&domain->pfn_list_lock);
+	node = domain->pfn_list.rb_node;
+
+	while (node) {
+		vpfn = rb_entry(node, struct vfio_pfn, node);
+
+		if (pfn < vpfn->pfn)
+			node = node->rb_left;
+		else if (pfn > vpfn->pfn)
+			node = node->rb_right;
+		else {
+			ret = vpfn;
+			break;
+		}
+	}
+
+	mutex_unlock(&domain->pfn_list_lock);
+	return ret;
+}
+
+static void vfio_link_pfn(struct vfio_domain *domain, struct vfio_pfn *new)
+{
+	struct rb_node **link, *parent = NULL;
+	struct vfio_pfn *vpfn;
+
+	mutex_lock(&domain->pfn_list_lock);
+	link = &domain->pfn_list.rb_node;
+	while (*link) {
+		parent = *link;
+		vpfn = rb_entry(parent, struct vfio_pfn, node);
+
+		if (new->pfn < vpfn->pfn)
+			link = &(*link)->rb_left;
+		else
+			link = &(*link)->rb_right;
+	}
+
+	rb_link_node(&new->node, parent, link);
+	rb_insert_color(&new->node, &domain->pfn_list);
+	mutex_unlock(&domain->pfn_list_lock);
+}
+
+/* call by holding domain->pfn_list_lock */
+static void vfio_unlink_pfn(struct vfio_domain *domain, struct vfio_pfn *old)
+{
+	rb_erase(&old->node, &domain->pfn_list);
+}
+
 struct vwork {
 	struct mm_struct	*mm;
 	long			npage;
@@ -228,20 +308,29 @@ static int put_pfn(unsigned long pfn, int prot)
 	return 0;
 }
 
-static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn)
+static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
+			 int prot, unsigned long *pfn)
 {
 	struct page *page[1];
 	struct vm_area_struct *vma;
+	struct mm_struct *local_mm = mm;
 	int ret = -EFAULT;
 
-	if (get_user_pages_fast(vaddr, 1, !!(prot & IOMMU_WRITE), page) == 1) {
+	if (!local_mm && !current->mm)
+		return -ENODEV;
+
+	if (!local_mm)
+		local_mm = current->mm;
+
+	down_read(&local_mm->mmap_sem);
+	if (get_user_pages_remote(NULL, local_mm, vaddr, 1,
+				!!(prot & IOMMU_WRITE), 0, page, NULL) == 1) {
 		*pfn = page_to_pfn(page[0]);
-		return 0;
+		ret = 0;
+		goto done_pfn;
 	}
 
-	down_read(&current->mm->mmap_sem);
-
-	vma = find_vma_intersection(current->mm, vaddr, vaddr + 1);
+	vma = find_vma_intersection(local_mm, vaddr, vaddr + 1);
 
 	if (vma && vma->vm_flags & VM_PFNMAP) {
 		*pfn = ((vaddr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
@@ -249,7 +338,8 @@ static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn)
 			ret = 0;
 	}
 
-	up_read(&current->mm->mmap_sem);
+done_pfn:
+	up_read(&local_mm->mmap_sem);
 
 	return ret;
 }
@@ -259,18 +349,19 @@ static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn)
  * the iommu can only map chunks of consecutive pfns anyway, so get the
  * first page and all consecutive pages with the same locking.
  */
-static long vfio_pin_pages(unsigned long vaddr, long npage,
-			   int prot, unsigned long *pfn_base)
+static long vfio_pin_pages_internal(struct vfio_domain *domain,
+				    unsigned long vaddr, long npage,
+				    int prot, unsigned long *pfn_base)
 {
 	unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
 	bool lock_cap = capable(CAP_IPC_LOCK);
 	long ret, i;
 	bool rsvd;
 
-	if (!current->mm)
+	if (!domain)
 		return -ENODEV;
 
-	ret = vaddr_get_pfn(vaddr, prot, pfn_base);
+	ret = vaddr_get_pfn(domain->mm, vaddr, prot, pfn_base);
 	if (ret)
 		return ret;
 
@@ -293,7 +384,7 @@ static long vfio_pin_pages(unsigned long vaddr, long npage,
 	for (i = 1, vaddr += PAGE_SIZE; i < npage; i++, vaddr += PAGE_SIZE) {
 		unsigned long pfn = 0;
 
-		ret = vaddr_get_pfn(vaddr, prot, &pfn);
+		ret = vaddr_get_pfn(domain->mm, vaddr, prot, &pfn);
 		if (ret)
 			break;
 
@@ -318,20 +409,165 @@ static long vfio_pin_pages(unsigned long vaddr, long npage,
 	return i;
 }
 
-static long vfio_unpin_pages(unsigned long pfn, long npage,
-			     int prot, bool do_accounting)
+static long vfio_unpin_pages_internal(struct vfio_domain *domain,
+				      unsigned long pfn, long npage, int prot,
+				      bool do_accounting)
 {
 	unsigned long unlocked = 0;
 	long i;
 
+	if (!domain)
+		return -ENODEV;
+
 	for (i = 0; i < npage; i++)
 		unlocked += put_pfn(pfn++, prot);
 
 	if (do_accounting)
 		vfio_lock_acct(-unlocked);
+	return unlocked;
+}
+
+/*
+ * Pin a set of guest PFNs and return their associated host PFNs for API
+ * supported domain only.
+ * @vaddr [in]: array of guest PFNs
+ * @npage [in]: count of array elements
+ * @prot [in] : protection flags
+ * @pfn_base[out] : array of host PFNs
+ */
+long vfio_pin_pages(void *iommu_data, dma_addr_t *vaddr, long npage,
+		   int prot, dma_addr_t *pfn_base)
+{
+	struct vfio_iommu *iommu = iommu_data;
+	struct vfio_domain *domain = NULL;
+	int i = 0, ret = 0;
+	long retpage;
+	unsigned long remote_vaddr = 0;
+	dma_addr_t *pfn = pfn_base;
+	struct vfio_dma *dma;
+
+	if (!iommu || !vaddr || !pfn_base)
+		return -EINVAL;
+
+	mutex_lock(&iommu->lock);
+
+	if (!iommu->mediated_domain) {
+		ret = -EINVAL;
+		goto pin_done;
+	}
+
+	domain = iommu->mediated_domain;
+
+	for (i = 0; i < npage; i++) {
+		struct vfio_pfn *p, *lpfn;
+		unsigned long tpfn;
+		dma_addr_t iova;
+		long pg_cnt = 1;
+
+		iova = vaddr[i] << PAGE_SHIFT;
+
+		dma = vfio_find_dma(iommu, iova, 0 /*  size */);
+		if (!dma) {
+			ret = -EINVAL;
+			goto pin_done;
+		}
+
+		remote_vaddr = dma->vaddr + iova - dma->iova;
+
+		retpage = vfio_pin_pages_internal(domain, remote_vaddr,
+						  pg_cnt, prot, &tpfn);
+		if (retpage <= 0) {
+			WARN_ON(!retpage);
+			ret = (int)retpage;
+			goto pin_done;
+		}
+
+		pfn[i] = tpfn;
+
+		/* search if pfn exist */
+		p = vfio_find_pfn(domain, tpfn);
+		if (p) {
+			atomic_inc(&p->ref_count);
+			continue;
+		}
+
+		/* add to pfn_list */
+		lpfn = kzalloc(sizeof(*lpfn), GFP_KERNEL);
+		if (!lpfn) {
+			ret = -ENOMEM;
+			goto pin_done;
+		}
+		lpfn->vaddr = remote_vaddr;
+		lpfn->iova = iova;
+		lpfn->pfn = pfn[i];
+		lpfn->npage = 1;
+		lpfn->prot = prot;
+		atomic_inc(&lpfn->ref_count);
+		vfio_link_pfn(domain, lpfn);
+	}
+
+	ret = i;
+
+pin_done:
+	mutex_unlock(&iommu->lock);
+	return ret;
+}
+EXPORT_SYMBOL(vfio_pin_pages);
+
+static int vfio_unpin_pfn(struct vfio_domain *domain,
+			  struct vfio_pfn *vpfn, bool do_accounting)
+{
+	int ret;
+
+	ret = vfio_unpin_pages_internal(domain, vpfn->pfn, vpfn->npage,
+					vpfn->prot, do_accounting);
+
+	if (ret > 0 && atomic_dec_and_test(&vpfn->ref_count)) {
+		vfio_unlink_pfn(domain, vpfn);
+		kfree(vpfn);
+	}
+
+	return ret;
+}
+
+/*
+ * Unpin set of host PFNs for API supported domain only.
+ * @pfn	[in] : array of host PFNs to be unpinned.
+ * @npage [in] :count of elements in array, that is number of pages.
+ * @prot [in] : protection flags
+ */
+long vfio_unpin_pages(void *iommu_data, dma_addr_t *pfn, long npage,
+		     int prot)
+{
+	struct vfio_iommu *iommu = iommu_data;
+	struct vfio_domain *domain = NULL;
+	long unlocked = 0;
+	int i;
+
+	if (!iommu || !pfn)
+		return -EINVAL;
+
+	if (!iommu->mediated_domain)
+		return -EINVAL;
+
+	domain = iommu->mediated_domain;
+
+	for (i = 0; i < npage; i++) {
+		struct vfio_pfn *p;
+
+		/* verify if pfn exist in pfn_list */
+		p = vfio_find_pfn(domain, *(pfn + i));
+		if (!p)
+			continue;
+
+		mutex_lock(&domain->pfn_list_lock);
+		unlocked += vfio_unpin_pfn(domain, p, true);
+		mutex_unlock(&domain->pfn_list_lock);
+	}
 
 	return unlocked;
 }
+EXPORT_SYMBOL(vfio_unpin_pages);
 
 static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
 {
@@ -341,6 +577,10 @@ static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
 
 	if (!dma->size)
 		return;
+
+	if (list_empty(&iommu->domain_list))
+		return;
+
 	/*
 	 * We use the IOMMU to track the physical addresses, otherwise we'd
 	 * need a much more complicated tracking system.  Unfortunately that
@@ -382,9 +622,10 @@ static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
 		if (WARN_ON(!unmapped))
 			break;
 
-		unlocked += vfio_unpin_pages(phys >> PAGE_SHIFT,
-					     unmapped >> PAGE_SHIFT,
-					     dma->prot, false);
+		unlocked += vfio_unpin_pages_internal(domain,
+						phys >> PAGE_SHIFT,
+						unmapped >> PAGE_SHIFT,
+						dma->prot, false);
 		iova += unmapped;
 
 		cond_resched();
@@ -517,6 +758,9 @@ static int map_try_harder(struct vfio_domain *domain, dma_addr_t iova,
 	long i;
 	int ret;
 
+	if (domain->mediated_device)
+		return -EINVAL;
+
 	for (i = 0; i < npage; i++, pfn++, iova += PAGE_SIZE) {
 		ret = iommu_map(domain->domain, iova,
 				(phys_addr_t)pfn << PAGE_SHIFT,
@@ -537,6 +781,9 @@ static int vfio_iommu_map(struct vfio_iommu *iommu, dma_addr_t iova,
 	struct vfio_domain *d;
 	int ret;
 
+	if (list_empty(&iommu->domain_list))
+		return 0;
+
 	list_for_each_entry(d, &iommu->domain_list, next) {
 		ret = iommu_map(d->domain, iova, (phys_addr_t)pfn << PAGE_SHIFT,
 				npage << PAGE_SHIFT, prot | d->prot);
@@ -569,6 +816,7 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
 	uint64_t mask;
 	struct vfio_dma *dma;
 	unsigned long pfn;
+	struct vfio_domain *domain = NULL;
 
 	/* Verify that none of our __u64 fields overflow */
 	if (map->size != size || map->vaddr != vaddr || map->iova != iova)
@@ -611,10 +859,21 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
 	/* Insert zero-sized and grow as we map chunks of it */
 	vfio_link_dma(iommu, dma);
 
+	/*
+	 * Skip pin and map if and domain list is empty
+	 */
+	if (list_empty(&iommu->domain_list)) {
+		dma->size = size;
+		goto map_done;
+	}
+
+	domain = list_first_entry(&iommu->domain_list,
+				  struct vfio_domain, next);
+
 	while (size) {
 		/* Pin a contiguous chunk of memory */
-		npage = vfio_pin_pages(vaddr + dma->size,
-				       size >> PAGE_SHIFT, prot, &pfn);
+		npage = vfio_pin_pages_internal(domain, vaddr + dma->size,
+						size >> PAGE_SHIFT, prot, &pfn);
 		if (npage <= 0) {
 			WARN_ON(!npage);
 			ret = (int)npage;
@@ -624,7 +883,8 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
 		/* Map it! */
 		ret = vfio_iommu_map(iommu, iova + dma->size, pfn, npage, prot);
 		if (ret) {
-			vfio_unpin_pages(pfn, npage, prot, true);
+			vfio_unpin_pages_internal(domain, pfn, npage,
+						  prot, true);
 			break;
 		}
 
@@ -635,6 +895,7 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
 	if (ret)
 		vfio_remove_dma(iommu, dma);
 
+map_done:
 	mutex_unlock(&iommu->lock);
 	return ret;
 }
@@ -658,6 +919,9 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
 	struct rb_node *n;
 	int ret;
 
+	if (domain->mediated_device)
+		return 0;
+
 	/* Arbitrarily pick the first domain in the list for lookups */
 	d = list_first_entry(&iommu->domain_list, struct vfio_domain, next);
 	n = rb_first(&iommu->dma_list);
@@ -716,6 +980,9 @@ static void vfio_test_domain_fgsp(struct vfio_domain *domain)
 	struct page *pages;
 	int ret, order = get_order(PAGE_SIZE * 2);
 
+	if (domain->mediated_device)
+		return;
+
 	pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, order);
 	if (!pages)
 		return;
@@ -734,11 +1001,25 @@ static void vfio_test_domain_fgsp(struct vfio_domain *domain)
 	__free_pages(pages, order);
 }
 
+static struct vfio_group *is_iommu_group_present(struct vfio_domain *domain,
+				   struct iommu_group *iommu_group)
+{
+	struct vfio_group *g;
+
+	list_for_each_entry(g, &domain->group_list, next) {
+		if (g->iommu_group != iommu_group)
+			continue;
+		return g;
+	}
+
+	return NULL;
+}
+
 static int vfio_iommu_type1_attach_group(void *iommu_data,
 					 struct iommu_group *iommu_group)
 {
 	struct vfio_iommu *iommu = iommu_data;
-	struct vfio_group *group, *g;
+	struct vfio_group *group;
 	struct vfio_domain *domain, *d;
 	struct bus_type *bus = NULL;
 	int ret;
@@ -746,10 +1027,15 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
 	mutex_lock(&iommu->lock);
 
 	list_for_each_entry(d, &iommu->domain_list, next) {
-		list_for_each_entry(g, &d->group_list, next) {
-			if (g->iommu_group != iommu_group)
-				continue;
+		if (is_iommu_group_present(d, iommu_group)) {
+			mutex_unlock(&iommu->lock);
+			return -EINVAL;
+		}
+	}
 
+	if (iommu->mediated_domain) {
+		if (is_iommu_group_present(iommu->mediated_domain,
+					   iommu_group)) {
 			mutex_unlock(&iommu->lock);
 			return -EINVAL;
 		}
@@ -769,6 +1055,32 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
 	if (ret)
 		goto out_free;
 
+	if (!iommu_present(bus) && (bus == &mdev_bus_type)) {
+		struct mdev_device *mdevice = NULL;
+
+		mdevice = mdev_get_device_by_group(iommu_group);
+		if (!mdevice)
+			goto out_free;
+
+		mdevice->iommu_data = iommu;
+		group->mdevice = mdevice;
+
+		if (iommu->mediated_domain) {
+			list_add(&group->next,
+				 &iommu->mediated_domain->group_list);
+			kfree(domain);
+			goto out_success;
+		}
+		domain->mediated_device = true;
+		domain->mm = current->mm;
+		INIT_LIST_HEAD(&domain->group_list);
+		list_add(&group->next, &domain->group_list);
+		domain->pfn_list = RB_ROOT;
+		mutex_init(&domain->pfn_list_lock);
+		iommu->mediated_domain = domain;
+		goto out_success;
+	}
+
 	domain->domain = iommu_domain_alloc(bus);
 	if (!domain->domain) {
 		ret = -EIO;
@@ -836,6 +1148,7 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
 
 	list_add(&domain->next, &iommu->domain_list);
 
+out_success:
 	mutex_unlock(&iommu->lock);
 
 	return 0;
@@ -859,6 +1172,18 @@ static void vfio_iommu_unmap_unpin_all(struct vfio_iommu *iommu)
 		vfio_remove_dma(iommu, rb_entry(node, struct vfio_dma, node));
 }
 
+static void vfio_iommu_unpin_api_domain(struct vfio_domain *domain)
+{
+	struct rb_node *node;
+
+	mutex_lock(&domain->pfn_list_lock);
+	while ((node = rb_first(&domain->pfn_list))) {
+		vfio_unpin_pfn(domain,
+				rb_entry(node, struct vfio_pfn, node), false);
+	}
+	mutex_unlock(&domain->pfn_list_lock);
+}
+
 static void vfio_iommu_type1_detach_group(void *iommu_data,
 					  struct iommu_group *iommu_group)
 {
@@ -868,31 +1193,54 @@ static void vfio_iommu_type1_detach_group(void *iommu_data,
 
 	mutex_lock(&iommu->lock);
 
-	list_for_each_entry(domain, &iommu->domain_list, next) {
-		list_for_each_entry(group, &domain->group_list, next) {
-			if (group->iommu_group != iommu_group)
-				continue;
+	if (iommu->mediated_domain) {
+		domain = iommu->mediated_domain;
+		group = is_iommu_group_present(domain, iommu_group);
+		if (group) {
+			if (group->mdevice) {
+				group->mdevice->iommu_data = NULL;
+				mdev_put_device(group->mdevice);
+			}
+
+			list_del(&group->next);
+			kfree(group);
 
+			if (list_empty(&domain->group_list)) {
+				vfio_iommu_unpin_api_domain(domain);
+
+				if (list_empty(&iommu->domain_list))
+					vfio_iommu_unmap_unpin_all(iommu);
+
+				kfree(domain);
+				iommu->mediated_domain = NULL;
+			}
+		}
+	}
+
+	list_for_each_entry(domain, &iommu->domain_list, next) {
+		group = is_iommu_group_present(domain, iommu_group);
+		if (group) {
 			iommu_detach_group(domain->domain, iommu_group);
 			list_del(&group->next);
 			kfree(group);
 			/*
 			 * Group ownership provides privilege, if the group
 			 * list is empty, the domain goes away.  If it's the
-			 * last domain, then all the mappings go away too.
+			 * last domain with iommu and API-only domain doesn't
+			 * exist, the all the mappings go away too.
 			 */
 			if (list_empty(&domain->group_list)) {
-				if (list_is_singular(&iommu->domain_list))
+				if (list_is_singular(&iommu->domain_list) &&
+				    !iommu->mediated_domain)
 					vfio_iommu_unmap_unpin_all(iommu);
 				iommu_domain_free(domain->domain);
 				list_del(&domain->next);
 				kfree(domain);
 			}
-			goto done;
+			break;
 		}
 	}
 
-done:
 	mutex_unlock(&iommu->lock);
 }
 
@@ -930,8 +1278,28 @@ static void vfio_iommu_type1_release(void *iommu_data)
 	struct vfio_domain *domain, *domain_tmp;
 	struct vfio_group *group, *group_tmp;
 
+	if (iommu->mediated_domain) {
+		domain = iommu->mediated_domain;
+		list_for_each_entry_safe(group, group_tmp,
+					 &domain->group_list, next) {
+			if (group->mdevice) {
+				group->mdevice->iommu_data = NULL;
+				mdev_put_device(group->mdevice);
+			}
+
+			list_del(&group->next);
+			kfree(group);
+		}
+		vfio_iommu_unpin_api_domain(domain);
+		kfree(domain);
+		iommu->mediated_domain = NULL;
+	}
+
 	vfio_iommu_unmap_unpin_all(iommu);
 
+	if (list_empty(&iommu->domain_list))
+		goto release_exit;
+
 	list_for_each_entry_safe(domain, domain_tmp,
 				 &iommu->domain_list, next) {
 		list_for_each_entry_safe(group, group_tmp,
@@ -945,6 +1313,7 @@ static void vfio_iommu_type1_release(void *iommu_data)
 		kfree(domain);
 	}
 
+release_exit:
 	kfree(iommu);
 }
 
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index 431b824b0d3e..0a907bb33426 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -134,6 +134,12 @@ static inline long vfio_spapr_iommu_eeh_ioctl(struct iommu_group *group,
 }
 #endif /* CONFIG_EEH */
 
+extern long vfio_pin_pages(void *iommu_data, dma_addr_t *vaddr, long npage,
+			   int prot, dma_addr_t *pfn_base);
+
+extern long vfio_unpin_pages(void *iommu_data, dma_addr_t *pfn, long npage,
+			     int prot);
+
 /*
  * IRQfd - generic
  */
-- 
2.7.0

  parent reply	other threads:[~2016-05-24 19:57 UTC|newest]

Thread overview: 92+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2016-05-24 19:58 [RFC PATCH v4 0/3] Add Mediated device support[was: Add vGPU support] Kirti Wankhede
2016-05-24 19:58 ` [Qemu-devel] " Kirti Wankhede
2016-05-24 19:58 ` [RFC PATCH v4 1/3] Mediated device Core driver Kirti Wankhede
2016-05-24 19:58   ` [Qemu-devel] " Kirti Wankhede
2016-05-25  7:55   ` Tian, Kevin
2016-05-25  7:55     ` [Qemu-devel] " Tian, Kevin
2016-05-25 14:47     ` Kirti Wankhede
2016-05-25 14:47       ` [Qemu-devel] " Kirti Wankhede
2016-05-27  9:00       ` Tian, Kevin
2016-05-27  9:00         ` [Qemu-devel] " Tian, Kevin
2016-05-25 22:39   ` Alex Williamson
2016-05-25 22:39     ` [Qemu-devel] " Alex Williamson
2016-05-26  9:03     ` Kirti Wankhede
2016-05-26  9:03       ` [Qemu-devel] " Kirti Wankhede
2016-05-26 14:06       ` Alex Williamson
2016-05-26 14:06         ` [Qemu-devel] " Alex Williamson
2016-06-03  8:57   ` Dong Jia
2016-06-03  8:57     ` [Qemu-devel] " Dong Jia
2016-06-03  9:40     ` Tian, Kevin
2016-06-03  9:40       ` [Qemu-devel] " Tian, Kevin
2016-06-06  2:24       ` Dong Jia
2016-06-06  2:24         ` [Qemu-devel] " Dong Jia
2016-06-06  5:27     ` Kirti Wankhede
2016-06-06  5:27       ` [Qemu-devel] " Kirti Wankhede
2016-06-06  6:01       ` Dong Jia
2016-06-06  6:01         ` [Qemu-devel] " Dong Jia
2016-06-06  6:27         ` Neo Jia
2016-06-06  6:27           ` [Qemu-devel] " Neo Jia
2016-06-06  8:29           ` Dong Jia
2016-06-06  8:29             ` [Qemu-devel] " Dong Jia
2016-06-06 17:44             ` Neo Jia
2016-06-06 17:44               ` [Qemu-devel] " Neo Jia
2016-06-06 19:31               ` Alex Williamson
2016-06-06 19:31                 ` [Qemu-devel] " Alex Williamson
2016-06-07  3:03                 ` Tian, Kevin
2016-06-07  3:03                   ` [Qemu-devel] " Tian, Kevin
2016-06-07 22:42                   ` Alex Williamson
2016-06-07 22:42                     ` [Qemu-devel] " Alex Williamson
2016-06-08  1:18                     ` Tian, Kevin
2016-06-08  1:18                       ` [Qemu-devel] " Tian, Kevin
2016-06-08  1:39                       ` Alex Williamson
2016-06-08  1:39                         ` [Qemu-devel] " Alex Williamson
2016-06-08  3:18                         ` Dong Jia
2016-06-08  3:18                           ` [Qemu-devel] " Dong Jia
2016-06-08  3:48                           ` Neo Jia
2016-06-08  3:48                             ` [Qemu-devel] " Neo Jia
2016-06-08  6:13                             ` Dong Jia
2016-06-08  6:13                               ` [Qemu-devel] " Dong Jia
2016-06-08  6:22                               ` Neo Jia
2016-06-08  6:22                                 ` [Qemu-devel] " Neo Jia
2016-06-08  4:29                           ` Alex Williamson
2016-06-08  4:29                             ` [Qemu-devel] " Alex Williamson
2016-06-15  6:37                             ` Dong Jia
2016-06-15  6:37                               ` [Qemu-devel] " Dong Jia
2016-05-24 19:58 ` [RFC PATCH v4 2/3] VFIO driver for mediated PCI device Kirti Wankhede
2016-05-24 19:58   ` [Qemu-devel] " Kirti Wankhede
2016-05-25  8:15   ` Tian, Kevin
2016-05-25  8:15     ` [Qemu-devel] " Tian, Kevin
2016-05-25 13:04     ` Kirti Wankhede
2016-05-25 13:04       ` [Qemu-devel] " Kirti Wankhede
2016-05-27 10:03       ` Tian, Kevin
2016-05-27 10:03         ` [Qemu-devel] " Tian, Kevin
2016-05-27 15:13         ` Alex Williamson
2016-05-27 15:13           ` [Qemu-devel] " Alex Williamson
2016-05-24 19:58 ` Kirti Wankhede [this message]
2016-05-24 19:58   ` [Qemu-devel] [RFC PATCH v4 3/3] VFIO Type1 IOMMU: Add support for mediated devices Kirti Wankhede
2016-06-01  8:40   ` Dong Jia
2016-06-01  8:40     ` [Qemu-devel] " Dong Jia
2016-06-02  7:56     ` Neo Jia
2016-06-02  7:56       ` [Qemu-devel] " Neo Jia
2016-06-03  8:32       ` Dong Jia
2016-06-03  8:32         ` [Qemu-devel] " Dong Jia
2016-06-03  8:37         ` Tian, Kevin
2016-06-03  8:37           ` [Qemu-devel] " Tian, Kevin
2016-05-25  7:13 ` [RFC PATCH v4 0/3] Add Mediated device support[was: Add vGPU support] Tian, Kevin
2016-05-25  7:13   ` [Qemu-devel] " Tian, Kevin
2016-05-25 13:43   ` Alex Williamson
2016-05-25 13:43     ` [Qemu-devel] " Alex Williamson
2016-05-27 11:02     ` Tian, Kevin
2016-05-27 11:02       ` [Qemu-devel] " Tian, Kevin
2016-05-27 14:54       ` Alex Williamson
2016-05-27 14:54         ` [Qemu-devel] " Alex Williamson
2016-05-27 22:43         ` Tian, Kevin
2016-05-27 22:43           ` [Qemu-devel] " Tian, Kevin
2016-05-28 14:56           ` Alex Williamson
2016-05-28 14:56             ` [Qemu-devel] " Alex Williamson
2016-05-31  2:29             ` Jike Song
2016-05-31  2:29               ` [Qemu-devel] " Jike Song
2016-05-31 14:29               ` Alex Williamson
2016-05-31 14:29                 ` [Qemu-devel] " Alex Williamson
2016-06-02  2:11                 ` Jike Song
2016-06-02  2:11                   ` [Qemu-devel] " Jike Song

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1464119897-10844-4-git-send-email-kwankhede@nvidia.com \
    --to=kwankhede@nvidia.com \
    --cc=alex.williamson@redhat.com \
    --cc=bjsdjshi@linux.vnet.ibm.com \
    --cc=cjia@nvidia.com \
    --cc=jike.song@intel.com \
    --cc=kevin.tian@intel.com \
    --cc=kraxel@redhat.com \
    --cc=kvm@vger.kernel.org \
    --cc=pbonzini@redhat.com \
    --cc=qemu-devel@nongnu.org \
    --cc=shuai.ruan@intel.com \
    --cc=zhiyuan.lv@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.