From mboxrd@z Thu Jan 1 00:00:00 1970 From: Jike Song Subject: Re: [PATCH v7 3/4] vfio iommu: Add support for mediated devices Date: Thu, 29 Sep 2016 10:17:23 +0800 Message-ID: <57EC79B3.2060305@intel.com> References: <1472097235-6332-1-git-send-email-kwankhede@nvidia.com> <1472097235-6332-4-git-send-email-kwankhede@nvidia.com> Mime-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 7bit Cc: alex.williamson@redhat.com, pbonzini@redhat.com, kraxel@redhat.com, cjia@nvidia.com, qemu-devel@nongnu.org, kvm@vger.kernel.org, kevin.tian@intel.com, bjsdjshi@linux.vnet.ibm.com, "Xiao, Guangrong" To: Kirti Wankhede Return-path: Received: from mga06.intel.com ([134.134.136.31]:1921 "EHLO mga06.intel.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1755122AbcI2CTt (ORCPT ); Wed, 28 Sep 2016 22:19:49 -0400 In-Reply-To: <1472097235-6332-4-git-send-email-kwankhede@nvidia.com> Sender: kvm-owner@vger.kernel.org List-ID: +Guangrong On 08/25/2016 11:53 AM, Kirti Wankhede wrote: > VFIO IOMMU drivers are designed for devices which are IOMMU capable. > A mediated device only uses IOMMU APIs; the underlying hardware can be > managed by an IOMMU domain. > > The aim of this change is: > - To reuse most of the code of the TYPE1 IOMMU driver for mediated devices > - To support directly assigned devices and mediated devices in a single module > > Added two new callback functions to struct vfio_iommu_driver_ops. A backend > IOMMU module that supports pinning and unpinning pages for mdev devices > should provide these functions. > Added APIs for pinning and unpinning pages to the VFIO module. These call back > into the backend iommu module to actually pin and unpin pages. > > This change adds pin and unpin support for mediated devices to the TYPE1 IOMMU > backend module. More details: > - When the iommu_group of a mediated device is attached, the task structure is > cached and used later for pinning pages and page accounting. > - It keeps track of pinned pages for the mediated domain. This data is used to > verify unpinning requests and to unpin any remaining pages while detaching, if > there are any. > - Used the existing mechanism for page accounting. If an IOMMU capable domain > exists in the container then all pages are already pinned and accounted. > Accounting for mdev devices is only done if there is no IOMMU capable > domain in the container. 
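[For reference, a minimal and purely hypothetical sketch of how a vendor driver is presumably expected to consume the two new exports; the helper names below are made up for illustration, and "dev" stands for the mediated device's struct device:]

#include <linux/device.h>
#include <linux/iommu.h>
#include <linux/vfio.h>

/* Hypothetical vendor-driver helper: pin one guest pfn and return the host pfn. */
static int example_gfn_to_host_pfn(struct device *dev, unsigned long gfn,
				   unsigned long *host_pfn)
{
	long ret;

	/* Pin a single guest page for read/write DMA and get its backing host pfn. */
	ret = vfio_pin_pages(dev, &gfn, 1, IOMMU_READ | IOMMU_WRITE, host_pfn);
	if (ret != 1)
		return ret < 0 ? (int)ret : -EFAULT;

	return 0;
}

/* Hypothetical counterpart: drop the pin once the device is done with the page. */
static void example_put_host_pfn(struct device *dev, unsigned long host_pfn)
{
	vfio_unpin_pages(dev, &host_pfn, 1);
}
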
> > Tested by assigning below combinations of devices to a single VM: > - GPU pass through only > - vGPU device only > - One GPU pass through and one vGPU device > - two GPU pass through > > Signed-off-by: Kirti Wankhede > Signed-off-by: Neo Jia > Change-Id: I295d6f0f2e0579b8d9882bfd8fd5a4194b97bd9a > Reviewed-on: http://git-master/r/1175707 > Reviewed-by: Automatic_Commit_Validation_User > --- > drivers/vfio/vfio.c | 117 ++++++++++ > drivers/vfio/vfio_iommu_type1.c | 498 ++++++++++++++++++++++++++++++++++++---- > include/linux/vfio.h | 13 +- > 3 files changed, 580 insertions(+), 48 deletions(-) > > diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c > index 6fd6fa5469de..e3e342861e04 100644 > --- a/drivers/vfio/vfio.c > +++ b/drivers/vfio/vfio.c > @@ -1782,6 +1782,123 @@ void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset) > } > EXPORT_SYMBOL_GPL(vfio_info_cap_shift); > > +static struct vfio_group *vfio_group_from_dev(struct device *dev) > +{ > + struct vfio_device *device; > + struct vfio_group *group; > + int ret; > + > + device = vfio_device_get_from_dev(dev); > + if (!device) > + return ERR_PTR(-EINVAL); > + > + group = device->group; > + if (!atomic_inc_not_zero(&group->container_users)) { > + ret = -EINVAL; > + goto err_ret; > + } > + > + if (group->noiommu) { > + atomic_dec(&group->container_users); > + ret = -EPERM; > + goto err_ret; > + } > + > + if (!group->container->iommu_driver || > + !vfio_group_viable(group)) { > + atomic_dec(&group->container_users); > + ret = -EINVAL; > + goto err_ret; > + } > + > + vfio_device_put(device); > + return group; > + > +err_ret: > + vfio_device_put(device); > + return ERR_PTR(ret); > +} > + > +/* > + * Pin a set of guest PFNs and return their associated host PFNs for local > + * domain only. > + * @dev [in] : device > + * @user_pfn [in]: array of user/guest PFNs > + * @npage [in]: count of array elements > + * @prot [in] : protection flags > + * @phys_pfn[out] : array of host PFNs > + */ > +long vfio_pin_pages(struct device *dev, unsigned long *user_pfn, > + long npage, int prot, unsigned long *phys_pfn) > +{ > + struct vfio_container *container; > + struct vfio_group *group; > + struct vfio_iommu_driver *driver; > + ssize_t ret = -EINVAL; > + > + if (!dev || !user_pfn || !phys_pfn) > + return -EINVAL; > + > + group = vfio_group_from_dev(dev); > + if (IS_ERR(group)) > + return PTR_ERR(group); > + > + container = group->container; > + if (IS_ERR(container)) > + return PTR_ERR(container); > + > + down_read(&container->group_lock); > + > + driver = container->iommu_driver; > + if (likely(driver && driver->ops->pin_pages)) > + ret = driver->ops->pin_pages(container->iommu_data, user_pfn, > + npage, prot, phys_pfn); > + > + up_read(&container->group_lock); > + vfio_group_try_dissolve_container(group); > + > + return ret; > + > +} > +EXPORT_SYMBOL(vfio_pin_pages); > + > +/* > + * Unpin set of host PFNs for local domain only. > + * @dev [in] : device > + * @pfn [in] : array of host PFNs to be unpinned. > + * @npage [in] :count of elements in array, that is number of pages. 
> + */ > +long vfio_unpin_pages(struct device *dev, unsigned long *pfn, long npage) > +{ > + struct vfio_container *container; > + struct vfio_group *group; > + struct vfio_iommu_driver *driver; > + ssize_t ret = -EINVAL; > + > + if (!dev || !pfn) > + return -EINVAL; > + > + group = vfio_group_from_dev(dev); > + if (IS_ERR(group)) > + return PTR_ERR(group); > + > + container = group->container; > + if (IS_ERR(container)) > + return PTR_ERR(container); > + > + down_read(&container->group_lock); > + > + driver = container->iommu_driver; > + if (likely(driver && driver->ops->unpin_pages)) > + ret = driver->ops->unpin_pages(container->iommu_data, pfn, > + npage); > + > + up_read(&container->group_lock); > + vfio_group_try_dissolve_container(group); > + return ret; > +} > +EXPORT_SYMBOL(vfio_unpin_pages); > + > /** > * Module/class support > */ > diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c > index 2ba19424e4a1..d52d75fd0f04 100644 > --- a/drivers/vfio/vfio_iommu_type1.c > +++ b/drivers/vfio/vfio_iommu_type1.c > @@ -55,18 +55,26 @@ MODULE_PARM_DESC(disable_hugepages, > > struct vfio_iommu { > struct list_head domain_list; > + struct vfio_domain *local_domain; > struct mutex lock; > struct rb_root dma_list; > bool v2; > bool nesting; > }; > > +struct local_addr_space { > + struct task_struct *task; > + struct rb_root pfn_list; /* pinned Host pfn list */ > + struct mutex pfn_list_lock; /* mutex for pfn_list */ > +}; > + > struct vfio_domain { > struct iommu_domain *domain; > struct list_head next; > struct list_head group_list; > int prot; /* IOMMU_CACHE */ > bool fgsp; /* Fine-grained super pages */ > + struct local_addr_space *local_addr_space; > }; > > struct vfio_dma { > @@ -83,6 +91,22 @@ struct vfio_group { > }; > > /* > + * Guest RAM pinning working set or DMA target > + */ > +struct vfio_pfn { > + struct rb_node node; > + unsigned long vaddr; /* virtual addr */ > + dma_addr_t iova; /* IOVA */ > + unsigned long pfn; /* Host pfn */ > + size_t prot; > + atomic_t ref_count; > +}; > + > + > +#define IS_IOMMU_CAPABLE_DOMAIN_IN_CONTAINER(iommu) \ > + (list_empty(&iommu->domain_list) ? 
false : true) > + > +/* > * This code handles mapping and unmapping of user data buffers > * into DMA'ble space using the IOMMU > */ > @@ -130,6 +154,84 @@ static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old) > rb_erase(&old->node, &iommu->dma_list); > } > > +/* > + * Helper Functions for host pfn list > + */ > + > +static struct vfio_pfn *vfio_find_pfn(struct vfio_domain *domain, > + unsigned long pfn) > +{ > + struct rb_node *node; > + struct vfio_pfn *vpfn, *ret = NULL; > + > + node = domain->local_addr_space->pfn_list.rb_node; > + > + while (node) { > + vpfn = rb_entry(node, struct vfio_pfn, node); > + > + if (pfn < vpfn->pfn) > + node = node->rb_left; > + else if (pfn > vpfn->pfn) > + node = node->rb_right; > + else { > + ret = vpfn; > + break; > + } > + } > + > + return ret; > +} > + > +static void vfio_link_pfn(struct vfio_domain *domain, struct vfio_pfn *new) > +{ > + struct rb_node **link, *parent = NULL; > + struct vfio_pfn *vpfn; > + > + link = &domain->local_addr_space->pfn_list.rb_node; > + while (*link) { > + parent = *link; > + vpfn = rb_entry(parent, struct vfio_pfn, node); > + > + if (new->pfn < vpfn->pfn) > + link = &(*link)->rb_left; > + else > + link = &(*link)->rb_right; > + } > + > + rb_link_node(&new->node, parent, link); > + rb_insert_color(&new->node, &domain->local_addr_space->pfn_list); > +} > + > +static void vfio_unlink_pfn(struct vfio_domain *domain, struct vfio_pfn *old) > +{ > + rb_erase(&old->node, &domain->local_addr_space->pfn_list); > +} > + > +static int vfio_add_to_pfn_list(struct vfio_domain *domain, unsigned long vaddr, > + dma_addr_t iova, unsigned long pfn, size_t prot) > +{ > + struct vfio_pfn *vpfn; > + > + vpfn = kzalloc(sizeof(*vpfn), GFP_KERNEL); > + if (!vpfn) > + return -ENOMEM; > + > + vpfn->vaddr = vaddr; > + vpfn->iova = iova; > + vpfn->pfn = pfn; > + vpfn->prot = prot; > + atomic_set(&vpfn->ref_count, 1); > + vfio_link_pfn(domain, vpfn); > + return 0; > +} > + > +static void vfio_remove_from_pfn_list(struct vfio_domain *domain, > + struct vfio_pfn *vpfn) > +{ > + vfio_unlink_pfn(domain, vpfn); > + kfree(vpfn); > +} > + > struct vwork { > struct mm_struct *mm; > long npage; > @@ -150,17 +252,17 @@ static void vfio_lock_acct_bg(struct work_struct *work) > kfree(vwork); > } > > -static void vfio_lock_acct(long npage) > +static void vfio_lock_acct(struct task_struct *task, long npage) > { > struct vwork *vwork; > struct mm_struct *mm; > > - if (!current->mm || !npage) > + if (!task->mm || !npage) > return; /* process exited or nothing to do */ > > - if (down_write_trylock(¤t->mm->mmap_sem)) { > - current->mm->locked_vm += npage; > - up_write(¤t->mm->mmap_sem); > + if (down_write_trylock(&task->mm->mmap_sem)) { > + task->mm->locked_vm += npage; > + up_write(&task->mm->mmap_sem); > return; > } > > @@ -172,7 +274,7 @@ static void vfio_lock_acct(long npage) > vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL); > if (!vwork) > return; > - mm = get_task_mm(current); > + mm = get_task_mm(task); > if (!mm) { > kfree(vwork); > return; > @@ -228,20 +330,31 @@ static int put_pfn(unsigned long pfn, int prot) > return 0; > } > > -static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn) > +static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr, > + int prot, unsigned long *pfn) > { > struct page *page[1]; > struct vm_area_struct *vma; > + struct mm_struct *local_mm = (mm ? 
mm : current->mm); > int ret = -EFAULT; > > - if (get_user_pages_fast(vaddr, 1, !!(prot & IOMMU_WRITE), page) == 1) { > + if (mm) { > + down_read(&local_mm->mmap_sem); > + ret = get_user_pages_remote(NULL, local_mm, vaddr, 1, > + !!(prot & IOMMU_WRITE), 0, page, NULL); > + up_read(&local_mm->mmap_sem); > + } else > + ret = get_user_pages_fast(vaddr, 1, > + !!(prot & IOMMU_WRITE), page); > + > + if (ret == 1) { > *pfn = page_to_pfn(page[0]); > return 0; > } > > - down_read(¤t->mm->mmap_sem); > + down_read(&local_mm->mmap_sem); > > - vma = find_vma_intersection(current->mm, vaddr, vaddr + 1); > + vma = find_vma_intersection(local_mm, vaddr, vaddr + 1); > > if (vma && vma->vm_flags & VM_PFNMAP) { > *pfn = ((vaddr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; > @@ -249,7 +362,7 @@ static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn) > ret = 0; > } > > - up_read(¤t->mm->mmap_sem); > + up_read(&local_mm->mmap_sem); > > return ret; > } > @@ -259,8 +372,8 @@ static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn) > * the iommu can only map chunks of consecutive pfns anyway, so get the > * first page and all consecutive pages with the same locking. > */ > -static long vfio_pin_pages(unsigned long vaddr, long npage, > - int prot, unsigned long *pfn_base) > +static long __vfio_pin_pages_remote(unsigned long vaddr, long npage, > + int prot, unsigned long *pfn_base) > { > unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; > bool lock_cap = capable(CAP_IPC_LOCK); > @@ -270,7 +383,7 @@ static long vfio_pin_pages(unsigned long vaddr, long npage, > if (!current->mm) > return -ENODEV; > > - ret = vaddr_get_pfn(vaddr, prot, pfn_base); > + ret = vaddr_get_pfn(NULL, vaddr, prot, pfn_base); > if (ret) > return ret; > > @@ -285,7 +398,7 @@ static long vfio_pin_pages(unsigned long vaddr, long npage, > > if (unlikely(disable_hugepages)) { > if (!rsvd) > - vfio_lock_acct(1); > + vfio_lock_acct(current, 1); > return 1; > } > > @@ -293,7 +406,7 @@ static long vfio_pin_pages(unsigned long vaddr, long npage, > for (i = 1, vaddr += PAGE_SIZE; i < npage; i++, vaddr += PAGE_SIZE) { > unsigned long pfn = 0; > > - ret = vaddr_get_pfn(vaddr, prot, &pfn); > + ret = vaddr_get_pfn(NULL, vaddr, prot, &pfn); > if (ret) > break; > > @@ -313,13 +426,13 @@ static long vfio_pin_pages(unsigned long vaddr, long npage, > } > > if (!rsvd) > - vfio_lock_acct(i); > + vfio_lock_acct(current, i); > > return i; > } > > -static long vfio_unpin_pages(unsigned long pfn, long npage, > - int prot, bool do_accounting) > +static long __vfio_unpin_pages_remote(unsigned long pfn, long npage, int prot, > + bool do_accounting) > { > unsigned long unlocked = 0; > long i; > @@ -328,7 +441,188 @@ static long vfio_unpin_pages(unsigned long pfn, long npage, > unlocked += put_pfn(pfn++, prot); > > if (do_accounting) > - vfio_lock_acct(-unlocked); > + vfio_lock_acct(current, -unlocked); > + return unlocked; > +} > + > +static long __vfio_pin_pages_local(struct vfio_domain *domain, > + unsigned long vaddr, int prot, > + unsigned long *pfn_base, > + bool do_accounting) > +{ > + unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; > + bool lock_cap = capable(CAP_IPC_LOCK); > + long ret; > + bool rsvd; > + struct task_struct *task = domain->local_addr_space->task; > + > + if (!task->mm) > + return -ENODEV; > + > + ret = vaddr_get_pfn(task->mm, vaddr, prot, pfn_base); > + if (ret) > + return ret; > + > + rsvd = is_invalid_reserved_pfn(*pfn_base); > + > + if (!rsvd && !lock_cap && task->mm->locked_vm + 1 
> limit) { > + put_pfn(*pfn_base, prot); > + pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__, > + limit << PAGE_SHIFT); > + return -ENOMEM; > + } > + > + if (!rsvd && do_accounting) > + vfio_lock_acct(task, 1); > + > + return 1; > +} > + > +static void __vfio_unpin_pages_local(struct vfio_domain *domain, > + unsigned long pfn, int prot, > + bool do_accounting) > +{ > + put_pfn(pfn, prot); > + > + if (do_accounting) > + vfio_lock_acct(domain->local_addr_space->task, -1); > +} > + > +static int vfio_unpin_pfn(struct vfio_domain *domain, > + struct vfio_pfn *vpfn, bool do_accounting) > +{ > + __vfio_unpin_pages_local(domain, vpfn->pfn, vpfn->prot, > + do_accounting); > + > + if (atomic_dec_and_test(&vpfn->ref_count)) > + vfio_remove_from_pfn_list(domain, vpfn); > + > + return 1; > +} > + > +static long vfio_iommu_type1_pin_pages(void *iommu_data, > + unsigned long *user_pfn, > + long npage, int prot, > + unsigned long *phys_pfn) > +{ > + struct vfio_iommu *iommu = iommu_data; > + struct vfio_domain *domain; > + int i, j, ret; > + long retpage; > + unsigned long remote_vaddr; > + unsigned long *pfn = phys_pfn; > + struct vfio_dma *dma; > + bool do_accounting = false; > + > + if (!iommu || !user_pfn || !phys_pfn) > + return -EINVAL; > + > + mutex_lock(&iommu->lock); > + > + if (!iommu->local_domain) { > + ret = -EINVAL; > + goto pin_done; > + } > + > + domain = iommu->local_domain; > + > + /* > + * If iommu capable domain exist in the container then all pages are > + * already pinned and accounted. Accouting should be done if there is no > + * iommu capable domain in the container. > + */ > + do_accounting = !IS_IOMMU_CAPABLE_DOMAIN_IN_CONTAINER(iommu); > + > + for (i = 0; i < npage; i++) { > + struct vfio_pfn *p; > + dma_addr_t iova; > + > + iova = user_pfn[i] << PAGE_SHIFT; > + > + dma = vfio_find_dma(iommu, iova, 0); > + if (!dma) { > + ret = -EINVAL; > + goto pin_unwind; > + } > + > + remote_vaddr = dma->vaddr + iova - dma->iova; > + > + retpage = __vfio_pin_pages_local(domain, remote_vaddr, prot, > + &pfn[i], do_accounting); Hi Kirti, Here you call __vfio_pin_pages_local() -> vaddr_get_pfn() -> GUP regardless of whether the vaddr is already pinned or not. That probably means that if the caller calls vfio_pin_pages() multiple times with the same GPA, it leaks page references, since GUP always increases the page refcnt. FWIW, I would like to have the pfn_list keyed by iova, so you can always try to find the PFN for a given iova first, and pin it only if it is not found. 
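Something along these lines, completely untested, where vfio_find_pfn_by_iova() is a made-up helper that walks the rb-tree by iova instead of pfn (vfio_link_pfn()/vfio_add_to_pfn_list() would have to insert by iova as well):

/* Hypothetical lookup keyed by iova; assumes pfn_list is kept sorted by iova. */
static struct vfio_pfn *vfio_find_pfn_by_iova(struct vfio_domain *domain,
					      dma_addr_t iova)
{
	struct rb_node *node = domain->local_addr_space->pfn_list.rb_node;

	while (node) {
		struct vfio_pfn *vpfn = rb_entry(node, struct vfio_pfn, node);

		if (iova < vpfn->iova)
			node = node->rb_left;
		else if (iova > vpfn->iova)
			node = node->rb_right;
		else
			return vpfn;
	}
	return NULL;
}

Then the loop body in vfio_iommu_type1_pin_pages(), after remote_vaddr is computed, could look roughly like this (the sketch simply holds pfn_list_lock across the pin to keep it short):

		mutex_lock(&domain->local_addr_space->pfn_list_lock);

		/* Reuse an existing pin for this iova, if any. */
		p = vfio_find_pfn_by_iova(domain, iova);
		if (p) {
			atomic_inc(&p->ref_count);
			pfn[i] = p->pfn;
			mutex_unlock(&domain->local_addr_space->pfn_list_lock);
			continue;
		}

		/* Not pinned yet: GUP exactly once for this iova. */
		retpage = __vfio_pin_pages_local(domain, remote_vaddr, prot,
						 &pfn[i], do_accounting);
		if (retpage <= 0) {
			mutex_unlock(&domain->local_addr_space->pfn_list_lock);
			ret = (int)retpage;
			goto pin_unwind;
		}

		ret = vfio_add_to_pfn_list(domain, remote_vaddr, iova,
					   pfn[i], prot);
		mutex_unlock(&domain->local_addr_space->pfn_list_lock);
		if (ret) {
			__vfio_unpin_pages_local(domain, pfn[i], prot,
						 do_accounting);
			goto pin_unwind;
		}

That way a second vfio_pin_pages() call with the same GPA only bumps ref_count instead of taking another GUP reference; the pin_unwind path and the unpin side would of course need a matching iova-based lookup.
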
-- Thanks, Jike > + if (retpage <= 0) { > + WARN_ON(!retpage); > + ret = (int)retpage; > + goto pin_unwind; > + } > + > + mutex_lock(&domain->local_addr_space->pfn_list_lock); > + > + /* search if pfn exist */ > + p = vfio_find_pfn(domain, pfn[i]); > + if (p) { > + atomic_inc(&p->ref_count); > + mutex_unlock(&domain->local_addr_space->pfn_list_lock); > + continue; > + } > + > + ret = vfio_add_to_pfn_list(domain, remote_vaddr, iova, > + pfn[i], prot); > + mutex_unlock(&domain->local_addr_space->pfn_list_lock); > + > + if (ret) { > + __vfio_unpin_pages_local(domain, pfn[i], prot, > + do_accounting); > + goto pin_unwind; > + } > + } > + > + ret = i; > + goto pin_done; > + > +pin_unwind: > + pfn[i] = 0; > + mutex_lock(&domain->local_addr_space->pfn_list_lock); > + for (j = 0; j < i; j++) { > + struct vfio_pfn *p; > + > + p = vfio_find_pfn(domain, pfn[j]); > + if (p) > + vfio_unpin_pfn(domain, p, do_accounting); > + > + pfn[j] = 0; > + } > + mutex_unlock(&domain->local_addr_space->pfn_list_lock); > + > +pin_done: > + mutex_unlock(&iommu->lock); > + return ret; > +} > + > +static long vfio_iommu_type1_unpin_pages(void *iommu_data, unsigned long *pfn, > + long npage) > +{ > + struct vfio_iommu *iommu = iommu_data; > + struct vfio_domain *domain = NULL; > + long unlocked = 0; > + int i; > + > + if (!iommu || !pfn) > + return -EINVAL; > + > + domain = iommu->local_domain; > + > + for (i = 0; i < npage; i++) { > + struct vfio_pfn *p; > + > + mutex_lock(&domain->local_addr_space->pfn_list_lock); > + > + /* verify if pfn exist in pfn_list */ > + p = vfio_find_pfn(domain, pfn[i]); > + if (p) > + unlocked += vfio_unpin_pfn(domain, p, true); > + > + mutex_unlock(&domain->local_addr_space->pfn_list_lock); > + } > > return unlocked; > } > @@ -341,6 +635,9 @@ static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma) > > if (!dma->size) > return; > + > + if (!IS_IOMMU_CAPABLE_DOMAIN_IN_CONTAINER(iommu)) > + return; > /* > * We use the IOMMU to track the physical addresses, otherwise we'd > * need a much more complicated tracking system. Unfortunately that > @@ -382,15 +679,15 @@ static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma) > if (WARN_ON(!unmapped)) > break; > > - unlocked += vfio_unpin_pages(phys >> PAGE_SHIFT, > - unmapped >> PAGE_SHIFT, > - dma->prot, false); > + unlocked += __vfio_unpin_pages_remote(phys >> PAGE_SHIFT, > + unmapped >> PAGE_SHIFT, > + dma->prot, false); > iova += unmapped; > > cond_resched(); > } > > - vfio_lock_acct(-unlocked); > + vfio_lock_acct(current, -unlocked); > } > > static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma) > @@ -611,10 +908,16 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu, > /* Insert zero-sized and grow as we map chunks of it */ > vfio_link_dma(iommu, dma); > > + /* Don't pin and map if container doesn't contain IOMMU capable domain*/ > + if (!IS_IOMMU_CAPABLE_DOMAIN_IN_CONTAINER(iommu)) { > + dma->size = size; > + goto map_done; > + } > + > while (size) { > /* Pin a contiguous chunk of memory */ > - npage = vfio_pin_pages(vaddr + dma->size, > - size >> PAGE_SHIFT, prot, &pfn); > + npage = __vfio_pin_pages_remote(vaddr + dma->size, > + size >> PAGE_SHIFT, prot, &pfn); > if (npage <= 0) { > WARN_ON(!npage); > ret = (int)npage; > @@ -624,7 +927,7 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu, > /* Map it! 
*/ > ret = vfio_iommu_map(iommu, iova + dma->size, pfn, npage, prot); > if (ret) { > - vfio_unpin_pages(pfn, npage, prot, true); > + __vfio_unpin_pages_remote(pfn, npage, prot, true); > break; > } > > @@ -635,6 +938,7 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu, > if (ret) > vfio_remove_dma(iommu, dma); > > +map_done: > mutex_unlock(&iommu->lock); > return ret; > } > @@ -734,11 +1038,24 @@ static void vfio_test_domain_fgsp(struct vfio_domain *domain) > __free_pages(pages, order); > } > > +static struct vfio_group *find_iommu_group(struct vfio_domain *domain, > + struct iommu_group *iommu_group) > +{ > + struct vfio_group *g; > + > + list_for_each_entry(g, &domain->group_list, next) { > + if (g->iommu_group == iommu_group) > + return g; > + } > + > + return NULL; > +} > + > static int vfio_iommu_type1_attach_group(void *iommu_data, > struct iommu_group *iommu_group) > { > struct vfio_iommu *iommu = iommu_data; > - struct vfio_group *group, *g; > + struct vfio_group *group; > struct vfio_domain *domain, *d; > struct bus_type *bus = NULL; > int ret; > @@ -746,10 +1063,14 @@ static int vfio_iommu_type1_attach_group(void *iommu_data, > mutex_lock(&iommu->lock); > > list_for_each_entry(d, &iommu->domain_list, next) { > - list_for_each_entry(g, &d->group_list, next) { > - if (g->iommu_group != iommu_group) > - continue; > + if (find_iommu_group(d, iommu_group)) { > + mutex_unlock(&iommu->lock); > + return -EINVAL; > + } > + } > > + if (iommu->local_domain) { > + if (find_iommu_group(iommu->local_domain, iommu_group)) { > mutex_unlock(&iommu->lock); > return -EINVAL; > } > @@ -769,6 +1090,33 @@ static int vfio_iommu_type1_attach_group(void *iommu_data, > if (ret) > goto out_free; > > + if (IS_ENABLED(CONFIF_VFIO_MDEV) && !iommu_present(bus) && > + (bus == &mdev_bus_type)) { > + if (iommu->local_domain) { > + list_add(&group->next, > + &iommu->local_domain->group_list); > + kfree(domain); > + mutex_unlock(&iommu->lock); > + return 0; > + } > + > + domain->local_addr_space = kzalloc(sizeof(*domain->local_addr_space), > + GFP_KERNEL); > + if (!domain->local_addr_space) { > + ret = -ENOMEM; > + goto out_free; > + } > + > + domain->local_addr_space->task = current; > + INIT_LIST_HEAD(&domain->group_list); > + list_add(&group->next, &domain->group_list); > + domain->local_addr_space->pfn_list = RB_ROOT; > + mutex_init(&domain->local_addr_space->pfn_list_lock); > + iommu->local_domain = domain; > + mutex_unlock(&iommu->lock); > + return 0; > + } > + > domain->domain = iommu_domain_alloc(bus); > if (!domain->domain) { > ret = -EIO; > @@ -859,6 +1207,18 @@ static void vfio_iommu_unmap_unpin_all(struct vfio_iommu *iommu) > vfio_remove_dma(iommu, rb_entry(node, struct vfio_dma, node)); > } > > +static void vfio_local_unpin_all(struct vfio_domain *domain) > +{ > + struct rb_node *node; > + > + mutex_lock(&domain->local_addr_space->pfn_list_lock); > + while ((node = rb_first(&domain->local_addr_space->pfn_list))) { > + vfio_unpin_pfn(domain, > + rb_entry(node, struct vfio_pfn, node), false); > + } > + mutex_unlock(&domain->local_addr_space->pfn_list_lock); > +} > + > static void vfio_iommu_type1_detach_group(void *iommu_data, > struct iommu_group *iommu_group) > { > @@ -868,31 +1228,52 @@ static void vfio_iommu_type1_detach_group(void *iommu_data, > > mutex_lock(&iommu->lock); > > - list_for_each_entry(domain, &iommu->domain_list, next) { > - list_for_each_entry(group, &domain->group_list, next) { > - if (group->iommu_group != iommu_group) > - continue; > + if (iommu->local_domain) { > + domain = 
iommu->local_domain; > + group = find_iommu_group(domain, iommu_group); > + if (group) { > + list_del(&group->next); > + kfree(group); > > + if (list_empty(&domain->group_list)) { > + vfio_local_unpin_all(domain); > + if (!IS_IOMMU_CAPABLE_DOMAIN_IN_CONTAINER(iommu)) > + vfio_iommu_unmap_unpin_all(iommu); > + kfree(domain); > + iommu->local_domain = NULL; > + } > + goto detach_group_done; > + } > + } > + > + if (!IS_IOMMU_CAPABLE_DOMAIN_IN_CONTAINER(iommu)) > + goto detach_group_done; > + > + list_for_each_entry(domain, &iommu->domain_list, next) { > + group = find_iommu_group(domain, iommu_group); > + if (group) { > iommu_detach_group(domain->domain, iommu_group); > list_del(&group->next); > kfree(group); > /* > * Group ownership provides privilege, if the group > * list is empty, the domain goes away. If it's the > - * last domain, then all the mappings go away too. > + * last domain with iommu and local domain doesn't > + * exist, the all the mappings go away too. > */ > if (list_empty(&domain->group_list)) { > - if (list_is_singular(&iommu->domain_list)) > + if (list_is_singular(&iommu->domain_list) && > + (!iommu->local_domain)) > vfio_iommu_unmap_unpin_all(iommu); > iommu_domain_free(domain->domain); > list_del(&domain->next); > kfree(domain); > } > - goto done; > + break; > } > } > > -done: > +detach_group_done: > mutex_unlock(&iommu->lock); > } > > @@ -924,27 +1305,48 @@ static void *vfio_iommu_type1_open(unsigned long arg) > return iommu; > } > > +static void vfio_release_domain(struct vfio_domain *domain) > +{ > + struct vfio_group *group, *group_tmp; > + > + list_for_each_entry_safe(group, group_tmp, > + &domain->group_list, next) { > + if (!domain->local_addr_space) > + iommu_detach_group(domain->domain, group->iommu_group); > + list_del(&group->next); > + kfree(group); > + } > + > + if (domain->local_addr_space) > + vfio_local_unpin_all(domain); > + else > + iommu_domain_free(domain->domain); > +} > + > static void vfio_iommu_type1_release(void *iommu_data) > { > struct vfio_iommu *iommu = iommu_data; > struct vfio_domain *domain, *domain_tmp; > - struct vfio_group *group, *group_tmp; > + > + if (iommu->local_domain) { > + vfio_release_domain(iommu->local_domain); > + kfree(iommu->local_domain); > + iommu->local_domain = NULL; > + } > > vfio_iommu_unmap_unpin_all(iommu); > > + if (!IS_IOMMU_CAPABLE_DOMAIN_IN_CONTAINER(iommu)) > + goto release_exit; > + > list_for_each_entry_safe(domain, domain_tmp, > &iommu->domain_list, next) { > - list_for_each_entry_safe(group, group_tmp, > - &domain->group_list, next) { > - iommu_detach_group(domain->domain, group->iommu_group); > - list_del(&group->next); > - kfree(group); > - } > - iommu_domain_free(domain->domain); > + vfio_release_domain(domain); > list_del(&domain->next); > kfree(domain); > } > > +release_exit: > kfree(iommu); > } > > @@ -1048,6 +1450,8 @@ static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = { > .ioctl = vfio_iommu_type1_ioctl, > .attach_group = vfio_iommu_type1_attach_group, > .detach_group = vfio_iommu_type1_detach_group, > + .pin_pages = vfio_iommu_type1_pin_pages, > + .unpin_pages = vfio_iommu_type1_unpin_pages, > }; > > static int __init vfio_iommu_type1_init(void) > diff --git a/include/linux/vfio.h b/include/linux/vfio.h > index 0ecae0b1cd34..0bd25ba6223d 100644 > --- a/include/linux/vfio.h > +++ b/include/linux/vfio.h > @@ -17,6 +17,7 @@ > #include > #include > #include > +#include > > /** > * struct vfio_device_ops - VFIO bus driver device callbacks > @@ -75,7 +76,11 @@ struct 
vfio_iommu_driver_ops { > struct iommu_group *group); > void (*detach_group)(void *iommu_data, > struct iommu_group *group); > - > + long (*pin_pages)(void *iommu_data, unsigned long *user_pfn, > + long npage, int prot, > + unsigned long *phys_pfn); > + long (*unpin_pages)(void *iommu_data, unsigned long *pfn, > + long npage); > }; > > extern int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops); > @@ -127,6 +132,12 @@ static inline long vfio_spapr_iommu_eeh_ioctl(struct iommu_group *group, > } > #endif /* CONFIG_EEH */ > > +extern long vfio_pin_pages(struct device *dev, unsigned long *user_pfn, > + long npage, int prot, unsigned long *phys_pfn); > + > +extern long vfio_unpin_pages(struct device *dev, unsigned long *pfn, > + long npage); > + > /* > * IRQfd - generic > */ >