linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Jike Song <jike.song@intel.com>
To: Alex Williamson <alex.williamson@redhat.com>
Cc: Kirti Wankhede <kwankhede@nvidia.com>,
	pbonzini@redhat.com, kraxel@redhat.com, cjia@nvidia.com,
	qemu-devel@nongnu.org, kvm@vger.kernel.org, kevin.tian@intel.com,
	bjsdjshi@linux.vnet.ibm.com, linux-kernel@vger.kernel.org
Subject: Re: [PATCH v11 10/22] vfio iommu type1: Add support for mediated devices
Date: Tue, 08 Nov 2016 10:20:14 +0800	[thread overview]
Message-ID: <5821365E.6020304@intel.com> (raw)
In-Reply-To: <20161107161619.66e03d8f@t450s.home>

On 11/08/2016 07:16 AM, Alex Williamson wrote:
> On Sat, 5 Nov 2016 02:40:44 +0530
> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> 
>> VFIO IOMMU drivers are designed for the devices which are IOMMU capable.
>> Mediated device only uses IOMMU APIs, the underlying hardware can be
>> managed by an IOMMU domain.
>>
>> Aim of this change is:
>> - To use most of the code of TYPE1 IOMMU driver for mediated devices
>> - To support direct assigned device and mediated device in single module
>>
>> This change adds pin and unpin support for mediated device to TYPE1 IOMMU
>> backend module. More details:
>> - vfio_pin_pages() callback here uses task and address space of vfio_dma,
>>   that is, of the process who mapped that iova range.
>> - Added pfn_list tracking logic to address space structure. All pages
>>   pinned through this interface are trached in its address space.
>                                           ^ k
> ------------------------------------------|
> 
>> - Pinned pages list is used to verify unpinning request and to unpin
>>   remaining pages while detaching the group for that device.
>> - Page accounting is updated to account in its address space where the
>>   pages are pinned/unpinned.
>> - Accounting for mdev device is only done if there is no iommu capable
>>   domain in the container. When there is a direct device assigned to the
>>   container and that domain is iommu capable, all pages are already pinned
>>   during DMA_MAP.
>> - Page accounting is updated on hot plug and unplug mdev device and pass
>>   through device.
>>
>> Tested by assigning below combinations of devices to a single VM:
>> - GPU pass through only
>> - vGPU device only
>> - One GPU pass through and one vGPU device
>> - Linux VM hot plug and unplug vGPU device while GPU pass through device
>>   exist
>> - Linux VM hot plug and unplug GPU pass through device while vGPU device
>>   exist
>>
>> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
>> Signed-off-by: Neo Jia <cjia@nvidia.com>
>> Change-Id: I295d6f0f2e0579b8d9882bfd8fd5a4194b97bd9a
>> ---
>>  drivers/vfio/vfio_iommu_type1.c | 538 +++++++++++++++++++++++++++++++++++++---
>>  1 file changed, 500 insertions(+), 38 deletions(-)
>>
>> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
>> index 8d64528dcc22..e511073446a0 100644
>> --- a/drivers/vfio/vfio_iommu_type1.c
>> +++ b/drivers/vfio/vfio_iommu_type1.c
>> @@ -36,6 +36,7 @@
>>  #include <linux/uaccess.h>
>>  #include <linux/vfio.h>
>>  #include <linux/workqueue.h>
>> +#include <linux/mdev.h>
>>  
>>  #define DRIVER_VERSION  "0.2"
>>  #define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@redhat.com>"
>> @@ -56,6 +57,7 @@ MODULE_PARM_DESC(disable_hugepages,
>>  struct vfio_iommu {
>>  	struct list_head	domain_list;
>>  	struct list_head	addr_space_list;
>> +	struct vfio_domain	*external_domain; /* domain for external user */
>>  	struct mutex		lock;
>>  	struct rb_root		dma_list;
>>  	bool			v2;
>> @@ -67,6 +69,9 @@ struct vfio_addr_space {
>>  	struct mm_struct	*mm;
>>  	struct list_head	next;
>>  	atomic_t		ref_count;
>> +	/* external user pinned pfns */
>> +	struct rb_root		pfn_list;	/* pinned Host pfn list */
>> +	struct mutex		pfn_list_lock;	/* mutex for pfn_list */
>>  };
>>  
>>  struct vfio_domain {
>> @@ -83,6 +88,7 @@ struct vfio_dma {
>>  	unsigned long		vaddr;		/* Process virtual addr */
>>  	size_t			size;		/* Map size (bytes) */
>>  	int			prot;		/* IOMMU_READ/WRITE */
>> +	bool			iommu_mapped;
>>  	struct vfio_addr_space	*addr_space;
>>  	struct task_struct	*task;
>>  	bool			mlock_cap;
>> @@ -94,6 +100,19 @@ struct vfio_group {
>>  };
>>  
>>  /*
>> + * Guest RAM pinning working set or DMA target
>> + */
>> +struct vfio_pfn {
>> +	struct rb_node		node;
>> +	unsigned long		pfn;		/* Host pfn */
>> +	int			prot;
>> +	atomic_t		ref_count;
>> +};
>> +
>> +#define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)	\
>> +					(!list_empty(&iommu->domain_list))
>> +
>> +/*
>>   * This code handles mapping and unmapping of user data buffers
>>   * into DMA'ble space using the IOMMU
>>   */
>> @@ -153,6 +172,93 @@ static struct vfio_addr_space *vfio_find_addr_space(struct vfio_iommu *iommu,
>>  	return NULL;
>>  }
>>  
>> +/*
>> + * Helper Functions for host pfn list
>> + */
>> +static struct vfio_pfn *vfio_find_pfn(struct vfio_addr_space *addr_space,
>> +				      unsigned long pfn)
>> +{
>> +	struct vfio_pfn *vpfn;
>> +	struct rb_node *node = addr_space->pfn_list.rb_node;
>> +
>> +	while (node) {
>> +		vpfn = rb_entry(node, struct vfio_pfn, node);
>> +
>> +		if (pfn < vpfn->pfn)
>> +			node = node->rb_left;
>> +		else if (pfn > vpfn->pfn)
>> +			node = node->rb_right;
>> +		else
>> +			return vpfn;
>> +	}
>> +
>> +	return NULL;
>> +}
>> +
>> +static void vfio_link_pfn(struct vfio_addr_space *addr_space,
>> +			  struct vfio_pfn *new)
>> +{
>> +	struct rb_node **link, *parent = NULL;
>> +	struct vfio_pfn *vpfn;
>> +
>> +	link = &addr_space->pfn_list.rb_node;
>> +	while (*link) {
>> +		parent = *link;
>> +		vpfn = rb_entry(parent, struct vfio_pfn, node);
>> +
>> +		if (new->pfn < vpfn->pfn)
>> +			link = &(*link)->rb_left;
>> +		else
>> +			link = &(*link)->rb_right;
>> +	}
>> +
>> +	rb_link_node(&new->node, parent, link);
>> +	rb_insert_color(&new->node, &addr_space->pfn_list);
>> +}
>> +
>> +static void vfio_unlink_pfn(struct vfio_addr_space *addr_space,
>> +			    struct vfio_pfn *old)
>> +{
>> +	rb_erase(&old->node, &addr_space->pfn_list);
>> +}
>> +
>> +static int vfio_add_to_pfn_list(struct vfio_addr_space *addr_space,
>> +				unsigned long pfn, int prot)
>> +{
>> +	struct vfio_pfn *vpfn;
>> +
>> +	vpfn = kzalloc(sizeof(*vpfn), GFP_KERNEL);
>> +	if (!vpfn)
>> +		return -ENOMEM;
>> +
>> +	vpfn->pfn = pfn;
>> +	vpfn->prot = prot;
>> +	atomic_set(&vpfn->ref_count, 1);
>> +	vfio_link_pfn(addr_space, vpfn);
>> +	return 0;
>> +}
>> +
>> +static void vfio_remove_from_pfn_list(struct vfio_addr_space *addr_space,
>> +				      struct vfio_pfn *vpfn)
>> +{
>> +	vfio_unlink_pfn(addr_space, vpfn);
>> +	kfree(vpfn);
>> +}
>> +
>> +static int vfio_pfn_account(struct vfio_addr_space *addr_space,
>> +			    unsigned long pfn)
>> +{
>> +	struct vfio_pfn *p;
>> +	int ret = 1;
>> +
>> +	mutex_lock(&addr_space->pfn_list_lock);
>> +	p = vfio_find_pfn(addr_space, pfn);
>> +	if (p)
>> +		ret = 0;
>> +	mutex_unlock(&addr_space->pfn_list_lock);
>> +	return ret;
>> +}
>> +
>>  struct vwork {
>>  	struct mm_struct	*mm;
>>  	long			npage;
>> @@ -304,16 +410,18 @@ static long __vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
>>  	unsigned long limit = task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
>>  	bool lock_cap = dma->mlock_cap;
>>  	struct mm_struct *mm = dma->addr_space->mm;
>> -	long ret, i;
>> +	long ret, i, lock_acct;
>>  	bool rsvd;
>>  
>>  	ret = vaddr_get_pfn(mm, vaddr, prot, pfn_base);
>>  	if (ret)
>>  		return ret;
>>  
>> +	lock_acct = vfio_pfn_account(dma->addr_space, *pfn_base);
>> +
>>  	rsvd = is_invalid_reserved_pfn(*pfn_base);
>>  
>> -	if (!rsvd && !lock_cap && mm->locked_vm + 1 > limit) {
>> +	if (!rsvd && !lock_cap && mm->locked_vm + lock_acct > limit) {
>>  		put_pfn(*pfn_base, prot);
>>  		pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__,
>>  			limit << PAGE_SHIFT);
>> @@ -340,8 +448,10 @@ static long __vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
>>  			break;
>>  		}
>>  
>> +		lock_acct += vfio_pfn_account(dma->addr_space, pfn);
>> +
>>  		if (!rsvd && !lock_cap &&
>> -		    mm->locked_vm + i + 1 > limit) {
>> +		    mm->locked_vm + lock_acct + 1 > limit) {
>>  			put_pfn(pfn, prot);
>>  			pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
>>  				__func__, limit << PAGE_SHIFT);
>> @@ -350,7 +460,7 @@ static long __vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
>>  	}
>>  
>>  	if (!rsvd)
>> -		vfio_lock_acct(mm, i);
>> +		vfio_lock_acct(mm, lock_acct);
>>  
>>  	return i;
>>  }
>> @@ -370,14 +480,214 @@ static long __vfio_unpin_pages_remote(struct vfio_dma *dma, unsigned long pfn,
>>  	return unlocked;
>>  }
>>  
>> -static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
>> +static int __vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr,
>> +				    int prot, unsigned long *pfn_base,
>> +				    bool do_accounting)
>> +{
>> +	struct task_struct *task = dma->task;
>> +	unsigned long limit = task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
>> +	bool lock_cap = dma->mlock_cap;
>> +	struct mm_struct *mm = dma->addr_space->mm;
>> +	int ret;
>> +	bool rsvd;
>> +
>> +	ret = vaddr_get_pfn(mm, vaddr, prot, pfn_base);
>> +	if (ret)
>> +		return ret;
>> +
>> +	rsvd = is_invalid_reserved_pfn(*pfn_base);
>> +
>> +	if (!rsvd && !lock_cap && mm->locked_vm + 1 > limit) {
>> +		put_pfn(*pfn_base, prot);
>> +		pr_warn("%s: Task %s (%d) RLIMIT_MEMLOCK (%ld) exceeded\n",
>> +			__func__, task->comm, task_pid_nr(task),
>> +			limit << PAGE_SHIFT);
>> +		return -ENOMEM;
>> +	}
>> +
>> +	if (!rsvd && do_accounting)
>> +		vfio_lock_acct(mm, 1);
>> +
>> +	return 1;
>> +}
>> +
>> +static void __vfio_unpin_page_external(struct vfio_addr_space *addr_space,
>> +				       unsigned long pfn, int prot,
>> +				       bool do_accounting)
>> +{
>> +	put_pfn(pfn, prot);
>> +
>> +	if (do_accounting)
>> +		vfio_lock_acct(addr_space->mm, -1);
> 
> Can't we batch this like we do elsewhere?  Intel folks, AIUI you intend
> to pin all VM memory through this side channel, have you tested the
> scalability and performance of this with larger VMs?  Our vfio_pfn
> data structure alone is 40 bytes per pinned page, which means for
> each 1GB of VM memory, we have 10MBs worth of struct vfio_pfn!
> Additionally, unmapping each 1GB of VM memory will result in 256k
> separate vfio_lock_acct() callbacks.  I'm concerned that we're not
> being efficient enough in either space or time.

Hi Alex,

Sorry for the confusion: Intel vGPU doesn't actually need to pin all
guest memory. A vGPU has its own page table (GTT), and accesses to it
are trapped. Whenever the guest driver wants to specify a page for DMA,
it writes a GTT entry - so we are notified of the event and can pin only
that page.

Performance data will be shared once available. Thanks :)

--
Thanks,
Jike

  reply	other threads:[~2016-11-08  2:23 UTC|newest]

Thread overview: 68+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2016-11-04 21:10 [PATCH v11 00/22] Add Mediated device support Kirti Wankhede
2016-11-04 21:10 ` [PATCH v11 01/22] vfio: Mediated device Core driver Kirti Wankhede
2016-11-07  6:40   ` Tian, Kevin
     [not found]   ` <20161108092552.GA2090@bjsdjshi@linux.vnet.ibm.com>
2016-11-08 21:06     ` Kirti Wankhede
2016-11-04 21:10 ` [PATCH v11 02/22] vfio: VFIO based driver for Mediated devices Kirti Wankhede
2016-11-04 21:10 ` [PATCH v11 03/22] vfio: Rearrange functions to get vfio_group from dev Kirti Wankhede
2016-11-04 21:10 ` [PATCH v11 04/22] vfio: Common function to increment container_users Kirti Wankhede
2016-11-04 21:10 ` [PATCH v11 05/22] vfio iommu: Added pin and unpin callback functions to vfio_iommu_driver_ops Kirti Wankhede
2016-11-07 19:36   ` Alex Williamson
2016-11-08 13:55     ` Kirti Wankhede
2016-11-08 16:39       ` Alex Williamson
2016-11-08 18:47         ` Kirti Wankhede
2016-11-08 19:14           ` Alex Williamson
2016-11-04 21:10 ` [PATCH v11 06/22] vfio iommu type1: Update arguments of vfio_lock_acct Kirti Wankhede
2016-11-04 21:10 ` [PATCH v11 07/22] vfio iommu type1: Update argument of vaddr_get_pfn() Kirti Wankhede
2016-11-07  8:42   ` Alexey Kardashevskiy
2016-11-04 21:10 ` [PATCH v11 08/22] vfio iommu type1: Add find_iommu_group() function Kirti Wankhede
2016-11-04 21:10 ` [PATCH v11 09/22] vfio iommu type1: Add task structure to vfio_dma Kirti Wankhede
2016-11-07 21:03   ` Alex Williamson
2016-11-08 14:13     ` Kirti Wankhede
2016-11-08 16:43       ` Alex Williamson
2016-11-04 21:10 ` [PATCH v11 10/22] vfio iommu type1: Add support for mediated devices Kirti Wankhede
2016-11-07 23:16   ` Alex Williamson
2016-11-08  2:20     ` Jike Song [this message]
2016-11-08 16:18       ` Alex Williamson
2016-11-08 15:06     ` Kirti Wankhede
2016-11-08 17:05       ` Alex Williamson
2016-11-08  6:52   ` Alexey Kardashevskiy
2016-11-15  5:17     ` Alexey Kardashevskiy
2016-11-15  6:33       ` Kirti Wankhede
2016-11-15  7:27         ` Alexey Kardashevskiy
2016-11-15  7:56           ` Kirti Wankhede
2016-11-04 21:10 ` [PATCH v11 11/22] vfio iommu: Add blocking notifier to notify DMA_UNMAP Kirti Wankhede
2016-11-07 23:45   ` Alex Williamson
2016-11-08 16:26     ` Kirti Wankhede
2016-11-08 17:46       ` Alex Williamson
2016-11-08 19:59         ` Kirti Wankhede
2016-11-08 21:28           ` Alex Williamson
2016-11-14  7:52             ` Kirti Wankhede
2016-11-14 15:37               ` Alex Williamson
2016-11-04 21:10 ` [PATCH v11 12/22] vfio: Add notifier callback to parent's ops structure of mdev Kirti Wankhede
2016-11-07 23:51   ` Alex Williamson
2016-11-04 21:10 ` [PATCH v11 13/22] vfio: Introduce common function to add capabilities Kirti Wankhede
2016-11-08  7:29   ` Alexey Kardashevskiy
2016-11-08 20:46     ` Kirti Wankhede
2016-11-08 21:42       ` Alex Williamson
2016-11-09  2:23         ` Alexey Kardashevskiy
2016-11-04 21:10 ` [PATCH v11 14/22] vfio_pci: Update vfio_pci to use vfio_info_add_capability() Kirti Wankhede
2016-11-04 21:10 ` [PATCH v11 15/22] vfio: Introduce vfio_set_irqs_validate_and_prepare() Kirti Wankhede
2016-11-08  8:46   ` Alexey Kardashevskiy
2016-11-08 20:22     ` Kirti Wankhede
2016-11-09  3:07       ` Alexey Kardashevskiy
2016-11-09  3:35         ` Alex Williamson
2016-11-04 21:10 ` [PATCH v11 16/22] vfio_pci: Updated to use vfio_set_irqs_validate_and_prepare() Kirti Wankhede
2016-11-04 21:10 ` [PATCH v11 17/22] vfio_platform: " Kirti Wankhede
2016-11-08  8:52   ` Alexey Kardashevskiy
2016-11-08 20:41     ` Kirti Wankhede
2016-11-04 21:10 ` [PATCH v11 18/22] vfio: Define device_api strings Kirti Wankhede
2016-11-04 21:10 ` [PATCH v11 19/22] docs: Add Documentation for Mediated devices Kirti Wankhede
2016-11-04 21:10 ` [PATCH v11 20/22] docs: Sysfs ABI for mediated device framework Kirti Wankhede
2016-11-04 21:10 ` [PATCH v11 21/22] docs: Sample driver to demonstrate how to use Mediated " Kirti Wankhede
2016-11-04 21:10 ` [PATCH v11 22/22] MAINTAINERS: Add entry VFIO based Mediated device drivers Kirti Wankhede
2016-11-07  3:30 ` [PATCH v11 00/22] Add Mediated device support Alexey Kardashevskiy
2016-11-07  3:59   ` Kirti Wankhede
2016-11-07  5:06     ` Kirti Wankhede
2016-11-07  6:15     ` Alexey Kardashevskiy
2016-11-07  6:36       ` Kirti Wankhede
2016-11-07  6:46         ` Alexey Kardashevskiy

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=5821365E.6020304@intel.com \
    --to=jike.song@intel.com \
    --cc=alex.williamson@redhat.com \
    --cc=bjsdjshi@linux.vnet.ibm.com \
    --cc=cjia@nvidia.com \
    --cc=kevin.tian@intel.com \
    --cc=kraxel@redhat.com \
    --cc=kvm@vger.kernel.org \
    --cc=kwankhede@nvidia.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=pbonzini@redhat.com \
    --cc=qemu-devel@nongnu.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).