[v11,05/10] iommu/vt-d: Add bind guest PASID support
diff mbox series

Message ID 1585939334-21396-6-git-send-email-jacob.jun.pan@linux.intel.com
State New
Headers show
Series
  • Nested Shared Virtual Address (SVA) VT-d support
Related show

Commit Message

Jacob Pan April 3, 2020, 6:42 p.m. UTC
When supporting guest SVA with emulated IOMMU, the guest PASID
table is shadowed in VMM. Updates to guest vIOMMU PASID table
will result in PASID cache flush which will be passed down to
the host as bind guest PASID calls.

For the SL page tables, it will be harvested from device's
default domain (request w/o PASID), or aux domain in case of
mediated device.

    .-------------.  .---------------------------.
    |   vIOMMU    |  | Guest process CR3, FL only|
    |             |  '---------------------------'
    .----------------/
    | PASID Entry |--- PASID cache flush -
    '-------------'                       |
    |             |                       V
    |             |                CR3 in GPA
    '-------------'
Guest
------| Shadow |--------------------------|--------
      v        v                          v
Host
    .-------------.  .----------------------.
    |   pIOMMU    |  | Bind FL for GVA-GPA  |
    |             |  '----------------------'
    .----------------/  |
    | PASID Entry |     V (Nested xlate)
    '----------------\.------------------------------.
    |             |   |SL for GPA-HPA, default domain|
    |             |   '------------------------------'
    '-------------'
Where:
 - FL = First level/stage one page tables
 - SL = Second level/stage two page tables

---
v11: Fixed locking, avoid duplicated paging mode check, added helper to
free svm if device list is empty. Use rate limited error message since
the bind gpasid call comes from user space.
---

Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
Signed-off-by: Liu, Yi L <yi.l.liu@linux.intel.com>
---
 drivers/iommu/intel-iommu.c |   4 +
 drivers/iommu/intel-svm.c   | 206 ++++++++++++++++++++++++++++++++++++++++++++
 include/linux/intel-iommu.h |   8 +-
 include/linux/intel-svm.h   |  17 ++++
 4 files changed, 234 insertions(+), 1 deletion(-)

Comments

Auger Eric April 9, 2020, 7:41 a.m. UTC | #1
Hi Jacob,

On 4/3/20 8:42 PM, Jacob Pan wrote:
> When supporting guest SVA with emulated IOMMU, the guest PASID
> table is shadowed in VMM. Updates to guest vIOMMU PASID table
> will result in PASID cache flush which will be passed down to
> the host as bind guest PASID calls.
> 
> For the SL page tables, it will be harvested from device's
> default domain (request w/o PASID), or aux domain in case of
> mediated device.
> 
>     .-------------.  .---------------------------.
>     |   vIOMMU    |  | Guest process CR3, FL only|
>     |             |  '---------------------------'
>     .----------------/
>     | PASID Entry |--- PASID cache flush -
>     '-------------'                       |
>     |             |                       V
>     |             |                CR3 in GPA
>     '-------------'
> Guest
> ------| Shadow |--------------------------|--------
>       v        v                          v
> Host
>     .-------------.  .----------------------.
>     |   pIOMMU    |  | Bind FL for GVA-GPA  |
>     |             |  '----------------------'
>     .----------------/  |
>     | PASID Entry |     V (Nested xlate)
>     '----------------\.------------------------------.
>     |             |   |SL for GPA-HPA, default domain|
>     |             |   '------------------------------'
>     '-------------'
> Where:
>  - FL = First level/stage one page tables
>  - SL = Second level/stage two page tables
> 
> ---
> v11: Fixed locking, avoid duplicated paging mode check, added helper to
> free svm if device list is empty. Use rate limited error message since
> the bind gpasid call comes from user space.
> ---
> 
> Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
> Signed-off-by: Liu, Yi L <yi.l.liu@linux.intel.com>
> ---
>  drivers/iommu/intel-iommu.c |   4 +
>  drivers/iommu/intel-svm.c   | 206 ++++++++++++++++++++++++++++++++++++++++++++
>  include/linux/intel-iommu.h |   8 +-
>  include/linux/intel-svm.h   |  17 ++++
>  4 files changed, 234 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
> index c0dadec5a6b3..94c7993dac6a 100644
> --- a/drivers/iommu/intel-iommu.c
> +++ b/drivers/iommu/intel-iommu.c
> @@ -6178,6 +6178,10 @@ const struct iommu_ops intel_iommu_ops = {
>  	.dev_disable_feat	= intel_iommu_dev_disable_feat,
>  	.is_attach_deferred	= intel_iommu_is_attach_deferred,
>  	.pgsize_bitmap		= INTEL_IOMMU_PGSIZES,
> +#ifdef CONFIG_INTEL_IOMMU_SVM
> +	.sva_bind_gpasid	= intel_svm_bind_gpasid,
> +	.sva_unbind_gpasid	= intel_svm_unbind_gpasid,
> +#endif
>  };
>  
>  static void quirk_iommu_igfx(struct pci_dev *dev)
> diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c
> index d7f2a5358900..7cf711318b87 100644
> --- a/drivers/iommu/intel-svm.c
> +++ b/drivers/iommu/intel-svm.c
> @@ -226,6 +226,212 @@ static LIST_HEAD(global_svm_list);
>  	list_for_each_entry((sdev), &(svm)->devs, list)	\
>  		if ((d) != (sdev)->dev) {} else
>  
> +
> +static inline void intel_svm_free_if_empty(struct intel_svm *svm, u64 pasid)
> +{
> +	if (list_empty(&svm->devs)) {
> +		ioasid_set_data(pasid, NULL);
> +		kfree(svm);
> +	}
> +}
> +
> +int intel_svm_bind_gpasid(struct iommu_domain *domain,
> +			struct device *dev,
> +			struct iommu_gpasid_bind_data *data)
> +{
> +	struct intel_iommu *iommu = intel_svm_device_to_iommu(dev);
> +	struct dmar_domain *dmar_domain;
> +	struct intel_svm_dev *sdev;
> +	struct intel_svm *svm;
> +	int ret = 0;
> +
> +	if (WARN_ON(!iommu) || !data)
> +		return -EINVAL;
> +
> +	if (data->version != IOMMU_GPASID_BIND_VERSION_1 ||
> +	    data->format != IOMMU_PASID_FORMAT_INTEL_VTD)
> +		return -EINVAL;
> +
> +	if (dev_is_pci(dev)) {
> +		/* VT-d supports devices with full 20 bit PASIDs only */
> +		if (pci_max_pasids(to_pci_dev(dev)) != PASID_MAX)
> +			return -EINVAL;
> +	} else {
> +		return -ENOTSUPP;
> +	}
> +
> +	/*
> +	 * We only check host PASID range, we have no knowledge to check
> +	 * guest PASID range.
> +	 */
> +	if (data->hpasid <= 0 || data->hpasid >= PASID_MAX)
> +		return -EINVAL;
> +
> +	dmar_domain = to_dmar_domain(domain);
> +
> +	mutex_lock(&pasid_mutex);
> +	svm = ioasid_find(NULL, data->hpasid, NULL);
> +	if (IS_ERR(svm)) {
> +		ret = PTR_ERR(svm);
> +		goto out;
> +	}
> +
> +	if (svm) {
> +		/*
> +		 * If we found svm for the PASID, there must be at
> +		 * least one device bond, otherwise svm should be freed.
> +		 */
> +		if (WARN_ON(list_empty(&svm->devs))) {
> +			ret = -EINVAL;
> +			goto out;
> +		}
> +
> +		for_each_svm_dev(sdev, svm, dev) {
> +			/* In case of multiple sub-devices of the same pdev
> +			 * assigned, we should allow multiple bind calls with
> +			 * the same PASID and pdev.
> +			 */
> +			sdev->users++;
What if this is not an mdev device. Is it also allowed?
> +			goto out;
> +		}
> +	} else {
> +		/* We come here when PASID has never been bond to a device. */
> +		svm = kzalloc(sizeof(*svm), GFP_KERNEL);
> +		if (!svm) {
> +			ret = -ENOMEM;
> +			goto out;
> +		}
> +		/* REVISIT: upper layer/VFIO can track host process that bind the PASID.
> +		 * ioasid_set = mm might be sufficient for vfio to check pasid VMM
> +		 * ownership. We can drop the following line once VFIO and IOASID set
> +		 * check is in place.
> +		 */
> +		svm->mm = get_task_mm(current);
> +		svm->pasid = data->hpasid;
> +		if (data->flags & IOMMU_SVA_GPASID_VAL) {
> +			svm->gpasid = data->gpasid;
> +			svm->flags |= SVM_FLAG_GUEST_PASID;
> +		}
> +		ioasid_set_data(data->hpasid, svm);
> +		INIT_LIST_HEAD_RCU(&svm->devs);
> +		mmput(svm->mm);
> +	}
> +	sdev = kzalloc(sizeof(*sdev), GFP_KERNEL);
> +	if (!sdev) {
> +		/*
> +		 * If this is a new PASID that never bond to a device, then
> +		 * the device list must be empty which indicates struct svm
> +		 * was allocated in this function.
> +		 */
> +		intel_svm_free_if_empty(svm, data->hpasid);
> +		ret = -ENOMEM;
> +		goto out;
> +	}
> +	sdev->dev = dev;
> +	sdev->users = 1;
> +
> +	/* Set up device context entry for PASID if not enabled already */
> +	ret = intel_iommu_enable_pasid(iommu, sdev->dev);
> +	if (ret) {
> +		dev_err_ratelimited(dev, "Failed to enable PASID capability\n");
> +		kfree(sdev);
> +		intel_svm_free_if_empty(svm, data->hpasid);
> +		goto out;
> +	}
> +
> +	/*
> +	 * PASID table is per device for better security. Therefore, for
> +	 * each bind of a new device even with an existing PASID, we need to
> +	 * call the nested mode setup function here.
> +	 */
> +	spin_lock(&iommu->lock);
> +	ret = intel_pasid_setup_nested(iommu,
> +				       dev,
> +				       (pgd_t *)data->gpgd,
> +				       data->hpasid,
> +				       &data->vtd,
> +				       dmar_domain,
> +				       data->addr_width);
> +	if (ret) {
> +		dev_err_ratelimited(dev, "Failed to set up PASID %llu in nested mode, Err %d\n",
> +			data->hpasid, ret);
> +		/*
> +		 * PASID entry should be in cleared state if nested mode
> +		 * set up failed. So we only need to clear IOASID tracking
> +		 * data such that free call will succeed.
> +		 */
> +		kfree(sdev);
> +		intel_svm_free_if_empty(svm, data->hpasid);
> +		spin_unlock(&iommu->lock);
> +		goto out;
> +	}
> +	spin_unlock(&iommu->lock);
> +	svm->flags |= SVM_FLAG_GUEST_MODE;
> +
> +	init_rcu_head(&sdev->rcu);
> +	list_add_rcu(&sdev->list, &svm->devs);
> + out:
> +	mutex_unlock(&pasid_mutex);
> +	return ret;
> +}
> +
> +int intel_svm_unbind_gpasid(struct device *dev, int pasid)
> +{
This function's code looks very very similar to intel_svm_unbind_mm()
besides to the mmu_notifier_unregister and the ioasid_free(). Don't we
have means to factorize the code by checking the SVM_FLAG_GUEST_MODE flag?

Please could you explain again what does happen if the guest process
dies/VM dies. How do we make sure the svm get freed. I fail to
understand the whole picture at the moment as you cannot fully rely on
the userspace to call unbind_gpasid.
> +	struct intel_iommu *iommu = intel_svm_device_to_iommu(dev);
> +	struct intel_svm_dev *sdev;
> +	struct intel_svm *svm;
> +	int ret = -EINVAL;
> +
> +	if (WARN_ON(!iommu))
> +		return -EINVAL;
> +
> +	mutex_lock(&pasid_mutex);
> +	svm = ioasid_find(NULL, pasid, NULL);
> +	if (!svm) {
> +		ret = -EINVAL;
As per the discussion on the VFIO series, shall we return an error in
that case? Same below?
> +		goto out;
> +	}
> +
> +	if (IS_ERR(svm)) {
> +		ret = PTR_ERR(svm);
> +		goto out;
> +	}
> +
> +	for_each_svm_dev(sdev, svm, dev) {
> +		ret = 0;
> +		sdev->users--;
> +		if (!sdev->users) {
> +			list_del_rcu(&sdev->list);
> +			intel_pasid_tear_down_entry(iommu, dev, svm->pasid);
intel_svm_unbind_mm() calls intel_flush_svm_range_dev(svm, sdev, 0, -1, 0);
Don't we need to flush the (DEV-)IOTLBs as well?
> +			/* TODO: Drain in flight PRQ for the PASID since it
> +			 * may get reused soon, we don't want to
> +			 * confuse with its previous life.
> +			 * intel_svm_drain_prq(dev, pasid);
> +			 */
> +			kfree_rcu(sdev, rcu);
> +
> +			if (list_empty(&svm->devs)) {
> +				/*
> +				 * We do not free the IOASID here in that
> +				 * IOMMU driver did not allocate it.
s/in/as?
> +				 * Unlike native SVM, IOASID for guest use was
> +				 * allocated prior to the bind call.> +				 * In any case, if the free call comes before
> +				 * the unbind, IOMMU driver will get notified
> +				 * and perform cleanup.
> +				 */
> +				ioasid_set_data(pasid, NULL);
> +				kfree(svm);
> +			}
nit: you may use intel_svm_free_if_empty()
> +		}
> +		break;
> +	}
> +out:
> +	mutex_unlock(&pasid_mutex);
> +
nit : spare new line
> +	return ret;
> +}
> +
>  int intel_svm_bind_mm(struct device *dev, int *pasid, int flags, struct svm_dev_ops *ops)
>  {
>  	struct intel_iommu *iommu = intel_svm_device_to_iommu(dev);
> diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
> index 6da03f627ba3..a5bd53cf190c 100644
> --- a/include/linux/intel-iommu.h
> +++ b/include/linux/intel-iommu.h
> @@ -706,7 +706,9 @@ struct dmar_domain *find_domain(struct device *dev);
>  extern void intel_svm_check(struct intel_iommu *iommu);
>  extern int intel_svm_enable_prq(struct intel_iommu *iommu);
>  extern int intel_svm_finish_prq(struct intel_iommu *iommu);
> -
> +extern int intel_svm_bind_gpasid(struct iommu_domain *domain,
> +		struct device *dev, struct iommu_gpasid_bind_data *data);
> +extern int intel_svm_unbind_gpasid(struct device *dev, int pasid);
>  struct svm_dev_ops;
>  
>  struct intel_svm_dev {
> @@ -723,9 +725,13 @@ struct intel_svm_dev {
>  struct intel_svm {
>  	struct mmu_notifier notifier;
>  	struct mm_struct *mm;
> +
>  	struct intel_iommu *iommu;
>  	int flags;
>  	int pasid;
> +	int gpasid; /* Guest PASID in case of vSVA bind with non-identity host
> +		     * to guest PASID mapping.
> +		     */
>  	struct list_head devs;
>  	struct list_head list;
>  };
> diff --git a/include/linux/intel-svm.h b/include/linux/intel-svm.h
> index d7c403d0dd27..c19690937540 100644
> --- a/include/linux/intel-svm.h
> +++ b/include/linux/intel-svm.h
> @@ -44,6 +44,23 @@ struct svm_dev_ops {
>   * do such IOTLB flushes automatically.
>   */
>  #define SVM_FLAG_SUPERVISOR_MODE	(1<<1)
> +/*
> + * The SVM_FLAG_GUEST_MODE flag is used when a guest process bind to a device.
> + * In this case the mm_struct is in the guest kernel or userspace, its life
> + * cycle is managed by VMM and VFIO layer. For IOMMU driver, this API provides
> + * means to bind/unbind guest CR3 with PASIDs allocated for a device.
> + */
> +#define SVM_FLAG_GUEST_MODE	(1<<2)
> +/*
> + * The SVM_FLAG_GUEST_PASID flag is used when a guest has its own PASID space,
> + * which requires guest and host PASID translation at both directions. We keep
> + * track of guest PASID in order to provide lookup service to device drivers.
> + * One such example is a physical function (PF) driver that supports mediated
> + * device (mdev) assignment. Guest programming of mdev configuration space can
> + * only be done with guest PASID, therefore PF driver needs to find the matching
> + * host PASID to program the real hardware.
> + */
> +#define SVM_FLAG_GUEST_PASID	(1<<3)
>  
>  #ifdef CONFIG_INTEL_IOMMU_SVM
>  
> 
Thanks

Eric
Jacob Pan April 10, 2020, 7:45 p.m. UTC | #2
Hi Eric,

On Thu, 9 Apr 2020 09:41:32 +0200
Auger Eric <eric.auger@redhat.com> wrote:

> Hi Jacob,
> 
> On 4/3/20 8:42 PM, Jacob Pan wrote:
> > When supporting guest SVA with emulated IOMMU, the guest PASID
> > table is shadowed in VMM. Updates to guest vIOMMU PASID table
> > will result in PASID cache flush which will be passed down to
> > the host as bind guest PASID calls.
> > 
> > For the SL page tables, it will be harvested from device's
> > default domain (request w/o PASID), or aux domain in case of
> > mediated device.
> > 
> >     .-------------.  .---------------------------.
> >     |   vIOMMU    |  | Guest process CR3, FL only|
> >     |             |  '---------------------------'
> >     .----------------/
> >     | PASID Entry |--- PASID cache flush -
> >     '-------------'                       |
> >     |             |                       V
> >     |             |                CR3 in GPA
> >     '-------------'
> > Guest
> > ------| Shadow |--------------------------|--------
> >       v        v                          v
> > Host
> >     .-------------.  .----------------------.
> >     |   pIOMMU    |  | Bind FL for GVA-GPA  |
> >     |             |  '----------------------'
> >     .----------------/  |
> >     | PASID Entry |     V (Nested xlate)
> >     '----------------\.------------------------------.
> >     |             |   |SL for GPA-HPA, default domain|
> >     |             |   '------------------------------'
> >     '-------------'
> > Where:
> >  - FL = First level/stage one page tables
> >  - SL = Second level/stage two page tables
> > 
> > ---
> > v11: Fixed locking, avoid duplicated paging mode check, added
> > helper to free svm if device list is empty. Use rate limited error
> > message since the bind gpasid call comes from user space.
> > ---
> > 
> > Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
> > Signed-off-by: Liu, Yi L <yi.l.liu@linux.intel.com>
> > ---
> >  drivers/iommu/intel-iommu.c |   4 +
> >  drivers/iommu/intel-svm.c   | 206
> > ++++++++++++++++++++++++++++++++++++++++++++
> > include/linux/intel-iommu.h |   8 +- include/linux/intel-svm.h   |
> > 17 ++++ 4 files changed, 234 insertions(+), 1 deletion(-)
> > 
> > diff --git a/drivers/iommu/intel-iommu.c
> > b/drivers/iommu/intel-iommu.c index c0dadec5a6b3..94c7993dac6a
> > 100644 --- a/drivers/iommu/intel-iommu.c
> > +++ b/drivers/iommu/intel-iommu.c
> > @@ -6178,6 +6178,10 @@ const struct iommu_ops intel_iommu_ops = {
> >  	.dev_disable_feat	= intel_iommu_dev_disable_feat,
> >  	.is_attach_deferred	=
> > intel_iommu_is_attach_deferred, .pgsize_bitmap		=
> > INTEL_IOMMU_PGSIZES, +#ifdef CONFIG_INTEL_IOMMU_SVM
> > +	.sva_bind_gpasid	= intel_svm_bind_gpasid,
> > +	.sva_unbind_gpasid	= intel_svm_unbind_gpasid,
> > +#endif
> >  };
> >  
> >  static void quirk_iommu_igfx(struct pci_dev *dev)
> > diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c
> > index d7f2a5358900..7cf711318b87 100644
> > --- a/drivers/iommu/intel-svm.c
> > +++ b/drivers/iommu/intel-svm.c
> > @@ -226,6 +226,212 @@ static LIST_HEAD(global_svm_list);
> >  	list_for_each_entry((sdev), &(svm)->devs, list)	\
> >  		if ((d) != (sdev)->dev) {} else
> >  
> > +
> > +static inline void intel_svm_free_if_empty(struct intel_svm *svm,
> > u64 pasid) +{
> > +	if (list_empty(&svm->devs)) {
> > +		ioasid_set_data(pasid, NULL);
> > +		kfree(svm);
> > +	}
> > +}
> > +
> > +int intel_svm_bind_gpasid(struct iommu_domain *domain,
> > +			struct device *dev,
> > +			struct iommu_gpasid_bind_data *data)
> > +{
> > +	struct intel_iommu *iommu = intel_svm_device_to_iommu(dev);
> > +	struct dmar_domain *dmar_domain;
> > +	struct intel_svm_dev *sdev;
> > +	struct intel_svm *svm;
> > +	int ret = 0;
> > +
> > +	if (WARN_ON(!iommu) || !data)
> > +		return -EINVAL;
> > +
> > +	if (data->version != IOMMU_GPASID_BIND_VERSION_1 ||
> > +	    data->format != IOMMU_PASID_FORMAT_INTEL_VTD)
> > +		return -EINVAL;
> > +
> > +	if (dev_is_pci(dev)) {
> > +		/* VT-d supports devices with full 20 bit PASIDs
> > only */
> > +		if (pci_max_pasids(to_pci_dev(dev)) != PASID_MAX)
> > +			return -EINVAL;
> > +	} else {
> > +		return -ENOTSUPP;
> > +	}
> > +
> > +	/*
> > +	 * We only check host PASID range, we have no knowledge to
> > check
> > +	 * guest PASID range.
> > +	 */
> > +	if (data->hpasid <= 0 || data->hpasid >= PASID_MAX)
> > +		return -EINVAL;
> > +
> > +	dmar_domain = to_dmar_domain(domain);
> > +
> > +	mutex_lock(&pasid_mutex);
> > +	svm = ioasid_find(NULL, data->hpasid, NULL);
> > +	if (IS_ERR(svm)) {
> > +		ret = PTR_ERR(svm);
> > +		goto out;
> > +	}
> > +
> > +	if (svm) {
> > +		/*
> > +		 * If we found svm for the PASID, there must be at
> > +		 * least one device bond, otherwise svm should be
> > freed.
> > +		 */
> > +		if (WARN_ON(list_empty(&svm->devs))) {
> > +			ret = -EINVAL;
> > +			goto out;
> > +		}
> > +
> > +		for_each_svm_dev(sdev, svm, dev) {
> > +			/* In case of multiple sub-devices of the
> > same pdev
> > +			 * assigned, we should allow multiple bind
> > calls with
> > +			 * the same PASID and pdev.
> > +			 */
> > +			sdev->users++;  
> What if this is not an mdev device. Is it also allowed?
Yes. IOMMU and VT-d driver is not mdev aware. Here mdev is just an
example of normal use case. You can bind the same PCI device (PF or
SRIOV VF) more than once to the same PASID. Just need to unbind also.

Do you see any issues?

> > +			goto out;
> > +		}
> > +	} else {
> > +		/* We come here when PASID has never been bond to
> > a device. */
> > +		svm = kzalloc(sizeof(*svm), GFP_KERNEL);
> > +		if (!svm) {
> > +			ret = -ENOMEM;
> > +			goto out;
> > +		}
> > +		/* REVISIT: upper layer/VFIO can track host
> > process that bind the PASID.
> > +		 * ioasid_set = mm might be sufficient for vfio to
> > check pasid VMM
> > +		 * ownership. We can drop the following line once
> > VFIO and IOASID set
> > +		 * check is in place.
> > +		 */
> > +		svm->mm = get_task_mm(current);
> > +		svm->pasid = data->hpasid;
> > +		if (data->flags & IOMMU_SVA_GPASID_VAL) {
> > +			svm->gpasid = data->gpasid;
> > +			svm->flags |= SVM_FLAG_GUEST_PASID;
> > +		}
> > +		ioasid_set_data(data->hpasid, svm);
> > +		INIT_LIST_HEAD_RCU(&svm->devs);
> > +		mmput(svm->mm);
> > +	}
> > +	sdev = kzalloc(sizeof(*sdev), GFP_KERNEL);
> > +	if (!sdev) {
> > +		/*
> > +		 * If this is a new PASID that never bond to a
> > device, then
> > +		 * the device list must be empty which indicates
> > struct svm
> > +		 * was allocated in this function.
> > +		 */
> > +		intel_svm_free_if_empty(svm, data->hpasid);
> > +		ret = -ENOMEM;
> > +		goto out;
> > +	}
> > +	sdev->dev = dev;
> > +	sdev->users = 1;
> > +
> > +	/* Set up device context entry for PASID if not enabled
> > already */
> > +	ret = intel_iommu_enable_pasid(iommu, sdev->dev);
> > +	if (ret) {
> > +		dev_err_ratelimited(dev, "Failed to enable PASID
> > capability\n");
> > +		kfree(sdev);
> > +		intel_svm_free_if_empty(svm, data->hpasid);
> > +		goto out;
> > +	}
> > +
> > +	/*
> > +	 * PASID table is per device for better security.
> > Therefore, for
> > +	 * each bind of a new device even with an existing PASID,
> > we need to
> > +	 * call the nested mode setup function here.
> > +	 */
> > +	spin_lock(&iommu->lock);
> > +	ret = intel_pasid_setup_nested(iommu,
> > +				       dev,
> > +				       (pgd_t *)data->gpgd,
> > +				       data->hpasid,
> > +				       &data->vtd,
> > +				       dmar_domain,
> > +				       data->addr_width);
> > +	if (ret) {
> > +		dev_err_ratelimited(dev, "Failed to set up PASID
> > %llu in nested mode, Err %d\n",
> > +			data->hpasid, ret);
> > +		/*
> > +		 * PASID entry should be in cleared state if
> > nested mode
> > +		 * set up failed. So we only need to clear IOASID
> > tracking
> > +		 * data such that free call will succeed.
> > +		 */
> > +		kfree(sdev);
> > +		intel_svm_free_if_empty(svm, data->hpasid);
> > +		spin_unlock(&iommu->lock);
> > +		goto out;
> > +	}
> > +	spin_unlock(&iommu->lock);
> > +	svm->flags |= SVM_FLAG_GUEST_MODE;
> > +
> > +	init_rcu_head(&sdev->rcu);
> > +	list_add_rcu(&sdev->list, &svm->devs);
> > + out:
> > +	mutex_unlock(&pasid_mutex);
> > +	return ret;
> > +}
> > +
> > +int intel_svm_unbind_gpasid(struct device *dev, int pasid)
> > +{  
> This function's code looks very very similar to intel_svm_unbind_mm()
> besides to the mmu_notifier_unregister and the ioasid_free(). Don't we
> have means to factorize the code by checking the SVM_FLAG_GUEST_MODE
> flag?
> 
Yes, I agree there is room to consolidate. But the same time there are
two potential changes coming:
1. I also try to consolidate with Jean's generic sva_bind_device() for
native bind.
https://lkml.org/lkml/2020/2/24/1351
2. Support enqcmd and lazy PASID free.
https://lkml.org/lkml/2020/3/30/910

Perhaps we can revisit after those get resolved?

> Please could you explain again what does happen if the guest process
> dies/VM dies. How do we make sure the svm get freed. I fail to
> understand the whole picture at the moment as you cannot fully rely on
> the userspace to call unbind_gpasid.

The fault handling needs to be better documented. Here is the
current plan as I proposed in the IOASID extension set with IOASID
notifier.
https://lkml.org/lkml/2020/3/25/874
I will do a document in the next version. In short On VT-d, IOASID has
the following subscribers:
1. IOMMU driver, when free comes before unbind, since there is no way
to hook up with guest mmu_notifier directly.
2. KVM, needs to update VMCS PASID translation table. This is needed
when we support non-identity G-H PASID mapping.
3. VDCM (Virtual device composition module), similar to SRIOV PF
driver, also needs G-H PASID translation.

So when a guest goes away, based on FD close VFIO needs to free all
the PASIDs belong to the guest. The proposal is to call
ioasid_free_set() where the ioasid_set contains all the IOASIDs
for the VM.
A blocking notifier will be called, IOMMU gets notified and performs
all the clean up on IOMMU side as follows:
1.clear PASID table entry, flush PASID and IOTLB, devTLB. Drain page
requests for the PASID. The same thing you would do during unbind.
2.mark PASID defunct such that we can drop further DMA faults and PRQ

For individual guest PASID, the normal order is 
1. alloc (via virtual cmd)
2. bind_gpasid
3. unbind_gpasid
4. free (via virtual cmd)

If 4 comes before 3, free will call ioasid notifier. IOMMU gets
notified and perform cleanup as above. When unbind comes, there is no
PASID entry then exit.

One challenge is the cleanup can be lengthy and requires thread
context. SVA code needed to call ioasid_free in atomic context
(mmu_notifier release, SRCU callback). This might be changing with
Jean's patch to defer ioasid_free, this restriction may go away.
https://www.spinics.net/lists/iommu/msg43065.html

I still feel the clean up work in notifier handler is too heavy.
Two options I can think of:
1. let VFIO enforce the unbind->free order
2. introduce some reclaim phase for IOASID instead of putting the
IOASID back to the pool immediately.

Any suggestions?

> > +	struct intel_iommu *iommu = intel_svm_device_to_iommu(dev);
> > +	struct intel_svm_dev *sdev;
> > +	struct intel_svm *svm;
> > +	int ret = -EINVAL;
> > +
> > +	if (WARN_ON(!iommu))
> > +		return -EINVAL;
> > +
> > +	mutex_lock(&pasid_mutex);
> > +	svm = ioasid_find(NULL, pasid, NULL);
> > +	if (!svm) {
> > +		ret = -EINVAL;  
> As per the discussion on the VFIO series, shall we return an error in
> that case? Same below?
You mean make unbind a void function? I agree, there is really no way
to recover. Same for invalidate, which cannot fail.
Perhaps another series?

> > +		goto out;
> > +	}
> > +
> > +	if (IS_ERR(svm)) {
> > +		ret = PTR_ERR(svm);
> > +		goto out;
> > +	}
> > +
> > +	for_each_svm_dev(sdev, svm, dev) {
> > +		ret = 0;
> > +		sdev->users--;
> > +		if (!sdev->users) {
> > +			list_del_rcu(&sdev->list);
> > +			intel_pasid_tear_down_entry(iommu, dev,
> > svm->pasid);  
> intel_svm_unbind_mm() calls intel_flush_svm_range_dev(svm, sdev, 0,
> -1, 0); Don't we need to flush the (DEV-)IOTLBs as well?
> > +			/* TODO: Drain in flight PRQ for the PASID
> > since it
> > +			 * may get reused soon, we don't want to
> > +			 * confuse with its previous life.
> > +			 * intel_svm_drain_prq(dev, pasid);
> > +			 */
> > +			kfree_rcu(sdev, rcu);
> > +
> > +			if (list_empty(&svm->devs)) {
> > +				/*
> > +				 * We do not free the IOASID here
> > in that
> > +				 * IOMMU driver did not allocate
> > it.  
> s/in/as?
> > +				 * Unlike native SVM, IOASID for
> > guest use was
> > +				 * allocated prior to the bind
> > call.> +				 * In any case, if the free
> > call comes before
> > +				 * the unbind, IOMMU driver will
> > get notified
> > +				 * and perform cleanup.
> > +				 */
> > +				ioasid_set_data(pasid, NULL);
> > +				kfree(svm);
> > +			}  
> nit: you may use intel_svm_free_if_empty()
> > +		}
> > +		break;
> > +	}
> > +out:
> > +	mutex_unlock(&pasid_mutex);
> > +  
> nit : spare new line
Sounds good.

> > +	return ret;
> > +}
> > +
> >  int intel_svm_bind_mm(struct device *dev, int *pasid, int flags,
> > struct svm_dev_ops *ops) {
> >  	struct intel_iommu *iommu = intel_svm_device_to_iommu(dev);
> > diff --git a/include/linux/intel-iommu.h
> > b/include/linux/intel-iommu.h index 6da03f627ba3..a5bd53cf190c
> > 100644 --- a/include/linux/intel-iommu.h
> > +++ b/include/linux/intel-iommu.h
> > @@ -706,7 +706,9 @@ struct dmar_domain *find_domain(struct device
> > *dev); extern void intel_svm_check(struct intel_iommu *iommu);
> >  extern int intel_svm_enable_prq(struct intel_iommu *iommu);
> >  extern int intel_svm_finish_prq(struct intel_iommu *iommu);
> > -
> > +extern int intel_svm_bind_gpasid(struct iommu_domain *domain,
> > +		struct device *dev, struct iommu_gpasid_bind_data
> > *data); +extern int intel_svm_unbind_gpasid(struct device *dev, int
> > pasid); struct svm_dev_ops;
> >  
> >  struct intel_svm_dev {
> > @@ -723,9 +725,13 @@ struct intel_svm_dev {
> >  struct intel_svm {
> >  	struct mmu_notifier notifier;
> >  	struct mm_struct *mm;
> > +
> >  	struct intel_iommu *iommu;
> >  	int flags;
> >  	int pasid;
> > +	int gpasid; /* Guest PASID in case of vSVA bind with
> > non-identity host
> > +		     * to guest PASID mapping.
> > +		     */
> >  	struct list_head devs;
> >  	struct list_head list;
> >  };
> > diff --git a/include/linux/intel-svm.h b/include/linux/intel-svm.h
> > index d7c403d0dd27..c19690937540 100644
> > --- a/include/linux/intel-svm.h
> > +++ b/include/linux/intel-svm.h
> > @@ -44,6 +44,23 @@ struct svm_dev_ops {
> >   * do such IOTLB flushes automatically.
> >   */
> >  #define SVM_FLAG_SUPERVISOR_MODE	(1<<1)
> > +/*
> > + * The SVM_FLAG_GUEST_MODE flag is used when a guest process bind
> > to a device.
> > + * In this case the mm_struct is in the guest kernel or userspace,
> > its life
> > + * cycle is managed by VMM and VFIO layer. For IOMMU driver, this
> > API provides
> > + * means to bind/unbind guest CR3 with PASIDs allocated for a
> > device.
> > + */
> > +#define SVM_FLAG_GUEST_MODE	(1<<2)
> > +/*
> > + * The SVM_FLAG_GUEST_PASID flag is used when a guest has its own
> > PASID space,
> > + * which requires guest and host PASID translation at both
> > directions. We keep
> > + * track of guest PASID in order to provide lookup service to
> > device drivers.
> > + * One such example is a physical function (PF) driver that
> > supports mediated
> > + * device (mdev) assignment. Guest programming of mdev
> > configuration space can
> > + * only be done with guest PASID, therefore PF driver needs to
> > find the matching
> > + * host PASID to program the real hardware.
> > + */
> > +#define SVM_FLAG_GUEST_PASID	(1<<3)
> >  
> >  #ifdef CONFIG_INTEL_IOMMU_SVM
> >  
> >   
> Thanks
> 
> Eric
> 

[Jacob Pan]
Jacob Pan April 10, 2020, 9:06 p.m. UTC | #3
Hi Eric,

Missed a few things in the last reply.

On Thu, 9 Apr 2020 09:41:32 +0200
Auger Eric <eric.auger@redhat.com> wrote:

> > +			intel_pasid_tear_down_entry(iommu, dev,
> > svm->pasid);  
> intel_svm_unbind_mm() calls intel_flush_svm_range_dev(svm, sdev, 0,
> -1, 0); Don't we need to flush the (DEV-)IOTLBs as well?
Right, pasid tear down should always include (DEV-)IOTLB flush, I
initially thought it is taken care of by intel_pasid_tear_down_entry().

> > +			/* TODO: Drain in flight PRQ for the PASID
> > since it
> > +			 * may get reused soon, we don't want to
> > +			 * confuse with its previous life.
> > +			 * intel_svm_drain_prq(dev, pasid);
> > +			 */
> > +			kfree_rcu(sdev, rcu);
> > +
> > +			if (list_empty(&svm->devs)) {
> > +				/*
> > +				 * We do not free the IOASID here
> > in that
> > +				 * IOMMU driver did not allocate
> > it.  
> s/in/as?
I meant to say "in that" as "for the reason that"

> > +				 * Unlike native SVM, IOASID for
> > guest use was
> > +				 * allocated prior to the bind
> > call.> +				 * In any case, if the free
> > call comes before
> > +				 * the unbind, IOMMU driver will
> > get notified
> > +				 * and perform cleanup.
> > +				 */
> > +				ioasid_set_data(pasid, NULL);
> > +				kfree(svm);
> > +			}  
> nit: you may use intel_svm_free_if_empty()
True, but I meant to insert ioasid_notifier under the list_empty
condition in the following ioasid patch.


Thanks,

Jacob
Auger Eric April 16, 2020, 10:43 a.m. UTC | #4
Hi Jacob,

On 4/10/20 9:45 PM, Jacob Pan wrote:
> Hi Eric,
> 
> On Thu, 9 Apr 2020 09:41:32 +0200
> Auger Eric <eric.auger@redhat.com> wrote:
> 
>> Hi Jacob,
>>
>> On 4/3/20 8:42 PM, Jacob Pan wrote:
>>> When supporting guest SVA with emulated IOMMU, the guest PASID
>>> table is shadowed in VMM. Updates to guest vIOMMU PASID table
>>> will result in PASID cache flush which will be passed down to
>>> the host as bind guest PASID calls.
>>>
>>> For the SL page tables, it will be harvested from device's
>>> default domain (request w/o PASID), or aux domain in case of
>>> mediated device.
>>>
>>>     .-------------.  .---------------------------.
>>>     |   vIOMMU    |  | Guest process CR3, FL only|
>>>     |             |  '---------------------------'
>>>     .----------------/
>>>     | PASID Entry |--- PASID cache flush -
>>>     '-------------'                       |
>>>     |             |                       V
>>>     |             |                CR3 in GPA
>>>     '-------------'
>>> Guest
>>> ------| Shadow |--------------------------|--------
>>>       v        v                          v
>>> Host
>>>     .-------------.  .----------------------.
>>>     |   pIOMMU    |  | Bind FL for GVA-GPA  |
>>>     |             |  '----------------------'
>>>     .----------------/  |
>>>     | PASID Entry |     V (Nested xlate)
>>>     '----------------\.------------------------------.
>>>     |             |   |SL for GPA-HPA, default domain|
>>>     |             |   '------------------------------'
>>>     '-------------'
>>> Where:
>>>  - FL = First level/stage one page tables
>>>  - SL = Second level/stage two page tables
>>>
>>> ---
>>> v11: Fixed locking, avoid duplicated paging mode check, added
>>> helper to free svm if device list is empty. Use rate limited error
>>> message since the bind gpasid call comes from user space.
>>> ---
>>>
>>> Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
>>> Signed-off-by: Liu, Yi L <yi.l.liu@linux.intel.com>
>>> ---
>>>  drivers/iommu/intel-iommu.c |   4 +
>>>  drivers/iommu/intel-svm.c   | 206
>>> ++++++++++++++++++++++++++++++++++++++++++++
>>> include/linux/intel-iommu.h |   8 +- include/linux/intel-svm.h   |
>>> 17 ++++ 4 files changed, 234 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/drivers/iommu/intel-iommu.c
>>> b/drivers/iommu/intel-iommu.c index c0dadec5a6b3..94c7993dac6a
>>> 100644 --- a/drivers/iommu/intel-iommu.c
>>> +++ b/drivers/iommu/intel-iommu.c
>>> @@ -6178,6 +6178,10 @@ const struct iommu_ops intel_iommu_ops = {
>>>  	.dev_disable_feat	= intel_iommu_dev_disable_feat,
>>>  	.is_attach_deferred	=
>>> intel_iommu_is_attach_deferred, .pgsize_bitmap		=
>>> INTEL_IOMMU_PGSIZES, +#ifdef CONFIG_INTEL_IOMMU_SVM
>>> +	.sva_bind_gpasid	= intel_svm_bind_gpasid,
>>> +	.sva_unbind_gpasid	= intel_svm_unbind_gpasid,
>>> +#endif
>>>  };
>>>  
>>>  static void quirk_iommu_igfx(struct pci_dev *dev)
>>> diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c
>>> index d7f2a5358900..7cf711318b87 100644
>>> --- a/drivers/iommu/intel-svm.c
>>> +++ b/drivers/iommu/intel-svm.c
>>> @@ -226,6 +226,212 @@ static LIST_HEAD(global_svm_list);
>>>  	list_for_each_entry((sdev), &(svm)->devs, list)	\
>>>  		if ((d) != (sdev)->dev) {} else
>>>  
>>> +
>>> +static inline void intel_svm_free_if_empty(struct intel_svm *svm,
>>> u64 pasid) +{
>>> +	if (list_empty(&svm->devs)) {
>>> +		ioasid_set_data(pasid, NULL);
>>> +		kfree(svm);
>>> +	}
>>> +}
>>> +
>>> +int intel_svm_bind_gpasid(struct iommu_domain *domain,
>>> +			struct device *dev,
>>> +			struct iommu_gpasid_bind_data *data)
>>> +{
>>> +	struct intel_iommu *iommu = intel_svm_device_to_iommu(dev);
>>> +	struct dmar_domain *dmar_domain;
>>> +	struct intel_svm_dev *sdev;
>>> +	struct intel_svm *svm;
>>> +	int ret = 0;
>>> +
>>> +	if (WARN_ON(!iommu) || !data)
>>> +		return -EINVAL;
>>> +
>>> +	if (data->version != IOMMU_GPASID_BIND_VERSION_1 ||
>>> +	    data->format != IOMMU_PASID_FORMAT_INTEL_VTD)
>>> +		return -EINVAL;
>>> +
>>> +	if (dev_is_pci(dev)) {
>>> +		/* VT-d supports devices with full 20 bit PASIDs
>>> only */
>>> +		if (pci_max_pasids(to_pci_dev(dev)) != PASID_MAX)
>>> +			return -EINVAL;
>>> +	} else {
>>> +		return -ENOTSUPP;
>>> +	}
>>> +
>>> +	/*
>>> +	 * We only check host PASID range, we have no knowledge to
>>> check
>>> +	 * guest PASID range.
>>> +	 */
>>> +	if (data->hpasid <= 0 || data->hpasid >= PASID_MAX)
>>> +		return -EINVAL;
>>> +
>>> +	dmar_domain = to_dmar_domain(domain);
>>> +
>>> +	mutex_lock(&pasid_mutex);
>>> +	svm = ioasid_find(NULL, data->hpasid, NULL);
>>> +	if (IS_ERR(svm)) {
>>> +		ret = PTR_ERR(svm);
>>> +		goto out;
>>> +	}
>>> +
>>> +	if (svm) {
>>> +		/*
>>> +		 * If we found svm for the PASID, there must be at
>>> +		 * least one device bond, otherwise svm should be
>>> freed.
>>> +		 */
>>> +		if (WARN_ON(list_empty(&svm->devs))) {
>>> +			ret = -EINVAL;
>>> +			goto out;
>>> +		}
>>> +
>>> +		for_each_svm_dev(sdev, svm, dev) {
>>> +			/* In case of multiple sub-devices of the
>>> same pdev
>>> +			 * assigned, we should allow multiple bind
>>> calls with
>>> +			 * the same PASID and pdev.
>>> +			 */
>>> +			sdev->users++;  
>> What if this is not an mdev device. Is it also allowed?
> Yes. IOMMU and VT-d driver is not mdev aware. Here mdev is just an
> example of normal use case. You can bind the same PCI device (PF or
> SRIOV VF) more than once to the same PASID. Just need to unbind also.

I don't get the point of binding a non mdev device several times with
the same PASID. Do you intend to allow that at userspace level or
prevent this from happening in VFIO?

Besides, the comment is a bit misleading as it gives the impression it
is only true for mdev and there is no associated check.
> 
> Do you see any issues?
no I don't
> 
>>> +			goto out;
>>> +		}
>>> +	} else {
>>> +		/* We come here when PASID has never been bond to
>>> a device. */
>>> +		svm = kzalloc(sizeof(*svm), GFP_KERNEL);
>>> +		if (!svm) {
>>> +			ret = -ENOMEM;
>>> +			goto out;
>>> +		}
>>> +		/* REVISIT: upper layer/VFIO can track host
>>> process that bind the PASID.
>>> +		 * ioasid_set = mm might be sufficient for vfio to
>>> check pasid VMM
>>> +		 * ownership. We can drop the following line once
>>> VFIO and IOASID set
>>> +		 * check is in place.
>>> +		 */
>>> +		svm->mm = get_task_mm(current);
>>> +		svm->pasid = data->hpasid;
>>> +		if (data->flags & IOMMU_SVA_GPASID_VAL) {
>>> +			svm->gpasid = data->gpasid;
>>> +			svm->flags |= SVM_FLAG_GUEST_PASID;
>>> +		}
>>> +		ioasid_set_data(data->hpasid, svm);
>>> +		INIT_LIST_HEAD_RCU(&svm->devs);
>>> +		mmput(svm->mm);
>>> +	}
>>> +	sdev = kzalloc(sizeof(*sdev), GFP_KERNEL);
>>> +	if (!sdev) {
>>> +		/*
>>> +		 * If this is a new PASID that never bond to a
>>> device, then
>>> +		 * the device list must be empty which indicates
>>> struct svm
>>> +		 * was allocated in this function.
>>> +		 */
>>> +		intel_svm_free_if_empty(svm, data->hpasid);
>>> +		ret = -ENOMEM;
>>> +		goto out;
>>> +	}
>>> +	sdev->dev = dev;
>>> +	sdev->users = 1;
>>> +
>>> +	/* Set up device context entry for PASID if not enabled
>>> already */
>>> +	ret = intel_iommu_enable_pasid(iommu, sdev->dev);
>>> +	if (ret) {
>>> +		dev_err_ratelimited(dev, "Failed to enable PASID
>>> capability\n");
>>> +		kfree(sdev);
>>> +		intel_svm_free_if_empty(svm, data->hpasid);
>>> +		goto out;
>>> +	}
>>> +
>>> +	/*
>>> +	 * PASID table is per device for better security.
>>> Therefore, for
>>> +	 * each bind of a new device even with an existing PASID,
>>> we need to
>>> +	 * call the nested mode setup function here.
>>> +	 */
>>> +	spin_lock(&iommu->lock);
>>> +	ret = intel_pasid_setup_nested(iommu,
>>> +				       dev,
>>> +				       (pgd_t *)data->gpgd,
>>> +				       data->hpasid,
>>> +				       &data->vtd,
>>> +				       dmar_domain,
>>> +				       data->addr_width);
>>> +	if (ret) {
>>> +		dev_err_ratelimited(dev, "Failed to set up PASID
>>> %llu in nested mode, Err %d\n",
>>> +			data->hpasid, ret);
>>> +		/*
>>> +		 * PASID entry should be in cleared state if
>>> nested mode
>>> +		 * set up failed. So we only need to clear IOASID
>>> tracking
>>> +		 * data such that free call will succeed.
>>> +		 */
>>> +		kfree(sdev);
>>> +		intel_svm_free_if_empty(svm, data->hpasid);
>>> +		spin_unlock(&iommu->lock);
>>> +		goto out;
>>> +	}
>>> +	spin_unlock(&iommu->lock);
>>> +	svm->flags |= SVM_FLAG_GUEST_MODE;
>>> +
>>> +	init_rcu_head(&sdev->rcu);
>>> +	list_add_rcu(&sdev->list, &svm->devs);
>>> + out:
>>> +	mutex_unlock(&pasid_mutex);
>>> +	return ret;
>>> +}
>>> +
>>> +int intel_svm_unbind_gpasid(struct device *dev, int pasid)
>>> +{  
>> This function's code looks very very similar to intel_svm_unbind_mm()
>> besides to the mmu_notifier_unregister and the ioasid_free(). Don't we
>> have means to factorize the code by checking the SVM_FLAG_GUEST_MODE
>> flag?
>>
> Yes, I agree there is room to consolidate. But the same time there are
> two potential changes coming:
> 1. I also try to consolidate with Jean's generic sva_bind_device() for
> native bind.
> https://lkml.org/lkml/2020/2/24/1351
> 2. Support enqcmd and lazy PASID free.
> https://lkml.org/lkml/2020/3/30/910
> 
> Perhaps we can revisit after those get resolved?
yep
> 
>> Please could you explain again what does happen if the guest process
>> dies/VM dies. How do we make sure the svm get freed. I fail to
>> understand the whole picture at the moment as you cannot fully rely on
>> the userspace to call unbind_gpasid.
> 
> The fault handling needs to be better documented. Here is the
> current plan as I proposed in the IOASID extension set with IOASID
> notifier.
> https://lkml.org/lkml/2020/3/25/874
> I will do a document in the next version. In short On VT-d, IOASID has
> the following subscribers:
> 1. IOMMU driver, when free comes before unbind, since there is no way
> to hook up with guest mmu_notifier directly.
> 2. KVM, needs to update VMCS PASID translation table. This is needed
> when we support non-identity G-H PASID mapping.
> 3. VDCM (Virtual device composition module), similar to SRIOV PF
> driver, also needs G-H PASID translation.
> 
> So when a guest goes away, based on FD close VFIO needs to free all
> the PASIDs belong to the guest. The proposal is to call
> ioasid_free_set() where the ioasid_set contains all the IOASIDs
> for the VM.
> A blocking notifier will be called, IOMMU gets notified and performs
> all the clean up on IOMMU side as follows:
> 1.clear PASID table entry, flush PASID and IOTLB, devTLB. Drain page
> requests for the PASID. The same thing you would do during unbind.
> 2.mark PASID defunct such that we can drop further DMA faults and PRQ
> 
> For individual guest PASID, the normal order is 
> 1. alloc (via virtual cmd)
> 2. bind_gpasid
> 3. unbind_gpasid
> 4. free (via virtual cmd)
> 
> If 4 comes before 3, free will call ioasid notifier. IOMMU gets
> notified and perform cleanup as above. When unbind comes, there is no
> PASID entry then exit.
[*] could you detail when the free comes before unbind and the opposite?
> 
> One challenge is the cleanup can be lengthy and requires thread
> context. SVA code needed to call ioasid_free in atomic context
> (mmu_notifier release, SRCU callback). This might be changing with
> Jean's patch to defer ioasid_free, this restriction may go away.
> https://www.spinics.net/lists/iommu/msg43065.html
> 
> I still feel the clean up work in notifier handler is too heavy.
> Two options I can think of:
> 1. let VFIO enforce the unbind->free order
depends on [*]
> 2. introduce some reclaim phase for IOASID instead of putting the
> IOASID back to the pool immediately.
what do you mean by reclain phase?

Thanks

Eric
> 
> Any suggestions?

> 
>>> +	struct intel_iommu *iommu = intel_svm_device_to_iommu(dev);
>>> +	struct intel_svm_dev *sdev;
>>> +	struct intel_svm *svm;
>>> +	int ret = -EINVAL;
>>> +
>>> +	if (WARN_ON(!iommu))
>>> +		return -EINVAL;
>>> +
>>> +	mutex_lock(&pasid_mutex);
>>> +	svm = ioasid_find(NULL, pasid, NULL);
>>> +	if (!svm) {
>>> +		ret = -EINVAL;  
>> As per the discussion on the VFIO series, shall we return an error in
>> that case? Same below?
> You mean make unbind a void function? I agree, there is really no way
> to recover. Same for invalidate, which cannot fail.
> Perhaps another series?
> 
>>> +		goto out;
>>> +	}
>>> +
>>> +	if (IS_ERR(svm)) {
>>> +		ret = PTR_ERR(svm);
>>> +		goto out;
>>> +	}
>>> +
>>> +	for_each_svm_dev(sdev, svm, dev) {
>>> +		ret = 0;
>>> +		sdev->users--;
>>> +		if (!sdev->users) {
>>> +			list_del_rcu(&sdev->list);
>>> +			intel_pasid_tear_down_entry(iommu, dev,
>>> svm->pasid);  
>> intel_svm_unbind_mm() calls intel_flush_svm_range_dev(svm, sdev, 0,
>> -1, 0); Don't we need to flush the (DEV-)IOTLBs as well?
>>> +			/* TODO: Drain in flight PRQ for the PASID
>>> since it
>>> +			 * may get reused soon, we don't want to
>>> +			 * confuse with its previous life.
>>> +			 * intel_svm_drain_prq(dev, pasid);
>>> +			 */
>>> +			kfree_rcu(sdev, rcu);
>>> +
>>> +			if (list_empty(&svm->devs)) {
>>> +				/*
>>> +				 * We do not free the IOASID here
>>> in that
>>> +				 * IOMMU driver did not allocate
>>> it.  
>> s/in/as?
>>> +				 * Unlike native SVM, IOASID for
>>> guest use was
>>> +				 * allocated prior to the bind
>>> call.> +				 * In any case, if the free
>>> call comes before
>>> +				 * the unbind, IOMMU driver will
>>> get notified
>>> +				 * and perform cleanup.
>>> +				 */
>>> +				ioasid_set_data(pasid, NULL);
>>> +				kfree(svm);
>>> +			}  
>> nit: you may use intel_svm_free_if_empty()
>>> +		}
>>> +		break;
>>> +	}
>>> +out:
>>> +	mutex_unlock(&pasid_mutex);
>>> +  
>> nit : spare new line
> Sounds good.
> 
>>> +	return ret;
>>> +}
>>> +
>>>  int intel_svm_bind_mm(struct device *dev, int *pasid, int flags,
>>> struct svm_dev_ops *ops) {
>>>  	struct intel_iommu *iommu = intel_svm_device_to_iommu(dev);
>>> diff --git a/include/linux/intel-iommu.h
>>> b/include/linux/intel-iommu.h index 6da03f627ba3..a5bd53cf190c
>>> 100644 --- a/include/linux/intel-iommu.h
>>> +++ b/include/linux/intel-iommu.h
>>> @@ -706,7 +706,9 @@ struct dmar_domain *find_domain(struct device
>>> *dev); extern void intel_svm_check(struct intel_iommu *iommu);
>>>  extern int intel_svm_enable_prq(struct intel_iommu *iommu);
>>>  extern int intel_svm_finish_prq(struct intel_iommu *iommu);
>>> -
>>> +extern int intel_svm_bind_gpasid(struct iommu_domain *domain,
>>> +		struct device *dev, struct iommu_gpasid_bind_data
>>> *data); +extern int intel_svm_unbind_gpasid(struct device *dev, int
>>> pasid); struct svm_dev_ops;
>>>  
>>>  struct intel_svm_dev {
>>> @@ -723,9 +725,13 @@ struct intel_svm_dev {
>>>  struct intel_svm {
>>>  	struct mmu_notifier notifier;
>>>  	struct mm_struct *mm;
>>> +
>>>  	struct intel_iommu *iommu;
>>>  	int flags;
>>>  	int pasid;
>>> +	int gpasid; /* Guest PASID in case of vSVA bind with
>>> non-identity host
>>> +		     * to guest PASID mapping.
>>> +		     */
>>>  	struct list_head devs;
>>>  	struct list_head list;
>>>  };
>>> diff --git a/include/linux/intel-svm.h b/include/linux/intel-svm.h
>>> index d7c403d0dd27..c19690937540 100644
>>> --- a/include/linux/intel-svm.h
>>> +++ b/include/linux/intel-svm.h
>>> @@ -44,6 +44,23 @@ struct svm_dev_ops {
>>>   * do such IOTLB flushes automatically.
>>>   */
>>>  #define SVM_FLAG_SUPERVISOR_MODE	(1<<1)
>>> +/*
>>> + * The SVM_FLAG_GUEST_MODE flag is used when a guest process bind
>>> to a device.
>>> + * In this case the mm_struct is in the guest kernel or userspace,
>>> its life
>>> + * cycle is managed by VMM and VFIO layer. For IOMMU driver, this
>>> API provides
>>> + * means to bind/unbind guest CR3 with PASIDs allocated for a
>>> device.
>>> + */
>>> +#define SVM_FLAG_GUEST_MODE	(1<<2)
>>> +/*
>>> + * The SVM_FLAG_GUEST_PASID flag is used when a guest has its own
>>> PASID space,
>>> + * which requires guest and host PASID translation at both
>>> directions. We keep
>>> + * track of guest PASID in order to provide lookup service to
>>> device drivers.
>>> + * One such example is a physical function (PF) driver that
>>> supports mediated
>>> + * device (mdev) assignment. Guest programming of mdev
>>> configuration space can
>>> + * only be done with guest PASID, therefore PF driver needs to
>>> find the matching
>>> + * host PASID to program the real hardware.
>>> + */
>>> +#define SVM_FLAG_GUEST_PASID	(1<<3)
>>>  
>>>  #ifdef CONFIG_INTEL_IOMMU_SVM
>>>  
>>>   
>> Thanks
>>
>> Eric
>>
> 
> [Jacob Pan]
>
Auger Eric April 16, 2020, 10:46 a.m. UTC | #5
Hi Jacob,

On 4/10/20 11:06 PM, Jacob Pan wrote:
> Hi Eric,
> 
> Missed a few things in the last reply.
> 
> On Thu, 9 Apr 2020 09:41:32 +0200
> Auger Eric <eric.auger@redhat.com> wrote:
> 
>>> +			intel_pasid_tear_down_entry(iommu, dev,
>>> svm->pasid);  
>> intel_svm_unbind_mm() calls intel_flush_svm_range_dev(svm, sdev, 0,
>> -1, 0); Don't we need to flush the (DEV-)IOTLBs as well?
> Right, pasid tear down should always include (DEV-)IOTLB flush, I
> initially thought it is taken care of by intel_pasid_tear_down_entry().
> 
>>> +			/* TODO: Drain in flight PRQ for the PASID
>>> since it
>>> +			 * may get reused soon, we don't want to
>>> +			 * confuse with its previous life.
>>> +			 * intel_svm_drain_prq(dev, pasid);
>>> +			 */
>>> +			kfree_rcu(sdev, rcu);
>>> +
>>> +			if (list_empty(&svm->devs)) {
>>> +				/*
>>> +				 * We do not free the IOASID here
>>> in that
>>> +				 * IOMMU driver did not allocate
>>> it.  
>> s/in/as?
> I meant to say "in that" as "for the reason that"
ok sorry
> 
>>> +				 * Unlike native SVM, IOASID for
>>> guest use was
>>> +				 * allocated prior to the bind
>>> call.> +				 * In any case, if the free
>>> call comes before
>>> +				 * the unbind, IOMMU driver will
>>> get notified
>>> +				 * and perform cleanup.
>>> +				 */
>>> +				ioasid_set_data(pasid, NULL);
>>> +				kfree(svm);
>>> +			}  
>> nit: you may use intel_svm_free_if_empty()
> True, but I meant to insert ioasid_notifier under the list_empty
> condition in the following ioasid patch.
ok

Thanks

Eric
> 
> 
> Thanks,
> 
> Jacob
>
Tian, Kevin April 17, 2020, 2:45 a.m. UTC | #6
> From: Auger Eric
> Sent: Thursday, April 16, 2020 6:43 PM
> 
[...]
> >>> +	if (svm) {
> >>> +		/*
> >>> +		 * If we found svm for the PASID, there must be at
> >>> +		 * least one device bond, otherwise svm should be
> >>> freed.
> >>> +		 */
> >>> +		if (WARN_ON(list_empty(&svm->devs))) {
> >>> +			ret = -EINVAL;
> >>> +			goto out;
> >>> +		}
> >>> +
> >>> +		for_each_svm_dev(sdev, svm, dev) {
> >>> +			/* In case of multiple sub-devices of the
> >>> same pdev
> >>> +			 * assigned, we should allow multiple bind
> >>> calls with
> >>> +			 * the same PASID and pdev.
> >>> +			 */
> >>> +			sdev->users++;
> >> What if this is not an mdev device. Is it also allowed?
> > Yes. IOMMU and VT-d driver is not mdev aware. Here mdev is just an
> > example of normal use case. You can bind the same PCI device (PF or
> > SRIOV VF) more than once to the same PASID. Just need to unbind also.
> 
> I don't get the point of binding a non mdev device several times with
> the same PASID. Do you intend to allow that at userspace level or
> prevent this from happening in VFIO?

I feel it's better to prevent this from happening, otherwise VFIO also
needs to track the bind count and do multiple unbinds at mm_exit.
But it's not necessary to prevent it in VFIO. We can check here
upon whether aux_domain is valid, and if not return -EBUSY.

> 
> Besides, the comment is a bit misleading as it gives the impression it
> is only true for mdev and there is no associated check.

Thanks
Kevin
Auger Eric April 17, 2020, 7:46 a.m. UTC | #7
Hi Kevin,
On 4/17/20 4:45 AM, Tian, Kevin wrote:
>> From: Auger Eric
>> Sent: Thursday, April 16, 2020 6:43 PM
>>
> [...]
>>>>> +	if (svm) {
>>>>> +		/*
>>>>> +		 * If we found svm for the PASID, there must be at
>>>>> +		 * least one device bond, otherwise svm should be
>>>>> freed.
>>>>> +		 */
>>>>> +		if (WARN_ON(list_empty(&svm->devs))) {
>>>>> +			ret = -EINVAL;
>>>>> +			goto out;
>>>>> +		}
>>>>> +
>>>>> +		for_each_svm_dev(sdev, svm, dev) {
>>>>> +			/* In case of multiple sub-devices of the
>>>>> same pdev
>>>>> +			 * assigned, we should allow multiple bind
>>>>> calls with
>>>>> +			 * the same PASID and pdev.
>>>>> +			 */
>>>>> +			sdev->users++;
>>>> What if this is not an mdev device. Is it also allowed?
>>> Yes. IOMMU and VT-d driver is not mdev aware. Here mdev is just an
>>> example of normal use case. You can bind the same PCI device (PF or
>>> SRIOV VF) more than once to the same PASID. Just need to unbind also.
>>
>> I don't get the point of binding a non mdev device several times with
>> the same PASID. Do you intend to allow that at userspace level or
>> prevent this from happening in VFIO?
> 
> I feel it's better to prevent this from happening, otherwise VFIO also
> needs to track the bind count and do multiple unbinds at mm_exit.
> But it's not necessary to prevent it in VFIO. We can check here
> upon whether aux_domain is valid, and if not return -EBUSY.
Ah OK. So if we can detect the case here it is even better

Thanks

Eric
> 
>>
>> Besides, the comment is a bit misleading as it gives the impression it
>> is only true for mdev and there is no associated check.
> 
> Thanks
> Kevin
>
Jacob Pan April 17, 2020, 3:28 p.m. UTC | #8
On Fri, 17 Apr 2020 09:46:55 +0200
Auger Eric <eric.auger@redhat.com> wrote:

> Hi Kevin,
> On 4/17/20 4:45 AM, Tian, Kevin wrote:
> >> From: Auger Eric
> >> Sent: Thursday, April 16, 2020 6:43 PM
> >>  
> > [...]  
> >>>>> +	if (svm) {
> >>>>> +		/*
> >>>>> +		 * If we found svm for the PASID, there must
> >>>>> be at
> >>>>> +		 * least one device bond, otherwise svm should
> >>>>> be freed.
> >>>>> +		 */
> >>>>> +		if (WARN_ON(list_empty(&svm->devs))) {
> >>>>> +			ret = -EINVAL;
> >>>>> +			goto out;
> >>>>> +		}
> >>>>> +
> >>>>> +		for_each_svm_dev(sdev, svm, dev) {
> >>>>> +			/* In case of multiple sub-devices of
> >>>>> the same pdev
> >>>>> +			 * assigned, we should allow multiple
> >>>>> bind calls with
> >>>>> +			 * the same PASID and pdev.
> >>>>> +			 */
> >>>>> +			sdev->users++;  
> >>>> What if this is not an mdev device. Is it also allowed?  
> >>> Yes. IOMMU and VT-d driver is not mdev aware. Here mdev is just an
> >>> example of normal use case. You can bind the same PCI device (PF
> >>> or SRIOV VF) more than once to the same PASID. Just need to
> >>> unbind also.  
> >>
> >> I don't get the point of binding a non mdev device several times
> >> with the same PASID. Do you intend to allow that at userspace
> >> level or prevent this from happening in VFIO?  
> > 
> > I feel it's better to prevent this from happening, otherwise VFIO
> > also needs to track the bind count and do multiple unbinds at
> > mm_exit. But it's not necessary to prevent it in VFIO. We can check
> > here upon whether aux_domain is valid, and if not return -EBUSY.  
> Ah OK. So if we can detect the case here it is even better
> 
I don't understand why VFIO cannot track, since it is mdev aware. if we
don;t refcount the users, one mdev unbind will result unbind for all
mdev under the same pdev. That may not be the right thing to do.

> Thanks
> 
> Eric
> >   
> >>
> >> Besides, the comment is a bit misleading as it gives the
> >> impression it is only true for mdev and there is no associated
> >> check.  
> > 
> > Thanks
> > Kevin
> >   
> 

[Jacob Pan]
Tian, Kevin April 17, 2020, 11:46 p.m. UTC | #9
> From: Jacob Pan <jacob.jun.pan@linux.intel.com>
> Sent: Friday, April 17, 2020 11:29 PM
> 
> On Fri, 17 Apr 2020 09:46:55 +0200
> Auger Eric <eric.auger@redhat.com> wrote:
> 
> > Hi Kevin,
> > On 4/17/20 4:45 AM, Tian, Kevin wrote:
> > >> From: Auger Eric
> > >> Sent: Thursday, April 16, 2020 6:43 PM
> > >>
> > > [...]
> > >>>>> +	if (svm) {
> > >>>>> +		/*
> > >>>>> +		 * If we found svm for the PASID, there must
> > >>>>> be at
> > >>>>> +		 * least one device bond, otherwise svm should
> > >>>>> be freed.
> > >>>>> +		 */
> > >>>>> +		if (WARN_ON(list_empty(&svm->devs))) {
> > >>>>> +			ret = -EINVAL;
> > >>>>> +			goto out;
> > >>>>> +		}
> > >>>>> +
> > >>>>> +		for_each_svm_dev(sdev, svm, dev) {
> > >>>>> +			/* In case of multiple sub-devices of
> > >>>>> the same pdev
> > >>>>> +			 * assigned, we should allow multiple
> > >>>>> bind calls with
> > >>>>> +			 * the same PASID and pdev.
> > >>>>> +			 */
> > >>>>> +			sdev->users++;
> > >>>> What if this is not an mdev device. Is it also allowed?
> > >>> Yes. IOMMU and VT-d driver is not mdev aware. Here mdev is just an
> > >>> example of normal use case. You can bind the same PCI device (PF
> > >>> or SRIOV VF) more than once to the same PASID. Just need to
> > >>> unbind also.
> > >>
> > >> I don't get the point of binding a non mdev device several times
> > >> with the same PASID. Do you intend to allow that at userspace
> > >> level or prevent this from happening in VFIO?
> > >
> > > I feel it's better to prevent this from happening, otherwise VFIO
> > > also needs to track the bind count and do multiple unbinds at
> > > mm_exit. But it's not necessary to prevent it in VFIO. We can check
> > > here upon whether aux_domain is valid, and if not return -EBUSY.
> > Ah OK. So if we can detect the case here it is even better
> >
> I don't understand why VFIO cannot track, since it is mdev aware. if we
> don;t refcount the users, one mdev unbind will result unbind for all
> mdev under the same pdev. That may not be the right thing to do.
> 

The open here is not for mdev, which refcount is still required. Eric's
point is for non-mdev endpoints. It's meaningless and not intuitive 
to allow binding a PASID multiple-times to the same device. 

Thanks
Kevin
Jacob Pan April 27, 2020, 5:28 p.m. UTC | #10
On Fri, 17 Apr 2020 23:46:13 +0000
"Tian, Kevin" <kevin.tian@intel.com> wrote:

> > From: Jacob Pan <jacob.jun.pan@linux.intel.com>
> > Sent: Friday, April 17, 2020 11:29 PM
> > 
> > On Fri, 17 Apr 2020 09:46:55 +0200
> > Auger Eric <eric.auger@redhat.com> wrote:
> >   
> > > Hi Kevin,
> > > On 4/17/20 4:45 AM, Tian, Kevin wrote:  
> > > >> From: Auger Eric
> > > >> Sent: Thursday, April 16, 2020 6:43 PM
> > > >>  
> > > > [...]  
> > > >>>>> +	if (svm) {
> > > >>>>> +		/*
> > > >>>>> +		 * If we found svm for the PASID, there
> > > >>>>> must be at
> > > >>>>> +		 * least one device bond, otherwise svm
> > > >>>>> should be freed.
> > > >>>>> +		 */
> > > >>>>> +		if (WARN_ON(list_empty(&svm->devs))) {
> > > >>>>> +			ret = -EINVAL;
> > > >>>>> +			goto out;
> > > >>>>> +		}
> > > >>>>> +
> > > >>>>> +		for_each_svm_dev(sdev, svm, dev) {
> > > >>>>> +			/* In case of multiple sub-devices
> > > >>>>> of the same pdev
> > > >>>>> +			 * assigned, we should allow
> > > >>>>> multiple bind calls with
> > > >>>>> +			 * the same PASID and pdev.
> > > >>>>> +			 */
> > > >>>>> +			sdev->users++;  
> > > >>>> What if this is not an mdev device. Is it also allowed?  
> > > >>> Yes. IOMMU and VT-d driver is not mdev aware. Here mdev is
> > > >>> just an example of normal use case. You can bind the same PCI
> > > >>> device (PF or SRIOV VF) more than once to the same PASID.
> > > >>> Just need to unbind also.  
> > > >>
> > > >> I don't get the point of binding a non mdev device several
> > > >> times with the same PASID. Do you intend to allow that at
> > > >> userspace level or prevent this from happening in VFIO?  
> > > >
> > > > I feel it's better to prevent this from happening, otherwise
> > > > VFIO also needs to track the bind count and do multiple unbinds
> > > > at mm_exit. But it's not necessary to prevent it in VFIO. We
> > > > can check here upon whether aux_domain is valid, and if not
> > > > return -EBUSY.  
> > > Ah OK. So if we can detect the case here it is even better
> > >  
> > I don't understand why VFIO cannot track, since it is mdev aware.
> > if we don;t refcount the users, one mdev unbind will result unbind
> > for all mdev under the same pdev. That may not be the right thing
> > to do. 
> 
> The open here is not for mdev, which refcount is still required.
> Eric's point is for non-mdev endpoints. It's meaningless and not
> intuitive to allow binding a PASID multiple-times to the same device. 
> 
That seems contradictory. The refcount here is intended/required for sub
devices such as mdev. Since IOMMU driver is not mdev aware, we cannot
treat devices differently.

Perhaps Yi can clarify if this case is handled within VFIO, then I can
drop the refcount here.

> Thanks
> Kevin

[Jacob Pan]

Patch
diff mbox series

diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index c0dadec5a6b3..94c7993dac6a 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -6178,6 +6178,10 @@  const struct iommu_ops intel_iommu_ops = {
 	.dev_disable_feat	= intel_iommu_dev_disable_feat,
 	.is_attach_deferred	= intel_iommu_is_attach_deferred,
 	.pgsize_bitmap		= INTEL_IOMMU_PGSIZES,
+#ifdef CONFIG_INTEL_IOMMU_SVM
+	.sva_bind_gpasid	= intel_svm_bind_gpasid,
+	.sva_unbind_gpasid	= intel_svm_unbind_gpasid,
+#endif
 };
 
 static void quirk_iommu_igfx(struct pci_dev *dev)
diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c
index d7f2a5358900..7cf711318b87 100644
--- a/drivers/iommu/intel-svm.c
+++ b/drivers/iommu/intel-svm.c
@@ -226,6 +226,212 @@  static LIST_HEAD(global_svm_list);
 	list_for_each_entry((sdev), &(svm)->devs, list)	\
 		if ((d) != (sdev)->dev) {} else
 
+
+static inline void intel_svm_free_if_empty(struct intel_svm *svm, u64 pasid)
+{
+	if (list_empty(&svm->devs)) {
+		ioasid_set_data(pasid, NULL);
+		kfree(svm);
+	}
+}
+
+int intel_svm_bind_gpasid(struct iommu_domain *domain,
+			struct device *dev,
+			struct iommu_gpasid_bind_data *data)
+{
+	struct intel_iommu *iommu = intel_svm_device_to_iommu(dev);
+	struct dmar_domain *dmar_domain;
+	struct intel_svm_dev *sdev;
+	struct intel_svm *svm;
+	int ret = 0;
+
+	if (WARN_ON(!iommu) || !data)
+		return -EINVAL;
+
+	if (data->version != IOMMU_GPASID_BIND_VERSION_1 ||
+	    data->format != IOMMU_PASID_FORMAT_INTEL_VTD)
+		return -EINVAL;
+
+	if (dev_is_pci(dev)) {
+		/* VT-d supports devices with full 20 bit PASIDs only */
+		if (pci_max_pasids(to_pci_dev(dev)) != PASID_MAX)
+			return -EINVAL;
+	} else {
+		return -ENOTSUPP;
+	}
+
+	/*
+	 * We only check host PASID range, we have no knowledge to check
+	 * guest PASID range.
+	 */
+	if (data->hpasid <= 0 || data->hpasid >= PASID_MAX)
+		return -EINVAL;
+
+	dmar_domain = to_dmar_domain(domain);
+
+	mutex_lock(&pasid_mutex);
+	svm = ioasid_find(NULL, data->hpasid, NULL);
+	if (IS_ERR(svm)) {
+		ret = PTR_ERR(svm);
+		goto out;
+	}
+
+	if (svm) {
+		/*
+		 * If we found svm for the PASID, there must be at
+		 * least one device bond, otherwise svm should be freed.
+		 */
+		if (WARN_ON(list_empty(&svm->devs))) {
+			ret = -EINVAL;
+			goto out;
+		}
+
+		for_each_svm_dev(sdev, svm, dev) {
+			/* In case of multiple sub-devices of the same pdev
+			 * assigned, we should allow multiple bind calls with
+			 * the same PASID and pdev.
+			 */
+			sdev->users++;
+			goto out;
+		}
+	} else {
+		/* We come here when PASID has never been bond to a device. */
+		svm = kzalloc(sizeof(*svm), GFP_KERNEL);
+		if (!svm) {
+			ret = -ENOMEM;
+			goto out;
+		}
+		/* REVISIT: upper layer/VFIO can track host process that bind the PASID.
+		 * ioasid_set = mm might be sufficient for vfio to check pasid VMM
+		 * ownership. We can drop the following line once VFIO and IOASID set
+		 * check is in place.
+		 */
+		svm->mm = get_task_mm(current);
+		svm->pasid = data->hpasid;
+		if (data->flags & IOMMU_SVA_GPASID_VAL) {
+			svm->gpasid = data->gpasid;
+			svm->flags |= SVM_FLAG_GUEST_PASID;
+		}
+		ioasid_set_data(data->hpasid, svm);
+		INIT_LIST_HEAD_RCU(&svm->devs);
+		mmput(svm->mm);
+	}
+	sdev = kzalloc(sizeof(*sdev), GFP_KERNEL);
+	if (!sdev) {
+		/*
+		 * If this is a new PASID that never bond to a device, then
+		 * the device list must be empty which indicates struct svm
+		 * was allocated in this function.
+		 */
+		intel_svm_free_if_empty(svm, data->hpasid);
+		ret = -ENOMEM;
+		goto out;
+	}
+	sdev->dev = dev;
+	sdev->users = 1;
+
+	/* Set up device context entry for PASID if not enabled already */
+	ret = intel_iommu_enable_pasid(iommu, sdev->dev);
+	if (ret) {
+		dev_err_ratelimited(dev, "Failed to enable PASID capability\n");
+		kfree(sdev);
+		intel_svm_free_if_empty(svm, data->hpasid);
+		goto out;
+	}
+
+	/*
+	 * PASID table is per device for better security. Therefore, for
+	 * each bind of a new device even with an existing PASID, we need to
+	 * call the nested mode setup function here.
+	 */
+	spin_lock(&iommu->lock);
+	ret = intel_pasid_setup_nested(iommu,
+				       dev,
+				       (pgd_t *)data->gpgd,
+				       data->hpasid,
+				       &data->vtd,
+				       dmar_domain,
+				       data->addr_width);
+	if (ret) {
+		dev_err_ratelimited(dev, "Failed to set up PASID %llu in nested mode, Err %d\n",
+			data->hpasid, ret);
+		/*
+		 * PASID entry should be in cleared state if nested mode
+		 * set up failed. So we only need to clear IOASID tracking
+		 * data such that free call will succeed.
+		 */
+		kfree(sdev);
+		intel_svm_free_if_empty(svm, data->hpasid);
+		spin_unlock(&iommu->lock);
+		goto out;
+	}
+	spin_unlock(&iommu->lock);
+	svm->flags |= SVM_FLAG_GUEST_MODE;
+
+	init_rcu_head(&sdev->rcu);
+	list_add_rcu(&sdev->list, &svm->devs);
+ out:
+	mutex_unlock(&pasid_mutex);
+	return ret;
+}
+
+int intel_svm_unbind_gpasid(struct device *dev, int pasid)
+{
+	struct intel_iommu *iommu = intel_svm_device_to_iommu(dev);
+	struct intel_svm_dev *sdev;
+	struct intel_svm *svm;
+	int ret = -EINVAL;
+
+	if (WARN_ON(!iommu))
+		return -EINVAL;
+
+	mutex_lock(&pasid_mutex);
+	svm = ioasid_find(NULL, pasid, NULL);
+	if (!svm) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (IS_ERR(svm)) {
+		ret = PTR_ERR(svm);
+		goto out;
+	}
+
+	for_each_svm_dev(sdev, svm, dev) {
+		ret = 0;
+		sdev->users--;
+		if (!sdev->users) {
+			list_del_rcu(&sdev->list);
+			intel_pasid_tear_down_entry(iommu, dev, svm->pasid);
+			/* TODO: Drain in flight PRQ for the PASID since it
+			 * may get reused soon, we don't want to
+			 * confuse with its previous life.
+			 * intel_svm_drain_prq(dev, pasid);
+			 */
+			kfree_rcu(sdev, rcu);
+
+			if (list_empty(&svm->devs)) {
+				/*
+				 * We do not free the IOASID here in that
+				 * IOMMU driver did not allocate it.
+				 * Unlike native SVM, IOASID for guest use was
+				 * allocated prior to the bind call.
+				 * In any case, if the free call comes before
+				 * the unbind, IOMMU driver will get notified
+				 * and perform cleanup.
+				 */
+				ioasid_set_data(pasid, NULL);
+				kfree(svm);
+			}
+		}
+		break;
+	}
+out:
+	mutex_unlock(&pasid_mutex);
+
+	return ret;
+}
+
 int intel_svm_bind_mm(struct device *dev, int *pasid, int flags, struct svm_dev_ops *ops)
 {
 	struct intel_iommu *iommu = intel_svm_device_to_iommu(dev);
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index 6da03f627ba3..a5bd53cf190c 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -706,7 +706,9 @@  struct dmar_domain *find_domain(struct device *dev);
 extern void intel_svm_check(struct intel_iommu *iommu);
 extern int intel_svm_enable_prq(struct intel_iommu *iommu);
 extern int intel_svm_finish_prq(struct intel_iommu *iommu);
-
+extern int intel_svm_bind_gpasid(struct iommu_domain *domain,
+		struct device *dev, struct iommu_gpasid_bind_data *data);
+extern int intel_svm_unbind_gpasid(struct device *dev, int pasid);
 struct svm_dev_ops;
 
 struct intel_svm_dev {
@@ -723,9 +725,13 @@  struct intel_svm_dev {
 struct intel_svm {
 	struct mmu_notifier notifier;
 	struct mm_struct *mm;
+
 	struct intel_iommu *iommu;
 	int flags;
 	int pasid;
+	int gpasid; /* Guest PASID in case of vSVA bind with non-identity host
+		     * to guest PASID mapping.
+		     */
 	struct list_head devs;
 	struct list_head list;
 };
diff --git a/include/linux/intel-svm.h b/include/linux/intel-svm.h
index d7c403d0dd27..c19690937540 100644
--- a/include/linux/intel-svm.h
+++ b/include/linux/intel-svm.h
@@ -44,6 +44,23 @@  struct svm_dev_ops {
  * do such IOTLB flushes automatically.
  */
 #define SVM_FLAG_SUPERVISOR_MODE	(1<<1)
+/*
+ * The SVM_FLAG_GUEST_MODE flag is used when a guest process bind to a device.
+ * In this case the mm_struct is in the guest kernel or userspace, its life
+ * cycle is managed by VMM and VFIO layer. For IOMMU driver, this API provides
+ * means to bind/unbind guest CR3 with PASIDs allocated for a device.
+ */
+#define SVM_FLAG_GUEST_MODE	(1<<2)
+/*
+ * The SVM_FLAG_GUEST_PASID flag is used when a guest has its own PASID space,
+ * which requires guest and host PASID translation at both directions. We keep
+ * track of guest PASID in order to provide lookup service to device drivers.
+ * One such example is a physical function (PF) driver that supports mediated
+ * device (mdev) assignment. Guest programming of mdev configuration space can
+ * only be done with guest PASID, therefore PF driver needs to find the matching
+ * host PASID to program the real hardware.
+ */
+#define SVM_FLAG_GUEST_PASID	(1<<3)
 
 #ifdef CONFIG_INTEL_IOMMU_SVM