From: "Tian, Kevin" <kevin.tian@intel.com>
To: "Liu, Yi L" <yi.l.liu@intel.com>,
	"joro@8bytes.org" <joro@8bytes.org>,
	"alex.williamson@redhat.com" <alex.williamson@redhat.com>,
	"jgg@nvidia.com" <jgg@nvidia.com>,
	"robin.murphy@arm.com" <robin.murphy@arm.com>,
	"baolu.lu@linux.intel.com" <baolu.lu@linux.intel.com>
Cc: "cohuck@redhat.com" <cohuck@redhat.com>,
	"eric.auger@redhat.com" <eric.auger@redhat.com>,
	"nicolinc@nvidia.com" <nicolinc@nvidia.com>,
	"kvm@vger.kernel.org" <kvm@vger.kernel.org>,
	"mjrosato@linux.ibm.com" <mjrosato@linux.ibm.com>,
	"chao.p.peng@linux.intel.com" <chao.p.peng@linux.intel.com>,
	"yi.y.sun@linux.intel.com" <yi.y.sun@linux.intel.com>,
	"peterx@redhat.com" <peterx@redhat.com>,
	"jasowang@redhat.com" <jasowang@redhat.com>,
	"shameerali.kolothum.thodi@huawei.com"
	<shameerali.kolothum.thodi@huawei.com>,
	"lulu@redhat.com" <lulu@redhat.com>,
	"suravee.suthikulpanit@amd.com" <suravee.suthikulpanit@amd.com>,
	"iommu@lists.linux.dev" <iommu@lists.linux.dev>,
	"linux-kernel@vger.kernel.org" <linux-kernel@vger.kernel.org>,
	"linux-kselftest@vger.kernel.org"
	<linux-kselftest@vger.kernel.org>,
	"Duan, Zhenzhong" <zhenzhong.duan@intel.com>
Subject: RE: [PATCH v3 07/10] iommu/vt-d: Add iotlb flush for nested domain
Date: Wed, 24 May 2023 07:33:42 +0000	[thread overview]
Message-ID: <BN9PR11MB52768C95F6E9B943066F39218C419@BN9PR11MB5276.namprd11.prod.outlook.com> (raw)
In-Reply-To: <20230511145110.27707-8-yi.l.liu@intel.com>

> From: Liu, Yi L <yi.l.liu@intel.com>
> Sent: Thursday, May 11, 2023 10:51 PM
> 
> This is needed as the stage-1 page table of the nested domain is
> maintained outside the iommu subsystem, hence, needs to support iotlb
> flush requests.
> 
> This adds the data structure for flushing iotlb for the nested domain
> allocated with IOMMU_HWPT_TYPE_VTD_S1 type and the related callback
> to accept iotlb flush request from IOMMUFD.
> 
> This only exposes the interface for invalidating IOTLB, but not for
> device-TLB as device-TLB invalidation will be covered automatically
> in IOTLB invalidation if the affected device is ATS-capable.
> 
> Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
> Signed-off-by: Yi Liu <yi.l.liu@intel.com>

Following how you split patches in the former part of the series, this should
be split into three patches: one to introduce the uAPI changes, a second to
export the symbols, and a last one to actually add the iotlb flush.

> +static int intel_nested_cache_invalidate_user(struct iommu_domain *domain,
> +					      void *user_data)
> +{
> +	struct iommu_hwpt_invalidate_request_intel_vtd *req = user_data;
> +	struct iommu_hwpt_invalidate_intel_vtd *inv_info = user_data;
> +	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
> +	unsigned int entry_size = inv_info->entry_size;
> +	u64 uptr = inv_info->inv_data_uptr;
> +	u64 nr_uptr = inv_info->entry_nr_uptr;
> +	struct device_domain_info *info;
> +	u32 entry_nr, index;
> +	unsigned long flags;
> +	int ret = 0;
> +
> +	if (WARN_ON(!user_data))
> +		return 0;

A WARN_ON should lead to an error being returned.
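Something along these lines (a sketch only; -EINVAL picked as an example
error code):

	if (WARN_ON(!user_data))
		return -EINVAL;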

> +
> +	if (get_user(entry_nr, (uint32_t __user *)u64_to_user_ptr(nr_uptr)))
> +		return -EFAULT;
> +
> +	if (!entry_nr)
> +		return -EINVAL;

Having zero entries is not an error; it just means there is no work to do.
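i.e. a sketch of the suggested handling:

	if (!entry_nr)
		return 0;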

> +
> +	for (index = 0; index < entry_nr; index++) {
> +		ret = copy_struct_from_user(req, sizeof(*req),
> +					    u64_to_user_ptr(uptr + index * entry_size),
> +					    entry_size);
> +		if (ret) {
> +			pr_err_ratelimited("Failed to fetch invalidation request\n");
> +			break;
> +		}
> +
> +		if (req->__reserved || (req->flags & ~IOMMU_VTD_QI_FLAGS_LEAF) ||
> +		    !IS_ALIGNED(req->addr, VTD_PAGE_SIZE)) {
> +			ret = -EINVAL;
> +			break;
> +		}
> +
> +		spin_lock_irqsave(&dmar_domain->lock, flags);
> +		list_for_each_entry(info, &dmar_domain->devices, link)
> +			intel_nested_invalidate(info->dev, dmar_domain,
> +						req->addr, req->npages);
> +		spin_unlock_irqrestore(&dmar_domain->lock, flags);
> +	}
> +
> +	if (ret && put_user(index, (uint32_t __user *)u64_to_user_ptr(nr_uptr)))
> +		return -EFAULT;

You want to always update the number of handled entries, no matter whether
the loop succeeded or failed.
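e.g. drop the "ret &&" so the count is written back unconditionally (a sketch):

	if (put_user(index, (uint32_t __user *)u64_to_user_ptr(nr_uptr)))
		return -EFAULT;

	return ret;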

> diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
> index 39922f83ce34..b338b082950b 100644
> --- a/drivers/iommu/iommufd/main.c
> +++ b/drivers/iommu/iommufd/main.c
> @@ -282,6 +282,12 @@ union ucmd_buffer {
>  #ifdef CONFIG_IOMMUFD_TEST
>  	struct iommu_test_cmd test;
>  #endif
> +	/*
> +	 * hwpt_type specific structure used in the cache invalidation
> +	 * path.
> +	 */
> +	struct iommu_hwpt_invalidate_intel_vtd vtd;
> +	struct iommu_hwpt_invalidate_request_intel_vtd req_vtd;
>  };

Can you add some explanation in the commit message about why such vendor-specific
structures must be put in the generic ucmd_buffer?

> 
> +/**
> + * enum iommu_hwpt_intel_vtd_invalidate_flags - Flags for Intel VT-d

enum iommu_hwpt_vtd_s1_invalidate_flags

> + *                                              stage-1 page table cache
> + *                                              invalidation
> + * @IOMMU_VTD_QI_FLAGS_LEAF: The LEAF flag indicates whether only the
> + *                           leaf PTE caching needs to be invalidated
> + *                           and other paging structure caches can be
> + *                           preserved.
> + */

What about "Drain Reads" and "Drain Writes"? Is the user allowed/required
to provide those hints?
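If such hints were to be exposed, the flags enum might grow entries along
these lines (the drain bits and their positions below are hypothetical):

	enum iommu_hwpt_vtd_s1_invalidate_flags {
		IOMMU_VTD_QI_FLAGS_LEAF		= 1 << 0,	/* existing flag, assumed at bit 0 */
		/* hypothetical additions for the drain hints */
		IOMMU_VTD_QI_FLAGS_DRAIN_READ	= 1 << 1,
		IOMMU_VTD_QI_FLAGS_DRAIN_WRITE	= 1 << 2,
	};
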
> +
> +/**
> + * struct iommu_hwpt_invalidate_request_intel_vtd - Intel VT-d cache invalidation request

Here you put "intel_vtd" at the end of the name. Let's follow the same order
as the earlier definitions:

struct iommu_hwpt_vtd_s1_invalidate_desc
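
i.e. the definition would then read (same fields as in this patch, just the
suggested name):

	struct iommu_hwpt_vtd_s1_invalidate_desc {
		__u64 addr;
		__u64 npages;
		__u32 flags;
		__u32 __reserved;
	};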

> + * @addr: The start address of the addresses to be invalidated.
> + * @npages: Number of contiguous 4K pages to be invalidated.
> + * @flags: Combination of enum iommu_hwpt_intel_vtd_invalidate_flags
> + * @__reserved: Must be 0
> + *
> + * The Intel VT-d specific invalidation data for user-managed stage-1 cache
> + * invalidation under nested translation. Userspace uses this structure to
> + * tell host about the impacted caches after modifying the stage-1 page table.
> + *
> + * Invalidating all the caches related to the hw_pagetable by setting
> + * @addr==0 and @npages==__u64(-1).
> + */
> +struct iommu_hwpt_invalidate_request_intel_vtd {
> +	__u64 addr;
> +	__u64 npages;
> +	__u32 flags;
> +	__u32 __reserved;
> +};
> +
> +/**
> + * struct iommu_hwpt_invalidate_intel_vtd - Intel VT-d cache invalidation info

iommu_hwpt_vtd_s1_invalidate

> + * @flags: Must be 0
> + * @entry_size: Size in bytes of each cache invalidation request
> + * @entry_nr_uptr: User pointer to the number of invalidation requests.
> + *                 Kernel reads it to get the number of requests and
> + *                 updates the buffer with the number of requests that
> + *                 have been processed successfully. This pointer must
> + *                 point to a __u32 type of memory location.
> + * @inv_data_uptr: Pointer to the cache invalidation requests
> + *
> + * The Intel VT-d specific invalidation data for a set of cache invalidation
> + * requests. Kernel loops the requests one-by-one and stops when failure
> + * is encountered. The number of handled requests is reported to user by
> + * writing the buffer pointed by @entry_nr_uptr.
> + */
> +struct iommu_hwpt_invalidate_intel_vtd {
> +	__u32 flags;
> +	__u32 entry_size;
> +	__u64 entry_nr_uptr;
> +	__u64 inv_data_uptr;
> +};
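
For what it's worth, with these definitions a userspace caller invalidating
all caches of the hwpt would look roughly like below (a sketch only, using
the structure and field names from this patch):

	struct iommu_hwpt_invalidate_request_intel_vtd req = {
		.addr = 0,
		.npages = (__u64)-1,	/* all caches of this hwpt */
		.flags = 0,
	};
	__u32 entry_nr = 1;
	struct iommu_hwpt_invalidate_intel_vtd inv = {
		.flags = 0,
		.entry_size = sizeof(req),
		.entry_nr_uptr = (__u64)(uintptr_t)&entry_nr,
		.inv_data_uptr = (__u64)(uintptr_t)&req,
	};
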
> +
>  /**
>   * struct iommu_hwpt_invalidate - ioctl(IOMMU_HWPT_INVALIDATE)
>   * @size: sizeof(struct iommu_hwpt_invalidate)
> @@ -520,6 +577,8 @@ struct iommu_hw_info {
>   * +==============================+========================================+
>   * | @hwpt_type                   |     Data structure in @data_uptr       |
>   * +------------------------------+----------------------------------------+
> + * | IOMMU_HWPT_TYPE_VTD_S1       | struct iommu_hwpt_invalidate_intel_vtd |
> + * +------------------------------+----------------------------------------+
>   */
>  struct iommu_hwpt_invalidate {
>  	__u32 size;
> --
> 2.34.1


Thread overview: 40+ messages
2023-05-11 14:51 [PATCH v3 00/10] Add Intel VT-d nested translation Yi Liu
2023-05-11 14:51 ` [PATCH v3 01/10] iommufd: Add data structure for Intel VT-d stage-1 domain allocation Yi Liu
2023-05-24  6:59   ` Tian, Kevin
2023-05-25  2:28   ` Zhang, Tina
2023-05-29 20:00     ` Jason Gunthorpe
2023-05-29 19:53   ` Jason Gunthorpe
2023-05-11 14:51 ` [PATCH v3 02/10] iommu/vt-d: Extend dmar_domain to support nested domain Yi Liu
2023-05-24  7:02   ` Tian, Kevin
2023-05-26  2:56     ` Baolu Lu
2023-05-11 14:51 ` [PATCH v3 03/10] iommu/vt-d: Add helper for nested domain allocation Yi Liu
2023-05-11 14:51 ` [PATCH v3 04/10] iommu/vt-d: Add helper to setup pasid nested translation Yi Liu
2023-05-24  7:16   ` Tian, Kevin
2023-05-26  4:16     ` Baolu Lu
2023-06-07  8:34       ` Liu, Yi L
2023-06-08  3:32         ` Liu, Yi L
2023-06-08  3:35       ` Liu, Yi L
2023-06-08  3:37         ` Baolu Lu
2023-05-11 14:51 ` [PATCH v3 05/10] iommu/vt-d: Make domain attach helpers to be extern Yi Liu
2023-05-11 14:51 ` [PATCH v3 06/10] iommu/vt-d: Set the nested domain to a device Yi Liu
2023-05-24  7:22   ` Tian, Kevin
2023-05-26  4:24     ` Baolu Lu
2023-05-11 14:51 ` [PATCH v3 07/10] iommu/vt-d: Add iotlb flush for nested domain Yi Liu
2023-05-24  7:33   ` Tian, Kevin [this message]
2023-06-08  7:14     ` Liu, Yi L
2023-06-08  8:07       ` Baolu Lu
2023-06-20  6:22         ` Liu, Yi L
2023-05-11 14:51 ` [PATCH v3 08/10] iommu/vt-d: Add nested domain allocation Yi Liu
2023-05-11 14:51 ` [PATCH v3 09/10] iommu/vt-d: Implement hw_info for iommu capability query Yi Liu
2023-05-11 14:51 ` [PATCH v3 10/10] iommu/vt-d: Disallow nesting on domains with read-only mappings Yi Liu
2023-05-24  7:44   ` Tian, Kevin
2023-05-26  4:28     ` Baolu Lu
2023-05-24  8:59 ` [PATCH v3 00/10] Add Intel VT-d nested translation Tian, Kevin
2023-05-25 18:06   ` Alex Williamson
2023-05-26 11:25     ` Tian, Kevin
2023-05-29 18:43   ` Jason Gunthorpe
2023-05-30  0:16     ` Alex Williamson
2023-05-30 16:42       ` Jason Gunthorpe
2023-06-14  8:07     ` Tian, Kevin
2023-06-14 11:52       ` Jason Gunthorpe
2023-06-16  2:29         ` Tian, Kevin
