linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 0/3] vfio-pci support pasid attach/detach
@ 2023-11-27  6:39 Yi Liu
  2023-11-27  6:39 ` [PATCH 1/3] vfio-iommufd: Support pasid [at|de]tach for physical VFIO devices Yi Liu
                   ` (2 more replies)
  0 siblings, 3 replies; 31+ messages in thread
From: Yi Liu @ 2023-11-27  6:39 UTC (permalink / raw)
  To: joro, alex.williamson, jgg, kevin.tian, robin.murphy, baolu.lu
  Cc: cohuck, eric.auger, nicolinc, kvm, mjrosato, chao.p.peng,
	yi.l.liu, yi.y.sun, peterx, jasowang, shameerali.kolothum.thodi,
	lulu, suravee.suthikulpanit, iommu, linux-kernel,
	linux-kselftest, zhenzhong.duan, joao.m.martins, xin.zeng,
	yan.y.zhao

This adds the pasid attach/detach uAPIs for userspace to attach/detach
a PASID of a device to/from a given ioas/hwpt. Only vfio-pci driver is
enabled in this series. After this series, PASID-capable devices bound
with vfio-pci can report PASID capability to userspace and VM to enable
PASID usages like Shared Virtual Addressing (SVA).

This series first adds the helpers for pasid attach in vfio core and then
add the device cdev ioctls for pasid attach/detach, finally exposes the
device PASID capability to user. It depends on iommufd pasid attach/detach
series [1].

Complete code can be found at [2], tested with a draft Qemu branch[3]

[1] https://lore.kernel.org/linux-iommu/20231127063428.127436-1-yi.l.liu@intel.com/
[2] https://github.com/yiliu1765/iommufd/tree/iommufd_pasid
[3] https://github.com/yiliu1765/qemu/tree/zhenzhong/wip/iommufd_nesting_rfcv1%2Bpasid

Change log:

v1:
 - Report PASID capability via VFIO_DEVICE_FEATURE (Alex)

rfc: https://lore.kernel.org/linux-iommu/20230926093121.18676-1-yi.l.liu@intel.com/

Regards,
	Yi Liu

Kevin Tian (1):
  vfio-iommufd: Support pasid [at|de]tach for physical VFIO devices

Yi Liu (2):
  vfio: Add VFIO_DEVICE_PASID_[AT|DE]TACH_IOMMUFD_PT
  vfio: Report PASID capability via VFIO_DEVICE_FEATURE ioctl

 drivers/vfio/device_cdev.c       | 45 +++++++++++++++++++++
 drivers/vfio/iommufd.c           | 48 ++++++++++++++++++++++
 drivers/vfio/pci/vfio_pci.c      |  2 +
 drivers/vfio/pci/vfio_pci_core.c | 47 ++++++++++++++++++++++
 drivers/vfio/vfio.h              |  4 ++
 drivers/vfio/vfio_main.c         |  8 ++++
 include/linux/vfio.h             | 11 ++++++
 include/uapi/linux/vfio.h        | 68 ++++++++++++++++++++++++++++++++
 8 files changed, 233 insertions(+)

-- 
2.34.1


^ permalink raw reply	[flat|nested] 31+ messages in thread

* [PATCH 1/3] vfio-iommufd: Support pasid [at|de]tach for physical VFIO devices
  2023-11-27  6:39 [PATCH 0/3] vfio-pci support pasid attach/detach Yi Liu
@ 2023-11-27  6:39 ` Yi Liu
  2024-01-15 17:18   ` Jason Gunthorpe
  2023-11-27  6:39 ` [PATCH 2/3] vfio: Add VFIO_DEVICE_PASID_[AT|DE]TACH_IOMMUFD_PT Yi Liu
  2023-11-27  6:39 ` [PATCH 3/3] vfio: Report PASID capability via VFIO_DEVICE_FEATURE ioctl Yi Liu
  2 siblings, 1 reply; 31+ messages in thread
From: Yi Liu @ 2023-11-27  6:39 UTC (permalink / raw)
  To: joro, alex.williamson, jgg, kevin.tian, robin.murphy, baolu.lu
  Cc: cohuck, eric.auger, nicolinc, kvm, mjrosato, chao.p.peng,
	yi.l.liu, yi.y.sun, peterx, jasowang, shameerali.kolothum.thodi,
	lulu, suravee.suthikulpanit, iommu, linux-kernel,
	linux-kselftest, zhenzhong.duan, joao.m.martins, xin.zeng,
	yan.y.zhao

From: Kevin Tian <kevin.tian@intel.com>

This adds pasid_at|de]tach_ioas ops for attaching hwpt to pasid of a
device and the helpers for it. For now, only vfio-pci supports pasid
attach/detach.

Signed-off-by: Kevin Tian <kevin.tian@intel.com>
Signed-off-by: Yi Liu <yi.l.liu@intel.com>
---
 drivers/vfio/iommufd.c      | 48 +++++++++++++++++++++++++++++++++++++
 drivers/vfio/pci/vfio_pci.c |  2 ++
 include/linux/vfio.h        | 11 +++++++++
 3 files changed, 61 insertions(+)

diff --git a/drivers/vfio/iommufd.c b/drivers/vfio/iommufd.c
index 82eba6966fa5..43a702b9b4d3 100644
--- a/drivers/vfio/iommufd.c
+++ b/drivers/vfio/iommufd.c
@@ -119,6 +119,7 @@ int vfio_iommufd_physical_bind(struct vfio_device *vdev,
 	if (IS_ERR(idev))
 		return PTR_ERR(idev);
 	vdev->iommufd_device = idev;
+	xa_init(&vdev->pasid_pts);
 	return 0;
 }
 EXPORT_SYMBOL_GPL(vfio_iommufd_physical_bind);
@@ -127,6 +128,17 @@ void vfio_iommufd_physical_unbind(struct vfio_device *vdev)
 {
 	lockdep_assert_held(&vdev->dev_set->lock);
 
+	if (!xa_empty(&vdev->pasid_pts)) {
+		void *entry;
+		unsigned long index;
+
+		xa_for_each(&vdev->pasid_pts, index, entry) {
+			xa_erase(&vdev->pasid_pts, index);
+			iommufd_device_pasid_detach(vdev->iommufd_device, index);
+		}
+		xa_destroy(&vdev->pasid_pts);
+	}
+
 	if (vdev->iommufd_attached) {
 		iommufd_device_detach(vdev->iommufd_device);
 		vdev->iommufd_attached = false;
@@ -168,6 +180,42 @@ void vfio_iommufd_physical_detach_ioas(struct vfio_device *vdev)
 }
 EXPORT_SYMBOL_GPL(vfio_iommufd_physical_detach_ioas);
 
+int vfio_iommufd_physical_pasid_attach_ioas(struct vfio_device *vdev,
+					    u32 pasid, u32 *pt_id)
+{
+	void *entry;
+	int rc;
+
+	lockdep_assert_held(&vdev->dev_set->lock);
+
+	if (WARN_ON(!vdev->iommufd_device))
+		return -EINVAL;
+
+	entry = xa_load(&vdev->pasid_pts, pasid);
+	if (xa_is_value(entry))
+		rc = iommufd_device_pasid_replace(vdev->iommufd_device, pasid, pt_id);
+	else
+		rc = iommufd_device_pasid_attach(vdev->iommufd_device, pasid, pt_id);
+	if (rc)
+		return rc;
+	xa_store(&vdev->pasid_pts, pasid, xa_mk_value(*pt_id), GFP_KERNEL);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(vfio_iommufd_physical_pasid_attach_ioas);
+
+void vfio_iommufd_physical_pasid_detach_ioas(struct vfio_device *vdev, u32 pasid)
+{
+	lockdep_assert_held(&vdev->dev_set->lock);
+
+	if (WARN_ON(!vdev->iommufd_device) ||
+	    !xa_is_value(xa_load(&vdev->pasid_pts, pasid)))
+		return;
+
+	iommufd_device_pasid_detach(vdev->iommufd_device, pasid);
+	xa_erase(&vdev->pasid_pts, pasid);
+}
+EXPORT_SYMBOL_GPL(vfio_iommufd_physical_pasid_detach_ioas);
+
 /*
  * The emulated standard ops mean that vfio_device is going to use the
  * "mdev path" and will call vfio_pin_pages()/vfio_dma_rw(). Drivers using this
diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index cb5b7f865d58..e0198851ffd2 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -142,6 +142,8 @@ static const struct vfio_device_ops vfio_pci_ops = {
 	.unbind_iommufd	= vfio_iommufd_physical_unbind,
 	.attach_ioas	= vfio_iommufd_physical_attach_ioas,
 	.detach_ioas	= vfio_iommufd_physical_detach_ioas,
+	.pasid_attach_ioas	= vfio_iommufd_physical_pasid_attach_ioas,
+	.pasid_detach_ioas	= vfio_iommufd_physical_pasid_detach_ioas,
 };
 
 static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index 454e9295970c..7b06d1bc7cb3 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -66,6 +66,7 @@ struct vfio_device {
 	void (*put_kvm)(struct kvm *kvm);
 #if IS_ENABLED(CONFIG_IOMMUFD)
 	struct iommufd_device *iommufd_device;
+	struct xarray pasid_pts;
 	u8 iommufd_attached:1;
 #endif
 	u8 cdev_opened:1;
@@ -83,6 +84,8 @@ struct vfio_device {
  *		 bound iommufd. Undo in unbind_iommufd if @detach_ioas is not
  *		 called.
  * @detach_ioas: Opposite of attach_ioas
+ * @pasid_attach_ioas: The pasid variation of attach_ioas
+ * @pasid_detach_ioas: Opposite of pasid_attach_ioas
  * @open_device: Called when the first file descriptor is opened for this device
  * @close_device: Opposite of open_device
  * @read: Perform read(2) on device file descriptor
@@ -107,6 +110,8 @@ struct vfio_device_ops {
 	void	(*unbind_iommufd)(struct vfio_device *vdev);
 	int	(*attach_ioas)(struct vfio_device *vdev, u32 *pt_id);
 	void	(*detach_ioas)(struct vfio_device *vdev);
+	int	(*pasid_attach_ioas)(struct vfio_device *vdev, u32 pasid, u32 *pt_id);
+	void	(*pasid_detach_ioas)(struct vfio_device *vdev, u32 pasid);
 	int	(*open_device)(struct vfio_device *vdev);
 	void	(*close_device)(struct vfio_device *vdev);
 	ssize_t	(*read)(struct vfio_device *vdev, char __user *buf,
@@ -131,6 +136,8 @@ int vfio_iommufd_physical_bind(struct vfio_device *vdev,
 void vfio_iommufd_physical_unbind(struct vfio_device *vdev);
 int vfio_iommufd_physical_attach_ioas(struct vfio_device *vdev, u32 *pt_id);
 void vfio_iommufd_physical_detach_ioas(struct vfio_device *vdev);
+int vfio_iommufd_physical_pasid_attach_ioas(struct vfio_device *vdev, u32 pasid, u32 *pt_id);
+void vfio_iommufd_physical_pasid_detach_ioas(struct vfio_device *vdev, u32 pasid);
 int vfio_iommufd_emulated_bind(struct vfio_device *vdev,
 			       struct iommufd_ctx *ictx, u32 *out_device_id);
 void vfio_iommufd_emulated_unbind(struct vfio_device *vdev);
@@ -158,6 +165,10 @@ vfio_iommufd_get_dev_id(struct vfio_device *vdev, struct iommufd_ctx *ictx)
 	((int (*)(struct vfio_device *vdev, u32 *pt_id)) NULL)
 #define vfio_iommufd_physical_detach_ioas \
 	((void (*)(struct vfio_device *vdev)) NULL)
+#define vfio_iommufd_physical_pasid_attach_ioas \
+	((int (*)(struct vfio_device *vdev, u32 pasid, u32 *pt_id)) NULL)
+#define vfio_iommufd_physical_pasid_detach_ioas \
+	((void (*)(struct vfio_device *vdev, u32 pasid)) NULL)
 #define vfio_iommufd_emulated_bind                                      \
 	((int (*)(struct vfio_device *vdev, struct iommufd_ctx *ictx,   \
 		  u32 *out_device_id)) NULL)
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 31+ messages in thread

* [PATCH 2/3] vfio: Add VFIO_DEVICE_PASID_[AT|DE]TACH_IOMMUFD_PT
  2023-11-27  6:39 [PATCH 0/3] vfio-pci support pasid attach/detach Yi Liu
  2023-11-27  6:39 ` [PATCH 1/3] vfio-iommufd: Support pasid [at|de]tach for physical VFIO devices Yi Liu
@ 2023-11-27  6:39 ` Yi Liu
  2023-11-27  6:50   ` Duan, Zhenzhong
  2023-12-11 17:05   ` Alex Williamson
  2023-11-27  6:39 ` [PATCH 3/3] vfio: Report PASID capability via VFIO_DEVICE_FEATURE ioctl Yi Liu
  2 siblings, 2 replies; 31+ messages in thread
From: Yi Liu @ 2023-11-27  6:39 UTC (permalink / raw)
  To: joro, alex.williamson, jgg, kevin.tian, robin.murphy, baolu.lu
  Cc: cohuck, eric.auger, nicolinc, kvm, mjrosato, chao.p.peng,
	yi.l.liu, yi.y.sun, peterx, jasowang, shameerali.kolothum.thodi,
	lulu, suravee.suthikulpanit, iommu, linux-kernel,
	linux-kselftest, zhenzhong.duan, joao.m.martins, xin.zeng,
	yan.y.zhao

This adds ioctls for the userspace to attach a given pasid of a vfio
device to/from an IOAS/HWPT.

Signed-off-by: Yi Liu <yi.l.liu@intel.com>
---
 drivers/vfio/device_cdev.c | 45 +++++++++++++++++++++++++++++++
 drivers/vfio/vfio.h        |  4 +++
 drivers/vfio/vfio_main.c   |  8 ++++++
 include/uapi/linux/vfio.h  | 55 ++++++++++++++++++++++++++++++++++++++
 4 files changed, 112 insertions(+)

diff --git a/drivers/vfio/device_cdev.c b/drivers/vfio/device_cdev.c
index e75da0a70d1f..c2ac7ed44537 100644
--- a/drivers/vfio/device_cdev.c
+++ b/drivers/vfio/device_cdev.c
@@ -210,6 +210,51 @@ int vfio_df_ioctl_detach_pt(struct vfio_device_file *df,
 	return 0;
 }
 
+int vfio_df_ioctl_pasid_attach_pt(struct vfio_device_file *df,
+				  struct vfio_device_pasid_attach_iommufd_pt __user *arg)
+{
+	struct vfio_device *device = df->device;
+	struct vfio_device_pasid_attach_iommufd_pt attach;
+	unsigned long minsz;
+	int ret;
+
+	minsz = offsetofend(struct vfio_device_pasid_attach_iommufd_pt, pt_id);
+
+	if (copy_from_user(&attach, arg, minsz))
+		return -EFAULT;
+
+	if (attach.argsz < minsz || attach.flags)
+		return -EINVAL;
+
+	mutex_lock(&device->dev_set->lock);
+	ret = device->ops->pasid_attach_ioas(device, attach.pasid, &attach.pt_id);
+	mutex_unlock(&device->dev_set->lock);
+
+	return ret;
+}
+
+int vfio_df_ioctl_pasid_detach_pt(struct vfio_device_file *df,
+				  struct vfio_device_pasid_detach_iommufd_pt __user *arg)
+{
+	struct vfio_device *device = df->device;
+	struct vfio_device_pasid_detach_iommufd_pt detach;
+	unsigned long minsz;
+
+	minsz = offsetofend(struct vfio_device_pasid_detach_iommufd_pt, flags);
+
+	if (copy_from_user(&detach, arg, minsz))
+		return -EFAULT;
+
+	if (detach.argsz < minsz || detach.flags)
+		return -EINVAL;
+
+	mutex_lock(&device->dev_set->lock);
+	device->ops->pasid_detach_ioas(device, detach.pasid);
+	mutex_unlock(&device->dev_set->lock);
+
+	return 0;
+}
+
 static char *vfio_device_devnode(const struct device *dev, umode_t *mode)
 {
 	return kasprintf(GFP_KERNEL, "vfio/devices/%s", dev_name(dev));
diff --git a/drivers/vfio/vfio.h b/drivers/vfio/vfio.h
index 307e3f29b527..d228cdb6b345 100644
--- a/drivers/vfio/vfio.h
+++ b/drivers/vfio/vfio.h
@@ -353,6 +353,10 @@ int vfio_df_ioctl_attach_pt(struct vfio_device_file *df,
 			    struct vfio_device_attach_iommufd_pt __user *arg);
 int vfio_df_ioctl_detach_pt(struct vfio_device_file *df,
 			    struct vfio_device_detach_iommufd_pt __user *arg);
+int vfio_df_ioctl_pasid_attach_pt(struct vfio_device_file *df,
+				  struct vfio_device_pasid_attach_iommufd_pt __user *arg);
+int vfio_df_ioctl_pasid_detach_pt(struct vfio_device_file *df,
+				  struct vfio_device_pasid_detach_iommufd_pt __user *arg);
 
 #if IS_ENABLED(CONFIG_VFIO_DEVICE_CDEV)
 void vfio_init_device_cdev(struct vfio_device *device);
diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c
index 8d4995ada74a..ff50c239873d 100644
--- a/drivers/vfio/vfio_main.c
+++ b/drivers/vfio/vfio_main.c
@@ -1240,6 +1240,14 @@ static long vfio_device_fops_unl_ioctl(struct file *filep,
 		case VFIO_DEVICE_DETACH_IOMMUFD_PT:
 			ret = vfio_df_ioctl_detach_pt(df, uptr);
 			goto out;
+
+		case VFIO_DEVICE_PASID_ATTACH_IOMMUFD_PT:
+			ret = vfio_df_ioctl_pasid_attach_pt(df, uptr);
+			goto out;
+
+		case VFIO_DEVICE_PASID_DETACH_IOMMUFD_PT:
+			ret = vfio_df_ioctl_pasid_detach_pt(df, uptr);
+			goto out;
 		}
 	}
 
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 94b3badefde3..495193629029 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -977,6 +977,61 @@ struct vfio_device_detach_iommufd_pt {
 
 #define VFIO_DEVICE_DETACH_IOMMUFD_PT		_IO(VFIO_TYPE, VFIO_BASE + 20)
 
+/*
+ * VFIO_DEVICE_PASID_ATTACH_IOMMUFD_PT - _IOW(VFIO_TYPE, VFIO_BASE + 21,
+ *					      struct vfio_device_pasid_attach_iommufd_pt)
+ * @argsz:	User filled size of this data.
+ * @flags:	Must be 0.
+ * @pasid:	The pasid to be attached.
+ * @pt_id:	Input the target id which can represent an ioas or a hwpt
+ *		allocated via iommufd subsystem.
+ *		Output the input ioas id or the attached hwpt id which could
+ *		be the specified hwpt itself or a hwpt automatically created
+ *		for the specified ioas by kernel during the attachment.
+ *
+ * Associate a pasid (of a cdev device) with an address space within the
+ * bound iommufd. Undo by VFIO_DEVICE_PASID_DETACH_IOMMUFD_PT or device fd
+ * close. This is only allowed on cdev fds.
+ *
+ * If a pasid is currently attached to a valid hw_pagetable (hwpt), without
+ * doing a VFIO_DEVICE_PASID_DETACH_IOMMUFD_PT, a second
+ * VFIO_DEVICE_PASID_ATTACH_IOMMUFD_PT ioctl passing in another hwpt id is
+ * allowed. This action, also known as a hwpt replacement, will replace the
+ * pasid's currently attached hwpt with a new hwpt corresponding to the given
+ * @pt_id.
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+struct vfio_device_pasid_attach_iommufd_pt {
+	__u32	argsz;
+	__u32	flags;
+	__u32	pasid;
+	__u32	pt_id;
+};
+
+#define VFIO_DEVICE_PASID_ATTACH_IOMMUFD_PT	_IO(VFIO_TYPE, VFIO_BASE + 21)
+
+/*
+ * VFIO_DEVICE_PASID_DETACH_IOMMUFD_PT - _IOW(VFIO_TYPE, VFIO_BASE + 22,
+ *					      struct vfio_device_pasid_detach_iommufd_pt)
+ * @argsz:	User filled size of this data.
+ * @flags:	Must be 0.
+ * @pasid:	The pasid to be detached.
+ *
+ * Remove the association of a pasid (of a cdev device) and its current
+ * associated address space.  After it, the pasid of the device should be in
+ * a blocking DMA state.  This is only allowed on cdev fds.
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+struct vfio_device_pasid_detach_iommufd_pt {
+	__u32	argsz;
+	__u32	flags;
+	__u32	pasid;
+};
+
+#define VFIO_DEVICE_PASID_DETACH_IOMMUFD_PT	_IO(VFIO_TYPE, VFIO_BASE + 22)
+
 /*
  * Provide support for setting a PCI VF Token, which is used as a shared
  * secret between PF and VF drivers.  This feature may only be set on a
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 31+ messages in thread

* [PATCH 3/3] vfio: Report PASID capability via VFIO_DEVICE_FEATURE ioctl
  2023-11-27  6:39 [PATCH 0/3] vfio-pci support pasid attach/detach Yi Liu
  2023-11-27  6:39 ` [PATCH 1/3] vfio-iommufd: Support pasid [at|de]tach for physical VFIO devices Yi Liu
  2023-11-27  6:39 ` [PATCH 2/3] vfio: Add VFIO_DEVICE_PASID_[AT|DE]TACH_IOMMUFD_PT Yi Liu
@ 2023-11-27  6:39 ` Yi Liu
  2023-11-27  7:28   ` Duan, Zhenzhong
                     ` (2 more replies)
  2 siblings, 3 replies; 31+ messages in thread
From: Yi Liu @ 2023-11-27  6:39 UTC (permalink / raw)
  To: joro, alex.williamson, jgg, kevin.tian, robin.murphy, baolu.lu
  Cc: cohuck, eric.auger, nicolinc, kvm, mjrosato, chao.p.peng,
	yi.l.liu, yi.y.sun, peterx, jasowang, shameerali.kolothum.thodi,
	lulu, suravee.suthikulpanit, iommu, linux-kernel,
	linux-kselftest, zhenzhong.duan, joao.m.martins, xin.zeng,
	yan.y.zhao

This reports the PASID capability data to userspace via VFIO_DEVICE_FEATURE,
hence userspace could probe PASID capability by it. This is a bit different
with other capabilities which are reported to userspace when the user reads
the device's PCI configuration space. There are two reasons for this.

 - First, Qemu by default exposes all available PCI capabilities in vfio-pci
   config space to the guest as read-only, so adding PASID capability in the
   vfio-pci config space will make it exposed to the guest automatically while
   an old Qemu doesn't really support it.

 - Second, PASID capability does not exit on VFs (instead shares the cap of
   the PF). Creating a virtual PASID capability in vfio-pci config space needs
   to find a hole to place it, but doing so may require device specific
   knowledge to avoid potential conflict with device specific registers like
   hiden bits in VF config space. It's simpler by moving this burden to the
   VMM instead of maintaining a quirk system in the kernel.

Suggested-by: Alex Williamson <alex.williamson@redhat.com>
Signed-off-by: Yi Liu <yi.l.liu@intel.com>
---
 drivers/vfio/pci/vfio_pci_core.c | 47 ++++++++++++++++++++++++++++++++
 include/uapi/linux/vfio.h        | 13 +++++++++
 2 files changed, 60 insertions(+)

diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
index 1929103ee59a..8038aa45500e 100644
--- a/drivers/vfio/pci/vfio_pci_core.c
+++ b/drivers/vfio/pci/vfio_pci_core.c
@@ -1495,6 +1495,51 @@ static int vfio_pci_core_feature_token(struct vfio_device *device, u32 flags,
 	return 0;
 }
 
+static int vfio_pci_core_feature_pasid(struct vfio_device *device, u32 flags,
+				       struct vfio_device_feature_pasid __user *arg,
+				       size_t argsz)
+{
+	struct vfio_pci_core_device *vdev =
+		container_of(device, struct vfio_pci_core_device, vdev);
+	struct vfio_device_feature_pasid pasid = { 0 };
+	struct pci_dev *pdev = vdev->pdev;
+	u32 capabilities = 0;
+	int ret;
+
+	/* We do not support SET of the PASID capability */
+	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
+				 sizeof(pasid));
+	if (ret != 1)
+		return ret;
+
+	/*
+	 * Needs go to PF if the device is VF as VF shares its PF's
+	 * PASID Capability.
+	 */
+	if (pdev->is_virtfn)
+		pdev = pci_physfn(pdev);
+
+	if (!pdev->pasid_enabled)
+		goto out;
+
+#ifdef CONFIG_PCI_PASID
+	pci_read_config_dword(pdev, pdev->pasid_cap + PCI_PASID_CAP,
+			      &capabilities);
+#endif
+
+	if (capabilities & PCI_PASID_CAP_EXEC)
+		pasid.capabilities |= VFIO_DEVICE_PASID_CAP_EXEC;
+	if (capabilities & PCI_PASID_CAP_PRIV)
+		pasid.capabilities |= VFIO_DEVICE_PASID_CAP_PRIV;
+
+	pasid.width = (capabilities >> 8) & 0x1f;
+
+out:
+	if (copy_to_user(arg, &pasid, sizeof(pasid)))
+		return -EFAULT;
+	return 0;
+}
+
 int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags,
 				void __user *arg, size_t argsz)
 {
@@ -1508,6 +1553,8 @@ int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags,
 		return vfio_pci_core_pm_exit(device, flags, arg, argsz);
 	case VFIO_DEVICE_FEATURE_PCI_VF_TOKEN:
 		return vfio_pci_core_feature_token(device, flags, arg, argsz);
+	case VFIO_DEVICE_FEATURE_PASID:
+		return vfio_pci_core_feature_pasid(device, flags, arg, argsz);
 	default:
 		return -ENOTTY;
 	}
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 495193629029..8326faf8622b 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -1512,6 +1512,19 @@ struct vfio_device_feature_bus_master {
 };
 #define VFIO_DEVICE_FEATURE_BUS_MASTER 10
 
+/**
+ * Upon VFIO_DEVICE_FEATURE_GET, return the PASID capability for the device.
+ * Zero width means no support for PASID.
+ */
+struct vfio_device_feature_pasid {
+	__u16 capabilities;
+#define VFIO_DEVICE_PASID_CAP_EXEC	(1 << 0)
+#define VFIO_DEVICE_PASID_CAP_PRIV	(1 << 1)
+	__u8 width;
+	__u8 __reserved;
+};
+#define VFIO_DEVICE_FEATURE_PASID 11
+
 /* -------- API for Type1 VFIO IOMMU -------- */
 
 /**
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 31+ messages in thread

* RE: [PATCH 2/3] vfio: Add VFIO_DEVICE_PASID_[AT|DE]TACH_IOMMUFD_PT
  2023-11-27  6:39 ` [PATCH 2/3] vfio: Add VFIO_DEVICE_PASID_[AT|DE]TACH_IOMMUFD_PT Yi Liu
@ 2023-11-27  6:50   ` Duan, Zhenzhong
  2023-11-28  3:06     ` Yi Liu
  2023-12-11 17:05   ` Alex Williamson
  1 sibling, 1 reply; 31+ messages in thread
From: Duan, Zhenzhong @ 2023-11-27  6:50 UTC (permalink / raw)
  To: Liu, Yi L, joro, alex.williamson, jgg, Tian, Kevin, robin.murphy,
	baolu.lu
  Cc: cohuck, eric.auger, nicolinc, kvm, mjrosato, chao.p.peng,
	yi.y.sun, peterx, jasowang, shameerali.kolothum.thodi, lulu,
	suravee.suthikulpanit, iommu, linux-kernel, linux-kselftest,
	joao.m.martins, Zeng, Xin, Zhao, Yan Y



>-----Original Message-----
>From: Liu, Yi L <yi.l.liu@intel.com>
>Sent: Monday, November 27, 2023 2:39 PM
>Subject: [PATCH 2/3] vfio: Add
>VFIO_DEVICE_PASID_[AT|DE]TACH_IOMMUFD_PT
>
>This adds ioctls for the userspace to attach a given pasid of a vfio
>device to/from an IOAS/HWPT.
>
>Signed-off-by: Yi Liu <yi.l.liu@intel.com>
>---
> drivers/vfio/device_cdev.c | 45 +++++++++++++++++++++++++++++++
> drivers/vfio/vfio.h        |  4 +++
> drivers/vfio/vfio_main.c   |  8 ++++++
> include/uapi/linux/vfio.h  | 55 ++++++++++++++++++++++++++++++++++++++
> 4 files changed, 112 insertions(+)
>
>diff --git a/drivers/vfio/device_cdev.c b/drivers/vfio/device_cdev.c
>index e75da0a70d1f..c2ac7ed44537 100644
>--- a/drivers/vfio/device_cdev.c
>+++ b/drivers/vfio/device_cdev.c
>@@ -210,6 +210,51 @@ int vfio_df_ioctl_detach_pt(struct vfio_device_file *df,
> 	return 0;
> }
>
>+int vfio_df_ioctl_pasid_attach_pt(struct vfio_device_file *df,
>+				  struct vfio_device_pasid_attach_iommufd_pt
>__user *arg)
>+{
>+	struct vfio_device *device = df->device;
>+	struct vfio_device_pasid_attach_iommufd_pt attach;
>+	unsigned long minsz;
>+	int ret;
>+
>+	minsz = offsetofend(struct vfio_device_pasid_attach_iommufd_pt, pt_id);
>+
>+	if (copy_from_user(&attach, arg, minsz))
>+		return -EFAULT;
>+
>+	if (attach.argsz < minsz || attach.flags)
>+		return -EINVAL;
>+
>+	mutex_lock(&device->dev_set->lock);
>+	ret = device->ops->pasid_attach_ioas(device, attach.pasid,
>&attach.pt_id);
>+	mutex_unlock(&device->dev_set->lock);
>+
>+	return ret;
>+}
>+
>+int vfio_df_ioctl_pasid_detach_pt(struct vfio_device_file *df,
>+				  struct vfio_device_pasid_detach_iommufd_pt
>__user *arg)
>+{
>+	struct vfio_device *device = df->device;
>+	struct vfio_device_pasid_detach_iommufd_pt detach;
>+	unsigned long minsz;
>+
>+	minsz = offsetofend(struct vfio_device_pasid_detach_iommufd_pt, flags);

Pasid isn't copied, should use pasid here?

Thanks
Zhenzhong

^ permalink raw reply	[flat|nested] 31+ messages in thread

* RE: [PATCH 3/3] vfio: Report PASID capability via VFIO_DEVICE_FEATURE ioctl
  2023-11-27  6:39 ` [PATCH 3/3] vfio: Report PASID capability via VFIO_DEVICE_FEATURE ioctl Yi Liu
@ 2023-11-27  7:28   ` Duan, Zhenzhong
  2023-11-28  3:11     ` Yi Liu
  2023-12-07  8:47   ` Tian, Kevin
  2023-12-11 18:03   ` Alex Williamson
  2 siblings, 1 reply; 31+ messages in thread
From: Duan, Zhenzhong @ 2023-11-27  7:28 UTC (permalink / raw)
  To: Liu, Yi L, joro, alex.williamson, jgg, Tian, Kevin, robin.murphy,
	baolu.lu
  Cc: cohuck, eric.auger, nicolinc, kvm, mjrosato, chao.p.peng,
	yi.y.sun, peterx, jasowang, shameerali.kolothum.thodi, lulu,
	suravee.suthikulpanit, iommu, linux-kernel, linux-kselftest,
	joao.m.martins, Zeng, Xin, Zhao, Yan Y



>-----Original Message-----
>From: Liu, Yi L <yi.l.liu@intel.com>
>Sent: Monday, November 27, 2023 2:39 PM
>Subject: [PATCH 3/3] vfio: Report PASID capability via VFIO_DEVICE_FEATURE
>ioctl
>
>This reports the PASID capability data to userspace via VFIO_DEVICE_FEATURE,
>hence userspace could probe PASID capability by it. This is a bit different
>with other capabilities which are reported to userspace when the user reads
>the device's PCI configuration space. There are two reasons for this.
>
> - First, Qemu by default exposes all available PCI capabilities in vfio-pci
>   config space to the guest as read-only, so adding PASID capability in the
>   vfio-pci config space will make it exposed to the guest automatically while
>   an old Qemu doesn't really support it.
>
> - Second, PASID capability does not exit on VFs (instead shares the cap of
>   the PF). Creating a virtual PASID capability in vfio-pci config space needs
>   to find a hole to place it, but doing so may require device specific
>   knowledge to avoid potential conflict with device specific registers like
>   hiden bits in VF config space. It's simpler by moving this burden to the
>   VMM instead of maintaining a quirk system in the kernel.
>
>Suggested-by: Alex Williamson <alex.williamson@redhat.com>
>Signed-off-by: Yi Liu <yi.l.liu@intel.com>
>---
> drivers/vfio/pci/vfio_pci_core.c | 47 ++++++++++++++++++++++++++++++++
> include/uapi/linux/vfio.h        | 13 +++++++++
> 2 files changed, 60 insertions(+)
>
>diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
>index 1929103ee59a..8038aa45500e 100644
>--- a/drivers/vfio/pci/vfio_pci_core.c
>+++ b/drivers/vfio/pci/vfio_pci_core.c
>@@ -1495,6 +1495,51 @@ static int vfio_pci_core_feature_token(struct
>vfio_device *device, u32 flags,
> 	return 0;
> }
>
>+static int vfio_pci_core_feature_pasid(struct vfio_device *device, u32 flags,
>+				       struct vfio_device_feature_pasid __user
>*arg,
>+				       size_t argsz)
>+{
>+	struct vfio_pci_core_device *vdev =
>+		container_of(device, struct vfio_pci_core_device, vdev);
>+	struct vfio_device_feature_pasid pasid = { 0 };
>+	struct pci_dev *pdev = vdev->pdev;
>+	u32 capabilities = 0;
>+	int ret;
>+
>+	/* We do not support SET of the PASID capability */
>+	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
>+				 sizeof(pasid));
>+	if (ret != 1)
>+		return ret;
>+
>+	/*
>+	 * Needs go to PF if the device is VF as VF shares its PF's
>+	 * PASID Capability.
>+	 */
>+	if (pdev->is_virtfn)
>+		pdev = pci_physfn(pdev);
>+
>+	if (!pdev->pasid_enabled)
>+		goto out;

Does a PF bound to VFIO have pasid enabled by default?
Isn't the guest kernel's responsibility to enable pasid cap of an assigned PF?

Thanks
Zhenzhong

>+
>+#ifdef CONFIG_PCI_PASID
>+	pci_read_config_dword(pdev, pdev->pasid_cap + PCI_PASID_CAP,
>+			      &capabilities);
>+#endif
>+
>+	if (capabilities & PCI_PASID_CAP_EXEC)
>+		pasid.capabilities |= VFIO_DEVICE_PASID_CAP_EXEC;
>+	if (capabilities & PCI_PASID_CAP_PRIV)
>+		pasid.capabilities |= VFIO_DEVICE_PASID_CAP_PRIV;
>+
>+	pasid.width = (capabilities >> 8) & 0x1f;
>+
>+out:
>+	if (copy_to_user(arg, &pasid, sizeof(pasid)))
>+		return -EFAULT;
>+	return 0;
>+}
>+
> int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags,
> 				void __user *arg, size_t argsz)
> {
>@@ -1508,6 +1553,8 @@ int vfio_pci_core_ioctl_feature(struct vfio_device
>*device, u32 flags,
> 		return vfio_pci_core_pm_exit(device, flags, arg, argsz);
> 	case VFIO_DEVICE_FEATURE_PCI_VF_TOKEN:
> 		return vfio_pci_core_feature_token(device, flags, arg, argsz);
>+	case VFIO_DEVICE_FEATURE_PASID:
>+		return vfio_pci_core_feature_pasid(device, flags, arg, argsz);
> 	default:
> 		return -ENOTTY;
> 	}
>diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
>index 495193629029..8326faf8622b 100644
>--- a/include/uapi/linux/vfio.h
>+++ b/include/uapi/linux/vfio.h
>@@ -1512,6 +1512,19 @@ struct vfio_device_feature_bus_master {
> };
> #define VFIO_DEVICE_FEATURE_BUS_MASTER 10
>
>+/**
>+ * Upon VFIO_DEVICE_FEATURE_GET, return the PASID capability for the device.
>+ * Zero width means no support for PASID.
>+ */
>+struct vfio_device_feature_pasid {
>+	__u16 capabilities;
>+#define VFIO_DEVICE_PASID_CAP_EXEC	(1 << 0)
>+#define VFIO_DEVICE_PASID_CAP_PRIV	(1 << 1)
>+	__u8 width;
>+	__u8 __reserved;
>+};
>+#define VFIO_DEVICE_FEATURE_PASID 11
>+
> /* -------- API for Type1 VFIO IOMMU -------- */
>
> /**
>--
>2.34.1


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 2/3] vfio: Add VFIO_DEVICE_PASID_[AT|DE]TACH_IOMMUFD_PT
  2023-11-27  6:50   ` Duan, Zhenzhong
@ 2023-11-28  3:06     ` Yi Liu
  0 siblings, 0 replies; 31+ messages in thread
From: Yi Liu @ 2023-11-28  3:06 UTC (permalink / raw)
  To: Duan, Zhenzhong, joro, alex.williamson, jgg, Tian, Kevin,
	robin.murphy, baolu.lu
  Cc: cohuck, eric.auger, nicolinc, kvm, mjrosato, chao.p.peng,
	yi.y.sun, peterx, jasowang, shameerali.kolothum.thodi, lulu,
	suravee.suthikulpanit, iommu, linux-kernel, linux-kselftest,
	joao.m.martins, Zeng, Xin, Zhao, Yan Y



On 2023/11/27 14:50, Duan, Zhenzhong wrote:
> 
> 
>> -----Original Message-----
>> From: Liu, Yi L <yi.l.liu@intel.com>
>> Sent: Monday, November 27, 2023 2:39 PM
>> Subject: [PATCH 2/3] vfio: Add
>> VFIO_DEVICE_PASID_[AT|DE]TACH_IOMMUFD_PT
>>
>> This adds ioctls for the userspace to attach a given pasid of a vfio
>> device to/from an IOAS/HWPT.
>>
>> Signed-off-by: Yi Liu <yi.l.liu@intel.com>
>> ---
>> drivers/vfio/device_cdev.c | 45 +++++++++++++++++++++++++++++++
>> drivers/vfio/vfio.h        |  4 +++
>> drivers/vfio/vfio_main.c   |  8 ++++++
>> include/uapi/linux/vfio.h  | 55 ++++++++++++++++++++++++++++++++++++++
>> 4 files changed, 112 insertions(+)
>>
>> diff --git a/drivers/vfio/device_cdev.c b/drivers/vfio/device_cdev.c
>> index e75da0a70d1f..c2ac7ed44537 100644
>> --- a/drivers/vfio/device_cdev.c
>> +++ b/drivers/vfio/device_cdev.c
>> @@ -210,6 +210,51 @@ int vfio_df_ioctl_detach_pt(struct vfio_device_file *df,
>> 	return 0;
>> }
>>
>> +int vfio_df_ioctl_pasid_attach_pt(struct vfio_device_file *df,
>> +				  struct vfio_device_pasid_attach_iommufd_pt
>> __user *arg)
>> +{
>> +	struct vfio_device *device = df->device;
>> +	struct vfio_device_pasid_attach_iommufd_pt attach;
>> +	unsigned long minsz;
>> +	int ret;
>> +
>> +	minsz = offsetofend(struct vfio_device_pasid_attach_iommufd_pt, pt_id);
>> +
>> +	if (copy_from_user(&attach, arg, minsz))
>> +		return -EFAULT;
>> +
>> +	if (attach.argsz < minsz || attach.flags)
>> +		return -EINVAL;
>> +
>> +	mutex_lock(&device->dev_set->lock);
>> +	ret = device->ops->pasid_attach_ioas(device, attach.pasid,
>> &attach.pt_id);
>> +	mutex_unlock(&device->dev_set->lock);
>> +
>> +	return ret;
>> +}
>> +
>> +int vfio_df_ioctl_pasid_detach_pt(struct vfio_device_file *df,
>> +				  struct vfio_device_pasid_detach_iommufd_pt
>> __user *arg)
>> +{
>> +	struct vfio_device *device = df->device;
>> +	struct vfio_device_pasid_detach_iommufd_pt detach;
>> +	unsigned long minsz;
>> +
>> +	minsz = offsetofend(struct vfio_device_pasid_detach_iommufd_pt, flags);
> 
> Pasid isn't copied, should use pasid here?

good catch! will fix it.

-- 
Regards,
Yi Liu

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 3/3] vfio: Report PASID capability via VFIO_DEVICE_FEATURE ioctl
  2023-11-27  7:28   ` Duan, Zhenzhong
@ 2023-11-28  3:11     ` Yi Liu
  2023-11-28  4:23       ` Duan, Zhenzhong
  0 siblings, 1 reply; 31+ messages in thread
From: Yi Liu @ 2023-11-28  3:11 UTC (permalink / raw)
  To: Duan, Zhenzhong, joro, alex.williamson, jgg, Tian, Kevin,
	robin.murphy, baolu.lu
  Cc: cohuck, eric.auger, nicolinc, kvm, mjrosato, chao.p.peng,
	yi.y.sun, peterx, jasowang, shameerali.kolothum.thodi, lulu,
	suravee.suthikulpanit, iommu, linux-kernel, linux-kselftest,
	joao.m.martins, Zeng, Xin, Zhao, Yan Y

On 2023/11/27 15:28, Duan, Zhenzhong wrote:
> 
> 
>> -----Original Message-----
>> From: Liu, Yi L <yi.l.liu@intel.com>
>> Sent: Monday, November 27, 2023 2:39 PM
>> Subject: [PATCH 3/3] vfio: Report PASID capability via VFIO_DEVICE_FEATURE
>> ioctl
>>
>> This reports the PASID capability data to userspace via VFIO_DEVICE_FEATURE,
>> hence userspace could probe PASID capability by it. This is a bit different
>> with other capabilities which are reported to userspace when the user reads
>> the device's PCI configuration space. There are two reasons for this.
>>
>> - First, Qemu by default exposes all available PCI capabilities in vfio-pci
>>    config space to the guest as read-only, so adding PASID capability in the
>>    vfio-pci config space will make it exposed to the guest automatically while
>>    an old Qemu doesn't really support it.
>>
>> - Second, PASID capability does not exit on VFs (instead shares the cap of
>>    the PF). Creating a virtual PASID capability in vfio-pci config space needs
>>    to find a hole to place it, but doing so may require device specific
>>    knowledge to avoid potential conflict with device specific registers like
>>    hiden bits in VF config space. It's simpler by moving this burden to the
>>    VMM instead of maintaining a quirk system in the kernel.
>>
>> Suggested-by: Alex Williamson <alex.williamson@redhat.com>
>> Signed-off-by: Yi Liu <yi.l.liu@intel.com>
>> ---
>> drivers/vfio/pci/vfio_pci_core.c | 47 ++++++++++++++++++++++++++++++++
>> include/uapi/linux/vfio.h        | 13 +++++++++
>> 2 files changed, 60 insertions(+)
>>
>> diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
>> index 1929103ee59a..8038aa45500e 100644
>> --- a/drivers/vfio/pci/vfio_pci_core.c
>> +++ b/drivers/vfio/pci/vfio_pci_core.c
>> @@ -1495,6 +1495,51 @@ static int vfio_pci_core_feature_token(struct
>> vfio_device *device, u32 flags,
>> 	return 0;
>> }
>>
>> +static int vfio_pci_core_feature_pasid(struct vfio_device *device, u32 flags,
>> +				       struct vfio_device_feature_pasid __user
>> *arg,
>> +				       size_t argsz)
>> +{
>> +	struct vfio_pci_core_device *vdev =
>> +		container_of(device, struct vfio_pci_core_device, vdev);
>> +	struct vfio_device_feature_pasid pasid = { 0 };
>> +	struct pci_dev *pdev = vdev->pdev;
>> +	u32 capabilities = 0;
>> +	int ret;
>> +
>> +	/* We do not support SET of the PASID capability */
>> +	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
>> +				 sizeof(pasid));
>> +	if (ret != 1)
>> +		return ret;
>> +
>> +	/*
>> +	 * Needs go to PF if the device is VF as VF shares its PF's
>> +	 * PASID Capability.
>> +	 */
>> +	if (pdev->is_virtfn)
>> +		pdev = pci_physfn(pdev);
>> +
>> +	if (!pdev->pasid_enabled)
>> +		goto out;
> 
> Does a PF bound to VFIO have pasid enabled by default?

Today, host iommu driver (at least intel iommu driver) enables it in the
time of device probe and seems not changed afterward. So yes, VFIO should
see it if pasid is enabled.

> Isn't the guest kernel's responsibility to enable pasid cap of an assigned PF?

guest kernel should not have the capability to change host's pasid
configuration. It can only write to its own vconfig emulated by
hypervisor.

> Thanks
> Zhenzhong
> 
>> +
>> +#ifdef CONFIG_PCI_PASID
>> +	pci_read_config_dword(pdev, pdev->pasid_cap + PCI_PASID_CAP,
>> +			      &capabilities);
>> +#endif
>> +
>> +	if (capabilities & PCI_PASID_CAP_EXEC)
>> +		pasid.capabilities |= VFIO_DEVICE_PASID_CAP_EXEC;
>> +	if (capabilities & PCI_PASID_CAP_PRIV)
>> +		pasid.capabilities |= VFIO_DEVICE_PASID_CAP_PRIV;
>> +
>> +	pasid.width = (capabilities >> 8) & 0x1f;
>> +
>> +out:
>> +	if (copy_to_user(arg, &pasid, sizeof(pasid)))
>> +		return -EFAULT;
>> +	return 0;
>> +}
>> +
>> int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags,
>> 				void __user *arg, size_t argsz)
>> {
>> @@ -1508,6 +1553,8 @@ int vfio_pci_core_ioctl_feature(struct vfio_device
>> *device, u32 flags,
>> 		return vfio_pci_core_pm_exit(device, flags, arg, argsz);
>> 	case VFIO_DEVICE_FEATURE_PCI_VF_TOKEN:
>> 		return vfio_pci_core_feature_token(device, flags, arg, argsz);
>> +	case VFIO_DEVICE_FEATURE_PASID:
>> +		return vfio_pci_core_feature_pasid(device, flags, arg, argsz);
>> 	default:
>> 		return -ENOTTY;
>> 	}
>> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
>> index 495193629029..8326faf8622b 100644
>> --- a/include/uapi/linux/vfio.h
>> +++ b/include/uapi/linux/vfio.h
>> @@ -1512,6 +1512,19 @@ struct vfio_device_feature_bus_master {
>> };
>> #define VFIO_DEVICE_FEATURE_BUS_MASTER 10
>>
>> +/**
>> + * Upon VFIO_DEVICE_FEATURE_GET, return the PASID capability for the device.
>> + * Zero width means no support for PASID.
>> + */
>> +struct vfio_device_feature_pasid {
>> +	__u16 capabilities;
>> +#define VFIO_DEVICE_PASID_CAP_EXEC	(1 << 0)
>> +#define VFIO_DEVICE_PASID_CAP_PRIV	(1 << 1)
>> +	__u8 width;
>> +	__u8 __reserved;
>> +};
>> +#define VFIO_DEVICE_FEATURE_PASID 11
>> +
>> /* -------- API for Type1 VFIO IOMMU -------- */
>>
>> /**
>> --
>> 2.34.1
> 

-- 
Regards,
Yi Liu

^ permalink raw reply	[flat|nested] 31+ messages in thread

* RE: [PATCH 3/3] vfio: Report PASID capability via VFIO_DEVICE_FEATURE ioctl
  2023-11-28  3:11     ` Yi Liu
@ 2023-11-28  4:23       ` Duan, Zhenzhong
  0 siblings, 0 replies; 31+ messages in thread
From: Duan, Zhenzhong @ 2023-11-28  4:23 UTC (permalink / raw)
  To: Liu, Yi L, joro, alex.williamson, jgg, Tian, Kevin, robin.murphy,
	baolu.lu
  Cc: cohuck, eric.auger, nicolinc, kvm, mjrosato, chao.p.peng,
	yi.y.sun, peterx, jasowang, shameerali.kolothum.thodi, lulu,
	suravee.suthikulpanit, iommu, linux-kernel, linux-kselftest,
	joao.m.martins, Zeng, Xin, Zhao, Yan Y



>-----Original Message-----
>From: Liu, Yi L <yi.l.liu@intel.com>
>Sent: Tuesday, November 28, 2023 11:12 AM
>Subject: Re: [PATCH 3/3] vfio: Report PASID capability via VFIO_DEVICE_FEATURE
>ioctl
>
>On 2023/11/27 15:28, Duan, Zhenzhong wrote:
>>
>>
>>> -----Original Message-----
>>> From: Liu, Yi L <yi.l.liu@intel.com>
>>> Sent: Monday, November 27, 2023 2:39 PM
>>> Subject: [PATCH 3/3] vfio: Report PASID capability via VFIO_DEVICE_FEATURE
>>> ioctl
>>>
>>> This reports the PASID capability data to userspace via VFIO_DEVICE_FEATURE,
>>> hence userspace could probe PASID capability by it. This is a bit different
>>> with other capabilities which are reported to userspace when the user reads
>>> the device's PCI configuration space. There are two reasons for this.
>>>
>>> - First, Qemu by default exposes all available PCI capabilities in vfio-pci
>>>    config space to the guest as read-only, so adding PASID capability in the
>>>    vfio-pci config space will make it exposed to the guest automatically while
>>>    an old Qemu doesn't really support it.
>>>
>>> - Second, PASID capability does not exit on VFs (instead shares the cap of
>>>    the PF). Creating a virtual PASID capability in vfio-pci config space needs
>>>    to find a hole to place it, but doing so may require device specific
>>>    knowledge to avoid potential conflict with device specific registers like
>>>    hiden bits in VF config space. It's simpler by moving this burden to the
>>>    VMM instead of maintaining a quirk system in the kernel.
>>>
>>> Suggested-by: Alex Williamson <alex.williamson@redhat.com>
>>> Signed-off-by: Yi Liu <yi.l.liu@intel.com>
>>> ---
>>> drivers/vfio/pci/vfio_pci_core.c | 47 ++++++++++++++++++++++++++++++++
>>> include/uapi/linux/vfio.h        | 13 +++++++++
>>> 2 files changed, 60 insertions(+)
>>>
>>> diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
>>> index 1929103ee59a..8038aa45500e 100644
>>> --- a/drivers/vfio/pci/vfio_pci_core.c
>>> +++ b/drivers/vfio/pci/vfio_pci_core.c
>>> @@ -1495,6 +1495,51 @@ static int vfio_pci_core_feature_token(struct
>>> vfio_device *device, u32 flags,
>>> 	return 0;
>>> }
>>>
>>> +static int vfio_pci_core_feature_pasid(struct vfio_device *device, u32 flags,
>>> +				       struct vfio_device_feature_pasid __user
>>> *arg,
>>> +				       size_t argsz)
>>> +{
>>> +	struct vfio_pci_core_device *vdev =
>>> +		container_of(device, struct vfio_pci_core_device, vdev);
>>> +	struct vfio_device_feature_pasid pasid = { 0 };
>>> +	struct pci_dev *pdev = vdev->pdev;
>>> +	u32 capabilities = 0;
>>> +	int ret;
>>> +
>>> +	/* We do not support SET of the PASID capability */
>>> +	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
>>> +				 sizeof(pasid));
>>> +	if (ret != 1)
>>> +		return ret;
>>> +
>>> +	/*
>>> +	 * Needs go to PF if the device is VF as VF shares its PF's
>>> +	 * PASID Capability.
>>> +	 */
>>> +	if (pdev->is_virtfn)
>>> +		pdev = pci_physfn(pdev);
>>> +
>>> +	if (!pdev->pasid_enabled)
>>> +		goto out;
>>
>> Does a PF bound to VFIO have pasid enabled by default?
>
>Today, host iommu driver (at least intel iommu driver) enables it in the
>time of device probe and seems not changed afterward. So yes, VFIO should
>see it if pasid is enabled.
>
>> Isn't the guest kernel's responsibility to enable pasid cap of an assigned PF?
>
>guest kernel should not have the capability to change host's pasid
>configuration. It can only write to its own vconfig emulated by
>hypervisor.

Understood, thanks Yi.

BRs.
Zhenzhong

^ permalink raw reply	[flat|nested] 31+ messages in thread

* RE: [PATCH 3/3] vfio: Report PASID capability via VFIO_DEVICE_FEATURE ioctl
  2023-11-27  6:39 ` [PATCH 3/3] vfio: Report PASID capability via VFIO_DEVICE_FEATURE ioctl Yi Liu
  2023-11-27  7:28   ` Duan, Zhenzhong
@ 2023-12-07  8:47   ` Tian, Kevin
  2023-12-11  8:08     ` Yi Liu
  2023-12-11 18:03   ` Alex Williamson
  2 siblings, 1 reply; 31+ messages in thread
From: Tian, Kevin @ 2023-12-07  8:47 UTC (permalink / raw)
  To: Liu, Yi L, joro, alex.williamson, jgg, robin.murphy, baolu.lu
  Cc: cohuck, eric.auger, nicolinc, kvm, mjrosato, chao.p.peng,
	yi.y.sun, peterx, jasowang, shameerali.kolothum.thodi, lulu,
	suravee.suthikulpanit, iommu, linux-kernel, linux-kselftest,
	Duan, Zhenzhong, joao.m.martins, Zeng, Xin, Zhao, Yan Y

> From: Liu, Yi L <yi.l.liu@intel.com>
> Sent: Monday, November 27, 2023 2:39 PM
> 
> +static int vfio_pci_core_feature_pasid(struct vfio_device *device, u32 flags,
> +				       struct vfio_device_feature_pasid __user
> *arg,
> +				       size_t argsz)
> +{
> +	struct vfio_pci_core_device *vdev =
> +		container_of(device, struct vfio_pci_core_device, vdev);
> +	struct vfio_device_feature_pasid pasid = { 0 };
> +	struct pci_dev *pdev = vdev->pdev;
> +	u32 capabilities = 0;
> +	int ret;
> +
> +	/* We do not support SET of the PASID capability */

this line alone is meaningless. Please explain the reason e.g. due to
no PASID capability per VF...

> +	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
> +				 sizeof(pasid));
> +	if (ret != 1)
> +		return ret;
> +
> +	/*
> +	 * Needs go to PF if the device is VF as VF shares its PF's
> +	 * PASID Capability.
> +	 */

/* VF shares the PASID capability of its PF */

> +	if (pdev->is_virtfn)
> +		pdev = pci_physfn(pdev);
> +
> +	if (!pdev->pasid_enabled)
> +		goto out;
> +
> +#ifdef CONFIG_PCI_PASID
> +	pci_read_config_dword(pdev, pdev->pasid_cap + PCI_PASID_CAP,
> +			      &capabilities);
> +#endif

#ifdef is unnecessary. If CONFIG_PCI_PASID is false pdev->pasid_enabled
won't be set anyway.

and it should read from PCI_PASID_CTRL which indicates whether a
capability is actually enabled.

> +/**
> + * Upon VFIO_DEVICE_FEATURE_GET, return the PASID capability for the
> device.
> + * Zero width means no support for PASID.

also mention the encoding of this field according to PCIe spec.

or turn it to a plain number field.

> + */
> +struct vfio_device_feature_pasid {
> +	__u16 capabilities;
> +#define VFIO_DEVICE_PASID_CAP_EXEC	(1 << 0)
> +#define VFIO_DEVICE_PASID_CAP_PRIV	(1 << 1)
> +	__u8 width;
> +	__u8 __reserved;
> +};
> +#define VFIO_DEVICE_FEATURE_PASID 11
> +
>  /* -------- API for Type1 VFIO IOMMU -------- */
> 
>  /**
> --
> 2.34.1


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 3/3] vfio: Report PASID capability via VFIO_DEVICE_FEATURE ioctl
  2023-12-07  8:47   ` Tian, Kevin
@ 2023-12-11  8:08     ` Yi Liu
  2023-12-12  2:20       ` Tian, Kevin
  0 siblings, 1 reply; 31+ messages in thread
From: Yi Liu @ 2023-12-11  8:08 UTC (permalink / raw)
  To: Tian, Kevin, joro, alex.williamson, jgg, robin.murphy, baolu.lu
  Cc: cohuck, eric.auger, nicolinc, kvm, mjrosato, chao.p.peng,
	yi.y.sun, peterx, jasowang, shameerali.kolothum.thodi, lulu,
	suravee.suthikulpanit, iommu, linux-kernel, linux-kselftest,
	Duan, Zhenzhong, joao.m.martins, Zeng, Xin, Zhao, Yan Y

On 2023/12/7 16:47, Tian, Kevin wrote:
>> From: Liu, Yi L <yi.l.liu@intel.com>
>> Sent: Monday, November 27, 2023 2:39 PM
>>
>> +static int vfio_pci_core_feature_pasid(struct vfio_device *device, u32 flags,
>> +				       struct vfio_device_feature_pasid __user
>> *arg,
>> +				       size_t argsz)
>> +{
>> +	struct vfio_pci_core_device *vdev =
>> +		container_of(device, struct vfio_pci_core_device, vdev);
>> +	struct vfio_device_feature_pasid pasid = { 0 };
>> +	struct pci_dev *pdev = vdev->pdev;
>> +	u32 capabilities = 0;
>> +	int ret;
>> +
>> +	/* We do not support SET of the PASID capability */
> 
> this line alone is meaningless. Please explain the reason e.g. due to
> no PASID capability per VF...

sure. I think the major reason is we don't allow userspace to change the
PASID configuration. is it?

> 
>> +	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
>> +				 sizeof(pasid));
>> +	if (ret != 1)
>> +		return ret;
>> +
>> +	/*
>> +	 * Needs go to PF if the device is VF as VF shares its PF's
>> +	 * PASID Capability.
>> +	 */
> 
> /* VF shares the PASID capability of its PF */

got it.

>> +	if (pdev->is_virtfn)
>> +		pdev = pci_physfn(pdev);
>> +
>> +	if (!pdev->pasid_enabled)
>> +		goto out;
>> +
>> +#ifdef CONFIG_PCI_PASID
>> +	pci_read_config_dword(pdev, pdev->pasid_cap + PCI_PASID_CAP,
>> +			      &capabilities);
>> +#endif
> 
> #ifdef is unnecessary. If CONFIG_PCI_PASID is false pdev->pasid_enabled
> won't be set anyway.

it's sad that the pdev->pasid_cap is defined under #if CONFIG_PCI_PASID.
Perhaps we can have a wrapper for it.

> and it should read from PCI_PASID_CTRL which indicates whether a
> capability is actually enabled.

yes, for the EXEC and PRIV capability, needs to check if it's enabled or
not before reporting.

> 
>> +/**
>> + * Upon VFIO_DEVICE_FEATURE_GET, return the PASID capability for the
>> device.
>> + * Zero width means no support for PASID.
> 
> also mention the encoding of this field according to PCIe spec.

yes.

> or turn it to a plain number field.

It is not exact the same as the spec since bit0 is reserved. But
here bit0 is used as well.

>> + */
>> +struct vfio_device_feature_pasid {
>> +	__u16 capabilities;
>> +#define VFIO_DEVICE_PASID_CAP_EXEC	(1 << 0)
>> +#define VFIO_DEVICE_PASID_CAP_PRIV	(1 << 1)
>> +	__u8 width;
>> +	__u8 __reserved;
>> +};
>> +#define VFIO_DEVICE_FEATURE_PASID 11
>> +
>>   /* -------- API for Type1 VFIO IOMMU -------- */
>>
>>   /**
>> --
>> 2.34.1
> 

-- 
Regards,
Yi Liu

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 2/3] vfio: Add VFIO_DEVICE_PASID_[AT|DE]TACH_IOMMUFD_PT
  2023-11-27  6:39 ` [PATCH 2/3] vfio: Add VFIO_DEVICE_PASID_[AT|DE]TACH_IOMMUFD_PT Yi Liu
  2023-11-27  6:50   ` Duan, Zhenzhong
@ 2023-12-11 17:05   ` Alex Williamson
  2023-12-12  3:02     ` Yi Liu
  1 sibling, 1 reply; 31+ messages in thread
From: Alex Williamson @ 2023-12-11 17:05 UTC (permalink / raw)
  To: Yi Liu
  Cc: joro, jgg, kevin.tian, robin.murphy, baolu.lu, cohuck,
	eric.auger, nicolinc, kvm, mjrosato, chao.p.peng, yi.y.sun,
	peterx, jasowang, shameerali.kolothum.thodi, lulu,
	suravee.suthikulpanit, iommu, linux-kernel, linux-kselftest,
	zhenzhong.duan, joao.m.martins, xin.zeng, yan.y.zhao

On Sun, 26 Nov 2023 22:39:08 -0800
Yi Liu <yi.l.liu@intel.com> wrote:

> This adds ioctls for the userspace to attach a given pasid of a vfio
> device to/from an IOAS/HWPT.
> 
> Signed-off-by: Yi Liu <yi.l.liu@intel.com>
> ---
>  drivers/vfio/device_cdev.c | 45 +++++++++++++++++++++++++++++++
>  drivers/vfio/vfio.h        |  4 +++
>  drivers/vfio/vfio_main.c   |  8 ++++++
>  include/uapi/linux/vfio.h  | 55 ++++++++++++++++++++++++++++++++++++++
>  4 files changed, 112 insertions(+)
> 
> diff --git a/drivers/vfio/device_cdev.c b/drivers/vfio/device_cdev.c
> index e75da0a70d1f..c2ac7ed44537 100644
> --- a/drivers/vfio/device_cdev.c
> +++ b/drivers/vfio/device_cdev.c
> @@ -210,6 +210,51 @@ int vfio_df_ioctl_detach_pt(struct vfio_device_file *df,
>  	return 0;
>  }
>  
> +int vfio_df_ioctl_pasid_attach_pt(struct vfio_device_file *df,
> +				  struct vfio_device_pasid_attach_iommufd_pt __user *arg)
> +{
> +	struct vfio_device *device = df->device;
> +	struct vfio_device_pasid_attach_iommufd_pt attach;
> +	unsigned long minsz;
> +	int ret;
> +
> +	minsz = offsetofend(struct vfio_device_pasid_attach_iommufd_pt, pt_id);
> +
> +	if (copy_from_user(&attach, arg, minsz))
> +		return -EFAULT;
> +
> +	if (attach.argsz < minsz || attach.flags)
> +		return -EINVAL;
> +
> +	mutex_lock(&device->dev_set->lock);
> +	ret = device->ops->pasid_attach_ioas(device, attach.pasid, &attach.pt_id);

These callbacks were only implemented for vfio-pci in the previous
patch but they're called unconditionally.  Thanks,

Alex

> +	mutex_unlock(&device->dev_set->lock);
> +
> +	return ret;
> +}
> +
> +int vfio_df_ioctl_pasid_detach_pt(struct vfio_device_file *df,
> +				  struct vfio_device_pasid_detach_iommufd_pt __user *arg)
> +{
> +	struct vfio_device *device = df->device;
> +	struct vfio_device_pasid_detach_iommufd_pt detach;
> +	unsigned long minsz;
> +
> +	minsz = offsetofend(struct vfio_device_pasid_detach_iommufd_pt, flags);
> +
> +	if (copy_from_user(&detach, arg, minsz))
> +		return -EFAULT;
> +
> +	if (detach.argsz < minsz || detach.flags)
> +		return -EINVAL;
> +
> +	mutex_lock(&device->dev_set->lock);
> +	device->ops->pasid_detach_ioas(device, detach.pasid);
> +	mutex_unlock(&device->dev_set->lock);
> +
> +	return 0;
> +}
> +
>  static char *vfio_device_devnode(const struct device *dev, umode_t *mode)
>  {
>  	return kasprintf(GFP_KERNEL, "vfio/devices/%s", dev_name(dev));
> diff --git a/drivers/vfio/vfio.h b/drivers/vfio/vfio.h
> index 307e3f29b527..d228cdb6b345 100644
> --- a/drivers/vfio/vfio.h
> +++ b/drivers/vfio/vfio.h
> @@ -353,6 +353,10 @@ int vfio_df_ioctl_attach_pt(struct vfio_device_file *df,
>  			    struct vfio_device_attach_iommufd_pt __user *arg);
>  int vfio_df_ioctl_detach_pt(struct vfio_device_file *df,
>  			    struct vfio_device_detach_iommufd_pt __user *arg);
> +int vfio_df_ioctl_pasid_attach_pt(struct vfio_device_file *df,
> +				  struct vfio_device_pasid_attach_iommufd_pt __user *arg);
> +int vfio_df_ioctl_pasid_detach_pt(struct vfio_device_file *df,
> +				  struct vfio_device_pasid_detach_iommufd_pt __user *arg);
>  
>  #if IS_ENABLED(CONFIG_VFIO_DEVICE_CDEV)
>  void vfio_init_device_cdev(struct vfio_device *device);
> diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c
> index 8d4995ada74a..ff50c239873d 100644
> --- a/drivers/vfio/vfio_main.c
> +++ b/drivers/vfio/vfio_main.c
> @@ -1240,6 +1240,14 @@ static long vfio_device_fops_unl_ioctl(struct file *filep,
>  		case VFIO_DEVICE_DETACH_IOMMUFD_PT:
>  			ret = vfio_df_ioctl_detach_pt(df, uptr);
>  			goto out;
> +
> +		case VFIO_DEVICE_PASID_ATTACH_IOMMUFD_PT:
> +			ret = vfio_df_ioctl_pasid_attach_pt(df, uptr);
> +			goto out;
> +
> +		case VFIO_DEVICE_PASID_DETACH_IOMMUFD_PT:
> +			ret = vfio_df_ioctl_pasid_detach_pt(df, uptr);
> +			goto out;
>  		}
>  	}
>  
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index 94b3badefde3..495193629029 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -977,6 +977,61 @@ struct vfio_device_detach_iommufd_pt {
>  
>  #define VFIO_DEVICE_DETACH_IOMMUFD_PT		_IO(VFIO_TYPE, VFIO_BASE + 20)
>  
> +/*
> + * VFIO_DEVICE_PASID_ATTACH_IOMMUFD_PT - _IOW(VFIO_TYPE, VFIO_BASE + 21,
> + *					      struct vfio_device_pasid_attach_iommufd_pt)
> + * @argsz:	User filled size of this data.
> + * @flags:	Must be 0.
> + * @pasid:	The pasid to be attached.
> + * @pt_id:	Input the target id which can represent an ioas or a hwpt
> + *		allocated via iommufd subsystem.
> + *		Output the input ioas id or the attached hwpt id which could
> + *		be the specified hwpt itself or a hwpt automatically created
> + *		for the specified ioas by kernel during the attachment.
> + *
> + * Associate a pasid (of a cdev device) with an address space within the
> + * bound iommufd. Undo by VFIO_DEVICE_PASID_DETACH_IOMMUFD_PT or device fd
> + * close. This is only allowed on cdev fds.
> + *
> + * If a pasid is currently attached to a valid hw_pagetable (hwpt), without
> + * doing a VFIO_DEVICE_PASID_DETACH_IOMMUFD_PT, a second
> + * VFIO_DEVICE_PASID_ATTACH_IOMMUFD_PT ioctl passing in another hwpt id is
> + * allowed. This action, also known as a hwpt replacement, will replace the
> + * pasid's currently attached hwpt with a new hwpt corresponding to the given
> + * @pt_id.
> + *
> + * Return: 0 on success, -errno on failure.
> + */
> +struct vfio_device_pasid_attach_iommufd_pt {
> +	__u32	argsz;
> +	__u32	flags;
> +	__u32	pasid;
> +	__u32	pt_id;
> +};
> +
> +#define VFIO_DEVICE_PASID_ATTACH_IOMMUFD_PT	_IO(VFIO_TYPE, VFIO_BASE + 21)
> +
> +/*
> + * VFIO_DEVICE_PASID_DETACH_IOMMUFD_PT - _IOW(VFIO_TYPE, VFIO_BASE + 22,
> + *					      struct vfio_device_pasid_detach_iommufd_pt)
> + * @argsz:	User filled size of this data.
> + * @flags:	Must be 0.
> + * @pasid:	The pasid to be detached.
> + *
> + * Remove the association of a pasid (of a cdev device) and its current
> + * associated address space.  After it, the pasid of the device should be in
> + * a blocking DMA state.  This is only allowed on cdev fds.
> + *
> + * Return: 0 on success, -errno on failure.
> + */
> +struct vfio_device_pasid_detach_iommufd_pt {
> +	__u32	argsz;
> +	__u32	flags;
> +	__u32	pasid;
> +};
> +
> +#define VFIO_DEVICE_PASID_DETACH_IOMMUFD_PT	_IO(VFIO_TYPE, VFIO_BASE + 22)
> +
>  /*
>   * Provide support for setting a PCI VF Token, which is used as a shared
>   * secret between PF and VF drivers.  This feature may only be set on a


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 3/3] vfio: Report PASID capability via VFIO_DEVICE_FEATURE ioctl
  2023-11-27  6:39 ` [PATCH 3/3] vfio: Report PASID capability via VFIO_DEVICE_FEATURE ioctl Yi Liu
  2023-11-27  7:28   ` Duan, Zhenzhong
  2023-12-07  8:47   ` Tian, Kevin
@ 2023-12-11 18:03   ` Alex Williamson
  2023-12-11 18:10     ` Jason Gunthorpe
                       ` (2 more replies)
  2 siblings, 3 replies; 31+ messages in thread
From: Alex Williamson @ 2023-12-11 18:03 UTC (permalink / raw)
  To: Yi Liu
  Cc: joro, jgg, kevin.tian, robin.murphy, baolu.lu, cohuck,
	eric.auger, nicolinc, kvm, mjrosato, chao.p.peng, yi.y.sun,
	peterx, jasowang, shameerali.kolothum.thodi, lulu,
	suravee.suthikulpanit, iommu, linux-kernel, linux-kselftest,
	zhenzhong.duan, joao.m.martins, xin.zeng, yan.y.zhao

On Sun, 26 Nov 2023 22:39:09 -0800
Yi Liu <yi.l.liu@intel.com> wrote:

> This reports the PASID capability data to userspace via VFIO_DEVICE_FEATURE,
> hence userspace could probe PASID capability by it. This is a bit different
> with other capabilities which are reported to userspace when the user reads
> the device's PCI configuration space. There are two reasons for this.
> 
>  - First, Qemu by default exposes all available PCI capabilities in vfio-pci
>    config space to the guest as read-only, so adding PASID capability in the
>    vfio-pci config space will make it exposed to the guest automatically while
>    an old Qemu doesn't really support it.

Shouldn't we also be working on hiding the PASID capability in QEMU
ASAP?  This feature only allows QEMU to know PASID control is actually
available, not the guest.  Maybe we're hoping this is really only used
by VFs where there's no capability currently exposed to the guest?

>  - Second, PASID capability does not exit on VFs (instead shares the cap of

s/exit/exist/

>    the PF). Creating a virtual PASID capability in vfio-pci config space needs
>    to find a hole to place it, but doing so may require device specific
>    knowledge to avoid potential conflict with device specific registers like
>    hiden bits in VF config space. It's simpler by moving this burden to the
>    VMM instead of maintaining a quirk system in the kernel.

This feels a bit like an incomplete solution though and we might
already posses device specific knowledge in the form of a variant
driver.  Should this feature structure include a flag + field that
could serve to generically indicate to the VMM a location for
implementing the PASID capability?  The default core implementation
might fill this only for PFs where clearly an emualted PASID capability
can overlap the physical capability.  Thanks,

Alex

> Suggested-by: Alex Williamson <alex.williamson@redhat.com>
> Signed-off-by: Yi Liu <yi.l.liu@intel.com>
> ---
>  drivers/vfio/pci/vfio_pci_core.c | 47 ++++++++++++++++++++++++++++++++
>  include/uapi/linux/vfio.h        | 13 +++++++++
>  2 files changed, 60 insertions(+)
> 
> diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
> index 1929103ee59a..8038aa45500e 100644
> --- a/drivers/vfio/pci/vfio_pci_core.c
> +++ b/drivers/vfio/pci/vfio_pci_core.c
> @@ -1495,6 +1495,51 @@ static int vfio_pci_core_feature_token(struct vfio_device *device, u32 flags,
>  	return 0;
>  }
>  
> +static int vfio_pci_core_feature_pasid(struct vfio_device *device, u32 flags,
> +				       struct vfio_device_feature_pasid __user *arg,
> +				       size_t argsz)
> +{
> +	struct vfio_pci_core_device *vdev =
> +		container_of(device, struct vfio_pci_core_device, vdev);
> +	struct vfio_device_feature_pasid pasid = { 0 };
> +	struct pci_dev *pdev = vdev->pdev;
> +	u32 capabilities = 0;
> +	int ret;
> +
> +	/* We do not support SET of the PASID capability */
> +	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
> +				 sizeof(pasid));
> +	if (ret != 1)
> +		return ret;
> +
> +	/*
> +	 * Needs go to PF if the device is VF as VF shares its PF's
> +	 * PASID Capability.
> +	 */
> +	if (pdev->is_virtfn)
> +		pdev = pci_physfn(pdev);
> +
> +	if (!pdev->pasid_enabled)
> +		goto out;
> +
> +#ifdef CONFIG_PCI_PASID
> +	pci_read_config_dword(pdev, pdev->pasid_cap + PCI_PASID_CAP,
> +			      &capabilities);
> +#endif
> +
> +	if (capabilities & PCI_PASID_CAP_EXEC)
> +		pasid.capabilities |= VFIO_DEVICE_PASID_CAP_EXEC;
> +	if (capabilities & PCI_PASID_CAP_PRIV)
> +		pasid.capabilities |= VFIO_DEVICE_PASID_CAP_PRIV;
> +
> +	pasid.width = (capabilities >> 8) & 0x1f;
> +
> +out:
> +	if (copy_to_user(arg, &pasid, sizeof(pasid)))
> +		return -EFAULT;
> +	return 0;
> +}
> +
>  int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags,
>  				void __user *arg, size_t argsz)
>  {
> @@ -1508,6 +1553,8 @@ int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags,
>  		return vfio_pci_core_pm_exit(device, flags, arg, argsz);
>  	case VFIO_DEVICE_FEATURE_PCI_VF_TOKEN:
>  		return vfio_pci_core_feature_token(device, flags, arg, argsz);
> +	case VFIO_DEVICE_FEATURE_PASID:
> +		return vfio_pci_core_feature_pasid(device, flags, arg, argsz);
>  	default:
>  		return -ENOTTY;
>  	}
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index 495193629029..8326faf8622b 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -1512,6 +1512,19 @@ struct vfio_device_feature_bus_master {
>  };
>  #define VFIO_DEVICE_FEATURE_BUS_MASTER 10
>  
> +/**
> + * Upon VFIO_DEVICE_FEATURE_GET, return the PASID capability for the device.
> + * Zero width means no support for PASID.
> + */
> +struct vfio_device_feature_pasid {
> +	__u16 capabilities;
> +#define VFIO_DEVICE_PASID_CAP_EXEC	(1 << 0)
> +#define VFIO_DEVICE_PASID_CAP_PRIV	(1 << 1)
> +	__u8 width;
> +	__u8 __reserved;
> +};
> +#define VFIO_DEVICE_FEATURE_PASID 11
> +
>  /* -------- API for Type1 VFIO IOMMU -------- */
>  
>  /**


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 3/3] vfio: Report PASID capability via VFIO_DEVICE_FEATURE ioctl
  2023-12-11 18:03   ` Alex Williamson
@ 2023-12-11 18:10     ` Jason Gunthorpe
  2023-12-11 18:49       ` Alex Williamson
  2023-12-12  2:16     ` Tian, Kevin
  2023-12-12  2:43     ` Duan, Zhenzhong
  2 siblings, 1 reply; 31+ messages in thread
From: Jason Gunthorpe @ 2023-12-11 18:10 UTC (permalink / raw)
  To: Alex Williamson
  Cc: Yi Liu, joro, kevin.tian, robin.murphy, baolu.lu, cohuck,
	eric.auger, nicolinc, kvm, mjrosato, chao.p.peng, yi.y.sun,
	peterx, jasowang, shameerali.kolothum.thodi, lulu,
	suravee.suthikulpanit, iommu, linux-kernel, linux-kselftest,
	zhenzhong.duan, joao.m.martins, xin.zeng, yan.y.zhao

On Mon, Dec 11, 2023 at 11:03:45AM -0700, Alex Williamson wrote:
> On Sun, 26 Nov 2023 22:39:09 -0800
> Yi Liu <yi.l.liu@intel.com> wrote:
> 
> > This reports the PASID capability data to userspace via VFIO_DEVICE_FEATURE,
> > hence userspace could probe PASID capability by it. This is a bit different
> > with other capabilities which are reported to userspace when the user reads
> > the device's PCI configuration space. There are two reasons for this.
> > 
> >  - First, Qemu by default exposes all available PCI capabilities in vfio-pci
> >    config space to the guest as read-only, so adding PASID capability in the
> >    vfio-pci config space will make it exposed to the guest automatically while
> >    an old Qemu doesn't really support it.
> 
> Shouldn't we also be working on hiding the PASID capability in QEMU
> ASAP?  This feature only allows QEMU to know PASID control is actually
> available, not the guest.  Maybe we're hoping this is really only used
> by VFs where there's no capability currently exposed to the guest?

Makes sense, yes

> >    the PF). Creating a virtual PASID capability in vfio-pci config space needs
> >    to find a hole to place it, but doing so may require device specific
> >    knowledge to avoid potential conflict with device specific registers like
> >    hiden bits in VF config space. It's simpler by moving this burden to the
> >    VMM instead of maintaining a quirk system in the kernel.
> 
> This feels a bit like an incomplete solution though and we might
> already posses device specific knowledge in the form of a variant
> driver.  Should this feature structure include a flag + field that
> could serve to generically indicate to the VMM a location for
> implementing the PASID capability?  The default core implementation
> might fill this only for PFs where clearly an emualted PASID capability
> can overlap the physical capability.  Thanks,

In many ways I would perfer to solve this for good by having a way to
learn a range of available config space - I liked the suggestion to
use a DVSEC to mark empty space.

Jason

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 3/3] vfio: Report PASID capability via VFIO_DEVICE_FEATURE ioctl
  2023-12-11 18:10     ` Jason Gunthorpe
@ 2023-12-11 18:49       ` Alex Williamson
  2023-12-12 15:35         ` Jason Gunthorpe
  0 siblings, 1 reply; 31+ messages in thread
From: Alex Williamson @ 2023-12-11 18:49 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Yi Liu, joro, kevin.tian, robin.murphy, baolu.lu, cohuck,
	eric.auger, nicolinc, kvm, mjrosato, chao.p.peng, yi.y.sun,
	peterx, jasowang, shameerali.kolothum.thodi, lulu,
	suravee.suthikulpanit, iommu, linux-kernel, linux-kselftest,
	zhenzhong.duan, joao.m.martins, xin.zeng, yan.y.zhao

On Mon, 11 Dec 2023 14:10:28 -0400
Jason Gunthorpe <jgg@nvidia.com> wrote:

> On Mon, Dec 11, 2023 at 11:03:45AM -0700, Alex Williamson wrote:
> > On Sun, 26 Nov 2023 22:39:09 -0800
> > Yi Liu <yi.l.liu@intel.com> wrote:

> > >    the PF). Creating a virtual PASID capability in vfio-pci config space needs
> > >    to find a hole to place it, but doing so may require device specific
> > >    knowledge to avoid potential conflict with device specific registers like
> > >    hiden bits in VF config space. It's simpler by moving this burden to the
> > >    VMM instead of maintaining a quirk system in the kernel.  
> > 
> > This feels a bit like an incomplete solution though and we might
> > already posses device specific knowledge in the form of a variant
> > driver.  Should this feature structure include a flag + field that
> > could serve to generically indicate to the VMM a location for
> > implementing the PASID capability?  The default core implementation
> > might fill this only for PFs where clearly an emualted PASID capability
> > can overlap the physical capability.  Thanks,  
> 
> In many ways I would perfer to solve this for good by having a way to
> learn a range of available config space - I liked the suggestion to
> use a DVSEC to mark empty space.

Yes, DVSEC is the most plausible option for the device itself to convey
unused config space, but that requires hardware adoption so presumably
we're going to need to fill the gaps with device specific code.  That
code might live in a variant driver or in the VMM.  If we have faith
that DVSEC is the way, it'd make sense for a variant driver to
implement a virtual DVSEC to work out the QEMU implementation and set a
precedent.

I mostly just want us to recognize that this feature structure also has
the possibility to fill this gap and we're consciously passing it over
and should maybe formally propose the DVSEC solution and reference it
in the commit log or comments here to provide a complete picture.
Thanks,

Alex


^ permalink raw reply	[flat|nested] 31+ messages in thread

* RE: [PATCH 3/3] vfio: Report PASID capability via VFIO_DEVICE_FEATURE ioctl
  2023-12-11 18:03   ` Alex Williamson
  2023-12-11 18:10     ` Jason Gunthorpe
@ 2023-12-12  2:16     ` Tian, Kevin
  2023-12-12  3:44       ` Yi Liu
  2023-12-12  2:43     ` Duan, Zhenzhong
  2 siblings, 1 reply; 31+ messages in thread
From: Tian, Kevin @ 2023-12-12  2:16 UTC (permalink / raw)
  To: Alex Williamson, Liu, Yi L
  Cc: joro, jgg, robin.murphy, baolu.lu, cohuck, eric.auger, nicolinc,
	kvm, mjrosato, chao.p.peng, yi.y.sun, peterx, jasowang,
	shameerali.kolothum.thodi, lulu, suravee.suthikulpanit, iommu,
	linux-kernel, linux-kselftest, Duan, Zhenzhong, joao.m.martins,
	Zeng, Xin, Zhao, Yan Y

> From: Alex Williamson <alex.williamson@redhat.com>
> Sent: Tuesday, December 12, 2023 2:04 AM
> 
> On Sun, 26 Nov 2023 22:39:09 -0800
> Yi Liu <yi.l.liu@intel.com> wrote:
> 
> > This reports the PASID capability data to userspace via
> VFIO_DEVICE_FEATURE,
> > hence userspace could probe PASID capability by it. This is a bit different
> > with other capabilities which are reported to userspace when the user
> reads
> > the device's PCI configuration space. There are two reasons for this.
> >
> >  - First, Qemu by default exposes all available PCI capabilities in vfio-pci
> >    config space to the guest as read-only, so adding PASID capability in the
> >    vfio-pci config space will make it exposed to the guest automatically while
> >    an old Qemu doesn't really support it.
> 
> Shouldn't we also be working on hiding the PASID capability in QEMU
> ASAP?  This feature only allows QEMU to know PASID control is actually
> available, not the guest.  Maybe we're hoping this is really only used
> by VFs where there's no capability currently exposed to the guest?

We expect this to be used by both PF/VF. It doesn't make sense to have
separate interfaces between them.

I'm not aware of that the PASID capability has been exported today. So
yes we should fix QEMU asap. and also remove the line exposing it
in vfio_pci_config.c.

> 
> >  - Second, PASID capability does not exit on VFs (instead shares the cap of
> 
> s/exit/exist/
> 
> >    the PF). Creating a virtual PASID capability in vfio-pci config space needs
> >    to find a hole to place it, but doing so may require device specific
> >    knowledge to avoid potential conflict with device specific registers like
> >    hiden bits in VF config space. It's simpler by moving this burden to the
> >    VMM instead of maintaining a quirk system in the kernel.
> 
> This feels a bit like an incomplete solution though and we might
> already posses device specific knowledge in the form of a variant
> driver.  Should this feature structure include a flag + field that
> could serve to generically indicate to the VMM a location for
> implementing the PASID capability?  The default core implementation
> might fill this only for PFs where clearly an emualted PASID capability
> can overlap the physical capability.  Thanks,
> 

make sense

^ permalink raw reply	[flat|nested] 31+ messages in thread

* RE: [PATCH 3/3] vfio: Report PASID capability via VFIO_DEVICE_FEATURE ioctl
  2023-12-11  8:08     ` Yi Liu
@ 2023-12-12  2:20       ` Tian, Kevin
  2023-12-12  3:26         ` Yi Liu
  2023-12-12 15:31         ` Jason Gunthorpe
  0 siblings, 2 replies; 31+ messages in thread
From: Tian, Kevin @ 2023-12-12  2:20 UTC (permalink / raw)
  To: Liu, Yi L, joro, alex.williamson, jgg, robin.murphy, baolu.lu
  Cc: cohuck, eric.auger, nicolinc, kvm, mjrosato, chao.p.peng,
	yi.y.sun, peterx, jasowang, shameerali.kolothum.thodi, lulu,
	suravee.suthikulpanit, iommu, linux-kernel, linux-kselftest,
	Duan, Zhenzhong, joao.m.martins, Zeng, Xin, Zhao, Yan Y

> From: Liu, Yi L <yi.l.liu@intel.com>
> Sent: Monday, December 11, 2023 4:08 PM
> 
> On 2023/12/7 16:47, Tian, Kevin wrote:
> >> From: Liu, Yi L <yi.l.liu@intel.com>
> >> Sent: Monday, November 27, 2023 2:39 PM
> >>
> >> +static int vfio_pci_core_feature_pasid(struct vfio_device *device, u32
> flags,
> >> +				       struct vfio_device_feature_pasid __user
> >> *arg,
> >> +				       size_t argsz)
> >> +{
> >> +	struct vfio_pci_core_device *vdev =
> >> +		container_of(device, struct vfio_pci_core_device, vdev);
> >> +	struct vfio_device_feature_pasid pasid = { 0 };
> >> +	struct pci_dev *pdev = vdev->pdev;
> >> +	u32 capabilities = 0;
> >> +	int ret;
> >> +
> >> +	/* We do not support SET of the PASID capability */
> >
> > this line alone is meaningless. Please explain the reason e.g. due to
> > no PASID capability per VF...
> 
> sure. I think the major reason is we don't allow userspace to change the
> PASID configuration. is it?

if only PF it's still possible to develop a model allowing userspace to
change.

but with VF this is not possible in concept.

> >> +	if (pdev->is_virtfn)
> >> +		pdev = pci_physfn(pdev);
> >> +
> >> +	if (!pdev->pasid_enabled)
> >> +		goto out;
> >> +
> >> +#ifdef CONFIG_PCI_PASID
> >> +	pci_read_config_dword(pdev, pdev->pasid_cap + PCI_PASID_CAP,
> >> +			      &capabilities);
> >> +#endif
> >
> > #ifdef is unnecessary. If CONFIG_PCI_PASID is false pdev->pasid_enabled
> > won't be set anyway.
> 
> it's sad that the pdev->pasid_cap is defined under #if CONFIG_PCI_PASID.
> Perhaps we can have a wrapper for it.

oh I didn't note it.

> 
> > and it should read from PCI_PASID_CTRL which indicates whether a
> > capability is actually enabled.
> 
> yes, for the EXEC and PRIV capability, needs to check if it's enabled or
> not before reporting.
> 
> >
> >> +/**
> >> + * Upon VFIO_DEVICE_FEATURE_GET, return the PASID capability for the
> >> device.
> >> + * Zero width means no support for PASID.
> >
> > also mention the encoding of this field according to PCIe spec.
> 
> yes.
> 
> > or turn it to a plain number field.
> 
> It is not exact the same as the spec since bit0 is reserved. But
> here bit0 is used as well.
> 

what is bit0 used for?

^ permalink raw reply	[flat|nested] 31+ messages in thread

* RE: [PATCH 3/3] vfio: Report PASID capability via VFIO_DEVICE_FEATURE ioctl
  2023-12-11 18:03   ` Alex Williamson
  2023-12-11 18:10     ` Jason Gunthorpe
  2023-12-12  2:16     ` Tian, Kevin
@ 2023-12-12  2:43     ` Duan, Zhenzhong
  2023-12-12  3:39       ` Alex Williamson
  2 siblings, 1 reply; 31+ messages in thread
From: Duan, Zhenzhong @ 2023-12-12  2:43 UTC (permalink / raw)
  To: Alex Williamson, Liu, Yi L
  Cc: joro, jgg, Tian, Kevin, robin.murphy, baolu.lu, cohuck,
	eric.auger, nicolinc, kvm, mjrosato, chao.p.peng, yi.y.sun,
	peterx, jasowang, shameerali.kolothum.thodi, lulu,
	suravee.suthikulpanit, iommu, linux-kernel, linux-kselftest,
	joao.m.martins, Zeng, Xin, Zhao, Yan Y



>-----Original Message-----
>From: Alex Williamson <alex.williamson@redhat.com>
>Sent: Tuesday, December 12, 2023 2:04 AM
>Subject: Re: [PATCH 3/3] vfio: Report PASID capability via
>VFIO_DEVICE_FEATURE ioctl
>
>On Sun, 26 Nov 2023 22:39:09 -0800
>Yi Liu <yi.l.liu@intel.com> wrote:
>
>> This reports the PASID capability data to userspace via
>VFIO_DEVICE_FEATURE,
>> hence userspace could probe PASID capability by it. This is a bit different
>> with other capabilities which are reported to userspace when the user
>reads
>> the device's PCI configuration space. There are two reasons for this.
>>
>>  - First, Qemu by default exposes all available PCI capabilities in vfio-pci
>>    config space to the guest as read-only, so adding PASID capability in the
>>    vfio-pci config space will make it exposed to the guest automatically
>while
>>    an old Qemu doesn't really support it.
>
>Shouldn't we also be working on hiding the PASID capability in QEMU
>ASAP?  This feature only allows QEMU to know PASID control is actually
>available, not the guest.  Maybe we're hoping this is really only used
>by VFs where there's no capability currently exposed to the guest?

PASID capability is not exposed to QEMU through config space,
VFIO_DEVICE_FEATURE ioctl is the only interface to expose PASID
cap to QEMU for both PF and VF.

/*
 * Lengths of PCIe/PCI-X Extended Config Capabilities
 *   0: Removed or masked from the user visible capability list
 *   FF: Variable length
 */
static const u16 pci_ext_cap_length[PCI_EXT_CAP_ID_MAX + 1] = {
...
        [PCI_EXT_CAP_ID_PASID]  =       0,      /* not yet */
}

Thanks
Zhenzhong


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 2/3] vfio: Add VFIO_DEVICE_PASID_[AT|DE]TACH_IOMMUFD_PT
  2023-12-11 17:05   ` Alex Williamson
@ 2023-12-12  3:02     ` Yi Liu
  0 siblings, 0 replies; 31+ messages in thread
From: Yi Liu @ 2023-12-12  3:02 UTC (permalink / raw)
  To: Alex Williamson
  Cc: joro, jgg, kevin.tian, robin.murphy, baolu.lu, cohuck,
	eric.auger, nicolinc, kvm, mjrosato, chao.p.peng, yi.y.sun,
	peterx, jasowang, shameerali.kolothum.thodi, lulu,
	suravee.suthikulpanit, iommu, linux-kernel, linux-kselftest,
	zhenzhong.duan, joao.m.martins, xin.zeng, yan.y.zhao

On 2023/12/12 01:05, Alex Williamson wrote:
> On Sun, 26 Nov 2023 22:39:08 -0800
> Yi Liu <yi.l.liu@intel.com> wrote:
> 
>> This adds ioctls for the userspace to attach a given pasid of a vfio
>> device to/from an IOAS/HWPT.
>>
>> Signed-off-by: Yi Liu <yi.l.liu@intel.com>
>> ---
>>   drivers/vfio/device_cdev.c | 45 +++++++++++++++++++++++++++++++
>>   drivers/vfio/vfio.h        |  4 +++
>>   drivers/vfio/vfio_main.c   |  8 ++++++
>>   include/uapi/linux/vfio.h  | 55 ++++++++++++++++++++++++++++++++++++++
>>   4 files changed, 112 insertions(+)
>>
>> diff --git a/drivers/vfio/device_cdev.c b/drivers/vfio/device_cdev.c
>> index e75da0a70d1f..c2ac7ed44537 100644
>> --- a/drivers/vfio/device_cdev.c
>> +++ b/drivers/vfio/device_cdev.c
>> @@ -210,6 +210,51 @@ int vfio_df_ioctl_detach_pt(struct vfio_device_file *df,
>>   	return 0;
>>   }
>>   
>> +int vfio_df_ioctl_pasid_attach_pt(struct vfio_device_file *df,
>> +				  struct vfio_device_pasid_attach_iommufd_pt __user *arg)
>> +{
>> +	struct vfio_device *device = df->device;
>> +	struct vfio_device_pasid_attach_iommufd_pt attach;
>> +	unsigned long minsz;
>> +	int ret;
>> +
>> +	minsz = offsetofend(struct vfio_device_pasid_attach_iommufd_pt, pt_id);
>> +
>> +	if (copy_from_user(&attach, arg, minsz))
>> +		return -EFAULT;
>> +
>> +	if (attach.argsz < minsz || attach.flags)
>> +		return -EINVAL;
>> +
>> +	mutex_lock(&device->dev_set->lock);
>> +	ret = device->ops->pasid_attach_ioas(device, attach.pasid, &attach.pt_id);
> 
> These callbacks were only implemented for vfio-pci in the previous
> patch but they're called unconditionally.  Thanks,

yes, will correct it and below. thanks for catching it.

> 
> Alex
> 
>> +	mutex_unlock(&device->dev_set->lock);
>> +
>> +	return ret;
>> +}
>> +
>> +int vfio_df_ioctl_pasid_detach_pt(struct vfio_device_file *df,
>> +				  struct vfio_device_pasid_detach_iommufd_pt __user *arg)
>> +{
>> +	struct vfio_device *device = df->device;
>> +	struct vfio_device_pasid_detach_iommufd_pt detach;
>> +	unsigned long minsz;
>> +
>> +	minsz = offsetofend(struct vfio_device_pasid_detach_iommufd_pt, flags);
>> +
>> +	if (copy_from_user(&detach, arg, minsz))
>> +		return -EFAULT;
>> +
>> +	if (detach.argsz < minsz || detach.flags)
>> +		return -EINVAL;
>> +
>> +	mutex_lock(&device->dev_set->lock);
>> +	device->ops->pasid_detach_ioas(device, detach.pasid);
>> +	mutex_unlock(&device->dev_set->lock);
>> +
>> +	return 0;
>> +}
>> +
>>   static char *vfio_device_devnode(const struct device *dev, umode_t *mode)
>>   {
>>   	return kasprintf(GFP_KERNEL, "vfio/devices/%s", dev_name(dev));
>> diff --git a/drivers/vfio/vfio.h b/drivers/vfio/vfio.h
>> index 307e3f29b527..d228cdb6b345 100644
>> --- a/drivers/vfio/vfio.h
>> +++ b/drivers/vfio/vfio.h
>> @@ -353,6 +353,10 @@ int vfio_df_ioctl_attach_pt(struct vfio_device_file *df,
>>   			    struct vfio_device_attach_iommufd_pt __user *arg);
>>   int vfio_df_ioctl_detach_pt(struct vfio_device_file *df,
>>   			    struct vfio_device_detach_iommufd_pt __user *arg);
>> +int vfio_df_ioctl_pasid_attach_pt(struct vfio_device_file *df,
>> +				  struct vfio_device_pasid_attach_iommufd_pt __user *arg);
>> +int vfio_df_ioctl_pasid_detach_pt(struct vfio_device_file *df,
>> +				  struct vfio_device_pasid_detach_iommufd_pt __user *arg);
>>   
>>   #if IS_ENABLED(CONFIG_VFIO_DEVICE_CDEV)
>>   void vfio_init_device_cdev(struct vfio_device *device);
>> diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c
>> index 8d4995ada74a..ff50c239873d 100644
>> --- a/drivers/vfio/vfio_main.c
>> +++ b/drivers/vfio/vfio_main.c
>> @@ -1240,6 +1240,14 @@ static long vfio_device_fops_unl_ioctl(struct file *filep,
>>   		case VFIO_DEVICE_DETACH_IOMMUFD_PT:
>>   			ret = vfio_df_ioctl_detach_pt(df, uptr);
>>   			goto out;
>> +
>> +		case VFIO_DEVICE_PASID_ATTACH_IOMMUFD_PT:
>> +			ret = vfio_df_ioctl_pasid_attach_pt(df, uptr);
>> +			goto out;
>> +
>> +		case VFIO_DEVICE_PASID_DETACH_IOMMUFD_PT:
>> +			ret = vfio_df_ioctl_pasid_detach_pt(df, uptr);
>> +			goto out;
>>   		}
>>   	}
>>   
>> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
>> index 94b3badefde3..495193629029 100644
>> --- a/include/uapi/linux/vfio.h
>> +++ b/include/uapi/linux/vfio.h
>> @@ -977,6 +977,61 @@ struct vfio_device_detach_iommufd_pt {
>>   
>>   #define VFIO_DEVICE_DETACH_IOMMUFD_PT		_IO(VFIO_TYPE, VFIO_BASE + 20)
>>   
>> +/*
>> + * VFIO_DEVICE_PASID_ATTACH_IOMMUFD_PT - _IOW(VFIO_TYPE, VFIO_BASE + 21,
>> + *					      struct vfio_device_pasid_attach_iommufd_pt)
>> + * @argsz:	User filled size of this data.
>> + * @flags:	Must be 0.
>> + * @pasid:	The pasid to be attached.
>> + * @pt_id:	Input the target id which can represent an ioas or a hwpt
>> + *		allocated via iommufd subsystem.
>> + *		Output the input ioas id or the attached hwpt id which could
>> + *		be the specified hwpt itself or a hwpt automatically created
>> + *		for the specified ioas by kernel during the attachment.
>> + *
>> + * Associate a pasid (of a cdev device) with an address space within the
>> + * bound iommufd. Undo by VFIO_DEVICE_PASID_DETACH_IOMMUFD_PT or device fd
>> + * close. This is only allowed on cdev fds.
>> + *
>> + * If a pasid is currently attached to a valid hw_pagetable (hwpt), without
>> + * doing a VFIO_DEVICE_PASID_DETACH_IOMMUFD_PT, a second
>> + * VFIO_DEVICE_PASID_ATTACH_IOMMUFD_PT ioctl passing in another hwpt id is
>> + * allowed. This action, also known as a hwpt replacement, will replace the
>> + * pasid's currently attached hwpt with a new hwpt corresponding to the given
>> + * @pt_id.
>> + *
>> + * Return: 0 on success, -errno on failure.
>> + */
>> +struct vfio_device_pasid_attach_iommufd_pt {
>> +	__u32	argsz;
>> +	__u32	flags;
>> +	__u32	pasid;
>> +	__u32	pt_id;
>> +};
>> +
>> +#define VFIO_DEVICE_PASID_ATTACH_IOMMUFD_PT	_IO(VFIO_TYPE, VFIO_BASE + 21)
>> +
>> +/*
>> + * VFIO_DEVICE_PASID_DETACH_IOMMUFD_PT - _IOW(VFIO_TYPE, VFIO_BASE + 22,
>> + *					      struct vfio_device_pasid_detach_iommufd_pt)
>> + * @argsz:	User filled size of this data.
>> + * @flags:	Must be 0.
>> + * @pasid:	The pasid to be detached.
>> + *
>> + * Remove the association of a pasid (of a cdev device) and its current
>> + * associated address space.  After it, the pasid of the device should be in
>> + * a blocking DMA state.  This is only allowed on cdev fds.
>> + *
>> + * Return: 0 on success, -errno on failure.
>> + */
>> +struct vfio_device_pasid_detach_iommufd_pt {
>> +	__u32	argsz;
>> +	__u32	flags;
>> +	__u32	pasid;
>> +};
>> +
>> +#define VFIO_DEVICE_PASID_DETACH_IOMMUFD_PT	_IO(VFIO_TYPE, VFIO_BASE + 22)
>> +
>>   /*
>>    * Provide support for setting a PCI VF Token, which is used as a shared
>>    * secret between PF and VF drivers.  This feature may only be set on a
> 

-- 
Regards,
Yi Liu

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 3/3] vfio: Report PASID capability via VFIO_DEVICE_FEATURE ioctl
  2023-12-12  2:20       ` Tian, Kevin
@ 2023-12-12  3:26         ` Yi Liu
  2023-12-12 15:31         ` Jason Gunthorpe
  1 sibling, 0 replies; 31+ messages in thread
From: Yi Liu @ 2023-12-12  3:26 UTC (permalink / raw)
  To: Tian, Kevin, joro, alex.williamson, jgg, robin.murphy, baolu.lu
  Cc: cohuck, eric.auger, nicolinc, kvm, mjrosato, chao.p.peng,
	yi.y.sun, peterx, jasowang, shameerali.kolothum.thodi, lulu,
	suravee.suthikulpanit, iommu, linux-kernel, linux-kselftest,
	Duan, Zhenzhong, joao.m.martins, Zeng, Xin, Zhao, Yan Y

On 2023/12/12 10:20, Tian, Kevin wrote:
>> From: Liu, Yi L <yi.l.liu@intel.com>
>> Sent: Monday, December 11, 2023 4:08 PM
>>
>> On 2023/12/7 16:47, Tian, Kevin wrote:
>>>> From: Liu, Yi L <yi.l.liu@intel.com>
>>>> Sent: Monday, November 27, 2023 2:39 PM
>>>>
>>>> +static int vfio_pci_core_feature_pasid(struct vfio_device *device, u32
>> flags,
>>>> +				       struct vfio_device_feature_pasid __user
>>>> *arg,
>>>> +				       size_t argsz)
>>>> +{
>>>> +	struct vfio_pci_core_device *vdev =
>>>> +		container_of(device, struct vfio_pci_core_device, vdev);
>>>> +	struct vfio_device_feature_pasid pasid = { 0 };
>>>> +	struct pci_dev *pdev = vdev->pdev;
>>>> +	u32 capabilities = 0;
>>>> +	int ret;
>>>> +
>>>> +	/* We do not support SET of the PASID capability */
>>>
>>> this line alone is meaningless. Please explain the reason e.g. due to
>>> no PASID capability per VF...
>>
>> sure. I think the major reason is we don't allow userspace to change the
>> PASID configuration. is it?
> 
> if only PF it's still possible to develop a model allowing userspace to
> change.
> 
> but with VF this is not possible in concept.

got it.

> 
>>>> +	if (pdev->is_virtfn)
>>>> +		pdev = pci_physfn(pdev);
>>>> +
>>>> +	if (!pdev->pasid_enabled)
>>>> +		goto out;
>>>> +
>>>> +#ifdef CONFIG_PCI_PASID
>>>> +	pci_read_config_dword(pdev, pdev->pasid_cap + PCI_PASID_CAP,
>>>> +			      &capabilities);
>>>> +#endif
>>>
>>> #ifdef is unnecessary. If CONFIG_PCI_PASID is false pdev->pasid_enabled
>>> won't be set anyway.
>>
>> it's sad that the pdev->pasid_cap is defined under #if CONFIG_PCI_PASID.
>> Perhaps we can have a wrapper for it.
> 
> oh I didn't note it.

If Alex feels better to have a wrapper, we may have one.

>>
>>> and it should read from PCI_PASID_CTRL which indicates whether a
>>> capability is actually enabled.
>>
>> yes, for the EXEC and PRIV capability, needs to check if it's enabled or
>> not before reporting.
>>
>>>
>>>> +/**
>>>> + * Upon VFIO_DEVICE_FEATURE_GET, return the PASID capability for the
>>>> device.
>>>> + * Zero width means no support for PASID.
>>>
>>> also mention the encoding of this field according to PCIe spec.
>>
>> yes.
>>
>>> or turn it to a plain number field.
>>
>> It is not exact the same as the spec since bit0 is reserved. But
>> here bit0 is used as well.
>>
> 
> what is bit0 used for?

it's just been reserved. No usage is mentioned in the latest spec. I don't
know the background neither.

-- 
Regards,
Yi Liu

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 3/3] vfio: Report PASID capability via VFIO_DEVICE_FEATURE ioctl
  2023-12-12  2:43     ` Duan, Zhenzhong
@ 2023-12-12  3:39       ` Alex Williamson
  2023-12-12  3:53         ` Yi Liu
  2023-12-12 15:27         ` Jason Gunthorpe
  0 siblings, 2 replies; 31+ messages in thread
From: Alex Williamson @ 2023-12-12  3:39 UTC (permalink / raw)
  To: Duan, Zhenzhong
  Cc: Liu, Yi L, joro, jgg, Tian, Kevin, robin.murphy, baolu.lu,
	cohuck, eric.auger, nicolinc, kvm, mjrosato, chao.p.peng,
	yi.y.sun, peterx, jasowang, shameerali.kolothum.thodi, lulu,
	suravee.suthikulpanit, iommu, linux-kernel, linux-kselftest,
	joao.m.martins, Zeng, Xin, Zhao, Yan Y

On Tue, 12 Dec 2023 02:43:20 +0000
"Duan, Zhenzhong" <zhenzhong.duan@intel.com> wrote:

> >-----Original Message-----
> >From: Alex Williamson <alex.williamson@redhat.com>
> >Sent: Tuesday, December 12, 2023 2:04 AM
> >Subject: Re: [PATCH 3/3] vfio: Report PASID capability via
> >VFIO_DEVICE_FEATURE ioctl
> >
> >On Sun, 26 Nov 2023 22:39:09 -0800
> >Yi Liu <yi.l.liu@intel.com> wrote:
> >  
> >> This reports the PASID capability data to userspace via  
> >VFIO_DEVICE_FEATURE,  
> >> hence userspace could probe PASID capability by it. This is a bit different
> >> with other capabilities which are reported to userspace when the user  
> >reads  
> >> the device's PCI configuration space. There are two reasons for this.
> >>
> >>  - First, Qemu by default exposes all available PCI capabilities in vfio-pci
> >>    config space to the guest as read-only, so adding PASID capability in the
> >>    vfio-pci config space will make it exposed to the guest automatically  
> >while  
> >>    an old Qemu doesn't really support it.  
> >
> >Shouldn't we also be working on hiding the PASID capability in QEMU
> >ASAP?  This feature only allows QEMU to know PASID control is actually
> >available, not the guest.  Maybe we're hoping this is really only used
> >by VFs where there's no capability currently exposed to the guest?  
> 
> PASID capability is not exposed to QEMU through config space,
> VFIO_DEVICE_FEATURE ioctl is the only interface to expose PASID
> cap to QEMU for both PF and VF.
> 
> /*
>  * Lengths of PCIe/PCI-X Extended Config Capabilities
>  *   0: Removed or masked from the user visible capability list
>  *   FF: Variable length
>  */
> static const u16 pci_ext_cap_length[PCI_EXT_CAP_ID_MAX + 1] = {
> ...
>         [PCI_EXT_CAP_ID_PASID]  =       0,      /* not yet */
> }

Ah, thanks.  The comment made me think is was already exposed and I
didn't double check.  So we really just want to convey the information
of the PASID capability outside of config space because if we pass the
capability itself existing userspace will blindly expose a read-only
version to the guest.  That could be better explained in the commit log
and comments.

So how do we keep up with PCIe spec updates relative to the PASID
capability with this proposal?  Would it make more sense to report the
raw capability register and capability version rather that a translated
copy thereof?  Perhaps just masking the fields we're currently prepared
to expose.  Thanks,

Alex


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 3/3] vfio: Report PASID capability via VFIO_DEVICE_FEATURE ioctl
  2023-12-12  2:16     ` Tian, Kevin
@ 2023-12-12  3:44       ` Yi Liu
  0 siblings, 0 replies; 31+ messages in thread
From: Yi Liu @ 2023-12-12  3:44 UTC (permalink / raw)
  To: Tian, Kevin, Alex Williamson
  Cc: joro, jgg, robin.murphy, baolu.lu, cohuck, eric.auger, nicolinc,
	kvm, mjrosato, chao.p.peng, yi.y.sun, peterx, jasowang,
	shameerali.kolothum.thodi, lulu, suravee.suthikulpanit, iommu,
	linux-kernel, linux-kselftest, Duan, Zhenzhong, joao.m.martins,
	Zeng, Xin, Zhao, Yan Y

On 2023/12/12 10:16, Tian, Kevin wrote:
>> From: Alex Williamson <alex.williamson@redhat.com>
>> Sent: Tuesday, December 12, 2023 2:04 AM
>>
>> On Sun, 26 Nov 2023 22:39:09 -0800
>> Yi Liu <yi.l.liu@intel.com> wrote:
>>
>>> This reports the PASID capability data to userspace via
>> VFIO_DEVICE_FEATURE,
>>> hence userspace could probe PASID capability by it. This is a bit different
>>> with other capabilities which are reported to userspace when the user
>> reads
>>> the device's PCI configuration space. There are two reasons for this.
>>>
>>>   - First, Qemu by default exposes all available PCI capabilities in vfio-pci
>>>     config space to the guest as read-only, so adding PASID capability in the
>>>     vfio-pci config space will make it exposed to the guest automatically while
>>>     an old Qemu doesn't really support it.
>>
>> Shouldn't we also be working on hiding the PASID capability in QEMU
>> ASAP?  This feature only allows QEMU to know PASID control is actually
>> available, not the guest.  Maybe we're hoping this is really only used
>> by VFs where there's no capability currently exposed to the guest?
> 
> We expect this to be used by both PF/VF. It doesn't make sense to have
> separate interfaces between them.
> 
> I'm not aware of that the PASID capability has been exported today. So
> yes we should fix QEMU asap. and also remove the line exposing it
> in vfio_pci_config.c.

Kernel side hides the PASID capability by setting its length as 0
in the below array. As a result, QEMU wont see it in the cap chain.
Do you mean we need to let QEMU always ignore it even if kernel side
does not hide it?

static const u16 pci_ext_cap_length[PCI_EXT_CAP_ID_MAX + 1] = {
	...
	[PCI_EXT_CAP_ID_PASID]  =       0,      /* not yet */
	...
};

So far, kernel is still hiding it.

> 
>>
>>>   - Second, PASID capability does not exit on VFs (instead shares the cap of
>>
>> s/exit/exist/
>>
>>>     the PF). Creating a virtual PASID capability in vfio-pci config space needs
>>>     to find a hole to place it, but doing so may require device specific
>>>     knowledge to avoid potential conflict with device specific registers like
>>>     hiden bits in VF config space. It's simpler by moving this burden to the
>>>     VMM instead of maintaining a quirk system in the kernel.
>>
>> This feels a bit like an incomplete solution though and we might
>> already posses device specific knowledge in the form of a variant
>> driver.  Should this feature structure include a flag + field that
>> could serve to generically indicate to the VMM a location for
>> implementing the PASID capability?  The default core implementation
>> might fill this only for PFs where clearly an emualted PASID capability
>> can overlap the physical capability.  Thanks,
>>
> 
> make sense

A location maybe not enough, may also need to know if any successive
cap, so that we can insert the capability into the cap chain.

-- 
Regards,
Yi Liu

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 3/3] vfio: Report PASID capability via VFIO_DEVICE_FEATURE ioctl
  2023-12-12  3:39       ` Alex Williamson
@ 2023-12-12  3:53         ` Yi Liu
  2023-12-12 15:27         ` Jason Gunthorpe
  1 sibling, 0 replies; 31+ messages in thread
From: Yi Liu @ 2023-12-12  3:53 UTC (permalink / raw)
  To: Alex Williamson, Duan, Zhenzhong
  Cc: joro, jgg, Tian, Kevin, robin.murphy, baolu.lu, cohuck,
	eric.auger, nicolinc, kvm, mjrosato, chao.p.peng, yi.y.sun,
	peterx, jasowang, shameerali.kolothum.thodi, lulu,
	suravee.suthikulpanit, iommu, linux-kernel, linux-kselftest,
	joao.m.martins, Zeng, Xin, Zhao, Yan Y

On 2023/12/12 11:39, Alex Williamson wrote:
> On Tue, 12 Dec 2023 02:43:20 +0000
> "Duan, Zhenzhong" <zhenzhong.duan@intel.com> wrote:
> 
>>> -----Original Message-----
>>> From: Alex Williamson <alex.williamson@redhat.com>
>>> Sent: Tuesday, December 12, 2023 2:04 AM
>>> Subject: Re: [PATCH 3/3] vfio: Report PASID capability via
>>> VFIO_DEVICE_FEATURE ioctl
>>>
>>> On Sun, 26 Nov 2023 22:39:09 -0800
>>> Yi Liu <yi.l.liu@intel.com> wrote:
>>>   
>>>> This reports the PASID capability data to userspace via
>>> VFIO_DEVICE_FEATURE,
>>>> hence userspace could probe PASID capability by it. This is a bit different
>>>> with other capabilities which are reported to userspace when the user
>>> reads
>>>> the device's PCI configuration space. There are two reasons for this.
>>>>
>>>>   - First, Qemu by default exposes all available PCI capabilities in vfio-pci
>>>>     config space to the guest as read-only, so adding PASID capability in the
>>>>     vfio-pci config space will make it exposed to the guest automatically
>>> while
>>>>     an old Qemu doesn't really support it.
>>>
>>> Shouldn't we also be working on hiding the PASID capability in QEMU
>>> ASAP?  This feature only allows QEMU to know PASID control is actually
>>> available, not the guest.  Maybe we're hoping this is really only used
>>> by VFs where there's no capability currently exposed to the guest?
>>
>> PASID capability is not exposed to QEMU through config space,
>> VFIO_DEVICE_FEATURE ioctl is the only interface to expose PASID
>> cap to QEMU for both PF and VF.
>>
>> /*
>>   * Lengths of PCIe/PCI-X Extended Config Capabilities
>>   *   0: Removed or masked from the user visible capability list
>>   *   FF: Variable length
>>   */
>> static const u16 pci_ext_cap_length[PCI_EXT_CAP_ID_MAX + 1] = {
>> ...
>>          [PCI_EXT_CAP_ID_PASID]  =       0,      /* not yet */
>> }
> 
> Ah, thanks.  The comment made me think is was already exposed and I
> didn't double check.  So we really just want to convey the information
> of the PASID capability outside of config space because if we pass the
> capability itself existing userspace will blindly expose a read-only
> version to the guest.  That could be better explained in the commit log
> and comments.

aha, yes. It was mentioned there, but seems not quite clear. Will refine. :)

  - First, Qemu by default exposes all available PCI capabilities in vfio-pci
    config space to the guest as read-only, so adding PASID capability in the
    vfio-pci config space will make it exposed to the guest automatically while
    an old Qemu doesn't really support it.


> So how do we keep up with PCIe spec updates relative to the PASID
> capability with this proposal?  Would it make more sense to report the
> raw capability register and capability version rather that a translated
> copy thereof?  Perhaps just masking the fields we're currently prepared
> to expose.  Thanks,

I have a minor concern on reporting raw capability register and capability
version. In this way, an old host kernel (supports version 1 pasid cap)
running on top of new hw which supports say version 2 pasid capability, the
VM would see the new capabilities that host kernel does not know. Is it
good?

-- 
Regards,
Yi Liu

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 3/3] vfio: Report PASID capability via VFIO_DEVICE_FEATURE ioctl
  2023-12-12  3:39       ` Alex Williamson
  2023-12-12  3:53         ` Yi Liu
@ 2023-12-12 15:27         ` Jason Gunthorpe
  2024-01-15  8:20           ` Yi Liu
  1 sibling, 1 reply; 31+ messages in thread
From: Jason Gunthorpe @ 2023-12-12 15:27 UTC (permalink / raw)
  To: Alex Williamson
  Cc: Duan, Zhenzhong, Liu, Yi L, joro, Tian, Kevin, robin.murphy,
	baolu.lu, cohuck, eric.auger, nicolinc, kvm, mjrosato,
	chao.p.peng, yi.y.sun, peterx, jasowang,
	shameerali.kolothum.thodi, lulu, suravee.suthikulpanit, iommu,
	linux-kernel, linux-kselftest, joao.m.martins, Zeng, Xin, Zhao,
	Yan Y

On Mon, Dec 11, 2023 at 08:39:46PM -0700, Alex Williamson wrote:

> So how do we keep up with PCIe spec updates relative to the PASID
> capability with this proposal?  Would it make more sense to report the
> raw capability register and capability version rather that a translated
> copy thereof?  Perhaps just masking the fields we're currently prepared
> to expose. 

I think the VMM must always create a cap based on the PCIe version it
understands. We don't know what future specs will put there so it
seems risky to forward it if we don't know that any possible
hypervisor support is present.

We have this problem on and off where stuff in PCI config space needs
explicit hypervisor support or it doesn't work in the VM and things
get confusing.

Jason

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 3/3] vfio: Report PASID capability via VFIO_DEVICE_FEATURE ioctl
  2023-12-12  2:20       ` Tian, Kevin
  2023-12-12  3:26         ` Yi Liu
@ 2023-12-12 15:31         ` Jason Gunthorpe
  2023-12-13  1:59           ` Tian, Kevin
  1 sibling, 1 reply; 31+ messages in thread
From: Jason Gunthorpe @ 2023-12-12 15:31 UTC (permalink / raw)
  To: Tian, Kevin
  Cc: Liu, Yi L, joro, alex.williamson, robin.murphy, baolu.lu, cohuck,
	eric.auger, nicolinc, kvm, mjrosato, chao.p.peng, yi.y.sun,
	peterx, jasowang, shameerali.kolothum.thodi, lulu,
	suravee.suthikulpanit, iommu, linux-kernel, linux-kselftest,
	Duan, Zhenzhong, joao.m.martins, Zeng, Xin, Zhao, Yan Y

On Tue, Dec 12, 2023 at 02:20:01AM +0000, Tian, Kevin wrote:
> > From: Liu, Yi L <yi.l.liu@intel.com>
> > Sent: Monday, December 11, 2023 4:08 PM
> > 
> > On 2023/12/7 16:47, Tian, Kevin wrote:
> > >> From: Liu, Yi L <yi.l.liu@intel.com>
> > >> Sent: Monday, November 27, 2023 2:39 PM
> > >>
> > >> +static int vfio_pci_core_feature_pasid(struct vfio_device *device, u32
> > flags,
> > >> +				       struct vfio_device_feature_pasid __user
> > >> *arg,
> > >> +				       size_t argsz)
> > >> +{
> > >> +	struct vfio_pci_core_device *vdev =
> > >> +		container_of(device, struct vfio_pci_core_device, vdev);
> > >> +	struct vfio_device_feature_pasid pasid = { 0 };
> > >> +	struct pci_dev *pdev = vdev->pdev;
> > >> +	u32 capabilities = 0;
> > >> +	int ret;
> > >> +
> > >> +	/* We do not support SET of the PASID capability */
> > >
> > > this line alone is meaningless. Please explain the reason e.g. due to
> > > no PASID capability per VF...
> > 
> > sure. I think the major reason is we don't allow userspace to change the
> > PASID configuration. is it?
> 
> if only PF it's still possible to develop a model allowing userspace to
> change.

More importantly the primary purpose of setting the PASID width is
because of the physical properties of the IOMMU HW.

IOMMU HW that supports virtualization should do so in a way that the
PASID with can be globally set to some value the hypervisor is aware
the HW can decode in all cases.

The VM should have no way to make the HW ignore (vs check for zero)
upper bits of the PASID that would require the physical PASID bits to
be reduced.

So we should never allow programming of this, VMM just fakes it and
ignores sets.

Similar argument for enable, IOMMU HW supporting virtualization should
always be able to decode PASID and reject PASID TLPs if the VM hasn't
configured the vIOMMU to decode them. The purpose of the disable bit
is to accommodate IOMMU HW that cannot decode the PASID TLP at all and
would become confused.

Jason

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 3/3] vfio: Report PASID capability via VFIO_DEVICE_FEATURE ioctl
  2023-12-11 18:49       ` Alex Williamson
@ 2023-12-12 15:35         ` Jason Gunthorpe
  2023-12-13  2:10           ` Tian, Kevin
  2024-01-15  9:49           ` Yi Liu
  0 siblings, 2 replies; 31+ messages in thread
From: Jason Gunthorpe @ 2023-12-12 15:35 UTC (permalink / raw)
  To: Alex Williamson
  Cc: Yi Liu, joro, kevin.tian, robin.murphy, baolu.lu, cohuck,
	eric.auger, nicolinc, kvm, mjrosato, chao.p.peng, yi.y.sun,
	peterx, jasowang, shameerali.kolothum.thodi, lulu,
	suravee.suthikulpanit, iommu, linux-kernel, linux-kselftest,
	zhenzhong.duan, joao.m.martins, xin.zeng, yan.y.zhao

On Mon, Dec 11, 2023 at 11:49:49AM -0700, Alex Williamson wrote:
> On Mon, 11 Dec 2023 14:10:28 -0400
> Jason Gunthorpe <jgg@nvidia.com> wrote:
> 
> > On Mon, Dec 11, 2023 at 11:03:45AM -0700, Alex Williamson wrote:
> > > On Sun, 26 Nov 2023 22:39:09 -0800
> > > Yi Liu <yi.l.liu@intel.com> wrote:
> 
> > > >    the PF). Creating a virtual PASID capability in vfio-pci config space needs
> > > >    to find a hole to place it, but doing so may require device specific
> > > >    knowledge to avoid potential conflict with device specific registers like
> > > >    hiden bits in VF config space. It's simpler by moving this burden to the
> > > >    VMM instead of maintaining a quirk system in the kernel.  
> > > 
> > > This feels a bit like an incomplete solution though and we might
> > > already posses device specific knowledge in the form of a variant
> > > driver.  Should this feature structure include a flag + field that
> > > could serve to generically indicate to the VMM a location for
> > > implementing the PASID capability?  The default core implementation
> > > might fill this only for PFs where clearly an emualted PASID capability
> > > can overlap the physical capability.  Thanks,  
> > 
> > In many ways I would perfer to solve this for good by having a way to
> > learn a range of available config space - I liked the suggestion to
> > use a DVSEC to mark empty space.
> 
> Yes, DVSEC is the most plausible option for the device itself to convey
> unused config space, but that requires hardware adoption so presumably
> we're going to need to fill the gaps with device specific code.  That
> code might live in a variant driver or in the VMM.  If we have faith
> that DVSEC is the way, it'd make sense for a variant driver to
> implement a virtual DVSEC to work out the QEMU implementation and set a
> precedent.

How hard do you think it would be for the kernel to synthesize the
dvsec if the varient driver can provide a range for it?

On the other hand I'm not so keen on having variant drivers that are
only doing this just to avoid a table in qemu :\ It seems like a
reasonable thing to add to existing drivers, though none of them
support PASID yet..

> I mostly just want us to recognize that this feature structure also has
> the possibility to fill this gap and we're consciously passing it over
> and should maybe formally propose the DVSEC solution and reference it
> in the commit log or comments here to provide a complete picture.

You mean by passing an explicit empty range or something in a feature
IOCTL?

Jason

^ permalink raw reply	[flat|nested] 31+ messages in thread

* RE: [PATCH 3/3] vfio: Report PASID capability via VFIO_DEVICE_FEATURE ioctl
  2023-12-12 15:31         ` Jason Gunthorpe
@ 2023-12-13  1:59           ` Tian, Kevin
  0 siblings, 0 replies; 31+ messages in thread
From: Tian, Kevin @ 2023-12-13  1:59 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Liu, Yi L, joro, alex.williamson, robin.murphy, baolu.lu, cohuck,
	eric.auger, nicolinc, kvm, mjrosato, chao.p.peng, yi.y.sun,
	peterx, jasowang, shameerali.kolothum.thodi, lulu,
	suravee.suthikulpanit, iommu, linux-kernel, linux-kselftest,
	Duan, Zhenzhong, joao.m.martins, Zeng, Xin, Zhao, Yan Y

> From: Jason Gunthorpe <jgg@nvidia.com>
> Sent: Tuesday, December 12, 2023 11:32 PM
> 
> On Tue, Dec 12, 2023 at 02:20:01AM +0000, Tian, Kevin wrote:
> > > From: Liu, Yi L <yi.l.liu@intel.com>
> > > Sent: Monday, December 11, 2023 4:08 PM
> > >
> > > On 2023/12/7 16:47, Tian, Kevin wrote:
> > > >> From: Liu, Yi L <yi.l.liu@intel.com>
> > > >> Sent: Monday, November 27, 2023 2:39 PM
> > > >>
> > > >> +static int vfio_pci_core_feature_pasid(struct vfio_device *device, u32
> > > flags,
> > > >> +				       struct vfio_device_feature_pasid __user
> > > >> *arg,
> > > >> +				       size_t argsz)
> > > >> +{
> > > >> +	struct vfio_pci_core_device *vdev =
> > > >> +		container_of(device, struct vfio_pci_core_device, vdev);
> > > >> +	struct vfio_device_feature_pasid pasid = { 0 };
> > > >> +	struct pci_dev *pdev = vdev->pdev;
> > > >> +	u32 capabilities = 0;
> > > >> +	int ret;
> > > >> +
> > > >> +	/* We do not support SET of the PASID capability */
> > > >
> > > > this line alone is meaningless. Please explain the reason e.g. due to
> > > > no PASID capability per VF...
> > >
> > > sure. I think the major reason is we don't allow userspace to change the
> > > PASID configuration. is it?
> >
> > if only PF it's still possible to develop a model allowing userspace to
> > change.
> 
> More importantly the primary purpose of setting the PASID width is
> because of the physical properties of the IOMMU HW.
> 
> IOMMU HW that supports virtualization should do so in a way that the
> PASID with can be globally set to some value the hypervisor is aware
> the HW can decode in all cases.
> 
> The VM should have no way to make the HW ignore (vs check for zero)
> upper bits of the PASID that would require the physical PASID bits to
> be reduced.
> 
> So we should never allow programming of this, VMM just fakes it and
> ignores sets.

PASID width is read-only so certainly sets should be ignored

> 
> Similar argument for enable, IOMMU HW supporting virtualization should
> always be able to decode PASID and reject PASID TLPs if the VM hasn't
> configured the vIOMMU to decode them. The purpose of the disable bit
> is to accommodate IOMMU HW that cannot decode the PASID TLP at all and
> would become confused.
> 

Yes, this explains why disallowing userspace to change doesn't cause
problem in this series. My earlier point was just that allowing userspace
to change could be implemented for PF (though unnecessary with your
explanation) to mimic the hardware behavior.

^ permalink raw reply	[flat|nested] 31+ messages in thread

* RE: [PATCH 3/3] vfio: Report PASID capability via VFIO_DEVICE_FEATURE ioctl
  2023-12-12 15:35         ` Jason Gunthorpe
@ 2023-12-13  2:10           ` Tian, Kevin
  2024-01-15  9:49           ` Yi Liu
  1 sibling, 0 replies; 31+ messages in thread
From: Tian, Kevin @ 2023-12-13  2:10 UTC (permalink / raw)
  To: Jason Gunthorpe, Alex Williamson
  Cc: Liu, Yi L, joro, robin.murphy, baolu.lu, cohuck, eric.auger,
	nicolinc, kvm, mjrosato, chao.p.peng, yi.y.sun, peterx, jasowang,
	shameerali.kolothum.thodi, lulu, suravee.suthikulpanit, iommu,
	linux-kernel, linux-kselftest, Duan, Zhenzhong, joao.m.martins,
	Zeng, Xin, Zhao, Yan Y

> From: Jason Gunthorpe <jgg@nvidia.com>
> Sent: Tuesday, December 12, 2023 11:35 PM
> 
> On Mon, Dec 11, 2023 at 11:49:49AM -0700, Alex Williamson wrote:
> > On Mon, 11 Dec 2023 14:10:28 -0400
> > Jason Gunthorpe <jgg@nvidia.com> wrote:
> >
> > > On Mon, Dec 11, 2023 at 11:03:45AM -0700, Alex Williamson wrote:
> > > > On Sun, 26 Nov 2023 22:39:09 -0800
> > > > Yi Liu <yi.l.liu@intel.com> wrote:
> >
> > > > >    the PF). Creating a virtual PASID capability in vfio-pci config space
> needs
> > > > >    to find a hole to place it, but doing so may require device specific
> > > > >    knowledge to avoid potential conflict with device specific registers
> like
> > > > >    hiden bits in VF config space. It's simpler by moving this burden to
> the
> > > > >    VMM instead of maintaining a quirk system in the kernel.
> > > >
> > > > This feels a bit like an incomplete solution though and we might
> > > > already posses device specific knowledge in the form of a variant
> > > > driver.  Should this feature structure include a flag + field that
> > > > could serve to generically indicate to the VMM a location for
> > > > implementing the PASID capability?  The default core implementation
> > > > might fill this only for PFs where clearly an emualted PASID capability
> > > > can overlap the physical capability.  Thanks,
> > >
> > > In many ways I would perfer to solve this for good by having a way to
> > > learn a range of available config space - I liked the suggestion to
> > > use a DVSEC to mark empty space.
> >
> > Yes, DVSEC is the most plausible option for the device itself to convey
> > unused config space, but that requires hardware adoption so presumably
> > we're going to need to fill the gaps with device specific code.  That
> > code might live in a variant driver or in the VMM.  If we have faith
> > that DVSEC is the way, it'd make sense for a variant driver to
> > implement a virtual DVSEC to work out the QEMU implementation and set
> a
> > precedent.
> 
> How hard do you think it would be for the kernel to synthesize the
> dvsec if the varient driver can provide a range for it?
> 
> On the other hand I'm not so keen on having variant drivers that are
> only doing this just to avoid a table in qemu :\ It seems like a

me too. If we really want something like this I'd prefer to tracking a
table of device specific ranges instead of requesting full-fledged
variant drivers.

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 3/3] vfio: Report PASID capability via VFIO_DEVICE_FEATURE ioctl
  2023-12-12 15:27         ` Jason Gunthorpe
@ 2024-01-15  8:20           ` Yi Liu
  0 siblings, 0 replies; 31+ messages in thread
From: Yi Liu @ 2024-01-15  8:20 UTC (permalink / raw)
  To: Jason Gunthorpe, Alex Williamson
  Cc: Duan, Zhenzhong, joro, Tian, Kevin, robin.murphy, baolu.lu,
	cohuck, eric.auger, nicolinc, kvm, mjrosato, chao.p.peng,
	yi.y.sun, peterx, jasowang, shameerali.kolothum.thodi, lulu,
	suravee.suthikulpanit, iommu, linux-kernel, linux-kselftest,
	joao.m.martins, Zeng, Xin, Zhao, Yan Y

On 2023/12/12 23:27, Jason Gunthorpe wrote:
> On Mon, Dec 11, 2023 at 08:39:46PM -0700, Alex Williamson wrote:
> 
>> So how do we keep up with PCIe spec updates relative to the PASID
>> capability with this proposal?  Would it make more sense to report the
>> raw capability register and capability version rather that a translated
>> copy thereof?  Perhaps just masking the fields we're currently prepared
>> to expose.
> 
> I think the VMM must always create a cap based on the PCIe version it
> understands. We don't know what future specs will put there so it
> seems risky to forward it if we don't know that any possible
> hypervisor support is present.

This series parses the capability register and reports the known caps
to user. While this does not include the version number, userspace should
decide the proper version number. Seems like what you suggests here.

> 
> We have this problem on and off where stuff in PCI config space needs
> explicit hypervisor support or it doesn't work in the VM and things
> get confusing.
> 
> Jason

-- 
Regards,
Yi Liu

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 3/3] vfio: Report PASID capability via VFIO_DEVICE_FEATURE ioctl
  2023-12-12 15:35         ` Jason Gunthorpe
  2023-12-13  2:10           ` Tian, Kevin
@ 2024-01-15  9:49           ` Yi Liu
  1 sibling, 0 replies; 31+ messages in thread
From: Yi Liu @ 2024-01-15  9:49 UTC (permalink / raw)
  To: Jason Gunthorpe, Alex Williamson
  Cc: joro, kevin.tian, robin.murphy, baolu.lu, cohuck, eric.auger,
	nicolinc, kvm, mjrosato, chao.p.peng, yi.y.sun, peterx, jasowang,
	shameerali.kolothum.thodi, lulu, suravee.suthikulpanit, iommu,
	linux-kernel, linux-kselftest, zhenzhong.duan, joao.m.martins,
	xin.zeng, yan.y.zhao

On 2023/12/12 23:35, Jason Gunthorpe wrote:
> On Mon, Dec 11, 2023 at 11:49:49AM -0700, Alex Williamson wrote:
>> On Mon, 11 Dec 2023 14:10:28 -0400
>> Jason Gunthorpe <jgg@nvidia.com> wrote:
>>
>>> On Mon, Dec 11, 2023 at 11:03:45AM -0700, Alex Williamson wrote:
>>>> On Sun, 26 Nov 2023 22:39:09 -0800
>>>> Yi Liu <yi.l.liu@intel.com> wrote:
>>
>>>>>     the PF). Creating a virtual PASID capability in vfio-pci config space needs
>>>>>     to find a hole to place it, but doing so may require device specific
>>>>>     knowledge to avoid potential conflict with device specific registers like
>>>>>     hiden bits in VF config space. It's simpler by moving this burden to the
>>>>>     VMM instead of maintaining a quirk system in the kernel.
>>>>
>>>> This feels a bit like an incomplete solution though and we might
>>>> already posses device specific knowledge in the form of a variant
>>>> driver.  Should this feature structure include a flag + field that
>>>> could serve to generically indicate to the VMM a location for
>>>> implementing the PASID capability?  The default core implementation
>>>> might fill this only for PFs where clearly an emualted PASID capability
>>>> can overlap the physical capability.  Thanks,
>>>
>>> In many ways I would perfer to solve this for good by having a way to
>>> learn a range of available config space - I liked the suggestion to
>>> use a DVSEC to mark empty space.
>>
>> Yes, DVSEC is the most plausible option for the device itself to convey
>> unused config space, but that requires hardware adoption so presumably
>> we're going to need to fill the gaps with device specific code.  That
>> code might live in a variant driver or in the VMM.  If we have faith
>> that DVSEC is the way, it'd make sense for a variant driver to
>> implement a virtual DVSEC to work out the QEMU implementation and set a
>> precedent.
> 
> How hard do you think it would be for the kernel to synthesize the
> dvsec if the varient driver can provide a range for it?
> 
> On the other hand I'm not so keen on having variant drivers that are
> only doing this just to avoid a table in qemu :\ It seems like a
> reasonable thing to add to existing drivers, though none of them
> support PASID yet..
> 
>> I mostly just want us to recognize that this feature structure also has
>> the possibility to fill this gap and we're consciously passing it over
>> and should maybe formally propose the DVSEC solution and reference it
>> in the commit log or comments here to provide a complete picture.
> 
> You mean by passing an explicit empty range or something in a feature
> IOCTL?

Hi Alex,

Any more suggestion on this? It appears to me that you are fine with PF
to implement the virtual PASID capability in the same offset with physical
PASID capability, while other cases need a way to know where to put the
virtual PASID capability. This may be done by a DVSEC or just pass empty
ranges through the VFIO_DEVICE_FEATURE ioctl?

Regards,
Yi Liu

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 1/3] vfio-iommufd: Support pasid [at|de]tach for physical VFIO devices
  2023-11-27  6:39 ` [PATCH 1/3] vfio-iommufd: Support pasid [at|de]tach for physical VFIO devices Yi Liu
@ 2024-01-15 17:18   ` Jason Gunthorpe
  0 siblings, 0 replies; 31+ messages in thread
From: Jason Gunthorpe @ 2024-01-15 17:18 UTC (permalink / raw)
  To: Yi Liu
  Cc: joro, alex.williamson, kevin.tian, robin.murphy, baolu.lu,
	cohuck, eric.auger, nicolinc, kvm, mjrosato, chao.p.peng,
	yi.y.sun, peterx, jasowang, shameerali.kolothum.thodi, lulu,
	suravee.suthikulpanit, iommu, linux-kernel, linux-kselftest,
	zhenzhong.duan, joao.m.martins, xin.zeng, yan.y.zhao

On Sun, Nov 26, 2023 at 10:39:07PM -0800, Yi Liu wrote:

> @@ -168,6 +180,42 @@ void vfio_iommufd_physical_detach_ioas(struct vfio_device *vdev)
>  }
>  EXPORT_SYMBOL_GPL(vfio_iommufd_physical_detach_ioas);
>  
> +int vfio_iommufd_physical_pasid_attach_ioas(struct vfio_device *vdev,
> +					    u32 pasid, u32 *pt_id)
> +{
> +	void *entry;
> +	int rc;
> +
> +	lockdep_assert_held(&vdev->dev_set->lock);
> +
> +	if (WARN_ON(!vdev->iommufd_device))
> +		return -EINVAL;
> +
> +	entry = xa_load(&vdev->pasid_pts, pasid);
> +	if (xa_is_value(entry))
> +		rc = iommufd_device_pasid_replace(vdev->iommufd_device, pasid, pt_id);
> +	else
> +		rc = iommufd_device_pasid_attach(vdev->iommufd_device, pasid, pt_id);

An ida is a more approriate data structure if the only point is to
keep track if a pasid is in use or not..

Jason

^ permalink raw reply	[flat|nested] 31+ messages in thread

end of thread, other threads:[~2024-01-15 17:18 UTC | newest]

Thread overview: 31+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-11-27  6:39 [PATCH 0/3] vfio-pci support pasid attach/detach Yi Liu
2023-11-27  6:39 ` [PATCH 1/3] vfio-iommufd: Support pasid [at|de]tach for physical VFIO devices Yi Liu
2024-01-15 17:18   ` Jason Gunthorpe
2023-11-27  6:39 ` [PATCH 2/3] vfio: Add VFIO_DEVICE_PASID_[AT|DE]TACH_IOMMUFD_PT Yi Liu
2023-11-27  6:50   ` Duan, Zhenzhong
2023-11-28  3:06     ` Yi Liu
2023-12-11 17:05   ` Alex Williamson
2023-12-12  3:02     ` Yi Liu
2023-11-27  6:39 ` [PATCH 3/3] vfio: Report PASID capability via VFIO_DEVICE_FEATURE ioctl Yi Liu
2023-11-27  7:28   ` Duan, Zhenzhong
2023-11-28  3:11     ` Yi Liu
2023-11-28  4:23       ` Duan, Zhenzhong
2023-12-07  8:47   ` Tian, Kevin
2023-12-11  8:08     ` Yi Liu
2023-12-12  2:20       ` Tian, Kevin
2023-12-12  3:26         ` Yi Liu
2023-12-12 15:31         ` Jason Gunthorpe
2023-12-13  1:59           ` Tian, Kevin
2023-12-11 18:03   ` Alex Williamson
2023-12-11 18:10     ` Jason Gunthorpe
2023-12-11 18:49       ` Alex Williamson
2023-12-12 15:35         ` Jason Gunthorpe
2023-12-13  2:10           ` Tian, Kevin
2024-01-15  9:49           ` Yi Liu
2023-12-12  2:16     ` Tian, Kevin
2023-12-12  3:44       ` Yi Liu
2023-12-12  2:43     ` Duan, Zhenzhong
2023-12-12  3:39       ` Alex Williamson
2023-12-12  3:53         ` Yi Liu
2023-12-12 15:27         ` Jason Gunthorpe
2024-01-15  8:20           ` Yi Liu

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).