All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] vfio/pci: Add support for PCIe extended capabilities
@ 2021-08-10  6:25 Vivek Gautam
  2021-08-31 14:54 ` Will Deacon
                   ` (2 more replies)
  0 siblings, 3 replies; 15+ messages in thread
From: Vivek Gautam @ 2021-08-10  6:25 UTC (permalink / raw)
  To: will, julien.thierry.kdev, kvm
  Cc: andre.przywara, alexandru.elisei, lorenzo.pieralisi,
	jean-philippe, eric.auger, Vivek Gautam

Add support to parse extended configuration space for vfio based
assigned PCIe devices and add extended capabilities for the device
in the guest. This allows the guest to see and work on extended
capabilities, for example to toggle PRI extended cap to enable and
disable Shared virtual addressing (SVA) support.
PCIe extended capability header that is the first DWORD of all
extended caps is shown below -

   31               20  19   16  15                 0
   ____________________|_______|_____________________
  |    Next cap off    |  1h   |     Cap ID          |
  |____________________|_______|_____________________|

Out of the two upper bytes of extended cap header, the
lower nibble is actually cap version - 0x1.
'next cap offset' if present at bits [31:20], should be
right shifted by 4 bits to calculate the position of next
capability.
This change supports parsing and adding ATS, PRI and PASID caps.

Signed-off-by: Vivek Gautam <vivek.gautam@arm.com>
---
 include/kvm/pci.h |   6 +++
 vfio/pci.c        | 104 ++++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 103 insertions(+), 7 deletions(-)

diff --git a/include/kvm/pci.h b/include/kvm/pci.h
index 0f2d5bb..a365337 100644
--- a/include/kvm/pci.h
+++ b/include/kvm/pci.h
@@ -165,6 +165,12 @@ struct pci_exp_cap {
 	u32 root_status;
 };
 
+struct pci_ext_cap_hdr {
+	u16	type;
+	/* bit 19:16 = 0x1: Cap version */
+	u16	next;
+};
+
 struct pci_device_header;
 
 typedef int (*bar_activate_fn_t)(struct kvm *kvm,
diff --git a/vfio/pci.c b/vfio/pci.c
index ea33fd6..d045e0d 100644
--- a/vfio/pci.c
+++ b/vfio/pci.c
@@ -665,19 +665,105 @@ static int vfio_pci_add_cap(struct vfio_device *vdev, u8 *virt_hdr,
 	return 0;
 }
 
+static ssize_t vfio_pci_ext_cap_size(struct pci_ext_cap_hdr *cap_hdr)
+{
+	switch (cap_hdr->type) {
+	case PCI_EXT_CAP_ID_ATS:
+		return PCI_EXT_CAP_ATS_SIZEOF;
+	case PCI_EXT_CAP_ID_PRI:
+		return PCI_EXT_CAP_PRI_SIZEOF;
+	case PCI_EXT_CAP_ID_PASID:
+		return PCI_EXT_CAP_PASID_SIZEOF;
+	default:
+		pr_err("unknown extended PCI capability 0x%x", cap_hdr->type);
+		return 0;
+	}
+}
+
+static int vfio_pci_add_ext_cap(struct vfio_device *vdev, u8 *virt_hdr,
+			    struct pci_ext_cap_hdr *cap, off_t pos)
+{
+	struct pci_ext_cap_hdr *last;
+
+	cap->next = 0;
+	last = PCI_CAP(virt_hdr, 0x100);
+
+	while (last->next)
+		last = PCI_CAP(virt_hdr, last->next);
+
+	last->next = pos;
+	/*
+	 * Out of the two upper bytes of extended cap header, the
+	 * nibble [19:16] is actually cap version that should be 0x1,
+	 * so shift back the actual 'next' value by 4 bits.
+	 */
+	last->next = (last->next << 4) | 0x1;
+	memcpy(virt_hdr + pos, cap, vfio_pci_ext_cap_size(cap));
+
+	return 0;
+
+}
+
+static int vfio_pci_parse_ext_caps(struct vfio_device *vdev, u8 *virt_hdr)
+{
+	int ret;
+	u16 pos, next;
+	struct pci_ext_cap_hdr *ext_cap;
+	struct vfio_pci_device *pdev = &vdev->pci;
+
+	/* Extended cap only for PCIe devices */
+	if (!arch_has_pci_exp())
+		return 0;
+
+	/* Extended caps start from 0x100 offset. */
+	pos = 0x100;
+
+	for (; pos; pos = next) {
+		ext_cap = PCI_CAP(&pdev->hdr, pos);
+		/*
+		 * Out of the two upper bytes of extended cap header, the
+		 * lowest nibble is actually cap version.
+		 * 'next cap offset' if present at bits [31:20], while
+		 * bits [19:16] are set to 1 to indicate cap version.
+		 * So to get position of next cap right shift by 4 bits.
+		 */
+		next = (ext_cap->next >> 4);
+
+		switch (ext_cap->type) {
+		case PCI_EXT_CAP_ID_ATS:
+			ret = vfio_pci_add_ext_cap(vdev, virt_hdr, ext_cap, pos);
+			if (ret)
+				return ret;
+			break;
+		case PCI_EXT_CAP_ID_PRI:
+			ret = vfio_pci_add_ext_cap(vdev, virt_hdr, ext_cap, pos);
+			if (ret)
+				return ret;
+			break;
+		case PCI_EXT_CAP_ID_PASID:
+			ret = vfio_pci_add_ext_cap(vdev, virt_hdr, ext_cap, pos);
+			if (ret)
+				return ret;
+			break;
+		}
+	}
+
+	return 0;
+}
+
 static int vfio_pci_parse_caps(struct vfio_device *vdev)
 {
 	int ret;
 	size_t size;
 	u16 pos, next;
 	struct pci_cap_hdr *cap;
-	u8 virt_hdr[PCI_DEV_CFG_SIZE_LEGACY];
+	u8 virt_hdr[PCI_DEV_CFG_SIZE];
 	struct vfio_pci_device *pdev = &vdev->pci;
 
 	if (!(pdev->hdr.status & PCI_STATUS_CAP_LIST))
 		return 0;
 
-	memset(virt_hdr, 0, PCI_DEV_CFG_SIZE_LEGACY);
+	memset(virt_hdr, 0, PCI_DEV_CFG_SIZE);
 
 	pos = pdev->hdr.capabilities & ~3;
 
@@ -715,9 +801,13 @@ static int vfio_pci_parse_caps(struct vfio_device *vdev)
 		}
 	}
 
+	ret = vfio_pci_parse_ext_caps(vdev, virt_hdr);
+	if (ret)
+		return ret;
+
 	/* Wipe remaining capabilities */
 	pos = PCI_STD_HEADER_SIZEOF;
-	size = PCI_DEV_CFG_SIZE_LEGACY - PCI_STD_HEADER_SIZEOF;
+	size = PCI_DEV_CFG_SIZE - PCI_STD_HEADER_SIZEOF;
 	memcpy((void *)&pdev->hdr + pos, virt_hdr + pos, size);
 
 	return 0;
@@ -725,7 +815,7 @@ static int vfio_pci_parse_caps(struct vfio_device *vdev)
 
 static int vfio_pci_parse_cfg_space(struct vfio_device *vdev)
 {
-	ssize_t sz = PCI_DEV_CFG_SIZE_LEGACY;
+	ssize_t sz = PCI_DEV_CFG_SIZE;
 	struct vfio_region_info *info;
 	struct vfio_pci_device *pdev = &vdev->pci;
 
@@ -831,10 +921,10 @@ static int vfio_pci_fixup_cfg_space(struct vfio_device *vdev)
 	/* Install our fake Configuration Space */
 	info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
 	/*
-	 * We don't touch the extended configuration space, let's be cautious
-	 * and not overwrite it all with zeros, or bad things might happen.
+	 * Update the extended configuration space as well since we
+	 * are now populating the extended capabilities.
 	 */
-	hdr_sz = PCI_DEV_CFG_SIZE_LEGACY;
+	hdr_sz = PCI_DEV_CFG_SIZE;
 	if (pwrite(vdev->fd, &pdev->hdr, hdr_sz, info->offset) != hdr_sz) {
 		vfio_dev_err(vdev, "failed to write %zd bytes to Config Space",
 			     hdr_sz);
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 15+ messages in thread

* Re: [PATCH] vfio/pci: Add support for PCIe extended capabilities
  2021-08-10  6:25 [PATCH] vfio/pci: Add support for PCIe extended capabilities Vivek Gautam
@ 2021-08-31 14:54 ` Will Deacon
  2021-09-01 10:27   ` Andre Przywara
  2021-08-31 17:14 ` Andre Przywara
  2021-09-02  9:59 ` Alexandru Elisei
  2 siblings, 1 reply; 15+ messages in thread
From: Will Deacon @ 2021-08-31 14:54 UTC (permalink / raw)
  To: Vivek Gautam
  Cc: julien.thierry.kdev, kvm, andre.przywara, alexandru.elisei,
	lorenzo.pieralisi, jean-philippe, eric.auger

On Tue, Aug 10, 2021 at 11:55:14AM +0530, Vivek Gautam wrote:
> Add support to parse extended configuration space for vfio based
> assigned PCIe devices and add extended capabilities for the device
> in the guest. This allows the guest to see and work on extended
> capabilities, for example to toggle PRI extended cap to enable and
> disable Shared virtual addressing (SVA) support.
> PCIe extended capability header that is the first DWORD of all
> extended caps is shown below -
> 
>    31               20  19   16  15                 0
>    ____________________|_______|_____________________
>   |    Next cap off    |  1h   |     Cap ID          |
>   |____________________|_______|_____________________|
> 
> Out of the two upper bytes of extended cap header, the
> lower nibble is actually cap version - 0x1.
> 'next cap offset' if present at bits [31:20], should be
> right shifted by 4 bits to calculate the position of next
> capability.
> This change supports parsing and adding ATS, PRI and PASID caps.
> 
> Signed-off-by: Vivek Gautam <vivek.gautam@arm.com>
> ---
>  include/kvm/pci.h |   6 +++
>  vfio/pci.c        | 104 ++++++++++++++++++++++++++++++++++++++++++----
>  2 files changed, 103 insertions(+), 7 deletions(-)

Does this work correctly for architectures which don't define ARCH_HAS_PCI_EXP?

Will

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] vfio/pci: Add support for PCIe extended capabilities
  2021-08-10  6:25 [PATCH] vfio/pci: Add support for PCIe extended capabilities Vivek Gautam
  2021-08-31 14:54 ` Will Deacon
@ 2021-08-31 17:14 ` Andre Przywara
  2021-09-02 10:42   ` Vivek Kumar Gautam
  2021-09-02  9:59 ` Alexandru Elisei
  2 siblings, 1 reply; 15+ messages in thread
From: Andre Przywara @ 2021-08-31 17:14 UTC (permalink / raw)
  To: Vivek Gautam
  Cc: will, julien.thierry.kdev, kvm, alexandru.elisei,
	lorenzo.pieralisi, jean-philippe, eric.auger

On Tue, 10 Aug 2021 11:55:14 +0530
Vivek Gautam <vivek.gautam@arm.com> wrote:

Hi Vivek,

> Add support to parse extended configuration space for vfio based
> assigned PCIe devices and add extended capabilities for the device
> in the guest. This allows the guest to see and work on extended
> capabilities, for example to toggle PRI extended cap to enable and
> disable Shared virtual addressing (SVA) support.
> PCIe extended capability header that is the first DWORD of all
> extended caps is shown below -
> 
>    31               20  19   16  15                 0
>    ____________________|_______|_____________________
>   |    Next cap off    |  1h   |     Cap ID          |
>   |____________________|_______|_____________________|
> 
> Out of the two upper bytes of extended cap header, the
> lower nibble is actually cap version - 0x1.
> 'next cap offset' if present at bits [31:20], should be
> right shifted by 4 bits to calculate the position of next
> capability.
> This change supports parsing and adding ATS, PRI and PASID caps.
> 
> Signed-off-by: Vivek Gautam <vivek.gautam@arm.com>
> ---
>  include/kvm/pci.h |   6 +++
>  vfio/pci.c        | 104 ++++++++++++++++++++++++++++++++++++++++++----
>  2 files changed, 103 insertions(+), 7 deletions(-)
> 
> diff --git a/include/kvm/pci.h b/include/kvm/pci.h
> index 0f2d5bb..a365337 100644
> --- a/include/kvm/pci.h
> +++ b/include/kvm/pci.h
> @@ -165,6 +165,12 @@ struct pci_exp_cap {
>  	u32 root_status;
>  };
>  
> +struct pci_ext_cap_hdr {
> +	u16	type;
> +	/* bit 19:16 = 0x1: Cap version */
> +	u16	next;
> +};
> +
>  struct pci_device_header;
>  
>  typedef int (*bar_activate_fn_t)(struct kvm *kvm,
> diff --git a/vfio/pci.c b/vfio/pci.c
> index ea33fd6..d045e0d 100644
> --- a/vfio/pci.c
> +++ b/vfio/pci.c
> @@ -665,19 +665,105 @@ static int vfio_pci_add_cap(struct vfio_device *vdev, u8 *virt_hdr,
>  	return 0;
>  }
>  
> +static ssize_t vfio_pci_ext_cap_size(struct pci_ext_cap_hdr *cap_hdr)
> +{
> +	switch (cap_hdr->type) {
> +	case PCI_EXT_CAP_ID_ATS:
> +		return PCI_EXT_CAP_ATS_SIZEOF;
> +	case PCI_EXT_CAP_ID_PRI:
> +		return PCI_EXT_CAP_PRI_SIZEOF;
> +	case PCI_EXT_CAP_ID_PASID:
> +		return PCI_EXT_CAP_PASID_SIZEOF;
> +	default:
> +		pr_err("unknown extended PCI capability 0x%x", cap_hdr->type);
> +		return 0;

It looks somewhat weird to report an error, but then silently carry on
anyways. Since the return value is signed, you could return something
negative and check for that error in the caller.
I see that is copied from the "normal" capabilities, but maybe this
should be cleaned up there as well? It looks like ending here should be
an internal error, when this list is not up-to-date with the switch/case
below (which can be solved as well on the way).

> +	}
> +}
> +
> +static int vfio_pci_add_ext_cap(struct vfio_device *vdev, u8 *virt_hdr,
> +			    struct pci_ext_cap_hdr *cap, off_t pos)
> +{
> +	struct pci_ext_cap_hdr *last;
> +
> +	cap->next = 0;
> +	last = PCI_CAP(virt_hdr, 0x100);
> +
> +	while (last->next)
> +		last = PCI_CAP(virt_hdr, last->next);
> +
> +	last->next = pos;

This should be folded into the next statement, but ...

> +	/*
> +	 * Out of the two upper bytes of extended cap header, the
> +	 * nibble [19:16] is actually cap version that should be 0x1,
> +	 * so shift back the actual 'next' value by 4 bits.
> +	 */
> +	last->next = (last->next << 4) | 0x1;
> +	memcpy(virt_hdr + pos, cap, vfio_pci_ext_cap_size(cap));

So here you silently ignore the error when we see an unsupported
capability. Granted, copying 0 bytes doesn't do anything, but at the
same time we already updated the next pointer. So I guess you should
query for the size first, and bail out if this is unsupported. Not sure
what to do then, but I think we just ignore/filter that capability then?

> +
> +	return 0;
> +
> +}
> +
> +static int vfio_pci_parse_ext_caps(struct vfio_device *vdev, u8 *virt_hdr)
> +{
> +	int ret;
> +	u16 pos, next;
> +	struct pci_ext_cap_hdr *ext_cap;
> +	struct vfio_pci_device *pdev = &vdev->pci;
> +
> +	/* Extended cap only for PCIe devices */
> +	if (!arch_has_pci_exp())
> +		return 0;
> +
> +	/* Extended caps start from 0x100 offset. */
> +	pos = 0x100;

Please move that into the for-loop statement.

> +
> +	for (; pos; pos = next) {
> +		ext_cap = PCI_CAP(&pdev->hdr, pos);

Don't we need to check if there are extended capabilities at offset
0x100 first?
"Absence of any Extended Capabilities is required to be indicated by an
Extended Capability header with a Capability ID of 0000h, a Capability
Version of 0h, and a Next Capability Offset of 000h."

> +		/*
> +		 * Out of the two upper bytes of extended cap header, the
> +		 * lowest nibble is actually cap version.
> +		 * 'next cap offset' if present at bits [31:20], while
> +		 * bits [19:16] are set to 1 to indicate cap version.
> +		 * So to get position of next cap right shift by 4 bits.
> +		 */
> +		next = (ext_cap->next >> 4);
> +
> +		switch (ext_cap->type) {
> +		case PCI_EXT_CAP_ID_ATS:
> +			ret = vfio_pci_add_ext_cap(vdev, virt_hdr, ext_cap, pos);
> +			if (ret)

That function seems to be always returning 0, but this would be fixed
anyway, I guess. And then you could save the switch/case (which
is somewhat redundant with the one in vfio_pci_ext_cap_size()), and just
pass every capability to vfio_pci_add_ext_cap().

> +				return ret;
> +			break;
> +		case PCI_EXT_CAP_ID_PRI:
> +			ret = vfio_pci_add_ext_cap(vdev, virt_hdr, ext_cap, pos);
> +			if (ret)
> +				return ret;
> +			break;
> +		case PCI_EXT_CAP_ID_PASID:
> +			ret = vfio_pci_add_ext_cap(vdev, virt_hdr, ext_cap, pos);
> +			if (ret)
> +				return ret;
> +			break;
> +		}
> +	}
> +
> +	return 0;
> +}
> +
>  static int vfio_pci_parse_caps(struct vfio_device *vdev)
>  {
>  	int ret;
>  	size_t size;
>  	u16 pos, next;
>  	struct pci_cap_hdr *cap;
> -	u8 virt_hdr[PCI_DEV_CFG_SIZE_LEGACY];
> +	u8 virt_hdr[PCI_DEV_CFG_SIZE];
>  	struct vfio_pci_device *pdev = &vdev->pci;
>  
>  	if (!(pdev->hdr.status & PCI_STATUS_CAP_LIST))
>  		return 0;
>  
> -	memset(virt_hdr, 0, PCI_DEV_CFG_SIZE_LEGACY);
> +	memset(virt_hdr, 0, PCI_DEV_CFG_SIZE);
>  
>  	pos = pdev->hdr.capabilities & ~3;
>  
> @@ -715,9 +801,13 @@ static int vfio_pci_parse_caps(struct vfio_device *vdev)
>  		}
>  	}
>  
> +	ret = vfio_pci_parse_ext_caps(vdev, virt_hdr);
> +	if (ret)
> +		return ret;
> +
>  	/* Wipe remaining capabilities */
>  	pos = PCI_STD_HEADER_SIZEOF;
> -	size = PCI_DEV_CFG_SIZE_LEGACY - PCI_STD_HEADER_SIZEOF;
> +	size = PCI_DEV_CFG_SIZE - PCI_STD_HEADER_SIZEOF;
>  	memcpy((void *)&pdev->hdr + pos, virt_hdr + pos, size);
>  
>  	return 0;
> @@ -725,7 +815,7 @@ static int vfio_pci_parse_caps(struct vfio_device *vdev)
>  
>  static int vfio_pci_parse_cfg_space(struct vfio_device *vdev)
>  {
> -	ssize_t sz = PCI_DEV_CFG_SIZE_LEGACY;
> +	ssize_t sz = PCI_DEV_CFG_SIZE;
>  	struct vfio_region_info *info;
>  	struct vfio_pci_device *pdev = &vdev->pci;
>  
> @@ -831,10 +921,10 @@ static int vfio_pci_fixup_cfg_space(struct vfio_device *vdev)
>  	/* Install our fake Configuration Space */
>  	info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
>  	/*
> -	 * We don't touch the extended configuration space, let's be cautious
> -	 * and not overwrite it all with zeros, or bad things might happen.

It would be good to hear from Alex what those bad things are, and if
passing on those extended caps now fixes those.

Cheers,
Andre

> +	 * Update the extended configuration space as well since we
> +	 * are now populating the extended capabilities.
>  	 */
> -	hdr_sz = PCI_DEV_CFG_SIZE_LEGACY;
> +	hdr_sz = PCI_DEV_CFG_SIZE;
>  	if (pwrite(vdev->fd, &pdev->hdr, hdr_sz, info->offset) != hdr_sz) {
>  		vfio_dev_err(vdev, "failed to write %zd bytes to Config Space",
>  			     hdr_sz);


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] vfio/pci: Add support for PCIe extended capabilities
  2021-08-31 14:54 ` Will Deacon
@ 2021-09-01 10:27   ` Andre Przywara
  2021-09-01 10:44     ` Will Deacon
  0 siblings, 1 reply; 15+ messages in thread
From: Andre Przywara @ 2021-09-01 10:27 UTC (permalink / raw)
  To: Will Deacon, Vivek Gautam
  Cc: julien.thierry.kdev, kvm, alexandru.elisei, lorenzo.pieralisi,
	jean-philippe, eric.auger

On 8/31/21 3:54 PM, Will Deacon wrote:

Hi Will,

> On Tue, Aug 10, 2021 at 11:55:14AM +0530, Vivek Gautam wrote:
>> Add support to parse extended configuration space for vfio based
>> assigned PCIe devices and add extended capabilities for the device
>> in the guest. This allows the guest to see and work on extended
>> capabilities, for example to toggle PRI extended cap to enable and
>> disable Shared virtual addressing (SVA) support.
>> PCIe extended capability header that is the first DWORD of all
>> extended caps is shown below -
>>
>>     31               20  19   16  15                 0
>>     ____________________|_______|_____________________
>>    |    Next cap off    |  1h   |     Cap ID          |
>>    |____________________|_______|_____________________|
>>
>> Out of the two upper bytes of extended cap header, the
>> lower nibble is actually cap version - 0x1.
>> 'next cap offset' if present at bits [31:20], should be
>> right shifted by 4 bits to calculate the position of next
>> capability.
>> This change supports parsing and adding ATS, PRI and PASID caps.
>>
>> Signed-off-by: Vivek Gautam <vivek.gautam@arm.com>
>> ---
>>   include/kvm/pci.h |   6 +++
>>   vfio/pci.c        | 104 ++++++++++++++++++++++++++++++++++++++++++----
>>   2 files changed, 103 insertions(+), 7 deletions(-)
> 
> Does this work correctly for architectures which don't define ARCH_HAS_PCI_EXP?

I think it does: the code compiles fine, and the whole functionality is 
guarded by:
+	/* Extended cap only for PCIe devices */
+	if (!arch_has_pci_exp())
+		return 0;

A clever compiler might even decide to not include this code at all.

Did you see any particular problem?

Cheers,
Andre

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] vfio/pci: Add support for PCIe extended capabilities
  2021-09-01 10:27   ` Andre Przywara
@ 2021-09-01 10:44     ` Will Deacon
  2021-09-02 10:28       ` Vivek Kumar Gautam
  0 siblings, 1 reply; 15+ messages in thread
From: Will Deacon @ 2021-09-01 10:44 UTC (permalink / raw)
  To: Andre Przywara
  Cc: Vivek Gautam, julien.thierry.kdev, kvm, alexandru.elisei,
	lorenzo.pieralisi, jean-philippe, eric.auger

On Wed, Sep 01, 2021 at 11:27:21AM +0100, Andre Przywara wrote:
> On 8/31/21 3:54 PM, Will Deacon wrote:
> 
> Hi Will,
> 
> > On Tue, Aug 10, 2021 at 11:55:14AM +0530, Vivek Gautam wrote:
> > > Add support to parse extended configuration space for vfio based
> > > assigned PCIe devices and add extended capabilities for the device
> > > in the guest. This allows the guest to see and work on extended
> > > capabilities, for example to toggle PRI extended cap to enable and
> > > disable Shared virtual addressing (SVA) support.
> > > PCIe extended capability header that is the first DWORD of all
> > > extended caps is shown below -
> > > 
> > >     31               20  19   16  15                 0
> > >     ____________________|_______|_____________________
> > >    |    Next cap off    |  1h   |     Cap ID          |
> > >    |____________________|_______|_____________________|
> > > 
> > > Out of the two upper bytes of extended cap header, the
> > > lower nibble is actually cap version - 0x1.
> > > 'next cap offset' if present at bits [31:20], should be
> > > right shifted by 4 bits to calculate the position of next
> > > capability.
> > > This change supports parsing and adding ATS, PRI and PASID caps.
> > > 
> > > Signed-off-by: Vivek Gautam <vivek.gautam@arm.com>
> > > ---
> > >   include/kvm/pci.h |   6 +++
> > >   vfio/pci.c        | 104 ++++++++++++++++++++++++++++++++++++++++++----
> > >   2 files changed, 103 insertions(+), 7 deletions(-)
> > 
> > Does this work correctly for architectures which don't define ARCH_HAS_PCI_EXP?
> 
> I think it does: the code compiles fine, and the whole functionality is
> guarded by:
> +	/* Extended cap only for PCIe devices */
> +	if (!arch_has_pci_exp())
> +		return 0;
> 
> A clever compiler might even decide to not include this code at all.
> 
> Did you see any particular problem?

The part I was worried about is that PCI_DEV_CFG_MASK (which is used by
the cfg space dispatch code) is derived from PCI_DEV_CFG_SIZE, but actually
I think this patch might _fix_ that problem because it removes the explicit
usage of PCI_DEV_CFG_SIZE_LEGACY!

Will

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] vfio/pci: Add support for PCIe extended capabilities
  2021-08-10  6:25 [PATCH] vfio/pci: Add support for PCIe extended capabilities Vivek Gautam
  2021-08-31 14:54 ` Will Deacon
  2021-08-31 17:14 ` Andre Przywara
@ 2021-09-02  9:59 ` Alexandru Elisei
  2021-09-02 10:48   ` Vivek Kumar Gautam
  2 siblings, 1 reply; 15+ messages in thread
From: Alexandru Elisei @ 2021-09-02  9:59 UTC (permalink / raw)
  To: Vivek Gautam, will, julien.thierry.kdev, kvm
  Cc: andre.przywara, lorenzo.pieralisi, jean-philippe, eric.auger

Hi Vivek,

On 8/10/21 7:25 AM, Vivek Gautam wrote:
> Add support to parse extended configuration space for vfio based
> assigned PCIe devices and add extended capabilities for the device
> in the guest. This allows the guest to see and work on extended
> capabilities, for example to toggle PRI extended cap to enable and
> disable Shared virtual addressing (SVA) support.
> PCIe extended capability header that is the first DWORD of all
> extended caps is shown below -
>
>    31               20  19   16  15                 0
>    ____________________|_______|_____________________
>   |    Next cap off    |  1h   |     Cap ID          |
>   |____________________|_______|_____________________|
>
> Out of the two upper bytes of extended cap header, the
> lower nibble is actually cap version - 0x1.
> 'next cap offset' if present at bits [31:20], should be
> right shifted by 4 bits to calculate the position of next
> capability.
> This change supports parsing and adding ATS, PRI and PASID caps.
>
> Signed-off-by: Vivek Gautam <vivek.gautam@arm.com>
> ---
>  include/kvm/pci.h |   6 +++
>  vfio/pci.c        | 104 ++++++++++++++++++++++++++++++++++++++++++----
>  2 files changed, 103 insertions(+), 7 deletions(-)
>
> diff --git a/include/kvm/pci.h b/include/kvm/pci.h
> index 0f2d5bb..a365337 100644
> --- a/include/kvm/pci.h
> +++ b/include/kvm/pci.h
> @@ -165,6 +165,12 @@ struct pci_exp_cap {
>  	u32 root_status;
>  };
>  
> +struct pci_ext_cap_hdr {
> +	u16	type;
> +	/* bit 19:16 = 0x1: Cap version */

I believe bits 19:16 are the cap version if you look at the header as a 32bit
value (next:type). If you actually want to set those bits, you need to set bits
3:0 of the next field. I believe the comment should reflect that because it's
slightly confusing (no field is larger than 16 bits, for example). Or you can move
the comment at the top of the struct and keep it as it is.

> +	u16	next;
> +};
> +
>  struct pci_device_header;
>  
>  typedef int (*bar_activate_fn_t)(struct kvm *kvm,
> diff --git a/vfio/pci.c b/vfio/pci.c
> index ea33fd6..d045e0d 100644
> --- a/vfio/pci.c
> +++ b/vfio/pci.c
> @@ -665,19 +665,105 @@ static int vfio_pci_add_cap(struct vfio_device *vdev, u8 *virt_hdr,
>  	return 0;
>  }
>  
> +static ssize_t vfio_pci_ext_cap_size(struct pci_ext_cap_hdr *cap_hdr)
> +{
> +	switch (cap_hdr->type) {
> +	case PCI_EXT_CAP_ID_ATS:
> +		return PCI_EXT_CAP_ATS_SIZEOF;
> +	case PCI_EXT_CAP_ID_PRI:
> +		return PCI_EXT_CAP_PRI_SIZEOF;
> +	case PCI_EXT_CAP_ID_PASID:
> +		return PCI_EXT_CAP_PASID_SIZEOF;
> +	default:
> +		pr_err("unknown extended PCI capability 0x%x", cap_hdr->type);
> +		return 0;
> +	}
> +}
> +
> +static int vfio_pci_add_ext_cap(struct vfio_device *vdev, u8 *virt_hdr,
> +			    struct pci_ext_cap_hdr *cap, off_t pos)
> +{
> +	struct pci_ext_cap_hdr *last;
> +
> +	cap->next = 0;
> +	last = PCI_CAP(virt_hdr, 0x100);
> +
> +	while (last->next)
> +		last = PCI_CAP(virt_hdr, last->next);
> +
> +	last->next = pos;
> +	/*
> +	 * Out of the two upper bytes of extended cap header, the
> +	 * nibble [19:16] is actually cap version that should be 0x1,
> +	 * so shift back the actual 'next' value by 4 bits.
> +	 */
> +	last->next = (last->next << 4) | 0x1;
> +	memcpy(virt_hdr + pos, cap, vfio_pci_ext_cap_size(cap));
> +
> +	return 0;
> +
> +}
> +
> +static int vfio_pci_parse_ext_caps(struct vfio_device *vdev, u8 *virt_hdr)
> +{
> +	int ret;
> +	u16 pos, next;
> +	struct pci_ext_cap_hdr *ext_cap;
> +	struct vfio_pci_device *pdev = &vdev->pci;
> +
> +	/* Extended cap only for PCIe devices */

Devices are PCI Express if they have the PCI Express Capability (this is also how
Linux tells them apart). The arch_has_pci_exp() is meant to check that the
architecture kvmtool has been compiled for can emulate a PCI Express bus (as
apposed to a legacy PCI bus). For example, when you compile kvmtool for x86, you
will get a legacy PCI bus.

I'm not saying the check is bad, because it definitely should be done, but if what
you're trying to do is to check that the device is a PCI Express capable device,
then you also need to have a look at the PCI Express Capability like Andre suggested.

> +	if (!arch_has_pci_exp())
> +		return 0;
> +
> +	/* Extended caps start from 0x100 offset. */
> +	pos = 0x100;
> +
> +	for (; pos; pos = next) {
> +		ext_cap = PCI_CAP(&pdev->hdr, pos);
> +		/*
> +		 * Out of the two upper bytes of extended cap header, the
> +		 * lowest nibble is actually cap version.
> +		 * 'next cap offset' if present at bits [31:20], while
> +		 * bits [19:16] are set to 1 to indicate cap version.
> +		 * So to get position of next cap right shift by 4 bits.
> +		 */
> +		next = (ext_cap->next >> 4);
> +
> +		switch (ext_cap->type) {
> +		case PCI_EXT_CAP_ID_ATS:
> +			ret = vfio_pci_add_ext_cap(vdev, virt_hdr, ext_cap, pos);
> +			if (ret)
> +				return ret;
> +			break;
> +		case PCI_EXT_CAP_ID_PRI:
> +			ret = vfio_pci_add_ext_cap(vdev, virt_hdr, ext_cap, pos);
> +			if (ret)
> +				return ret;
> +			break;
> +		case PCI_EXT_CAP_ID_PASID:
> +			ret = vfio_pci_add_ext_cap(vdev, virt_hdr, ext_cap, pos);
> +			if (ret)
> +				return ret;
> +			break;
> +		}
> +	}
> +
> +	return 0;
> +}
> +
>  static int vfio_pci_parse_caps(struct vfio_device *vdev)
>  {
>  	int ret;
>  	size_t size;
>  	u16 pos, next;
>  	struct pci_cap_hdr *cap;
> -	u8 virt_hdr[PCI_DEV_CFG_SIZE_LEGACY];
> +	u8 virt_hdr[PCI_DEV_CFG_SIZE];
>  	struct vfio_pci_device *pdev = &vdev->pci;
>  
>  	if (!(pdev->hdr.status & PCI_STATUS_CAP_LIST))
>  		return 0;
>  
> -	memset(virt_hdr, 0, PCI_DEV_CFG_SIZE_LEGACY);
> +	memset(virt_hdr, 0, PCI_DEV_CFG_SIZE);
>  
>  	pos = pdev->hdr.capabilities & ~3;
>  
> @@ -715,9 +801,13 @@ static int vfio_pci_parse_caps(struct vfio_device *vdev)
>  		}
>  	}
>  
> +	ret = vfio_pci_parse_ext_caps(vdev, virt_hdr);
> +	if (ret)
> +		return ret;
> +
>  	/* Wipe remaining capabilities */
>  	pos = PCI_STD_HEADER_SIZEOF;
> -	size = PCI_DEV_CFG_SIZE_LEGACY - PCI_STD_HEADER_SIZEOF;
> +	size = PCI_DEV_CFG_SIZE - PCI_STD_HEADER_SIZEOF;
>  	memcpy((void *)&pdev->hdr + pos, virt_hdr + pos, size);
>  
>  	return 0;
> @@ -725,7 +815,7 @@ static int vfio_pci_parse_caps(struct vfio_device *vdev)
>  
>  static int vfio_pci_parse_cfg_space(struct vfio_device *vdev)
>  {
> -	ssize_t sz = PCI_DEV_CFG_SIZE_LEGACY;
> +	ssize_t sz = PCI_DEV_CFG_SIZE;
>  	struct vfio_region_info *info;
>  	struct vfio_pci_device *pdev = &vdev->pci;
>  
> @@ -831,10 +921,10 @@ static int vfio_pci_fixup_cfg_space(struct vfio_device *vdev)
>  	/* Install our fake Configuration Space */
>  	info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
>  	/*
> -	 * We don't touch the extended configuration space, let's be cautious
> -	 * and not overwrite it all with zeros, or bad things might happen.
> +	 * Update the extended configuration space as well since we
> +	 * are now populating the extended capabilities.
>  	 */
> -	hdr_sz = PCI_DEV_CFG_SIZE_LEGACY;
> +	hdr_sz = PCI_DEV_CFG_SIZE;

In one of the earlier versions of the PCI Express patches I was doing the same
thing here - overwriting the entire PCI Express configuration space for a device.
However, that made one of the devices I was using for testing stop working when
assigned to a VM.

I'll go through my testing notes and test it again, the cause of the failure might
have been something else entirely which was fixed since then.

Thanks,

Alex

>  	if (pwrite(vdev->fd, &pdev->hdr, hdr_sz, info->offset) != hdr_sz) {
>  		vfio_dev_err(vdev, "failed to write %zd bytes to Config Space",
>  			     hdr_sz);

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] vfio/pci: Add support for PCIe extended capabilities
  2021-09-01 10:44     ` Will Deacon
@ 2021-09-02 10:28       ` Vivek Kumar Gautam
  0 siblings, 0 replies; 15+ messages in thread
From: Vivek Kumar Gautam @ 2021-09-02 10:28 UTC (permalink / raw)
  To: Will Deacon, Andre Przywara
  Cc: julien.thierry.kdev, kvm, alexandru.elisei, lorenzo.pieralisi,
	jean-philippe, eric.auger

Hi Will,


On 9/1/21 4:14 PM, Will Deacon wrote:
> On Wed, Sep 01, 2021 at 11:27:21AM +0100, Andre Przywara wrote:
>> On 8/31/21 3:54 PM, Will Deacon wrote:
>>
>> Hi Will,
>>
>>> On Tue, Aug 10, 2021 at 11:55:14AM +0530, Vivek Gautam wrote:
>>>> Add support to parse extended configuration space for vfio based
>>>> assigned PCIe devices and add extended capabilities for the device
>>>> in the guest. This allows the guest to see and work on extended
>>>> capabilities, for example to toggle PRI extended cap to enable and
>>>> disable Shared virtual addressing (SVA) support.
>>>> PCIe extended capability header that is the first DWORD of all
>>>> extended caps is shown below -
>>>>
>>>>      31               20  19   16  15                 0
>>>>      ____________________|_______|_____________________
>>>>     |    Next cap off    |  1h   |     Cap ID          |
>>>>     |____________________|_______|_____________________|
>>>>
>>>> Out of the two upper bytes of extended cap header, the
>>>> lower nibble is actually cap version - 0x1.
>>>> 'next cap offset' if present at bits [31:20], should be
>>>> right shifted by 4 bits to calculate the position of next
>>>> capability.
>>>> This change supports parsing and adding ATS, PRI and PASID caps.
>>>>
>>>> Signed-off-by: Vivek Gautam <vivek.gautam@arm.com>
>>>> ---
>>>>    include/kvm/pci.h |   6 +++
>>>>    vfio/pci.c        | 104 ++++++++++++++++++++++++++++++++++++++++++----
>>>>    2 files changed, 103 insertions(+), 7 deletions(-)
>>>
>>> Does this work correctly for architectures which don't define ARCH_HAS_PCI_EXP?
>>
>> I think it does: the code compiles fine, and the whole functionality is
>> guarded by:
>> +    /* Extended cap only for PCIe devices */
>> +    if (!arch_has_pci_exp())
>> +            return 0;
>>
>> A clever compiler might even decide to not include this code at all.
>>
>> Did you see any particular problem?
>
> The part I was worried about is that PCI_DEV_CFG_MASK (which is used by
> the cfg space dispatch code) is derived from PCI_DEV_CFG_SIZE, but actually
> I think this patch might _fix_ that problem because it removes the explicit
> usage of PCI_DEV_CFG_SIZE_LEGACY!

That's right. On adding change for PCI_DEV_CFG_SIZE the entire
configuration space can now be read. With Alex's change for PCI Express
support PCI_DEV_CFG_SIZE can be either PCI_DEV_CFG_SIZE_EXTENDED or
PCI_DEV_CFG_SIZE_LEGACY depending on arch_has_pci_exp() check.

I booted with enabling and disabling ARCH_HAS_PCI_EXP and was able to
see a passthrough device getting detected fine in the VM. Logs are below.

With ARCH_HAS_PCI_EXP enabled  (master with this patch):
--------------------------------------------------------
# lspci -vv

00:00.0 Unassigned class [ff00]: ARM Device ff80

         Subsystem: ARM Device 0000

         Control: I/O- Mem+ BusMaster+ SpecCycle- MemWINV- VGASnoop-
ParErr- Stepping- SERR- FastB2B- DisINTx+

         Status: Cap+ 66MHz- UDF- FastB2B- ParErr- DEVSEL=fast >TAbort-
<TAbort- <MAbort- >SERR- <PERR- INTx-

         Latency: 0

         Region 0: Memory at 50000000 (32-bit, non-prefetchable) [size=256K]

         Region 2: Memory at 50040000 (32-bit, non-prefetchable) [size=32K]

         Region 4: Memory at 50048000 (32-bit, non-prefetchable) [size=4K]

         Capabilities: [48] MSI-X: Enable+ Count=2048 Masked-

                 Vector table: BAR=2 offset=00000000

                 PBA: BAR=4 offset=00000000

         Capabilities: [54] Express (v2) Endpoint, MSI 00

                 DevCap: MaxPayload 128 bytes, PhantFunc 0, Latency L0s
<64ns, L1 <1us

                         ExtTag+ AttnBtn- AttnInd- PwrInd- RBE+ FLReset+
SlotPowerLimit 0.000W

                 DevCtl: CorrErr- NonFatalErr- FatalErr- UnsupReq-

                         RlxdOrd- ExtTag+ PhantFunc- AuxPwr- NoSnoop-
FLReset-

                         MaxPayload 128 bytes, MaxReadReq 128 bytes

                 DevSta: CorrErr- NonFatalErr- FatalErr- UnsupReq-
AuxPwr- TransPend-

                 LnkCap: Port #0, Speed unknown, Width x0, ASPM not
supported

                         ClockPM- Surprise- LLActRep- BwNot- ASPMOptComp-

                 LnkCtl: ASPM Disabled; RCB 64 bytes, Disabled- CommClk-

                         ExtSynch- ClockPM- AutWidDis- BWInt- AutBWInt-

                 LnkSta: Speed unknown (ok), Width x0 (ok)

                         TrErr- Train- SlotClk- DLActive- BWMgmt- ABWMgmt-

                 DevCap2: Completion Timeout: Not Supported, TimeoutDis-
NROPrPrP- LTR-

                          10BitTagComp- 10BitTagReq- OBFF Not Supported,
ExtFmt- EETLPPrefix-

                          EmergencyPowerReduction Not Supported,
EmergencyPowerReductionInit-

                          FRS- TPHComp- ExtTPHComp-

                          AtomicOpsCap: 32bit- 64bit- 128bitCAS-

                 DevCtl2: Completion Timeout: 50us to 50ms, TimeoutDis-
LTR- OBFF Disabled,

                          AtomicOpsCtl: ReqEn-

                 LnkCtl2: Target Link Speed: 2.5GT/s, EnterCompliance-
SpeedDis-

                          Transmit Margin: Normal Operating Range,
EnterModifiedCompliance- ComplianceSOS-

                          Compliance De-emphasis: -6dB

                 LnkSta2: Current De-emphasis Level: -6dB,
EqualizationComplete- EqualizationPhase1-

                          EqualizationPhase2- EqualizationPhase3-
LinkEqualizationRequest-

                          Retimer- 2Retimers- CrosslinkRes: unsupported

         Capabilities: [100 v1] Address Translation Service (ATS)

                 ATSCap: Invalidate Queue Depth: 00

                 ATSCtl: Enable+, Smallest Translation Unit: 00

         Capabilities: [108 v0] Page Request Interface (PRI)

                 PRICtl: Enable- Reset-

                 PRISta: RF- UPRGI- Stopped+

                 Page Request Capacity: 000000ff, Page Request
Allocation: 00000000

         Kernel driver in use: smmute-pci





With ARCH_HAS_PCI_EXP disabled (on top of this patch):
--------------------------------------------------------
# lspci -vv
00:00.0 Unassigned class [ff00]: ARM Device ff80
         Subsystem: ARM Device 0000
         Control: I/O- Mem+ BusMaster+ SpecCycle- MemWINV- VGASnoop-
ParErr- Stepping- SERR- FastB2B- DisINTx+
         Status: Cap+ 66MHz- UDF- FastB2B- ParErr- DEVSEL=fast >TAbort-
<TAbort- <MAbort- >SERR- <PERR- INTx-
         Latency: 0
         Region 0: Memory at 50000000 (32-bit, non-prefetchable) [size=256K]
         Region 2: Memory at 50040000 (32-bit, non-prefetchable) [size=32K]
         Region 4: Memory at 50048000 (32-bit, non-prefetchable) [size=4K]
         Capabilities: [48] MSI-X: Enable+ Count=2048 Masked-
                 Vector table: BAR=2 offset=00000000
                 PBA: BAR=4 offset=00000000
         Kernel driver in use: smmute-pci

Let me know if you have concerns.

Best regards
Vivek

>
> Will
>
IMPORTANT NOTICE: The contents of this email and any attachments are confidential and may also be privileged. If you are not the intended recipient, please notify the sender immediately and do not disclose the contents to any other person, use it for any purpose, or store or copy the information in any medium. Thank you.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] vfio/pci: Add support for PCIe extended capabilities
  2021-08-31 17:14 ` Andre Przywara
@ 2021-09-02 10:42   ` Vivek Kumar Gautam
  0 siblings, 0 replies; 15+ messages in thread
From: Vivek Kumar Gautam @ 2021-09-02 10:42 UTC (permalink / raw)
  To: Andre Przywara
  Cc: will, julien.thierry.kdev, kvm, alexandru.elisei,
	lorenzo.pieralisi, jean-philippe, eric.auger

Hi Andre,


On 8/31/21 10:44 PM, Andre Przywara wrote:
> On Tue, 10 Aug 2021 11:55:14 +0530
> Vivek Gautam <vivek.gautam@arm.com> wrote:
>
> Hi Vivek,

Thanks a lot for the review. Please find my comments inline below.

>
>> Add support to parse extended configuration space for vfio based
>> assigned PCIe devices and add extended capabilities for the device
>> in the guest. This allows the guest to see and work on extended
>> capabilities, for example to toggle PRI extended cap to enable and
>> disable Shared virtual addressing (SVA) support.
>> PCIe extended capability header that is the first DWORD of all
>> extended caps is shown below -
>>
>>     31               20  19   16  15                 0
>>     ____________________|_______|_____________________
>>    |    Next cap off    |  1h   |     Cap ID          |
>>    |____________________|_______|_____________________|
>>
>> Out of the two upper bytes of extended cap header, the
>> lower nibble is actually cap version - 0x1.
>> 'next cap offset' if present at bits [31:20], should be
>> right shifted by 4 bits to calculate the position of next
>> capability.
>> This change supports parsing and adding ATS, PRI and PASID caps.
>>
>> Signed-off-by: Vivek Gautam <vivek.gautam@arm.com>
>> ---
>>   include/kvm/pci.h |   6 +++
>>   vfio/pci.c        | 104 ++++++++++++++++++++++++++++++++++++++++++----
>>   2 files changed, 103 insertions(+), 7 deletions(-)
>>
>> diff --git a/include/kvm/pci.h b/include/kvm/pci.h
>> index 0f2d5bb..a365337 100644
>> --- a/include/kvm/pci.h
>> +++ b/include/kvm/pci.h
>> @@ -165,6 +165,12 @@ struct pci_exp_cap {
>>      u32 root_status;
>>   };
>>
>> +struct pci_ext_cap_hdr {
>> +    u16     type;
>> +    /* bit 19:16 = 0x1: Cap version */
>> +    u16     next;
>> +};
>> +
>>   struct pci_device_header;
>>
>>   typedef int (*bar_activate_fn_t)(struct kvm *kvm,
>> diff --git a/vfio/pci.c b/vfio/pci.c
>> index ea33fd6..d045e0d 100644
>> --- a/vfio/pci.c
>> +++ b/vfio/pci.c
>> @@ -665,19 +665,105 @@ static int vfio_pci_add_cap(struct vfio_device *vdev, u8 *virt_hdr,
>>      return 0;
>>   }
>>
>> +static ssize_t vfio_pci_ext_cap_size(struct pci_ext_cap_hdr *cap_hdr)
>> +{
>> +    switch (cap_hdr->type) {
>> +    case PCI_EXT_CAP_ID_ATS:
>> +            return PCI_EXT_CAP_ATS_SIZEOF;
>> +    case PCI_EXT_CAP_ID_PRI:
>> +            return PCI_EXT_CAP_PRI_SIZEOF;
>> +    case PCI_EXT_CAP_ID_PASID:
>> +            return PCI_EXT_CAP_PASID_SIZEOF;
>> +    default:
>> +            pr_err("unknown extended PCI capability 0x%x", cap_hdr->type);
>> +            return 0;
>
> It looks somewhat weird to report an error, but then silently carry on
> anyways. Since the return value is signed, you could return something
> negative and check for that error in the caller.
> I see that is copied from the "normal" capabilities, but maybe this
> should be cleaned up there as well? It looks like ending here should be
> an internal error, when this list is not up-to-date with the switch/case
> below (which can be solved as well on the way).

Sure, will take care of the error case and fix.

>
>> +    }
>> +}
>> +
>> +static int vfio_pci_add_ext_cap(struct vfio_device *vdev, u8 *virt_hdr,
>> +                        struct pci_ext_cap_hdr *cap, off_t pos)
>> +{
>> +    struct pci_ext_cap_hdr *last;
>> +
>> +    cap->next = 0;
>> +    last = PCI_CAP(virt_hdr, 0x100);
>> +
>> +    while (last->next)
>> +            last = PCI_CAP(virt_hdr, last->next);
>> +
>> +    last->next = pos;
>
> This should be folded into the next statement, but ...
>
>> +    /*
>> +     * Out of the two upper bytes of extended cap header, the
>> +     * nibble [19:16] is actually cap version that should be 0x1,
>> +     * so shift back the actual 'next' value by 4 bits.
>> +     */
>> +    last->next = (last->next << 4) | 0x1;
>> +    memcpy(virt_hdr + pos, cap, vfio_pci_ext_cap_size(cap));
>
> So here you silently ignore the error when we see an unsupported
> capability. Granted, copying 0 bytes doesn't do anything, but at the
> same time we already updated the next pointer. So I guess you should
> query for the size first, and bail out if this is unsupported. Not sure
> what to do then, but I think we just ignore/filter that capability then?

Right, I got what you are saying here. As any other cap is not supported
at this point, it wouldn't be right to update the next pointer. I will
fix this.

>
>> +
>> +    return 0;
>> +
>> +}
>> +
>> +static int vfio_pci_parse_ext_caps(struct vfio_device *vdev, u8 *virt_hdr)
>> +{
>> +    int ret;
>> +    u16 pos, next;
>> +    struct pci_ext_cap_hdr *ext_cap;
>> +    struct vfio_pci_device *pdev = &vdev->pci;
>> +
>> +    /* Extended cap only for PCIe devices */
>> +    if (!arch_has_pci_exp())
>> +            return 0;
>> +
>> +    /* Extended caps start from 0x100 offset. */
>> +    pos = 0x100;
>
> Please move that into the for-loop statement.

Sure.

>
>> +
>> +    for (; pos; pos = next) {
>> +            ext_cap = PCI_CAP(&pdev->hdr, pos);
>
> Don't we need to check if there are extended capabilities at offset
> 0x100 first?
> "Absence of any Extended Capabilities is required to be indicated by an
> Extended Capability header with a Capability ID of 0000h, a Capability
> Version of 0h, and a Next Capability Offset of 000h."

Sure, will add a check before going into this loop to check whethere
extended caps are present or not.
I might have mistaken the 'presence' of extended cap with the
arch_has_pci_exp() check.

>
>> +            /*
>> +             * Out of the two upper bytes of extended cap header, the
>> +             * lowest nibble is actually cap version.
>> +             * 'next cap offset' if present at bits [31:20], while
>> +             * bits [19:16] are set to 1 to indicate cap version.
>> +             * So to get position of next cap right shift by 4 bits.
>> +             */
>> +            next = (ext_cap->next >> 4);
>> +
>> +            switch (ext_cap->type) {
>> +            case PCI_EXT_CAP_ID_ATS:
>> +                    ret = vfio_pci_add_ext_cap(vdev, virt_hdr, ext_cap, pos);
>> +                    if (ret)
>
> That function seems to be always returning 0, but this would be fixed
> anyway, I guess. And then you could save the switch/case (which
> is somewhat redundant with the one in vfio_pci_ext_cap_size()), and just
> pass every capability to vfio_pci_add_ext_cap().

Sure, will fix here too.

>
>> +                            return ret;
>> +                    break;
>> +            case PCI_EXT_CAP_ID_PRI:
>> +                    ret = vfio_pci_add_ext_cap(vdev, virt_hdr, ext_cap, pos);
>> +                    if (ret)
>> +                            return ret;
>> +                    break;
>> +            case PCI_EXT_CAP_ID_PASID:
>> +                    ret = vfio_pci_add_ext_cap(vdev, virt_hdr, ext_cap, pos);
>> +                    if (ret)
>> +                            return ret;
>> +                    break;
>> +            }
>> +    }
>> +
>> +    return 0;
>> +}
>> +
>>   static int vfio_pci_parse_caps(struct vfio_device *vdev)
>>   {
>>      int ret;
>>      size_t size;
>>      u16 pos, next;
>>      struct pci_cap_hdr *cap;
>> -    u8 virt_hdr[PCI_DEV_CFG_SIZE_LEGACY];
>> +    u8 virt_hdr[PCI_DEV_CFG_SIZE];
>>      struct vfio_pci_device *pdev = &vdev->pci;
>>
>>      if (!(pdev->hdr.status & PCI_STATUS_CAP_LIST))
>>              return 0;
>>
>> -    memset(virt_hdr, 0, PCI_DEV_CFG_SIZE_LEGACY);
>> +    memset(virt_hdr, 0, PCI_DEV_CFG_SIZE);
>>
>>      pos = pdev->hdr.capabilities & ~3;
>>
>> @@ -715,9 +801,13 @@ static int vfio_pci_parse_caps(struct vfio_device *vdev)
>>              }
>>      }
>>
>> +    ret = vfio_pci_parse_ext_caps(vdev, virt_hdr);
>> +    if (ret)
>> +            return ret;
>> +
>>      /* Wipe remaining capabilities */
>>      pos = PCI_STD_HEADER_SIZEOF;
>> -    size = PCI_DEV_CFG_SIZE_LEGACY - PCI_STD_HEADER_SIZEOF;
>> +    size = PCI_DEV_CFG_SIZE - PCI_STD_HEADER_SIZEOF;
>>      memcpy((void *)&pdev->hdr + pos, virt_hdr + pos, size);
>>
>>      return 0;
>> @@ -725,7 +815,7 @@ static int vfio_pci_parse_caps(struct vfio_device *vdev)
>>
>>   static int vfio_pci_parse_cfg_space(struct vfio_device *vdev)
>>   {
>> -    ssize_t sz = PCI_DEV_CFG_SIZE_LEGACY;
>> +    ssize_t sz = PCI_DEV_CFG_SIZE;
>>      struct vfio_region_info *info;
>>      struct vfio_pci_device *pdev = &vdev->pci;
>>
>> @@ -831,10 +921,10 @@ static int vfio_pci_fixup_cfg_space(struct vfio_device *vdev)
>>      /* Install our fake Configuration Space */
>>      info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
>>      /*
>> -     * We don't touch the extended configuration space, let's be cautious
>> -     * and not overwrite it all with zeros, or bad things might happen.
>
> It would be good to hear from Alex what those bad things are, and if
> passing on those extended caps now fixes those.

On this, I will wait for Alex's testing as he mentioned in the thread.

Best regards
Vivek

>
> Cheers,
> Andre
>
>> +     * Update the extended configuration space as well since we
>> +     * are now populating the extended capabilities.
>>       */
>> -    hdr_sz = PCI_DEV_CFG_SIZE_LEGACY;
>> +    hdr_sz = PCI_DEV_CFG_SIZE;
>>      if (pwrite(vdev->fd, &pdev->hdr, hdr_sz, info->offset) != hdr_sz) {
>>              vfio_dev_err(vdev, "failed to write %zd bytes to Config Space",
>>                           hdr_sz);
>
IMPORTANT NOTICE: The contents of this email and any attachments are confidential and may also be privileged. If you are not the intended recipient, please notify the sender immediately and do not disclose the contents to any other person, use it for any purpose, or store or copy the information in any medium. Thank you.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] vfio/pci: Add support for PCIe extended capabilities
  2021-09-02  9:59 ` Alexandru Elisei
@ 2021-09-02 10:48   ` Vivek Kumar Gautam
  2021-09-03 15:15     ` Alexandru Elisei
  0 siblings, 1 reply; 15+ messages in thread
From: Vivek Kumar Gautam @ 2021-09-02 10:48 UTC (permalink / raw)
  To: Alexandru Elisei, will, julien.thierry.kdev, kvm
  Cc: andre.przywara, lorenzo.pieralisi, jean-philippe, eric.auger

Hi Alex,


On 9/2/21 3:29 PM, Alexandru Elisei wrote:
> Hi Vivek,
>
> On 8/10/21 7:25 AM, Vivek Gautam wrote:
>> Add support to parse extended configuration space for vfio based
>> assigned PCIe devices and add extended capabilities for the device
>> in the guest. This allows the guest to see and work on extended
>> capabilities, for example to toggle PRI extended cap to enable and
>> disable Shared virtual addressing (SVA) support.
>> PCIe extended capability header that is the first DWORD of all
>> extended caps is shown below -
>>
>>     31               20  19   16  15                 0
>>     ____________________|_______|_____________________
>>    |    Next cap off    |  1h   |     Cap ID          |
>>    |____________________|_______|_____________________|
>>
>> Out of the two upper bytes of extended cap header, the
>> lower nibble is actually cap version - 0x1.
>> 'next cap offset' if present at bits [31:20], should be
>> right shifted by 4 bits to calculate the position of next
>> capability.
>> This change supports parsing and adding ATS, PRI and PASID caps.
>>
>> Signed-off-by: Vivek Gautam <vivek.gautam@arm.com>
>> ---
>>   include/kvm/pci.h |   6 +++
>>   vfio/pci.c        | 104 ++++++++++++++++++++++++++++++++++++++++++----
>>   2 files changed, 103 insertions(+), 7 deletions(-)
>>
>> diff --git a/include/kvm/pci.h b/include/kvm/pci.h
>> index 0f2d5bb..a365337 100644
>> --- a/include/kvm/pci.h
>> +++ b/include/kvm/pci.h
>> @@ -165,6 +165,12 @@ struct pci_exp_cap {
>>      u32 root_status;
>>   };
>>
>> +struct pci_ext_cap_hdr {
>> +    u16     type;
>> +    /* bit 19:16 = 0x1: Cap version */
>
> I believe bits 19:16 are the cap version if you look at the header as a 32bit
> value (next:type). If you actually want to set those bits, you need to set bits
> 3:0 of the next field. I believe the comment should reflect that because it's
> slightly confusing (no field is larger than 16 bits, for example). Or you can move
> the comment at the top of the struct and keep it as it is.

Right, bit [19:16] for u16 would be a wrong interpretation. I will move
this comment to top of the structure.

>
>> +    u16     next;
>> +};
>> +
>>   struct pci_device_header;
>>
>>   typedef int (*bar_activate_fn_t)(struct kvm *kvm,
>> diff --git a/vfio/pci.c b/vfio/pci.c
>> index ea33fd6..d045e0d 100644
>> --- a/vfio/pci.c
>> +++ b/vfio/pci.c
>> @@ -665,19 +665,105 @@ static int vfio_pci_add_cap(struct vfio_device *vdev, u8 *virt_hdr,
>>      return 0;
>>   }
>>
>> +static ssize_t vfio_pci_ext_cap_size(struct pci_ext_cap_hdr *cap_hdr)
>> +{
>> +    switch (cap_hdr->type) {
>> +    case PCI_EXT_CAP_ID_ATS:
>> +            return PCI_EXT_CAP_ATS_SIZEOF;
>> +    case PCI_EXT_CAP_ID_PRI:
>> +            return PCI_EXT_CAP_PRI_SIZEOF;
>> +    case PCI_EXT_CAP_ID_PASID:
>> +            return PCI_EXT_CAP_PASID_SIZEOF;
>> +    default:
>> +            pr_err("unknown extended PCI capability 0x%x", cap_hdr->type);
>> +            return 0;
>> +    }
>> +}
>> +
>> +static int vfio_pci_add_ext_cap(struct vfio_device *vdev, u8 *virt_hdr,
>> +                        struct pci_ext_cap_hdr *cap, off_t pos)
>> +{
>> +    struct pci_ext_cap_hdr *last;
>> +
>> +    cap->next = 0;
>> +    last = PCI_CAP(virt_hdr, 0x100);
>> +
>> +    while (last->next)
>> +            last = PCI_CAP(virt_hdr, last->next);
>> +
>> +    last->next = pos;
>> +    /*
>> +     * Out of the two upper bytes of extended cap header, the
>> +     * nibble [19:16] is actually cap version that should be 0x1,
>> +     * so shift back the actual 'next' value by 4 bits.
>> +     */
>> +    last->next = (last->next << 4) | 0x1;
>> +    memcpy(virt_hdr + pos, cap, vfio_pci_ext_cap_size(cap));
>> +
>> +    return 0;
>> +
>> +}
>> +
>> +static int vfio_pci_parse_ext_caps(struct vfio_device *vdev, u8 *virt_hdr)
>> +{
>> +    int ret;
>> +    u16 pos, next;
>> +    struct pci_ext_cap_hdr *ext_cap;
>> +    struct vfio_pci_device *pdev = &vdev->pci;
>> +
>> +    /* Extended cap only for PCIe devices */
>
> Devices are PCI Express if they have the PCI Express Capability (this is also how
> Linux tells them apart). The arch_has_pci_exp() is meant to check that the
> architecture kvmtool has been compiled for can emulate a PCI Express bus (as
> apposed to a legacy PCI bus). For example, when you compile kvmtool for x86, you
> will get a legacy PCI bus.
>
> I'm not saying the check is bad, because it definitely should be done, but if what
> you're trying to do is to check that the device is a PCI Express capable device,
> then you also need to have a look at the PCI Express Capability like Andre suggested.

Yes, better to check the 'presence' of PCI express caps after reading
the extended configuration space rather than relying on a static check.
I will add as Andre suggested.

>
>> +    if (!arch_has_pci_exp())
>> +            return 0;
>> +
>> +    /* Extended caps start from 0x100 offset. */
>> +    pos = 0x100;
>> +
>> +    for (; pos; pos = next) {
>> +            ext_cap = PCI_CAP(&pdev->hdr, pos);
>> +            /*
>> +             * Out of the two upper bytes of extended cap header, the
>> +             * lowest nibble is actually cap version.
>> +             * 'next cap offset' if present at bits [31:20], while
>> +             * bits [19:16] are set to 1 to indicate cap version.
>> +             * So to get position of next cap right shift by 4 bits.
>> +             */
>> +            next = (ext_cap->next >> 4);
>> +
>> +            switch (ext_cap->type) {
>> +            case PCI_EXT_CAP_ID_ATS:
>> +                    ret = vfio_pci_add_ext_cap(vdev, virt_hdr, ext_cap, pos);
>> +                    if (ret)
>> +                            return ret;
>> +                    break;
>> +            case PCI_EXT_CAP_ID_PRI:
>> +                    ret = vfio_pci_add_ext_cap(vdev, virt_hdr, ext_cap, pos);
>> +                    if (ret)
>> +                            return ret;
>> +                    break;
>> +            case PCI_EXT_CAP_ID_PASID:
>> +                    ret = vfio_pci_add_ext_cap(vdev, virt_hdr, ext_cap, pos);
>> +                    if (ret)
>> +                            return ret;
>> +                    break;
>> +            }
>> +    }
>> +
>> +    return 0;
>> +}
>> +
>>   static int vfio_pci_parse_caps(struct vfio_device *vdev)
>>   {
>>      int ret;
>>      size_t size;
>>      u16 pos, next;
>>      struct pci_cap_hdr *cap;
>> -    u8 virt_hdr[PCI_DEV_CFG_SIZE_LEGACY];
>> +    u8 virt_hdr[PCI_DEV_CFG_SIZE];
>>      struct vfio_pci_device *pdev = &vdev->pci;
>>
>>      if (!(pdev->hdr.status & PCI_STATUS_CAP_LIST))
>>              return 0;
>>
>> -    memset(virt_hdr, 0, PCI_DEV_CFG_SIZE_LEGACY);
>> +    memset(virt_hdr, 0, PCI_DEV_CFG_SIZE);
>>
>>      pos = pdev->hdr.capabilities & ~3;
>>
>> @@ -715,9 +801,13 @@ static int vfio_pci_parse_caps(struct vfio_device *vdev)
>>              }
>>      }
>>
>> +    ret = vfio_pci_parse_ext_caps(vdev, virt_hdr);
>> +    if (ret)
>> +            return ret;
>> +
>>      /* Wipe remaining capabilities */
>>      pos = PCI_STD_HEADER_SIZEOF;
>> -    size = PCI_DEV_CFG_SIZE_LEGACY - PCI_STD_HEADER_SIZEOF;
>> +    size = PCI_DEV_CFG_SIZE - PCI_STD_HEADER_SIZEOF;
>>      memcpy((void *)&pdev->hdr + pos, virt_hdr + pos, size);
>>
>>      return 0;
>> @@ -725,7 +815,7 @@ static int vfio_pci_parse_caps(struct vfio_device *vdev)
>>
>>   static int vfio_pci_parse_cfg_space(struct vfio_device *vdev)
>>   {
>> -    ssize_t sz = PCI_DEV_CFG_SIZE_LEGACY;
>> +    ssize_t sz = PCI_DEV_CFG_SIZE;
>>      struct vfio_region_info *info;
>>      struct vfio_pci_device *pdev = &vdev->pci;
>>
>> @@ -831,10 +921,10 @@ static int vfio_pci_fixup_cfg_space(struct vfio_device *vdev)
>>      /* Install our fake Configuration Space */
>>      info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
>>      /*
>> -     * We don't touch the extended configuration space, let's be cautious
>> -     * and not overwrite it all with zeros, or bad things might happen.
>> +     * Update the extended configuration space as well since we
>> +     * are now populating the extended capabilities.
>>       */
>> -    hdr_sz = PCI_DEV_CFG_SIZE_LEGACY;
>> +    hdr_sz = PCI_DEV_CFG_SIZE;
>
> In one of the earlier versions of the PCI Express patches I was doing the same
> thing here - overwriting the entire PCI Express configuration space for a device.
> However, that made one of the devices I was using for testing stop working when
> assigned to a VM.
>
> I'll go through my testing notes and test it again, the cause of the failure might
> have been something else entirely which was fixed since then.

Sure. Let me know your findings.

Best regards
Vivek

>
> Thanks,
>
> Alex
>
>>      if (pwrite(vdev->fd, &pdev->hdr, hdr_sz, info->offset) != hdr_sz) {
>>              vfio_dev_err(vdev, "failed to write %zd bytes to Config Space",
>>                           hdr_sz);
IMPORTANT NOTICE: The contents of this email and any attachments are confidential and may also be privileged. If you are not the intended recipient, please notify the sender immediately and do not disclose the contents to any other person, use it for any purpose, or store or copy the information in any medium. Thank you.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] vfio/pci: Add support for PCIe extended capabilities
  2021-09-02 10:48   ` Vivek Kumar Gautam
@ 2021-09-03 15:15     ` Alexandru Elisei
  2021-09-08  9:32       ` Vivek Kumar Gautam
  0 siblings, 1 reply; 15+ messages in thread
From: Alexandru Elisei @ 2021-09-03 15:15 UTC (permalink / raw)
  To: Vivek Kumar Gautam, will, julien.thierry.kdev, kvm
  Cc: andre.przywara, lorenzo.pieralisi, jean-philippe, eric.auger

Hi Vivek,

On 9/2/21 11:48 AM, Vivek Kumar Gautam wrote:
> Hi Alex,
>
>
> On 9/2/21 3:29 PM, Alexandru Elisei wrote:
>> Hi Vivek,
>>
>> On 8/10/21 7:25 AM, Vivek Gautam wrote:
>>> Add support to parse extended configuration space for vfio based
>>> assigned PCIe devices and add extended capabilities for the device
>>> in the guest. This allows the guest to see and work on extended
>>> capabilities, for example to toggle PRI extended cap to enable and
>>> disable Shared virtual addressing (SVA) support.
>>> PCIe extended capability header that is the first DWORD of all
>>> extended caps is shown below -
>>>
>>>     31               20  19   16  15                 0
>>>     ____________________|_______|_____________________
>>>    |    Next cap off    |  1h   |     Cap ID          |
>>>    |____________________|_______|_____________________|
>>>
>>> Out of the two upper bytes of extended cap header, the
>>> lower nibble is actually cap version - 0x1.
>>> 'next cap offset' if present at bits [31:20], should be
>>> right shifted by 4 bits to calculate the position of next
>>> capability.
>>> This change supports parsing and adding ATS, PRI and PASID caps.
>>>
>>> Signed-off-by: Vivek Gautam <vivek.gautam@arm.com>
>>> ---
>>>   include/kvm/pci.h |   6 +++
>>>   vfio/pci.c        | 104 ++++++++++++++++++++++++++++++++++++++++++----
>>>   2 files changed, 103 insertions(+), 7 deletions(-)
[..]
>>> @@ -725,7 +815,7 @@ static int vfio_pci_parse_caps(struct vfio_device *vdev)
>>>     static int vfio_pci_parse_cfg_space(struct vfio_device *vdev)
>>>   {
>>> -    ssize_t sz = PCI_DEV_CFG_SIZE_LEGACY;
>>> +    ssize_t sz = PCI_DEV_CFG_SIZE;
>>>       struct vfio_region_info *info;
>>>       struct vfio_pci_device *pdev = &vdev->pci;
>>>   @@ -831,10 +921,10 @@ static int vfio_pci_fixup_cfg_space(struct vfio_device
>>> *vdev)
>>>       /* Install our fake Configuration Space */
>>>       info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
>>>       /*
>>> -     * We don't touch the extended configuration space, let's be cautious
>>> -     * and not overwrite it all with zeros, or bad things might happen.
>>> +     * Update the extended configuration space as well since we
>>> +     * are now populating the extended capabilities.
>>>        */
>>> -    hdr_sz = PCI_DEV_CFG_SIZE_LEGACY;
>>> +    hdr_sz = PCI_DEV_CFG_SIZE;
>>
>> In one of the earlier versions of the PCI Express patches I was doing the same
>> thing here - overwriting the entire PCI Express configuration space for a device.
>> However, that made one of the devices I was using for testing stop working when
>> assigned to a VM.
>>
>> I'll go through my testing notes and test it again, the cause of the failure might
>> have been something else entirely which was fixed since then.
>
> Sure. Let me know your findings.

I think I found the card that doesn't work when overwriting the extended device
configuration space. I tried device assignment with a Realtek 8168 Gigabit
Ethernet card on a Seattle machine, and the host freezes when I try to start a VM.
Even after reset, the machine doesn't boot anymore and it gets stuck during the
boot process at this message:

NewPackageList status: EFI_SUCCESS
BDS.SignalConnectDriversEvent(feeb6d60)
BDS.ConnectRootBridgeHandles(feeb6db0)

It doesn't go away no matter how many times I reset the machine, to get it booting
again I have to pull the plug and plug it again. I tried assigning the device to a
VM several times, and this happened every time. The card doesn't have the caps
that you added, this is caused entirely by the config space write (tried it with
only the config space change).

It could be a problem kvmtool, with Linux or with the machine, but this is the
only machine where device assignment works and I would like to keep it working
with this NIC.

One solution I see is to add a field to vfio_pci_device (something like has_pcie),
and based on that, vfio_pci_fixup_cfg_space() could overwrite only the first 256
bytes or the entire device configuration space.

It's also not clear to me what you are trying to achieve with this patch. Is there
a particular device that you want to get working? Or an entire class of devices
which have those features? If it's the former, you could have the size of the
config space write depend on the vendor + device id. If it's the latter, we could
key the size of the config space write based on the presence of those particular
PCIE caps and try and fix other devices if they break.

Will, Andre, do you see other solutions? Do you have a preference?

Thanks,

Alex

>
> Best regards
> Vivek
>
>>
>> Thanks,
>>
>> Alex
>>
>>>       if (pwrite(vdev->fd, &pdev->hdr, hdr_sz, info->offset) != hdr_sz) {
>>>           vfio_dev_err(vdev, "failed to write %zd bytes to Config Space",
>>>                    hdr_sz);

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] vfio/pci: Add support for PCIe extended capabilities
  2021-09-03 15:15     ` Alexandru Elisei
@ 2021-09-08  9:32       ` Vivek Kumar Gautam
  2021-09-08  9:36         ` Vivek Kumar Gautam
                           ` (2 more replies)
  0 siblings, 3 replies; 15+ messages in thread
From: Vivek Kumar Gautam @ 2021-09-08  9:32 UTC (permalink / raw)
  To: Alexandru Elisei, will, julien.thierry.kdev, kvm
  Cc: andre.przywara, lorenzo.pieralisi, jean-philippe, eric.auger

Hi Alex,


On 9/3/21 8:45 PM, Alexandru Elisei wrote:
> Hi Vivek,
>
> On 9/2/21 11:48 AM, Vivek Kumar Gautam wrote:
>> Hi Alex,
>>
>>
>> On 9/2/21 3:29 PM, Alexandru Elisei wrote:
>>> Hi Vivek,
>>>
>>> On 8/10/21 7:25 AM, Vivek Gautam wrote:
>>>> Add support to parse extended configuration space for vfio based
>>>> assigned PCIe devices and add extended capabilities for the device
>>>> in the guest. This allows the guest to see and work on extended
>>>> capabilities, for example to toggle PRI extended cap to enable and
>>>> disable Shared virtual addressing (SVA) support.
>>>> PCIe extended capability header that is the first DWORD of all
>>>> extended caps is shown below -
>>>>
>>>>      31               20  19   16  15                 0
>>>>      ____________________|_______|_____________________
>>>>     |    Next cap off    |  1h   |     Cap ID          |
>>>>     |____________________|_______|_____________________|
>>>>
>>>> Out of the two upper bytes of extended cap header, the
>>>> lower nibble is actually cap version - 0x1.
>>>> 'next cap offset' if present at bits [31:20], should be
>>>> right shifted by 4 bits to calculate the position of next
>>>> capability.
>>>> This change supports parsing and adding ATS, PRI and PASID caps.
>>>>
>>>> Signed-off-by: Vivek Gautam <vivek.gautam@arm.com>
>>>> ---
>>>>    include/kvm/pci.h |   6 +++
>>>>    vfio/pci.c        | 104 ++++++++++++++++++++++++++++++++++++++++++----
>>>>    2 files changed, 103 insertions(+), 7 deletions(-)
> [..]
>>>> @@ -725,7 +815,7 @@ static int vfio_pci_parse_caps(struct vfio_device *vdev)
>>>>      static int vfio_pci_parse_cfg_space(struct vfio_device *vdev)
>>>>    {
>>>> -    ssize_t sz = PCI_DEV_CFG_SIZE_LEGACY;
>>>> +    ssize_t sz = PCI_DEV_CFG_SIZE;
>>>>        struct vfio_region_info *info;
>>>>        struct vfio_pci_device *pdev = &vdev->pci;
>>>>    @@ -831,10 +921,10 @@ static int vfio_pci_fixup_cfg_space(struct vfio_device
>>>> *vdev)
>>>>        /* Install our fake Configuration Space */
>>>>        info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
>>>>        /*
>>>> -     * We don't touch the extended configuration space, let's be cautious
>>>> -     * and not overwrite it all with zeros, or bad things might happen.
>>>> +     * Update the extended configuration space as well since we
>>>> +     * are now populating the extended capabilities.
>>>>         */
>>>> -    hdr_sz = PCI_DEV_CFG_SIZE_LEGACY;
>>>> +    hdr_sz = PCI_DEV_CFG_SIZE;
>>>
>>> In one of the earlier versions of the PCI Express patches I was doing the same
>>> thing here - overwriting the entire PCI Express configuration space for a device.
>>> However, that made one of the devices I was using for testing stop working when
>>> assigned to a VM.
>>>
>>> I'll go through my testing notes and test it again, the cause of the failure might
>>> have been something else entirely which was fixed since then.
>>
>> Sure. Let me know your findings.
>
> I think I found the card that doesn't work when overwriting the extended device
> configuration space. I tried device assignment with a Realtek 8168 Gigabit
> Ethernet card on a Seattle machine, and the host freezes when I try to start a VM.
> Even after reset, the machine doesn't boot anymore and it gets stuck during the
> boot process at this message:
>
> NewPackageList status: EFI_SUCCESS
> BDS.SignalConnectDriversEvent(feeb6d60)
> BDS.ConnectRootBridgeHandles(feeb6db0)
>
> It doesn't go away no matter how many times I reset the machine, to get it booting
> again I have to pull the plug and plug it again. I tried assigning the device to a
> VM several times, and this happened every time. The card doesn't have the caps
> that you added, this is caused entirely by the config space write (tried it with
> only the config space change).
>
> It could be a problem kvmtool, with Linux or with the machine, but this is the
> only machine where device assignment works and I would like to keep it working
> with this NIC.

Sorry for the delay in responding. I took sometime off work.
Sure, we will try to keep your machine working :)

>
> One solution I see is to add a field to vfio_pci_device (something like has_pcie),
> and based on that, vfio_pci_fixup_cfg_space() could overwrite only the first 256
> bytes or the entire device configuration space.

Does the card support PCI extended caps (as seen from the PCI spec v5.0
section-7.5)?
If no, then I guess the check that I am planning to add - to check if
the device supports extended Caps - can help here. Since we would add
extended caps based on the mentioned check, it seems only valid to have
that check before overwriting the configuration space.

>
> It's also not clear to me what you are trying to achieve with this patch. Is there
> a particular device that you want to get working? Or an entire class of devices
> which have those features? If it's the former, you could have the size of the
> config space write depend on the vendor + device id. If it's the latter, we could
> key the size of the config space write based on the presence of those particular
> PCIE caps and try and fix other devices if they break.

Absolutely, we can check for the presence of PCI extended capabilities
and based on that write the configuration space. If the device has issue
with only a specific extended capability we can try to fix that by
keying the DevID-VendorID pair? What do you think?

>
> Will, Andre, do you see other solutions? Do you have a preference?

Will, Andre, please let me know as well if you have any preferences.

Best regards
Vivek

>
> Thanks,
>
> Alex
>
>>
>> Best regards
>> Vivek
>>
>>>
>>> Thanks,
>>>
>>> Alex
>>>
>>>>        if (pwrite(vdev->fd, &pdev->hdr, hdr_sz, info->offset) != hdr_sz) {
>>>>            vfio_dev_err(vdev, "failed to write %zd bytes to Config Space",
>>>>                     hdr_sz);
IMPORTANT NOTICE: The contents of this email and any attachments are confidential and may also be privileged. If you are not the intended recipient, please notify the sender immediately and do not disclose the contents to any other person, use it for any purpose, or store or copy the information in any medium. Thank you.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] vfio/pci: Add support for PCIe extended capabilities
  2021-09-08  9:32       ` Vivek Kumar Gautam
@ 2021-09-08  9:36         ` Vivek Kumar Gautam
  2021-09-08 16:20           ` Alexandru Elisei
  2021-09-08 16:18         ` Alexandru Elisei
  2021-10-04  9:47         ` Will Deacon
  2 siblings, 1 reply; 15+ messages in thread
From: Vivek Kumar Gautam @ 2021-09-08  9:36 UTC (permalink / raw)
  To: Alexandru Elisei, will, julien.thierry.kdev, kvm
  Cc: andre.przywara, lorenzo.pieralisi, jean-philippe, eric.auger



On 9/8/21 3:02 PM, Vivek Kumar Gautam wrote:
> Hi Alex,
>
>
> On 9/3/21 8:45 PM, Alexandru Elisei wrote:
>> Hi Vivek,
>>
>> On 9/2/21 11:48 AM, Vivek Kumar Gautam wrote:
>>> Hi Alex,
>>>
>>>
>>> On 9/2/21 3:29 PM, Alexandru Elisei wrote:
>>>> Hi Vivek,
>>>>
>>>> On 8/10/21 7:25 AM, Vivek Gautam wrote:
>>>>> Add support to parse extended configuration space for vfio based
>>>>> assigned PCIe devices and add extended capabilities for the device
>>>>> in the guest. This allows the guest to see and work on extended
>>>>> capabilities, for example to toggle PRI extended cap to enable and
>>>>> disable Shared virtual addressing (SVA) support.
>>>>> PCIe extended capability header that is the first DWORD of all
>>>>> extended caps is shown below -
>>>>>
>>>>>      31               20  19   16  15                 0
>>>>>      ____________________|_______|_____________________
>>>>>     |    Next cap off    |  1h   |     Cap ID          |
>>>>>     |____________________|_______|_____________________|
>>>>>
>>>>> Out of the two upper bytes of extended cap header, the
>>>>> lower nibble is actually cap version - 0x1.
>>>>> 'next cap offset' if present at bits [31:20], should be
>>>>> right shifted by 4 bits to calculate the position of next
>>>>> capability.
>>>>> This change supports parsing and adding ATS, PRI and PASID caps.
>>>>>
>>>>> Signed-off-by: Vivek Gautam <vivek.gautam@arm.com>
>>>>> ---
>>>>>    include/kvm/pci.h |   6 +++
>>>>>    vfio/pci.c        | 104
>>>>> ++++++++++++++++++++++++++++++++++++++++++----
>>>>>    2 files changed, 103 insertions(+), 7 deletions(-)
>> [..]
>>>>> @@ -725,7 +815,7 @@ static int vfio_pci_parse_caps(struct
>>>>> vfio_device *vdev)
>>>>>      static int vfio_pci_parse_cfg_space(struct vfio_device *vdev)
>>>>>    {
>>>>> -    ssize_t sz = PCI_DEV_CFG_SIZE_LEGACY;
>>>>> +    ssize_t sz = PCI_DEV_CFG_SIZE;
>>>>>        struct vfio_region_info *info;
>>>>>        struct vfio_pci_device *pdev = &vdev->pci;
>>>>>    @@ -831,10 +921,10 @@ static int vfio_pci_fixup_cfg_space(struct
>>>>> vfio_device
>>>>> *vdev)
>>>>>        /* Install our fake Configuration Space */
>>>>>        info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
>>>>>        /*
>>>>> -     * We don't touch the extended configuration space, let's be
>>>>> cautious
>>>>> -     * and not overwrite it all with zeros, or bad things might
>>>>> happen.
>>>>> +     * Update the extended configuration space as well since we
>>>>> +     * are now populating the extended capabilities.
>>>>>         */
>>>>> -    hdr_sz = PCI_DEV_CFG_SIZE_LEGACY;
>>>>> +    hdr_sz = PCI_DEV_CFG_SIZE;
>>>>
>>>> In one of the earlier versions of the PCI Express patches I was
>>>> doing the same
>>>> thing here - overwriting the entire PCI Express configuration space
>>>> for a device.
>>>> However, that made one of the devices I was using for testing stop
>>>> working when
>>>> assigned to a VM.
>>>>
>>>> I'll go through my testing notes and test it again, the cause of the
>>>> failure might
>>>> have been something else entirely which was fixed since then.
>>>
>>> Sure. Let me know your findings.
>>
>> I think I found the card that doesn't work when overwriting the
>> extended device
>> configuration space. I tried device assignment with a Realtek 8168
>> Gigabit
>> Ethernet card on a Seattle machine, and the host freezes when I try to
>> start a VM.
>> Even after reset, the machine doesn't boot anymore and it gets stuck
>> during the
>> boot process at this message:
>>
>> NewPackageList status: EFI_SUCCESS
>> BDS.SignalConnectDriversEvent(feeb6d60)
>> BDS.ConnectRootBridgeHandles(feeb6db0)
>>
>> It doesn't go away no matter how many times I reset the machine, to
>> get it booting
>> again I have to pull the plug and plug it again. I tried assigning the
>> device to a
>> VM several times, and this happened every time. The card doesn't have
>> the caps
>> that you added, this is caused entirely by the config space write
>> (tried it with
>> only the config space change).
>>
>> It could be a problem kvmtool, with Linux or with the machine, but
>> this is the
>> only machine where device assignment works and I would like to keep it
>> working
>> with this NIC.
>
> Sorry for the delay in responding. I took sometime off work.
> Sure, we will try to keep your machine working :)
>
>>
>> One solution I see is to add a field to vfio_pci_device (something
>> like has_pcie),
>> and based on that, vfio_pci_fixup_cfg_space() could overwrite only the
>> first 256
>> bytes or the entire device configuration space.
>
> Does the card support PCI extended caps (as seen from the PCI spec v5.0
> section-7.5)?
> If no, then I guess the check that I am planning to add - to check if
> the device supports extended Caps - can help here. Since we would add
> extended caps based on the mentioned check, it seems only valid to have
> that check before overwriting the configuration space.
>
>>
>> It's also not clear to me what you are trying to achieve with this
>> patch. Is there
>> a particular device that you want to get working? Or an entire class
>> of devices
>> which have those features? If it's the former, you could have the size
>> of the
>> config space write depend on the vendor + device id. If it's the
>> latter, we could
>> key the size of the config space write based on the presence of those
>> particular
>> PCIE caps and try and fix other devices if they break.

Sorry, I missed adding here that I am adding support for Shared virtual
addressing that would need PCI devices' ATS and PRI extended capability
to be exposed to the guest. That's the reason I am adding this change to
write the device configuration space in kvmtool with ATS and PRI caps.

BRs
Vivek

>
> Absolutely, we can check for the presence of PCI extended capabilities
> and based on that write the configuration space. If the device has issue
> with only a specific extended capability we can try to fix that by
> keying the DevID-VendorID pair? What do you think?
>
>>
>> Will, Andre, do you see other solutions? Do you have a preference?
>
> Will, Andre, please let me know as well if you have any preferences.
>
> Best regards
> Vivek
>
>>
>> Thanks,
>>
>> Alex
>>
>>>
>>> Best regards
>>> Vivek
>>>
>>>>
>>>> Thanks,
>>>>
>>>> Alex
>>>>
>>>>>        if (pwrite(vdev->fd, &pdev->hdr, hdr_sz, info->offset) !=
>>>>> hdr_sz) {
>>>>>            vfio_dev_err(vdev, "failed to write %zd bytes to Config
>>>>> Space",
>>>>>                     hdr_sz);
IMPORTANT NOTICE: The contents of this email and any attachments are confidential and may also be privileged. If you are not the intended recipient, please notify the sender immediately and do not disclose the contents to any other person, use it for any purpose, or store or copy the information in any medium. Thank you.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] vfio/pci: Add support for PCIe extended capabilities
  2021-09-08  9:32       ` Vivek Kumar Gautam
  2021-09-08  9:36         ` Vivek Kumar Gautam
@ 2021-09-08 16:18         ` Alexandru Elisei
  2021-10-04  9:47         ` Will Deacon
  2 siblings, 0 replies; 15+ messages in thread
From: Alexandru Elisei @ 2021-09-08 16:18 UTC (permalink / raw)
  To: Vivek Kumar Gautam, will, julien.thierry.kdev, kvm
  Cc: andre.przywara, lorenzo.pieralisi, jean-philippe, eric.auger

Hi Vivek,

On 9/8/21 10:32 AM, Vivek Kumar Gautam wrote:
> Hi Alex,
>
>
> On 9/3/21 8:45 PM, Alexandru Elisei wrote:
>> Hi Vivek,
>>
>> On 9/2/21 11:48 AM, Vivek Kumar Gautam wrote:
>>> Hi Alex,
>>>
>>>
>>> On 9/2/21 3:29 PM, Alexandru Elisei wrote:
>>>> Hi Vivek,
>>>>
>>>> On 8/10/21 7:25 AM, Vivek Gautam wrote:
>>>>> Add support to parse extended configuration space for vfio based
>>>>> assigned PCIe devices and add extended capabilities for the device
>>>>> in the guest. This allows the guest to see and work on extended
>>>>> capabilities, for example to toggle PRI extended cap to enable and
>>>>> disable Shared virtual addressing (SVA) support.
>>>>> PCIe extended capability header that is the first DWORD of all
>>>>> extended caps is shown below -
>>>>>
>>>>>      31               20  19   16  15                 0
>>>>>      ____________________|_______|_____________________
>>>>>     |    Next cap off    |  1h   |     Cap ID          |
>>>>>     |____________________|_______|_____________________|
>>>>>
>>>>> Out of the two upper bytes of extended cap header, the
>>>>> lower nibble is actually cap version - 0x1.
>>>>> 'next cap offset' if present at bits [31:20], should be
>>>>> right shifted by 4 bits to calculate the position of next
>>>>> capability.
>>>>> This change supports parsing and adding ATS, PRI and PASID caps.
>>>>>
>>>>> Signed-off-by: Vivek Gautam <vivek.gautam@arm.com>
>>>>> ---
>>>>>    include/kvm/pci.h |   6 +++
>>>>>    vfio/pci.c        | 104 ++++++++++++++++++++++++++++++++++++++++++----
>>>>>    2 files changed, 103 insertions(+), 7 deletions(-)
>> [..]
>>>>> @@ -725,7 +815,7 @@ static int vfio_pci_parse_caps(struct vfio_device *vdev)
>>>>>      static int vfio_pci_parse_cfg_space(struct vfio_device *vdev)
>>>>>    {
>>>>> -    ssize_t sz = PCI_DEV_CFG_SIZE_LEGACY;
>>>>> +    ssize_t sz = PCI_DEV_CFG_SIZE;
>>>>>        struct vfio_region_info *info;
>>>>>        struct vfio_pci_device *pdev = &vdev->pci;
>>>>>    @@ -831,10 +921,10 @@ static int vfio_pci_fixup_cfg_space(struct vfio_device
>>>>> *vdev)
>>>>>        /* Install our fake Configuration Space */
>>>>>        info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
>>>>>        /*
>>>>> -     * We don't touch the extended configuration space, let's be cautious
>>>>> -     * and not overwrite it all with zeros, or bad things might happen.
>>>>> +     * Update the extended configuration space as well since we
>>>>> +     * are now populating the extended capabilities.
>>>>>         */
>>>>> -    hdr_sz = PCI_DEV_CFG_SIZE_LEGACY;
>>>>> +    hdr_sz = PCI_DEV_CFG_SIZE;
>>>>
>>>> In one of the earlier versions of the PCI Express patches I was doing the same
>>>> thing here - overwriting the entire PCI Express configuration space for a
>>>> device.
>>>> However, that made one of the devices I was using for testing stop working when
>>>> assigned to a VM.
>>>>
>>>> I'll go through my testing notes and test it again, the cause of the failure
>>>> might
>>>> have been something else entirely which was fixed since then.
>>>
>>> Sure. Let me know your findings.
>>
>> I think I found the card that doesn't work when overwriting the extended device
>> configuration space. I tried device assignment with a Realtek 8168 Gigabit
>> Ethernet card on a Seattle machine, and the host freezes when I try to start a VM.
>> Even after reset, the machine doesn't boot anymore and it gets stuck during the
>> boot process at this message:
>>
>> NewPackageList status: EFI_SUCCESS
>> BDS.SignalConnectDriversEvent(feeb6d60)
>> BDS.ConnectRootBridgeHandles(feeb6db0)
>>
>> It doesn't go away no matter how many times I reset the machine, to get it booting
>> again I have to pull the plug and plug it again. I tried assigning the device to a
>> VM several times, and this happened every time. The card doesn't have the caps
>> that you added, this is caused entirely by the config space write (tried it with
>> only the config space change).
>>
>> It could be a problem kvmtool, with Linux or with the machine, but this is the
>> only machine where device assignment works and I would like to keep it working
>> with this NIC.
>
> Sorry for the delay in responding. I took sometime off work.
> Sure, we will try to keep your machine working :)
>
>>
>> One solution I see is to add a field to vfio_pci_device (something like has_pcie),
>> and based on that, vfio_pci_fixup_cfg_space() could overwrite only the first 256
>> bytes or the entire device configuration space.
>
> Does the card support PCI extended caps (as seen from the PCI spec v5.0
> section-7.5)?
> If no, then I guess the check that I am planning to add - to check if the device
> supports extended Caps - can help here. Since we would add extended caps based
> on the mentioned check, it seems only valid to have that check before
> overwriting the configuration space.

The card is has PCIE support:

# lspci -v
[..]
01:00.0 Ethernet controller: Realtek Semiconductor Co., Ltd. RTL8111/8168/8411 PCI
Express Gigabit Ethernet Controller (rev 06)
    Subsystem: TP-LINK Technologies Co., Ltd. TG-3468 Gigabit PCI Express Network
Adapter
    Flags: bus master, fast devsel, latency 0, IRQ 255
    I/O ports at 1000 [size=256]
    Memory at 40000000 (64-bit, non-prefetchable) [size=4K]
    Memory at 100000000 (64-bit, prefetchable) [size=16K]
    Capabilities: [40] Power Management version 3
    Capabilities: [50] MSI: Enable- Count=1/1 Maskable- 64bit+
    Capabilities: [70] Express Endpoint, MSI 01
    Capabilities: [b0] MSI-X: Enable- Count=4 Masked-
    Capabilities: [d0] Vital Product Data
    Capabilities: [100] Advanced Error Reporting
    Capabilities: [140] Virtual Channel
    Capabilities: [160] Device Serial Number d7-40-00-00-68-4c-e0-00

>
>>
>> It's also not clear to me what you are trying to achieve with this patch. Is there
>> a particular device that you want to get working? Or an entire class of devices
>> which have those features? If it's the former, you could have the size of the
>> config space write depend on the vendor + device id. If it's the latter, we could
>> key the size of the config space write based on the presence of those particular
>> PCIE caps and try and fix other devices if they break.
>
> Absolutely, we can check for the presence of PCI extended capabilities and based
> on that write the configuration space. If the device has issue with only a
> specific extended capability we can try to fix that by keying the DevID-VendorID
> pair? What do you think?

My preference would be to overwrite the extended configuration space only for
those devices that have at least of the PCIE capabilities that this patch adds. If
a device doesn't have neither of ATS, PRI or PASID, then kvmtool should not touch
the extended configuration space, similar to how it behaves today.

Thanks,

Alex

>
>>
>> Will, Andre, do you see other solutions? Do you have a preference?
>
> Will, Andre, please let me know as well if you have any preferences.
>
> Best regards
> Vivek
>
>>
>> Thanks,
>>
>> Alex
>>
>>>
>>> Best regards
>>> Vivek
>>>
>>>>
>>>> Thanks,
>>>>
>>>> Alex
>>>>
>>>>>        if (pwrite(vdev->fd, &pdev->hdr, hdr_sz, info->offset) != hdr_sz) {
>>>>>            vfio_dev_err(vdev, "failed to write %zd bytes to Config Space",
>>>>>                     hdr_sz);

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] vfio/pci: Add support for PCIe extended capabilities
  2021-09-08  9:36         ` Vivek Kumar Gautam
@ 2021-09-08 16:20           ` Alexandru Elisei
  0 siblings, 0 replies; 15+ messages in thread
From: Alexandru Elisei @ 2021-09-08 16:20 UTC (permalink / raw)
  To: Vivek Kumar Gautam, will, julien.thierry.kdev, kvm
  Cc: andre.przywara, lorenzo.pieralisi, jean-philippe, eric.auger

Hi Vivek,

On 9/8/21 10:36 AM, Vivek Kumar Gautam wrote:
>
>
> On 9/8/21 3:02 PM, Vivek Kumar Gautam wrote:
>> Hi Alex,
>>
>>
>> On 9/3/21 8:45 PM, Alexandru Elisei wrote:
>>> Hi Vivek,
>>>
>>> On 9/2/21 11:48 AM, Vivek Kumar Gautam wrote:
>>>> Hi Alex,
>>>>
>>>>
>>>> On 9/2/21 3:29 PM, Alexandru Elisei wrote:
>>>>> Hi Vivek,
>>>>>
>>>>> On 8/10/21 7:25 AM, Vivek Gautam wrote:
>>>>>> Add support to parse extended configuration space for vfio based
>>>>>> assigned PCIe devices and add extended capabilities for the device
>>>>>> in the guest. This allows the guest to see and work on extended
>>>>>> capabilities, for example to toggle PRI extended cap to enable and
>>>>>> disable Shared virtual addressing (SVA) support.
>>>>>> PCIe extended capability header that is the first DWORD of all
>>>>>> extended caps is shown below -
>>>>>>
>>>>>>      31               20  19   16  15                 0
>>>>>>      ____________________|_______|_____________________
>>>>>>     |    Next cap off    |  1h   |     Cap ID          |
>>>>>>     |____________________|_______|_____________________|
>>>>>>
>>>>>> Out of the two upper bytes of extended cap header, the
>>>>>> lower nibble is actually cap version - 0x1.
>>>>>> 'next cap offset' if present at bits [31:20], should be
>>>>>> right shifted by 4 bits to calculate the position of next
>>>>>> capability.
>>>>>> This change supports parsing and adding ATS, PRI and PASID caps.
>>>>>>
>>>>>> Signed-off-by: Vivek Gautam <vivek.gautam@arm.com>
>>>>>> ---
>>>>>>    include/kvm/pci.h |   6 +++
>>>>>>    vfio/pci.c        | 104 ++++++++++++++++++++++++++++++++++++++++++----
>>>>>>    2 files changed, 103 insertions(+), 7 deletions(-)
>>> [..]
>>>>>> @@ -725,7 +815,7 @@ static int vfio_pci_parse_caps(struct vfio_device *vdev)
>>>>>>      static int vfio_pci_parse_cfg_space(struct vfio_device *vdev)
>>>>>>    {
>>>>>> -    ssize_t sz = PCI_DEV_CFG_SIZE_LEGACY;
>>>>>> +    ssize_t sz = PCI_DEV_CFG_SIZE;
>>>>>>        struct vfio_region_info *info;
>>>>>>        struct vfio_pci_device *pdev = &vdev->pci;
>>>>>>    @@ -831,10 +921,10 @@ static int vfio_pci_fixup_cfg_space(struct
>>>>>> vfio_device
>>>>>> *vdev)
>>>>>>        /* Install our fake Configuration Space */
>>>>>>        info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
>>>>>>        /*
>>>>>> -     * We don't touch the extended configuration space, let's be cautious
>>>>>> -     * and not overwrite it all with zeros, or bad things might happen.
>>>>>> +     * Update the extended configuration space as well since we
>>>>>> +     * are now populating the extended capabilities.
>>>>>>         */
>>>>>> -    hdr_sz = PCI_DEV_CFG_SIZE_LEGACY;
>>>>>> +    hdr_sz = PCI_DEV_CFG_SIZE;
>>>>>
>>>>> In one of the earlier versions of the PCI Express patches I was doing the same
>>>>> thing here - overwriting the entire PCI Express configuration space for a
>>>>> device.
>>>>> However, that made one of the devices I was using for testing stop working when
>>>>> assigned to a VM.
>>>>>
>>>>> I'll go through my testing notes and test it again, the cause of the failure
>>>>> might
>>>>> have been something else entirely which was fixed since then.
>>>>
>>>> Sure. Let me know your findings.
>>>
>>> I think I found the card that doesn't work when overwriting the extended device
>>> configuration space. I tried device assignment with a Realtek 8168 Gigabit
>>> Ethernet card on a Seattle machine, and the host freezes when I try to start a
>>> VM.
>>> Even after reset, the machine doesn't boot anymore and it gets stuck during the
>>> boot process at this message:
>>>
>>> NewPackageList status: EFI_SUCCESS
>>> BDS.SignalConnectDriversEvent(feeb6d60)
>>> BDS.ConnectRootBridgeHandles(feeb6db0)
>>>
>>> It doesn't go away no matter how many times I reset the machine, to get it
>>> booting
>>> again I have to pull the plug and plug it again. I tried assigning the device
>>> to a
>>> VM several times, and this happened every time. The card doesn't have the caps
>>> that you added, this is caused entirely by the config space write (tried it with
>>> only the config space change).
>>>
>>> It could be a problem kvmtool, with Linux or with the machine, but this is the
>>> only machine where device assignment works and I would like to keep it working
>>> with this NIC.
>>
>> Sorry for the delay in responding. I took sometime off work.
>> Sure, we will try to keep your machine working :)
>>
>>>
>>> One solution I see is to add a field to vfio_pci_device (something like
>>> has_pcie),
>>> and based on that, vfio_pci_fixup_cfg_space() could overwrite only the first 256
>>> bytes or the entire device configuration space.
>>
>> Does the card support PCI extended caps (as seen from the PCI spec v5.0
>> section-7.5)?
>> If no, then I guess the check that I am planning to add - to check if the
>> device supports extended Caps - can help here. Since we would add extended caps
>> based on the mentioned check, it seems only valid to have that check before
>> overwriting the configuration space.
>>
>>>
>>> It's also not clear to me what you are trying to achieve with this patch. Is
>>> there
>>> a particular device that you want to get working? Or an entire class of devices
>>> which have those features? If it's the former, you could have the size of the
>>> config space write depend on the vendor + device id. If it's the latter, we could
>>> key the size of the config space write based on the presence of those particular
>>> PCIE caps and try and fix other devices if they break.
>
> Sorry, I missed adding here that I am adding support for Shared virtual
> addressing that would need PCI devices' ATS and PRI extended capability to be
> exposed to the guest. That's the reason I am adding this change to write the
> device configuration space in kvmtool with ATS and PRI caps.

Actually, you did, it's my fault for not parsing the commit message properly.
Thanks for repeating it though.

Thanks,

Alex

>
> BRs
> Vivek
>
>>
>> Absolutely, we can check for the presence of PCI extended capabilities and
>> based on that write the configuration space. If the device has issue with only
>> a specific extended capability we can try to fix that by keying the
>> DevID-VendorID pair? What do you think?
>>
>>>
>>> Will, Andre, do you see other solutions? Do you have a preference?
>>
>> Will, Andre, please let me know as well if you have any preferences.
>>
>> Best regards
>> Vivek
>>
>>>
>>> Thanks,
>>>
>>> Alex
>>>
>>>>
>>>> Best regards
>>>> Vivek
>>>>
>>>>>
>>>>> Thanks,
>>>>>
>>>>> Alex
>>>>>
>>>>>>        if (pwrite(vdev->fd, &pdev->hdr, hdr_sz, info->offset) != hdr_sz) {
>>>>>>            vfio_dev_err(vdev, "failed to write %zd bytes to Config Space",
>>>>>>                     hdr_sz);

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] vfio/pci: Add support for PCIe extended capabilities
  2021-09-08  9:32       ` Vivek Kumar Gautam
  2021-09-08  9:36         ` Vivek Kumar Gautam
  2021-09-08 16:18         ` Alexandru Elisei
@ 2021-10-04  9:47         ` Will Deacon
  2 siblings, 0 replies; 15+ messages in thread
From: Will Deacon @ 2021-10-04  9:47 UTC (permalink / raw)
  To: Vivek Kumar Gautam
  Cc: Alexandru Elisei, julien.thierry.kdev, kvm, andre.przywara,
	lorenzo.pieralisi, jean-philippe, eric.auger

On Wed, Sep 08, 2021 at 03:02:20PM +0530, Vivek Kumar Gautam wrote:
> On 9/3/21 8:45 PM, Alexandru Elisei wrote:
> > On 9/2/21 11:48 AM, Vivek Kumar Gautam wrote:
> > > On 9/2/21 3:29 PM, Alexandru Elisei wrote:
> > I think I found the card that doesn't work when overwriting the extended device
> > configuration space. I tried device assignment with a Realtek 8168 Gigabit
> > Ethernet card on a Seattle machine, and the host freezes when I try to start a VM.
> > Even after reset, the machine doesn't boot anymore and it gets stuck during the
> > boot process at this message:
> > 
> > NewPackageList status: EFI_SUCCESS
> > BDS.SignalConnectDriversEvent(feeb6d60)
> > BDS.ConnectRootBridgeHandles(feeb6db0)
> > 
> > It doesn't go away no matter how many times I reset the machine, to get it booting
> > again I have to pull the plug and plug it again. I tried assigning the device to a
> > VM several times, and this happened every time. The card doesn't have the caps
> > that you added, this is caused entirely by the config space write (tried it with
> > only the config space change).
> > 
> > It could be a problem kvmtool, with Linux or with the machine, but this is the
> > only machine where device assignment works and I would like to keep it working
> > with this NIC.

There is at least a problem with the machine/firmware if you can get it
into this state.

> Sorry for the delay in responding. I took sometime off work.
> Sure, we will try to keep your machine working :)
> 
> > 
> > One solution I see is to add a field to vfio_pci_device (something like has_pcie),
> > and based on that, vfio_pci_fixup_cfg_space() could overwrite only the first 256
> > bytes or the entire device configuration space.
> 
> Does the card support PCI extended caps (as seen from the PCI spec v5.0
> section-7.5)?
> If no, then I guess the check that I am planning to add - to check if
> the device supports extended Caps - can help here. Since we would add
> extended caps based on the mentioned check, it seems only valid to have
> that check before overwriting the configuration space.
> 
> > 
> > It's also not clear to me what you are trying to achieve with this patch. Is there
> > a particular device that you want to get working? Or an entire class of devices
> > which have those features? If it's the former, you could have the size of the
> > config space write depend on the vendor + device id. If it's the latter, we could
> > key the size of the config space write based on the presence of those particular
> > PCIE caps and try and fix other devices if they break.
> 
> Absolutely, we can check for the presence of PCI extended capabilities
> and based on that write the configuration space. If the device has issue
> with only a specific extended capability we can try to fix that by
> keying the DevID-VendorID pair? What do you think?
> 
> > 
> > Will, Andre, do you see other solutions? Do you have a preference?
> 
> Will, Andre, please let me know as well if you have any preferences.

If it's straightforward to keep this working on Seattle, then let's do it,
but I don't think we should bend over backwards to support device assignment
on a dead platform (and I say that as a regular user of one of these
things!)

Will

^ permalink raw reply	[flat|nested] 15+ messages in thread

end of thread, other threads:[~2021-10-04  9:47 UTC | newest]

Thread overview: 15+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-08-10  6:25 [PATCH] vfio/pci: Add support for PCIe extended capabilities Vivek Gautam
2021-08-31 14:54 ` Will Deacon
2021-09-01 10:27   ` Andre Przywara
2021-09-01 10:44     ` Will Deacon
2021-09-02 10:28       ` Vivek Kumar Gautam
2021-08-31 17:14 ` Andre Przywara
2021-09-02 10:42   ` Vivek Kumar Gautam
2021-09-02  9:59 ` Alexandru Elisei
2021-09-02 10:48   ` Vivek Kumar Gautam
2021-09-03 15:15     ` Alexandru Elisei
2021-09-08  9:32       ` Vivek Kumar Gautam
2021-09-08  9:36         ` Vivek Kumar Gautam
2021-09-08 16:20           ` Alexandru Elisei
2021-09-08 16:18         ` Alexandru Elisei
2021-10-04  9:47         ` Will Deacon

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.