linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] PCI/IOV: Decrease VF memory BAR size to save host memory occupied by PTEs
@ 2022-10-11 11:23 Rui Ma
  2022-10-11 11:37 ` Leon Romanovsky
                   ` (2 more replies)
  0 siblings, 3 replies; 9+ messages in thread
From: Rui Ma @ 2022-10-11 11:23 UTC (permalink / raw)
  To: helgaas; +Cc: linux-kernel, linux-pci, Alexander.Deucher, bhelgaas, Rui Ma

In some certain SR-IOV scene, when the device physical space(such as Video
RAM)is fixed, as the number of VFs increases, some device driver may decrease
actual BAR memory space used by each VF. However, the VF BAR memory mapping is
always based on the usual BAR probing algorithm in PCIe spec. So do not map this
unneeded memory can save host memory which occupied by PTEs. Although each PTE
only occupies a few bytes of space on its own, a large number of PTEs can still
take up a lot of space.

Signed-off-by: Rui Ma <Rui.Ma@amd.com>
---
 drivers/pci/iov.c    | 14 ++++++++++++--
 drivers/pci/pci.h    | 15 +++++++++++++++
 drivers/pci/quirks.c | 37 +++++++++++++++++++++++++++++++++++++
 3 files changed, 64 insertions(+), 2 deletions(-)

diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
index 952217572113..92a69e51d85c 100644
--- a/drivers/pci/iov.c
+++ b/drivers/pci/iov.c
@@ -296,6 +296,14 @@ int pci_iov_add_virtfn(struct pci_dev *dev, int id)
 	struct pci_sriov *iov = dev->sriov;
 	struct pci_bus *bus;
 
+    /*
+     * Some SR-IOV device's BAR map range is larger than they can actually use.
+     * This extra BAR space occupy too much reverse mapping size(physical page
+     * back to the PTEs). So add a divisor shift parameter to resize the request
+     * resource of VF according to num of VFs.
+     */
+	u16 shift = 1;
+
 	bus = virtfn_add_bus(dev->bus, pci_iov_virtfn_bus(dev, id));
 	if (!bus)
 		goto failed;
@@ -328,8 +336,10 @@ int pci_iov_add_virtfn(struct pci_dev *dev, int id)
 		virtfn->resource[i].name = pci_name(virtfn);
 		virtfn->resource[i].flags = res->flags;
 		size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES);
+		shift = 1;
+		shift = virtfn_get_shift(dev, iov->num_VFs, i);
 		virtfn->resource[i].start = res->start + size * id;
-		virtfn->resource[i].end = virtfn->resource[i].start + size - 1;
+		virtfn->resource[i].end = virtfn->resource[i].start + (size >> (shift - 1)) - 1;
 		rc = request_resource(res, &virtfn->resource[i]);
 		BUG_ON(rc);
 	}
@@ -680,12 +690,12 @@ static int sriov_enable(struct pci_dev *dev, int nr_virtfn)
 	msleep(100);
 	pci_cfg_access_unlock(dev);
 
+	iov->num_VFs = nr_virtfn;
 	rc = sriov_add_vfs(dev, initial);
 	if (rc)
 		goto err_pcibios;
 
 	kobject_uevent(&dev->dev.kobj, KOBJ_CHANGE);
-	iov->num_VFs = nr_virtfn;
 
 	return 0;
 
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 3d60cabde1a1..befc67a280eb 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -603,6 +603,21 @@ static inline int pci_dev_specific_reset(struct pci_dev *dev, bool probe)
 }
 #endif
 
+struct virtfn_get_shift_methods {
+	u16 vendor;
+	u16 device;
+	u16 (*get_shift)(struct pci_dev *dev, u16 arg, int arg2);
+};
+
+#ifdef CONFIG_PCI_QUIRKS
+u16 virtfn_get_shift(struct pci_dev *dev, u16 arg1, int arg2);
+#else
+static inline u16 virtfn_get_shift(struct pci_dev *dev, u16 arg1, int arg2)
+{
+	return (u16)1;
+}
+#endif
+
 #if defined(CONFIG_PCI_QUIRKS) && defined(CONFIG_ARM64)
 int acpi_get_rc_resources(struct device *dev, const char *hid, u16 segment,
 			  struct resource *res);
diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
index da829274fc66..3466738c1c54 100644
--- a/drivers/pci/quirks.c
+++ b/drivers/pci/quirks.c
@@ -4085,6 +4085,43 @@ int pci_dev_specific_reset(struct pci_dev *dev, bool probe)
 	return -ENOTTY;
 }
 
+static u16 resize_vf_bar0(struct pci_dev *dev, u16 num_VFs, int bar_num)
+{
+	u16 shift = 1;
+
+	if (bar_num == 0) {
+		while ((1 << shift) <= num_VFs)
+			shift += 1;
+	}
+	pci_info(dev, "with %d VFs, VF BAR%d get shift: %d\n", num_VFs, bar_num, shift);
+	return shift;
+}
+
+static const struct virtfn_get_shift_methods virtfn_get_shift_methods[] = {
+	{ PCI_VENDOR_ID_ATI, 0x73a1, resize_vf_bar0},
+	{ 0 }
+};
+
+/*
+ * Get shift num to calculate SR-IOV device BAR. Sometimes the BAR size for
+ * SR-IOV device is too large and we want to calculate the size to define
+ * the end of virtfn.
+ */
+u16 virtfn_get_shift(struct pci_dev *dev, u16 arg1, int arg2)
+{
+	const struct virtfn_get_shift_methods *i;
+
+	for (i = virtfn_get_shift_methods; i->get_shift; i++) {
+		if ((i->vendor == dev->vendor ||
+		     i->vendor == (u16)PCI_ANY_ID) &&
+		    (i->device == dev->device ||
+		     i->device == (u16)PCI_ANY_ID))
+			return i->get_shift(dev, arg1, arg2);
+	}
+
+	return (u16)1;
+}
+
 static void quirk_dma_func0_alias(struct pci_dev *dev)
 {
 	if (PCI_FUNC(dev->devfn) != 0)
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [PATCH] PCI/IOV: Decrease VF memory BAR size to save host memory occupied by PTEs
  2022-10-11 11:23 [PATCH] PCI/IOV: Decrease VF memory BAR size to save host memory occupied by PTEs Rui Ma
@ 2022-10-11 11:37 ` Leon Romanovsky
  2022-10-11 11:48   ` David Laight
  2022-10-17  6:26 ` Christoph Hellwig
  2022-11-09 23:46 ` Bjorn Helgaas
  2 siblings, 1 reply; 9+ messages in thread
From: Leon Romanovsky @ 2022-10-11 11:37 UTC (permalink / raw)
  To: Rui Ma; +Cc: helgaas, linux-kernel, linux-pci, Alexander.Deucher, bhelgaas

On Tue, Oct 11, 2022 at 07:23:25PM +0800, Rui Ma wrote:
> In some certain SR-IOV scene, when the device physical space(such as Video
> RAM)is fixed, as the number of VFs increases, some device driver may decrease
> actual BAR memory space used by each VF. However, the VF BAR memory mapping is
> always based on the usual BAR probing algorithm in PCIe spec. So do not map this
> unneeded memory can save host memory which occupied by PTEs. Although each PTE
> only occupies a few bytes of space on its own, a large number of PTEs can still
> take up a lot of space.
> 
> Signed-off-by: Rui Ma <Rui.Ma@amd.com>
> ---
>  drivers/pci/iov.c    | 14 ++++++++++++--
>  drivers/pci/pci.h    | 15 +++++++++++++++
>  drivers/pci/quirks.c | 37 +++++++++++++++++++++++++++++++++++++
>  3 files changed, 64 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
> index 952217572113..92a69e51d85c 100644
> --- a/drivers/pci/iov.c
> +++ b/drivers/pci/iov.c
> @@ -296,6 +296,14 @@ int pci_iov_add_virtfn(struct pci_dev *dev, int id)
>  	struct pci_sriov *iov = dev->sriov;
>  	struct pci_bus *bus;
>  
> +    /*
> +     * Some SR-IOV device's BAR map range is larger than they can actually use.
> +     * This extra BAR space occupy too much reverse mapping size(physical page
> +     * back to the PTEs). So add a divisor shift parameter to resize the request
> +     * resource of VF according to num of VFs.
> +     */
> +	u16 shift = 1;
> +
>  	bus = virtfn_add_bus(dev->bus, pci_iov_virtfn_bus(dev, id));
>  	if (!bus)
>  		goto failed;
> @@ -328,8 +336,10 @@ int pci_iov_add_virtfn(struct pci_dev *dev, int id)
>  		virtfn->resource[i].name = pci_name(virtfn);
>  		virtfn->resource[i].flags = res->flags;
>  		size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES);
> +		shift = 1;
> +		shift = virtfn_get_shift(dev, iov->num_VFs, i);
>  		virtfn->resource[i].start = res->start + size * id;
> -		virtfn->resource[i].end = virtfn->resource[i].start + size - 1;
> +		virtfn->resource[i].end = virtfn->resource[i].start + (size >> (shift - 1)) - 1;
>  		rc = request_resource(res, &virtfn->resource[i]);
>  		BUG_ON(rc);
>  	}
> @@ -680,12 +690,12 @@ static int sriov_enable(struct pci_dev *dev, int nr_virtfn)
>  	msleep(100);
>  	pci_cfg_access_unlock(dev);
>  
> +	iov->num_VFs = nr_virtfn;
>  	rc = sriov_add_vfs(dev, initial);
>  	if (rc)
>  		goto err_pcibios;
>  
>  	kobject_uevent(&dev->dev.kobj, KOBJ_CHANGE);
> -	iov->num_VFs = nr_virtfn;
>  
>  	return 0;
>  
> diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
> index 3d60cabde1a1..befc67a280eb 100644
> --- a/drivers/pci/pci.h
> +++ b/drivers/pci/pci.h
> @@ -603,6 +603,21 @@ static inline int pci_dev_specific_reset(struct pci_dev *dev, bool probe)
>  }
>  #endif
>  
> +struct virtfn_get_shift_methods {
> +	u16 vendor;
> +	u16 device;
> +	u16 (*get_shift)(struct pci_dev *dev, u16 arg, int arg2);
> +};
> +
> +#ifdef CONFIG_PCI_QUIRKS
> +u16 virtfn_get_shift(struct pci_dev *dev, u16 arg1, int arg2);
> +#else
> +static inline u16 virtfn_get_shift(struct pci_dev *dev, u16 arg1, int arg2)
> +{
> +	return (u16)1;

<...>

> +	return (u16)1;

Why do you need these casts? You can omit them.

Thanks

> +}
> +
>  static void quirk_dma_func0_alias(struct pci_dev *dev)
>  {
>  	if (PCI_FUNC(dev->devfn) != 0)
> -- 
> 2.25.1
> 

^ permalink raw reply	[flat|nested] 9+ messages in thread

* RE: [PATCH] PCI/IOV: Decrease VF memory BAR size to save host memory occupied by PTEs
  2022-10-11 11:37 ` Leon Romanovsky
@ 2022-10-11 11:48   ` David Laight
  0 siblings, 0 replies; 9+ messages in thread
From: David Laight @ 2022-10-11 11:48 UTC (permalink / raw)
  To: 'Leon Romanovsky', Rui Ma
  Cc: helgaas, linux-kernel, linux-pci, Alexander.Deucher, bhelgaas

From: Leon Romanovsky
> Sent: 11 October 2022 12:37
> 
> On Tue, Oct 11, 2022 at 07:23:25PM +0800, Rui Ma wrote:
> > In some certain SR-IOV scene, when the device physical space(such as Video
> > RAM)is fixed, as the number of VFs increases, some device driver may decrease
> > actual BAR memory space used by each VF. However, the VF BAR memory mapping is
> > always based on the usual BAR probing algorithm in PCIe spec. So do not map this
> > unneeded memory can save host memory which occupied by PTEs. Although each PTE
> > only occupies a few bytes of space on its own, a large number of PTEs can still
> > take up a lot of space.
...
> > +    /*
> > +     * Some SR-IOV device's BAR map range is larger than they can actually use.
> > +     * This extra BAR space occupy too much reverse mapping size(physical page
> > +     * back to the PTEs). So add a divisor shift parameter to resize the request
> > +     * resource of VF according to num of VFs.
> > +     */
> > +	u16 shift = 1;

Why u16??

> > +		virtfn->resource[i].end = virtfn->resource[i].start + (size >> (shift - 1)) - 1;

The 'shift - 1' may require a mask.

...
> > +struct virtfn_get_shift_methods {
> > +	u16 vendor;
> > +	u16 device;
> > +	u16 (*get_shift)(struct pci_dev *dev, u16 arg, int arg2);

More pointless u16 - they just make the code larger.

...
> > +static inline u16 virtfn_get_shift(struct pci_dev *dev, u16 arg1, int arg2)
> > +{
> > +	return (u16)1;
> 
> <...>
> 
> > +	return (u16)1;
> 
> Why do you need these casts? You can omit them.

Kill all the u16

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] PCI/IOV: Decrease VF memory BAR size to save host memory occupied by PTEs
  2022-10-11 11:23 [PATCH] PCI/IOV: Decrease VF memory BAR size to save host memory occupied by PTEs Rui Ma
  2022-10-11 11:37 ` Leon Romanovsky
@ 2022-10-17  6:26 ` Christoph Hellwig
  2022-11-09 23:46 ` Bjorn Helgaas
  2 siblings, 0 replies; 9+ messages in thread
From: Christoph Hellwig @ 2022-10-17  6:26 UTC (permalink / raw)
  To: Rui Ma; +Cc: helgaas, linux-kernel, linux-pci, Alexander.Deucher, bhelgaas

Besides the various style issues:  I don't think we can just
hardcode specific devices in the PCIe core.  If we want to do this
we'll need something like PCIe extended capabilities or ACPI tables
to describe the behavior in a standardized way.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] PCI/IOV: Decrease VF memory BAR size to save host memory occupied by PTEs
  2022-10-11 11:23 [PATCH] PCI/IOV: Decrease VF memory BAR size to save host memory occupied by PTEs Rui Ma
  2022-10-11 11:37 ` Leon Romanovsky
  2022-10-17  6:26 ` Christoph Hellwig
@ 2022-11-09 23:46 ` Bjorn Helgaas
  2 siblings, 0 replies; 9+ messages in thread
From: Bjorn Helgaas @ 2022-11-09 23:46 UTC (permalink / raw)
  To: Rui Ma; +Cc: linux-kernel, linux-pci, Alexander.Deucher, bhelgaas

On Tue, Oct 11, 2022 at 07:23:25PM +0800, Rui Ma wrote:
> In some certain SR-IOV scene, when the device physical space(such as Video
> RAM)is fixed, as the number of VFs increases, some device driver may decrease
> actual BAR memory space used by each VF. However, the VF BAR memory mapping is
> always based on the usual BAR probing algorithm in PCIe spec. So do not map this
> unneeded memory can save host memory which occupied by PTEs. Although each PTE
> only occupies a few bytes of space on its own, a large number of PTEs can still
> take up a lot of space.

Dropping this for now until we resolve whether this is working around
a KVM bug as Alex suggests:

https://lore.kernel.org/r/BL1PR12MB51446437265DD1E8AA0794E9F7239@BL1PR12MB5144.namprd12.prod.outlook.com

> Signed-off-by: Rui Ma <Rui.Ma@amd.com>
> ---
>  drivers/pci/iov.c    | 14 ++++++++++++--
>  drivers/pci/pci.h    | 15 +++++++++++++++
>  drivers/pci/quirks.c | 37 +++++++++++++++++++++++++++++++++++++
>  3 files changed, 64 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
> index 952217572113..92a69e51d85c 100644
> --- a/drivers/pci/iov.c
> +++ b/drivers/pci/iov.c
> @@ -296,6 +296,14 @@ int pci_iov_add_virtfn(struct pci_dev *dev, int id)
>  	struct pci_sriov *iov = dev->sriov;
>  	struct pci_bus *bus;
>  
> +    /*
> +     * Some SR-IOV device's BAR map range is larger than they can actually use.
> +     * This extra BAR space occupy too much reverse mapping size(physical page
> +     * back to the PTEs). So add a divisor shift parameter to resize the request
> +     * resource of VF according to num of VFs.
> +     */
> +	u16 shift = 1;
> +
>  	bus = virtfn_add_bus(dev->bus, pci_iov_virtfn_bus(dev, id));
>  	if (!bus)
>  		goto failed;
> @@ -328,8 +336,10 @@ int pci_iov_add_virtfn(struct pci_dev *dev, int id)
>  		virtfn->resource[i].name = pci_name(virtfn);
>  		virtfn->resource[i].flags = res->flags;
>  		size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES);
> +		shift = 1;
> +		shift = virtfn_get_shift(dev, iov->num_VFs, i);
>  		virtfn->resource[i].start = res->start + size * id;
> -		virtfn->resource[i].end = virtfn->resource[i].start + size - 1;
> +		virtfn->resource[i].end = virtfn->resource[i].start + (size >> (shift - 1)) - 1;
>  		rc = request_resource(res, &virtfn->resource[i]);
>  		BUG_ON(rc);
>  	}
> @@ -680,12 +690,12 @@ static int sriov_enable(struct pci_dev *dev, int nr_virtfn)
>  	msleep(100);
>  	pci_cfg_access_unlock(dev);
>  
> +	iov->num_VFs = nr_virtfn;
>  	rc = sriov_add_vfs(dev, initial);
>  	if (rc)
>  		goto err_pcibios;
>  
>  	kobject_uevent(&dev->dev.kobj, KOBJ_CHANGE);
> -	iov->num_VFs = nr_virtfn;
>  
>  	return 0;
>  
> diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
> index 3d60cabde1a1..befc67a280eb 100644
> --- a/drivers/pci/pci.h
> +++ b/drivers/pci/pci.h
> @@ -603,6 +603,21 @@ static inline int pci_dev_specific_reset(struct pci_dev *dev, bool probe)
>  }
>  #endif
>  
> +struct virtfn_get_shift_methods {
> +	u16 vendor;
> +	u16 device;
> +	u16 (*get_shift)(struct pci_dev *dev, u16 arg, int arg2);
> +};
> +
> +#ifdef CONFIG_PCI_QUIRKS
> +u16 virtfn_get_shift(struct pci_dev *dev, u16 arg1, int arg2);
> +#else
> +static inline u16 virtfn_get_shift(struct pci_dev *dev, u16 arg1, int arg2)
> +{
> +	return (u16)1;
> +}
> +#endif
> +
>  #if defined(CONFIG_PCI_QUIRKS) && defined(CONFIG_ARM64)
>  int acpi_get_rc_resources(struct device *dev, const char *hid, u16 segment,
>  			  struct resource *res);
> diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
> index da829274fc66..3466738c1c54 100644
> --- a/drivers/pci/quirks.c
> +++ b/drivers/pci/quirks.c
> @@ -4085,6 +4085,43 @@ int pci_dev_specific_reset(struct pci_dev *dev, bool probe)
>  	return -ENOTTY;
>  }
>  
> +static u16 resize_vf_bar0(struct pci_dev *dev, u16 num_VFs, int bar_num)
> +{
> +	u16 shift = 1;
> +
> +	if (bar_num == 0) {
> +		while ((1 << shift) <= num_VFs)
> +			shift += 1;
> +	}
> +	pci_info(dev, "with %d VFs, VF BAR%d get shift: %d\n", num_VFs, bar_num, shift);
> +	return shift;
> +}
> +
> +static const struct virtfn_get_shift_methods virtfn_get_shift_methods[] = {
> +	{ PCI_VENDOR_ID_ATI, 0x73a1, resize_vf_bar0},
> +	{ 0 }
> +};
> +
> +/*
> + * Get shift num to calculate SR-IOV device BAR. Sometimes the BAR size for
> + * SR-IOV device is too large and we want to calculate the size to define
> + * the end of virtfn.
> + */
> +u16 virtfn_get_shift(struct pci_dev *dev, u16 arg1, int arg2)
> +{
> +	const struct virtfn_get_shift_methods *i;
> +
> +	for (i = virtfn_get_shift_methods; i->get_shift; i++) {
> +		if ((i->vendor == dev->vendor ||
> +		     i->vendor == (u16)PCI_ANY_ID) &&
> +		    (i->device == dev->device ||
> +		     i->device == (u16)PCI_ANY_ID))
> +			return i->get_shift(dev, arg1, arg2);
> +	}
> +
> +	return (u16)1;
> +}
> +
>  static void quirk_dma_func0_alias(struct pci_dev *dev)
>  {
>  	if (PCI_FUNC(dev->devfn) != 0)
> -- 
> 2.25.1
> 

^ permalink raw reply	[flat|nested] 9+ messages in thread

* RE: [PATCH] PCI/IOV: Decrease VF memory BAR size to save host memory occupied by PTEs
  2022-10-11 11:19   ` Ma, Rui
@ 2022-10-11 14:05     ` Deucher, Alexander
  0 siblings, 0 replies; 9+ messages in thread
From: Deucher, Alexander @ 2022-10-11 14:05 UTC (permalink / raw)
  To: Ma, Rui, Bjorn Helgaas, kvm
  Cc: bhelgaas, linux-pci, linux-kernel, Koenig, Christian

[AMD Official Use Only - General]

> -----Original Message-----
> From: Ma, Rui <Rui.Ma@amd.com>
> Sent: Tuesday, October 11, 2022 7:19 AM
> To: Bjorn Helgaas <helgaas@kernel.org>
> Cc: bhelgaas@google.com; linux-pci@vger.kernel.org; linux-
> kernel@vger.kernel.org; Deucher, Alexander
> <Alexander.Deucher@amd.com>
> Subject: RE: [PATCH] PCI/IOV: Decrease VF memory BAR size to save host
> memory occupied by PTEs
> 
> [AMD Official Use Only - General]
> 
> Hi Helgass:
> 	Thank you very much for your suggestions on my patch!
> 
> 	The patch is a device-specific behavior. In our AMD device SR-IOV,
> the actual VF BAR size is dependent on NumVFs too. If only one VF is
> created, the VF BAR size will depend on BAR probing algorithm as described
> in Section 9.3.3.14, but when several VFs created our own driver will
> decrease BAR0 memory size  according to NumVFs. So I want to add this
> quirk to keep Linux code and certain driver code consistent.
> 
> > Except that this doesn't affect the *starting* address of each VF BAR,
> which I guess is what you mean by "BAR memory mapping is always based on
> the initial device physical device."
> Yes we should not change the starting address or the device cannot load
> well.
> 
> > Well, I guess the device still describes its worst-case BAR size; the quirk
> basically just optimizes space usage.  Right?
> Yes.
> 
> > Aren't both virt and phys contiguous and nicely aligned for this case?  It
> seems like the perfect application for huge pages.
> It cannot use huge page though address aligned.

+ KVM, Christian

I think this is just working around a bug in KVM.  This is to avoid wasting huge amounts of memory for page tables due to KVM using 4K pages for some reason.  I think we should figure out why KVM is not using huge pages for this rather than working around this in the PCI layer.

Alex

> 
> >> +		shift = 1;
> >> +		shift = virtfn_get_shift(dev, iov->num_VFs, i);
> >Maybe more of the fiddling could be hidden in the quirk, e.g.,
> >  size = quirk_vf_bar_size(dev, iov->num_VFs, i);
> >  if (!size)
> >    size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES);
> If I hide get-shift func in the quirk is it concise to call pci_iov_resource_size()
> in quirk?
> 
> 	And I solved other issues in the patch sent later. Thank you for your
> patience!
> 
> 
> Regards,
> Rui
> 
> -----Original Message-----
> From: Bjorn Helgaas <helgaas@kernel.org>
> Sent: Wednesday, September 28, 2022 6:07 AM
> To: Ma, Rui <Rui.Ma@amd.com>
> Cc: bhelgaas@google.com; linux-pci@vger.kernel.org; linux-
> kernel@vger.kernel.org
> Subject: Re: [PATCH] PCI/IOV: Decrease VF memory BAR size to save host
> memory occupied by PTEs
> 
> On Mon, Sep 26, 2022 at 04:05:42PM +0800, Rui Ma wrote:
> > In SR_IOV scene, when the device physical space(such as Video RAM)is
> > fixed, as the number of VFs increases, the actual BAR memory space
> > used by each VF decreases.
> 
> s/SR_IOV/SR-IOV/ to match spec usage.
> 
> I think this is device-specific behavior, right?  I don't see anything in the PCIe
> spec about the BAR size being dependent on NumVFs.  If it's device-specific,
> it shouldn't be presented as "for all SR-IOV devices, the actual BAR memory
> space decreases as number of VFs increases."
> 
> > However, the BAR memory mapping is always based on the initial device
> > physical device. So do not map this unneeded memory can save host
> > memory occupied by PTEs. Although each PTE only occupies a few bytes
> > of space on its own, a large number of PTEs can still take up a lot of space.
> 
> So IIUC this is basically a quirk to override the "VF BAR aperture"
> size, which PCIe r6.0, sec 9.2.1.1.1 says is "determined by the usual BAR
> probing algorithm as described in Section 9.3.3.14."
> 
> Except that this doesn't affect the *starting* address of each VF BAR, which I
> guess is what you mean by "BAR memory mapping is always based on the
> initial device physical device."
> 
> Hmm.  This kind of breaks the "plug and play" model of PCI because the
> device is no longer self-describing.  Well, I guess the device still describes its
> worst-case BAR size; the quirk basically just optimizes space usage.  Right?
> 
> It's a shame if we can't reduce PTE usage by using hugeTLB pages for this.
> Aren't both virt and phys contiguous and nicely aligned for this case?  It
> seems like the perfect application for huge pages.
> 
> > Signed-off-by: Rui Ma <Rui.Ma@amd.com>
> > ---
> >  drivers/pci/iov.c    | 14 ++++++++++++--
> >  drivers/pci/pci.h    | 15 +++++++++++++++
> >  drivers/pci/quirks.c | 37 +++++++++++++++++++++++++++++++++++++
> >  3 files changed, 64 insertions(+), 2 deletions(-)
> >
> > diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c index
> > 952217572113..6b9f9b6b9be1 100644
> > --- a/drivers/pci/iov.c
> > +++ b/drivers/pci/iov.c
> > @@ -296,6 +296,14 @@ int pci_iov_add_virtfn(struct pci_dev *dev, int id)
> >  	struct pci_sriov *iov = dev->sriov;
> >  	struct pci_bus *bus;
> >
> > +    /*
> > +     * Some SR-IOV device's BAR map range is larger than they can actually
> use.
> > +     * This extra BAR space occupy too much reverse mapping size(physical
> page
> > +     * back to the PTEs). So add a divisor shift parameter to resize the
> > +     * request resource of VF.
> > +     */
> > +	u16 shift = 1;
> > +
> >  	bus = virtfn_add_bus(dev->bus, pci_iov_virtfn_bus(dev, id));
> >  	if (!bus)
> >  		goto failed;
> > @@ -328,8 +336,10 @@ int pci_iov_add_virtfn(struct pci_dev *dev, int id)
> >  		virtfn->resource[i].name = pci_name(virtfn);
> >  		virtfn->resource[i].flags = res->flags;
> >  		size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES);
> > +		shift = 1;
> > +		shift = virtfn_get_shift(dev, iov->num_VFs, i);
> 
> Maybe more of the fiddling could be hidden in the quirk, e.g.,
> 
>   size = quirk_vf_bar_size(dev, iov->num_VFs, i);
>   if (!size)
>     size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES);
> 
> >  		virtfn->resource[i].start = res->start + size * id;
> > -		virtfn->resource[i].end = virtfn->resource[i].start + size - 1;
> > +		virtfn->resource[i].end = virtfn->resource[i].start + (size >>
> > +(shift - 1)) - 1;
> >  		rc = request_resource(res, &virtfn->resource[i]);
> >  		BUG_ON(rc);
> >  	}
> > @@ -680,12 +690,12 @@ static int sriov_enable(struct pci_dev *dev, int
> nr_virtfn)
> >  	msleep(100);
> >  	pci_cfg_access_unlock(dev);
> >
> > +	iov->num_VFs = nr_virtfn;
> >  	rc = sriov_add_vfs(dev, initial);
> >  	if (rc)
> >  		goto err_pcibios;
> >
> >  	kobject_uevent(&dev->dev.kobj, KOBJ_CHANGE);
> > -	iov->num_VFs = nr_virtfn;
> >
> >  	return 0;
> >
> > diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index
> > 3d60cabde1a1..befc67a280eb 100644
> > --- a/drivers/pci/pci.h
> > +++ b/drivers/pci/pci.h
> > @@ -603,6 +603,21 @@ static inline int pci_dev_specific_reset(struct
> > pci_dev *dev, bool probe)  }  #endif
> >
> > +struct virtfn_get_shift_methods {
> > +	u16 vendor;
> > +	u16 device;
> > +	u16 (*get_shift)(struct pci_dev *dev, u16 arg, int arg2); };
> > +
> > +#ifdef CONFIG_PCI_QUIRKS
> > +u16 virtfn_get_shift(struct pci_dev *dev, u16 arg1, int arg2); #else
> > +static inline u16 virtfn_get_shift(struct pci_dev *dev, u16 arg1, int
> > +arg2) {
> > +	return (u16)1;
> > +}
> > +#endif
> > +
> >  #if defined(CONFIG_PCI_QUIRKS) && defined(CONFIG_ARM64)  int
> > acpi_get_rc_resources(struct device *dev, const char *hid, u16 segment,
> >  			  struct resource *res);
> > diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index
> > da829274fc66..add587919705 100644
> > --- a/drivers/pci/quirks.c
> > +++ b/drivers/pci/quirks.c
> > @@ -4085,6 +4085,43 @@ int pci_dev_specific_reset(struct pci_dev *dev,
> bool probe)
> >  	return -ENOTTY;
> >  }
> >
> > +static u16 divided_by_VF(struct pci_dev *dev, u16 num_VFs, int
> > +bar_num)
> 
> This is clearly ATI specific or at the very least specific to devices that divvy up
> BAR0 in special ways, so the name is a bit too generic.
> 
> > +{
> > +	u16 shift = 1;
> > +
> > +	if (bar_num == 0) {
> > +		while ((1 << shift) <= num_VFs)
> > +			shift += 1;
> > +	}
> > +	pci_info(dev, "BAR %d get shift: %d.\n", bar_num, shift);
> 
> Drop the period at end.  If we're changing the size, I think it would be useful
> to know num_VFs, BAR #, and the new size.  IIUC, "dev" here is the PF, so
> this is the "VF BAR", not the BAR 0 of the PF.
> 
> > +	return shift;
> > +}
> > +
> > +static const struct virtfn_get_shift_methods virtfn_get_shift_methods[] =
> {
> > +	{ PCI_VENDOR_ID_ATI, 0x73a1, divided_by_VF},
> > +	{ 0 }
> > +};
> > +
> > +/*
> > + * Get shift num to calculate SR-IOV device BAR. Sometimes the BAR
> > +size for
> > + * SR-IOV device is too large and we want to calculate the size to
> > +define
> > + * the end of virtfn.
> > + */
> > +u16 virtfn_get_shift(struct pci_dev *dev, u16 arg1, int arg2) {
> > +	const struct virtfn_get_shift_methods *i;
> > +
> > +	for (i = virtfn_get_shift_methods; i->get_shift; i++) {
> > +		if ((i->vendor == dev->vendor ||
> > +		     i->vendor == (u16)PCI_ANY_ID) &&
> > +		    (i->device == dev->device ||
> > +		     i->device == (u16)PCI_ANY_ID))
> > +			return i->get_shift(dev, arg1, arg2);
> > +	}
> > +
> > +	return (u16)1;
> > +}
> > +
> >  static void quirk_dma_func0_alias(struct pci_dev *dev)  {
> >  	if (PCI_FUNC(dev->devfn) != 0)
> > --
> > 2.25.1
> >

^ permalink raw reply	[flat|nested] 9+ messages in thread

* RE: [PATCH] PCI/IOV: Decrease VF memory BAR size to save host memory occupied by PTEs
  2022-09-27 22:06 ` Bjorn Helgaas
@ 2022-10-11 11:19   ` Ma, Rui
  2022-10-11 14:05     ` Deucher, Alexander
  0 siblings, 1 reply; 9+ messages in thread
From: Ma, Rui @ 2022-10-11 11:19 UTC (permalink / raw)
  To: Bjorn Helgaas; +Cc: bhelgaas, linux-pci, linux-kernel, Deucher, Alexander

[AMD Official Use Only - General]

Hi Helgass:
	Thank you very much for your suggestions on my patch!

	The patch is a device-specific behavior. In our AMD device SR-IOV, the actual VF BAR size is dependent on NumVFs too. If only one VF is created, the VF BAR size will depend on BAR probing algorithm as described in Section 9.3.3.14, but when several
VFs created our own driver will decrease BAR0 memory size  according to NumVFs. So I want to add this quirk to keep Linux code and certain driver code consistent.

> Except that this doesn't affect the *starting* address of each VF BAR, which I guess is what you mean by "BAR memory mapping is always based on the initial device physical device."
Yes we should not change the starting address or the device cannot load well.

> Well, I guess the device still describes its worst-case BAR size; the quirk basically just optimizes space usage.  Right?
Yes.

> Aren't both virt and phys contiguous and nicely aligned for this case?  It seems like the perfect application for huge pages.
It cannot use huge page though address aligned.

>> +		shift = 1;
>> +		shift = virtfn_get_shift(dev, iov->num_VFs, i);
>Maybe more of the fiddling could be hidden in the quirk, e.g.,
>  size = quirk_vf_bar_size(dev, iov->num_VFs, i);
>  if (!size)
>    size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES);
If I hide get-shift func in the quirk is it concise to call pci_iov_resource_size() in quirk?

	And I solved other issues in the patch sent later. Thank you for your patience!


Regards,
Rui

-----Original Message-----
From: Bjorn Helgaas <helgaas@kernel.org> 
Sent: Wednesday, September 28, 2022 6:07 AM
To: Ma, Rui <Rui.Ma@amd.com>
Cc: bhelgaas@google.com; linux-pci@vger.kernel.org; linux-kernel@vger.kernel.org
Subject: Re: [PATCH] PCI/IOV: Decrease VF memory BAR size to save host memory occupied by PTEs

On Mon, Sep 26, 2022 at 04:05:42PM +0800, Rui Ma wrote:
> In SR_IOV scene, when the device physical space(such as Video RAM)is 
> fixed, as the number of VFs increases, the actual BAR memory space 
> used by each VF decreases.

s/SR_IOV/SR-IOV/ to match spec usage.

I think this is device-specific behavior, right?  I don't see anything in the PCIe spec about the BAR size being dependent on NumVFs.  If it's device-specific, it shouldn't be presented as "for all SR-IOV devices, the actual BAR memory space decreases as number of VFs increases."

> However, the BAR memory mapping is always based on the initial device 
> physical device. So do not map this unneeded memory can save host 
> memory occupied by PTEs. Although each PTE only occupies a few bytes 
> of space on its own, a large number of PTEs can still take up a lot of space.

So IIUC this is basically a quirk to override the "VF BAR aperture"
size, which PCIe r6.0, sec 9.2.1.1.1 says is "determined by the usual BAR probing algorithm as described in Section 9.3.3.14."

Except that this doesn't affect the *starting* address of each VF BAR, which I guess is what you mean by "BAR memory mapping is always based on the initial device physical device."

Hmm.  This kind of breaks the "plug and play" model of PCI because the device is no longer self-describing.  Well, I guess the device still describes its worst-case BAR size; the quirk basically just optimizes space usage.  Right?

It's a shame if we can't reduce PTE usage by using hugeTLB pages for this.  Aren't both virt and phys contiguous and nicely aligned for this case?  It seems like the perfect application for huge pages.

> Signed-off-by: Rui Ma <Rui.Ma@amd.com>
> ---
>  drivers/pci/iov.c    | 14 ++++++++++++--
>  drivers/pci/pci.h    | 15 +++++++++++++++
>  drivers/pci/quirks.c | 37 +++++++++++++++++++++++++++++++++++++
>  3 files changed, 64 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c index 
> 952217572113..6b9f9b6b9be1 100644
> --- a/drivers/pci/iov.c
> +++ b/drivers/pci/iov.c
> @@ -296,6 +296,14 @@ int pci_iov_add_virtfn(struct pci_dev *dev, int id)
>  	struct pci_sriov *iov = dev->sriov;
>  	struct pci_bus *bus;
>  
> +    /*
> +     * Some SR-IOV device's BAR map range is larger than they can actually use.
> +     * This extra BAR space occupy too much reverse mapping size(physical page
> +     * back to the PTEs). So add a divisor shift parameter to resize the
> +     * request resource of VF.
> +     */
> +	u16 shift = 1;
> +
>  	bus = virtfn_add_bus(dev->bus, pci_iov_virtfn_bus(dev, id));
>  	if (!bus)
>  		goto failed;
> @@ -328,8 +336,10 @@ int pci_iov_add_virtfn(struct pci_dev *dev, int id)
>  		virtfn->resource[i].name = pci_name(virtfn);
>  		virtfn->resource[i].flags = res->flags;
>  		size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES);
> +		shift = 1;
> +		shift = virtfn_get_shift(dev, iov->num_VFs, i);

Maybe more of the fiddling could be hidden in the quirk, e.g.,

  size = quirk_vf_bar_size(dev, iov->num_VFs, i);
  if (!size)
    size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES);

>  		virtfn->resource[i].start = res->start + size * id;
> -		virtfn->resource[i].end = virtfn->resource[i].start + size - 1;
> +		virtfn->resource[i].end = virtfn->resource[i].start + (size >> 
> +(shift - 1)) - 1;
>  		rc = request_resource(res, &virtfn->resource[i]);
>  		BUG_ON(rc);
>  	}
> @@ -680,12 +690,12 @@ static int sriov_enable(struct pci_dev *dev, int nr_virtfn)
>  	msleep(100);
>  	pci_cfg_access_unlock(dev);
>  
> +	iov->num_VFs = nr_virtfn;
>  	rc = sriov_add_vfs(dev, initial);
>  	if (rc)
>  		goto err_pcibios;
>  
>  	kobject_uevent(&dev->dev.kobj, KOBJ_CHANGE);
> -	iov->num_VFs = nr_virtfn;
>  
>  	return 0;
>  
> diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 
> 3d60cabde1a1..befc67a280eb 100644
> --- a/drivers/pci/pci.h
> +++ b/drivers/pci/pci.h
> @@ -603,6 +603,21 @@ static inline int pci_dev_specific_reset(struct 
> pci_dev *dev, bool probe)  }  #endif
>  
> +struct virtfn_get_shift_methods {
> +	u16 vendor;
> +	u16 device;
> +	u16 (*get_shift)(struct pci_dev *dev, u16 arg, int arg2); };
> +
> +#ifdef CONFIG_PCI_QUIRKS
> +u16 virtfn_get_shift(struct pci_dev *dev, u16 arg1, int arg2); #else 
> +static inline u16 virtfn_get_shift(struct pci_dev *dev, u16 arg1, int 
> +arg2) {
> +	return (u16)1;
> +}
> +#endif
> +
>  #if defined(CONFIG_PCI_QUIRKS) && defined(CONFIG_ARM64)  int 
> acpi_get_rc_resources(struct device *dev, const char *hid, u16 segment,
>  			  struct resource *res);
> diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index 
> da829274fc66..add587919705 100644
> --- a/drivers/pci/quirks.c
> +++ b/drivers/pci/quirks.c
> @@ -4085,6 +4085,43 @@ int pci_dev_specific_reset(struct pci_dev *dev, bool probe)
>  	return -ENOTTY;
>  }
>  
> +static u16 divided_by_VF(struct pci_dev *dev, u16 num_VFs, int 
> +bar_num)

This is clearly ATI specific or at the very least specific to devices that divvy up BAR0 in special ways, so the name is a bit too generic.

> +{
> +	u16 shift = 1;
> +
> +	if (bar_num == 0) {
> +		while ((1 << shift) <= num_VFs)
> +			shift += 1;
> +	}
> +	pci_info(dev, "BAR %d get shift: %d.\n", bar_num, shift);

Drop the period at end.  If we're changing the size, I think it would be useful to know num_VFs, BAR #, and the new size.  IIUC, "dev" here is the PF, so this is the "VF BAR", not the BAR 0 of the PF.

> +	return shift;
> +}
> +
> +static const struct virtfn_get_shift_methods virtfn_get_shift_methods[] = {
> +	{ PCI_VENDOR_ID_ATI, 0x73a1, divided_by_VF},
> +	{ 0 }
> +};
> +
> +/*
> + * Get shift num to calculate SR-IOV device BAR. Sometimes the BAR 
> +size for
> + * SR-IOV device is too large and we want to calculate the size to 
> +define
> + * the end of virtfn.
> + */
> +u16 virtfn_get_shift(struct pci_dev *dev, u16 arg1, int arg2) {
> +	const struct virtfn_get_shift_methods *i;
> +
> +	for (i = virtfn_get_shift_methods; i->get_shift; i++) {
> +		if ((i->vendor == dev->vendor ||
> +		     i->vendor == (u16)PCI_ANY_ID) &&
> +		    (i->device == dev->device ||
> +		     i->device == (u16)PCI_ANY_ID))
> +			return i->get_shift(dev, arg1, arg2);
> +	}
> +
> +	return (u16)1;
> +}
> +
>  static void quirk_dma_func0_alias(struct pci_dev *dev)  {
>  	if (PCI_FUNC(dev->devfn) != 0)
> --
> 2.25.1
> 

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] PCI/IOV: Decrease VF memory BAR size to save host memory occupied by PTEs
  2022-09-26  8:05 Rui Ma
@ 2022-09-27 22:06 ` Bjorn Helgaas
  2022-10-11 11:19   ` Ma, Rui
  0 siblings, 1 reply; 9+ messages in thread
From: Bjorn Helgaas @ 2022-09-27 22:06 UTC (permalink / raw)
  To: Rui Ma; +Cc: bhelgaas, linux-pci, linux-kernel

On Mon, Sep 26, 2022 at 04:05:42PM +0800, Rui Ma wrote:
> In SR_IOV scene, when the device physical space(such as Video RAM)is fixed,
> as the number of VFs increases, the actual BAR memory space used by each VF
> decreases.

s/SR_IOV/SR-IOV/ to match spec usage.

I think this is device-specific behavior, right?  I don't see anything
in the PCIe spec about the BAR size being dependent on NumVFs.  If
it's device-specific, it shouldn't be presented as "for all SR-IOV
devices, the actual BAR memory space decreases as number of VFs
increases."

> However, the BAR memory mapping is always based on the initial
> device physical device. So do not map this unneeded memory can save host
> memory occupied by PTEs. Although each PTE only occupies a few bytes of
> space on its own, a large number of PTEs can still take up a lot of space.

So IIUC this is basically a quirk to override the "VF BAR aperture"
size, which PCIe r6.0, sec 9.2.1.1.1 says is "determined by the usual
BAR probing algorithm as described in Section 9.3.3.14."

Except that this doesn't affect the *starting* address of each VF BAR,
which I guess is what you mean by "BAR memory mapping is always based
on the initial device physical device."

Hmm.  This kind of breaks the "plug and play" model of PCI because the
device is no longer self-describing.  Well, I guess the device still
describes its worst-case BAR size; the quirk basically just optimizes
space usage.  Right?

It's a shame if we can't reduce PTE usage by using hugeTLB pages for
this.  Aren't both virt and phys contiguous and nicely aligned for
this case?  It seems like the perfect application for huge pages.

> Signed-off-by: Rui Ma <Rui.Ma@amd.com>
> ---
>  drivers/pci/iov.c    | 14 ++++++++++++--
>  drivers/pci/pci.h    | 15 +++++++++++++++
>  drivers/pci/quirks.c | 37 +++++++++++++++++++++++++++++++++++++
>  3 files changed, 64 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
> index 952217572113..6b9f9b6b9be1 100644
> --- a/drivers/pci/iov.c
> +++ b/drivers/pci/iov.c
> @@ -296,6 +296,14 @@ int pci_iov_add_virtfn(struct pci_dev *dev, int id)
>  	struct pci_sriov *iov = dev->sriov;
>  	struct pci_bus *bus;
>  
> +    /*
> +     * Some SR-IOV device's BAR map range is larger than they can actually use.
> +     * This extra BAR space occupy too much reverse mapping size(physical page
> +     * back to the PTEs). So add a divisor shift parameter to resize the
> +     * request resource of VF.
> +     */
> +	u16 shift = 1;
> +
>  	bus = virtfn_add_bus(dev->bus, pci_iov_virtfn_bus(dev, id));
>  	if (!bus)
>  		goto failed;
> @@ -328,8 +336,10 @@ int pci_iov_add_virtfn(struct pci_dev *dev, int id)
>  		virtfn->resource[i].name = pci_name(virtfn);
>  		virtfn->resource[i].flags = res->flags;
>  		size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES);
> +		shift = 1;
> +		shift = virtfn_get_shift(dev, iov->num_VFs, i);

Maybe more of the fiddling could be hidden in the quirk, e.g.,

  size = quirk_vf_bar_size(dev, iov->num_VFs, i);
  if (!size)
    size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES);

>  		virtfn->resource[i].start = res->start + size * id;
> -		virtfn->resource[i].end = virtfn->resource[i].start + size - 1;
> +		virtfn->resource[i].end = virtfn->resource[i].start + (size >> (shift - 1)) - 1;
>  		rc = request_resource(res, &virtfn->resource[i]);
>  		BUG_ON(rc);
>  	}
> @@ -680,12 +690,12 @@ static int sriov_enable(struct pci_dev *dev, int nr_virtfn)
>  	msleep(100);
>  	pci_cfg_access_unlock(dev);
>  
> +	iov->num_VFs = nr_virtfn;
>  	rc = sriov_add_vfs(dev, initial);
>  	if (rc)
>  		goto err_pcibios;
>  
>  	kobject_uevent(&dev->dev.kobj, KOBJ_CHANGE);
> -	iov->num_VFs = nr_virtfn;
>  
>  	return 0;
>  
> diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
> index 3d60cabde1a1..befc67a280eb 100644
> --- a/drivers/pci/pci.h
> +++ b/drivers/pci/pci.h
> @@ -603,6 +603,21 @@ static inline int pci_dev_specific_reset(struct pci_dev *dev, bool probe)
>  }
>  #endif
>  
> +struct virtfn_get_shift_methods {
> +	u16 vendor;
> +	u16 device;
> +	u16 (*get_shift)(struct pci_dev *dev, u16 arg, int arg2);
> +};
> +
> +#ifdef CONFIG_PCI_QUIRKS
> +u16 virtfn_get_shift(struct pci_dev *dev, u16 arg1, int arg2);
> +#else
> +static inline u16 virtfn_get_shift(struct pci_dev *dev, u16 arg1, int arg2)
> +{
> +	return (u16)1;
> +}
> +#endif
> +
>  #if defined(CONFIG_PCI_QUIRKS) && defined(CONFIG_ARM64)
>  int acpi_get_rc_resources(struct device *dev, const char *hid, u16 segment,
>  			  struct resource *res);
> diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
> index da829274fc66..add587919705 100644
> --- a/drivers/pci/quirks.c
> +++ b/drivers/pci/quirks.c
> @@ -4085,6 +4085,43 @@ int pci_dev_specific_reset(struct pci_dev *dev, bool probe)
>  	return -ENOTTY;
>  }
>  
> +static u16 divided_by_VF(struct pci_dev *dev, u16 num_VFs, int bar_num)

This is clearly ATI specific or at the very least specific to devices
that divvy up BAR0 in special ways, so the name is a bit too generic.

> +{
> +	u16 shift = 1;
> +
> +	if (bar_num == 0) {
> +		while ((1 << shift) <= num_VFs)
> +			shift += 1;
> +	}
> +	pci_info(dev, "BAR %d get shift: %d.\n", bar_num, shift);

Drop the period at end.  If we're changing the size, I think it would
be useful to know num_VFs, BAR #, and the new size.  IIUC, "dev" here
is the PF, so this is the "VF BAR", not the BAR 0 of the PF.

> +	return shift;
> +}
> +
> +static const struct virtfn_get_shift_methods virtfn_get_shift_methods[] = {
> +	{ PCI_VENDOR_ID_ATI, 0x73a1, divided_by_VF},
> +	{ 0 }
> +};
> +
> +/*
> + * Get shift num to calculate SR-IOV device BAR. Sometimes the BAR size for
> + * SR-IOV device is too large and we want to calculate the size to define
> + * the end of virtfn.
> + */
> +u16 virtfn_get_shift(struct pci_dev *dev, u16 arg1, int arg2)
> +{
> +	const struct virtfn_get_shift_methods *i;
> +
> +	for (i = virtfn_get_shift_methods; i->get_shift; i++) {
> +		if ((i->vendor == dev->vendor ||
> +		     i->vendor == (u16)PCI_ANY_ID) &&
> +		    (i->device == dev->device ||
> +		     i->device == (u16)PCI_ANY_ID))
> +			return i->get_shift(dev, arg1, arg2);
> +	}
> +
> +	return (u16)1;
> +}
> +
>  static void quirk_dma_func0_alias(struct pci_dev *dev)
>  {
>  	if (PCI_FUNC(dev->devfn) != 0)
> -- 
> 2.25.1
> 

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [PATCH] PCI/IOV: Decrease VF memory BAR size to save host memory occupied by PTEs
@ 2022-09-26  8:05 Rui Ma
  2022-09-27 22:06 ` Bjorn Helgaas
  0 siblings, 1 reply; 9+ messages in thread
From: Rui Ma @ 2022-09-26  8:05 UTC (permalink / raw)
  To: bhelgaas, helgaas; +Cc: linux-pci, linux-kernel, Rui Ma

In SR_IOV scene, when the device physical space(such as Video RAM)is fixed,
as the number of VFs increases, the actual BAR memory space used by each VF
decreases. However, the BAR memory mapping is always based on the initial
device physical device. So do not map this unneeded memory can save host
memory occupied by PTEs. Although each PTE only occupies a few bytes of
space on its own, a large number of PTEs can still take up a lot of space.

Signed-off-by: Rui Ma <Rui.Ma@amd.com>
---
 drivers/pci/iov.c    | 14 ++++++++++++--
 drivers/pci/pci.h    | 15 +++++++++++++++
 drivers/pci/quirks.c | 37 +++++++++++++++++++++++++++++++++++++
 3 files changed, 64 insertions(+), 2 deletions(-)

diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
index 952217572113..6b9f9b6b9be1 100644
--- a/drivers/pci/iov.c
+++ b/drivers/pci/iov.c
@@ -296,6 +296,14 @@ int pci_iov_add_virtfn(struct pci_dev *dev, int id)
 	struct pci_sriov *iov = dev->sriov;
 	struct pci_bus *bus;
 
+    /*
+     * Some SR-IOV device's BAR map range is larger than they can actually use.
+     * This extra BAR space occupy too much reverse mapping size(physical page
+     * back to the PTEs). So add a divisor shift parameter to resize the
+     * request resource of VF.
+     */
+	u16 shift = 1;
+
 	bus = virtfn_add_bus(dev->bus, pci_iov_virtfn_bus(dev, id));
 	if (!bus)
 		goto failed;
@@ -328,8 +336,10 @@ int pci_iov_add_virtfn(struct pci_dev *dev, int id)
 		virtfn->resource[i].name = pci_name(virtfn);
 		virtfn->resource[i].flags = res->flags;
 		size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES);
+		shift = 1;
+		shift = virtfn_get_shift(dev, iov->num_VFs, i);
 		virtfn->resource[i].start = res->start + size * id;
-		virtfn->resource[i].end = virtfn->resource[i].start + size - 1;
+		virtfn->resource[i].end = virtfn->resource[i].start + (size >> (shift - 1)) - 1;
 		rc = request_resource(res, &virtfn->resource[i]);
 		BUG_ON(rc);
 	}
@@ -680,12 +690,12 @@ static int sriov_enable(struct pci_dev *dev, int nr_virtfn)
 	msleep(100);
 	pci_cfg_access_unlock(dev);
 
+	iov->num_VFs = nr_virtfn;
 	rc = sriov_add_vfs(dev, initial);
 	if (rc)
 		goto err_pcibios;
 
 	kobject_uevent(&dev->dev.kobj, KOBJ_CHANGE);
-	iov->num_VFs = nr_virtfn;
 
 	return 0;
 
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 3d60cabde1a1..befc67a280eb 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -603,6 +603,21 @@ static inline int pci_dev_specific_reset(struct pci_dev *dev, bool probe)
 }
 #endif
 
+struct virtfn_get_shift_methods {
+	u16 vendor;
+	u16 device;
+	u16 (*get_shift)(struct pci_dev *dev, u16 arg, int arg2);
+};
+
+#ifdef CONFIG_PCI_QUIRKS
+u16 virtfn_get_shift(struct pci_dev *dev, u16 arg1, int arg2);
+#else
+static inline u16 virtfn_get_shift(struct pci_dev *dev, u16 arg1, int arg2)
+{
+	return (u16)1;
+}
+#endif
+
 #if defined(CONFIG_PCI_QUIRKS) && defined(CONFIG_ARM64)
 int acpi_get_rc_resources(struct device *dev, const char *hid, u16 segment,
 			  struct resource *res);
diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
index da829274fc66..add587919705 100644
--- a/drivers/pci/quirks.c
+++ b/drivers/pci/quirks.c
@@ -4085,6 +4085,43 @@ int pci_dev_specific_reset(struct pci_dev *dev, bool probe)
 	return -ENOTTY;
 }
 
+static u16 divided_by_VF(struct pci_dev *dev, u16 num_VFs, int bar_num)
+{
+	u16 shift = 1;
+
+	if (bar_num == 0) {
+		while ((1 << shift) <= num_VFs)
+			shift += 1;
+	}
+	pci_info(dev, "BAR %d get shift: %d.\n", bar_num, shift);
+	return shift;
+}
+
+static const struct virtfn_get_shift_methods virtfn_get_shift_methods[] = {
+	{ PCI_VENDOR_ID_ATI, 0x73a1, divided_by_VF},
+	{ 0 }
+};
+
+/*
+ * Get shift num to calculate SR-IOV device BAR. Sometimes the BAR size for
+ * SR-IOV device is too large and we want to calculate the size to define
+ * the end of virtfn.
+ */
+u16 virtfn_get_shift(struct pci_dev *dev, u16 arg1, int arg2)
+{
+	const struct virtfn_get_shift_methods *i;
+
+	for (i = virtfn_get_shift_methods; i->get_shift; i++) {
+		if ((i->vendor == dev->vendor ||
+		     i->vendor == (u16)PCI_ANY_ID) &&
+		    (i->device == dev->device ||
+		     i->device == (u16)PCI_ANY_ID))
+			return i->get_shift(dev, arg1, arg2);
+	}
+
+	return (u16)1;
+}
+
 static void quirk_dma_func0_alias(struct pci_dev *dev)
 {
 	if (PCI_FUNC(dev->devfn) != 0)
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2022-11-09 23:46 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-10-11 11:23 [PATCH] PCI/IOV: Decrease VF memory BAR size to save host memory occupied by PTEs Rui Ma
2022-10-11 11:37 ` Leon Romanovsky
2022-10-11 11:48   ` David Laight
2022-10-17  6:26 ` Christoph Hellwig
2022-11-09 23:46 ` Bjorn Helgaas
  -- strict thread matches above, loose matches on Subject: below --
2022-09-26  8:05 Rui Ma
2022-09-27 22:06 ` Bjorn Helgaas
2022-10-11 11:19   ` Ma, Rui
2022-10-11 14:05     ` Deucher, Alexander

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).