linux-hyperv.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] PCI: hv: Detect and fix Hyper-V PCI domain number collision
@ 2019-05-19 22:28 Haiyang Zhang
  2019-05-22  3:14 ` Dexuan Cui
  2019-05-24 22:15 ` Sasha Levin
  0 siblings, 2 replies; 9+ messages in thread
From: Haiyang Zhang @ 2019-05-19 22:28 UTC (permalink / raw)
  To: sashal, bhelgaas, lorenzo.pieralisi, linux-hyperv, linux-pci
  Cc: Haiyang Zhang, KY Srinivasan, Stephen Hemminger, olaf, vkuznets,
	linux-kernel

Due to Azure host agent settings, the device instance ID's bytes 8 and 9
are no longer unique. This causes some of the PCI devices not showing up
in VMs with multiple passthrough devices, such as GPUs. So, as recommended
by Azure host team, we now use the bytes 4 and 5 which usually provide
unique numbers.

In the rare cases of collision, we will detect and find another number
that is not in use.
Thanks to Michael Kelley <mikelley@microsoft.com> for proposing this idea.

Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com>
---
 drivers/pci/controller/pci-hyperv.c | 91 +++++++++++++++++++++++++++++++------
 1 file changed, 78 insertions(+), 13 deletions(-)

diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c
index 82acd61..6b9cc6e60a 100644
--- a/drivers/pci/controller/pci-hyperv.c
+++ b/drivers/pci/controller/pci-hyperv.c
@@ -37,6 +37,8 @@
  * the PCI back-end driver in Hyper-V.
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/pci.h>
@@ -2507,6 +2509,47 @@ static void put_hvpcibus(struct hv_pcibus_device *hbus)
 		complete(&hbus->remove_event);
 }
 
+#define HVPCI_DOM_MAP_SIZE (64 * 1024)
+static DECLARE_BITMAP(hvpci_dom_map, HVPCI_DOM_MAP_SIZE);
+
+/* PCI domain number 0 is used by emulated devices on Gen1 VMs, so define 0
+ * as invalid for passthrough PCI devices of this driver.
+ */
+#define HVPCI_DOM_INVALID 0
+
+/**
+ * hv_get_dom_num() - Get a valid PCI domain number
+ * Check if the PCI domain number is in use, and return another number if
+ * it is in use.
+ *
+ * @dom: Requested domain number
+ *
+ * return: domain number on success, HVPCI_DOM_INVALID on failure
+ */
+static u16 hv_get_dom_num(u16 dom)
+{
+	unsigned int i;
+
+	if (test_and_set_bit(dom, hvpci_dom_map) == 0)
+		return dom;
+
+	for_each_clear_bit(i, hvpci_dom_map, HVPCI_DOM_MAP_SIZE) {
+		if (test_and_set_bit(i, hvpci_dom_map) == 0)
+			return i;
+	}
+
+	return HVPCI_DOM_INVALID;
+}
+
+/**
+ * hv_put_dom_num() - Mark the PCI domain number as free
+ * @dom: Domain number to be freed
+ */
+static void hv_put_dom_num(u16 dom)
+{
+	clear_bit(dom, hvpci_dom_map);
+}
+
 /**
  * hv_pci_probe() - New VMBus channel probe, for a root PCI bus
  * @hdev:	VMBus's tracking struct for this root PCI bus
@@ -2518,6 +2561,7 @@ static int hv_pci_probe(struct hv_device *hdev,
 			const struct hv_vmbus_device_id *dev_id)
 {
 	struct hv_pcibus_device *hbus;
+	u16 dom_req, dom;
 	int ret;
 
 	/*
@@ -2532,19 +2576,32 @@ static int hv_pci_probe(struct hv_device *hdev,
 	hbus->state = hv_pcibus_init;
 
 	/*
-	 * The PCI bus "domain" is what is called "segment" in ACPI and
-	 * other specs.  Pull it from the instance ID, to get something
-	 * unique.  Bytes 8 and 9 are what is used in Windows guests, so
-	 * do the same thing for consistency.  Note that, since this code
-	 * only runs in a Hyper-V VM, Hyper-V can (and does) guarantee
-	 * that (1) the only domain in use for something that looks like
-	 * a physical PCI bus (which is actually emulated by the
-	 * hypervisor) is domain 0 and (2) there will be no overlap
-	 * between domains derived from these instance IDs in the same
-	 * VM.
+	 * The PCI bus "domain" is what is called "segment" in ACPI and other
+	 * specs. Pull it from the instance ID, to get something usually
+	 * unique. In rare cases of collision, we will find out another number
+	 * not in use.
+	 * Note that, since this code only runs in a Hyper-V VM, Hyper-V
+	 * together with this guest driver can guarantee that (1) The only
+	 * domain used by Gen1 VMs for something that looks like a physical
+	 * PCI bus (which is actually emulated by the hypervisor) is domain 0.
+	 * (2) There will be no overlap between domains (after fixing possible
+	 * collisions) in the same VM.
 	 */
-	hbus->sysdata.domain = hdev->dev_instance.b[9] |
-			       hdev->dev_instance.b[8] << 8;
+	dom_req = hdev->dev_instance.b[5] << 8 | hdev->dev_instance.b[4];
+	dom = hv_get_dom_num(dom_req);
+
+	if (dom == HVPCI_DOM_INVALID) {
+		pr_err("Unable to use dom# 0x%hx or other numbers",
+		       dom_req);
+		ret = -EINVAL;
+		goto free_bus;
+	}
+
+	if (dom != dom_req)
+		pr_info("PCI dom# 0x%hx has collision, using 0x%hx",
+			dom_req, dom);
+
+	hbus->sysdata.domain = dom;
 
 	hbus->hdev = hdev;
 	refcount_set(&hbus->remove_lock, 1);
@@ -2559,7 +2616,7 @@ static int hv_pci_probe(struct hv_device *hdev,
 					   hbus->sysdata.domain);
 	if (!hbus->wq) {
 		ret = -ENOMEM;
-		goto free_bus;
+		goto free_dom;
 	}
 
 	ret = vmbus_open(hdev->channel, pci_ring_size, pci_ring_size, NULL, 0,
@@ -2636,6 +2693,8 @@ static int hv_pci_probe(struct hv_device *hdev,
 	vmbus_close(hdev->channel);
 destroy_wq:
 	destroy_workqueue(hbus->wq);
+free_dom:
+	hv_put_dom_num(hbus->sysdata.domain);
 free_bus:
 	free_page((unsigned long)hbus);
 	return ret;
@@ -2717,6 +2776,9 @@ static int hv_pci_remove(struct hv_device *hdev)
 	put_hvpcibus(hbus);
 	wait_for_completion(&hbus->remove_event);
 	destroy_workqueue(hbus->wq);
+
+	hv_put_dom_num(hbus->sysdata.domain);
+
 	free_page((unsigned long)hbus);
 	return 0;
 }
@@ -2744,6 +2806,9 @@ static void __exit exit_hv_pci_drv(void)
 
 static int __init init_hv_pci_drv(void)
 {
+	/* Set the invalid domain number's bit, so it will not be used */
+	set_bit(HVPCI_DOM_INVALID, hvpci_dom_map);
+
 	return vmbus_driver_register(&hv_pci_drv);
 }
 
-- 
1.8.3.1


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* RE: [PATCH] PCI: hv: Detect and fix Hyper-V PCI domain number collision
  2019-05-19 22:28 [PATCH] PCI: hv: Detect and fix Hyper-V PCI domain number collision Haiyang Zhang
@ 2019-05-22  3:14 ` Dexuan Cui
       [not found]   ` <BYAPR21MB131778CC3A356745BC74EE3DCC000@BYAPR21MB1317.namprd21.prod.outlook.com>
  2019-05-24 22:15 ` Sasha Levin
  1 sibling, 1 reply; 9+ messages in thread
From: Dexuan Cui @ 2019-05-22  3:14 UTC (permalink / raw)
  To: Haiyang Zhang, sashal, bhelgaas, lorenzo.pieralisi, linux-hyperv,
	linux-pci
  Cc: Haiyang Zhang, KY Srinivasan, Stephen Hemminger, olaf, vkuznets,
	linux-kernel

> From: linux-hyperv-owner@vger.kernel.org
> Sent: Sunday, May 19, 2019 3:29 PM
> 
> Due to Azure host agent settings, the device instance ID's bytes 8 and 9
> are no longer unique. This causes some of the PCI devices not showing up
> in VMs with multiple passthrough devices, such as GPUs. So, as recommended
> by Azure host team, we now use the bytes 4 and 5 which usually provide
> unique numbers.
> 
> In the rare cases of collision, we will detect and find another number
> that is not in use.
> Thanks to Michael Kelley <mikelley@microsoft.com> for proposing this idea.
> 
> Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com>

The patch looks good to me.

Reviewed-by: Dexuan Cui <decui@microsoft.com>


^ permalink raw reply	[flat|nested] 9+ messages in thread

* RE: [PATCH] PCI: hv: Detect and fix Hyper-V PCI domain number collision
       [not found]   ` <BYAPR21MB131778CC3A356745BC74EE3DCC000@BYAPR21MB1317.namprd21.prod.outlook.com>
@ 2019-05-22  5:14     ` Dexuan Cui
  0 siblings, 0 replies; 9+ messages in thread
From: Dexuan Cui @ 2019-05-22  5:14 UTC (permalink / raw)
  To: Stephen Hemminger, Haiyang Zhang, sashal, bhelgaas,
	lorenzo.pieralisi, linux-hyperv, linux-pci
  Cc: KY Srinivasan, olaf, vkuznets, linux-kernel

> From: Stephen Hemminger <sthemmin@microsoft.com> 
> Sent: Tuesday, May 21, 2019 9:40 PM
>
> Thanks for working this out with the host team.
> Now if we could just get some persistent slot information it would make udev in systemd happy.

The slot info comes from the serial number "hpdev->desc.ser", which may not guarantee
uniqueness: "...Hyper-V doesn't provide unique serial numbers": see the changelog of the below
commit:
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=29927dfb7f69bcf2ae7fd1cda10997e646a5189c

And we can pass through multiple devices to a VM in any order (i.e. hot add/remove), so IMO
the "slot" info is not unique and persistent.

Thanks,
-- Dexuan

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] PCI: hv: Detect and fix Hyper-V PCI domain number collision
  2019-05-19 22:28 [PATCH] PCI: hv: Detect and fix Hyper-V PCI domain number collision Haiyang Zhang
  2019-05-22  3:14 ` Dexuan Cui
@ 2019-05-24 22:15 ` Sasha Levin
  1 sibling, 0 replies; 9+ messages in thread
From: Sasha Levin @ 2019-05-24 22:15 UTC (permalink / raw)
  To: Haiyang Zhang
  Cc: bhelgaas, lorenzo.pieralisi, linux-hyperv, linux-pci,
	KY Srinivasan, Stephen Hemminger, olaf, vkuznets, linux-kernel

On Sun, May 19, 2019 at 10:28:37PM +0000, Haiyang Zhang wrote:
>Due to Azure host agent settings, the device instance ID's bytes 8 and 9
>are no longer unique. This causes some of the PCI devices not showing up
>in VMs with multiple passthrough devices, such as GPUs. So, as recommended
>by Azure host team, we now use the bytes 4 and 5 which usually provide
>unique numbers.
>
>In the rare cases of collision, we will detect and find another number
>that is not in use.
>Thanks to Michael Kelley <mikelley@microsoft.com> for proposing this idea.
>
>Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com>

Queued up for hyperv-fixes, thank you.

--
Thanks,
Sasha

^ permalink raw reply	[flat|nested] 9+ messages in thread

* RE: [PATCH] PCI: hv: Detect and fix Hyper-V PCI domain number collision
  2019-08-06 18:55 ` Bjorn Helgaas
@ 2019-08-06 19:45   ` Haiyang Zhang
  0 siblings, 0 replies; 9+ messages in thread
From: Haiyang Zhang @ 2019-08-06 19:45 UTC (permalink / raw)
  To: Bjorn Helgaas
  Cc: sashal, lorenzo.pieralisi, linux-hyperv, linux-pci,
	KY Srinivasan, Stephen Hemminger, olaf, vkuznets, linux-kernel



> -----Original Message-----
> From: Bjorn Helgaas <helgaas@kernel.org>
> Sent: Tuesday, August 6, 2019 2:55 PM
> To: Haiyang Zhang <haiyangz@microsoft.com>
> Cc: sashal@kernel.org; lorenzo.pieralisi@arm.com; linux-
> hyperv@vger.kernel.org; linux-pci@vger.kernel.org; KY Srinivasan
> <kys@microsoft.com>; Stephen Hemminger <sthemmin@microsoft.com>;
> olaf@aepfle.de; vkuznets <vkuznets@redhat.com>; linux-
> kernel@vger.kernel.org
> Subject: Re: [PATCH] PCI: hv: Detect and fix Hyper-V PCI domain number
> collision
> 
> On Fri, Aug 02, 2019 at 06:52:56PM +0000, Haiyang Zhang wrote:
> > Due to Azure host agent settings, the device instance ID's bytes 8 and
> > 9 are no longer unique. This causes some of the PCI devices not
> > showing up in VMs with multiple passthrough devices, such as GPUs. So,
> > as recommended by Azure host team, we now use the bytes 4 and 5 which
> > usually provide unique numbers.
> 
> What does "Azure host agent settings" mean?  Would it be useful to say
> something more specific, so users could ready this and say "oh, I'm using the
> Azure host agent settings mentioned here, so I need this patch"?  Is this
> related to a specific Azure host agent commit or release?
> 
> "This causes some of the PCI devices ..." is not a sentence.  I think I
> understand what you're saying -- "This sometimes causes device passthrough
> to VMs to fail." Is there something about GPUs that makes them more
> susceptible to this problem?
> 
> I think there are really two changes in this patch:
> 
>   1) Start with a domain number from bytes 4-5 instead of bytes 8-9.
> 
>   2) If the domain number is not unique, allocate another one using
>   the bitmap.
> 
> It sounds like part 2) by itself would be enough to solve the problem, and
> including part 1) just reduces the likelihood of having to allocate another
> domain number.
> 
> > In the rare cases of collision, we will detect and find another number
> > that is not in use.
> > Thanks to Michael Kelley <mikelley@microsoft.com> for proposing this
> idea.
> 
> This looks like two paragraphs and should have a blank line between them.
> 
> > Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com>
> > ---
> >  drivers/pci/controller/pci-hyperv.c | 91
> > +++++++++++++++++++++++++++++++------
> >  1 file changed, 78 insertions(+), 13 deletions(-)
> >
> > diff --git a/drivers/pci/controller/pci-hyperv.c
> > b/drivers/pci/controller/pci-hyperv.c
> > index 82acd61..6b9cc6e60a 100644
> > --- a/drivers/pci/controller/pci-hyperv.c
> > +++ b/drivers/pci/controller/pci-hyperv.c
> > @@ -37,6 +37,8 @@
> >   * the PCI back-end driver in Hyper-V.
> >   */
> >
> > +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
> > +
> >  #include <linux/kernel.h>
> >  #include <linux/module.h>
> >  #include <linux/pci.h>
> > @@ -2507,6 +2509,47 @@ static void put_hvpcibus(struct
> hv_pcibus_device *hbus)
> >  		complete(&hbus->remove_event);
> >  }
> >
> > +#define HVPCI_DOM_MAP_SIZE (64 * 1024) static
> > +DECLARE_BITMAP(hvpci_dom_map, HVPCI_DOM_MAP_SIZE);
> > +
> > +/* PCI domain number 0 is used by emulated devices on Gen1 VMs, so
> > +define 0
> > + * as invalid for passthrough PCI devices of this driver.
> > + */
> 
> Please use the usual multi-line comment style:
> 
>   /*
>    * PCI domain number ...
>    */
> 
> > +#define HVPCI_DOM_INVALID 0
> > +
> > +/**
> > + * hv_get_dom_num() - Get a valid PCI domain number
> > + * Check if the PCI domain number is in use, and return another
> > +number if
> > + * it is in use.
> > + *
> > + * @dom: Requested domain number
> > + *
> > + * return: domain number on success, HVPCI_DOM_INVALID on failure  */
> > +static u16 hv_get_dom_num(u16 dom) {
> > +	unsigned int i;
> > +
> > +	if (test_and_set_bit(dom, hvpci_dom_map) == 0)
> > +		return dom;
> > +
> > +	for_each_clear_bit(i, hvpci_dom_map, HVPCI_DOM_MAP_SIZE) {
> > +		if (test_and_set_bit(i, hvpci_dom_map) == 0)
> > +			return i;
> > +	}
> > +
> > +	return HVPCI_DOM_INVALID;
> > +}
> > +
> > +/**
> > + * hv_put_dom_num() - Mark the PCI domain number as free
> > + * @dom: Domain number to be freed
> > + */
> > +static void hv_put_dom_num(u16 dom)
> > +{
> > +	clear_bit(dom, hvpci_dom_map);
> > +}
> > +
> >  /**
> >   * hv_pci_probe() - New VMBus channel probe, for a root PCI bus
> >   * @hdev:	VMBus's tracking struct for this root PCI bus
> > @@ -2518,6 +2561,7 @@ static int hv_pci_probe(struct hv_device *hdev,
> >  			const struct hv_vmbus_device_id *dev_id)  {
> >  	struct hv_pcibus_device *hbus;
> > +	u16 dom_req, dom;
> >  	int ret;
> >
> >  	/*
> > @@ -2532,19 +2576,32 @@ static int hv_pci_probe(struct hv_device
> *hdev,
> >  	hbus->state = hv_pcibus_init;
> >
> >  	/*
> > -	 * The PCI bus "domain" is what is called "segment" in ACPI and
> > -	 * other specs.  Pull it from the instance ID, to get something
> > -	 * unique.  Bytes 8 and 9 are what is used in Windows guests, so
> > -	 * do the same thing for consistency.  Note that, since this code
> > -	 * only runs in a Hyper-V VM, Hyper-V can (and does) guarantee
> > -	 * that (1) the only domain in use for something that looks like
> > -	 * a physical PCI bus (which is actually emulated by the
> > -	 * hypervisor) is domain 0 and (2) there will be no overlap
> > -	 * between domains derived from these instance IDs in the same
> > -	 * VM.
> > +	 * The PCI bus "domain" is what is called "segment" in ACPI and other
> > +	 * specs. Pull it from the instance ID, to get something usually
> > +	 * unique. In rare cases of collision, we will find out another number
> > +	 * not in use.
> > +	 * Note that, since this code only runs in a Hyper-V VM, Hyper-V
> > +	 * together with this guest driver can guarantee that (1) The only
> > +	 * domain used by Gen1 VMs for something that looks like a physical
> > +	 * PCI bus (which is actually emulated by the hypervisor) is domain 0.
> > +	 * (2) There will be no overlap between domains (after fixing possible
> > +	 * collisions) in the same VM.
> 
> Please use blank lines between paragraphs.
> 
> >  	 */
> > -	hbus->sysdata.domain = hdev->dev_instance.b[9] |
> > -			       hdev->dev_instance.b[8] << 8;
> > +	dom_req = hdev->dev_instance.b[5] << 8 | hdev->dev_instance.b[4];
> > +	dom = hv_get_dom_num(dom_req);
> > +
> > +	if (dom == HVPCI_DOM_INVALID) {
> > +		pr_err("Unable to use dom# 0x%hx or other numbers",
> > +		       dom_req);
> > +		ret = -EINVAL;
> > +		goto free_bus;
> > +	}
> > +
> > +	if (dom != dom_req)
> > +		pr_info("PCI dom# 0x%hx has collision, using 0x%hx",
> > +			dom_req, dom);
> 
> Can these use "dev_err/info(&hdev->device, ...)" like the other message in
> this function?  It's always nicer to have a specific device reference when one
> is available.  Probably don't need the new pr_fmt() definition if you do this.
> 
> > +
> > +	hbus->sysdata.domain = dom;
> >
> >  	hbus->hdev = hdev;
> >  	refcount_set(&hbus->remove_lock, 1); @@ -2559,7 +2616,7 @@
> static
> > int hv_pci_probe(struct hv_device *hdev,
> >  					   hbus->sysdata.domain);
> >  	if (!hbus->wq) {
> >  		ret = -ENOMEM;
> > -		goto free_bus;
> > +		goto free_dom;
> >  	}
> >
> >  	ret = vmbus_open(hdev->channel, pci_ring_size, pci_ring_size, NULL,
> > 0, @@ -2636,6 +2693,8 @@ static int hv_pci_probe(struct hv_device
> *hdev,
> >  	vmbus_close(hdev->channel);
> >  destroy_wq:
> >  	destroy_workqueue(hbus->wq);
> > +free_dom:
> > +	hv_put_dom_num(hbus->sysdata.domain);
> >  free_bus:
> >  	free_page((unsigned long)hbus);
> >  	return ret;
> > @@ -2717,6 +2776,9 @@ static int hv_pci_remove(struct hv_device *hdev)
> >  	put_hvpcibus(hbus);
> >  	wait_for_completion(&hbus->remove_event);
> >  	destroy_workqueue(hbus->wq);
> > +
> > +	hv_put_dom_num(hbus->sysdata.domain);
> > +
> >  	free_page((unsigned long)hbus);
> >  	return 0;
> >  }
> > @@ -2744,6 +2806,9 @@ static void __exit exit_hv_pci_drv(void)
> >
> >  static int __init init_hv_pci_drv(void)  {
> > +	/* Set the invalid domain number's bit, so it will not be used */
> > +	set_bit(HVPCI_DOM_INVALID, hvpci_dom_map);
> > +
> >  	return vmbus_driver_register(&hv_pci_drv);
> >  }
> >
> > --
> > 1.8.3.1

Thanks for the comments. I will make the recommended changes.

- Haiyang

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] PCI: hv: Detect and fix Hyper-V PCI domain number collision
  2019-08-05 18:36 ` Sasha Levin
@ 2019-08-06 18:55   ` Bjorn Helgaas
  0 siblings, 0 replies; 9+ messages in thread
From: Bjorn Helgaas @ 2019-08-06 18:55 UTC (permalink / raw)
  To: Sasha Levin
  Cc: Haiyang Zhang, lorenzo.pieralisi, linux-hyperv, linux-pci,
	KY Srinivasan, Stephen Hemminger, olaf, vkuznets, linux-kernel

On Mon, Aug 05, 2019 at 02:36:57PM -0400, Sasha Levin wrote:
> On Fri, Aug 02, 2019 at 06:52:56PM +0000, Haiyang Zhang wrote:
> > Due to Azure host agent settings, the device instance ID's bytes 8 and 9
> > are no longer unique. This causes some of the PCI devices not showing up
> > in VMs with multiple passthrough devices, such as GPUs. So, as recommended
> > by Azure host team, we now use the bytes 4 and 5 which usually provide
> > unique numbers.
> > 
> > In the rare cases of collision, we will detect and find another number
> > that is not in use.
> > Thanks to Michael Kelley <mikelley@microsoft.com> for proposing this idea.

> > Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com>
> 
> Acked-by: Sasha Levin <sashal@kernel.org>
> 
> Bjorn, will you take it through the PCI tree or do you want me to take
> it through hyper-v?

Lorenzo usually applies patches to drivers/pci/controller/*, so that
would be my preference.

Bjorn

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] PCI: hv: Detect and fix Hyper-V PCI domain number collision
  2019-08-02 18:52 Haiyang Zhang
  2019-08-05 18:36 ` Sasha Levin
@ 2019-08-06 18:55 ` Bjorn Helgaas
  2019-08-06 19:45   ` Haiyang Zhang
  1 sibling, 1 reply; 9+ messages in thread
From: Bjorn Helgaas @ 2019-08-06 18:55 UTC (permalink / raw)
  To: Haiyang Zhang
  Cc: sashal, lorenzo.pieralisi, linux-hyperv, linux-pci,
	KY Srinivasan, Stephen Hemminger, olaf, vkuznets, linux-kernel

On Fri, Aug 02, 2019 at 06:52:56PM +0000, Haiyang Zhang wrote:
> Due to Azure host agent settings, the device instance ID's bytes 8 and 9
> are no longer unique. This causes some of the PCI devices not showing up
> in VMs with multiple passthrough devices, such as GPUs. So, as recommended
> by Azure host team, we now use the bytes 4 and 5 which usually provide
> unique numbers.

What does "Azure host agent settings" mean?  Would it be useful to say
something more specific, so users could ready this and say "oh, I'm
using the Azure host agent settings mentioned here, so I need this
patch"?  Is this related to a specific Azure host agent commit or
release?

"This causes some of the PCI devices ..." is not a sentence.  I think
I understand what you're saying -- "This sometimes causes device
passthrough to VMs to fail." Is there something about GPUs that makes
them more susceptible to this problem?

I think there are really two changes in this patch:

  1) Start with a domain number from bytes 4-5 instead of bytes 8-9.

  2) If the domain number is not unique, allocate another one using
  the bitmap.

It sounds like part 2) by itself would be enough to solve the problem,
and including part 1) just reduces the likelihood of having to
allocate another domain number.

> In the rare cases of collision, we will detect and find another number
> that is not in use.
> Thanks to Michael Kelley <mikelley@microsoft.com> for proposing this idea.

This looks like two paragraphs and should have a blank line between
them.

> Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com>
> ---
>  drivers/pci/controller/pci-hyperv.c | 91 +++++++++++++++++++++++++++++++------
>  1 file changed, 78 insertions(+), 13 deletions(-)
> 
> diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c
> index 82acd61..6b9cc6e60a 100644
> --- a/drivers/pci/controller/pci-hyperv.c
> +++ b/drivers/pci/controller/pci-hyperv.c
> @@ -37,6 +37,8 @@
>   * the PCI back-end driver in Hyper-V.
>   */
>  
> +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
> +
>  #include <linux/kernel.h>
>  #include <linux/module.h>
>  #include <linux/pci.h>
> @@ -2507,6 +2509,47 @@ static void put_hvpcibus(struct hv_pcibus_device *hbus)
>  		complete(&hbus->remove_event);
>  }
>  
> +#define HVPCI_DOM_MAP_SIZE (64 * 1024)
> +static DECLARE_BITMAP(hvpci_dom_map, HVPCI_DOM_MAP_SIZE);
> +
> +/* PCI domain number 0 is used by emulated devices on Gen1 VMs, so define 0
> + * as invalid for passthrough PCI devices of this driver.
> + */

Please use the usual multi-line comment style:

  /*
   * PCI domain number ...
   */

> +#define HVPCI_DOM_INVALID 0
> +
> +/**
> + * hv_get_dom_num() - Get a valid PCI domain number
> + * Check if the PCI domain number is in use, and return another number if
> + * it is in use.
> + *
> + * @dom: Requested domain number
> + *
> + * return: domain number on success, HVPCI_DOM_INVALID on failure
> + */
> +static u16 hv_get_dom_num(u16 dom)
> +{
> +	unsigned int i;
> +
> +	if (test_and_set_bit(dom, hvpci_dom_map) == 0)
> +		return dom;
> +
> +	for_each_clear_bit(i, hvpci_dom_map, HVPCI_DOM_MAP_SIZE) {
> +		if (test_and_set_bit(i, hvpci_dom_map) == 0)
> +			return i;
> +	}
> +
> +	return HVPCI_DOM_INVALID;
> +}
> +
> +/**
> + * hv_put_dom_num() - Mark the PCI domain number as free
> + * @dom: Domain number to be freed
> + */
> +static void hv_put_dom_num(u16 dom)
> +{
> +	clear_bit(dom, hvpci_dom_map);
> +}
> +
>  /**
>   * hv_pci_probe() - New VMBus channel probe, for a root PCI bus
>   * @hdev:	VMBus's tracking struct for this root PCI bus
> @@ -2518,6 +2561,7 @@ static int hv_pci_probe(struct hv_device *hdev,
>  			const struct hv_vmbus_device_id *dev_id)
>  {
>  	struct hv_pcibus_device *hbus;
> +	u16 dom_req, dom;
>  	int ret;
>  
>  	/*
> @@ -2532,19 +2576,32 @@ static int hv_pci_probe(struct hv_device *hdev,
>  	hbus->state = hv_pcibus_init;
>  
>  	/*
> -	 * The PCI bus "domain" is what is called "segment" in ACPI and
> -	 * other specs.  Pull it from the instance ID, to get something
> -	 * unique.  Bytes 8 and 9 are what is used in Windows guests, so
> -	 * do the same thing for consistency.  Note that, since this code
> -	 * only runs in a Hyper-V VM, Hyper-V can (and does) guarantee
> -	 * that (1) the only domain in use for something that looks like
> -	 * a physical PCI bus (which is actually emulated by the
> -	 * hypervisor) is domain 0 and (2) there will be no overlap
> -	 * between domains derived from these instance IDs in the same
> -	 * VM.
> +	 * The PCI bus "domain" is what is called "segment" in ACPI and other
> +	 * specs. Pull it from the instance ID, to get something usually
> +	 * unique. In rare cases of collision, we will find out another number
> +	 * not in use.
> +	 * Note that, since this code only runs in a Hyper-V VM, Hyper-V
> +	 * together with this guest driver can guarantee that (1) The only
> +	 * domain used by Gen1 VMs for something that looks like a physical
> +	 * PCI bus (which is actually emulated by the hypervisor) is domain 0.
> +	 * (2) There will be no overlap between domains (after fixing possible
> +	 * collisions) in the same VM.

Please use blank lines between paragraphs.

>  	 */
> -	hbus->sysdata.domain = hdev->dev_instance.b[9] |
> -			       hdev->dev_instance.b[8] << 8;
> +	dom_req = hdev->dev_instance.b[5] << 8 | hdev->dev_instance.b[4];
> +	dom = hv_get_dom_num(dom_req);
> +
> +	if (dom == HVPCI_DOM_INVALID) {
> +		pr_err("Unable to use dom# 0x%hx or other numbers",
> +		       dom_req);
> +		ret = -EINVAL;
> +		goto free_bus;
> +	}
> +
> +	if (dom != dom_req)
> +		pr_info("PCI dom# 0x%hx has collision, using 0x%hx",
> +			dom_req, dom);

Can these use "dev_err/info(&hdev->device, ...)" like the other
message in this function?  It's always nicer to have a specific device
reference when one is available.  Probably don't need the new pr_fmt()
definition if you do this.

> +
> +	hbus->sysdata.domain = dom;
>  
>  	hbus->hdev = hdev;
>  	refcount_set(&hbus->remove_lock, 1);
> @@ -2559,7 +2616,7 @@ static int hv_pci_probe(struct hv_device *hdev,
>  					   hbus->sysdata.domain);
>  	if (!hbus->wq) {
>  		ret = -ENOMEM;
> -		goto free_bus;
> +		goto free_dom;
>  	}
>  
>  	ret = vmbus_open(hdev->channel, pci_ring_size, pci_ring_size, NULL, 0,
> @@ -2636,6 +2693,8 @@ static int hv_pci_probe(struct hv_device *hdev,
>  	vmbus_close(hdev->channel);
>  destroy_wq:
>  	destroy_workqueue(hbus->wq);
> +free_dom:
> +	hv_put_dom_num(hbus->sysdata.domain);
>  free_bus:
>  	free_page((unsigned long)hbus);
>  	return ret;
> @@ -2717,6 +2776,9 @@ static int hv_pci_remove(struct hv_device *hdev)
>  	put_hvpcibus(hbus);
>  	wait_for_completion(&hbus->remove_event);
>  	destroy_workqueue(hbus->wq);
> +
> +	hv_put_dom_num(hbus->sysdata.domain);
> +
>  	free_page((unsigned long)hbus);
>  	return 0;
>  }
> @@ -2744,6 +2806,9 @@ static void __exit exit_hv_pci_drv(void)
>  
>  static int __init init_hv_pci_drv(void)
>  {
> +	/* Set the invalid domain number's bit, so it will not be used */
> +	set_bit(HVPCI_DOM_INVALID, hvpci_dom_map);
> +
>  	return vmbus_driver_register(&hv_pci_drv);
>  }
>  
> -- 
> 1.8.3.1
> 

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] PCI: hv: Detect and fix Hyper-V PCI domain number collision
  2019-08-02 18:52 Haiyang Zhang
@ 2019-08-05 18:36 ` Sasha Levin
  2019-08-06 18:55   ` Bjorn Helgaas
  2019-08-06 18:55 ` Bjorn Helgaas
  1 sibling, 1 reply; 9+ messages in thread
From: Sasha Levin @ 2019-08-05 18:36 UTC (permalink / raw)
  To: Haiyang Zhang
  Cc: bhelgaas, lorenzo.pieralisi, linux-hyperv, linux-pci,
	KY Srinivasan, Stephen Hemminger, olaf, vkuznets, linux-kernel

On Fri, Aug 02, 2019 at 06:52:56PM +0000, Haiyang Zhang wrote:
>Due to Azure host agent settings, the device instance ID's bytes 8 and 9
>are no longer unique. This causes some of the PCI devices not showing up
>in VMs with multiple passthrough devices, such as GPUs. So, as recommended
>by Azure host team, we now use the bytes 4 and 5 which usually provide
>unique numbers.
>
>In the rare cases of collision, we will detect and find another number
>that is not in use.
>Thanks to Michael Kelley <mikelley@microsoft.com> for proposing this idea.
>
>Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com>

Acked-by: Sasha Levin <sashal@kernel.org>

Bjorn, will you take it through the PCI tree or do you want me to take
it through hyper-v?

--
Thanks,
Sasha

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [PATCH] PCI: hv: Detect and fix Hyper-V PCI domain number collision
@ 2019-08-02 18:52 Haiyang Zhang
  2019-08-05 18:36 ` Sasha Levin
  2019-08-06 18:55 ` Bjorn Helgaas
  0 siblings, 2 replies; 9+ messages in thread
From: Haiyang Zhang @ 2019-08-02 18:52 UTC (permalink / raw)
  To: sashal, bhelgaas, lorenzo.pieralisi, linux-hyperv, linux-pci
  Cc: Haiyang Zhang, KY Srinivasan, Stephen Hemminger, olaf, vkuznets,
	linux-kernel

Due to Azure host agent settings, the device instance ID's bytes 8 and 9
are no longer unique. This causes some of the PCI devices not showing up
in VMs with multiple passthrough devices, such as GPUs. So, as recommended
by Azure host team, we now use the bytes 4 and 5 which usually provide
unique numbers.

In the rare cases of collision, we will detect and find another number
that is not in use.
Thanks to Michael Kelley <mikelley@microsoft.com> for proposing this idea.

Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com>
---
 drivers/pci/controller/pci-hyperv.c | 91 +++++++++++++++++++++++++++++++------
 1 file changed, 78 insertions(+), 13 deletions(-)

diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c
index 82acd61..6b9cc6e60a 100644
--- a/drivers/pci/controller/pci-hyperv.c
+++ b/drivers/pci/controller/pci-hyperv.c
@@ -37,6 +37,8 @@
  * the PCI back-end driver in Hyper-V.
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/pci.h>
@@ -2507,6 +2509,47 @@ static void put_hvpcibus(struct hv_pcibus_device *hbus)
 		complete(&hbus->remove_event);
 }
 
+#define HVPCI_DOM_MAP_SIZE (64 * 1024)
+static DECLARE_BITMAP(hvpci_dom_map, HVPCI_DOM_MAP_SIZE);
+
+/* PCI domain number 0 is used by emulated devices on Gen1 VMs, so define 0
+ * as invalid for passthrough PCI devices of this driver.
+ */
+#define HVPCI_DOM_INVALID 0
+
+/**
+ * hv_get_dom_num() - Get a valid PCI domain number
+ * Check if the PCI domain number is in use, and return another number if
+ * it is in use.
+ *
+ * @dom: Requested domain number
+ *
+ * return: domain number on success, HVPCI_DOM_INVALID on failure
+ */
+static u16 hv_get_dom_num(u16 dom)
+{
+	unsigned int i;
+
+	if (test_and_set_bit(dom, hvpci_dom_map) == 0)
+		return dom;
+
+	for_each_clear_bit(i, hvpci_dom_map, HVPCI_DOM_MAP_SIZE) {
+		if (test_and_set_bit(i, hvpci_dom_map) == 0)
+			return i;
+	}
+
+	return HVPCI_DOM_INVALID;
+}
+
+/**
+ * hv_put_dom_num() - Mark the PCI domain number as free
+ * @dom: Domain number to be freed
+ */
+static void hv_put_dom_num(u16 dom)
+{
+	clear_bit(dom, hvpci_dom_map);
+}
+
 /**
  * hv_pci_probe() - New VMBus channel probe, for a root PCI bus
  * @hdev:	VMBus's tracking struct for this root PCI bus
@@ -2518,6 +2561,7 @@ static int hv_pci_probe(struct hv_device *hdev,
 			const struct hv_vmbus_device_id *dev_id)
 {
 	struct hv_pcibus_device *hbus;
+	u16 dom_req, dom;
 	int ret;
 
 	/*
@@ -2532,19 +2576,32 @@ static int hv_pci_probe(struct hv_device *hdev,
 	hbus->state = hv_pcibus_init;
 
 	/*
-	 * The PCI bus "domain" is what is called "segment" in ACPI and
-	 * other specs.  Pull it from the instance ID, to get something
-	 * unique.  Bytes 8 and 9 are what is used in Windows guests, so
-	 * do the same thing for consistency.  Note that, since this code
-	 * only runs in a Hyper-V VM, Hyper-V can (and does) guarantee
-	 * that (1) the only domain in use for something that looks like
-	 * a physical PCI bus (which is actually emulated by the
-	 * hypervisor) is domain 0 and (2) there will be no overlap
-	 * between domains derived from these instance IDs in the same
-	 * VM.
+	 * The PCI bus "domain" is what is called "segment" in ACPI and other
+	 * specs. Pull it from the instance ID, to get something usually
+	 * unique. In rare cases of collision, we will find out another number
+	 * not in use.
+	 * Note that, since this code only runs in a Hyper-V VM, Hyper-V
+	 * together with this guest driver can guarantee that (1) The only
+	 * domain used by Gen1 VMs for something that looks like a physical
+	 * PCI bus (which is actually emulated by the hypervisor) is domain 0.
+	 * (2) There will be no overlap between domains (after fixing possible
+	 * collisions) in the same VM.
 	 */
-	hbus->sysdata.domain = hdev->dev_instance.b[9] |
-			       hdev->dev_instance.b[8] << 8;
+	dom_req = hdev->dev_instance.b[5] << 8 | hdev->dev_instance.b[4];
+	dom = hv_get_dom_num(dom_req);
+
+	if (dom == HVPCI_DOM_INVALID) {
+		pr_err("Unable to use dom# 0x%hx or other numbers",
+		       dom_req);
+		ret = -EINVAL;
+		goto free_bus;
+	}
+
+	if (dom != dom_req)
+		pr_info("PCI dom# 0x%hx has collision, using 0x%hx",
+			dom_req, dom);
+
+	hbus->sysdata.domain = dom;
 
 	hbus->hdev = hdev;
 	refcount_set(&hbus->remove_lock, 1);
@@ -2559,7 +2616,7 @@ static int hv_pci_probe(struct hv_device *hdev,
 					   hbus->sysdata.domain);
 	if (!hbus->wq) {
 		ret = -ENOMEM;
-		goto free_bus;
+		goto free_dom;
 	}
 
 	ret = vmbus_open(hdev->channel, pci_ring_size, pci_ring_size, NULL, 0,
@@ -2636,6 +2693,8 @@ static int hv_pci_probe(struct hv_device *hdev,
 	vmbus_close(hdev->channel);
 destroy_wq:
 	destroy_workqueue(hbus->wq);
+free_dom:
+	hv_put_dom_num(hbus->sysdata.domain);
 free_bus:
 	free_page((unsigned long)hbus);
 	return ret;
@@ -2717,6 +2776,9 @@ static int hv_pci_remove(struct hv_device *hdev)
 	put_hvpcibus(hbus);
 	wait_for_completion(&hbus->remove_event);
 	destroy_workqueue(hbus->wq);
+
+	hv_put_dom_num(hbus->sysdata.domain);
+
 	free_page((unsigned long)hbus);
 	return 0;
 }
@@ -2744,6 +2806,9 @@ static void __exit exit_hv_pci_drv(void)
 
 static int __init init_hv_pci_drv(void)
 {
+	/* Set the invalid domain number's bit, so it will not be used */
+	set_bit(HVPCI_DOM_INVALID, hvpci_dom_map);
+
 	return vmbus_driver_register(&hv_pci_drv);
 }
 
-- 
1.8.3.1


^ permalink raw reply related	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2019-08-06 19:45 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-05-19 22:28 [PATCH] PCI: hv: Detect and fix Hyper-V PCI domain number collision Haiyang Zhang
2019-05-22  3:14 ` Dexuan Cui
     [not found]   ` <BYAPR21MB131778CC3A356745BC74EE3DCC000@BYAPR21MB1317.namprd21.prod.outlook.com>
2019-05-22  5:14     ` Dexuan Cui
2019-05-24 22:15 ` Sasha Levin
2019-08-02 18:52 Haiyang Zhang
2019-08-05 18:36 ` Sasha Levin
2019-08-06 18:55   ` Bjorn Helgaas
2019-08-06 18:55 ` Bjorn Helgaas
2019-08-06 19:45   ` Haiyang Zhang

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).