linux-pci.vger.kernel.org archive mirror
* [PATCH v3] PCI: Data corruption happening due to race condition
@ 2018-07-03  9:05 Hari Vyas
  2018-07-03  9:05 ` Hari Vyas
  2018-07-31 16:37 ` Bjorn Helgaas
  0 siblings, 2 replies; 51+ messages in thread
From: Hari Vyas @ 2018-07-03  9:05 UTC (permalink / raw)
  To: bhelgaas, benh; +Cc: linux-pci, ray.jui, Hari Vyas

Changes in v3:
	As per review comments from Lukas Wunner <lukas@wunner.de>,
	squashed the 3 commits into a single commit; without this the
	build breaks between patches. Also combined the set and clear
	functions for the is_added bit into a single assign function,
	which simplifies the code and reduces the line count.
	Removed one wrongly added blank line in pci.c.

Changes in v2:
	To avoid the race condition while updating the is_added and
	is_busmaster bits, is_added is moved to a private flag
	variable. is_added updates are also handled atomically.

Hari Vyas (1):
  PCI: Data corruption happening due to race condition

 arch/powerpc/kernel/pci-common.c          |  4 +++-
 arch/powerpc/platforms/powernv/pci-ioda.c |  3 ++-
 arch/powerpc/platforms/pseries/setup.c    |  3 ++-
 drivers/pci/bus.c                         |  6 +++---
 drivers/pci/hotplug/acpiphp_glue.c        |  2 +-
 drivers/pci/pci.h                         | 11 +++++++++++
 drivers/pci/probe.c                       |  4 ++--
 drivers/pci/remove.c                      |  5 +++--
 include/linux/pci.h                       |  1 -
 9 files changed, 27 insertions(+), 12 deletions(-)

-- 
1.9.1

^ permalink raw reply	[flat|nested] 51+ messages in thread

* [PATCH v3] PCI: Data corruption happening due to race condition
  2018-07-03  9:05 [PATCH v3] PCI: Data corruption happening due to race condition Hari Vyas
@ 2018-07-03  9:05 ` Hari Vyas
  2018-07-03  9:13   ` Lukas Wunner
                     ` (2 more replies)
  2018-07-31 16:37 ` Bjorn Helgaas
  1 sibling, 3 replies; 51+ messages in thread
From: Hari Vyas @ 2018-07-03  9:05 UTC (permalink / raw)
  To: bhelgaas, benh; +Cc: linux-pci, ray.jui, Hari Vyas

When a PCI device is detected, the is_added variable in the pci_dev
structure is set to 1 and the proc and sysfs entries are created.

When a PCI device is removed, is_added is first checked for one, then
the device is detached and its proc and sysfs entries are cleared, and
at the end is_added is set to 0.

is_added and is_busmaster are bit fields in the pci_dev structure that
share the same memory location.

A strange issue was observed when repeatedly removing and rescanning a
PCIe NVMe device using sysfs commands: while removing the device, the
is_added flag was observed as zero instead of one, so the proc and
sysfs entries were not cleared. This causes a "proc_dir_entry already
registered" warning on later device addition.

Debugging revealed a race condition between the PCI core enabling the
is_added bit (pci_bus_add_device()) and the nvme driver reset work
queue enabling the is_busmaster bit (via pci_set_master()). Because
both fields are updated with non-atomic read-modify-writes of the same
word, the is_busmaster update can clear the is_added bit.

The fix moves the device-addition is_added bit to a separate private
flags variable and uses atomic functions to set and retrieve the
device addition state. As is_added now lives in a different memory
location, the race condition is avoided.
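
For context, a minimal userspace sketch (my illustration, not part of
the patch and not kernel code) of the lost-update pattern described
above: each thread performs a non-atomic read-modify-write of the word
containing both bit fields, so one update can overwrite the other.

#include <pthread.h>
#include <stdio.h>

struct flags {
	unsigned int is_added:1;
	unsigned int is_busmaster:1;	/* shares a word with is_added */
};

static struct flags f;

static void *set_added(void *arg)
{
	f.is_added = 1;		/* read word, set bit, write word back */
	return NULL;
}

static void *set_busmaster(void *arg)
{
	f.is_busmaster = 1;	/* read word, set bit, write word back */
	return NULL;
}

int main(void)
{
	pthread_t t1, t2;

	pthread_create(&t1, NULL, set_added, NULL);
	pthread_create(&t2, NULL, set_busmaster, NULL);
	pthread_join(t1, NULL);
	pthread_join(t2, NULL);

	/* If the two read-modify-writes interleave, one bit reads back as 0. */
	printf("is_added=%d is_busmaster=%d\n", f.is_added, f.is_busmaster);
	return 0;
}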

Signed-off-by: Hari Vyas <hari.vyas@broadcom.com>
---
 arch/powerpc/kernel/pci-common.c          |  4 +++-
 arch/powerpc/platforms/powernv/pci-ioda.c |  3 ++-
 arch/powerpc/platforms/pseries/setup.c    |  3 ++-
 drivers/pci/bus.c                         |  6 +++---
 drivers/pci/hotplug/acpiphp_glue.c        |  2 +-
 drivers/pci/pci.h                         | 11 +++++++++++
 drivers/pci/probe.c                       |  4 ++--
 drivers/pci/remove.c                      |  5 +++--
 include/linux/pci.h                       |  1 -
 9 files changed, 27 insertions(+), 12 deletions(-)

diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c
index fe9733f..471aac3 100644
--- a/arch/powerpc/kernel/pci-common.c
+++ b/arch/powerpc/kernel/pci-common.c
@@ -42,6 +42,8 @@
 #include <asm/ppc-pci.h>
 #include <asm/eeh.h>
 
+#include "../../../drivers/pci/pci.h"
+
 /* hose_spinlock protects accesses to the the phb_bitmap. */
 static DEFINE_SPINLOCK(hose_spinlock);
 LIST_HEAD(hose_list);
@@ -1014,7 +1016,7 @@ void pcibios_setup_bus_devices(struct pci_bus *bus)
 		/* Cardbus can call us to add new devices to a bus, so ignore
 		 * those who are already fully discovered
 		 */
-		if (dev->is_added)
+		if (pci_dev_is_added(dev))
 			continue;
 
 		pcibios_setup_device(dev);
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 5bd0eb6..70b2e1e 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -46,6 +46,7 @@
 
 #include "powernv.h"
 #include "pci.h"
+#include "../../../../drivers/pci/pci.h"
 
 #define PNV_IODA1_M64_NUM	16	/* Number of M64 BARs	*/
 #define PNV_IODA1_M64_SEGS	8	/* Segments per M64 BAR	*/
@@ -3138,7 +3139,7 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
 	struct pci_dn *pdn;
 	int mul, total_vfs;
 
-	if (!pdev->is_physfn || pdev->is_added)
+	if (!pdev->is_physfn || pci_dev_is_added(pdev))
 		return;
 
 	pdn = pci_get_pdn(pdev);
diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
index 139f0af..8a4868a 100644
--- a/arch/powerpc/platforms/pseries/setup.c
+++ b/arch/powerpc/platforms/pseries/setup.c
@@ -71,6 +71,7 @@
 #include <asm/security_features.h>
 
 #include "pseries.h"
+#include "../../../../drivers/pci/pci.h"
 
 int CMO_PrPSP = -1;
 int CMO_SecPSP = -1;
@@ -664,7 +665,7 @@ static void pseries_pci_fixup_iov_resources(struct pci_dev *pdev)
 	const int *indexes;
 	struct device_node *dn = pci_device_to_OF_node(pdev);
 
-	if (!pdev->is_physfn || pdev->is_added)
+	if (!pdev->is_physfn || pci_dev_is_added(pdev))
 		return;
 	/*Firmware must support open sriov otherwise dont configure*/
 	indexes = of_get_property(dn, "ibm,open-sriov-vf-bar-info", NULL);
diff --git a/drivers/pci/bus.c b/drivers/pci/bus.c
index 35b7fc8..5cb40b2 100644
--- a/drivers/pci/bus.c
+++ b/drivers/pci/bus.c
@@ -330,7 +330,7 @@ void pci_bus_add_device(struct pci_dev *dev)
 		return;
 	}
 
-	dev->is_added = 1;
+	pci_dev_assign_added(dev, true);
 }
 EXPORT_SYMBOL_GPL(pci_bus_add_device);
 
@@ -347,14 +347,14 @@ void pci_bus_add_devices(const struct pci_bus *bus)
 
 	list_for_each_entry(dev, &bus->devices, bus_list) {
 		/* Skip already-added devices */
-		if (dev->is_added)
+		if (pci_dev_is_added(dev))
 			continue;
 		pci_bus_add_device(dev);
 	}
 
 	list_for_each_entry(dev, &bus->devices, bus_list) {
 		/* Skip if device attach failed */
-		if (!dev->is_added)
+		if (!pci_dev_is_added(dev))
 			continue;
 		child = dev->subordinate;
 		if (child)
diff --git a/drivers/pci/hotplug/acpiphp_glue.c b/drivers/pci/hotplug/acpiphp_glue.c
index 3a17b29..ef0b1b6 100644
--- a/drivers/pci/hotplug/acpiphp_glue.c
+++ b/drivers/pci/hotplug/acpiphp_glue.c
@@ -509,7 +509,7 @@ static void enable_slot(struct acpiphp_slot *slot)
 
 	list_for_each_entry(dev, &bus->devices, bus_list) {
 		/* Assume that newly added devices are powered on already. */
-		if (!dev->is_added)
+		if (!pci_dev_is_added(dev))
 			dev->current_state = PCI_D0;
 	}
 
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 882f1f9..0881725 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -288,6 +288,7 @@ struct pci_sriov {
 
 /* pci_dev priv_flags */
 #define PCI_DEV_DISCONNECTED 0
+#define PCI_DEV_ADDED 1
 
 static inline int pci_dev_set_disconnected(struct pci_dev *dev, void *unused)
 {
@@ -300,6 +301,16 @@ static inline bool pci_dev_is_disconnected(const struct pci_dev *dev)
 	return test_bit(PCI_DEV_DISCONNECTED, &dev->priv_flags);
 }
 
+static inline void pci_dev_assign_added(struct pci_dev *dev, bool added)
+{
+	assign_bit(PCI_DEV_ADDED, &dev->priv_flags, added);
+}
+
+static inline bool pci_dev_is_added(const struct pci_dev *dev)
+{
+	return test_bit(PCI_DEV_ADDED, &dev->priv_flags);
+}
+
 #ifdef CONFIG_PCI_ATS
 void pci_restore_ats_state(struct pci_dev *dev);
 #else
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index ac876e3..611adcd 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -2433,13 +2433,13 @@ int pci_scan_slot(struct pci_bus *bus, int devfn)
 	dev = pci_scan_single_device(bus, devfn);
 	if (!dev)
 		return 0;
-	if (!dev->is_added)
+	if (!pci_dev_is_added(dev))
 		nr++;
 
 	for (fn = next_fn(bus, dev, 0); fn > 0; fn = next_fn(bus, dev, fn)) {
 		dev = pci_scan_single_device(bus, devfn + fn);
 		if (dev) {
-			if (!dev->is_added)
+			if (!pci_dev_is_added(dev))
 				nr++;
 			dev->multifunction = 1;
 		}
diff --git a/drivers/pci/remove.c b/drivers/pci/remove.c
index 6f072ea..5e3d0dc 100644
--- a/drivers/pci/remove.c
+++ b/drivers/pci/remove.c
@@ -19,11 +19,12 @@ static void pci_stop_dev(struct pci_dev *dev)
 {
 	pci_pme_active(dev, false);
 
-	if (dev->is_added) {
+	if (pci_dev_is_added(dev)) {
 		device_release_driver(&dev->dev);
 		pci_proc_detach_device(dev);
 		pci_remove_sysfs_dev_files(dev);
-		dev->is_added = 0;
+
+		pci_dev_assign_added(dev, false);
 	}
 
 	if (dev->bus->self)
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 340029b..506125b 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -368,7 +368,6 @@ struct pci_dev {
 	unsigned int	transparent:1;		/* Subtractive decode bridge */
 	unsigned int	multifunction:1;	/* Multi-function device */
 
-	unsigned int	is_added:1;
 	unsigned int	is_busmaster:1;		/* Is busmaster */
 	unsigned int	no_msi:1;		/* May not use MSI */
 	unsigned int	no_64bit_msi:1; 	/* May only use 32-bit MSIs */
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 51+ messages in thread

* Re: [PATCH v3] PCI: Data corruption happening due to race condition
  2018-07-03  9:05 ` Hari Vyas
@ 2018-07-03  9:13   ` Lukas Wunner
  2018-07-18 23:29   ` Bjorn Helgaas
  2018-07-19 17:41   ` Bjorn Helgaas
  2 siblings, 0 replies; 51+ messages in thread
From: Lukas Wunner @ 2018-07-03  9:13 UTC (permalink / raw)
  To: Hari Vyas; +Cc: bhelgaas, benh, linux-pci, ray.jui

On Tue, Jul 03, 2018 at 02:35:41PM +0530, Hari Vyas wrote:
> When a pci device is detected, a variable is_added is set to
> 1 in pci device structure and proc, sys entries are created.
> 
> When a pci device is removed, first is_added is checked for one
> and then device is detached with clearing of proc and sys
> entries and at end, is_added is set to 0.
> 
> is_added and is_busmaster are bit fields in pci_dev structure
> sharing same memory location.
> 
> A strange issue was observed with multiple times removal and
> rescan of a pcie nvme device using sysfs commands where is_added
> flag was observed as zero instead of one while removing device
> and proc,sys entries are not cleared.  This causes issue in
> later device addition with warning message "proc_dir_entry"
> already registered.
> 
> Debugging revealed a race condition between pcie core driver
> enabling is_added bit(pci_bus_add_device()) and nvme driver
> reset work-queue enabling is_busmaster bit (by pci_set_master()).
> As both fields are not handled in atomic manner and that clears
> is_added bit.
> 
> Fix moves device addition is_added bit to separate private flag
> variable and use different atomic functions to set and retrieve
> device addition state. As is_added shares different memory
> location so race condition is avoided.
> 
> Signed-off-by: Hari Vyas <hari.vyas@broadcom.com>

Reviewed-by: Lukas Wunner <lukas@wunner.de>

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v3] PCI: Data corruption happening due to race condition
  2018-07-03  9:05 ` Hari Vyas
  2018-07-03  9:13   ` Lukas Wunner
@ 2018-07-18 23:29   ` Bjorn Helgaas
  2018-07-19  4:18     ` Benjamin Herrenschmidt
  2018-07-19 17:41   ` Bjorn Helgaas
  2 siblings, 1 reply; 51+ messages in thread
From: Bjorn Helgaas @ 2018-07-18 23:29 UTC (permalink / raw)
  To: Hari Vyas
  Cc: bhelgaas, linux-pci, ray.jui, Benjamin Herrenschmidt,
	Paul Mackerras, Michael Ellerman, linuxppc-dev

[+cc Paul, Michael, linuxppc-dev]

On Tue, Jul 03, 2018 at 02:35:41PM +0530, Hari Vyas wrote:
> When a pci device is detected, a variable is_added is set to
> 1 in pci device structure and proc, sys entries are created.
> 
> When a pci device is removed, first is_added is checked for one
> and then device is detached with clearing of proc and sys
> entries and at end, is_added is set to 0.
> 
> is_added and is_busmaster are bit fields in pci_dev structure
> sharing same memory location.
> 
> A strange issue was observed with multiple times removal and
> rescan of a pcie nvme device using sysfs commands where is_added
> flag was observed as zero instead of one while removing device
> and proc,sys entries are not cleared.  This causes issue in
> later device addition with warning message "proc_dir_entry"
> already registered.
> 
> Debugging revealed a race condition between pcie core driver
> enabling is_added bit(pci_bus_add_device()) and nvme driver
> reset work-queue enabling is_busmaster bit (by pci_set_master()).
> As both fields are not handled in atomic manner and that clears
> is_added bit.
> 
> Fix moves device addition is_added bit to separate private flag
> variable and use different atomic functions to set and retrieve
> device addition state. As is_added shares different memory
> location so race condition is avoided.

Really nice bit of debugging!

> Signed-off-by: Hari Vyas <hari.vyas@broadcom.com>
> ---
>  arch/powerpc/kernel/pci-common.c          |  4 +++-
>  arch/powerpc/platforms/powernv/pci-ioda.c |  3 ++-
>  arch/powerpc/platforms/pseries/setup.c    |  3 ++-
>  drivers/pci/bus.c                         |  6 +++---
>  drivers/pci/hotplug/acpiphp_glue.c        |  2 +-
>  drivers/pci/pci.h                         | 11 +++++++++++
>  drivers/pci/probe.c                       |  4 ++--
>  drivers/pci/remove.c                      |  5 +++--
>  include/linux/pci.h                       |  1 -
>  9 files changed, 27 insertions(+), 12 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c
> index fe9733f..471aac3 100644
> --- a/arch/powerpc/kernel/pci-common.c
> +++ b/arch/powerpc/kernel/pci-common.c
> @@ -42,6 +42,8 @@
>  #include <asm/ppc-pci.h>
>  #include <asm/eeh.h>
>  
> +#include "../../../drivers/pci/pci.h"

I see why you need it, but this include path is really ugly.  Outside
of bootloaders and tools, there are very few instances of includes
like this that reference a different top-level directory, and I'm not
very keen about adding more.

Obviously powerpc is the only arch that needs dev->is_added.  It seems
to be because "We can only call pcibios_setup_device() after bus setup
is complete, since some of the platform specific DMA setup code
depends on it."

I don't know powerpc, but it does raise the question in my mind of
whether powerpc could be changed to do the DMA setup more like other
arches do to remove this ordering dependency and the need to use
dev->is_added.

That sounds like a lot of work, but it would have the benefit of
unifying some code that is probably needlessly arch-specific.

>  /* hose_spinlock protects accesses to the the phb_bitmap. */
>  static DEFINE_SPINLOCK(hose_spinlock);
>  LIST_HEAD(hose_list);
> @@ -1014,7 +1016,7 @@ void pcibios_setup_bus_devices(struct pci_bus *bus)
>  		/* Cardbus can call us to add new devices to a bus, so ignore
>  		 * those who are already fully discovered
>  		 */
> -		if (dev->is_added)
> +		if (pci_dev_is_added(dev))
>  			continue;
>  
>  		pcibios_setup_device(dev);
> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
> index 5bd0eb6..70b2e1e 100644
> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
> @@ -46,6 +46,7 @@
>  
>  #include "powernv.h"
>  #include "pci.h"
> +#include "../../../../drivers/pci/pci.h"
>  
>  #define PNV_IODA1_M64_NUM	16	/* Number of M64 BARs	*/
>  #define PNV_IODA1_M64_SEGS	8	/* Segments per M64 BAR	*/
> @@ -3138,7 +3139,7 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
>  	struct pci_dn *pdn;
>  	int mul, total_vfs;
>  
> -	if (!pdev->is_physfn || pdev->is_added)
> +	if (!pdev->is_physfn || pci_dev_is_added(pdev))
>  		return;
>  
>  	pdn = pci_get_pdn(pdev);
> diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
> index 139f0af..8a4868a 100644
> --- a/arch/powerpc/platforms/pseries/setup.c
> +++ b/arch/powerpc/platforms/pseries/setup.c
> @@ -71,6 +71,7 @@
>  #include <asm/security_features.h>
>  
>  #include "pseries.h"
> +#include "../../../../drivers/pci/pci.h"
>  
>  int CMO_PrPSP = -1;
>  int CMO_SecPSP = -1;
> @@ -664,7 +665,7 @@ static void pseries_pci_fixup_iov_resources(struct pci_dev *pdev)
>  	const int *indexes;
>  	struct device_node *dn = pci_device_to_OF_node(pdev);
>  
> -	if (!pdev->is_physfn || pdev->is_added)
> +	if (!pdev->is_physfn || pci_dev_is_added(pdev))
>  		return;
>  	/*Firmware must support open sriov otherwise dont configure*/
>  	indexes = of_get_property(dn, "ibm,open-sriov-vf-bar-info", NULL);
> diff --git a/drivers/pci/bus.c b/drivers/pci/bus.c
> index 35b7fc8..5cb40b2 100644
> --- a/drivers/pci/bus.c
> +++ b/drivers/pci/bus.c
> @@ -330,7 +330,7 @@ void pci_bus_add_device(struct pci_dev *dev)
>  		return;
>  	}
>  
> -	dev->is_added = 1;
> +	pci_dev_assign_added(dev, true);
>  }
>  EXPORT_SYMBOL_GPL(pci_bus_add_device);
>  
> @@ -347,14 +347,14 @@ void pci_bus_add_devices(const struct pci_bus *bus)
>  
>  	list_for_each_entry(dev, &bus->devices, bus_list) {
>  		/* Skip already-added devices */
> -		if (dev->is_added)
> +		if (pci_dev_is_added(dev))
>  			continue;
>  		pci_bus_add_device(dev);
>  	}
>  
>  	list_for_each_entry(dev, &bus->devices, bus_list) {
>  		/* Skip if device attach failed */
> -		if (!dev->is_added)
> +		if (!pci_dev_is_added(dev))
>  			continue;
>  		child = dev->subordinate;
>  		if (child)
> diff --git a/drivers/pci/hotplug/acpiphp_glue.c b/drivers/pci/hotplug/acpiphp_glue.c
> index 3a17b29..ef0b1b6 100644
> --- a/drivers/pci/hotplug/acpiphp_glue.c
> +++ b/drivers/pci/hotplug/acpiphp_glue.c
> @@ -509,7 +509,7 @@ static void enable_slot(struct acpiphp_slot *slot)
>  
>  	list_for_each_entry(dev, &bus->devices, bus_list) {
>  		/* Assume that newly added devices are powered on already. */
> -		if (!dev->is_added)
> +		if (!pci_dev_is_added(dev))
>  			dev->current_state = PCI_D0;
>  	}
>  
> diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
> index 882f1f9..0881725 100644
> --- a/drivers/pci/pci.h
> +++ b/drivers/pci/pci.h
> @@ -288,6 +288,7 @@ struct pci_sriov {
>  
>  /* pci_dev priv_flags */
>  #define PCI_DEV_DISCONNECTED 0
> +#define PCI_DEV_ADDED 1
>  
>  static inline int pci_dev_set_disconnected(struct pci_dev *dev, void *unused)
>  {
> @@ -300,6 +301,16 @@ static inline bool pci_dev_is_disconnected(const struct pci_dev *dev)
>  	return test_bit(PCI_DEV_DISCONNECTED, &dev->priv_flags);
>  }
>  
> +static inline void pci_dev_assign_added(struct pci_dev *dev, bool added)
> +{
> +	assign_bit(PCI_DEV_ADDED, &dev->priv_flags, added);
> +}
> +
> +static inline bool pci_dev_is_added(const struct pci_dev *dev)
> +{
> +	return test_bit(PCI_DEV_ADDED, &dev->priv_flags);
> +}
> +
>  #ifdef CONFIG_PCI_ATS
>  void pci_restore_ats_state(struct pci_dev *dev);
>  #else
> diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
> index ac876e3..611adcd 100644
> --- a/drivers/pci/probe.c
> +++ b/drivers/pci/probe.c
> @@ -2433,13 +2433,13 @@ int pci_scan_slot(struct pci_bus *bus, int devfn)
>  	dev = pci_scan_single_device(bus, devfn);
>  	if (!dev)
>  		return 0;
> -	if (!dev->is_added)
> +	if (!pci_dev_is_added(dev))
>  		nr++;
>  
>  	for (fn = next_fn(bus, dev, 0); fn > 0; fn = next_fn(bus, dev, fn)) {
>  		dev = pci_scan_single_device(bus, devfn + fn);
>  		if (dev) {
> -			if (!dev->is_added)
> +			if (!pci_dev_is_added(dev))
>  				nr++;
>  			dev->multifunction = 1;
>  		}
> diff --git a/drivers/pci/remove.c b/drivers/pci/remove.c
> index 6f072ea..5e3d0dc 100644
> --- a/drivers/pci/remove.c
> +++ b/drivers/pci/remove.c
> @@ -19,11 +19,12 @@ static void pci_stop_dev(struct pci_dev *dev)
>  {
>  	pci_pme_active(dev, false);
>  
> -	if (dev->is_added) {
> +	if (pci_dev_is_added(dev)) {
>  		device_release_driver(&dev->dev);
>  		pci_proc_detach_device(dev);
>  		pci_remove_sysfs_dev_files(dev);
> -		dev->is_added = 0;
> +
> +		pci_dev_assign_added(dev, false);
>  	}
>  
>  	if (dev->bus->self)
> diff --git a/include/linux/pci.h b/include/linux/pci.h
> index 340029b..506125b 100644
> --- a/include/linux/pci.h
> +++ b/include/linux/pci.h
> @@ -368,7 +368,6 @@ struct pci_dev {
>  	unsigned int	transparent:1;		/* Subtractive decode bridge */
>  	unsigned int	multifunction:1;	/* Multi-function device */
>  
> -	unsigned int	is_added:1;
>  	unsigned int	is_busmaster:1;		/* Is busmaster */
>  	unsigned int	no_msi:1;		/* May not use MSI */
>  	unsigned int	no_64bit_msi:1; 	/* May only use 32-bit MSIs */
> -- 
> 1.9.1
> 

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v3] PCI: Data corruption happening due to race condition
  2018-07-18 23:29   ` Bjorn Helgaas
@ 2018-07-19  4:18     ` Benjamin Herrenschmidt
  2018-07-19 14:04       ` Hari Vyas
  2018-07-27 22:25       ` Bjorn Helgaas
  0 siblings, 2 replies; 51+ messages in thread
From: Benjamin Herrenschmidt @ 2018-07-19  4:18 UTC (permalink / raw)
  To: Bjorn Helgaas, Hari Vyas
  Cc: bhelgaas, linux-pci, ray.jui, Paul Mackerras, Michael Ellerman,
	linuxppc-dev

On Wed, 2018-07-18 at 18:29 -0500, Bjorn Helgaas wrote:
> [+cc Paul, Michael, linuxppc-dev]
> 

   ..../...

> > Debugging revealed a race condition between pcie core driver
> > enabling is_added bit(pci_bus_add_device()) and nvme driver
> > reset work-queue enabling is_busmaster bit (by pci_set_master()).
> > As both fields are not handled in atomic manner and that clears
> > is_added bit.
> > 
> > Fix moves device addition is_added bit to separate private flag
> > variable and use different atomic functions to set and retrieve
> > device addition state. As is_added shares different memory
> > location so race condition is avoided.
> 
> Really nice bit of debugging!

Indeed. However, I'm not a fan of the solution. Shouldn't we instead
have some locking for the content of pci_dev? I've always been wary of
us having other similar races in there.

As for the powerpc bits, I'm probably the one who wrote them, however,
I'm on vacation this week and right now, no bandwidth to context switch
all that back in :-) So give me a few days and/or ping me next week.

The powerpc PCI code contains a lot of cruft coming from the depth of
history, including rather nasty assumptions. We want to progressively
clean it up, starting with EEH, but it will take time.

Cheers,
Ben.

> > Signed-off-by: Hari Vyas <hari.vyas@broadcom.com>
> > ---
> >  arch/powerpc/kernel/pci-common.c          |  4 +++-
> >  arch/powerpc/platforms/powernv/pci-ioda.c |  3 ++-
> >  arch/powerpc/platforms/pseries/setup.c    |  3 ++-
> >  drivers/pci/bus.c                         |  6 +++---
> >  drivers/pci/hotplug/acpiphp_glue.c        |  2 +-
> >  drivers/pci/pci.h                         | 11 +++++++++++
> >  drivers/pci/probe.c                       |  4 ++--
> >  drivers/pci/remove.c                      |  5 +++--
> >  include/linux/pci.h                       |  1 -
> >  9 files changed, 27 insertions(+), 12 deletions(-)
> > 
> > diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c
> > index fe9733f..471aac3 100644
> > --- a/arch/powerpc/kernel/pci-common.c
> > +++ b/arch/powerpc/kernel/pci-common.c
> > @@ -42,6 +42,8 @@
> >  #include <asm/ppc-pci.h>
> >  #include <asm/eeh.h>
> >  
> > +#include "../../../drivers/pci/pci.h"
> 
> I see why you need it, but this include path is really ugly.  Outside
> of bootloaders and tools, there are very few instances of includes
> like this that reference a different top-level directory, and I'm not
> very keen about adding more.
> 
> Obviously powerpc is the only arch that needs dev->is_added.  It seems
> to be because "We can only call pcibios_setup_device() after bus setup
> is complete, since some of the platform specific DMA setup code
> depends on it."
> 
> I don't know powerpc, but it does raise the question in my mind of
> whether powerpc could be changed to do the DMA setup more like other
> arches do to remove this ordering dependency and the need to use
> dev->is_added.
> 
> That sounds like a lot of work, but it would have the benefit of
> unifying some code that is probably needlessly arch-specific.
> 
> >  /* hose_spinlock protects accesses to the the phb_bitmap. */
> >  static DEFINE_SPINLOCK(hose_spinlock);
> >  LIST_HEAD(hose_list);
> > @@ -1014,7 +1016,7 @@ void pcibios_setup_bus_devices(struct pci_bus *bus)
> >  		/* Cardbus can call us to add new devices to a bus, so ignore
> >  		 * those who are already fully discovered
> >  		 */
> > -		if (dev->is_added)
> > +		if (pci_dev_is_added(dev))
> >  			continue;
> >  
> >  		pcibios_setup_device(dev);
> > diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
> > index 5bd0eb6..70b2e1e 100644
> > --- a/arch/powerpc/platforms/powernv/pci-ioda.c
> > +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
> > @@ -46,6 +46,7 @@
> >  
> >  #include "powernv.h"
> >  #include "pci.h"
> > +#include "../../../../drivers/pci/pci.h"
> >  
> >  #define PNV_IODA1_M64_NUM	16	/* Number of M64 BARs	*/
> >  #define PNV_IODA1_M64_SEGS	8	/* Segments per M64 BAR	*/
> > @@ -3138,7 +3139,7 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
> >  	struct pci_dn *pdn;
> >  	int mul, total_vfs;
> >  
> > -	if (!pdev->is_physfn || pdev->is_added)
> > +	if (!pdev->is_physfn || pci_dev_is_added(pdev))
> >  		return;
> >  
> >  	pdn = pci_get_pdn(pdev);
> > diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
> > index 139f0af..8a4868a 100644
> > --- a/arch/powerpc/platforms/pseries/setup.c
> > +++ b/arch/powerpc/platforms/pseries/setup.c
> > @@ -71,6 +71,7 @@
> >  #include <asm/security_features.h>
> >  
> >  #include "pseries.h"
> > +#include "../../../../drivers/pci/pci.h"
> >  
> >  int CMO_PrPSP = -1;
> >  int CMO_SecPSP = -1;
> > @@ -664,7 +665,7 @@ static void pseries_pci_fixup_iov_resources(struct pci_dev *pdev)
> >  	const int *indexes;
> >  	struct device_node *dn = pci_device_to_OF_node(pdev);
> >  
> > -	if (!pdev->is_physfn || pdev->is_added)
> > +	if (!pdev->is_physfn || pci_dev_is_added(pdev))
> >  		return;
> >  	/*Firmware must support open sriov otherwise dont configure*/
> >  	indexes = of_get_property(dn, "ibm,open-sriov-vf-bar-info", NULL);
> > diff --git a/drivers/pci/bus.c b/drivers/pci/bus.c
> > index 35b7fc8..5cb40b2 100644
> > --- a/drivers/pci/bus.c
> > +++ b/drivers/pci/bus.c
> > @@ -330,7 +330,7 @@ void pci_bus_add_device(struct pci_dev *dev)
> >  		return;
> >  	}
> >  
> > -	dev->is_added = 1;
> > +	pci_dev_assign_added(dev, true);
> >  }
> >  EXPORT_SYMBOL_GPL(pci_bus_add_device);
> >  
> > @@ -347,14 +347,14 @@ void pci_bus_add_devices(const struct pci_bus *bus)
> >  
> >  	list_for_each_entry(dev, &bus->devices, bus_list) {
> >  		/* Skip already-added devices */
> > -		if (dev->is_added)
> > +		if (pci_dev_is_added(dev))
> >  			continue;
> >  		pci_bus_add_device(dev);
> >  	}
> >  
> >  	list_for_each_entry(dev, &bus->devices, bus_list) {
> >  		/* Skip if device attach failed */
> > -		if (!dev->is_added)
> > +		if (!pci_dev_is_added(dev))
> >  			continue;
> >  		child = dev->subordinate;
> >  		if (child)
> > diff --git a/drivers/pci/hotplug/acpiphp_glue.c b/drivers/pci/hotplug/acpiphp_glue.c
> > index 3a17b29..ef0b1b6 100644
> > --- a/drivers/pci/hotplug/acpiphp_glue.c
> > +++ b/drivers/pci/hotplug/acpiphp_glue.c
> > @@ -509,7 +509,7 @@ static void enable_slot(struct acpiphp_slot *slot)
> >  
> >  	list_for_each_entry(dev, &bus->devices, bus_list) {
> >  		/* Assume that newly added devices are powered on already. */
> > -		if (!dev->is_added)
> > +		if (!pci_dev_is_added(dev))
> >  			dev->current_state = PCI_D0;
> >  	}
> >  
> > diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
> > index 882f1f9..0881725 100644
> > --- a/drivers/pci/pci.h
> > +++ b/drivers/pci/pci.h
> > @@ -288,6 +288,7 @@ struct pci_sriov {
> >  
> >  /* pci_dev priv_flags */
> >  #define PCI_DEV_DISCONNECTED 0
> > +#define PCI_DEV_ADDED 1
> >  
> >  static inline int pci_dev_set_disconnected(struct pci_dev *dev, void *unused)
> >  {
> > @@ -300,6 +301,16 @@ static inline bool pci_dev_is_disconnected(const struct pci_dev *dev)
> >  	return test_bit(PCI_DEV_DISCONNECTED, &dev->priv_flags);
> >  }
> >  
> > +static inline void pci_dev_assign_added(struct pci_dev *dev, bool added)
> > +{
> > +	assign_bit(PCI_DEV_ADDED, &dev->priv_flags, added);
> > +}
> > +
> > +static inline bool pci_dev_is_added(const struct pci_dev *dev)
> > +{
> > +	return test_bit(PCI_DEV_ADDED, &dev->priv_flags);
> > +}
> > +
> >  #ifdef CONFIG_PCI_ATS
> >  void pci_restore_ats_state(struct pci_dev *dev);
> >  #else
> > diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
> > index ac876e3..611adcd 100644
> > --- a/drivers/pci/probe.c
> > +++ b/drivers/pci/probe.c
> > @@ -2433,13 +2433,13 @@ int pci_scan_slot(struct pci_bus *bus, int devfn)
> >  	dev = pci_scan_single_device(bus, devfn);
> >  	if (!dev)
> >  		return 0;
> > -	if (!dev->is_added)
> > +	if (!pci_dev_is_added(dev))
> >  		nr++;
> >  
> >  	for (fn = next_fn(bus, dev, 0); fn > 0; fn = next_fn(bus, dev, fn)) {
> >  		dev = pci_scan_single_device(bus, devfn + fn);
> >  		if (dev) {
> > -			if (!dev->is_added)
> > +			if (!pci_dev_is_added(dev))
> >  				nr++;
> >  			dev->multifunction = 1;
> >  		}
> > diff --git a/drivers/pci/remove.c b/drivers/pci/remove.c
> > index 6f072ea..5e3d0dc 100644
> > --- a/drivers/pci/remove.c
> > +++ b/drivers/pci/remove.c
> > @@ -19,11 +19,12 @@ static void pci_stop_dev(struct pci_dev *dev)
> >  {
> >  	pci_pme_active(dev, false);
> >  
> > -	if (dev->is_added) {
> > +	if (pci_dev_is_added(dev)) {
> >  		device_release_driver(&dev->dev);
> >  		pci_proc_detach_device(dev);
> >  		pci_remove_sysfs_dev_files(dev);
> > -		dev->is_added = 0;
> > +
> > +		pci_dev_assign_added(dev, false);
> >  	}
> >  
> >  	if (dev->bus->self)
> > diff --git a/include/linux/pci.h b/include/linux/pci.h
> > index 340029b..506125b 100644
> > --- a/include/linux/pci.h
> > +++ b/include/linux/pci.h
> > @@ -368,7 +368,6 @@ struct pci_dev {
> >  	unsigned int	transparent:1;		/* Subtractive decode bridge */
> >  	unsigned int	multifunction:1;	/* Multi-function device */
> >  
> > -	unsigned int	is_added:1;
> >  	unsigned int	is_busmaster:1;		/* Is busmaster */
> >  	unsigned int	no_msi:1;		/* May not use MSI */
> >  	unsigned int	no_64bit_msi:1; 	/* May only use 32-bit MSIs */
> > -- 
> > 1.9.1
> > 

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v3] PCI: Data corruption happening due to race condition
  2018-07-19  4:18     ` Benjamin Herrenschmidt
@ 2018-07-19 14:04       ` Hari Vyas
  2018-07-19 18:55         ` Lukas Wunner
  2018-07-27 22:25       ` Bjorn Helgaas
  1 sibling, 1 reply; 51+ messages in thread
From: Hari Vyas @ 2018-07-19 14:04 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: Bjorn Helgaas, Bjorn Helgaas, linux-pci, Ray Jui, Paul Mackerras,
	Michael Ellerman, linuxppc-dev

Hi Bjorn, Ben

On Thu, Jul 19, 2018 at 9:48 AM, Benjamin Herrenschmidt
<benh@kernel.crashing.org> wrote:
> On Wed, 2018-07-18 at 18:29 -0500, Bjorn Helgaas wrote:
>> [+cc Paul, Michael, linuxppc-dev]
>>
>
>    ..../...
>
>> > Debugging revealed a race condition between pcie core driver
>> > enabling is_added bit(pci_bus_add_device()) and nvme driver
>> > reset work-queue enabling is_busmaster bit (by pci_set_master()).
>> > As both fields are not handled in atomic manner and that clears
>> > is_added bit.
>> >
>> > Fix moves device addition is_added bit to separate private flag
>> > variable and use different atomic functions to set and retrieve
>> > device addition state. As is_added shares different memory
>> > location so race condition is avoided.
>>
>> Really nice bit of debugging!
>
> Indeed. However I'm not fan of the solution. Shouldn't we instead have
> some locking for the content of pci_dev ? I've always been wary of us
> having other similar races in there.
>
> As for the powerpc bits, I'm probably the one who wrote them, however,
> I'm on vacation this week and right now, no bandwidth to context switch
> all that back in :-) So give me a few days and/or ping me next week.
>
> The powerpc PCI code contains a lot of cruft coming from the depth of
> history, including rather nasty assumptions. We want to progressively
> clean it up, starting with EEH, but it will take time.
>
> Cheers,
> Ben.
>
Some drivers, too, use the pci_dev structure flags directly, which can
cause similar races and should be avoided. They are probably not
causing problems today, but some race scenario may hit them, so they
should be handled through get()/set() APIs in an atomic manner.
I would suggest using a bit position for each of the remaining bit
fields together with atomic operations. That way access is controlled
and the fields are not updated directly from outside. Ex:

enum pci_dev_flags {
	IS_BUSMASTER = 1,
	NO_MSI = 2,
};

void assign_pci_dev_flag(struct pci_dev *dev, int flag, bool val)
{
	assign_bit(flag, &dev->flags, val);
}
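
To make this concrete, a hedged sketch of how a caller could then flip
the busmaster state (pci_update_busmaster_flag() is a hypothetical name
and dev->flags is the proposed field, not an existing pci_dev member):

static void pci_update_busmaster_flag(struct pci_dev *dev, bool enable)
{
	/* atomic RMW on the dedicated flags word, no lock needed */
	assign_pci_dev_flag(dev, IS_BUSMASTER, enable);
}
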
Proper cleanup is required in many places, so it will certainly take
some time, but it is a good effort and will be safer in the future.
If Bjorn agrees, we can work on that.
>> > Signed-off-by: Hari Vyas <hari.vyas@broadcom.com>
>> > ---
>> >  arch/powerpc/kernel/pci-common.c          |  4 +++-
>> >  arch/powerpc/platforms/powernv/pci-ioda.c |  3 ++-
>> >  arch/powerpc/platforms/pseries/setup.c    |  3 ++-
>> >  drivers/pci/bus.c                         |  6 +++---
>> >  drivers/pci/hotplug/acpiphp_glue.c        |  2 +-
>> >  drivers/pci/pci.h                         | 11 +++++++++++
>> >  drivers/pci/probe.c                       |  4 ++--
>> >  drivers/pci/remove.c                      |  5 +++--
>> >  include/linux/pci.h                       |  1 -
>> >  9 files changed, 27 insertions(+), 12 deletions(-)
>> >
>> > diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c
>> > index fe9733f..471aac3 100644
>> > --- a/arch/powerpc/kernel/pci-common.c
>> > +++ b/arch/powerpc/kernel/pci-common.c
>> > @@ -42,6 +42,8 @@
>> >  #include <asm/ppc-pci.h>
>> >  #include <asm/eeh.h>
>> >
>> > +#include "../../../drivers/pci/pci.h"
>>
>> I see why you need it, but this include path is really ugly.  Outside
>> of bootloaders and tools, there are very few instances of includes
>> like this that reference a different top-level directory, and I'm not
>> very keen about adding more.
>>
>> Obviously powerpc is the only arch that needs dev->is_added.  It seems
>> to be because "We can only call pcibios_setup_device() after bus setup
>> is complete, since some of the platform specific DMA setup code
>> depends on it."
>>
>> I don't know powerpc, but it does raise the question in my mind of
>> whether powerpc could be changed to do the DMA setup more like other
>> arches do to remove this ordering dependency and the need to use
>> dev->is_added.
>>
>> That sounds like a lot of work, but it would have the benefit of
>> unifying some code that is probably needlessly arch-specific.
>>
Yes, I also agree that including pci.h with ../ references is really
bad. The first patch used a spin lock to protect the is_added and
is_busmaster bits, but the final patch moved is_added to the private
flags instead.

>> >  /* hose_spinlock protects accesses to the the phb_bitmap. */
>> >  static DEFINE_SPINLOCK(hose_spinlock);
>> >  LIST_HEAD(hose_list);
>> > @@ -1014,7 +1016,7 @@ void pcibios_setup_bus_devices(struct pci_bus *bus)
>> >             /* Cardbus can call us to add new devices to a bus, so ignore
>> >              * those who are already fully discovered
>> >              */
>> > -           if (dev->is_added)
>> > +           if (pci_dev_is_added(dev))
>> >                     continue;
>> >
>> >             pcibios_setup_device(dev);
>> > diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
>> > index 5bd0eb6..70b2e1e 100644
>> > --- a/arch/powerpc/platforms/powernv/pci-ioda.c
>> > +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
>> > @@ -46,6 +46,7 @@
>> >
>> >  #include "powernv.h"
>> >  #include "pci.h"
>> > +#include "../../../../drivers/pci/pci.h"
>> >
>> >  #define PNV_IODA1_M64_NUM  16      /* Number of M64 BARs   */
>> >  #define PNV_IODA1_M64_SEGS 8       /* Segments per M64 BAR */
>> > @@ -3138,7 +3139,7 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
>> >     struct pci_dn *pdn;
>> >     int mul, total_vfs;
>> >
>> > -   if (!pdev->is_physfn || pdev->is_added)
>> > +   if (!pdev->is_physfn || pci_dev_is_added(pdev))
>> >             return;
>> >
>> >     pdn = pci_get_pdn(pdev);
>> > diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
>> > index 139f0af..8a4868a 100644
>> > --- a/arch/powerpc/platforms/pseries/setup.c
>> > +++ b/arch/powerpc/platforms/pseries/setup.c
>> > @@ -71,6 +71,7 @@
>> >  #include <asm/security_features.h>
>> >
>> >  #include "pseries.h"
>> > +#include "../../../../drivers/pci/pci.h"
>> >
>> >  int CMO_PrPSP = -1;
>> >  int CMO_SecPSP = -1;
>> > @@ -664,7 +665,7 @@ static void pseries_pci_fixup_iov_resources(struct pci_dev *pdev)
>> >     const int *indexes;
>> >     struct device_node *dn = pci_device_to_OF_node(pdev);
>> >
>> > -   if (!pdev->is_physfn || pdev->is_added)
>> > +   if (!pdev->is_physfn || pci_dev_is_added(pdev))
>> >             return;
>> >     /*Firmware must support open sriov otherwise dont configure*/
>> >     indexes = of_get_property(dn, "ibm,open-sriov-vf-bar-info", NULL);
>> > diff --git a/drivers/pci/bus.c b/drivers/pci/bus.c
>> > index 35b7fc8..5cb40b2 100644
>> > --- a/drivers/pci/bus.c
>> > +++ b/drivers/pci/bus.c
>> > @@ -330,7 +330,7 @@ void pci_bus_add_device(struct pci_dev *dev)
>> >             return;
>> >     }
>> >
>> > -   dev->is_added = 1;
>> > +   pci_dev_assign_added(dev, true);
>> >  }
>> >  EXPORT_SYMBOL_GPL(pci_bus_add_device);
>> >
>> > @@ -347,14 +347,14 @@ void pci_bus_add_devices(const struct pci_bus *bus)
>> >
>> >     list_for_each_entry(dev, &bus->devices, bus_list) {
>> >             /* Skip already-added devices */
>> > -           if (dev->is_added)
>> > +           if (pci_dev_is_added(dev))
>> >                     continue;
>> >             pci_bus_add_device(dev);
>> >     }
>> >
>> >     list_for_each_entry(dev, &bus->devices, bus_list) {
>> >             /* Skip if device attach failed */
>> > -           if (!dev->is_added)
>> > +           if (!pci_dev_is_added(dev))
>> >                     continue;
>> >             child = dev->subordinate;
>> >             if (child)
>> > diff --git a/drivers/pci/hotplug/acpiphp_glue.c b/drivers/pci/hotplug/acpiphp_glue.c
>> > index 3a17b29..ef0b1b6 100644
>> > --- a/drivers/pci/hotplug/acpiphp_glue.c
>> > +++ b/drivers/pci/hotplug/acpiphp_glue.c
>> > @@ -509,7 +509,7 @@ static void enable_slot(struct acpiphp_slot *slot)
>> >
>> >     list_for_each_entry(dev, &bus->devices, bus_list) {
>> >             /* Assume that newly added devices are powered on already. */
>> > -           if (!dev->is_added)
>> > +           if (!pci_dev_is_added(dev))
>> >                     dev->current_state = PCI_D0;
>> >     }
>> >
>> > diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
>> > index 882f1f9..0881725 100644
>> > --- a/drivers/pci/pci.h
>> > +++ b/drivers/pci/pci.h
>> > @@ -288,6 +288,7 @@ struct pci_sriov {
>> >
>> >  /* pci_dev priv_flags */
>> >  #define PCI_DEV_DISCONNECTED 0
>> > +#define PCI_DEV_ADDED 1
>> >
>> >  static inline int pci_dev_set_disconnected(struct pci_dev *dev, void *unused)
>> >  {
>> > @@ -300,6 +301,16 @@ static inline bool pci_dev_is_disconnected(const struct pci_dev *dev)
>> >     return test_bit(PCI_DEV_DISCONNECTED, &dev->priv_flags);
>> >  }
>> >
>> > +static inline void pci_dev_assign_added(struct pci_dev *dev, bool added)
>> > +{
>> > +   assign_bit(PCI_DEV_ADDED, &dev->priv_flags, added);
>> > +}
>> > +
>> > +static inline bool pci_dev_is_added(const struct pci_dev *dev)
>> > +{
>> > +   return test_bit(PCI_DEV_ADDED, &dev->priv_flags);
>> > +}
>> > +
>> >  #ifdef CONFIG_PCI_ATS
>> >  void pci_restore_ats_state(struct pci_dev *dev);
>> >  #else
>> > diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
>> > index ac876e3..611adcd 100644
>> > --- a/drivers/pci/probe.c
>> > +++ b/drivers/pci/probe.c
>> > @@ -2433,13 +2433,13 @@ int pci_scan_slot(struct pci_bus *bus, int devfn)
>> >     dev = pci_scan_single_device(bus, devfn);
>> >     if (!dev)
>> >             return 0;
>> > -   if (!dev->is_added)
>> > +   if (!pci_dev_is_added(dev))
>> >             nr++;
>> >
>> >     for (fn = next_fn(bus, dev, 0); fn > 0; fn = next_fn(bus, dev, fn)) {
>> >             dev = pci_scan_single_device(bus, devfn + fn);
>> >             if (dev) {
>> > -                   if (!dev->is_added)
>> > +                   if (!pci_dev_is_added(dev))
>> >                             nr++;
>> >                     dev->multifunction = 1;
>> >             }
>> > diff --git a/drivers/pci/remove.c b/drivers/pci/remove.c
>> > index 6f072ea..5e3d0dc 100644
>> > --- a/drivers/pci/remove.c
>> > +++ b/drivers/pci/remove.c
>> > @@ -19,11 +19,12 @@ static void pci_stop_dev(struct pci_dev *dev)
>> >  {
>> >     pci_pme_active(dev, false);
>> >
>> > -   if (dev->is_added) {
>> > +   if (pci_dev_is_added(dev)) {
>> >             device_release_driver(&dev->dev);
>> >             pci_proc_detach_device(dev);
>> >             pci_remove_sysfs_dev_files(dev);
>> > -           dev->is_added = 0;
>> > +
>> > +           pci_dev_assign_added(dev, false);
>> >     }
>> >
>> >     if (dev->bus->self)
>> > diff --git a/include/linux/pci.h b/include/linux/pci.h
>> > index 340029b..506125b 100644
>> > --- a/include/linux/pci.h
>> > +++ b/include/linux/pci.h
>> > @@ -368,7 +368,6 @@ struct pci_dev {
>> >     unsigned int    transparent:1;          /* Subtractive decode bridge */
>> >     unsigned int    multifunction:1;        /* Multi-function device */
>> >
>> > -   unsigned int    is_added:1;
>> >     unsigned int    is_busmaster:1;         /* Is busmaster */
>> >     unsigned int    no_msi:1;               /* May not use MSI */
>> >     unsigned int    no_64bit_msi:1;         /* May only use 32-bit MSIs */
>> > --
>> > 1.9.1
>> >

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v3] PCI: Data corruption happening due to race condition
  2018-07-03  9:05 ` Hari Vyas
  2018-07-03  9:13   ` Lukas Wunner
  2018-07-18 23:29   ` Bjorn Helgaas
@ 2018-07-19 17:41   ` Bjorn Helgaas
  2018-07-20  9:16     ` Hari Vyas
  2 siblings, 1 reply; 51+ messages in thread
From: Bjorn Helgaas @ 2018-07-19 17:41 UTC (permalink / raw)
  To: Hari Vyas; +Cc: bhelgaas, benh, linux-pci, ray.jui

On Tue, Jul 03, 2018 at 02:35:41PM +0530, Hari Vyas wrote:
> When a pci device is detected, a variable is_added is set to
> 1 in pci device structure and proc, sys entries are created.
> 
> When a pci device is removed, first is_added is checked for one
> and then device is detached with clearing of proc and sys
> entries and at end, is_added is set to 0.
> 
> is_added and is_busmaster are bit fields in pci_dev structure
> sharing same memory location.
> 
> A strange issue was observed with multiple times removal and
> rescan of a pcie nvme device using sysfs commands where is_added
> flag was observed as zero instead of one while removing device
> and proc,sys entries are not cleared.  This causes issue in
> later device addition with warning message "proc_dir_entry"
> already registered.
> 
> Debugging revealed a race condition between pcie core driver
> enabling is_added bit(pci_bus_add_device()) and nvme driver
> reset work-queue enabling is_busmaster bit (by pci_set_master()).
> As both fields are not handled in atomic manner and that clears
> is_added bit.
> 
> Fix moves device addition is_added bit to separate private flag
> variable and use different atomic functions to set and retrieve
> device addition state. As is_added shares different memory
> location so race condition is avoided.

If/when you post a v4 of this, can you include the bugzilla URL
right here, i.e.,

Link: https://bugzilla.kernel.org/show_bug.cgi?id=200283
> Signed-off-by: Hari Vyas <hari.vyas@broadcom.com>

That way we can connect the commit with the bugzilla, which contains
more information that may be useful in the future.

Bjorn

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v3] PCI: Data corruption happening due to race condition
  2018-07-19 14:04       ` Hari Vyas
@ 2018-07-19 18:55         ` Lukas Wunner
  2018-07-20  4:27           ` Benjamin Herrenschmidt
  0 siblings, 1 reply; 51+ messages in thread
From: Lukas Wunner @ 2018-07-19 18:55 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Hari Vyas
  Cc: Bjorn Helgaas, linux-pci, Ray Jui, Paul Mackerras,
	Michael Ellerman, linuxppc-dev

On Thu, Jul 19, 2018 at 9:48 AM, Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote:
> Indeed. However I'm not fan of the solution. Shouldn't we instead have
> some locking for the content of pci_dev? I've always been wary of us
> having other similar races in there.

The solution presented is perfectly fine as it uses atomic bitops which
obviate the need for locking.  Why do you want to add unnecessary locking
on top?

Certain other parts of struct pci_dev use their own locking, e.g.
pci_bus_sem to protect bus_list.  Most elements can and should
be accessed lockless for performance.
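
For reference, a minimal userspace sketch of that lockless pattern
(mine, with C11 atomics standing in for the kernel's assign_bit() and
test_bit(); the names are illustrative only, not kernel API):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define DEV_ADDED 0	/* bit number, analogous to PCI_DEV_ADDED */

struct dev_state {
	unsigned int is_busmaster:1;	/* still a plain bit field */
	atomic_ulong priv_flags;	/* "added" lives in its own word */
};

static void dev_assign_added(struct dev_state *d, bool added)
{
	if (added)
		atomic_fetch_or(&d->priv_flags, 1UL << DEV_ADDED);
	else
		atomic_fetch_and(&d->priv_flags, ~(1UL << DEV_ADDED));
}

static bool dev_is_added(struct dev_state *d)
{
	return atomic_load(&d->priv_flags) & (1UL << DEV_ADDED);
}

int main(void)
{
	struct dev_state d = { 0 };

	dev_assign_added(&d, true);
	d.is_busmaster = 1;	/* can no longer clobber the "added" bit */
	printf("added=%d busmaster=%d\n", dev_is_added(&d), d.is_busmaster);
	return 0;
}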


> > The powerpc PCI code contains a lot of cruft coming from the depth of
> > history, including rather nasty assumptions. We want to progressively
> > clean it up, starting with EEH, but it will take time.

Then I suggest using the #include "../../../drivers/pci/pci.h" for now
until the powerpc arch code has been consolidated.

Thanks,

Lukas

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v3] PCI: Data corruption happening due to race condition
  2018-07-19 18:55         ` Lukas Wunner
@ 2018-07-20  4:27           ` Benjamin Herrenschmidt
  0 siblings, 0 replies; 51+ messages in thread
From: Benjamin Herrenschmidt @ 2018-07-20  4:27 UTC (permalink / raw)
  To: Lukas Wunner, Hari Vyas
  Cc: Bjorn Helgaas, linux-pci, Ray Jui, Paul Mackerras,
	Michael Ellerman, linuxppc-dev

On Thu, 2018-07-19 at 20:55 +0200, Lukas Wunner wrote:
> On Thu, Jul 19, 2018 at 9:48 AM, Benjamin Herrenschmidt <benh@kernel.crashing.org wrote:
> > Indeed. However I'm not fan of the solution. Shouldn't we instead have
> > some locking for the content of pci_dev? I've always been wary of us
> > having other similar races in there.
> 
> The solution presented is perfectly fine as it uses atomic bitops which
> obviate the need for locking.  Why do you want to add unnecessary locking
> on top?

Atomic bitops tend to be *more* expensive than a lock.

My concern is that the PCIe code historically had no locking and I
worry we may have other fields in there with similar issues. But maybe
I'm wrong.

> Certain other parts of struct pci_dev use their own locking, e.g.
> pci_bus_sem to protect bus_list.  Most elements can and should
> be accessed lockless for performance.
> 
> 
> > > The powerpc PCI code contains a lot of cruft coming from the depth of
> > > history, including rather nasty assumptions. We want to progressively
> > > clean it up, starting with EEH, but it will take time.
> 
> Then I suggest using the #include "../../../drivers/pci/pci.h" for now
> until the powerpc arch code has been consolidated.

There's also the need in both powerpc and sparc to access the guts of
pci_dev because those archs will "fabricate" a pci_dev from the
device-tree rather than probing it under some circumstances.

Cheers,
Ben.

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v3] PCI: Data corruption happening due to race condition
  2018-07-19 17:41   ` Bjorn Helgaas
@ 2018-07-20  9:16     ` Hari Vyas
  2018-07-20 12:20       ` Bjorn Helgaas
  0 siblings, 1 reply; 51+ messages in thread
From: Hari Vyas @ 2018-07-20  9:16 UTC (permalink / raw)
  To: Bjorn Helgaas; +Cc: Bjorn Helgaas, Benjamin Herrenschmidt, linux-pci, Ray Jui

On Thu, Jul 19, 2018 at 11:11 PM, Bjorn Helgaas <helgaas@kernel.org> wrote:
> On Tue, Jul 03, 2018 at 02:35:41PM +0530, Hari Vyas wrote:
>> When a pci device is detected, a variable is_added is set to
>> 1 in pci device structure and proc, sys entries are created.
>>
>> When a pci device is removed, first is_added is checked for one
>> and then device is detached with clearing of proc and sys
>> entries and at end, is_added is set to 0.
>>
>> is_added and is_busmaster are bit fields in pci_dev structure
>> sharing same memory location.
>>
>> A strange issue was observed with multiple times removal and
>> rescan of a pcie nvme device using sysfs commands where is_added
>> flag was observed as zero instead of one while removing device
>> and proc,sys entries are not cleared.  This causes issue in
>> later device addition with warning message "proc_dir_entry"
>> already registered.
>>
>> Debugging revealed a race condition between pcie core driver
>> enabling is_added bit(pci_bus_add_device()) and nvme driver
>> reset work-queue enabling is_busmaster bit (by pci_set_master()).
>> As both fields are not handled in atomic manner and that clears
>> is_added bit.
>>
>> Fix moves device addition is_added bit to separate private flag
>> variable and use different atomic functions to set and retrieve
>> device addition state. As is_added shares different memory
>> location so race condition is avoided.
>
> If/when you post a v4 of this, can you include the bugzilla URL
> right here, i.e.,
>
> Link: https://bugzilla.kernel.org/show_bug.cgi?id=200283
>> Signed-off-by: Hari Vyas <hari.vyas@broadcom.com>
>
> That way we can connect the commit with the bugzilla, which contains
> more information that may be useful in the future.
>
> Bjorn
Bit confused. Do I need to change the commit message to include the
bugzilla link? I believe v3 should be okay from a code perspective for
the time being. Please confirm.

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v3] PCI: Data corruption happening due to race condition
  2018-07-20  9:16     ` Hari Vyas
@ 2018-07-20 12:20       ` Bjorn Helgaas
  0 siblings, 0 replies; 51+ messages in thread
From: Bjorn Helgaas @ 2018-07-20 12:20 UTC (permalink / raw)
  To: hari.vyas; +Cc: Bjorn Helgaas, Benjamin Herrenschmidt, linux-pci, Ray Jui

On Fri, Jul 20, 2018 at 4:16 AM Hari Vyas <hari.vyas@broadcom.com> wrote:
>
> On Thu, Jul 19, 2018 at 11:11 PM, Bjorn Helgaas <helgaas@kernel.org> wrote:
> > On Tue, Jul 03, 2018 at 02:35:41PM +0530, Hari Vyas wrote:
> >> When a pci device is detected, a variable is_added is set to
> >> 1 in pci device structure and proc, sys entries are created.
> >>
> >> When a pci device is removed, first is_added is checked for one
> >> and then device is detached with clearing of proc and sys
> >> entries and at end, is_added is set to 0.
> >>
> >> is_added and is_busmaster are bit fields in pci_dev structure
> >> sharing same memory location.
> >>
> >> A strange issue was observed with multiple times removal and
> >> rescan of a pcie nvme device using sysfs commands where is_added
> >> flag was observed as zero instead of one while removing device
> >> and proc,sys entries are not cleared.  This causes issue in
> >> later device addition with warning message "proc_dir_entry"
> >> already registered.
> >>
> >> Debugging revealed a race condition between pcie core driver
> >> enabling is_added bit(pci_bus_add_device()) and nvme driver
> >> reset work-queue enabling is_busmaster bit (by pci_set_master()).
> >> As both fields are not handled in atomic manner and that clears
> >> is_added bit.
> >>
> >> Fix moves device addition is_added bit to separate private flag
> >> variable and use different atomic functions to set and retrieve
> >> device addition state. As is_added shares different memory
> >> location so race condition is avoided.
> >
> > If/when you post a v4 of this, can you include the bugzilla URL
> > right here, i.e.,
> >
> > Link: https://bugzilla.kernel.org/show_bug.cgi?id=200283
> >> Signed-off-by: Hari Vyas <hari.vyas@broadcom.com>
> >
> > That way we can connect the commit with the bugzilla, which contains
> > more information that may be useful in the future.

> Bit confused.  Do I  need  to change commit message for including bugzila link ?
> Believe v3 should be Okay from code perspective time-being. Please confirm

You don't need to repost it just to update the commit message (I'll
try to remember to add the link when I apply it).  But if you do
repost it for some other reason, it helps me out if you include the
link.

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v3] PCI: Data corruption happening due to race condition
  2018-07-19  4:18     ` Benjamin Herrenschmidt
  2018-07-19 14:04       ` Hari Vyas
@ 2018-07-27 22:25       ` Bjorn Helgaas
  2018-07-28  0:45         ` Benjamin Herrenschmidt
  2018-07-31 11:21         ` Michael Ellerman
  1 sibling, 2 replies; 51+ messages in thread
From: Bjorn Helgaas @ 2018-07-27 22:25 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: Hari Vyas, bhelgaas, linux-pci, ray.jui, Paul Mackerras,
	Michael Ellerman, linuxppc-dev

On Thu, Jul 19, 2018 at 02:18:09PM +1000, Benjamin Herrenschmidt wrote:
> On Wed, 2018-07-18 at 18:29 -0500, Bjorn Helgaas wrote:
> > [+cc Paul, Michael, linuxppc-dev]
> > 
> 
>    ..../...
> 
> > > Debugging revealed a race condition between pcie core driver
> > > enabling is_added bit(pci_bus_add_device()) and nvme driver
> > > reset work-queue enabling is_busmaster bit (by pci_set_master()).
> > > As both fields are not handled in atomic manner and that clears
> > > is_added bit.
> > > 
> > > Fix moves device addition is_added bit to separate private flag
> > > variable and use different atomic functions to set and retrieve
> > > device addition state. As is_added shares different memory
> > > location so race condition is avoided.
> > 
> > Really nice bit of debugging!
> 
> Indeed. However I'm not fan of the solution. Shouldn't we instead have
> some locking for the content of pci_dev ? I've always been wary of us
> having other similar races in there.
> 
> As for the powerpc bits, I'm probably the one who wrote them, however,
> I'm on vacation this week and right now, no bandwidth to context switch
> all that back in :-) So give me a few days and/or ping me next week.

OK, here's a ping :)

Some powerpc cleanup would be ideal, but I'd like to fix the race for
v4.19, so I'm fine with this patch as-is.  But I'd definitely want
your ack before inserting the ugly #include path in the powerpc code.

> The powerpc PCI code contains a lot of cruft coming from the depth of
> history, including rather nasty assumptions. We want to progressively
> clean it up, starting with EEH, but it will take time.
> 
> Cheers,
> Ben.
> 
> > > Signed-off-by: Hari Vyas <hari.vyas@broadcom.com>
> > > ---
> > >  arch/powerpc/kernel/pci-common.c          |  4 +++-
> > >  arch/powerpc/platforms/powernv/pci-ioda.c |  3 ++-
> > >  arch/powerpc/platforms/pseries/setup.c    |  3 ++-
> > >  drivers/pci/bus.c                         |  6 +++---
> > >  drivers/pci/hotplug/acpiphp_glue.c        |  2 +-
> > >  drivers/pci/pci.h                         | 11 +++++++++++
> > >  drivers/pci/probe.c                       |  4 ++--
> > >  drivers/pci/remove.c                      |  5 +++--
> > >  include/linux/pci.h                       |  1 -
> > >  9 files changed, 27 insertions(+), 12 deletions(-)
> > > 
> > > diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c
> > > index fe9733f..471aac3 100644
> > > --- a/arch/powerpc/kernel/pci-common.c
> > > +++ b/arch/powerpc/kernel/pci-common.c
> > > @@ -42,6 +42,8 @@
> > >  #include <asm/ppc-pci.h>
> > >  #include <asm/eeh.h>
> > >  
> > > +#include "../../../drivers/pci/pci.h"
> > 
> > I see why you need it, but this include path is really ugly.  Outside
> > of bootloaders and tools, there are very few instances of includes
> > like this that reference a different top-level directory, and I'm not
> > very keen about adding more.

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v3] PCI: Data corruption happening due to race condition
  2018-07-27 22:25       ` Bjorn Helgaas
@ 2018-07-28  0:45         ` Benjamin Herrenschmidt
  2018-07-31 11:21         ` Michael Ellerman
  1 sibling, 0 replies; 51+ messages in thread
From: Benjamin Herrenschmidt @ 2018-07-28  0:45 UTC (permalink / raw)
  To: Bjorn Helgaas
  Cc: Hari Vyas, bhelgaas, linux-pci, ray.jui, Paul Mackerras,
	Michael Ellerman, linuxppc-dev

On Fri, 2018-07-27 at 17:25 -0500, Bjorn Helgaas wrote:
> > As for the powerpc bits, I'm probably the one who wrote them, however,
> > I'm on vacation this week and right now, no bandwidth to context switch
> > all that back in :-) So give me a few days and/or ping me next week.
> 
> OK, here's a ping :)
> 
> Some powerpc cleanup would be ideal, but I'd like to fix the race for
> v4.19, so I'm fine with this patch as-is.  But I'd definitely want
> your ack before inserting the ugly #include path in the powerpc code.

Go for it. Looks like I got a last-minute meeting in Austin next week
so I'll have no time to look at any of this for a while.

Cheers,
Ben.

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v3] PCI: Data corruption happening due to race condition
  2018-07-27 22:25       ` Bjorn Helgaas
  2018-07-28  0:45         ` Benjamin Herrenschmidt
@ 2018-07-31 11:21         ` Michael Ellerman
  1 sibling, 0 replies; 51+ messages in thread
From: Michael Ellerman @ 2018-07-31 11:21 UTC (permalink / raw)
  To: Bjorn Helgaas, Benjamin Herrenschmidt
  Cc: Hari Vyas, bhelgaas, linux-pci, ray.jui, Paul Mackerras,
	linuxppc-dev, Sam Bobroff

Bjorn Helgaas <helgaas@kernel.org> writes:
> On Thu, Jul 19, 2018 at 02:18:09PM +1000, Benjamin Herrenschmidt wrote:
>> On Wed, 2018-07-18 at 18:29 -0500, Bjorn Helgaas wrote:
>> > [+cc Paul, Michael, linuxppc-dev]
>> > 
>> 
>>    ..../...
>> 
>> > > Debugging revealed a race condition between pcie core driver
>> > > enabling is_added bit(pci_bus_add_device()) and nvme driver
>> > > reset work-queue enabling is_busmaster bit (by pci_set_master()).
>> > > As both fields are not handled in atomic manner and that clears
>> > > is_added bit.
>> > > 
>> > > Fix moves device addition is_added bit to separate private flag
>> > > variable and use different atomic functions to set and retrieve
>> > > device addition state. As is_added shares different memory
>> > > location so race condition is avoided.
>> > 
>> > Really nice bit of debugging!
>> 
>> Indeed. However I'm not fan of the solution. Shouldn't we instead have
>> some locking for the content of pci_dev ? I've always been wary of us
>> having other similar races in there.
>> 
>> As for the powerpc bits, I'm probably the one who wrote them, however,
>> I'm on vacation this week and right now, no bandwidth to context switch
>> all that back in :-) So give me a few days and/or ping me next week.
>
> OK, here's a ping :)
>
> Some powerpc cleanup would be ideal, but I'd like to fix the race for
> v4.19, so I'm fine with this patch as-is.  But I'd definitely want
> your ack before inserting the ugly #include path in the powerpc code.

Sorry, the patch didn't hit linuxppc so I forgot about it.

I'm OK with the patch; the include is a bit gross, but I guess it's
fine.

I have a change to pseries/setup.c queued that might collide, though
it's just an addition of another include so it's a trivial fixup.

Acked-by: Michael Ellerman <mpe@ellerman.id.au>


In terms of longer term clean up, do you have a sketch of what you'd
like to see?

cheers

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v3] PCI: Data corruption happening due to race condition
  2018-07-03  9:05 [PATCH v3] PCI: Data corruption happening due to race condition Hari Vyas
  2018-07-03  9:05 ` Hari Vyas
@ 2018-07-31 16:37 ` Bjorn Helgaas
  2018-08-15  3:35   ` PCIe enable device races (Was: [PATCH v3] PCI: Data corruption happening due to race condition) Benjamin Herrenschmidt
  1 sibling, 1 reply; 51+ messages in thread
From: Bjorn Helgaas @ 2018-07-31 16:37 UTC (permalink / raw)
  To: Hari Vyas; +Cc: bhelgaas, benh, linux-pci, ray.jui

On Tue, Jul 03, 2018 at 02:35:40PM +0530, Hari Vyas wrote:
> Changes in v3:
> 	As per review comments from Lukas Wunner <lukas@wunner.de>,
> 	squashed 3 commits to single commit. Without this build breaks.
> 	Also clubbed set and clear function for is_added bits to a
> 	single assign function. This optimizes code and reduce LoC.
> 	Removed one wrongly added blank line in pci.c
> 	 
> Changes in v2:
>         To avoid race condition while updating is_added and is_busmaster
>         bits, is_added is moved to a private flag variable.
>         is_added updation is handled in atomic manner also.
> 
> Hari Vyas (1):
>   PCI: Data corruption happening due to race condition
> 
>  arch/powerpc/kernel/pci-common.c          |  4 +++-
>  arch/powerpc/platforms/powernv/pci-ioda.c |  3 ++-
>  arch/powerpc/platforms/pseries/setup.c    |  3 ++-
>  drivers/pci/bus.c                         |  6 +++---
>  drivers/pci/hotplug/acpiphp_glue.c        |  2 +-
>  drivers/pci/pci.h                         | 11 +++++++++++
>  drivers/pci/probe.c                       |  4 ++--
>  drivers/pci/remove.c                      |  5 +++--
>  include/linux/pci.h                       |  1 -
>  9 files changed, 27 insertions(+), 12 deletions(-)

Applied with Lukas' reviewed-by and Michael's ack to for-linus for v4.18,
thanks!

^ permalink raw reply	[flat|nested] 51+ messages in thread

* PCIe enable device races (Was: [PATCH v3] PCI: Data corruption happening due to race condition)
  2018-07-31 16:37 ` Bjorn Helgaas
@ 2018-08-15  3:35   ` Benjamin Herrenschmidt
  2018-08-15  4:16     ` Benjamin Herrenschmidt
  2018-08-15 18:50     ` PCIe enable device races (Was: [PATCH v3] PCI: Data corruption happening due to race condition) Bjorn Helgaas
  0 siblings, 2 replies; 51+ messages in thread
From: Benjamin Herrenschmidt @ 2018-08-15  3:35 UTC (permalink / raw)
  To: Bjorn Helgaas, Hari Vyas; +Cc: bhelgaas, linux-pci, ray.jui

On Tue, 2018-07-31 at 11:37 -0500, Bjorn Helgaas wrote:
> On Tue, Jul 03, 2018 at 02:35:40PM +0530, Hari Vyas wrote:
> > Changes in v3:
> > 	As per review comments from Lukas Wunner <lukas@wunner.de>,
> > 	squashed 3 commits to single commit. Without this build breaks.
> > 	Also clubbed set and clear function for is_added bits to a
> > 	single assign function. This optimizes code and reduce LoC.
> > 	Removed one wrongly added blank line in pci.c
> > 	 
> > Changes in v2:
> >         To avoid race condition while updating is_added and is_busmaster
> >         bits, is_added is moved to a private flag variable.
> >         is_added updation is handled in atomic manner also.
> > 
> > Hari Vyas (1):
> >   PCI: Data corruption happening due to race condition

Sooo .... I was chasing a different problem which makes me think we
have a deeper problem here.

In my case, I have a system with >70 nvme devices behind layers of
switches.

What I think is happening is all the nvme devices are probed in
parallel (the machine has about 40 CPU cores).

They all call pci_enable_device() around the same time.

This will walk up the bridge/switch chain and try to enable every
switch along the way. However there is *no* locking at the switch level
at all that I can see. Or am I missing something subtle ?

So here's an example simplified scenario:

	Bridge
	/    \
     dev A   dev B

Both dev A and B hit pci_enable_device() simultaneously, thus both
call pci_enable_bridge() at the same time: This does (simplified):

	if (pci_is_enabled(dev)) {
		if (!dev->is_busmaster)
			pci_set_master(dev);
		return;
	}

	retval = pci_enable_device(dev);
	if (retval)
		pci_err(dev, "Error enabling bridge (%d), continuing\n",
			retval);
	pci_set_master(dev);

Now the pci_is_enabled() just checks dev->enable_cnt and pci_enable_device()
increments it *before* enabling the device.

So it's possible that pci_is_enabled() returns true for the bridge for dev B
because dev A just did the atomic_inc_return(), but hasn't actually enabled
the bridge yet (hasn't yet hit the config space).

At that point, driver for dev B hits an MMIO and gets an UR response from
the bridge.
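
To make the window concrete, here is a simplified sketch of the current
ordering in pci_enable_device_flags() (paraphrased from the code touched
by the RFC later in this thread; bars computation and error handling
omitted):

	static int pci_enable_device_flags(struct pci_dev *dev, unsigned long flags)
	{
		int bars = 0;	/* the real code computes this from 'flags' */

		/* The counter is bumped first... */
		if (atomic_inc_return(&dev->enable_cnt) > 1)
			return 0;	/* "already enabled" -- but maybe not yet! */

		/*
		 * ...so from here until the config write below, a sibling
		 * device walking up to this bridge sees pci_is_enabled()
		 * return true and proceeds without waiting.
		 */

		return do_pci_enable_device(dev, bars);
	}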

I need to set up a rig to verify my theory but I think this is racy. The same
race is also present with dev->is_busmaster. Using bitmaps won't help.

What's really needed is a per device mutex covering all those operations
on a given device. (This would also allow us to get rid of those games with
atomics).

Any comments ?

Cheers,
Ben.

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: PCIe enable device races (Was: [PATCH v3] PCI: Data corruption happening due to race condition)
  2018-08-15  3:35   ` PCIe enable device races (Was: [PATCH v3] PCI: Data corruption happening due to race condition) Benjamin Herrenschmidt
@ 2018-08-15  4:16     ` Benjamin Herrenschmidt
  2018-08-15  4:44       ` Benjamin Herrenschmidt
  2018-08-15 18:50     ` PCIe enable device races (Was: [PATCH v3] PCI: Data corruption happening due to race condition) Bjorn Helgaas
  1 sibling, 1 reply; 51+ messages in thread
From: Benjamin Herrenschmidt @ 2018-08-15  4:16 UTC (permalink / raw)
  To: Bjorn Helgaas, Hari Vyas; +Cc: bhelgaas, linux-pci, ray.jui

On Wed, 2018-08-15 at 13:35 +1000, Benjamin Herrenschmidt wrote:
> What's really needed is a per device mutex covering all those operations
> on a given device. (This would also allow to get rid of those games with
> atomics).
> 
> Any comments ?

Note: I'm experimenting with a per-pci_dev mutex to protect the device
state. If it works out, I suggest we add it, then progressively
move things one by one under the protection of the mutex.

Any objection to the approach ? If it fixes my problem, I'll send
preliminary patches that cover the basic enable/disable and bus
master settings to start with. We should try to get them into
stable as this is breaking real world stuff for us.

I suspect large x86 systems get lucky because their BIOSes do all
the enables. It would be possible to reproduce the races there by
hot-plugging a large external drawer using a cable card with a switch.

Cheers,
Ben.
 

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: PCIe enable device races (Was: [PATCH v3] PCI: Data corruption happening due to race condition)
  2018-08-15  4:16     ` Benjamin Herrenschmidt
@ 2018-08-15  4:44       ` Benjamin Herrenschmidt
  2018-08-15  5:21         ` [RFC PATCH] pci: Proof of concept at fixing pci_enable_device/bridge races Benjamin Herrenschmidt
                           ` (2 more replies)
  0 siblings, 3 replies; 51+ messages in thread
From: Benjamin Herrenschmidt @ 2018-08-15  4:44 UTC (permalink / raw)
  To: Bjorn Helgaas, Hari Vyas; +Cc: bhelgaas, linux-pci, ray.jui

On Wed, 2018-08-15 at 14:16 +1000, Benjamin Herrenschmidt wrote:
> On Wed, 2018-08-15 at 13:35 +1000, Benjamin Herrenschmidt wrote:
> > What's really needed is a per device mutex covering all those operations
> > on a given device. (This would also allow to get rid of those games with
> > atomics).
> > 
> > Any comments ?
> 
> Note: I'm experienting with a per-pci_dev mutex to protect the device
> state. If it works out, I suggest we add it, then progressively
> move things one by one under the protection of the mutex.
> 
> Any objection to the approach ? If it fixes my problem, I'll send
> preliminary patches that cover the basic enable/disable and bus
> master settings to start with. We should try to get them into
> stable as this is breaking real world stuff for us.
> 
> I suspect large x86 systems get lucky because their BIOSes do all
> the enables. It would be possible to reproduce the races there by
> hot-pluging a large external drawer using a cable card with a switch

BTW. Should we plan a PCIe microconf for Plumbers ?

Cheers,
Ben.

^ permalink raw reply	[flat|nested] 51+ messages in thread

* [RFC PATCH] pci: Proof of concept at fixing pci_enable_device/bridge races
  2018-08-15  4:44       ` Benjamin Herrenschmidt
@ 2018-08-15  5:21         ` Benjamin Herrenschmidt
  2018-08-15 19:09         ` PCIe enable device races (Was: [PATCH v3] PCI: Data corruption happening due to race condition) Bjorn Helgaas
  2018-08-15 21:50         ` [RFC PATCH] pci: Proof of concept at fixing pci_enable_device/bridge races Benjamin Herrenschmidt
  2 siblings, 0 replies; 51+ messages in thread
From: Benjamin Herrenschmidt @ 2018-08-15  5:21 UTC (permalink / raw)
  To: Bjorn Helgaas, Hari Vyas; +Cc: bhelgaas, linux-pci, ray.jui

[Note: This isn't meant to be merged, it needs splitting at the very
least, see below]

This is something I cooked up quickly today to test if that would fix
my problems with large number of switch and NVME devices on POWER.

So far it does...

The issue (as discussed in the Re: PCIe enable device races thread) is
that pci_enable_device and related functions along with pci_set_master
and pci_enable_bridge are fundamentally racy.

There is no locking protecting the state of the device in pci_dev and
if multiple devices under the same bridge try to enable it simultaneously,
some of them will potentially start accessing it before it has actually
been enabled.

Now there are a LOT more fields in pci_dev that aren't covered by any
form of locking.

Additionally, some other parts such as ASPM or SR-IOV seem to be trying to do
their own ad-hoc locking, but will manipulate bit fields in pci_dev that
might be sharing words with other unrelated bit fields racily.

So I think we need to introduce a pci_dev mutex aimed at synchronizing
the main enable/master state of a given device, and possibly also the
bazillion of state bits held inside pci_dev.

I suggest a progressive approach:

 1- First we add the mutex, and put a red line comment "/* ----- */" in
    struct pci_dev, at the bottom

 2- We move the basic enable/disable/set_master stuff under that lock and
    move the corresponding fields in pci_dev below the line comment.

 3- Progressively, as we untangle them and verify their locking, we move
    individual fields below the line so that we have a good visibility of
    what has been addressed/audited already and what not

Note: I would very much like to keep most of the probing/resource allocation
single threaded, at least within a given domain. I know we have some attempts
at locking in the hotplug code for that; I'm not ready to tackle that or
change it at this stage.

We might be able to also address the is_added issues (Hari Vyas stuff) using
the same mutex, but I haven't looked into the details.

Not-signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---

diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 316496e99da9..9c4895c40f1d 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -1348,9 +1348,13 @@ static int do_pci_enable_device(struct pci_dev *dev, int bars)
  */
 int pci_reenable_device(struct pci_dev *dev)
 {
+	int rc = 0;
+
+	mutex_lock(&dev->lock);
 	if (pci_is_enabled(dev))
-		return do_pci_enable_device(dev, (1 << PCI_NUM_RESOURCES) - 1);
-	return 0;
+		rc = do_pci_enable_device(dev, (1 << PCI_NUM_RESOURCES) - 1);
+	mutex_unlock(&dev->lock);
+	return rc;
 }
 EXPORT_SYMBOL(pci_reenable_device);
 
@@ -1363,12 +1367,6 @@ static void pci_enable_bridge(struct pci_dev *dev)
 	if (bridge)
 		pci_enable_bridge(bridge);
 
-	if (pci_is_enabled(dev)) {
-		if (!dev->is_busmaster)
-			pci_set_master(dev);
-		return;
-	}
-
 	retval = pci_enable_device(dev);
 	if (retval)
 		pci_err(dev, "Error enabling bridge (%d), continuing\n",
@@ -1379,9 +1377,16 @@ static void pci_enable_bridge(struct pci_dev *dev)
 static int pci_enable_device_flags(struct pci_dev *dev, unsigned long flags)
 {
 	struct pci_dev *bridge;
-	int err;
+	int err = 0;
 	int i, bars = 0;
 
+	/* Handle upstream bridges first to avoid locking issues */
+	bridge = pci_upstream_bridge(dev);
+	if (bridge)
+		pci_enable_bridge(bridge);
+
+	mutex_lock(&dev->lock);
+
 	/*
 	 * Power state could be unknown at this point, either due to a fresh
 	 * boot or a device removal call.  So get the current power state
@@ -1394,12 +1399,9 @@ static int pci_enable_device_flags(struct pci_dev *dev, unsigned long flags)
 		dev->current_state = (pmcsr & PCI_PM_CTRL_STATE_MASK);
 	}
 
+	/* Already enabled ? */
 	if (atomic_inc_return(&dev->enable_cnt) > 1)
-		return 0;		/* already enabled */
-
-	bridge = pci_upstream_bridge(dev);
-	if (bridge)
-		pci_enable_bridge(bridge);
+		goto bail;
 
 	/* only skip sriov related */
 	for (i = 0; i <= PCI_ROM_RESOURCE; i++)
@@ -1412,6 +1414,8 @@ static int pci_enable_device_flags(struct pci_dev *dev, unsigned long flags)
 	err = do_pci_enable_device(dev, bars);
 	if (err < 0)
 		atomic_dec(&dev->enable_cnt);
+ bail:
+	mutex_unlock(&dev->lock);
 	return err;
 }
 
@@ -1618,6 +1622,7 @@ static void do_pci_disable_device(struct pci_dev *dev)
 	if (pci_command & PCI_COMMAND_MASTER) {
 		pci_command &= ~PCI_COMMAND_MASTER;
 		pci_write_config_word(dev, PCI_COMMAND, pci_command);
+		dev->is_busmaster = 0;
 	}
 
 	pcibios_disable_device(dev);
@@ -1632,8 +1637,10 @@ static void do_pci_disable_device(struct pci_dev *dev)
  */
 void pci_disable_enabled_device(struct pci_dev *dev)
 {
+	mutex_lock(&dev->lock);
 	if (pci_is_enabled(dev))
 		do_pci_disable_device(dev);
+	mutex_unlock(&dev->lock);
 }
 
 /**
@@ -1657,12 +1664,13 @@ void pci_disable_device(struct pci_dev *dev)
 	dev_WARN_ONCE(&dev->dev, atomic_read(&dev->enable_cnt) <= 0,
 		      "disabling already-disabled device");
 
+	mutex_lock(&dev->lock);
 	if (atomic_dec_return(&dev->enable_cnt) != 0)
-		return;
+		goto bail;
 
 	do_pci_disable_device(dev);
-
-	dev->is_busmaster = 0;
+ bail:
+	mutex_unlock(&dev->lock);
 }
 EXPORT_SYMBOL(pci_disable_device);
 
@@ -3764,8 +3772,12 @@ void __weak pcibios_set_master(struct pci_dev *dev)
  */
 void pci_set_master(struct pci_dev *dev)
 {
-	__pci_set_master(dev, true);
-	pcibios_set_master(dev);
+	mutex_lock(&dev->lock);
+	if (!dev->is_busmaster) {
+		__pci_set_master(dev, true);
+		pcibios_set_master(dev);
+	}
+	mutex_unlock(&dev->lock);
 }
 EXPORT_SYMBOL(pci_set_master);
 
@@ -3775,7 +3787,9 @@ EXPORT_SYMBOL(pci_set_master);
  */
 void pci_clear_master(struct pci_dev *dev)
 {
+	mutex_lock(&dev->lock);
 	__pci_set_master(dev, false);
+	mutex_unlock(&dev->lock);
 }
 EXPORT_SYMBOL(pci_clear_master);
 
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 611adcd9c169..77701bfe0d3d 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -2102,6 +2102,7 @@ struct pci_dev *pci_alloc_dev(struct pci_bus *bus)
 	INIT_LIST_HEAD(&dev->bus_list);
 	dev->dev.type = &pci_dev_type;
 	dev->bus = pci_bus_get(bus);
+	mutex_init(&dev->lock);
 
 	return dev;
 }
diff --git a/include/linux/pci.h b/include/linux/pci.h
index c133ccfa002e..09e94ba6adf0 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -33,6 +33,7 @@
 #include <linux/io.h>
 #include <linux/resource_ext.h>
 #include <uapi/linux/pci.h>
+#include <linux/mutex.h>
 
 #include <linux/pci_ids.h>
 
@@ -289,6 +290,8 @@ struct pci_dev {
 	struct proc_dir_entry *procent;	/* Device entry in /proc/bus/pci */
 	struct pci_slot	*slot;		/* Physical slot this device is in */
 
+	struct mutex	lock;
+
 	unsigned int	devfn;		/* Encoded device & function index */
 	unsigned short	vendor;
 	unsigned short	device;

^ permalink raw reply related	[flat|nested] 51+ messages in thread

* Re: PCIe enable device races (Was: [PATCH v3] PCI: Data corruption happening due to race condition)
  2018-08-15  3:35   ` PCIe enable device races (Was: [PATCH v3] PCI: Data corruption happening due to race condition) Benjamin Herrenschmidt
  2018-08-15  4:16     ` Benjamin Herrenschmidt
@ 2018-08-15 18:50     ` Bjorn Helgaas
  2018-08-15 21:52       ` Benjamin Herrenschmidt
  1 sibling, 1 reply; 51+ messages in thread
From: Bjorn Helgaas @ 2018-08-15 18:50 UTC (permalink / raw)
  To: Benjamin Herrenschmidt; +Cc: Hari Vyas, bhelgaas, linux-pci, ray.jui

On Wed, Aug 15, 2018 at 01:35:16PM +1000, Benjamin Herrenschmidt wrote:
> On Tue, 2018-07-31 at 11:37 -0500, Bjorn Helgaas wrote:
> > On Tue, Jul 03, 2018 at 02:35:40PM +0530, Hari Vyas wrote:
> > > Changes in v3:
> > > 	As per review comments from Lukas Wunner <lukas@wunner.de>,
> > > 	squashed 3 commits to single commit. Without this build breaks.
> > > 	Also clubbed set and clear function for is_added bits to a
> > > 	single assign function. This optimizes code and reduce LoC.
> > > 	Removed one wrongly added blank line in pci.c
> > > 	 
> > > Changes in v2:
> > >         To avoid race condition while updating is_added and is_busmaster
> > >         bits, is_added is moved to a private flag variable.
> > >         is_added updation is handled in atomic manner also.
> > > 
> > > Hari Vyas (1):
> > >   PCI: Data corruption happening due to race condition
> 
> Sooo .... I was chasing a different problem which makes me think we
> have a deeper problem here.
> 
> In my case, I have a system with >70 nvme devices behind layers of
> switches.
> 
> What I think is happening is all the nvme devices are probed in
> parallel (the machine has about 40 CPU cores).
> 
> They all call pci_enable_device() around the same time.
> 
> This will walk up the bridge/switch chain and try to enable every
> switch along the way. However there is *no* locking at the switch level
> at all that I can see. Or am I missing something subtle ?
> 
> So here's an example simplified scenario:
> 
> 	Bridge
> 	/    \
>      dev A   dev B
> 
> Both dev A and B hit pci_enable_device() simultaneously, thus both
> call pci_enable_bridge() at the same time: This does (simplified):
> 
> 	if (pci_is_enabled(dev)) {
> 		if (!dev->is_busmaster)
> 			pci_set_master(dev);
> 		return;
> 	}
> 
> 	retval = pci_enable_device(dev);
> 	if (retval)
> 		pci_err(dev, "Error enabling bridge (%d), continuing\n",
> 			retval);
> 	pci_set_master(dev);
> 
> Now the pci_is_enabled() just checks dev->enable_cnt and pci_enable_device()
> increments it *before* enabling the device.
> 
> So it's possible that pci_is_enabled() returns true for the bridge for dev B
> because dev A just did the atomic_inc_return(), but hasn't actually enabled
> the bridge yet (hasnt yet hit the config space).
> 
> At that point, driver for dev B hits an MMIO and gets an UR response from
> the bridge.
> 
> I need to setup a rig to verify my theory but I think this is racy. The same
> race is also present with dev->is_busmaster. Using bitmaps won't help.
> 
> What's really needed is a per device mutex covering all those operations
> on a given device. (This would also allow to get rid of those games with
> atomics).

Yes, this is definitely broken.  Some folks have tried to fix it in
the past, but it hasn't quite happened yet.  We actually merged one
patch, 40f11adc7cd9 ("PCI: Avoid race while enabling upstream
bridges"), but had to revert it after we found issues:

https://lkml.kernel.org/r/1501858648-22228-1-git-send-email-srinath.mannam@broadcom.com
https://lkml.kernel.org/r/20170915072352.10453.31977.stgit@bhelgaas-glaptop.roam.corp.google.com

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: PCIe enable device races (Was: [PATCH v3] PCI: Data corruption happening due to race condition)
  2018-08-15  4:44       ` Benjamin Herrenschmidt
  2018-08-15  5:21         ` [RFC PATCH] pci: Proof of concept at fixing pci_enable_device/bridge races Benjamin Herrenschmidt
@ 2018-08-15 19:09         ` Bjorn Helgaas
  2018-08-15 21:50         ` [RFC PATCH] pci: Proof of concept at fixing pci_enable_device/bridge races Benjamin Herrenschmidt
  2 siblings, 0 replies; 51+ messages in thread
From: Bjorn Helgaas @ 2018-08-15 19:09 UTC (permalink / raw)
  To: Benjamin Herrenschmidt; +Cc: Hari Vyas, bhelgaas, linux-pci, ray.jui

On Wed, Aug 15, 2018 at 02:44:22PM +1000, Benjamin Herrenschmidt wrote:
> BTW. Should we plan a PCIe microconf for Plumbers ?

Last year Lorenzo organized a PCI miniconf, but he doesn't plan to be
there this year.  I do plan to be at Plumbers and would be happy to
participate.  We would need somebody to put together a proposal
(topics, speakers, key participants).  Or we could just do a BoF.
Or we could identify particular topics and set up dinner for
interested people.

^ permalink raw reply	[flat|nested] 51+ messages in thread

* [RFC PATCH] pci: Proof of concept at fixing pci_enable_device/bridge races
  2018-08-15  4:44       ` Benjamin Herrenschmidt
  2018-08-15  5:21         ` [RFC PATCH] pci: Proof of concept at fixing pci_enable_device/bridge races Benjamin Herrenschmidt
  2018-08-15 19:09         ` PCIe enable device races (Was: [PATCH v3] PCI: Data corruption happening due to race condition) Bjorn Helgaas
@ 2018-08-15 21:50         ` Benjamin Herrenschmidt
  2018-08-15 22:40           ` Guenter Roeck
  2018-08-17  3:07           ` Bjorn Helgaas
  2 siblings, 2 replies; 51+ messages in thread
From: Benjamin Herrenschmidt @ 2018-08-15 21:50 UTC (permalink / raw)
  To: Bjorn Helgaas, Hari Vyas; +Cc: bhelgaas, linux-pci, ray.jui, linux-kernel

(Resent with lkml on copy)

[Note: This isn't meant to be merged, it needs splitting at the very
least, see below]

This is something I cooked up quickly today to test if that would fix
my problems with large number of switch and NVME devices on POWER.

So far it does...

The issue (as discussed in the Re: PCIe enable device races thread) is
that pci_enable_device and related functions along with pci_set_master
and pci_enable_bridge are fundamentally racy.

There is no locking protecting the state of the device in pci_dev and
if multiple devices under the same bridge try to enable it simultaneously,
some of them will potentially start accessing it before it has actually
been enabled.

Now there are a LOT more fields in pci_dev that aren't covered by any
form of locking.

Additionally, some other parts such as ASPM or SR-IOV seem to be trying to do
their own ad-hoc locking, but will manipulate bit fields in pci_dev that
might be sharing words with other unrelated bit fields racily.

So I think we need to introduce a pci_dev mutex aimed at synchronizing
the main enable/master state of a given device, and possibly also the
bazillion of state bits held inside pci_dev.

I suggest a progressive approach:

 1- First we add the mutex, and put a red line comment "/* ----- */" in
    struct pci_dev, at the bottom

 2- We move the basic enable/disable/set_master stuff under that lock and
    move the corresponding fields in pci_dev below the line comment.

 3- Progressively, as we untangle them and verify their locking, we move
    individual fields below the line so that we have a good visibility of
    what has been addressed/audited already and what not

Note: I would very much like to keep most of the probing/resource allocation
single threaded, at least within a given domain. I know we have some attempts
at locking in the hotplug code for that; I'm not ready to tackle that or
change it at this stage.

We might be able to also address the is_added issues (Hari Vyas stuff) using
the same mutex, but I haven't looked into the details.

Not-signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---

diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 316496e99da9..9c4895c40f1d 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -1348,9 +1348,13 @@ static int do_pci_enable_device(struct pci_dev *dev, int bars)
  */
 int pci_reenable_device(struct pci_dev *dev)
 {
+	int rc = 0;
+
+	mutex_lock(&dev->lock);
 	if (pci_is_enabled(dev))
-		return do_pci_enable_device(dev, (1 << PCI_NUM_RESOURCES) - 1);
-	return 0;
+		rc = do_pci_enable_device(dev, (1 << PCI_NUM_RESOURCES) - 1);
+	mutex_unlock(&dev->lock);
+	return rc;
 }
 EXPORT_SYMBOL(pci_reenable_device);
 
@@ -1363,12 +1367,6 @@ static void pci_enable_bridge(struct pci_dev *dev)
 	if (bridge)
 		pci_enable_bridge(bridge);
 
-	if (pci_is_enabled(dev)) {
-		if (!dev->is_busmaster)
-			pci_set_master(dev);
-		return;
-	}
-
 	retval = pci_enable_device(dev);
 	if (retval)
 		pci_err(dev, "Error enabling bridge (%d), continuing\n",
@@ -1379,9 +1377,16 @@ static void pci_enable_bridge(struct pci_dev *dev)
 static int pci_enable_device_flags(struct pci_dev *dev, unsigned long flags)
 {
 	struct pci_dev *bridge;
-	int err;
+	int err = 0;
 	int i, bars = 0;
 
+	/* Handle upstream bridges first to avoid locking issues */
+	bridge = pci_upstream_bridge(dev);
+	if (bridge)
+		pci_enable_bridge(bridge);
+
+	mutex_lock(&dev->lock);
+
 	/*
 	 * Power state could be unknown at this point, either due to a fresh
 	 * boot or a device removal call.  So get the current power state
@@ -1394,12 +1399,9 @@ static int pci_enable_device_flags(struct pci_dev *dev, unsigned long flags)
 		dev->current_state = (pmcsr & PCI_PM_CTRL_STATE_MASK);
 	}
 
+	/* Already enabled ? */
 	if (atomic_inc_return(&dev->enable_cnt) > 1)
-		return 0;		/* already enabled */
-
-	bridge = pci_upstream_bridge(dev);
-	if (bridge)
-		pci_enable_bridge(bridge);
+		goto bail;
 
 	/* only skip sriov related */
 	for (i = 0; i <= PCI_ROM_RESOURCE; i++)
@@ -1412,6 +1414,8 @@ static int pci_enable_device_flags(struct pci_dev *dev, unsigned long flags)
 	err = do_pci_enable_device(dev, bars);
 	if (err < 0)
 		atomic_dec(&dev->enable_cnt);
+ bail:
+	mutex_unlock(&dev->lock);
 	return err;
 }
 
@@ -1618,6 +1622,7 @@ static void do_pci_disable_device(struct pci_dev *dev)
 	if (pci_command & PCI_COMMAND_MASTER) {
 		pci_command &= ~PCI_COMMAND_MASTER;
 		pci_write_config_word(dev, PCI_COMMAND, pci_command);
+		dev->is_busmaster = 0;
 	}
 
 	pcibios_disable_device(dev);
@@ -1632,8 +1637,10 @@ static void do_pci_disable_device(struct pci_dev *dev)
  */
 void pci_disable_enabled_device(struct pci_dev *dev)
 {
+	mutex_lock(&dev->lock);
 	if (pci_is_enabled(dev))
 		do_pci_disable_device(dev);
+	mutex_unlock(&dev->lock);
 }
 
 /**
@@ -1657,12 +1664,13 @@ void pci_disable_device(struct pci_dev *dev)
 	dev_WARN_ONCE(&dev->dev, atomic_read(&dev->enable_cnt) <= 0,
 		      "disabling already-disabled device");
 
+	mutex_lock(&dev->lock);
 	if (atomic_dec_return(&dev->enable_cnt) != 0)
-		return;
+		goto bail;
 
 	do_pci_disable_device(dev);
-
-	dev->is_busmaster = 0;
+ bail:
+	mutex_unlock(&dev->lock);
 }
 EXPORT_SYMBOL(pci_disable_device);
 
@@ -3764,8 +3772,12 @@ void __weak pcibios_set_master(struct pci_dev *dev)
  */
 void pci_set_master(struct pci_dev *dev)
 {
-	__pci_set_master(dev, true);
-	pcibios_set_master(dev);
+	mutex_lock(&dev->lock);
+	if (!dev->is_busmaster) {
+		__pci_set_master(dev, true);
+		pcibios_set_master(dev);
+	}
+	mutex_unlock(&dev->lock);
 }
 EXPORT_SYMBOL(pci_set_master);
 
@@ -3775,7 +3787,9 @@ EXPORT_SYMBOL(pci_set_master);
  */
 void pci_clear_master(struct pci_dev *dev)
 {
+	mutex_lock(&dev->lock);
 	__pci_set_master(dev, false);
+	mutex_unlock(&dev->lock);
 }
 EXPORT_SYMBOL(pci_clear_master);
 
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 611adcd9c169..77701bfe0d3d 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -2102,6 +2102,7 @@ struct pci_dev *pci_alloc_dev(struct pci_bus *bus)
 	INIT_LIST_HEAD(&dev->bus_list);
 	dev->dev.type = &pci_dev_type;
 	dev->bus = pci_bus_get(bus);
+	mutex_init(&dev->lock);
 
 	return dev;
 }
diff --git a/include/linux/pci.h b/include/linux/pci.h
index c133ccfa002e..09e94ba6adf0 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -33,6 +33,7 @@
 #include <linux/io.h>
 #include <linux/resource_ext.h>
 #include <uapi/linux/pci.h>
+#include <linux/mutex.h>
 
 #include <linux/pci_ids.h>
 
@@ -289,6 +290,8 @@ struct pci_dev {
 	struct proc_dir_entry *procent;	/* Device entry in /proc/bus/pci */
 	struct pci_slot	*slot;		/* Physical slot this device is in */
 
+	struct mutex	lock;
+
 	unsigned int	devfn;		/* Encoded device & function index */
 	unsigned short	vendor;
 	unsigned short	device;

^ permalink raw reply related	[flat|nested] 51+ messages in thread

* Re: PCIe enable device races (Was: [PATCH v3] PCI: Data corruption happening due to race condition)
  2018-08-15 18:50     ` PCIe enable device races (Was: [PATCH v3] PCI: Data corruption happening due to race condition) Bjorn Helgaas
@ 2018-08-15 21:52       ` Benjamin Herrenschmidt
  2018-08-15 23:23         ` Benjamin Herrenschmidt
                           ` (2 more replies)
  0 siblings, 3 replies; 51+ messages in thread
From: Benjamin Herrenschmidt @ 2018-08-15 21:52 UTC (permalink / raw)
  To: Bjorn Helgaas
  Cc: Hari Vyas, bhelgaas, linux-pci, ray.jui, Konstantin Khlebnikov,
	Jens Axboe

On Wed, 2018-08-15 at 13:50 -0500, Bjorn Helgaas wrote:
> Yes, this is definitely broken.  Some folks have tried to fix it in
> the past, but it hasn't quite happened yet.  We actually merged one
> patch, 40f11adc7cd9 ("PCI: Avoid race while enabling upstream
> bridges"), but had to revert it after we found issues:
> 
> https://lkml.kernel.org/r/1501858648-22228-1-git-send-email-srinath.mannam@broadcom.com
> https://lkml.kernel.org/r/20170915072352.10453.31977.stgit@bhelgaas-glaptop.roam.corp.google.com

Ok so I had a look at this previous patch and it adds yet another use of
some global mutex to protect part of the operation, which makes me
cringe a bit; we have too many of these.

What do you think of the one I sent yesterday ? (I can't find it in the
archives yet)

[RFC PATCH] pci: Proof of concept at fixing pci_enable_device/bridge races

The patch itself needs splitting etc... but the basic idea is to move away
from those global mutexes in a number of places and have one in the pci_dev
struct itself to protect its state.

I would also like to use this rather than the bitmap atomics for is_added
etc... (Hari's fix) in the long run. Atomics aren't significantly cheaper
and imho make things even messier.

Jens, Konstantin, any chance you can test if the above also breaks iwlwifi
(I don't see why it would but ...)

Cheers,
Ben.

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [RFC PATCH] pci: Proof of concept at fixing pci_enable_device/bridge races
  2018-08-15 21:50         ` [RFC PATCH] pci: Proof of concept at fixing pci_enable_device/bridge races Benjamin Herrenschmidt
@ 2018-08-15 22:40           ` Guenter Roeck
  2018-08-15 23:38             ` Benjamin Herrenschmidt
  2018-08-17  3:07           ` Bjorn Helgaas
  1 sibling, 1 reply; 51+ messages in thread
From: Guenter Roeck @ 2018-08-15 22:40 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: Bjorn Helgaas, Hari Vyas, bhelgaas, linux-pci, ray.jui, linux-kernel

On Thu, Aug 16, 2018 at 07:50:13AM +1000, Benjamin Herrenschmidt wrote:
> (Resent with lkml on copy)
> 
> [Note: This isn't meant to be merged, it need splitting at the very
> least, see below]
> 
> This is something I cooked up quickly today to test if that would fix
> my problems with large number of switch and NVME devices on POWER.
> 

Is that a problem that can be reproduced with a qemu setup ?

Thanks,
Guenter

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: PCIe enable device races (Was: [PATCH v3] PCI: Data corruption happening due to race condition)
  2018-08-15 21:52       ` Benjamin Herrenschmidt
@ 2018-08-15 23:23         ` Benjamin Herrenschmidt
  2018-08-16  7:58         ` Konstantin Khlebnikov
  2018-08-16 12:28         ` Lukas Wunner
  2 siblings, 0 replies; 51+ messages in thread
From: Benjamin Herrenschmidt @ 2018-08-15 23:23 UTC (permalink / raw)
  To: Bjorn Helgaas
  Cc: Hari Vyas, bhelgaas, linux-pci, ray.jui, Konstantin Khlebnikov,
	Jens Axboe

On Thu, 2018-08-16 at 07:52 +1000, Benjamin Herrenschmidt wrote:
> On Wed, 2018-08-15 at 13:50 -0500, Bjorn Helgaas wrote:
> > Yes, this is definitely broken.  Some folks have tried to fix it in
> > the past, but it hasn't quite happened yet.  We actually merged one
> > patch, 40f11adc7cd9 ("PCI: Avoid race while enabling upstream
> > bridges"), but had to revert it after we found issues:
> > 
> > https://lkml.kernel.org/r/1501858648-22228-1-git-send-email-srinath.mannam@broadcom.com
> > https://lkml.kernel.org/r/20170915072352.10453.31977.stgit@bhelgaas-glaptop.roam.corp.google.com
> 
> Ok so I had a look at this previous patch and it adds yet anothe use of
> some global mutex to protect part of the operation which makes me
> cringe a bit, we have too many of these.
> 
> What do you think of the one I sent yesterday ? (I can't find it in the
> archives yet)

I verified on my Thinkpad X1 Carbon Gen4, everything seems to be
working fine with the patch applied, including iwlwifi.

> [RFC PATCH] pci: Proof of concept at fixing pci_enable_device/bridge races
> 
> The patch itself needs splitting etc... but the basic idea is to move away
> from those global mutexes in a number of places and have one in the pci_dev
> struct itself to protect its state.
> 
> I would also like to use this rather than the bitmap atomics for is_added
> etc... (Hari's fix) in the long run. Atomics aren't significantly cheaper
> and imho makes thing even messier.
> 
> Jens, Konstantin, any chance you can test if the above also breaks iwlwifi
> (I don't see why it would but ...)
> 
> Cheers,
> Ben.
> 

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [RFC PATCH] pci: Proof of concept at fixing pci_enable_device/bridge races
  2018-08-15 22:40           ` Guenter Roeck
@ 2018-08-15 23:38             ` Benjamin Herrenschmidt
  2018-08-20  1:31               ` Guenter Roeck
  0 siblings, 1 reply; 51+ messages in thread
From: Benjamin Herrenschmidt @ 2018-08-15 23:38 UTC (permalink / raw)
  To: Guenter Roeck
  Cc: Bjorn Helgaas, Hari Vyas, bhelgaas, linux-pci, ray.jui, linux-kernel

On Wed, 2018-08-15 at 15:40 -0700, Guenter Roeck wrote:
> On Thu, Aug 16, 2018 at 07:50:13AM +1000, Benjamin Herrenschmidt wrote:
> > (Resent with lkml on copy)
> > 
> > [Note: This isn't meant to be merged, it need splitting at the very
> > least, see below]
> > 
> > This is something I cooked up quickly today to test if that would fix
> > my problems with large number of switch and NVME devices on POWER.
> > 
> 
> Is that a problem that can be reproduced with a qemu setup ?

With difficulty... mt-tcg might help, but you need a rather large
system to reproduce it.

My repro-case is a 2-socket POWER9 system (about 40 cores off the top
of my head, so 160 threads) with 72 NVME devices underneath a tree of
switches (I don't have the system at hand today to check how many).

It's possible to observe it I suppose on a smaller system (in theory a
single bridge with 2 devices is enough) but in practice the timing is
extremely hard to hit.

You need a combination of:

  - The bridges come up disabled (which is the case when Linux does the
resource assignment, such as on POWER but not on x86 unless it's
hotplug)

  - The nvme devices try to enable them simultaneously

Also the resulting error is a UR, I don't know how well qemu models
that.

On the above system, I get usually *one* device failing due to the race
out of 72, and not on every boot.

However, the bug is known (see Bjorn's reply in the other thread, "Re:
PCIe enable device races (Was: [PATCH v3] PCI: Data corruption
happening due to race condition)", on linux-pci), so I'm not the only one
with a repro-case around.

Cheers,
Ben.

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: PCIe enable device races (Was: [PATCH v3] PCI: Data corruption happening due to race condition)
  2018-08-15 21:52       ` Benjamin Herrenschmidt
  2018-08-15 23:23         ` Benjamin Herrenschmidt
@ 2018-08-16  7:58         ` Konstantin Khlebnikov
  2018-08-16  8:02           ` Benjamin Herrenschmidt
  2018-08-16 12:28         ` Lukas Wunner
  2 siblings, 1 reply; 51+ messages in thread
From: Konstantin Khlebnikov @ 2018-08-16  7:58 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Bjorn Helgaas
  Cc: Hari Vyas, bhelgaas, linux-pci, ray.jui, Jens Axboe

On 16.08.2018 00:52, Benjamin Herrenschmidt wrote:
> On Wed, 2018-08-15 at 13:50 -0500, Bjorn Helgaas wrote:
>> Yes, this is definitely broken.  Some folks have tried to fix it in
>> the past, but it hasn't quite happened yet.  We actually merged one
>> patch, 40f11adc7cd9 ("PCI: Avoid race while enabling upstream
>> bridges"), but had to revert it after we found issues:
>>
>> https://lkml.kernel.org/r/1501858648-22228-1-git-send-email-srinath.mannam@broadcom.com
>> https://lkml.kernel.org/r/20170915072352.10453.31977.stgit@bhelgaas-glaptop.roam.corp.google.com
> 
> Ok so I had a look at this previous patch and it adds yet anothe use of
> some global mutex to protect part of the operation which makes me
> cringe a bit, we have too many of these.
> 
> What do you think of the one I sent yesterday ? (I can't find it in the
> archives yet)
> 
> [RFC PATCH] pci: Proof of concept at fixing pci_enable_device/bridge races
> 
> The patch itself needs splitting etc... but the basic idea is to move away
> from those global mutexes in a number of places and have one in the pci_dev
> struct itself to protect its state.
> 
> I would also like to use this rather than the bitmap atomics for is_added
> etc... (Hari's fix) in the long run. Atomics aren't significantly cheaper
> and imho makes thing even messier.
> 
> Jens, Konstantin, any chance you can test if the above also breaks iwlwifi
> (I don't see why it would but ...)
> 

I suppose original race was discovered between enabling bridge and device as described here

https://lore.kernel.org/lkml/150547971091.977464.16294045866179907260.stgit@buzz/T/#u

I can barely remember that I ever posted this, so I couldn't reproduce it for sure.

> Cheers,
> Ben.
> 
> 

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: PCIe enable device races (Was: [PATCH v3] PCI: Data corruption happening due to race condition)
  2018-08-16  7:58         ` Konstantin Khlebnikov
@ 2018-08-16  8:02           ` Benjamin Herrenschmidt
  2018-08-16  9:22             ` Hari Vyas
  2018-08-16 19:43             ` Jens Axboe
  0 siblings, 2 replies; 51+ messages in thread
From: Benjamin Herrenschmidt @ 2018-08-16  8:02 UTC (permalink / raw)
  To: Konstantin Khlebnikov, Bjorn Helgaas
  Cc: Hari Vyas, bhelgaas, linux-pci, ray.jui, Jens Axboe

On Thu, 2018-08-16 at 10:58 +0300, Konstantin Khlebnikov wrote:
> On 16.08.2018 00:52, Benjamin Herrenschmidt wrote:
> > On Wed, 2018-08-15 at 13:50 -0500, Bjorn Helgaas wrote:
> > > Yes, this is definitely broken.  Some folks have tried to fix it in
> > > the past, but it hasn't quite happened yet.  We actually merged one
> > > patch, 40f11adc7cd9 ("PCI: Avoid race while enabling upstream
> > > bridges"), but had to revert it after we found issues:
> > > 
> > > https://lkml.kernel.org/r/1501858648-22228-1-git-send-email-srinath.mannam@broadcom.com
> > > https://lkml.kernel.org/r/20170915072352.10453.31977.stgit@bhelgaas-glaptop.roam.corp.google.com
> > 
> > Ok so I had a look at this previous patch and it adds yet anothe use of
> > some global mutex to protect part of the operation which makes me
> > cringe a bit, we have too many of these.
> > 
> > What do you think of the one I sent yesterday ? (I can't find it in the
> > archives yet)
> > 
> > [RFC PATCH] pci: Proof of concept at fixing pci_enable_device/bridge races
> > 
> > The patch itself needs splitting etc... but the basic idea is to move away
> > from those global mutexes in a number of places and have one in the pci_dev
> > struct itself to protect its state.
> > 
> > I would also like to use this rather than the bitmap atomics for is_added
> > etc... (Hari's fix) in the long run. Atomics aren't significantly cheaper
> > and imho makes thing even messier.
> > 
> > Jens, Konstantin, any chance you can test if the above also breaks iwlwifi
> > (I don't see why it would but ...)
> > 
> 
> I suppose original race was discovered between enabling bridge and device as described here
> 
> https://lore.kernel.org/lkml/150547971091.977464.16294045866179907260.stgit@buzz/T/#u
> 
> I barely can remember what I ever posted this, so I couldn't reproduce for sure.

Ok. Well, my patch fixes it for my repro-case at least and seems to not
break anything on my thinkpad so ...

Bjorn, are you ok with the approach ? If yes, I'll start breaking up
that patch into a few smaller bits in case something goes wrong and we
want to bisect (such as the changes I did to tracking is_busmaster
etc...)

Cheers,
ben.
 

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: PCIe enable device races (Was: [PATCH v3] PCI: Data corruption happening due to race condition)
  2018-08-16  8:02           ` Benjamin Herrenschmidt
@ 2018-08-16  9:22             ` Hari Vyas
  2018-08-16 10:10               ` Benjamin Herrenschmidt
  2018-08-16 19:43             ` Jens Axboe
  1 sibling, 1 reply; 51+ messages in thread
From: Hari Vyas @ 2018-08-16  9:22 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: Konstantin Khlebnikov, Bjorn Helgaas, Bjorn Helgaas, linux-pci,
	Ray Jui, Jens Axboe

On Thu, Aug 16, 2018 at 1:32 PM, Benjamin Herrenschmidt
<benh@kernel.crashing.org> wrote:
> On Thu, 2018-08-16 at 10:58 +0300, Konstantin Khlebnikov wrote:
>> On 16.08.2018 00:52, Benjamin Herrenschmidt wrote:
>> > On Wed, 2018-08-15 at 13:50 -0500, Bjorn Helgaas wrote:
>> > > Yes, this is definitely broken.  Some folks have tried to fix it in
>> > > the past, but it hasn't quite happened yet.  We actually merged one
>> > > patch, 40f11adc7cd9 ("PCI: Avoid race while enabling upstream
>> > > bridges"), but had to revert it after we found issues:
>> > >
>> > > https://lkml.kernel.org/r/1501858648-22228-1-git-send-email-srinath.mannam@broadcom.com
>> > > https://lkml.kernel.org/r/20170915072352.10453.31977.stgit@bhelgaas-glaptop.roam.corp.google.com
>> >
>> > Ok so I had a look at this previous patch and it adds yet anothe use of
>> > some global mutex to protect part of the operation which makes me
>> > cringe a bit, we have too many of these.
>> >
>> > What do you think of the one I sent yesterday ? (I can't find it in the
>> > archives yet)
>> >
>> > [RFC PATCH] pci: Proof of concept at fixing pci_enable_device/bridge races
>> >
>> > The patch itself needs splitting etc... but the basic idea is to move away
>> > from those global mutexes in a number of places and have one in the pci_dev
>> > struct itself to protect its state.
>> >
>> > I would also like to use this rather than the bitmap atomics for is_added
>> > etc... (Hari's fix) in the long run. Atomics aren't significantly cheaper
>> > and imho makes thing even messier.
>> >
>> > Jens, Konstantin, any chance you can test if the above also breaks iwlwifi
>> > (I don't see why it would but ...)
>> >
>>
>> I suppose original race was discovered between enabling bridge and device as described here
>>
>> https://lore.kernel.org/lkml/150547971091.977464.16294045866179907260.stgit@buzz/T/#u
>>
>> I barely can remember what I ever posted this, so I couldn't reproduce for sure.
>
> Ok. Well, my patch fixes it for my repro-case at least and seems to not
> break anyhting on my thinkpad so ...
>
> Bjorn, are you ok with the approach ? If yes, I'll start breaking up
> that patch into a few smaller bits in case something goes wrong and we
> want to bisect (such as the changes I did to tracking is_busmaster
> etc...)
>
> Cheers,
> ben.
>
>
There was an issue reported by my colleague Srinath while enabling a pci
bridge: a race condition was happening while setting the memory and master
bits, i.e. the bits were over-written.
As per my understanding, the is_busmaster and is_added bit race issue was
in internal data management and is quite different from the pci bridge
enabling issue.
Am I missing something? I would be interested to know what exactly was
affected by the is_busmaster fix.

In any case, one bug is already filed and I may propose a patch soon
about the pci bridge enabling scenario. In summary, bit manipulation is
not working fine due to race conditions in an SMP environment.
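
For anyone who wants to see the effect outside the kernel, here is a tiny
userspace illustration of why writes to adjacent bitfields race (purely
illustrative, not kernel code; you usually need to run it in a loop to
catch a lost update):

/*
 * Two threads each set one of two bitfields sharing the same word.
 * Each store is a read-modify-write of the whole word, so with unlucky
 * timing one of the two updates is silently lost.
 */
#include <pthread.h>
#include <stdio.h>

struct flags {
	unsigned int is_added:1;
	unsigned int is_busmaster:1;
};

static struct flags f;

static void *set_added(void *arg)
{
	f.is_added = 1;		/* load word, OR in the bit, store word */
	return NULL;
}

static void *set_busmaster(void *arg)
{
	f.is_busmaster = 1;	/* same word: can overwrite is_added */
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, set_added, NULL);
	pthread_create(&b, NULL, set_busmaster, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	printf("is_added=%d is_busmaster=%d\n", f.is_added, f.is_busmaster);
	return 0;
}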

Regards,
hari

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: PCIe enable device races (Was: [PATCH v3] PCI: Data corruption happening due to race condition)
  2018-08-16  9:22             ` Hari Vyas
@ 2018-08-16 10:10               ` Benjamin Herrenschmidt
  2018-08-16 10:11                 ` Benjamin Herrenschmidt
  2018-08-16 10:26                 ` Lukas Wunner
  0 siblings, 2 replies; 51+ messages in thread
From: Benjamin Herrenschmidt @ 2018-08-16 10:10 UTC (permalink / raw)
  To: Hari Vyas
  Cc: Konstantin Khlebnikov, Bjorn Helgaas, Bjorn Helgaas, linux-pci,
	Ray Jui, Jens Axboe

On Thu, 2018-08-16 at 14:52 +0530, Hari Vyas wrote:
> 
> There was an issue reported by my colleague srinath while enabling pci
> bridge and a race
> condition was happening while setting memory and master bits i.e. bits
> were over-written.
> As per my understanding is_busmaster and is_added bit race issue was
> at internal data
> management and is quite different from pci bridge enabling issue.
> Am I missing some thing ? Would be interested to know what exactly was
> affected due to
> is_busmaster fix.

The is_busmaster fix isn't, I think, affecting anything; however I don't
like the use of atomics for these things. It's a band-aid. If we grow a
proper pci_dev mutex, which is what I'm introducing here, it should be
able to also handle the is_added race etc.

> In any case, one bug is already filed and may propose a patch soon
> about pci bridge enabling scenario.

I already did, see the patch I posted earlier.

>  In summary, bit manipulation is not working fine due to race
> conditions in SMP
> environment.

Right, among other things.

My proposed patch (which I will try to break down tomorrow or next week
into smaller bits) introduces a per-pci_dev mutex and uses it to fix
the enable & set_master races.

My proposal is to then progressively move more things under the
umbrella of that mutex in order to properly protect the pci_dev
internal state.
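
A rough sketch of what that could look like (only the 'struct mutex lock'
member below is actually in the RFC; the marker comment and which fields
end up below it are illustrative):

	struct pci_dev {
		...
		struct mutex	lock;	/* protects fields below the marker */

		/* fields above the marker: locking not audited yet */

		/* ----------------------------------------------------- */

		/* fields below the marker: only touched under 'lock' */
		atomic_t	enable_cnt;
		unsigned int	is_busmaster:1;
	};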

Cheers,
Ben.

> Regards,
> hari

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: PCIe enable device races (Was: [PATCH v3] PCI: Data corruption happening due to race condition)
  2018-08-16 10:10               ` Benjamin Herrenschmidt
@ 2018-08-16 10:11                 ` Benjamin Herrenschmidt
  2018-08-16 10:26                 ` Lukas Wunner
  1 sibling, 0 replies; 51+ messages in thread
From: Benjamin Herrenschmidt @ 2018-08-16 10:11 UTC (permalink / raw)
  To: Hari Vyas
  Cc: Konstantin Khlebnikov, Bjorn Helgaas, Bjorn Helgaas, linux-pci,
	Ray Jui, Jens Axboe

On Thu, 2018-08-16 at 20:10 +1000, Benjamin Herrenschmidt wrote:
> On Thu, 2018-08-16 at 14:52 +0530, Hari Vyas wrote:
> > 
> > There was an issue reported by my colleague srinath while enabling pci
> > bridge and a race
> > condition was happening while setting memory and master bits i.e. bits
> > were over-written.
> > As per my understanding is_busmaster and is_added bit race issue was
> > at internal data
> > management and is quite different from pci bridge enabling issue.
> > Am I missing some thing ? Would be interested to know what exactly was
> > affected due to
> > is_busmaster fix.
> 
> The is_busmaster fix isn't I think affecting anything, however I don't
> like the use of atomics for these things. It's a band-aid. If we grow a
> proper pci_dev mutex, which is what I'm introducing here, it should be
> able to also handle the is_added race etc..
> 
> > In any case, one bug is already filed and may propose a patch soon
> > about pci bridge enabling scenario.
> 
> I already did, see the patch I posted earlier.

https://lore.kernel.org/lkml/08bc40a6af3e614e97a78fbaab688bfcd14520ac.camel@kernel.crashing.org/

> >  In summary, bit manipulation is not working fine due to race
> > conditions in SMP
> > environment.
> 
> Right, among other things.
> 
> My proposed patch (which I will try to break down tomorrow or next week
> into smaller bits) introduces a per-pci_dev mutex and uses it to fix
> the enable & set_master races.
> 
> My proposal is to then progressively move more things under the
> umbrella of that mutex in order to properly protect the pci_dev
> internal state.
> 
> Cheers,
> Ben.
> 
> > Regards,
> > hari

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: PCIe enable device races (Was: [PATCH v3] PCI: Data corruption happening due to race condition)
  2018-08-16 10:10               ` Benjamin Herrenschmidt
  2018-08-16 10:11                 ` Benjamin Herrenschmidt
@ 2018-08-16 10:26                 ` Lukas Wunner
  2018-08-16 10:47                   ` Hari Vyas
  2018-08-16 23:17                   ` Benjamin Herrenschmidt
  1 sibling, 2 replies; 51+ messages in thread
From: Lukas Wunner @ 2018-08-16 10:26 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: Hari Vyas, Konstantin Khlebnikov, Bjorn Helgaas, Bjorn Helgaas,
	linux-pci, Ray Jui, Jens Axboe

On Thu, Aug 16, 2018 at 08:10:28PM +1000, Benjamin Herrenschmidt wrote:
> On Thu, 2018-08-16 at 14:52 +0530, Hari Vyas wrote:
> > There was an issue reported by my colleague srinath while enabling pci
> > bridge and a race condition was happening while setting memory and
> > master bits i.e. bits were over-written.
> > As per my understanding is_busmaster and is_added bit race issue was
> > at internal data management and is quite different from pci bridge
> > enabling issue.
> > Am I missing some thing ? Would be interested to know what exactly was
> > affected due to is_busmaster fix.
> 
> The is_busmaster fix isn't I think affecting anything, however I don't
> like the use of atomics for these things. It's a band-aid. If we grow a
> proper pci_dev mutex, which is what I'm introducing here, it should be
> able to also handle the is_added race etc..

What is your rationale to introduce an additional mutex instead of
utilizing the existing mutex in struct device via device_lock() /
device_unlock() or alternatively pci_dev_lock() / pci_dev_unlock()?

This is also what Bjorn had suggested here:
https://lore.kernel.org/lkml/20170816134354.GV32525@bhelgaas-glaptop.roam.corp.google.com/
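
For illustration, at a call site that would be roughly (just a sketch;
device_lock()/device_unlock() are the existing driver core wrappers
around the struct device mutex, and pdev is a struct pci_dev *):

	device_lock(&pdev->dev);
	if (!pdev->is_busmaster)
		pci_set_master(pdev);
	device_unlock(&pdev->dev);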

Thanks,

Lukas

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: PCIe enable device races (Was: [PATCH v3] PCI: Data corruption happening due to race condition)
  2018-08-16 10:26                 ` Lukas Wunner
@ 2018-08-16 10:47                   ` Hari Vyas
  2018-08-16 23:20                     ` Benjamin Herrenschmidt
  2018-08-16 23:17                   ` Benjamin Herrenschmidt
  1 sibling, 1 reply; 51+ messages in thread
From: Hari Vyas @ 2018-08-16 10:47 UTC (permalink / raw)
  To: Lukas Wunner
  Cc: Benjamin Herrenschmidt, Konstantin Khlebnikov, Bjorn Helgaas,
	Bjorn Helgaas, linux-pci, Ray Jui, Jens Axboe

On Thu, Aug 16, 2018 at 3:56 PM, Lukas Wunner <lukas@wunner.de> wrote:
> On Thu, Aug 16, 2018 at 08:10:28PM +1000, Benjamin Herrenschmidt wrote:
>> On Thu, 2018-08-16 at 14:52 +0530, Hari Vyas wrote:
>> > There was an issue reported by my colleague srinath while enabling pci
>> > bridge and a race condition was happening while setting memory and
>> > master bits i.e. bits were over-written.
>> > As per my understanding is_busmaster and is_added bit race issue was
>> > at internal data management and is quite different from pci bridge
>> > enabling issue.
>> > Am I missing some thing ? Would be interested to know what exactly was
>> > affected due to is_busmaster fix.
>>
>> The is_busmaster fix isn't I think affecting anything, however I don't
>> like the use of atomics for these things. It's a band-aid. If we grow a
>> proper pci_dev mutex, which is what I'm introducing here, it should be
>> able to also handle the is_added race etc..
>
> What is your rationale to introduce an additional mutex instead if
> utilizing the existing mutex in struct device via device_lock() /
> device_unlock() or alternatively pci_dev_lock() / pci_dev_unlock()?
>
> This is also what Bjorn had suggested here:
> https://lore.kernel.org/lkml/20170816134354.GV32525@bhelgaas-glaptop.roam.corp.google.com/
>
> Thanks,
>
> Lukas
Agreed. My proposed simple fix for the "pci bridge enabling" issue (not
easy to reproduce in our environment, so not tested yet, but I believe it
should work) also uses only the existing locking mechanism.

https://bugzilla.kernel.org/show_bug.cgi?id=200793

It currently addresses only PCI_COMMAND but can easily be extended to
other pci config registers that have bit fields.
Good that we are all headed in the same direction. The issue should be
fixed, even if it ends up being addressed in a different way.

Regards,
hari

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: PCIe enable device races (Was: [PATCH v3] PCI: Data corruption happening due to race condition)
  2018-08-15 21:52       ` Benjamin Herrenschmidt
  2018-08-15 23:23         ` Benjamin Herrenschmidt
  2018-08-16  7:58         ` Konstantin Khlebnikov
@ 2018-08-16 12:28         ` Lukas Wunner
  2018-08-16 23:25           ` Benjamin Herrenschmidt
  2 siblings, 1 reply; 51+ messages in thread
From: Lukas Wunner @ 2018-08-16 12:28 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: Bjorn Helgaas, Hari Vyas, linux-pci, ray.jui,
	Konstantin Khlebnikov, Jens Axboe

On Thu, Aug 16, 2018 at 07:52:06AM +1000, Benjamin Herrenschmidt wrote:
> I would also like to use this rather than the bitmap atomics for is_added
> etc... (Hari's fix) in the long run. Atomics aren't significantly cheaper
> and imho make things even messier.

PCI device enablement isn't a hotpath, so the performance saving of
acquire/release memory barriers vs. full memory barriers is hardly
a strong argument.

Atomic bitops have the advantage of being usable in atomic context,
unlike a struct mutex.  And they don't require any spinning if the
variable is accessed concurrently.

Last but not least, an atomic bitop needs just 1 line of code versus 3 lines
of code to acquire a lock, update the variable and release the lock.
"Elegance is not optional."

That said, the race *you're* dealing with appears to be distinct from
Hari's in that a lock is unavoidable because (AFAIUI) atomic_inc_return()
is called before enablement actually happens, yet both need to happen
atomically.
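
In outline (a simplified paraphrase of the current mainline flow, not the
exact code), the window looks like this:

/*
 * Simplified paraphrase: the counter goes up before the device and its
 * upstream bridge are actually enabled, so a second caller can see > 1,
 * return immediately and start touching the device while the first
 * caller is still in the middle of enabling things.
 */
static int pci_enable_device_flags(struct pci_dev *dev, unsigned long flags)
{
        struct pci_dev *bridge;

        if (atomic_inc_return(&dev->enable_cnt) > 1)
                return 0;       /* window: enable may still be in flight */

        bridge = pci_upstream_bridge(dev);
        if (bridge)
                pci_enable_bridge(bridge);      /* itself racy vs. siblings */

        /* ... actually enable I/O and memory decoding on dev ... */
        return 0;
}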

Thanks,

Lukas

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: PCIe enable device races (Was: [PATCH v3] PCI: Data corruption happening due to race condition)
  2018-08-16  8:02           ` Benjamin Herrenschmidt
  2018-08-16  9:22             ` Hari Vyas
@ 2018-08-16 19:43             ` Jens Axboe
  2018-08-16 21:37               ` Benjamin Herrenschmidt
  1 sibling, 1 reply; 51+ messages in thread
From: Jens Axboe @ 2018-08-16 19:43 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Konstantin Khlebnikov, Bjorn Helgaas
  Cc: Hari Vyas, bhelgaas, linux-pci, ray.jui

On 8/16/18 2:02 AM, Benjamin Herrenschmidt wrote:
> On Thu, 2018-08-16 at 10:58 +0300, Konstantin Khlebnikov wrote:
>> On 16.08.2018 00:52, Benjamin Herrenschmidt wrote:
>>> On Wed, 2018-08-15 at 13:50 -0500, Bjorn Helgaas wrote:
>>>> Yes, this is definitely broken.  Some folks have tried to fix it in
>>>> the past, but it hasn't quite happened yet.  We actually merged one
>>>> patch, 40f11adc7cd9 ("PCI: Avoid race while enabling upstream
>>>> bridges"), but had to revert it after we found issues:
>>>>
>>>> https://lkml.kernel.org/r/1501858648-22228-1-git-send-email-srinath.mannam@broadcom.com
>>>> https://lkml.kernel.org/r/20170915072352.10453.31977.stgit@bhelgaas-glaptop.roam.corp.google.com
>>>
>>> Ok so I had a look at this previous patch and it adds yet another use of
>>> some global mutex to protect part of the operation which makes me
>>> cringe a bit, we have too many of these.
>>>
>>> What do you think of the one I sent yesterday ? (I can't find it in the
>>> archives yet)
>>>
>>> [RFC PATCH] pci: Proof of concept at fixing pci_enable_device/bridge races
>>>
>>> The patch itself needs splitting etc... but the basic idea is to move away
>>> from those global mutexes in a number of places and have one in the pci_dev
>>> struct itself to protect its state.
>>>
>>> I would also like to use this rather than the bitmap atomics for is_added
>>> etc... (Hari's fix) in the long run. Atomics aren't significantly cheaper
>>> and imho make things even messier.
>>>
>>> Jens, Konstantin, any chance you can test if the above also breaks iwlwifi
>>> (I don't see why it would but ...)
>>>
>>
>> I suppose the original race was discovered between enabling a bridge and a device as described here
>>
>> https://lore.kernel.org/lkml/150547971091.977464.16294045866179907260.stgit@buzz/T/#u
>>
>> I can barely remember why I ever posted this, so I couldn't reproduce it for sure.
> 
> Ok. Well, my patch fixes it for my repro-case at least and seems to not
> break anything on my thinkpad so ...
> 
> Bjorn, are you ok with the approach ? If yes, I'll start breaking up
> that patch into a few smaller bits in case something goes wrong and we
> want to bisect (such as the changes I did to tracking is_busmaster
> etc...)

I can try it too, but I was never CC'ed on the actual patch.

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: PCIe enable device races (Was: [PATCH v3] PCI: Data corruption happening due to race condition)
  2018-08-16 19:43             ` Jens Axboe
@ 2018-08-16 21:37               ` Benjamin Herrenschmidt
  2018-08-16 21:56                 ` Jens Axboe
  0 siblings, 1 reply; 51+ messages in thread
From: Benjamin Herrenschmidt @ 2018-08-16 21:37 UTC (permalink / raw)
  To: Jens Axboe, Konstantin Khlebnikov, Bjorn Helgaas
  Cc: Hari Vyas, bhelgaas, linux-pci, ray.jui

On Thu, 2018-08-16 at 13:43 -0600, Jens Axboe wrote:
> 
> > Ok. Well, my patch fixes it for my repro-case at least and seems to not
> > break anything on my thinkpad so ...
> > 
> > Bjorn, are you ok with the approach ? If yes, I'll start breaking up
> > that patch into a few smaller bits in case something goes wrong and we
> > want to bisect (such as the changes I did to tracking is_busmaster
> > etc...)
> 
> I can try it too, but I was never CC'ed on the actual patch.

https://lore.kernel.org/lkml/08bc40a6af3e614e97a78fbaab688bfcd14520ac.camel@kernel.crashing.org/

(I'll also fwd privately)

Cheers,
Ben.

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: PCIe enable device races (Was: [PATCH v3] PCI: Data corruption happening due to race condition)
  2018-08-16 21:37               ` Benjamin Herrenschmidt
@ 2018-08-16 21:56                 ` Jens Axboe
  2018-08-16 23:09                   ` Benjamin Herrenschmidt
  0 siblings, 1 reply; 51+ messages in thread
From: Jens Axboe @ 2018-08-16 21:56 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Konstantin Khlebnikov, Bjorn Helgaas
  Cc: Hari Vyas, bhelgaas, linux-pci, ray.jui

On 8/16/18 3:37 PM, Benjamin Herrenschmidt wrote:
> On Thu, 2018-08-16 at 13:43 -0600, Jens Axboe wrote:
>>
>>> Ok. Well, my patch fixes it for my repro-case at least and seems to not
>>> break anything on my thinkpad so ...
>>>
>>> Bjorn, are you ok with the approach ? If yes, I'll start breaking up
>>> that patch into a few smaller bits in case something goes wrong and we
>>> want to bisect (such as the changes I did to tracking is_busmaster
>>> etc...)
>>
>> I can try it too, but I was never CC'ed on the actual patch.
> 
> https://lore.kernel.org/lkml/08bc40a6af3e614e97a78fbaab688bfcd14520ac.camel@kernel.crashing.org/
> 
> (I'll also fwd privately)

Boots and works fine for me, but I'm also now on a gen6 x1, my initial
report was on a gen4. I still have the gen4, I can power it up and see
if it works there as well.

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: PCIe enable device races (Was: [PATCH v3] PCI: Data corruption happening due to race condition)
  2018-08-16 21:56                 ` Jens Axboe
@ 2018-08-16 23:09                   ` Benjamin Herrenschmidt
  2018-08-17  0:14                     ` Jens Axboe
  0 siblings, 1 reply; 51+ messages in thread
From: Benjamin Herrenschmidt @ 2018-08-16 23:09 UTC (permalink / raw)
  To: Jens Axboe, Konstantin Khlebnikov, Bjorn Helgaas
  Cc: Hari Vyas, bhelgaas, linux-pci, ray.jui

On Thu, 2018-08-16 at 15:56 -0600, Jens Axboe wrote:
> On 8/16/18 3:37 PM, Benjamin Herrenschmidt wrote:
> > On Thu, 2018-08-16 at 13:43 -0600, Jens Axboe wrote:
> > > 
> > > > Ok. Well, my patch fixes it for my repro-case at least and seems to not
> > > > break anything on my thinkpad so ...
> > > > 
> > > > Bjorn, are you ok with the approach ? If yes, I'll start breaking up
> > > > that patch into a few smaller bits in case something goes wrong and we
> > > > want to bisect (such as the changes I did to tracking is_busmaster
> > > > etc...)
> > > 
> > > I can try it too, but I was never CC'ed on the actual patch.
> > 
> > https://lore.kernel.org/lkml/08bc40a6af3e614e97a78fbaab688bfcd14520ac.camel@kernel.crashing.org/
> > 
> > (I'll also fwd privately)
> 
> Boots and works fine for me, but I'm also now on a gen6 x1, my initial
> report was on a gen4. I still have the gen4, I can power it up and see
> if it works there as well.

Mine is a gen4 and it works fine :)

Cheers,
Ben.

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: PCIe enable device races (Was: [PATCH v3] PCI: Data corruption happening due to race condition)
  2018-08-16 10:26                 ` Lukas Wunner
  2018-08-16 10:47                   ` Hari Vyas
@ 2018-08-16 23:17                   ` Benjamin Herrenschmidt
  2018-08-17  0:43                     ` Benjamin Herrenschmidt
  1 sibling, 1 reply; 51+ messages in thread
From: Benjamin Herrenschmidt @ 2018-08-16 23:17 UTC (permalink / raw)
  To: Lukas Wunner
  Cc: Hari Vyas, Konstantin Khlebnikov, Bjorn Helgaas, Bjorn Helgaas,
	linux-pci, Ray Jui, Jens Axboe

On Thu, 2018-08-16 at 12:26 +0200, Lukas Wunner wrote:
> On Thu, Aug 16, 2018 at 08:10:28PM +1000, Benjamin Herrenschmidt wrote:
> > On Thu, 2018-08-16 at 14:52 +0530, Hari Vyas wrote:
> > > There was an issue reported by my colleague srinath while enabling pci
> > > bridge and a race condition was happening while setting memory and
> > > master bits i.e. bits were over-written.
> > > As per my understanding is_busmaster and is_added bit race issue was
> > > at internal data management and is quite different from pci bridge
> > > enabling issue.
> > > Am I missing something? Would be interested to know what exactly was
> > > affected due to is_busmaster fix.
> > 
> > The is_busmaster fix isn't I think affecting anything, however I don't
> > like the use of atomics for these things. It's a band-aid. If we grow a
> > proper pci_dev mutex, which is what I'm introducing here, it should be
> > able to also handle the is_added race etc..
> 
> What is your rationale to introduce an additional mutex instead of
> utilizing the existing mutex in struct device via device_lock() /
> device_unlock() or alternatively pci_dev_lock() / pci_dev_unlock()?
> 
> This is also what Bjorn had suggested here:
> https://lore.kernel.org/lkml/20170816134354.GV32525@bhelgaas-glaptop.roam.corp.google.com/

The device_lock() is really meant for the core device model
(drivers/base/*) internal synchronization. I'm rather wary of
extending its use to drivers.

In the specific PCI case, the obvious reason is that probe() is already
called with that held. Thus we have an immediate deadlock if we try to
take it in pci_enable_device() for example on the device itself.

We could require that pci_enable_device() is always called with the
lock already held, and only take it for the parents when walking the
bus hierarchy, but frankly, this sort of implicit locking requirement
is a rabbit hole and will lead to all sorts of interesting bugs and
extra driver auditing requirements.
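
A minimal sketch of the deadlock, purely for illustration (the two function
names below are made up; only device_lock()/device_unlock() are real):

/* the driver core holds the device lock across ->probe() ... */
static int driver_core_probe_path(struct device *dev)
{
        device_lock(dev);
        dev->driver->probe(dev);        /* driver calls pci_enable_device() */
        device_unlock(dev);
        return 0;
}

/* ... so taking it again here, on the same device, can never succeed */
static int pci_enable_device_using_device_lock(struct pci_dev *pdev)
{
        device_lock(&pdev->dev);        /* deadlock: already held above */
        /* ... enable the device ... */
        device_unlock(&pdev->dev);
        return 0;
}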

Cheers,
Ben.

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: PCIe enable device races (Was: [PATCH v3] PCI: Data corruption happening due to race condition)
  2018-08-16 10:47                   ` Hari Vyas
@ 2018-08-16 23:20                     ` Benjamin Herrenschmidt
  0 siblings, 0 replies; 51+ messages in thread
From: Benjamin Herrenschmidt @ 2018-08-16 23:20 UTC (permalink / raw)
  To: Hari Vyas, Lukas Wunner
  Cc: Konstantin Khlebnikov, Bjorn Helgaas, Bjorn Helgaas, linux-pci,
	Ray Jui, Jens Axboe

On Thu, 2018-08-16 at 16:17 +0530, Hari Vyas wrote:
> On Thu, Aug 16, 2018 at 3:56 PM, Lukas Wunner <lukas@wunner.de> wrote:
> > On Thu, Aug 16, 2018 at 08:10:28PM +1000, Benjamin Herrenschmidt wrote:
> > > On Thu, 2018-08-16 at 14:52 +0530, Hari Vyas wrote:
> > > > There was an issue reported by my colleague srinath while enabling pci
> > > > bridge and a race condition was happening while setting memory and
> > > > master bits i.e. bits were over-written.
> > > > As per my understanding is_busmaster and is_added bit race issue was
> > > > at internal data management and is quite different from pci bridge
> > > > enabling issue.
> > > > Am I missing something? Would be interested to know what exactly was
> > > > affected due to is_busmaster fix.
> > > 
> > > The is_busmaster fix isn't I think affecting anything, however I don't
> > > like the use of atomics for these things. It's a band-aid. If we grow a
> > > proper pci_dev mutex, which is what I'm introducing here, it should be
> > > able to also handle the is_added race etc..
> > 
> > What is your rationale to introduce an additional mutex instead of
> > utilizing the existing mutex in struct device via device_lock() /
> > device_unlock() or alternatively pci_dev_lock() / pci_dev_unlock()?
> > 
> > This is also what Bjorn had suggested here:
> > https://lore.kernel.org/lkml/20170816134354.GV32525@bhelgaas-glaptop.roam.corp.google.com/
> > 
> > Thanks,
> > 
> > Lukas
> 
> Agreed. My proposed simple fix for the "pci bridge enabling" issue (not
> easy to reproduce in our environment, so not tested yet, but I believe it
> should work) also uses only the existing locking mechanism.
> 
> https://bugzilla.kernel.org/show_bug.cgi?id=200793
> 
> It currently addresses only PCI_COMMAND but can easily be extended to
> other PCI config registers that have bit fields.
> Good that we are all heading in the same direction. The issue should be
> fixed, even if it ends up being addressed in a different way.

This is straight in line with your is_added fix: more way-too-fine-grained
locking that fixes the details of accessing a specific field or pair of
fields but completely ignores the higher level interactions.

I'm not fan of this approach at all.

Most of the manipulations done in all those code paths are NOT
scalability critical, and that sort of extra fine-grained locking is not
only very fragile, but wasteful.

It's like playing whack-a-mole with micro-races; the overall picture
quickly becomes a mess, and it already more or less is with all the random
global mutexes here and there.

It's a lot cleaner to have a mutex in the device itself that covers its
general state.

Ben.

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: PCIe enable device races (Was: [PATCH v3] PCI: Data corruption happening due to race condition)
  2018-08-16 12:28         ` Lukas Wunner
@ 2018-08-16 23:25           ` Benjamin Herrenschmidt
  2018-08-17  1:12             ` Benjamin Herrenschmidt
  0 siblings, 1 reply; 51+ messages in thread
From: Benjamin Herrenschmidt @ 2018-08-16 23:25 UTC (permalink / raw)
  To: Lukas Wunner
  Cc: Bjorn Helgaas, Hari Vyas, linux-pci, ray.jui,
	Konstantin Khlebnikov, Jens Axboe


On Thu, 2018-08-16 at 14:28 +0200, Lukas Wunner wrote:
> On Thu, Aug 16, 2018 at 07:52:06AM +1000, Benjamin Herrenschmidt wrote:
> > I would also like to use this rather than the bitmap atomics for is_added
> > etc... (Hari's fix) in the long run. Atomics aren't significantly cheaper
> and imho make things even messier.
> 
> PCI device enablement isn't a hotpath, so the performance saving of
> acquire/release memory barriers vs. full memory barriers is hardly
> a strong argument.

My argument is not about performance. Fine-grained atomics are a mess,
and make you lose the big picture. The problem isn't so much that
individual bits need to avoid low level RMW races, but that overall higher
level behaviours need to be synchronized, such as the enable race
above.

> Atomic bitops have the advantage of being usable in atomic context,
> unlike a struct mutex.  And they don't require any spinning if the
> variable is accessed concurrently.

And why on earth would you need to use pci_enable_device or manipulate
is_added in an atomic context ?

Atomic bitops are a giant race-o-matic and experience shows that 99% of
the time, people using them instead of standard locking are getting it
wrong.

The PCI code is already messy enough, let's not "fix" the broken
synchronization across the board in an even messier way.

> Last not least an atomic bitop needs just 1 line of code versus 3 lines
> of code to acquire a lock, update the variable and release the lock.
> "Elegance is not optional."

Sorry, atomics aren't elegance, they're broken crap. The flags don't live
in isolation. They relate to an overall function. For example,
is_busmaster relates to the actual setting of the bus master bit in the
command register. Those two need to be updated together atomically:
you need a lock or a mutex. Not an atomic.

This is true of about everything else.
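
A minimal sketch of what I mean (the state_lock field does not exist today;
it is the per-device mutex I'm proposing):

void pci_set_master_locked(struct pci_dev *dev, bool enable)
{
        u16 old_cmd, cmd;

        mutex_lock(&dev->state_lock);           /* proposed per-device mutex */
        pci_read_config_word(dev, PCI_COMMAND, &old_cmd);
        if (enable)
                cmd = old_cmd | PCI_COMMAND_MASTER;
        else
                cmd = old_cmd & ~PCI_COMMAND_MASTER;
        if (cmd != old_cmd)
                pci_write_config_word(dev, PCI_COMMAND, cmd);
        dev->is_busmaster = enable;             /* flag and HW bit stay in sync */
        mutex_unlock(&dev->state_lock);
}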

> That said, the race *you're* dealing with appears to be distinct from
> Hari's in that a lock is unavoidable because (AFAIUI) atomic_inc_return()
> is called before enablement actually happens, yet both need to happen
> atomically.

Every single time one of these flags represents more than the flag
itself, but some other state that is being modified or updated, then
you need a lock to cover both the flag and the other state.

So if we are going to create a sane locking mechanism to protect the
various bits of pci_dev (not just enable), let's use it for everything
shall we ? At least everything that doesn't need to be manipulated in
atomic context.

That also possibly include removing some of the ad-hoc locks in various
parts of pcie/* etc...

Ben.

> Thanks,
> 
> Lukas

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: PCIe enable device races (Was: [PATCH v3] PCI: Data corruption happening due to race condition)
  2018-08-16 23:09                   ` Benjamin Herrenschmidt
@ 2018-08-17  0:14                     ` Jens Axboe
  0 siblings, 0 replies; 51+ messages in thread
From: Jens Axboe @ 2018-08-17  0:14 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Konstantin Khlebnikov, Bjorn Helgaas
  Cc: Hari Vyas, bhelgaas, linux-pci, ray.jui

On 8/16/18 5:09 PM, Benjamin Herrenschmidt wrote:
> On Thu, 2018-08-16 at 15:56 -0600, Jens Axboe wrote:
>> On 8/16/18 3:37 PM, Benjamin Herrenschmidt wrote:
>>> On Thu, 2018-08-16 at 13:43 -0600, Jens Axboe wrote:
>>>>
>>>>> Ok. Well, my patch fixes it for my repro-case at least and seems to not
>>>>> break anything on my thinkpad so ...
>>>>>
>>>>> Bjorn, are you ok with the approach ? If yes, I'll start breaking up
>>>>> that patch into a few smaller bits in case something goes wrong and we
>>>>> want to bisect (such as the changes I did to tracking is_busmaster
>>>>> etc...)
>>>>
>>>> I can try it too, but I was never CC'ed on the actual patch.
>>>
>>> https://lore.kernel.org/lkml/08bc40a6af3e614e97a78fbaab688bfcd14520ac.camel@kernel.crashing.org/
>>>
>>> (I'll also fwd privately)
>>
>> Boots and works fine for me, but I'm also now on a gen6 x1, my initial
>> report was on a gen4. I still have the gen4, I can power it up and see
>> if it works there as well.
> 
> Mine is a gen4 and it works fine :)

OK good, saves me the time then :-)

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: PCIe enable device races (Was: [PATCH v3] PCI: Data corruption happening due to race condition)
  2018-08-16 23:17                   ` Benjamin Herrenschmidt
@ 2018-08-17  0:43                     ` Benjamin Herrenschmidt
  0 siblings, 0 replies; 51+ messages in thread
From: Benjamin Herrenschmidt @ 2018-08-17  0:43 UTC (permalink / raw)
  To: Lukas Wunner
  Cc: Hari Vyas, Konstantin Khlebnikov, Bjorn Helgaas, Bjorn Helgaas,
	linux-pci, Ray Jui, Jens Axboe

On Fri, 2018-08-17 at 09:17 +1000, Benjamin Herrenschmidt wrote:
> 
> > What is your rationale to introduce an additional mutex instead of
> > utilizing the existing mutex in struct device via device_lock() /
> > device_unlock() or alternatively pci_dev_lock() / pci_dev_unlock()?
> > 
> > This is also what Bjorn had suggested here:
> > https://lore.kernel.org/lkml/20170816134354.GV32525@bhelgaas-glaptop.roam.corp.google.com/
> 
> The device_lock() is really meant for the core device model
> (drivers/base/*) internal synchronization. I'm rather weary of
> extending its use to drivers.
> 
> In the specific PCI case, the obvious reason is that probe() is already
> called with that held. Thus we have an immediate deadlock if we try to
> take it in pci_enable_device() for example on the device itself.
> 
> We could require that pci_enable_device() is always called with the
> lock already held, and only take it for the parents when walking the
> bus hierarchy but frankly, this sort of implicit locking requirements
> are a rabbit hole and will lead to all sort of interesting bugs and
> extra driver auditing requirements.

In addition, there are uncertainties about whether
the device_lock of the parent is held or not.

Generally this only happens for USB but there are some cases when
the device has busy consumer links where remove() will hold it too.

That would make the walking-up-the-bus for pci_enable_bridge()
rather tricky.

That leads to crazy things like pci_reset_function() vs
pci_reset_function_locked(). Whenever we end up doing that, that's
usually a sign that we didn't think things through.

I think we would benefit greatly from the clarity of having a dedicated
mutex for pci_dev that handles all of our state management. We can
start with the enable_cnt (and as a result stop using atomics) and
is_busmaster, we can extend to is_added thus making Hari's patch
unnecessary, and then start looking at everything else.

We still need the actual device_lock for some things, like the whole
pci_bus_reset() business which we really want to exclude from
concurrent driver binding/unbinding I suppose.

We mostly get away with the lack of locking today because the bulk of
the :1 fields in there tend to only be used either at discovery time
or by driver init, so in a single-threaded manner, but as we saw
already, that assumption breaks on bridges and is generally rather
unsafe.

Cheers,
Ben.

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: PCIe enable device races (Was: [PATCH v3] PCI: Data corruption happening due to race condition)
  2018-08-16 23:25           ` Benjamin Herrenschmidt
@ 2018-08-17  1:12             ` Benjamin Herrenschmidt
  2018-08-17 16:39               ` Lukas Wunner
  0 siblings, 1 reply; 51+ messages in thread
From: Benjamin Herrenschmidt @ 2018-08-17  1:12 UTC (permalink / raw)
  To: Lukas Wunner
  Cc: Bjorn Helgaas, Hari Vyas, linux-pci, ray.jui,
	Konstantin Khlebnikov, Jens Axboe

On Fri, 2018-08-17 at 09:25 +1000, Benjamin Herrenschmidt wrote:
> > That said, the race *you're* dealing with appears to be distinct from
> > Hari's in that a lock is unavoidable because (AFAIUI) atomic_inc_return()
> > is called before enablement actually happens, yet both need to happen
> > atomically.
> 
> Every single time one of these flags represents more than the flag
> itself, but some other state that is being modified or updated, then
> you need a lock to cover both the flag and the other state.
> 
> So if we are going to create a sane locking mechanism to protect the
> various bits of pci_dev (not just enable), let's use it for everything
> shall we ? At least everything that doesn't need to be manipulated in
> atomic context.
> 
> That also possibly include removing some of the ad-hoc locks in various
> parts of pcie/* etc...

Alright, looking at those atomic flags, we have two today:

 - PCI_DEV_DISCONNECTED

Now that's a complete dup of pci_channel_state_t error_state, yuck.

Also the atomic bit is completely pointless. It only protects the
actual field from RMW access, it doesn't synchronize with any of the
users.

For example, it's being tested in the config ops wrappers (instead of
the channel state) but that's completely racy. In that specific case it
probably doesn't matter either way, but still.

It's also tested in __pci_write_msi_msg, why ? What for ? If MMIO is
blocked it's handled by the channel state. Again, you notice the
complete absence of synchronization between the producer and the
consumer of that bit.

 - PCI_DEV_ADDED

Now the only reason that was moved was to avoid the RMW races on the
bit itself. There is, here too, 0 synchronization with the callers.

Now I forgot the specific details of the race Hari found, but this is
definitely not the right way to fix things. Plus it forced powerpc to
do a relative path include which sucks.

The latter would be much more cleanly handled using the mutex I
proposed.

The former should go away, that's what error_state is already meant to
be. As for the locking, this needs to be looked at more closely since
this is inherently a racy op, though testing it in the MSI writing code
looks more like a band-aid than a feature to me. The original commit
looks like it's meant to just be some kind of optimisation. One has to
be careful however of the possible ordering issues when the bit is
cleared.

So at this stage, I don't see any reasonable justification for those
private atomic bits. They should both go away along with the
whole priv_flags.

Ben.

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [RFC PATCH] pci: Proof of concept at fixing pci_enable_device/bridge races
  2018-08-15 21:50         ` [RFC PATCH] pci: Proof of concept at fixing pci_enable_device/bridge races Benjamin Herrenschmidt
  2018-08-15 22:40           ` Guenter Roeck
@ 2018-08-17  3:07           ` Bjorn Helgaas
  2018-08-17  3:42             ` Benjamin Herrenschmidt
  1 sibling, 1 reply; 51+ messages in thread
From: Bjorn Helgaas @ 2018-08-17  3:07 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: Hari Vyas, bhelgaas, linux-pci, Ray Jui, linux-kernel,
	Srinath Mannam, Guenter Roeck, Jens Axboe, Lukas Wunner,
	Konstantin Khlebnikov, Marta Rybczynska, Pierre-Yves Kerbrat

[+cc Srinath, Guenter, Jens, Lukas, Konstantin, Marta, Pierre-Yves]

On Thu, Aug 16, 2018 at 07:50:13AM +1000, Benjamin Herrenschmidt wrote:
> [Note: This isn't meant to be merged, it needs splitting at the very
> least, see below]
> 
> This is something I cooked up quickly today to test if that would fix
> my problems with a large number of switches and NVME devices on POWER.
> 
> So far it does...
> 
> The issue (as discussed in the Re: PCIe enable device races thread) is
> that pci_enable_device and related functions along with pci_set_master
> and pci_enable_bridge are fundamentally racy.
> 
> There is no locking protecting the state of the device in pci_dev and
> if multiple devices under the same bridge try to enable it simultaneously
> some of them will potentially start accessing it before it has actually
> been enabled.
> 
> Now there are a LOT more fields in pci_dev that aren't covered by any
> form of locking.

Most of the PCI core relies on the assumption that only a single
thread touches a device at a time.  This is generally true of the core
during enumeration because enumeration is single-threaded.  It's
generally true in normal driver operation because the core shouldn't
touch a device after a driver claims it.

But there are several exceptions, and I think we need to understand
those scenarios before applying locks willy-nilly.

One big exception is that enabling device A may also touch an upstream
bridge B.  That causes things like your issue and Srinath's issue
where drivers simultaneously enabling two devices below the same
bridge corrupt the bridge's state [1].  Marta reported essentially the
same issue [2].

Hari's issue [3] seems related to a race between a driver work queue
and the core enumerating the device.  I should have pushed harder to
understand this; I feel like we papered over the immediate problem
without clearing up the larger issue of why the core enumeration path
(pci_bus_add_device()) is running at the same time as a driver.

DPC/AER error handling adds more cases where the core potentially
accesses devices asynchronously to the driver.

User-mode sysfs controls like ASPM are also asynchronous to drivers.

Even setpci is a potential issue, though I don't know how far we want
to go to protect it.  I think we should make setpci taint the kernel
anyway.

It might be nice if we had some sort of writeup of the locking
strategy as a whole.

[1] https://lkml.kernel.org/r/1501858648-22228-1-git-send-email-srinath.mannam@broadcom.com
[2] https://lkml.kernel.org/r/744877924.5841545.1521630049567.JavaMail.zimbra@kalray.eu
[3] https://lkml.kernel.org/r/1530608741-30664-2-git-send-email-hari.vyas@broadcom.com

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [RFC PATCH] pci: Proof of concept at fixing pci_enable_device/bridge races
  2018-08-17  3:07           ` Bjorn Helgaas
@ 2018-08-17  3:42             ` Benjamin Herrenschmidt
  0 siblings, 0 replies; 51+ messages in thread
From: Benjamin Herrenschmidt @ 2018-08-17  3:42 UTC (permalink / raw)
  To: Bjorn Helgaas
  Cc: Hari Vyas, bhelgaas, linux-pci, Ray Jui, linux-kernel,
	Srinath Mannam, Guenter Roeck, Jens Axboe, Lukas Wunner,
	Konstantin Khlebnikov, Marta Rybczynska, Pierre-Yves Kerbrat

On Thu, 2018-08-16 at 22:07 -0500, Bjorn Helgaas wrote:
> [+cc Srinath, Guenter, Jens, Lukas, Konstantin, Marta, Pierre-Yves]
> 
> On Thu, Aug 16, 2018 at 07:50:13AM +1000, Benjamin Herrenschmidt wrote:
> > [Note: This isn't meant to be merged, it needs splitting at the very
> > least, see below]
> > 
> > This is something I cooked up quickly today to test if that would fix
> > my problems with a large number of switches and NVME devices on POWER.
> > 
> > So far it does...
> > 
> > The issue (as discussed in the Re: PCIe enable device races thread) is
> > that pci_enable_device and related functions along with pci_set_master
> > and pci_enable_bridge are fundamentally racy.
> > 
> > There is no locking protecting the state of the device in pci_dev and
> > if multiple devices under the same bridge try to enable it simultaneously
> > some of them will potentially start accessing it before it has actually
> > been enabled.
> > 
> > Now there are a LOT more fields in pci_dev that aren't covered by any
> > form of locking.
> 
> Most of the PCI core relies on the assumption that only a single
> thread touches a device at a time.  This is generally true of the core
> during enumeration because enumeration is single-threaded.  It's
> generally true in normal driver operation because the core shouldn't
> touch a device after a driver claims it.

Mostly :-) There are a few exceptions though.

> But there are several exceptions, and I think we need to understand
> those scenarios before applying locks willy-nilly.

We need to stop creating ad-hoc locks. We have a good handle already on
the main enable/disable and bus master scenario, and the race with
is_added.

Ignore the patch itself, it has at least 2 bugs with PM, I'll send a
series improving things a bit later.

> One big exception is that enabling device A may also touch an upstream
> bridge B.  That causes things like your issue and Srinath's issue
> where drivers simultaneously enabling two devices below the same
> bridge corrupt the bridge's state [1].  Marta reported essentially the
> same issue [2].
> 
> Hari's issue [3] seems related to a race between a driver work queue
> and the core enumerating the device.  I should have pushed harder to
> understand this; I feel like we papered over the immediate problem
> without clearing up the larger issue of why the core enumeration path
> (pci_bus_add_device()) is running at the same time as a driver.

It's not. What is happening is that is_added is set by
pci_bus_add_device() after it has bound the driver. An easy fix would
have been to move it up instead:

diff --git a/drivers/pci/bus.c b/drivers/pci/bus.c
index 737d1c52f002..ff4d536d43fc 100644
--- a/drivers/pci/bus.c
+++ b/drivers/pci/bus.c
@@ -323,16 +323,16 @@ void pci_bus_add_device(struct pci_dev *dev)
        pci_proc_attach_device(dev);
        pci_bridge_d3_update(dev);
 
+       dev->is_added = 1;
        dev->match_driver = true;
        retval = device_attach(&dev->dev);
        if (retval < 0 && retval != -EPROBE_DEFER) {
+               dev->is_added = 0;
                pci_warn(dev, "device attach failed (%d)\n", retval);
                pci_proc_detach_device(dev);
                pci_remove_sysfs_dev_files(dev);
                return;
        }
-
-       dev->is_added = 1;
 }
 EXPORT_SYMBOL_GPL(pci_bus_add_device);

(Untested).

Note: another advantage of the above is that the current code has an
odd asymmetry: is_added is currently set after we attach but also
cleared after we detach.

If we want to keep the flag being set after attaching, then we do
indeed need to protect it against concurrent access to other fields.

The easiest way to do that would have been to remove the :1 as this:

-	unsigned int	is_added:1;
+	unsigned int	is_added;

Again, none of these approaches involves the invasive patch you merged,
which uses that atomic operation that provides the false sense of
security that you are somewhat "protected" while in fact you only
protect the field itself, but provide no protection for the overall
concurrency of the callers, which might clash in different ways.

Finally, we could also move is_added under the protection of the new
mutex I propose adding, but that would really only work as long as
we move all the :1 fields protected by that mutex together inside
struct pci_dev so as to avoid collisions with other fields
being modified.

All of the above are preferable to what you merged.

> DPC/AER error handling adds more cases where the core potentially
> accesses devices asynchronously to the driver.
>
> User-mode sysfs controls like ASPM are also asynchronous to drivers.
> 
> Even setpci is a potential issue, though I don't know how far we want
> to go to protect it.  I think we should make setpci taint the kernel
> anyway.

I wouldn't bother too much about it.

> It might be nice if we had some sort of writeup of the locking
> strategy as a whole.
> 
> [1] https://lkml.kernel.org/r/1501858648-22228-1-git-send-email-srinath.mannam@broadcom.com
> [2] https://lkml.kernel.org/r/744877924.5841545.1521630049567.JavaMail.zimbra@kalray.eu
> [3] https://lkml.kernel.org/r/1530608741-30664-2-git-send-email-hari.vyas@broadcom.com

Rather than not having one ? :)

This is what I'm proposing here. Let me send a patch series demonstrating
the start of this, which also fixes both of the above issues and completely
removes that rather annoying atomic priv_flags.

I would also like to get rid of the atomic enable_cnt but that will
need a bit more churn through archs and drivers.

Cheers,
Ben.

^ permalink raw reply related	[flat|nested] 51+ messages in thread

* Re: PCIe enable device races (Was: [PATCH v3] PCI: Data corruption happening due to race condition)
  2018-08-17  1:12             ` Benjamin Herrenschmidt
@ 2018-08-17 16:39               ` Lukas Wunner
  2018-08-18  3:37                 ` Benjamin Herrenschmidt
  0 siblings, 1 reply; 51+ messages in thread
From: Lukas Wunner @ 2018-08-17 16:39 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: Bjorn Helgaas, Hari Vyas, linux-pci, ray.jui,
	Konstantin Khlebnikov, Jens Axboe

On Fri, Aug 17, 2018 at 11:12:50AM +1000, Benjamin Herrenschmidt wrote:
> Alright, looking at those atomic flags, we have two today:
> 
>  - PCI_DEV_DISCONNECTED
> 
> Now that's a complete dup of pci_channel_state_t error_state, yuck.

Guess what, I did suggest to use pci_channel_state back then:

   "We've got three pci_channel_state values defined in include/linux/pci.h,
    "normal", "frozen" and "perm_failure".  Instead of adding a new
    "is_removed" bit to struct pci_dev, would it perhaps make more sense to
    just add a new type of pci_channel_state for removed devices?"
    https://spinics.net/lists/linux-pci/msg55411.html

This was Keith's answer:

   "I'd be happy if we can reuse that, but concerned about overloading
    error_state's intended purpose for AER. The conditions under which an
    'is_removed' may be set can also create AER events, and the aer driver
    overrides the error_state."
    https://spinics.net/lists/linux-pci/msg55417.html


> Also the atomic bit is completely pointless. It only protects the
> actual field from RMW access, it doesn't synchronize with any of the
> users.

Synchronizing with users?  There's nothing to synchronize with here,
once it has been determined the device is gone, the bit should be set
ASAP.  Places where this bit is checked need to be able to cope with the
device physically removed but the bit not yet set.  They should just
skip device accesses *if* the bit is set.

The bit was made atomic because Bjorn wanted to avoid RMW races:

   "This makes me slightly worried because this is a bitfield and there's
    no locking.  A concurrent write to some nearby field can corrupt
    things.  It doesn't look *likely*, but it's a lot of work to be
    convinced that this is completely safe, especially since the writer is
    running on behalf of the bridge, and the target is a child of the
    bridge."
    https://patchwork.kernel.org/patch/9402793/


> It's also tested in __pci_write_msi_msg, why ? What for ? If MMIO is
> blocked it's handled by the channel state. Again, you notice the
> complete absence of synchronization between the producer and the
> consumer of that bit.

Well, a quick git blame would have led you to commit 0170591bb067,
which contains the following rationale:

   "Check the device connected state prior to executing device shutdown
    operations or writing MSI messages so that tear down on disconnected
    devices completes quicker."
                      ^^^^^^^

>  - PCI_DEV_ADDED
> 
> Now the only reason that was moved was to avoid the RMW races on the
> bit itself. There is, here too, 0 synchronization with the callers.
> 
> Now I forgot the specific details of the race Hari found, but this is
> definitely not the right way to fix things. Plus it forced powerpc to
> do a relative path include which sucks.
> 
> The latter would be much more cleanly handled using the mutex I
> proposed.

I disagree, a mutex is not cleaner if it adds 3 LoC instead of 1
while the only point is to avoid RMW races and not achieve any kind
of synchronization.


> The former should go away, that's what error_state is already meant to
> be. As for the locking, this needs to be looked at more closely since
> this is inherently a racy op, though testing it in the MSI writing code
> looks more like a band-aid than a feature to me. The original commit
> looks like it's meant to just be some kind of optimisation. One has to
> be careful however of the possible ordering issues when the bit is
> cleared.

PCI_DEV_DISCONNECTED is never cleared.  What sense would that make?

Thanks,

Lukas

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: PCIe enable device races (Was: [PATCH v3] PCI: Data corruption happening due to race condition)
  2018-08-17 16:39               ` Lukas Wunner
@ 2018-08-18  3:37                 ` Benjamin Herrenschmidt
  2018-08-18  9:22                   ` Lukas Wunner
  0 siblings, 1 reply; 51+ messages in thread
From: Benjamin Herrenschmidt @ 2018-08-18  3:37 UTC (permalink / raw)
  To: Lukas Wunner
  Cc: Bjorn Helgaas, Hari Vyas, linux-pci, ray.jui,
	Konstantin Khlebnikov, Jens Axboe

On Fri, 2018-08-17 at 18:39 +0200, Lukas Wunner wrote:
> On Fri, Aug 17, 2018 at 11:12:50AM +1000, Benjamin Herrenschmidt wrote:
> > Alright, looking at those atomic flags, we have two today:
> > 
> >  - PCI_DEV_DISCONNECTED
> > 
> > Now that's a complete dup of pci_channel_state_t error_state, yuck.
> 
> Guess what, I did suggest to use pci_channel_state back then:

And you were right :-)

>    "We've got three pci_channel_state values defined in include/linux/pci.h,
>     "normal", "frozen" and "perm_failure".  Instead of adding a new
>     "is_removed" bit to struct pci_dev, would it perhaps make more sense to
>     just add a new type of pci_channel_state for removed devices?"
>     https://spinics.net/lists/linux-pci/msg55411.html

So I initially added a value for disconnected, then noticed a bunch of
drivers have switch/cases around the error_state value, and decided to
just make disconnected an alias for permanent failure for now; we can do
driver auditing/cleanup later.

As for Keith:

> This was Keith's answer:
> 
>    "I'd be happy if we can reuse that, but concerned about overloading
>     error_state's intended purpose for AER. The conditions under which an
>     'is_removed' may be set can also create AER events, and the aer driver
>     overrides the error_state."
>     https://spinics.net/lists/linux-pci/msg55417.html

Well, rather than adding another field that means something somewhat
similar, I would just address his concern (it's not just AER, it's also
the powerpc EEH code, which once we turn it into something actually
readable (WIP...) should probably largely migrate to drivers/pci...)

But I'm also looking at issues with AER at the moment with another
crowd and I think we can sort this all out.

Funnily enough, it mgiht actually be one of those cases where we *do*
want an atomic. By making error_state an atomic, we can enforce valid
transitions, and thus simply make the transition from "disconnected" to
anything else impossible while dealing with it changing at interrupt
time (which can happen with EEH).

As-is, what you have is a bit that is private to drivers/pci (why ?
drivers might be interested in knowing the device has been
disconnected...) and somewhat duplicates the purpose of an existing
field so we'll end up with bits that test one, bits that test the
other, or both, and a lot of confusion.

Fundamentally both mean, from a driver perspective, two things.

 - One very important: break out of a loop that waits for a HW state to
change because it won't

 - One an optimisation: don't bother with all those register updates
because they're never going to reach your HW.

So let's make it a single field. I'm happy to rename "error_state" to
something more generic such as "channel_state" to reflect that it's not
all errors (is disconnect an error ? debatable...) and we can work on
making it atomic, adding an enum member etc... if we wish to do so, but
let's not introduce yet another field.

> > Also the atomic bit is completely pointless. It only protects the
> > actual field from RMW access, it doesn't synchronize with any of the
> > users.
> 
> Synchronizing with users?  There's nothing to synchronize with here,
> once it has been determined the device is gone, the bit should be set
> ASAP.
>
>   Places where this bit is checked need to be able to cope with the
> device physically removed but the bit not yet set.  They should just
> skip device accesses *if* the bit is set.

This is true of the current 2 or 3 places where you check it, to *some*
extent, because at the moment it's just a "hint". These things do have
a tendency to grow beyond their original intent though.

> The bit was made atomic because Bjorn wanted to avoid RMW races:
> 
>    "This makes me slightly worried because this is a bitfield and there's
>     no locking.  A concurrent write to some nearby field can corrupt
>     things.  It doesn't look *likely*, but it's a lot of work to be
>     convinced that this is completely safe, especially since the writer is
>     running on behalf of the bridge, and the target is a child of the
>     bridge."
>     https://patchwork.kernel.org/patch/9402793/

Then don't make it a bitfield, rather than adding some atomics; they are
really pointless and encourage unsafe practices (even if this precise
one might actually be ok).
> 
> > It's also tested in __pci_write_msi_msg, why ? What for ? If MMIO is
> > blocked it's handled by the channel state. Again, you notice the
> > complete absence of synchronization between the producer and the
> > consumer of that bit.
> 
> Well, a quick git blame would have led you to commit 0170591bb067,
> which contains the following rationale:
> 
>    "Check the device connected state prior to executing device shutdown
>     operations or writing MSI messages so that tear down on disconnected
>     devices completes quicker."
>                       ^^^^^^^

Ok so just an optimisation, nothing terribly important.

> >  - PCI_DEV_ADDED
> > 
> > Now the only reason that was moved was to avoid the RMW races on the
> > bit itself. There is, here too, 0 synchronization with the callers.
> > 
> > Now I forgot the specific details of the race Hari found, but this is
> > definitely not the right way to fix things. Plus it forced powerpc to
> > do a relative path include which sucks.
> > 
> > The latter would be much more cleanly handled using the mutex I
> > proposed.
> 
> I disagree, a mutex is not cleaner if it adds 3 LoC instead of 1
> while the only point is to avoid RMW races and not achieve any kind
> of synchronization.

No this is not the only point, is_added means more than that, and in
fact my argument (see the other emails) is that the root of the problem
was elsewhere. Here, "fixing" the RMW race with an atomic papers over a
deeper problem: this field was being set in the wrong place to
begin with.
> 
> > The former should go away, that's what error_state is already meant to
> > be. As for the locking, this needs to be looked at more closely since
> > this is inherently a racy op, though testing it in the MSI writing code
> > looks more like a band-aid than a feature to me. The original commit
> > looks like it's meant to just be some kind of optimisation. One has to
> > be careful however of the possible ordering issues when the bit is
> > cleared.
> 
> PCI_DEV_DISCONNECTED is never cleared.  What sense would that make?

As long as we never "reconnect" without a re-probe, that's ok. That
said, see above why I still think it's the wrong thing to do.

Cheers,
Ben.

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: PCIe enable device races (Was: [PATCH v3] PCI: Data corruption happening due to race condition)
  2018-08-18  3:37                 ` Benjamin Herrenschmidt
@ 2018-08-18  9:22                   ` Lukas Wunner
  2018-08-18 13:11                     ` Benjamin Herrenschmidt
  0 siblings, 1 reply; 51+ messages in thread
From: Lukas Wunner @ 2018-08-18  9:22 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: Bjorn Helgaas, Hari Vyas, linux-pci, ray.jui,
	Konstantin Khlebnikov, Jens Axboe, mr.nuke.me, keith.busch

On Sat, Aug 18, 2018 at 01:37:35PM +1000, Benjamin Herrenschmidt wrote:
> As-is, what you have is a bit that is private to drivers/pci (why ?
> drivers might be interested in knowing the device has been
> disconnected...)

It is private to drivers/pci because Greg wanted it so:

   "> The flag is not intended for a device specific driver to use.
    > The intention is for the pci bus driver to use it, so no additional
    > work for other drivers.
    But the driver can see this in the structure, so it will get used..."
    https://spinics.net/lists/linux-pci/msg57744.html

   "I applaud your attempt to make this "private" to the pci core, just
    don't know if it really will work, or if it is worth it entirely..."
    https://spinics.net/lists/linux-pci/msg58017.html

Greg is of the opinion that drivers should check for themselves whether
a device has been removed and he was happy that they are barred from
using PCI_DEV_DISCONNECTED.  He believes that drivers should verify
for every read of mmio and config space that the read value is not
0xffffffff (if that is an invalid value) and consider the device removed
if so:

   "If you are worried about your device going away (and you have to),
    then just check all reads and be fine with it.  If you have values
    that can be all 0xff, then just accept that as a valid value and
    move to the next read where it can't be valid."
    https://spinics.net/lists/linux-pci/msg70793.html

However Alex Gagniuc recently countered:

   "The discussion is based on the wrong assumptions that reads are 
    symmetrical wrt to a device being present or not. However, completion 
    timeouts and unsupported requests blow that assumption right out of the 
    water. Best case scenario, you just waste a little more time waiting for 
    hardware IO. More common is to end up crashing the machine.

    Greg's ideas work in a perfect world where PCI and PCIe are equivalent 
    at every level. And in that case, PCI_DEV_DISCONNECTED would have been 
    pure, 100% genuine Redmond bloatware. We wouldn't have gotten complaints 
    from Facebook and other industry players that it takes too damn long to 
    remove a device. We probably also wouldn't be seeing machines crash on 
    PCIe removal."
    https://spinics.net/lists/linux-pci/msg74864.html

The reason I'm interested in PCI_DEV_DISCONNECTED is because hot-removing
an Apple Thunderbolt Ethernet adapter locks up the machine due to the tg3
driver trying to access the removed device.

Now, tg3 does call pci_channel_offline() and refrains from accessing the
device if that returns true.  If I make PCI_DEV_DISCONNECTED public and
check its value in pci_channel_offline(), I can hot-remove the Thunderbolt
Ethernet adapter without problems.  I posted a patch for that 2 years ago:
https://spinics.net/lists/linux-pci/msg55601.html
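
Roughly, the idea is this (a rough sketch, not necessarily the exact 2016
patch; pci_dev_is_disconnected() is the drivers/pci-private helper that
tests PCI_DEV_DISCONNECTED, and exposing it like this is precisely the
point of contention):

static inline bool pci_channel_offline(struct pci_dev *pdev)
{
        return (pdev->error_state != pci_channel_io_normal) ||
               pci_dev_is_disconnected(pdev);
}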

Thus, I'd be more than happy if the PCI_DEV_DISCONNECTED state could be
folded into enum pci_channel_state as it would immediately fix Thunderbolt
hot-removal.


> Fundamentally both mean, from a driver perspective, two things.
> 
>  - One very important: break out of a loop that waits for a HW state to
> change because it won't
> 
>  - One an optimisation: don't bother with all those register updates
> because they're never going to reach your HW.

Right.  PCI_DEV_DISCONNECTED was introduced by Intel on behalf of
Facebook.  See slides 13 to 16 of this slide deck for the details:
http://files.opencompute.org/oc/public.php?service=files&t=4ff50715e3c1e273e02b694757b80d25&download

There's a graph on slide 16 showing the speedup achieved by
PCI_DEV_DISCONNECTED.

There's also a recording of that talk, the relevant segment is just
10 minutes:
https://youtu.be/GJ6B0xzgvlM?t=926

Thanks,

Lukas

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: PCIe enable device races (Was: [PATCH v3] PCI: Data corruption happening due to race condition)
  2018-08-18  9:22                   ` Lukas Wunner
@ 2018-08-18 13:11                     ` Benjamin Herrenschmidt
  0 siblings, 0 replies; 51+ messages in thread
From: Benjamin Herrenschmidt @ 2018-08-18 13:11 UTC (permalink / raw)
  To: Lukas Wunner
  Cc: Bjorn Helgaas, Hari Vyas, linux-pci, ray.jui,
	Konstantin Khlebnikov, Jens Axboe, mr.nuke.me, keith.busch

On Sat, 2018-08-18 at 11:22 +0200, Lukas Wunner wrote:

> Greg is of the opinion that drivers should check for themselves whether
> a device has been removed and he was happy that they are barred from
> using PCI_DEV_DISCONNECTED.  He believes that drivers should verify
> for every read of mmio and config space that that the read value is not
> 0xffffffff (if that is an invalid value) and consider the device removed
> if so:

Well, this is not quite right.. but close :-)

There can be valid cases of ffffffff's ... that said, this is exactly
what error_state is about: it allows differentiating a valid ffffffff
from an error state.

On POWER with EEH, every read{b,w,l,q} will check for an all 1's result
and call the EEH core to check the freeze state in the host bridge &
update the channel state.

That does mean that a legitimate all 1's read will be much slower but
thankfully they are pretty rare.
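
Illustrative sketch of the pattern (not the actual powerpc implementation;
the wrapper name is made up and the eeh_check_failure() calling convention
is simplified here):

static u32 readl_eeh_checked(void __iomem *addr)
{
        u32 val = readl(addr);

        if (val == ~0U)
                eeh_check_failure(addr);        /* may mark the channel frozen */

        return val;
}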

>    "If you are worried about your device going away (and you have to),
>     then just check all reads and be fine with it.  If you have values
>     that can be all 0xff, then just accept that as a valid value and
>     move to the next read where it can't be valid."
>     https://spinics.net/lists/linux-pci/msg70793.html
> 
> However Alex Gagniuc recently countered:
> 
>    "The discussion is based on the wrong assumptions that reads are 
>     symmetrical wrt to a device being present or not. However, completion 
>     timeouts and unsupported requests blow that assumption right out of the 
>     water. Best case scenario, you just waste a little more time waiting for 
>     hardware IO. More common is to end up crashing the machine.
> 
>     Greg's ideas work in a perfect world where PCI and PCIe are equivalent 
>     at every level. And in that case, PCI_DEV_DISCONNECTED would have been 
>     pure, 100% genuine Redmond bloatware. We wouldn't have gotten complaints 
>     from Facebook and other industry players that it takes too damn long to 
>     remove a device. We probably also wouldn't be seeing machines crash on 
>     PCIe removal."
>     https://spinics.net/lists/linux-pci/msg74864.html

Yes, reality is slightly more complicated ;-) Alex is absolutely
correct. Again, this is what error_state (aka channel state) is
supposed to convey, and is meant to allow disambiguation here.

This is why I think that's what we should be using.

> The reason I'm interested in PCI_DEV_DISCONNECTED is because hot-removing
> an Apple Thunderbolt Ethernet adapter locks up the machine due to the tg3
> driver trying to access the removed device.

TG3 is precisely one of the original culprits we "Fixed" by introducing
the channel state back in the day iirc :-)

There is no difference from a driver perspective between a device being
disconnected, yanked out (think express cards... thunderbolt isn't
bringing anything new here, even good old cardbus...), or in an EEH
frozen state which is what our error handling hardware does on POWER
(blocks writes and returns all 1's on reads on the first error from a
device to prevent propagation of bad data).

The only difference drivers might care about is when it comes to
recovering. Some of the error cases provide recovery options, a pure
disconnect doesn't, but that has no impact on all those various pieces
of wait loops etc.. that need to break out.

> Now, tg3 does call pci_channel_offline() and refrains from accessing the
> device if that returns true.  If I make PCI_DEV_DISCONNECTED public and
> check its value in pci_channel_offline(), I can hot-remove the Thunderbolt
> Ethernet adapter without problems.  I posted a patch for that 2 years ago:
> https://spinics.net/lists/linux-pci/msg55601.html

Yes that's absolutely the right thing to do if you really can't just
use the existing error_state as your "disconnected" state, but I would
prefer we don't break that up into two pieces of state, and instead
reconcile it.

> Thus, I'd be more than happy if the PCI_DEV_DISCONNECTED state could be
> folded into enum pci_channel_state as it would immediately fix Thunderbolt
> hot-removal.

Yes I think that's the way to go.

If we want to be extra safe, what we could do is make the channel state
an atomic so that it's updated by doing cmpxchg with the rules in the
"setter" function enforcing that it cannot ever change back from a
disconnected state.

In this case the atomicity is necessary because at least EEH will
update it potentially from any read{b,w,l,q} and thus at interrupt time
(AER isn't as harsh though).
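
Something along these lines (purely a sketch; it assumes the channel state
becomes an atomic_t, which it isn't today, and the helper name is made up):

static bool pci_channel_state_transition(struct pci_dev *dev,
                                         enum pci_channel_state new)
{
        atomic_t *state = &dev->channel_state;  /* hypothetical atomic_t field */
        int old = atomic_read(state);

        do {
                /* once disconnected / permanently failed, never go back */
                if (old == pci_channel_io_perm_failure)
                        return false;
        } while (!atomic_try_cmpxchg(state, &old, new));

        return true;
}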

> > Fundamentally both mean, from a driver perspective, two things.
> > 
> >  - One very important: break out of a loop that waits for a HW state to
> > change because it won't
> > 
> >  - One an optimisation: don't bother with all those register updates
> > because they're never going to reach your HW.
> 
> Right.  PCI_DEV_DISCONNECTED was introduced by Intel on behalf of
> Facebook.  See slides 13 to 16 of this slide deck for the details:
> http://files.opencompute.org/oc/public.php?service=files&t=4ff50715e3c1e273e02b694757b80d25&download
> 
> There's a graph on slide 16 showing the speedup achieved by
> PCI_DEV_DISCONNECTED.
> 
> There's also a recording of that talk, the relevant segment is just
> 10 minutes:
> https://youtu.be/GJ6B0xzgvlM?t=926

Ok thanks. I don't know if I'll have time to review all of that
material, but I suspect we can agree that making it all a single piece
of information is preferable.

I need to spend a bit more time auditing the users next week to find a
way to make the conversion smooth without having to patch bazillions of
drivers, but I really think that's the way to go.

Cheers,
Ben.

> 
> Thanks,
> 
> Lukas

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [RFC PATCH] pci: Proof of concept at fixing pci_enable_device/bridge races
  2018-08-15 23:38             ` Benjamin Herrenschmidt
@ 2018-08-20  1:31               ` Guenter Roeck
  0 siblings, 0 replies; 51+ messages in thread
From: Guenter Roeck @ 2018-08-20  1:31 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: Bjorn Helgaas, Hari Vyas, bhelgaas, linux-pci, ray.jui, linux-kernel

On Thu, Aug 16, 2018 at 09:38:41AM +1000, Benjamin Herrenschmidt wrote:
> On Wed, 2018-08-15 at 15:40 -0700, Guenter Roeck wrote:
> > On Thu, Aug 16, 2018 at 07:50:13AM +1000, Benjamin Herrenschmidt wrote:
> > > (Resent with lkml on copy)
> > > 
> > > [Note: This isn't meant to be merged, it needs splitting at the very
> > > least, see below]
> > > 
> > > This is something I cooked up quickly today to test if that would fix
> > > my problems with a large number of switches and NVME devices on POWER.
> > > 
> > 
> > Is that a problem that can be reproduced with a qemu setup ?
> 
> With difficulty... mt-tcg might help, but you need a rather large
> system to reproduce it.
> 
> My repro-case is a 2 socket POWER9 system (about 40 cores off the top
> of my head, so 160 threads) with 72 NVME devices underneath a tree of
> switches (I don't have the system at hand today to check how many).
> 
> It's possible to observe it I suppose on a smaller system (in theory a
> single bridge with 2 devices is enough) but in practice the timing is
> extremely hard to hit.
> 
> You need a combination of:
> 
>   - The bridges come up disabled (which is the case when Linux does the
> resource assignment, such as on POWER but not on x86 unless it's
> hotplug)
> 
>   - The nvme devices try to enable them simultaneously
> 
> Also, the resulting error is a UR; I don't know how well qemu models
> that.
> 
Not well enough, apparently. I tried for a while, registering as many
nvme drives as the system would take, but I was not able to reproduce
the problem with qemu. It was worth a try, though.

Guenter

^ permalink raw reply	[flat|nested] 51+ messages in thread

end of thread, other threads:[~2018-08-20  1:31 UTC | newest]

Thread overview: 51+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-07-03  9:05 [PATCH v3] PCI: Data corruption happening due to race condition Hari Vyas
2018-07-03  9:05 ` Hari Vyas
2018-07-03  9:13   ` Lukas Wunner
2018-07-18 23:29   ` Bjorn Helgaas
2018-07-19  4:18     ` Benjamin Herrenschmidt
2018-07-19 14:04       ` Hari Vyas
2018-07-19 18:55         ` Lukas Wunner
2018-07-20  4:27           ` Benjamin Herrenschmidt
2018-07-27 22:25       ` Bjorn Helgaas
2018-07-28  0:45         ` Benjamin Herrenschmidt
2018-07-31 11:21         ` Michael Ellerman
2018-07-19 17:41   ` Bjorn Helgaas
2018-07-20  9:16     ` Hari Vyas
2018-07-20 12:20       ` Bjorn Helgaas
2018-07-31 16:37 ` Bjorn Helgaas
2018-08-15  3:35   ` PCIe enable device races (Was: [PATCH v3] PCI: Data corruption happening due to race condition) Benjamin Herrenschmidt
2018-08-15  4:16     ` Benjamin Herrenschmidt
2018-08-15  4:44       ` Benjamin Herrenschmidt
2018-08-15  5:21         ` [RFC PATCH] pci: Proof of concept at fixing pci_enable_device/bridge races Benjamin Herrenschmidt
2018-08-15 19:09         ` PCIe enable device races (Was: [PATCH v3] PCI: Data corruption happening due to race condition) Bjorn Helgaas
2018-08-15 21:50         ` [RFC PATCH] pci: Proof of concept at fixing pci_enable_device/bridge races Benjamin Herrenschmidt
2018-08-15 22:40           ` Guenter Roeck
2018-08-15 23:38             ` Benjamin Herrenschmidt
2018-08-20  1:31               ` Guenter Roeck
2018-08-17  3:07           ` Bjorn Helgaas
2018-08-17  3:42             ` Benjamin Herrenschmidt
2018-08-15 18:50     ` PCIe enable device races (Was: [PATCH v3] PCI: Data corruption happening due to race condition) Bjorn Helgaas
2018-08-15 21:52       ` Benjamin Herrenschmidt
2018-08-15 23:23         ` Benjamin Herrenschmidt
2018-08-16  7:58         ` Konstantin Khlebnikov
2018-08-16  8:02           ` Benjamin Herrenschmidt
2018-08-16  9:22             ` Hari Vyas
2018-08-16 10:10               ` Benjamin Herrenschmidt
2018-08-16 10:11                 ` Benjamin Herrenschmidt
2018-08-16 10:26                 ` Lukas Wunner
2018-08-16 10:47                   ` Hari Vyas
2018-08-16 23:20                     ` Benjamin Herrenschmidt
2018-08-16 23:17                   ` Benjamin Herrenschmidt
2018-08-17  0:43                     ` Benjamin Herrenschmidt
2018-08-16 19:43             ` Jens Axboe
2018-08-16 21:37               ` Benjamin Herrenschmidt
2018-08-16 21:56                 ` Jens Axboe
2018-08-16 23:09                   ` Benjamin Herrenschmidt
2018-08-17  0:14                     ` Jens Axboe
2018-08-16 12:28         ` Lukas Wunner
2018-08-16 23:25           ` Benjamin Herrenschmidt
2018-08-17  1:12             ` Benjamin Herrenschmidt
2018-08-17 16:39               ` Lukas Wunner
2018-08-18  3:37                 ` Benjamin Herrenschmidt
2018-08-18  9:22                   ` Lukas Wunner
2018-08-18 13:11                     ` Benjamin Herrenschmidt
