* Re: [PATCH RFC v4 02/21] PCI: Fix race condition in pci_enable/disable_device()
       [not found] ` <20190311133122.11417-3-s.miroshnichenko@yadro.com>
@ 2019-03-26 19:00   ` Bjorn Helgaas
  2019-03-27 17:11     ` Sergey Miroshnichenko
  0 siblings, 1 reply; 4+ messages in thread
From: Bjorn Helgaas @ 2019-03-26 19:00 UTC (permalink / raw)
  To: Sergey Miroshnichenko
  Cc: linux-pci, linuxppc-dev, linux, Srinath Mannam, Marta Rybczynska,
	linux-kernel

[+cc Srinath, Marta, LKML]

On Mon, Mar 11, 2019 at 04:31:03PM +0300, Sergey Miroshnichenko wrote:
>  CPU0                                      CPU1
> 
>  pci_enable_device_mem()                   pci_enable_device_mem()
>    pci_enable_bridge()                       pci_enable_bridge()
>      pci_is_enabled()
>        return false;
>      atomic_inc_return(enable_cnt)
>      Start actually enabling the bridge
>      ...                                       pci_is_enabled()
>      ...                                         return true;
>      ...                                   Start memory requests <-- FAIL
>      ...
>      Set the PCI_COMMAND_MEMORY bit <-- Must wait for this
> 
> This patch protects pci_enable/disable_device() and pci_enable_bridge()
> with a per-device mutex.

This is a subtle issue that we've tried to fix before, but we've never
had a satisfactory solution, so I hope you've figured out the right
fix.
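
For illustration, here is a minimal userspace analogue of the window
described above (a sketch only, not kernel code; the pthread harness and
all names are made up).  The first caller bumps the counter before the
slow setup finishes, so the second caller sees the bridge as enabled
while memory decoding is still off:

  #include <pthread.h>
  #include <stdatomic.h>
  #include <stdio.h>
  #include <unistd.h>

  static atomic_int enable_cnt;
  static atomic_int cmd_memory;	/* stands in for PCI_COMMAND_MEMORY */

  static void *enable_bridge(void *arg)
  {
  	if (atomic_fetch_add(&enable_cnt, 1) == 0) {
  		usleep(1000);			/* "actual enabling" in progress */
  		atomic_store(&cmd_memory, 1);	/* bit set only at the end */
  	} else if (!atomic_load(&cmd_memory)) {
  		printf("CPU%ld: looks enabled, but memory decoding is still off\n",
  		       (long)arg);
  	}
  	return NULL;
  }

  int main(void)
  {
  	pthread_t t0, t1;

  	pthread_create(&t0, NULL, enable_bridge, (void *)0L);
  	pthread_create(&t1, NULL, enable_bridge, (void *)1L);
  	pthread_join(t0, NULL);
  	pthread_join(t1, NULL);
  	return 0;
  }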

I'll include some links to previous discussion.  This patch is very
similar to [2], which we didn't actually apply.  We did apply the
patch from [3] as 40f11adc7cd9 ("PCI: Avoid race while enabling
upstream bridges"), but it caused the regressions reported in [4,5],
so we reverted it with 0f50a49e3008 ("Revert "PCI: Avoid race while
enabling upstream bridges"").

I think the underlying design problem is that we have a driver for
device B calling pci_enable_device(), and it is changing the state of
device A (an upstream bridge).  The model generally is that a driver
should only touch the device it is bound to.

It's tricky to get the locking right when several children of device A
all need to operate on A.
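
For reference, the locking pattern the patch uses amounts to roughly the
following (a self-contained sketch with hypothetical types and names, not
the kernel code).  Locks are acquired leaf-to-root, so every caller takes
them in the same upstream order and cannot deadlock, and the "already
enabled" check happens under the device's own lock:

  #include <pthread.h>

  struct dev_node {
  	struct dev_node *parent;
  	pthread_mutex_t enable_mutex;
  	int enabled;			/* set only after setup completes */
  };

  static void hw_enable(struct dev_node *d)
  {
  	(void)d;			/* set command/memory bits here */
  }

  static void enable_upstream(struct dev_node *d)
  {
  	pthread_mutex_lock(&d->enable_mutex);

  	if (d->parent)
  		enable_upstream(d->parent);	/* lock order: leaf -> root */

  	if (!d->enabled) {
  		hw_enable(d);		/* slow work done under the lock */
  		d->enabled = 1;		/* published only when complete */
  	}

  	pthread_mutex_unlock(&d->enable_mutex);
  }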

That's all to say I'll have to think carefully about this particular
patch, so I'll go on to the others and come back to this one.

Bjorn

[1] https://lore.kernel.org/linux-pci/1494256190-28993-1-git-send-email-srinath.mannam@broadcom.com/T/#u
    [RFC PATCH] pci: Concurrency issue in NVMe Init through PCIe switch

[2] https://lore.kernel.org/linux-pci/1496135297-19680-1-git-send-email-srinath.mannam@broadcom.com/T/#u
    [RFC PATCH v2] pci: Concurrency issue in NVMe Init through PCIe switch

[3] https://lore.kernel.org/linux-pci/1501858648-22228-1-git-send-email-srinath.mannam@broadcom.com/T/#u
    [RFC PATCH v3] pci: Concurrency issue during pci enable bridge

[4] https://lore.kernel.org/linux-pci/150547971091.977464.16294045866179907260.stgit@buzz/T/#u
    [PATCH bisected regression in 4.14] PCI: fix race while enabling upstream bridges concurrently

[5] https://lore.kernel.org/linux-wireless/04c9b578-693c-1dc6-9f0f-904580231b21@kernel.dk/T/#u
    iwlwifi firmware load broken in current -git

[6] https://lore.kernel.org/linux-pci/744877924.5841545.1521630049567.JavaMail.zimbra@kalray.eu/T/#u
    [RFC PATCH] nvme: avoid race-conditions when enabling devices

> Signed-off-by: Sergey Miroshnichenko <s.miroshnichenko@yadro.com>
> ---
>  drivers/pci/pci.c   | 26 ++++++++++++++++++++++----
>  drivers/pci/probe.c |  1 +
>  include/linux/pci.h |  1 +
>  3 files changed, 24 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
> index f006068be209..895201d4c9e6 100644
> --- a/drivers/pci/pci.c
> +++ b/drivers/pci/pci.c
> @@ -1615,6 +1615,8 @@ static void pci_enable_bridge(struct pci_dev *dev)
>  	struct pci_dev *bridge;
>  	int retval;
>  
> +	mutex_lock(&dev->enable_mutex);
> +
>  	bridge = pci_upstream_bridge(dev);
>  	if (bridge)
>  		pci_enable_bridge(bridge);
> @@ -1622,6 +1624,7 @@ static void pci_enable_bridge(struct pci_dev *dev)
>  	if (pci_is_enabled(dev)) {
>  		if (!dev->is_busmaster)
>  			pci_set_master(dev);
> +		mutex_unlock(&dev->enable_mutex);
>  		return;
>  	}
>  
> @@ -1630,11 +1633,14 @@ static void pci_enable_bridge(struct pci_dev *dev)
>  		pci_err(dev, "Error enabling bridge (%d), continuing\n",
>  			retval);
>  	pci_set_master(dev);
> +	mutex_unlock(&dev->enable_mutex);
>  }
>  
>  static int pci_enable_device_flags(struct pci_dev *dev, unsigned long flags)
>  {
>  	struct pci_dev *bridge;
> +	/* Enable-locking of bridges is performed within the pci_enable_bridge() */
> +	bool need_lock = !dev->subordinate;
>  	int err;
>  	int i, bars = 0;
>  
> @@ -1650,8 +1656,13 @@ static int pci_enable_device_flags(struct pci_dev *dev, unsigned long flags)
>  		dev->current_state = (pmcsr & PCI_PM_CTRL_STATE_MASK);
>  	}
>  
> -	if (atomic_inc_return(&dev->enable_cnt) > 1)
> +	if (need_lock)
> +		mutex_lock(&dev->enable_mutex);
> +	if (pci_is_enabled(dev)) {
> +		if (need_lock)
> +			mutex_unlock(&dev->enable_mutex);
>  		return 0;		/* already enabled */
> +	}
>  
>  	bridge = pci_upstream_bridge(dev);
>  	if (bridge)
> @@ -1666,8 +1677,10 @@ static int pci_enable_device_flags(struct pci_dev *dev, unsigned long flags)
>  			bars |= (1 << i);
>  
>  	err = do_pci_enable_device(dev, bars);
> -	if (err < 0)
> -		atomic_dec(&dev->enable_cnt);
> +	if (err >= 0)
> +		atomic_inc(&dev->enable_cnt);
> +	if (need_lock)
> +		mutex_unlock(&dev->enable_mutex);
>  	return err;
>  }
>  
> @@ -1910,15 +1923,20 @@ void pci_disable_device(struct pci_dev *dev)
>  	if (dr)
>  		dr->enabled = 0;
>  
> +	mutex_lock(&dev->enable_mutex);
>  	dev_WARN_ONCE(&dev->dev, atomic_read(&dev->enable_cnt) <= 0,
>  		      "disabling already-disabled device");
>  
> -	if (atomic_dec_return(&dev->enable_cnt) != 0)
> +	if (atomic_dec_return(&dev->enable_cnt) != 0) {
> +		mutex_unlock(&dev->enable_mutex);
>  		return;
> +	}
>  
>  	do_pci_disable_device(dev);
>  
>  	dev->is_busmaster = 0;
> +
> +	mutex_unlock(&dev->enable_mutex);
>  }
>  EXPORT_SYMBOL(pci_disable_device);
>  
> diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
> index 2ec0df04e0dc..977a127ce791 100644
> --- a/drivers/pci/probe.c
> +++ b/drivers/pci/probe.c
> @@ -2267,6 +2267,7 @@ struct pci_dev *pci_alloc_dev(struct pci_bus *bus)
>  	INIT_LIST_HEAD(&dev->bus_list);
>  	dev->dev.type = &pci_dev_type;
>  	dev->bus = pci_bus_get(bus);
> +	mutex_init(&dev->enable_mutex);
>  
>  	return dev;
>  }
> diff --git a/include/linux/pci.h b/include/linux/pci.h
> index 77448215ef5b..cb2760a31fe2 100644
> --- a/include/linux/pci.h
> +++ b/include/linux/pci.h
> @@ -419,6 +419,7 @@ struct pci_dev {
>  	unsigned int	no_vf_scan:1;		/* Don't scan for VFs after IOV enablement */
>  	pci_dev_flags_t dev_flags;
>  	atomic_t	enable_cnt;	/* pci_enable_device has been called */
> +	struct mutex	enable_mutex;
>  
>  	u32		saved_config_space[16]; /* Config space saved at suspend time */
>  	struct hlist_head saved_cap_space;
> -- 
> 2.20.1
> 

* Re: [PATCH RFC v4 08/21] nvme-pci: Handle movable BARs
       [not found] ` <20190311133122.11417-9-s.miroshnichenko@yadro.com>
@ 2019-03-26 20:20   ` Bjorn Helgaas
  2019-03-27 17:30     ` Sergey Miroshnichenko
  0 siblings, 1 reply; 4+ messages in thread
From: Bjorn Helgaas @ 2019-03-26 20:20 UTC (permalink / raw)
  To: Sergey Miroshnichenko
  Cc: linux-pci, linuxppc-dev, linux, Keith Busch, Jens Axboe,
	Christoph Hellwig, Sagi Grimberg, linux-nvme, linux-kernel

[+cc Keith, Jens, Christoph, Sagi, linux-nvme, LKML]

On Mon, Mar 11, 2019 at 04:31:09PM +0300, Sergey Miroshnichenko wrote:
> Hotplugged devices can affect the existing ones by moving their BARs.
> The PCI subsystem will inform the NVMe driver about this by invoking
> reset_prepare()+reset_done(), after which iounmap()+ioremap() must be called.

Do you mean the PCI core will invoke ->rescan_prepare() and
->rescan_done() (as opposed to *reset*)?

> Signed-off-by: Sergey Miroshnichenko <s.miroshnichenko@yadro.com>
> ---
>  drivers/nvme/host/pci.c | 29 +++++++++++++++++++++++++++--
>  1 file changed, 27 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index 92bad1c810ac..ccea3033a67a 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -106,6 +106,7 @@ struct nvme_dev {
>  	unsigned int num_vecs;
>  	int q_depth;
>  	u32 db_stride;
> +	resource_size_t current_phys_bar;
>  	void __iomem *bar;
>  	unsigned long bar_mapped_size;
>  	struct work_struct remove_work;
> @@ -1672,13 +1673,16 @@ static int nvme_remap_bar(struct nvme_dev *dev, unsigned long size)
>  {
>  	struct pci_dev *pdev = to_pci_dev(dev->dev);
>  
> -	if (size <= dev->bar_mapped_size)
> +	if (dev->bar &&
> +	    dev->current_phys_bar == pci_resource_start(pdev, 0) &&
> +	    size <= dev->bar_mapped_size)
>  		return 0;
>  	if (size > pci_resource_len(pdev, 0))
>  		return -ENOMEM;
>  	if (dev->bar)
>  		iounmap(dev->bar);
> -	dev->bar = ioremap(pci_resource_start(pdev, 0), size);
> +	dev->current_phys_bar = pci_resource_start(pdev, 0);
> +	dev->bar = ioremap(dev->current_phys_bar, size);

dev->current_phys_bar is different from pci_resource_start() in the
case where the PCI core has moved the nvme BAR, but nvme has not yet
remapped it.

I'm not sure it's worth keeping track of current_phys_bar, as opposed
to always unmapping and remapping.  Is this a performance path?  I
think there are advantages to always exercising the same code path,
regardless of whether the BAR happened to be moved, e.g., if there's a
bug in the "BAR moved" path, it may be a heisenbug because whether we
exercise that path depends on the current configuration.

If you do need to cache current_phys_bar, maybe this, so it's a little
easier to see that you're not changing the ioremap() itself:

  dev->bar = ioremap(pci_resource_start(pdev, 0), size);
  dev->current_phys_bar = pci_resource_start(pdev, 0);
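
Or, if the unconditional approach works out, the whole function reduces
to something like this (a sketch only).  Note the early "size <=
dev->bar_mapped_size" return has to go too, since a moved BAR needs a
fresh mapping even when the existing one is big enough:

  static int nvme_remap_bar(struct nvme_dev *dev, unsigned long size)
  {
  	struct pci_dev *pdev = to_pci_dev(dev->dev);

  	if (size > pci_resource_len(pdev, 0))
  		return -ENOMEM;

  	/* always remap: one code path whether or not the BAR moved */
  	if (dev->bar)
  		iounmap(dev->bar);
  	dev->bar = ioremap(pci_resource_start(pdev, 0), size);
  	if (!dev->bar) {
  		dev->bar_mapped_size = 0;
  		return -ENOMEM;
  	}
  	dev->bar_mapped_size = size;
  	dev->dbs = dev->bar + NVME_REG_DBS;

  	return 0;
  }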

>  	if (!dev->bar) {
>  		dev->bar_mapped_size = 0;
>  		return -ENOMEM;
> @@ -2504,6 +2508,8 @@ static void nvme_reset_work(struct work_struct *work)
>  	if (WARN_ON(dev->ctrl.state != NVME_CTRL_RESETTING))
>  		goto out;
>  
> +	nvme_remap_bar(dev, db_bar_size(dev, 0));

How is this change connected to rescan?  This looks reset-related.

>  	/*
>  	 * If we're called to reset a live controller first shut it down before
>  	 * moving on.
> @@ -2910,6 +2916,23 @@ static void nvme_error_resume(struct pci_dev *pdev)
>  	flush_work(&dev->ctrl.reset_work);
>  }
>  
> +void nvme_rescan_prepare(struct pci_dev *pdev)
> +{
> +	struct nvme_dev *dev = pci_get_drvdata(pdev);
> +
> +	nvme_dev_disable(dev, false);
> +	nvme_dev_unmap(dev);
> +	dev->bar = NULL;
> +}
> +
> +void nvme_rescan_done(struct pci_dev *pdev)
> +{
> +	struct nvme_dev *dev = pci_get_drvdata(pdev);
> +
> +	nvme_dev_map(dev);
> +	nvme_reset_ctrl_sync(&dev->ctrl);
> +}
> +
>  static const struct pci_error_handlers nvme_err_handler = {
>  	.error_detected	= nvme_error_detected,
>  	.slot_reset	= nvme_slot_reset,
> @@ -2974,6 +2997,8 @@ static struct pci_driver nvme_driver = {
>  	},
>  	.sriov_configure = pci_sriov_configure_simple,
>  	.err_handler	= &nvme_err_handler,
> +	.rescan_prepare	= nvme_rescan_prepare,
> +	.rescan_done	= nvme_rescan_done,
>  };
>  
>  static int __init nvme_init(void)
> -- 
> 2.20.1
> 

* Re: [PATCH RFC v4 02/21] PCI: Fix race condition in pci_enable/disable_device()
  2019-03-26 19:00   ` [PATCH RFC v4 02/21] PCI: Fix race condition in pci_enable/disable_device() Bjorn Helgaas
@ 2019-03-27 17:11     ` Sergey Miroshnichenko
  0 siblings, 0 replies; 4+ messages in thread
From: Sergey Miroshnichenko @ 2019-03-27 17:11 UTC (permalink / raw)
  To: Bjorn Helgaas
  Cc: linux-pci, linuxppc-dev, linux, Srinath Mannam, Marta Rybczynska,
	linux-kernel


On 3/26/19 10:00 PM, Bjorn Helgaas wrote:
> [+cc Srinath, Marta, LKML]
> 
> On Mon, Mar 11, 2019 at 04:31:03PM +0300, Sergey Miroshnichenko wrote:
>>  CPU0                                      CPU1
>>
>>  pci_enable_device_mem()                   pci_enable_device_mem()
>>    pci_enable_bridge()                       pci_enable_bridge()
>>      pci_is_enabled()
>>        return false;
>>      atomic_inc_return(enable_cnt)
>>      Start actually enabling the bridge
>>      ...                                       pci_is_enabled()
>>      ...                                         return true;
>>      ...                                   Start memory requests <-- FAIL
>>      ...
>>      Set the PCI_COMMAND_MEMORY bit <-- Must wait for this
>>
>> This patch protects pci_enable/disable_device() and pci_enable_bridge()
>> with a per-device mutex.
> 
> This is a subtle issue that we've tried to fix before, but we've never
> had a satisfactory solution, so I hope you've figured out the right
> fix.
> 
> I'll include some links to previous discussion.  This patch is very
> similar to [2], which we didn't actually apply.  We did apply the
> patch from [3] as 40f11adc7cd9 ("PCI: Avoid race while enabling
> upstream bridges"), but it caused the regressions reported in [4,5],
> so we reverted it with 0f50a49e3008 ("Revert "PCI: Avoid race while
> enabling upstream bridges"").
> 

Thanks for the links, I wasn't aware of these discussions and patches!

On PowerNV this issue is partially hidden by db2173198b95 ("powerpc/powernv/pci: Work
around races in PCI bridge enabling"), and on x86 the BIOS pre-initializes all the bridges,
so the issue doesn't reproduce until a device is hotplugged behind a bridge that was itself
hotplugged.

This patch is indeed similar to 40f11adc7cd9 ("PCI: Avoid race while enabling upstream
bridges"), but instead of a single static mutex it adds per-device mutexes and keeps
dev->enable_cnt from being incremented before the enabling actually completes. So there is
no longer any need to carefully pick a moment that is safe enough to enable the device.

Serge

> I think the underlying design problem is that we have a driver for
> device B calling pci_enable_device(), and it is changing the state of
> device A (an upstream bridge).  The model generally is that a driver
> should only touch the device it is bound to.
> 
> It's tricky to get the locking right when several children of device A
> all need to operate on A.
> 
> That's all to say I'll have to think carefully about this particular
> patch, so I'll go on to the others and come back to this one.
> 
> Bjorn
> 
> [1] https://lore.kernel.org/linux-pci/1494256190-28993-1-git-send-email-srinath.mannam@broadcom.com/T/#u
>     [RFC PATCH] pci: Concurrency issue in NVMe Init through PCIe switch
> 
> [2] https://lore.kernel.org/linux-pci/1496135297-19680-1-git-send-email-srinath.mannam@broadcom.com/T/#u
>     [RFC PATCH v2] pci: Concurrency issue in NVMe Init through PCIe switch
> 
> [3] https://lore.kernel.org/linux-pci/1501858648-22228-1-git-send-email-srinath.mannam@broadcom.com/T/#u
>     [RFC PATCH v3] pci: Concurrency issue during pci enable bridge
> 
> [4] https://lore.kernel.org/linux-pci/150547971091.977464.16294045866179907260.stgit@buzz/T/#u
>     [PATCH bisected regression in 4.14] PCI: fix race while enabling upstream bridges concurrently
> 
> [5] https://lore.kernel.org/linux-wireless/04c9b578-693c-1dc6-9f0f-904580231b21@kernel.dk/T/#u
>     iwlwifi firmware load broken in current -git
> 
> [6] https://lore.kernel.org/linux-pci/744877924.5841545.1521630049567.JavaMail.zimbra@kalray.eu/T/#u
>     [RFC PATCH] nvme: avoid race-conditions when enabling devices
> 
>> Signed-off-by: Sergey Miroshnichenko <s.miroshnichenko@yadro.com>
>> ---
>>  drivers/pci/pci.c   | 26 ++++++++++++++++++++++----
>>  drivers/pci/probe.c |  1 +
>>  include/linux/pci.h |  1 +
>>  3 files changed, 24 insertions(+), 4 deletions(-)
>>
>> diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
>> index f006068be209..895201d4c9e6 100644
>> --- a/drivers/pci/pci.c
>> +++ b/drivers/pci/pci.c
>> @@ -1615,6 +1615,8 @@ static void pci_enable_bridge(struct pci_dev *dev)
>>  	struct pci_dev *bridge;
>>  	int retval;
>>  
>> +	mutex_lock(&dev->enable_mutex);
>> +
>>  	bridge = pci_upstream_bridge(dev);
>>  	if (bridge)
>>  		pci_enable_bridge(bridge);
>> @@ -1622,6 +1624,7 @@ static void pci_enable_bridge(struct pci_dev *dev)
>>  	if (pci_is_enabled(dev)) {
>>  		if (!dev->is_busmaster)
>>  			pci_set_master(dev);
>> +		mutex_unlock(&dev->enable_mutex);
>>  		return;
>>  	}
>>  
>> @@ -1630,11 +1633,14 @@ static void pci_enable_bridge(struct pci_dev *dev)
>>  		pci_err(dev, "Error enabling bridge (%d), continuing\n",
>>  			retval);
>>  	pci_set_master(dev);
>> +	mutex_unlock(&dev->enable_mutex);
>>  }
>>  
>>  static int pci_enable_device_flags(struct pci_dev *dev, unsigned long flags)
>>  {
>>  	struct pci_dev *bridge;
>> +	/* Enable-locking of bridges is performed within the pci_enable_bridge() */
>> +	bool need_lock = !dev->subordinate;
>>  	int err;
>>  	int i, bars = 0;
>>  
>> @@ -1650,8 +1656,13 @@ static int pci_enable_device_flags(struct pci_dev *dev, unsigned long flags)
>>  		dev->current_state = (pmcsr & PCI_PM_CTRL_STATE_MASK);
>>  	}
>>  
>> -	if (atomic_inc_return(&dev->enable_cnt) > 1)
>> +	if (need_lock)
>> +		mutex_lock(&dev->enable_mutex);
>> +	if (pci_is_enabled(dev)) {
>> +		if (need_lock)
>> +			mutex_unlock(&dev->enable_mutex);
>>  		return 0;		/* already enabled */
>> +	}
>>  
>>  	bridge = pci_upstream_bridge(dev);
>>  	if (bridge)
>> @@ -1666,8 +1677,10 @@ static int pci_enable_device_flags(struct pci_dev *dev, unsigned long flags)
>>  			bars |= (1 << i);
>>  
>>  	err = do_pci_enable_device(dev, bars);
>> -	if (err < 0)
>> -		atomic_dec(&dev->enable_cnt);
>> +	if (err >= 0)
>> +		atomic_inc(&dev->enable_cnt);
>> +	if (need_lock)
>> +		mutex_unlock(&dev->enable_mutex);
>>  	return err;
>>  }
>>  
>> @@ -1910,15 +1923,20 @@ void pci_disable_device(struct pci_dev *dev)
>>  	if (dr)
>>  		dr->enabled = 0;
>>  
>> +	mutex_lock(&dev->enable_mutex);
>>  	dev_WARN_ONCE(&dev->dev, atomic_read(&dev->enable_cnt) <= 0,
>>  		      "disabling already-disabled device");
>>  
>> -	if (atomic_dec_return(&dev->enable_cnt) != 0)
>> +	if (atomic_dec_return(&dev->enable_cnt) != 0) {
>> +		mutex_unlock(&dev->enable_mutex);
>>  		return;
>> +	}
>>  
>>  	do_pci_disable_device(dev);
>>  
>>  	dev->is_busmaster = 0;
>> +
>> +	mutex_unlock(&dev->enable_mutex);
>>  }
>>  EXPORT_SYMBOL(pci_disable_device);
>>  
>> diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
>> index 2ec0df04e0dc..977a127ce791 100644
>> --- a/drivers/pci/probe.c
>> +++ b/drivers/pci/probe.c
>> @@ -2267,6 +2267,7 @@ struct pci_dev *pci_alloc_dev(struct pci_bus *bus)
>>  	INIT_LIST_HEAD(&dev->bus_list);
>>  	dev->dev.type = &pci_dev_type;
>>  	dev->bus = pci_bus_get(bus);
>> +	mutex_init(&dev->enable_mutex);
>>  
>>  	return dev;
>>  }
>> diff --git a/include/linux/pci.h b/include/linux/pci.h
>> index 77448215ef5b..cb2760a31fe2 100644
>> --- a/include/linux/pci.h
>> +++ b/include/linux/pci.h
>> @@ -419,6 +419,7 @@ struct pci_dev {
>>  	unsigned int	no_vf_scan:1;		/* Don't scan for VFs after IOV enablement */
>>  	pci_dev_flags_t dev_flags;
>>  	atomic_t	enable_cnt;	/* pci_enable_device has been called */
>> +	struct mutex	enable_mutex;
>>  
>>  	u32		saved_config_space[16]; /* Config space saved at suspend time */
>>  	struct hlist_head saved_cap_space;
>> -- 
>> 2.20.1
>>


* Re: [PATCH RFC v4 08/21] nvme-pci: Handle movable BARs
  2019-03-26 20:20   ` [PATCH RFC v4 08/21] nvme-pci: Handle movable BARs Bjorn Helgaas
@ 2019-03-27 17:30     ` Sergey Miroshnichenko
  0 siblings, 0 replies; 4+ messages in thread
From: Sergey Miroshnichenko @ 2019-03-27 17:30 UTC (permalink / raw)
  To: Bjorn Helgaas
  Cc: linux-pci, linuxppc-dev, linux, Keith Busch, Jens Axboe,
	Christoph Hellwig, Sagi Grimberg, linux-nvme, linux-kernel

On 3/26/19 11:20 PM, Bjorn Helgaas wrote:
> [+cc Keith, Jens, Christoph, Sagi, linux-nvme, LKML]
> 
> On Mon, Mar 11, 2019 at 04:31:09PM +0300, Sergey Miroshnichenko wrote:
>> Hotplugged devices can affect the existing ones by moving their BARs.
>> The PCI subsystem will inform the NVMe driver about this by invoking
>> reset_prepare()+reset_done(), after which iounmap()+ioremap() must be called.
> 
> Do you mean the PCI core will invoke ->rescan_prepare() and
> ->rescan_done() (as opposed to *reset*)?
> 

Yes, of course, sorry for the confusion!

These are new callbacks that let drivers explicitly declare support for movable BARs; when
a driver doesn't provide them, the PCI core notes that its BARs can't be moved for now.
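
On the core side the policy boils down to something like this (a sketch,
not the exact code from the patchset; the helper name is illustrative):

  /*
   * A bound driver's BARs may only be moved if it opted in via the
   * rescan_prepare()/rescan_done() pair.
   */
  static bool pci_dev_bars_movable(struct pci_dev *dev)
  {
  	struct pci_driver *drv = dev->driver;

  	if (!drv)
  		return true;	/* nothing has mapped the BARs yet */

  	return drv->rescan_prepare && drv->rescan_done;
  }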

>> Signed-off-by: Sergey Miroshnichenko <s.miroshnichenko@yadro.com>
>> ---
>>  drivers/nvme/host/pci.c | 29 +++++++++++++++++++++++++++--
>>  1 file changed, 27 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
>> index 92bad1c810ac..ccea3033a67a 100644
>> --- a/drivers/nvme/host/pci.c
>> +++ b/drivers/nvme/host/pci.c
>> @@ -106,6 +106,7 @@ struct nvme_dev {
>>  	unsigned int num_vecs;
>>  	int q_depth;
>>  	u32 db_stride;
>> +	resource_size_t current_phys_bar;
>>  	void __iomem *bar;
>>  	unsigned long bar_mapped_size;
>>  	struct work_struct remove_work;
>> @@ -1672,13 +1673,16 @@ static int nvme_remap_bar(struct nvme_dev *dev, unsigned long size)
>>  {
>>  	struct pci_dev *pdev = to_pci_dev(dev->dev);
>>  
>> -	if (size <= dev->bar_mapped_size)
>> +	if (dev->bar &&
>> +	    dev->current_phys_bar == pci_resource_start(pdev, 0) &&
>> +	    size <= dev->bar_mapped_size)
>>  		return 0;
>>  	if (size > pci_resource_len(pdev, 0))
>>  		return -ENOMEM;
>>  	if (dev->bar)
>>  		iounmap(dev->bar);
>> -	dev->bar = ioremap(pci_resource_start(pdev, 0), size);
>> +	dev->current_phys_bar = pci_resource_start(pdev, 0);
>> +	dev->bar = ioremap(dev->current_phys_bar, size);
> 
> dev->current_phys_bar is different from pci_resource_start() in the
> case where the PCI core has moved the nvme BAR, but nvme has not yet
> remapped it.
> 
> I'm not sure it's worth keeping track of current_phys_bar, as opposed
> to always unmapping and remapping.  Is this a performance path?  I
> think there are advantages to always exercising the same code path,
> regardless of whether the BAR happened to be moved, e.g., if there's a
> bug in the "BAR moved" path, it may be a heisenbug because whether we
> exercise that path depends on the current configuration.
> 
> If you do need to cache current_phys_bar, maybe this, so it's a little
> easier to see that you're not changing the ioremap() itself:
> 
>   dev->bar = ioremap(pci_resource_start(pdev, 0), size);
>   dev->current_phys_bar = pci_resource_start(pdev, 0);
> 

Oh, I see now. Rescan is a rather rare event, and unconditional remapping is simpler, and
therefore a bit more resistant to bugs.

>>  	if (!dev->bar) {
>>  		dev->bar_mapped_size = 0;
>>  		return -ENOMEM;
>> @@ -2504,6 +2508,8 @@ static void nvme_reset_work(struct work_struct *work)
>>  	if (WARN_ON(dev->ctrl.state != NVME_CTRL_RESETTING))
>>  		goto out;
>>  
>> +	nvme_remap_bar(dev, db_bar_size(dev, 0));
> 
> How is this change connected to rescan?  This looks reset-related.
> 

Thanks for catching that! This has also slipped in from an early stage of this patchset,
when reset_done() (which is rescan_done() now) just initiated an NVMe reset.

Best regards,
Serge

>>  	/*
>>  	 * If we're called to reset a live controller first shut it down before
>>  	 * moving on.
>> @@ -2910,6 +2916,23 @@ static void nvme_error_resume(struct pci_dev *pdev)
>>  	flush_work(&dev->ctrl.reset_work);
>>  }
>>  
>> +void nvme_rescan_prepare(struct pci_dev *pdev)
>> +{
>> +	struct nvme_dev *dev = pci_get_drvdata(pdev);
>> +
>> +	nvme_dev_disable(dev, false);
>> +	nvme_dev_unmap(dev);
>> +	dev->bar = NULL;
>> +}
>> +
>> +void nvme_rescan_done(struct pci_dev *pdev)
>> +{
>> +	struct nvme_dev *dev = pci_get_drvdata(pdev);
>> +
>> +	nvme_dev_map(dev);
>> +	nvme_reset_ctrl_sync(&dev->ctrl);
>> +}
>> +
>>  static const struct pci_error_handlers nvme_err_handler = {
>>  	.error_detected	= nvme_error_detected,
>>  	.slot_reset	= nvme_slot_reset,
>> @@ -2974,6 +2997,8 @@ static struct pci_driver nvme_driver = {
>>  	},
>>  	.sriov_configure = pci_sriov_configure_simple,
>>  	.err_handler	= &nvme_err_handler,
>> +	.rescan_prepare	= nvme_rescan_prepare,
>> +	.rescan_done	= nvme_rescan_done,
>>  };
>>  
>>  static int __init nvme_init(void)
>> -- 
>> 2.20.1
>>
