linux-kernel.vger.kernel.org archive mirror
* Re: [PATCH 6/6] mm/devm_memremap_pages: Fix final page put race
  2019-03-29 15:27 ` [PATCH 6/6] mm/devm_memremap_pages: Fix final page put race Dan Williams
@ 2019-03-29  9:46   ` Ira Weiny
  0 siblings, 0 replies; 11+ messages in thread
From: Ira Weiny @ 2019-03-29  9:46 UTC (permalink / raw)
  To: Dan Williams
  Cc: akpm, Logan Gunthorpe, Bjorn Helgaas, Jérôme Glisse,
	Christoph Hellwig, linux-mm, linux-pci, linux-nvdimm,
	linux-kernel

On Fri, Mar 29, 2019 at 08:27:55AM -0700, Dan Williams wrote:
> Logan noticed that devm_memremap_pages_release() kills the percpu_ref,
> drops all the page references that were acquired at init, and then
> immediately proceeds to unplug, via arch_remove_memory(), the backing pages
> for the pagemap. If for some reason device shutdown actually collides
> with a busy / elevated-ref-count page then arch_remove_memory() should
> be deferred until after that reference is dropped.
> 
> As it stands the "wait for last page ref drop" happens *after*
> devm_memremap_pages_release() returns, which is obviously too late and
> can lead to crashes.
> 
> Fix this situation by assigning the responsibility to wait for the
> percpu_ref to go idle to devm_memremap_pages() with a new ->cleanup()
> callback. Implement the new cleanup callback for all
> devm_memremap_pages() users: pmem, devdax, hmm, and p2pdma.
> 
> Reported-by: Logan Gunthorpe <logang@deltatee.com>
> Fixes: 41e94a851304 ("add devm_memremap_pages")
> Cc: Ira Weiny <ira.weiny@intel.com>
> Cc: Bjorn Helgaas <bhelgaas@google.com>
> Cc: "Jérôme Glisse" <jglisse@redhat.com>
> Cc: Christoph Hellwig <hch@lst.de>
> Signed-off-by: Dan Williams <dan.j.williams@intel.com>

For the series:

Reviewed-by: Ira Weiny <ira.weiny@intel.com>

> ---
>  drivers/dax/device.c              |   13 +++----------
>  drivers/nvdimm/pmem.c             |   17 +++++++++++++----
>  drivers/pci/p2pdma.c              |   17 +++--------------
>  include/linux/memremap.h          |    2 ++
>  kernel/memremap.c                 |   17 ++++++++++++-----
>  mm/hmm.c                          |   14 +++-----------
>  tools/testing/nvdimm/test/iomap.c |    2 ++
>  7 files changed, 38 insertions(+), 44 deletions(-)
> 
> diff --git a/drivers/dax/device.c b/drivers/dax/device.c
> index e428468ab661..e3aa78dd1bb0 100644
> --- a/drivers/dax/device.c
> +++ b/drivers/dax/device.c
> @@ -27,9 +27,8 @@ static void dev_dax_percpu_release(struct percpu_ref *ref)
>  	complete(&dev_dax->cmp);
>  }
>  
> -static void dev_dax_percpu_exit(void *data)
> +static void dev_dax_percpu_exit(struct percpu_ref *ref)
>  {
> -	struct percpu_ref *ref = data;
>  	struct dev_dax *dev_dax = ref_to_dev_dax(ref);
>  
>  	dev_dbg(&dev_dax->dev, "%s\n", __func__);
> @@ -468,18 +467,12 @@ int dev_dax_probe(struct device *dev)
>  	if (rc)
>  		return rc;
>  
> -	rc = devm_add_action_or_reset(dev, dev_dax_percpu_exit, &dev_dax->ref);
> -	if (rc)
> -		return rc;
> -
>  	dev_dax->pgmap.ref = &dev_dax->ref;
>  	dev_dax->pgmap.kill = dev_dax_percpu_kill;
> +	dev_dax->pgmap.cleanup = dev_dax_percpu_exit;
>  	addr = devm_memremap_pages(dev, &dev_dax->pgmap);
> -	if (IS_ERR(addr)) {
> -		devm_remove_action(dev, dev_dax_percpu_exit, &dev_dax->ref);
> -		percpu_ref_exit(&dev_dax->ref);
> +	if (IS_ERR(addr))
>  		return PTR_ERR(addr);
> -	}
>  
>  	inode = dax_inode(dax_dev);
>  	cdev = inode->i_cdev;
> diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
> index bc2f700feef8..507b9eda42aa 100644
> --- a/drivers/nvdimm/pmem.c
> +++ b/drivers/nvdimm/pmem.c
> @@ -304,11 +304,19 @@ static const struct attribute_group *pmem_attribute_groups[] = {
>  	NULL,
>  };
>  
> -static void pmem_release_queue(void *q)
> +static void __pmem_release_queue(struct percpu_ref *ref)
>  {
> +	struct request_queue *q;
> +
> +	q = container_of(ref, typeof(*q), q_usage_counter);
>  	blk_cleanup_queue(q);
>  }
>  
> +static void pmem_release_queue(void *ref)
> +{
> +	__pmem_release_queue(ref);
> +}
> +
>  static void pmem_freeze_queue(struct percpu_ref *ref)
>  {
>  	struct request_queue *q;
> @@ -400,12 +408,10 @@ static int pmem_attach_disk(struct device *dev,
>  	if (!q)
>  		return -ENOMEM;
>  
> -	if (devm_add_action_or_reset(dev, pmem_release_queue, q))
> -		return -ENOMEM;
> -
>  	pmem->pfn_flags = PFN_DEV;
>  	pmem->pgmap.ref = &q->q_usage_counter;
>  	pmem->pgmap.kill = pmem_freeze_queue;
> +	pmem->pgmap.cleanup = __pmem_release_queue;
>  	if (is_nd_pfn(dev)) {
>  		if (setup_pagemap_fsdax(dev, &pmem->pgmap))
>  			return -ENOMEM;
> @@ -426,6 +432,9 @@ static int pmem_attach_disk(struct device *dev,
>  		pmem->pfn_flags |= PFN_MAP;
>  		memcpy(&bb_res, &pmem->pgmap.res, sizeof(bb_res));
>  	} else {
> +		if (devm_add_action_or_reset(dev, pmem_release_queue,
> +					&q->q_usage_counter))
> +			return -ENOMEM;
>  		addr = devm_memremap(dev, pmem->phys_addr,
>  				pmem->size, ARCH_MEMREMAP_PMEM);
>  		memcpy(&bb_res, &nsio->res, sizeof(bb_res));
> diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
> index 1b96c1688715..7ff5b8067670 100644
> --- a/drivers/pci/p2pdma.c
> +++ b/drivers/pci/p2pdma.c
> @@ -95,7 +95,7 @@ static void pci_p2pdma_percpu_kill(struct percpu_ref *ref)
>  	percpu_ref_kill(ref);
>  }
>  
> -static void pci_p2pdma_percpu_cleanup(void *ref)
> +static void pci_p2pdma_percpu_cleanup(struct percpu_ref *ref)
>  {
>  	struct p2pdma_pagemap *p2p_pgmap = to_p2p_pgmap(ref);
>  
> @@ -197,16 +197,6 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size,
>  	if (error)
>  		goto pgmap_free;
>  
> -	/*
> -	 * FIXME: the percpu_ref_exit needs to be coordinated internal
> -	 * to devm_memremap_pages_release(). Duplicate the same ordering
> -	 * as other devm_memremap_pages() users for now.
> -	 */
> -	error = devm_add_action(&pdev->dev, pci_p2pdma_percpu_cleanup,
> -			&p2p_pgmap->ref);
> -	if (error)
> -		goto ref_cleanup;
> -
>  	pgmap = &p2p_pgmap->pgmap;
>  
>  	pgmap->res.start = pci_resource_start(pdev, bar) + offset;
> @@ -217,11 +207,12 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size,
>  	pgmap->pci_p2pdma_bus_offset = pci_bus_address(pdev, bar) -
>  		pci_resource_start(pdev, bar);
>  	pgmap->kill = pci_p2pdma_percpu_kill;
> +	pgmap->cleanup = pci_p2pdma_percpu_cleanup;
>  
>  	addr = devm_memremap_pages(&pdev->dev, pgmap);
>  	if (IS_ERR(addr)) {
>  		error = PTR_ERR(addr);
> -		goto ref_exit;
> +		goto pgmap_free;
>  	}
>  
>  	error = gen_pool_add_owner(pdev->p2pdma->pool, (unsigned long)addr,
> @@ -238,8 +229,6 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size,
>  
>  pages_free:
>  	devm_memunmap_pages(&pdev->dev, pgmap);
> -ref_cleanup:
> -	percpu_ref_exit(&p2p_pgmap->ref);
>  pgmap_free:
>  	devm_kfree(&pdev->dev, p2p_pgmap);
>  	return error;
> diff --git a/include/linux/memremap.h b/include/linux/memremap.h
> index 7601ee314c4a..1732dea030b2 100644
> --- a/include/linux/memremap.h
> +++ b/include/linux/memremap.h
> @@ -81,6 +81,7 @@ typedef void (*dev_page_free_t)(struct page *page, void *data);
>   * @res: physical address range covered by @ref
>   * @ref: reference count that pins the devm_memremap_pages() mapping
>   * @kill: callback to transition @ref to the dead state
> + * @cleanup: callback to wait for @ref to be idle and reap it
>   * @dev: host device of the mapping for debug
>   * @data: private data pointer for page_free()
>   * @type: memory type: see MEMORY_* in memory_hotplug.h
> @@ -92,6 +93,7 @@ struct dev_pagemap {
>  	struct resource res;
>  	struct percpu_ref *ref;
>  	void (*kill)(struct percpu_ref *ref);
> +	void (*cleanup)(struct percpu_ref *ref);
>  	struct device *dev;
>  	void *data;
>  	enum memory_type type;
> diff --git a/kernel/memremap.c b/kernel/memremap.c
> index 65afbacab44e..05d1af5a2f15 100644
> --- a/kernel/memremap.c
> +++ b/kernel/memremap.c
> @@ -96,6 +96,7 @@ static void devm_memremap_pages_release(void *data)
>  	pgmap->kill(pgmap->ref);
>  	for_each_device_pfn(pfn, pgmap)
>  		put_page(pfn_to_page(pfn));
> +	pgmap->cleanup(pgmap->ref);
>  
>  	/* pages are dead and unused, undo the arch mapping */
>  	align_start = res->start & ~(SECTION_SIZE - 1);
> @@ -134,8 +135,8 @@ static void devm_memremap_pages_release(void *data)
>   * 2/ The altmap field may optionally be initialized, in which case altmap_valid
>   *    must be set to true
>   *
> - * 3/ pgmap->ref must be 'live' on entry and will be killed at
> - *    devm_memremap_pages_release() time, or if this routine fails.
> + * 3/ pgmap->ref must be 'live' on entry and will be killed and reaped
> + *    at devm_memremap_pages_release() time, or if this routine fails.
>   *
>   * 4/ res is expected to be a host memory range that could feasibly be
>   *    treated as a "System RAM" range, i.e. not a device mmio range, but
> @@ -151,8 +152,10 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
>  	pgprot_t pgprot = PAGE_KERNEL;
>  	int error, nid, is_ram;
>  
> -	if (!pgmap->ref || !pgmap->kill)
> +	if (!pgmap->ref || !pgmap->kill || !pgmap->cleanup) {
> +		WARN(1, "Missing reference count teardown definition\n");
>  		return ERR_PTR(-EINVAL);
> +	}
>  
>  	align_start = res->start & ~(SECTION_SIZE - 1);
>  	align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE)
> @@ -163,14 +166,16 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
>  	if (conflict_pgmap) {
>  		dev_WARN(dev, "Conflicting mapping in same section\n");
>  		put_dev_pagemap(conflict_pgmap);
> -		return ERR_PTR(-ENOMEM);
> +		error = -ENOMEM;
> +		goto err_array;
>  	}
>  
>  	conflict_pgmap = get_dev_pagemap(PHYS_PFN(align_end), NULL);
>  	if (conflict_pgmap) {
>  		dev_WARN(dev, "Conflicting mapping in same section\n");
>  		put_dev_pagemap(conflict_pgmap);
> -		return ERR_PTR(-ENOMEM);
> +		error = -ENOMEM;
> +		goto err_array;
>  	}
>  
>  	is_ram = region_intersects(align_start, align_size,
> @@ -262,6 +267,8 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
>  	pgmap_array_delete(res);
>   err_array:
>  	pgmap->kill(pgmap->ref);
> +	pgmap->cleanup(pgmap->ref);
> +
>  	return ERR_PTR(error);
>  }
>  EXPORT_SYMBOL_GPL(devm_memremap_pages);
> diff --git a/mm/hmm.c b/mm/hmm.c
> index fe1cd87e49ac..225ade644058 100644
> --- a/mm/hmm.c
> +++ b/mm/hmm.c
> @@ -975,9 +975,8 @@ static void hmm_devmem_ref_release(struct percpu_ref *ref)
>  	complete(&devmem->completion);
>  }
>  
> -static void hmm_devmem_ref_exit(void *data)
> +static void hmm_devmem_ref_exit(struct percpu_ref *ref)
>  {
> -	struct percpu_ref *ref = data;
>  	struct hmm_devmem *devmem;
>  
>  	devmem = container_of(ref, struct hmm_devmem, ref);
> @@ -1054,10 +1053,6 @@ struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops,
>  	if (ret)
>  		return ERR_PTR(ret);
>  
> -	ret = devm_add_action_or_reset(device, hmm_devmem_ref_exit, &devmem->ref);
> -	if (ret)
> -		return ERR_PTR(ret);
> -
>  	size = ALIGN(size, PA_SECTION_SIZE);
>  	addr = min((unsigned long)iomem_resource.end,
>  		   (1UL << MAX_PHYSMEM_BITS) - 1);
> @@ -1096,6 +1091,7 @@ struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops,
>  	devmem->pagemap.ref = &devmem->ref;
>  	devmem->pagemap.data = devmem;
>  	devmem->pagemap.kill = hmm_devmem_ref_kill;
> +	devmem->pagemap.cleanup = hmm_devmem_ref_exit;
>  
>  	result = devm_memremap_pages(devmem->device, &devmem->pagemap);
>  	if (IS_ERR(result))
> @@ -1133,11 +1129,6 @@ struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops,
>  	if (ret)
>  		return ERR_PTR(ret);
>  
> -	ret = devm_add_action_or_reset(device, hmm_devmem_ref_exit,
> -			&devmem->ref);
> -	if (ret)
> -		return ERR_PTR(ret);
> -
>  	devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT;
>  	devmem->pfn_last = devmem->pfn_first +
>  			   (resource_size(devmem->resource) >> PAGE_SHIFT);
> @@ -1150,6 +1141,7 @@ struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops,
>  	devmem->pagemap.ref = &devmem->ref;
>  	devmem->pagemap.data = devmem;
>  	devmem->pagemap.kill = hmm_devmem_ref_kill;
> +	devmem->pagemap.cleanup = hmm_devmem_ref_exit;
>  
>  	result = devm_memremap_pages(devmem->device, &devmem->pagemap);
>  	if (IS_ERR(result))
> diff --git a/tools/testing/nvdimm/test/iomap.c b/tools/testing/nvdimm/test/iomap.c
> index c6635fee27d8..219dd0a1cb08 100644
> --- a/tools/testing/nvdimm/test/iomap.c
> +++ b/tools/testing/nvdimm/test/iomap.c
> @@ -108,7 +108,9 @@ static void nfit_test_kill(void *_pgmap)
>  {
>  	struct dev_pagemap *pgmap = _pgmap;
>  
> +	WARN_ON(!pgmap || !pgmap->ref || !pgmap->kill || !pgmap->cleanup);
>  	pgmap->kill(pgmap->ref);
> +	pgmap->cleanup(pgmap->ref);
>  }
>  
>  void *__wrap_devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
> 

* [PATCH 0/6] mm/devm_memremap_pages: Fix page release race
@ 2019-03-29 15:27 Dan Williams
  2019-03-29 15:27 ` [PATCH 1/6] drivers/base/devres: Introduce devm_release_action() Dan Williams
                   ` (5 more replies)
  0 siblings, 6 replies; 11+ messages in thread
From: Dan Williams @ 2019-03-29 15:27 UTC (permalink / raw)
  To: akpm
  Cc: Ira Weiny, Bjorn Helgaas, Logan Gunthorpe, Christoph Hellwig,
	Jérôme Glisse, Greg Kroah-Hartman, Rafael J. Wysocki,
	linux-mm, linux-pci, linux-nvdimm, linux-kernel

Logan audited the devm_memremap_pages() shutdown path and noticed that
it was possible to proceed to arch_remove_memory() before all
potential page references have been reaped.

Introduce a new ->cleanup() callback to do the work of waiting for any
straggling page references and then perform the percpu_ref_exit() in
devm_memremap_pages_release() context.
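
As a rough illustration (not part of this series; the foo_* names are
hypothetical), a devm_memremap_pages() user now wires up both callbacks
along these lines:

    struct foo {
            struct completion cmp;
            struct percpu_ref ref;
            struct dev_pagemap pgmap;
    };

    static void foo_percpu_release(struct percpu_ref *ref)
    {
            struct foo *foo = container_of(ref, struct foo, ref);

            complete(&foo->cmp);            /* last page reference gone */
    }

    static void foo_percpu_kill(struct percpu_ref *ref)
    {
            percpu_ref_kill(ref);           /* start draining page refs */
    }

    static void foo_percpu_cleanup(struct percpu_ref *ref)
    {
            struct foo *foo = container_of(ref, struct foo, ref);

            wait_for_completion(&foo->cmp); /* wait for ->release() */
            percpu_ref_exit(ref);
    }

    /* probe path, after init_completion() and percpu_ref_init() */
    foo->pgmap.ref = &foo->ref;
    foo->pgmap.kill = foo_percpu_kill;
    foo->pgmap.cleanup = foo_percpu_cleanup;
    addr = devm_memremap_pages(dev, &foo->pgmap);

devm_memremap_pages_release() calls ->kill(), drops the initial page
references, waits out any stragglers via ->cleanup(), and only then
proceeds to arch_remove_memory().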

For p2pdma this involves some deeper reworks to reference count
resources on a per-instance basis rather than a per pci-device basis. A
modified genalloc api is introduced to convey a driver-private pointer
through gen_pool_{alloc,free}() interfaces. Also, a
devm_memunmap_pages() api is introduced since p2pdma does not
auto-release resources on a setup failure.
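
Roughly, the owner-aware calls look like this (sketch only, error
handling trimmed; pool / vaddr / p2p_pgmap are placeholders mirroring
the p2pdma usage later in the series):

    /* provider: stash a per-chunk owner when publishing memory */
    error = gen_pool_add_owner(pool, (unsigned long)vaddr, phys, size,
                    dev_to_node(dev), &p2p_pgmap->ref);

    /* consumer: recall the owner backing whichever chunk was selected */
    struct percpu_ref *ref;
    void *addr = (void *)gen_pool_alloc_owner(pool, size, (void **)&ref);

    if (addr && !percpu_ref_tryget_live(ref)) {
            /* pagemap already dying, give the chunk back */
            gen_pool_free(pool, (unsigned long)addr, size);
            addr = NULL;
    }

    /* free path: retrieve the same owner to drop the pin */
    gen_pool_free_owner(pool, (unsigned long)addr, size, (void **)&ref);
    percpu_ref_put(ref);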

The dax and pmem changes pass the nvdimm unit tests, but the hmm and
p2pdma changes are compile-tested only.

Thoughts on the api changes?

I'm targeting to land this through Andrew's tree. 0day has chewed on
this for a day and reported no issues so far.

---

Dan Williams (6):
      drivers/base/devres: Introduce devm_release_action()
      mm/devm_memremap_pages: Introduce devm_memunmap_pages
      pci/p2pdma: Fix the gen_pool_add_virt() failure path
      lib/genalloc: Introduce chunk owners
      pci/p2pdma: Track pgmap references per resource, not globally
      mm/devm_memremap_pages: Fix final page put race


 drivers/base/devres.c             |   24 ++++++++
 drivers/dax/device.c              |   13 +----
 drivers/nvdimm/pmem.c             |   17 +++++-
 drivers/pci/p2pdma.c              |  105 +++++++++++++++++++++++--------------
 include/linux/device.h            |    1 
 include/linux/genalloc.h          |   55 +++++++++++++++++--
 include/linux/memremap.h          |    8 +++
 kernel/memremap.c                 |   23 ++++++--
 lib/genalloc.c                    |   51 +++++++++---------
 mm/hmm.c                          |   14 +----
 tools/testing/nvdimm/test/iomap.c |    2 +
 11 files changed, 209 insertions(+), 104 deletions(-)

* [PATCH 1/6] drivers/base/devres: Introduce devm_release_action()
  2019-03-29 15:27 [PATCH 0/6] mm/devm_memremap_pages: Fix page release race Dan Williams
@ 2019-03-29 15:27 ` Dan Williams
  2019-03-29 15:27 ` [PATCH 2/6] mm/devm_memremap_pages: Introduce devm_memunmap_pages Dan Williams
                   ` (4 subsequent siblings)
  5 siblings, 0 replies; 11+ messages in thread
From: Dan Williams @ 2019-03-29 15:27 UTC (permalink / raw)
  To: akpm
  Cc: Logan Gunthorpe, Ira Weiny, Bjorn Helgaas, Christoph Hellwig,
	Greg Kroah-Hartman, Rafael J. Wysocki, linux-mm, linux-pci,
	linux-nvdimm, linux-kernel

The devm_add_action() facility allows a resource allocation routine to
add custom devm semantics. One such user is devm_memremap_pages().

There is now a need to manually trigger devm_memremap_pages_release().
Introduce devm_release_action() so the release action can be triggered
via a new devm_memunmap_pages() api in a follow-on change.
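
A rough usage sketch (hypothetical foo names), mirroring how
devm_memunmap_pages() will use it:

    /* setup: register the teardown action as usual */
    rc = devm_add_action_or_reset(dev, foo_teardown, foo);
    if (rc)
            return rc;

    /* later, e.g. on an error path: run and unregister that action
     * now instead of waiting for the device to be unbound */
    devm_release_action(dev, foo_teardown, foo);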

Cc: Logan Gunthorpe <logang@deltatee.com>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/base/devres.c  |   24 +++++++++++++++++++++++-
 include/linux/device.h |    1 +
 2 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/drivers/base/devres.c b/drivers/base/devres.c
index e038e2b3b7ea..0bbb328bd17f 100644
--- a/drivers/base/devres.c
+++ b/drivers/base/devres.c
@@ -755,10 +755,32 @@ void devm_remove_action(struct device *dev, void (*action)(void *), void *data)
 
 	WARN_ON(devres_destroy(dev, devm_action_release, devm_action_match,
 			       &devres));
-
 }
 EXPORT_SYMBOL_GPL(devm_remove_action);
 
+/**
+ * devm_release_action() - release previously added custom action
+ * @dev: Device that owns the action
+ * @action: Function implementing the action
+ * @data: Pointer to data passed to @action implementation
+ *
+ * Releases and removes instance of @action previously added by
+ * devm_add_action().  Both action and data should match one of the
+ * existing entries.
+ */
+void devm_release_action(struct device *dev, void (*action)(void *), void *data)
+{
+	struct action_devres devres = {
+		.data = data,
+		.action = action,
+	};
+
+	WARN_ON(devres_release(dev, devm_action_release, devm_action_match,
+			       &devres));
+
+}
+EXPORT_SYMBOL_GPL(devm_release_action);
+
 /*
  * Managed kmalloc/kfree
  */
diff --git a/include/linux/device.h b/include/linux/device.h
index b425a7ee04ce..02a3e45de9af 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -715,6 +715,7 @@ void __iomem *devm_of_iomap(struct device *dev,
 /* allows to add/remove a custom action to devres stack */
 int devm_add_action(struct device *dev, void (*action)(void *), void *data);
 void devm_remove_action(struct device *dev, void (*action)(void *), void *data);
+void devm_release_action(struct device *dev, void (*action)(void *), void *data);
 
 static inline int devm_add_action_or_reset(struct device *dev,
 					   void (*action)(void *), void *data)


* [PATCH 2/6] mm/devm_memremap_pages: Introduce devm_memunmap_pages
  2019-03-29 15:27 [PATCH 0/6] mm/devm_memremap_pages: Fix page release race Dan Williams
  2019-03-29 15:27 ` [PATCH 1/6] drivers/base/devres: Introduce devm_release_action() Dan Williams
@ 2019-03-29 15:27 ` Dan Williams
  2019-03-29 15:27 ` [PATCH 3/6] pci/p2pdma: Fix the gen_pool_add_virt() failure path Dan Williams
                   ` (3 subsequent siblings)
  5 siblings, 0 replies; 11+ messages in thread
From: Dan Williams @ 2019-03-29 15:27 UTC (permalink / raw)
  To: akpm
  Cc: Logan Gunthorpe, Ira Weiny, Bjorn Helgaas, Christoph Hellwig,
	linux-mm, linux-pci, linux-nvdimm, linux-kernel

Use the new devm_release_action() facility to allow
devm_memremap_pages_release() to be manually triggered.

Cc: Logan Gunthorpe <logang@deltatee.com>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 include/linux/memremap.h |    6 ++++++
 kernel/memremap.c        |    6 ++++++
 2 files changed, 12 insertions(+)

diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index f0628660d541..7601ee314c4a 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -100,6 +100,7 @@ struct dev_pagemap {
 
 #ifdef CONFIG_ZONE_DEVICE
 void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap);
+void devm_memunmap_pages(struct device *dev, struct dev_pagemap *pgmap);
 struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
 		struct dev_pagemap *pgmap);
 
@@ -118,6 +119,11 @@ static inline void *devm_memremap_pages(struct device *dev,
 	return ERR_PTR(-ENXIO);
 }
 
+static inline void devm_memunmap_pages(struct device *dev,
+		struct dev_pagemap *pgmap)
+{
+}
+
 static inline struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
 		struct dev_pagemap *pgmap)
 {
diff --git a/kernel/memremap.c b/kernel/memremap.c
index a856cb5ff192..65afbacab44e 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -266,6 +266,12 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
 }
 EXPORT_SYMBOL_GPL(devm_memremap_pages);
 
+void devm_memunmap_pages(struct device *dev, struct dev_pagemap *pgmap)
+{
+	devm_release_action(dev, devm_memremap_pages_release, pgmap);
+}
+EXPORT_SYMBOL_GPL(devm_memunmap_pages);
+
 unsigned long vmem_altmap_offset(struct vmem_altmap *altmap)
 {
 	/* number of pfns from base where pfn_to_page() is valid */


* [PATCH 3/6] pci/p2pdma: Fix the gen_pool_add_virt() failure path
  2019-03-29 15:27 [PATCH 0/6] mm/devm_memremap_pages: Fix page release race Dan Williams
  2019-03-29 15:27 ` [PATCH 1/6] drivers/base/devres: Introduce devm_release_action() Dan Williams
  2019-03-29 15:27 ` [PATCH 2/6] mm/devm_memremap_pages: Introduce devm_memunmap_pages Dan Williams
@ 2019-03-29 15:27 ` Dan Williams
  2019-03-29 17:24   ` Bjorn Helgaas
  2019-03-29 15:27 ` [PATCH 4/6] lib/genalloc: Introduce chunk owners Dan Williams
                   ` (2 subsequent siblings)
  5 siblings, 1 reply; 11+ messages in thread
From: Dan Williams @ 2019-03-29 15:27 UTC (permalink / raw)
  To: akpm
  Cc: Logan Gunthorpe, Ira Weiny, Bjorn Helgaas, Christoph Hellwig,
	linux-mm, linux-pci, linux-nvdimm, linux-kernel

The pci_p2pdma_add_resource() implementation immediately frees the pgmap
if gen_pool_add_virt() fails. However, that means that when @dev
triggers a devres release devm_memremap_pages_release() will crash
trying to access the freed @pgmap.

Use the new devm_memunmap_pages() to manually free the mapping in the
error path.

Fixes: 52916982af48 ("PCI/P2PDMA: Support peer-to-peer memory")
Cc: Logan Gunthorpe <logang@deltatee.com>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/pci/p2pdma.c |    4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
index c52298d76e64..595a534bd749 100644
--- a/drivers/pci/p2pdma.c
+++ b/drivers/pci/p2pdma.c
@@ -208,13 +208,15 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size,
 			pci_bus_address(pdev, bar) + offset,
 			resource_size(&pgmap->res), dev_to_node(&pdev->dev));
 	if (error)
-		goto pgmap_free;
+		goto pages_free;
 
 	pci_info(pdev, "added peer-to-peer DMA memory %pR\n",
 		 &pgmap->res);
 
 	return 0;
 
+pages_free:
+	devm_memunmap_pages(&pdev->dev, pgmap);
 pgmap_free:
 	devm_kfree(&pdev->dev, pgmap);
 	return error;


* [PATCH 4/6] lib/genalloc: Introduce chunk owners
  2019-03-29 15:27 [PATCH 0/6] mm/devm_memremap_pages: Fix page release race Dan Williams
                   ` (2 preceding siblings ...)
  2019-03-29 15:27 ` [PATCH 3/6] pci/p2pdma: Fix the gen_pool_add_virt() failure path Dan Williams
@ 2019-03-29 15:27 ` Dan Williams
  2019-03-29 15:27 ` [PATCH 5/6] pci/p2pdma: Track pgmap references per resource, not globally Dan Williams
  2019-03-29 15:27 ` [PATCH 6/6] mm/devm_memremap_pages: Fix final page put race Dan Williams
  5 siblings, 0 replies; 11+ messages in thread
From: Dan Williams @ 2019-03-29 15:27 UTC (permalink / raw)
  To: akpm
  Cc: Logan Gunthorpe, Ira Weiny, Bjorn Helgaas,
	Jérôme Glisse, Christoph Hellwig, linux-mm, linux-pci,
	linux-nvdimm, linux-kernel

The p2pdma facility enables a provider to publish a pool of dma
addresses for a consumer to allocate. A genpool is used internally by
p2pdma to collect dma resources, 'chunks', to be handed out to
consumers. Whenever a consumer allocates a resource it needs to pin the
'struct dev_pagemap' instance that backs the chunk selected by
pci_alloc_p2pmem().

Currently that reference is taken globally on the entire provider
device. That sets up a lifetime mismatch whereby the p2pdma core needs
to maintain hacks to make sure the percpu_ref is not released twice.

This lifetime mismatch also stands in the way of a fix to
devm_memremap_pages() whereby devm_memremap_pages_release() must wait
for the percpu_ref ->release() callback to complete before it can
proceed to teardown pages.

So, towards fixing this situation, introduce the ability to store a
'chunk owner' at gen_pool_add() time, and a facility to retrieve the
owner at gen_pool_{alloc,free}() time. For p2pdma this will be used to
store and recall individual dev_pagemap reference counter instances
per-chunk.

Cc: Logan Gunthorpe <logang@deltatee.com>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: "Jérôme Glisse" <jglisse@redhat.com>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 include/linux/genalloc.h |   55 +++++++++++++++++++++++++++++++++++++++++-----
 lib/genalloc.c           |   51 +++++++++++++++++++++----------------------
 2 files changed, 74 insertions(+), 32 deletions(-)

diff --git a/include/linux/genalloc.h b/include/linux/genalloc.h
index dd0a452373e7..a337313e064f 100644
--- a/include/linux/genalloc.h
+++ b/include/linux/genalloc.h
@@ -75,6 +75,7 @@ struct gen_pool_chunk {
 	struct list_head next_chunk;	/* next chunk in pool */
 	atomic_long_t avail;
 	phys_addr_t phys_addr;		/* physical starting address of memory chunk */
+	void *owner;			/* private data to retrieve at alloc time */
 	unsigned long start_addr;	/* start address of memory chunk */
 	unsigned long end_addr;		/* end address of memory chunk (inclusive) */
 	unsigned long bits[0];		/* bitmap for allocating memory chunk */
@@ -96,8 +97,15 @@ struct genpool_data_fixed {
 
 extern struct gen_pool *gen_pool_create(int, int);
 extern phys_addr_t gen_pool_virt_to_phys(struct gen_pool *pool, unsigned long);
-extern int gen_pool_add_virt(struct gen_pool *, unsigned long, phys_addr_t,
-			     size_t, int);
+extern int gen_pool_add_owner(struct gen_pool *, unsigned long, phys_addr_t,
+			     size_t, int, void *);
+
+static inline int gen_pool_add_virt(struct gen_pool *pool, unsigned long addr,
+		phys_addr_t phys, size_t size, int nid)
+{
+	return gen_pool_add_owner(pool, addr, phys, size, nid, NULL);
+}
+
 /**
  * gen_pool_add - add a new chunk of special memory to the pool
  * @pool: pool to add new memory chunk to
@@ -116,12 +124,47 @@ static inline int gen_pool_add(struct gen_pool *pool, unsigned long addr,
 	return gen_pool_add_virt(pool, addr, -1, size, nid);
 }
 extern void gen_pool_destroy(struct gen_pool *);
-extern unsigned long gen_pool_alloc(struct gen_pool *, size_t);
-extern unsigned long gen_pool_alloc_algo(struct gen_pool *, size_t,
-		genpool_algo_t algo, void *data);
+unsigned long gen_pool_alloc_algo_owner(struct gen_pool *pool, size_t size,
+		genpool_algo_t algo, void *data, void **owner);
+
+static inline unsigned long gen_pool_alloc_owner(struct gen_pool *pool,
+		size_t size, void **owner)
+{
+	return gen_pool_alloc_algo_owner(pool, size, pool->algo, pool->data,
+			owner);
+}
+
+static inline unsigned long gen_pool_alloc_algo(struct gen_pool *pool,
+		size_t size, genpool_algo_t algo, void *data)
+{
+	return gen_pool_alloc_algo_owner(pool, size, algo, data, NULL);
+}
+
+/**
+ * gen_pool_alloc - allocate special memory from the pool
+ * @pool: pool to allocate from
+ * @size: number of bytes to allocate from the pool
+ *
+ * Allocate the requested number of bytes from the specified pool.
+ * Uses the pool allocation function (with first-fit algorithm by default).
+ * Can not be used in NMI handler on architectures without
+ * NMI-safe cmpxchg implementation.
+ */
+static inline unsigned long gen_pool_alloc(struct gen_pool *pool, size_t size)
+{
+	return gen_pool_alloc_algo(pool, size, pool->algo, pool->data);
+}
+
 extern void *gen_pool_dma_alloc(struct gen_pool *pool, size_t size,
 		dma_addr_t *dma);
-extern void gen_pool_free(struct gen_pool *, unsigned long, size_t);
+extern void gen_pool_free_owner(struct gen_pool *pool, unsigned long addr,
+		size_t size, void **owner);
+static inline void gen_pool_free(struct gen_pool *pool, unsigned long addr,
+                size_t size)
+{
+	gen_pool_free_owner(pool, addr, size, NULL);
+}
+
 extern void gen_pool_for_each_chunk(struct gen_pool *,
 	void (*)(struct gen_pool *, struct gen_pool_chunk *, void *), void *);
 extern size_t gen_pool_avail(struct gen_pool *);
diff --git a/lib/genalloc.c b/lib/genalloc.c
index 7e85d1e37a6e..770c769d7cb7 100644
--- a/lib/genalloc.c
+++ b/lib/genalloc.c
@@ -168,20 +168,21 @@ struct gen_pool *gen_pool_create(int min_alloc_order, int nid)
 EXPORT_SYMBOL(gen_pool_create);
 
 /**
- * gen_pool_add_virt - add a new chunk of special memory to the pool
+ * gen_pool_add_owner- add a new chunk of special memory to the pool
  * @pool: pool to add new memory chunk to
  * @virt: virtual starting address of memory chunk to add to pool
  * @phys: physical starting address of memory chunk to add to pool
  * @size: size in bytes of the memory chunk to add to pool
  * @nid: node id of the node the chunk structure and bitmap should be
  *       allocated on, or -1
+ * @owner: private data the publisher would like to recall at alloc time
  *
  * Add a new chunk of special memory to the specified pool.
  *
  * Returns 0 on success or a -ve errno on failure.
  */
-int gen_pool_add_virt(struct gen_pool *pool, unsigned long virt, phys_addr_t phys,
-		 size_t size, int nid)
+int gen_pool_add_owner(struct gen_pool *pool, unsigned long virt, phys_addr_t phys,
+		 size_t size, int nid, void *owner)
 {
 	struct gen_pool_chunk *chunk;
 	int nbits = size >> pool->min_alloc_order;
@@ -195,6 +196,7 @@ int gen_pool_add_virt(struct gen_pool *pool, unsigned long virt, phys_addr_t phy
 	chunk->phys_addr = phys;
 	chunk->start_addr = virt;
 	chunk->end_addr = virt + size - 1;
+	chunk->owner = owner;
 	atomic_long_set(&chunk->avail, size);
 
 	spin_lock(&pool->lock);
@@ -203,7 +205,7 @@ int gen_pool_add_virt(struct gen_pool *pool, unsigned long virt, phys_addr_t phy
 
 	return 0;
 }
-EXPORT_SYMBOL(gen_pool_add_virt);
+EXPORT_SYMBOL(gen_pool_add_owner);
 
 /**
  * gen_pool_virt_to_phys - return the physical address of memory
@@ -260,35 +262,20 @@ void gen_pool_destroy(struct gen_pool *pool)
 EXPORT_SYMBOL(gen_pool_destroy);
 
 /**
- * gen_pool_alloc - allocate special memory from the pool
- * @pool: pool to allocate from
- * @size: number of bytes to allocate from the pool
- *
- * Allocate the requested number of bytes from the specified pool.
- * Uses the pool allocation function (with first-fit algorithm by default).
- * Can not be used in NMI handler on architectures without
- * NMI-safe cmpxchg implementation.
- */
-unsigned long gen_pool_alloc(struct gen_pool *pool, size_t size)
-{
-	return gen_pool_alloc_algo(pool, size, pool->algo, pool->data);
-}
-EXPORT_SYMBOL(gen_pool_alloc);
-
-/**
- * gen_pool_alloc_algo - allocate special memory from the pool
+ * gen_pool_alloc_algo_owner - allocate special memory from the pool
  * @pool: pool to allocate from
  * @size: number of bytes to allocate from the pool
  * @algo: algorithm passed from caller
  * @data: data passed to algorithm
+ * @owner: optionally retrieve the chunk owner
  *
  * Allocate the requested number of bytes from the specified pool.
  * Uses the pool allocation function (with first-fit algorithm by default).
  * Can not be used in NMI handler on architectures without
  * NMI-safe cmpxchg implementation.
  */
-unsigned long gen_pool_alloc_algo(struct gen_pool *pool, size_t size,
-		genpool_algo_t algo, void *data)
+unsigned long gen_pool_alloc_algo_owner(struct gen_pool *pool, size_t size,
+		genpool_algo_t algo, void *data, void **owner)
 {
 	struct gen_pool_chunk *chunk;
 	unsigned long addr = 0;
@@ -299,6 +286,9 @@ unsigned long gen_pool_alloc_algo(struct gen_pool *pool, size_t size,
 	BUG_ON(in_nmi());
 #endif
 
+	if (owner)
+		*owner = NULL;
+
 	if (size == 0)
 		return 0;
 
@@ -326,12 +316,14 @@ unsigned long gen_pool_alloc_algo(struct gen_pool *pool, size_t size,
 		addr = chunk->start_addr + ((unsigned long)start_bit << order);
 		size = nbits << order;
 		atomic_long_sub(size, &chunk->avail);
+		if (owner)
+			*owner = chunk->owner;
 		break;
 	}
 	rcu_read_unlock();
 	return addr;
 }
-EXPORT_SYMBOL(gen_pool_alloc_algo);
+EXPORT_SYMBOL(gen_pool_alloc_algo_owner);
 
 /**
  * gen_pool_dma_alloc - allocate special memory from the pool for DMA usage
@@ -367,12 +359,14 @@ EXPORT_SYMBOL(gen_pool_dma_alloc);
  * @pool: pool to free to
  * @addr: starting address of memory to free back to pool
  * @size: size in bytes of memory to free
+ * @owner: private data stashed at gen_pool_add() time
  *
  * Free previously allocated special memory back to the specified
  * pool.  Can not be used in NMI handler on architectures without
  * NMI-safe cmpxchg implementation.
  */
-void gen_pool_free(struct gen_pool *pool, unsigned long addr, size_t size)
+void gen_pool_free_owner(struct gen_pool *pool, unsigned long addr, size_t size,
+		void **owner)
 {
 	struct gen_pool_chunk *chunk;
 	int order = pool->min_alloc_order;
@@ -382,6 +376,9 @@ void gen_pool_free(struct gen_pool *pool, unsigned long addr, size_t size)
 	BUG_ON(in_nmi());
 #endif
 
+	if (owner)
+		*owner = NULL;
+
 	nbits = (size + (1UL << order) - 1) >> order;
 	rcu_read_lock();
 	list_for_each_entry_rcu(chunk, &pool->chunks, next_chunk) {
@@ -392,6 +389,8 @@ void gen_pool_free(struct gen_pool *pool, unsigned long addr, size_t size)
 			BUG_ON(remain);
 			size = nbits << order;
 			atomic_long_add(size, &chunk->avail);
+			if (owner)
+				*owner = chunk->owner;
 			rcu_read_unlock();
 			return;
 		}
@@ -399,7 +398,7 @@ void gen_pool_free(struct gen_pool *pool, unsigned long addr, size_t size)
 	rcu_read_unlock();
 	BUG();
 }
-EXPORT_SYMBOL(gen_pool_free);
+EXPORT_SYMBOL(gen_pool_free_owner);
 
 /**
  * gen_pool_for_each_chunk - call func for every chunk of generic memory pool


* [PATCH 5/6] pci/p2pdma: Track pgmap references per resource, not globally
  2019-03-29 15:27 [PATCH 0/6] mm/devm_memremap_pages: Fix page release race Dan Williams
                   ` (3 preceding siblings ...)
  2019-03-29 15:27 ` [PATCH 4/6] lib/genalloc: Introduce chunk owners Dan Williams
@ 2019-03-29 15:27 ` Dan Williams
  2019-03-29 17:50   ` Logan Gunthorpe
  2019-03-29 15:27 ` [PATCH 6/6] mm/devm_memremap_pages: Fix final page put race Dan Williams
  5 siblings, 1 reply; 11+ messages in thread
From: Dan Williams @ 2019-03-29 15:27 UTC (permalink / raw)
  To: akpm
  Cc: Logan Gunthorpe, Bjorn Helgaas, Christoph Hellwig, linux-mm,
	linux-pci, linux-nvdimm, linux-kernel

In preparation for fixing a race between devm_memremap_pages_release()
and the final put of a page from the device-page-map, allocate a
percpu-ref per p2pdma resource mapping.

Cc: Logan Gunthorpe <logang@deltatee.com>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/pci/p2pdma.c |  114 ++++++++++++++++++++++++++++++++------------------
 1 file changed, 73 insertions(+), 41 deletions(-)

diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
index 595a534bd749..1b96c1688715 100644
--- a/drivers/pci/p2pdma.c
+++ b/drivers/pci/p2pdma.c
@@ -20,12 +20,16 @@
 #include <linux/seq_buf.h>
 
 struct pci_p2pdma {
-	struct percpu_ref devmap_ref;
-	struct completion devmap_ref_done;
 	struct gen_pool *pool;
 	bool p2pmem_published;
 };
 
+struct p2pdma_pagemap {
+	struct dev_pagemap pgmap;
+	struct percpu_ref ref;
+	struct completion ref_done;
+};
+
 static ssize_t size_show(struct device *dev, struct device_attribute *attr,
 			 char *buf)
 {
@@ -74,28 +78,31 @@ static const struct attribute_group p2pmem_group = {
 	.name = "p2pmem",
 };
 
+static struct p2pdma_pagemap *to_p2p_pgmap(struct percpu_ref *ref)
+{
+	return container_of(ref, struct p2pdma_pagemap, ref);
+}
+
 static void pci_p2pdma_percpu_release(struct percpu_ref *ref)
 {
-	struct pci_p2pdma *p2p =
-		container_of(ref, struct pci_p2pdma, devmap_ref);
+	struct p2pdma_pagemap *p2p_pgmap = to_p2p_pgmap(ref);
 
-	complete_all(&p2p->devmap_ref_done);
+	complete(&p2p_pgmap->ref_done);
 }
 
 static void pci_p2pdma_percpu_kill(struct percpu_ref *ref)
 {
-	/*
-	 * pci_p2pdma_add_resource() may be called multiple times
-	 * by a driver and may register the percpu_kill devm action multiple
-	 * times. We only want the first action to actually kill the
-	 * percpu_ref.
-	 */
-	if (percpu_ref_is_dying(ref))
-		return;
-
 	percpu_ref_kill(ref);
 }
 
+static void pci_p2pdma_percpu_cleanup(void *ref)
+{
+	struct p2pdma_pagemap *p2p_pgmap = to_p2p_pgmap(ref);
+
+	wait_for_completion(&p2p_pgmap->ref_done);
+	percpu_ref_exit(&p2p_pgmap->ref);
+}
+
 static void pci_p2pdma_release(void *data)
 {
 	struct pci_dev *pdev = data;
@@ -103,12 +110,12 @@ static void pci_p2pdma_release(void *data)
 	if (!pdev->p2pdma)
 		return;
 
-	wait_for_completion(&pdev->p2pdma->devmap_ref_done);
-	percpu_ref_exit(&pdev->p2pdma->devmap_ref);
+	/* Flush and disable pci_alloc_p2p_mem() */
+	pdev->p2pdma = NULL;
+	synchronize_rcu();
 
 	gen_pool_destroy(pdev->p2pdma->pool);
 	sysfs_remove_group(&pdev->dev.kobj, &p2pmem_group);
-	pdev->p2pdma = NULL;
 }
 
 static int pci_p2pdma_setup(struct pci_dev *pdev)
@@ -124,12 +131,6 @@ static int pci_p2pdma_setup(struct pci_dev *pdev)
 	if (!p2p->pool)
 		goto out;
 
-	init_completion(&p2p->devmap_ref_done);
-	error = percpu_ref_init(&p2p->devmap_ref,
-			pci_p2pdma_percpu_release, 0, GFP_KERNEL);
-	if (error)
-		goto out_pool_destroy;
-
 	error = devm_add_action_or_reset(&pdev->dev, pci_p2pdma_release, pdev);
 	if (error)
 		goto out_pool_destroy;
@@ -163,6 +164,7 @@ static int pci_p2pdma_setup(struct pci_dev *pdev)
 int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size,
 			    u64 offset)
 {
+	struct p2pdma_pagemap *p2p_pgmap;
 	struct dev_pagemap *pgmap;
 	void *addr;
 	int error;
@@ -185,14 +187,32 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size,
 			return error;
 	}
 
-	pgmap = devm_kzalloc(&pdev->dev, sizeof(*pgmap), GFP_KERNEL);
-	if (!pgmap)
+	p2p_pgmap = devm_kzalloc(&pdev->dev, sizeof(*p2p_pgmap), GFP_KERNEL);
+	if (!p2p_pgmap)
 		return -ENOMEM;
 
+	init_completion(&p2p_pgmap->ref_done);
+	error = percpu_ref_init(&p2p_pgmap->ref,
+			pci_p2pdma_percpu_release, 0, GFP_KERNEL);
+	if (error)
+		goto pgmap_free;
+
+	/*
+	 * FIXME: the percpu_ref_exit needs to be coordinated internal
+	 * to devm_memremap_pages_release(). Duplicate the same ordering
+	 * as other devm_memremap_pages() users for now.
+	 */
+	error = devm_add_action(&pdev->dev, pci_p2pdma_percpu_cleanup,
+			&p2p_pgmap->ref);
+	if (error)
+		goto ref_cleanup;
+
+	pgmap = &p2p_pgmap->pgmap;
+
 	pgmap->res.start = pci_resource_start(pdev, bar) + offset;
 	pgmap->res.end = pgmap->res.start + size - 1;
 	pgmap->res.flags = pci_resource_flags(pdev, bar);
-	pgmap->ref = &pdev->p2pdma->devmap_ref;
+	pgmap->ref = &p2p_pgmap->ref;
 	pgmap->type = MEMORY_DEVICE_PCI_P2PDMA;
 	pgmap->pci_p2pdma_bus_offset = pci_bus_address(pdev, bar) -
 		pci_resource_start(pdev, bar);
@@ -201,12 +221,13 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size,
 	addr = devm_memremap_pages(&pdev->dev, pgmap);
 	if (IS_ERR(addr)) {
 		error = PTR_ERR(addr);
-		goto pgmap_free;
+		goto ref_exit;
 	}
 
-	error = gen_pool_add_virt(pdev->p2pdma->pool, (unsigned long)addr,
+	error = gen_pool_add_owner(pdev->p2pdma->pool, (unsigned long)addr,
 			pci_bus_address(pdev, bar) + offset,
-			resource_size(&pgmap->res), dev_to_node(&pdev->dev));
+			resource_size(&pgmap->res), dev_to_node(&pdev->dev),
+			&p2p_pgmap->ref);
 	if (error)
 		goto pages_free;
 
@@ -217,8 +238,10 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size,
 
 pages_free:
 	devm_memunmap_pages(&pdev->dev, pgmap);
+ref_cleanup:
+	percpu_ref_exit(&p2p_pgmap->ref);
 pgmap_free:
-	devm_kfree(&pdev->dev, pgmap);
+	devm_kfree(&pdev->dev, p2p_pgmap);
 	return error;
 }
 EXPORT_SYMBOL_GPL(pci_p2pdma_add_resource);
@@ -555,19 +578,25 @@ EXPORT_SYMBOL_GPL(pci_p2pmem_find_many);
  */
 void *pci_alloc_p2pmem(struct pci_dev *pdev, size_t size)
 {
-	void *ret;
+	void *ret = NULL;
+	struct percpu_ref *ref;
 
+	rcu_read_lock();
 	if (unlikely(!pdev->p2pdma))
-		return NULL;
-
-	if (unlikely(!percpu_ref_tryget_live(&pdev->p2pdma->devmap_ref)))
-		return NULL;
-
-	ret = (void *)gen_pool_alloc(pdev->p2pdma->pool, size);
+		goto out;
 
-	if (unlikely(!ret))
-		percpu_ref_put(&pdev->p2pdma->devmap_ref);
+	ret = (void *)gen_pool_alloc_owner(pdev->p2pdma->pool, size,
+			(void **) &ref);
+	if (!ret)
+		goto out;
 
+	if (unlikely(!percpu_ref_tryget_live(ref))) {
+		gen_pool_free(pdev->p2pdma->pool, (unsigned long) ret, size);
+		ret = NULL;
+		goto out;
+	}
+out:
+	rcu_read_unlock();
 	return ret;
 }
 EXPORT_SYMBOL_GPL(pci_alloc_p2pmem);
@@ -580,8 +609,11 @@ EXPORT_SYMBOL_GPL(pci_alloc_p2pmem);
  */
 void pci_free_p2pmem(struct pci_dev *pdev, void *addr, size_t size)
 {
-	gen_pool_free(pdev->p2pdma->pool, (uintptr_t)addr, size);
-	percpu_ref_put(&pdev->p2pdma->devmap_ref);
+	struct percpu_ref *ref;
+
+	gen_pool_free_owner(pdev->p2pdma->pool, (uintptr_t)addr, size,
+			(void **) &ref);
+	percpu_ref_put(ref);
 }
 EXPORT_SYMBOL_GPL(pci_free_p2pmem);
 


* [PATCH 6/6] mm/devm_memremap_pages: Fix final page put race
  2019-03-29 15:27 [PATCH 0/6] mm/devm_memremap_pages: Fix page release race Dan Williams
                   ` (4 preceding siblings ...)
  2019-03-29 15:27 ` [PATCH 5/6] pci/p2pdma: Track pgmap references per resource, not globally Dan Williams
@ 2019-03-29 15:27 ` Dan Williams
  2019-03-29  9:46   ` Ira Weiny
  5 siblings, 1 reply; 11+ messages in thread
From: Dan Williams @ 2019-03-29 15:27 UTC (permalink / raw)
  To: akpm
  Cc: Logan Gunthorpe, Ira Weiny, Bjorn Helgaas,
	Jérôme Glisse, Christoph Hellwig, linux-mm, linux-pci,
	linux-nvdimm, linux-kernel

Logan noticed that devm_memremap_pages_release() kills the percpu_ref,
drops all the page references that were acquired at init, and then
immediately proceeds to unplug, via arch_remove_memory(), the backing pages
for the pagemap. If for some reason device shutdown actually collides
with a busy / elevated-ref-count page then arch_remove_memory() should
be deferred until after that reference is dropped.

As it stands the "wait for last page ref drop" happens *after*
devm_memremap_pages_release() returns, which is obviously too late and
can lead to crashes.

Fix this situation by assigning the responsibility to wait for the
percpu_ref to go idle to devm_memremap_pages() with a new ->cleanup()
callback. Implement the new cleanup callback for all
devm_memremap_pages() users: pmem, devdax, hmm, and p2pdma.

Reported-by: Logan Gunthorpe <logang@deltatee.com>
Fixes: 41e94a851304 ("add devm_memremap_pages")
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: "Jérôme Glisse" <jglisse@redhat.com>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/dax/device.c              |   13 +++----------
 drivers/nvdimm/pmem.c             |   17 +++++++++++++----
 drivers/pci/p2pdma.c              |   17 +++--------------
 include/linux/memremap.h          |    2 ++
 kernel/memremap.c                 |   17 ++++++++++++-----
 mm/hmm.c                          |   14 +++-----------
 tools/testing/nvdimm/test/iomap.c |    2 ++
 7 files changed, 38 insertions(+), 44 deletions(-)

diff --git a/drivers/dax/device.c b/drivers/dax/device.c
index e428468ab661..e3aa78dd1bb0 100644
--- a/drivers/dax/device.c
+++ b/drivers/dax/device.c
@@ -27,9 +27,8 @@ static void dev_dax_percpu_release(struct percpu_ref *ref)
 	complete(&dev_dax->cmp);
 }
 
-static void dev_dax_percpu_exit(void *data)
+static void dev_dax_percpu_exit(struct percpu_ref *ref)
 {
-	struct percpu_ref *ref = data;
 	struct dev_dax *dev_dax = ref_to_dev_dax(ref);
 
 	dev_dbg(&dev_dax->dev, "%s\n", __func__);
@@ -468,18 +467,12 @@ int dev_dax_probe(struct device *dev)
 	if (rc)
 		return rc;
 
-	rc = devm_add_action_or_reset(dev, dev_dax_percpu_exit, &dev_dax->ref);
-	if (rc)
-		return rc;
-
 	dev_dax->pgmap.ref = &dev_dax->ref;
 	dev_dax->pgmap.kill = dev_dax_percpu_kill;
+	dev_dax->pgmap.cleanup = dev_dax_percpu_exit;
 	addr = devm_memremap_pages(dev, &dev_dax->pgmap);
-	if (IS_ERR(addr)) {
-		devm_remove_action(dev, dev_dax_percpu_exit, &dev_dax->ref);
-		percpu_ref_exit(&dev_dax->ref);
+	if (IS_ERR(addr))
 		return PTR_ERR(addr);
-	}
 
 	inode = dax_inode(dax_dev);
 	cdev = inode->i_cdev;
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index bc2f700feef8..507b9eda42aa 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -304,11 +304,19 @@ static const struct attribute_group *pmem_attribute_groups[] = {
 	NULL,
 };
 
-static void pmem_release_queue(void *q)
+static void __pmem_release_queue(struct percpu_ref *ref)
 {
+	struct request_queue *q;
+
+	q = container_of(ref, typeof(*q), q_usage_counter);
 	blk_cleanup_queue(q);
 }
 
+static void pmem_release_queue(void *ref)
+{
+	__pmem_release_queue(ref);
+}
+
 static void pmem_freeze_queue(struct percpu_ref *ref)
 {
 	struct request_queue *q;
@@ -400,12 +408,10 @@ static int pmem_attach_disk(struct device *dev,
 	if (!q)
 		return -ENOMEM;
 
-	if (devm_add_action_or_reset(dev, pmem_release_queue, q))
-		return -ENOMEM;
-
 	pmem->pfn_flags = PFN_DEV;
 	pmem->pgmap.ref = &q->q_usage_counter;
 	pmem->pgmap.kill = pmem_freeze_queue;
+	pmem->pgmap.cleanup = __pmem_release_queue;
 	if (is_nd_pfn(dev)) {
 		if (setup_pagemap_fsdax(dev, &pmem->pgmap))
 			return -ENOMEM;
@@ -426,6 +432,9 @@ static int pmem_attach_disk(struct device *dev,
 		pmem->pfn_flags |= PFN_MAP;
 		memcpy(&bb_res, &pmem->pgmap.res, sizeof(bb_res));
 	} else {
+		if (devm_add_action_or_reset(dev, pmem_release_queue,
+					&q->q_usage_counter))
+			return -ENOMEM;
 		addr = devm_memremap(dev, pmem->phys_addr,
 				pmem->size, ARCH_MEMREMAP_PMEM);
 		memcpy(&bb_res, &nsio->res, sizeof(bb_res));
diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
index 1b96c1688715..7ff5b8067670 100644
--- a/drivers/pci/p2pdma.c
+++ b/drivers/pci/p2pdma.c
@@ -95,7 +95,7 @@ static void pci_p2pdma_percpu_kill(struct percpu_ref *ref)
 	percpu_ref_kill(ref);
 }
 
-static void pci_p2pdma_percpu_cleanup(void *ref)
+static void pci_p2pdma_percpu_cleanup(struct percpu_ref *ref)
 {
 	struct p2pdma_pagemap *p2p_pgmap = to_p2p_pgmap(ref);
 
@@ -197,16 +197,6 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size,
 	if (error)
 		goto pgmap_free;
 
-	/*
-	 * FIXME: the percpu_ref_exit needs to be coordinated internal
-	 * to devm_memremap_pages_release(). Duplicate the same ordering
-	 * as other devm_memremap_pages() users for now.
-	 */
-	error = devm_add_action(&pdev->dev, pci_p2pdma_percpu_cleanup,
-			&p2p_pgmap->ref);
-	if (error)
-		goto ref_cleanup;
-
 	pgmap = &p2p_pgmap->pgmap;
 
 	pgmap->res.start = pci_resource_start(pdev, bar) + offset;
@@ -217,11 +207,12 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size,
 	pgmap->pci_p2pdma_bus_offset = pci_bus_address(pdev, bar) -
 		pci_resource_start(pdev, bar);
 	pgmap->kill = pci_p2pdma_percpu_kill;
+	pgmap->cleanup = pci_p2pdma_percpu_cleanup;
 
 	addr = devm_memremap_pages(&pdev->dev, pgmap);
 	if (IS_ERR(addr)) {
 		error = PTR_ERR(addr);
-		goto ref_exit;
+		goto pgmap_free;
 	}
 
 	error = gen_pool_add_owner(pdev->p2pdma->pool, (unsigned long)addr,
@@ -238,8 +229,6 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size,
 
 pages_free:
 	devm_memunmap_pages(&pdev->dev, pgmap);
-ref_cleanup:
-	percpu_ref_exit(&p2p_pgmap->ref);
 pgmap_free:
 	devm_kfree(&pdev->dev, p2p_pgmap);
 	return error;
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index 7601ee314c4a..1732dea030b2 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -81,6 +81,7 @@ typedef void (*dev_page_free_t)(struct page *page, void *data);
  * @res: physical address range covered by @ref
  * @ref: reference count that pins the devm_memremap_pages() mapping
  * @kill: callback to transition @ref to the dead state
+ * @cleanup: callback to wait for @ref to be idle and reap it
  * @dev: host device of the mapping for debug
  * @data: private data pointer for page_free()
  * @type: memory type: see MEMORY_* in memory_hotplug.h
@@ -92,6 +93,7 @@ struct dev_pagemap {
 	struct resource res;
 	struct percpu_ref *ref;
 	void (*kill)(struct percpu_ref *ref);
+	void (*cleanup)(struct percpu_ref *ref);
 	struct device *dev;
 	void *data;
 	enum memory_type type;
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 65afbacab44e..05d1af5a2f15 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -96,6 +96,7 @@ static void devm_memremap_pages_release(void *data)
 	pgmap->kill(pgmap->ref);
 	for_each_device_pfn(pfn, pgmap)
 		put_page(pfn_to_page(pfn));
+	pgmap->cleanup(pgmap->ref);
 
 	/* pages are dead and unused, undo the arch mapping */
 	align_start = res->start & ~(SECTION_SIZE - 1);
@@ -134,8 +135,8 @@ static void devm_memremap_pages_release(void *data)
  * 2/ The altmap field may optionally be initialized, in which case altmap_valid
  *    must be set to true
  *
- * 3/ pgmap->ref must be 'live' on entry and will be killed at
- *    devm_memremap_pages_release() time, or if this routine fails.
+ * 3/ pgmap->ref must be 'live' on entry and will be killed and reaped
+ *    at devm_memremap_pages_release() time, or if this routine fails.
  *
  * 4/ res is expected to be a host memory range that could feasibly be
  *    treated as a "System RAM" range, i.e. not a device mmio range, but
@@ -151,8 +152,10 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
 	pgprot_t pgprot = PAGE_KERNEL;
 	int error, nid, is_ram;
 
-	if (!pgmap->ref || !pgmap->kill)
+	if (!pgmap->ref || !pgmap->kill || !pgmap->cleanup) {
+		WARN(1, "Missing reference count teardown definition\n");
 		return ERR_PTR(-EINVAL);
+	}
 
 	align_start = res->start & ~(SECTION_SIZE - 1);
 	align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE)
@@ -163,14 +166,16 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
 	if (conflict_pgmap) {
 		dev_WARN(dev, "Conflicting mapping in same section\n");
 		put_dev_pagemap(conflict_pgmap);
-		return ERR_PTR(-ENOMEM);
+		error = -ENOMEM;
+		goto err_array;
 	}
 
 	conflict_pgmap = get_dev_pagemap(PHYS_PFN(align_end), NULL);
 	if (conflict_pgmap) {
 		dev_WARN(dev, "Conflicting mapping in same section\n");
 		put_dev_pagemap(conflict_pgmap);
-		return ERR_PTR(-ENOMEM);
+		error = -ENOMEM;
+		goto err_array;
 	}
 
 	is_ram = region_intersects(align_start, align_size,
@@ -262,6 +267,8 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
 	pgmap_array_delete(res);
  err_array:
 	pgmap->kill(pgmap->ref);
+	pgmap->cleanup(pgmap->ref);
+
 	return ERR_PTR(error);
 }
 EXPORT_SYMBOL_GPL(devm_memremap_pages);
diff --git a/mm/hmm.c b/mm/hmm.c
index fe1cd87e49ac..225ade644058 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -975,9 +975,8 @@ static void hmm_devmem_ref_release(struct percpu_ref *ref)
 	complete(&devmem->completion);
 }
 
-static void hmm_devmem_ref_exit(void *data)
+static void hmm_devmem_ref_exit(struct percpu_ref *ref)
 {
-	struct percpu_ref *ref = data;
 	struct hmm_devmem *devmem;
 
 	devmem = container_of(ref, struct hmm_devmem, ref);
@@ -1054,10 +1053,6 @@ struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops,
 	if (ret)
 		return ERR_PTR(ret);
 
-	ret = devm_add_action_or_reset(device, hmm_devmem_ref_exit, &devmem->ref);
-	if (ret)
-		return ERR_PTR(ret);
-
 	size = ALIGN(size, PA_SECTION_SIZE);
 	addr = min((unsigned long)iomem_resource.end,
 		   (1UL << MAX_PHYSMEM_BITS) - 1);
@@ -1096,6 +1091,7 @@ struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops,
 	devmem->pagemap.ref = &devmem->ref;
 	devmem->pagemap.data = devmem;
 	devmem->pagemap.kill = hmm_devmem_ref_kill;
+	devmem->pagemap.cleanup = hmm_devmem_ref_exit;
 
 	result = devm_memremap_pages(devmem->device, &devmem->pagemap);
 	if (IS_ERR(result))
@@ -1133,11 +1129,6 @@ struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops,
 	if (ret)
 		return ERR_PTR(ret);
 
-	ret = devm_add_action_or_reset(device, hmm_devmem_ref_exit,
-			&devmem->ref);
-	if (ret)
-		return ERR_PTR(ret);
-
 	devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT;
 	devmem->pfn_last = devmem->pfn_first +
 			   (resource_size(devmem->resource) >> PAGE_SHIFT);
@@ -1150,6 +1141,7 @@ struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops,
 	devmem->pagemap.ref = &devmem->ref;
 	devmem->pagemap.data = devmem;
 	devmem->pagemap.kill = hmm_devmem_ref_kill;
+	devmem->pagemap.cleanup = hmm_devmem_ref_exit;
 
 	result = devm_memremap_pages(devmem->device, &devmem->pagemap);
 	if (IS_ERR(result))
diff --git a/tools/testing/nvdimm/test/iomap.c b/tools/testing/nvdimm/test/iomap.c
index c6635fee27d8..219dd0a1cb08 100644
--- a/tools/testing/nvdimm/test/iomap.c
+++ b/tools/testing/nvdimm/test/iomap.c
@@ -108,7 +108,9 @@ static void nfit_test_kill(void *_pgmap)
 {
 	struct dev_pagemap *pgmap = _pgmap;
 
+	WARN_ON(!pgmap || !pgmap->ref || !pgmap->kill || !pgmap->cleanup);
 	pgmap->kill(pgmap->ref);
+	pgmap->cleanup(pgmap->ref);
 }
 
 void *__wrap_devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)


* Re: [PATCH 3/6] pci/p2pdma: Fix the gen_pool_add_virt() failure path
  2019-03-29 15:27 ` [PATCH 3/6] pci/p2pdma: Fix the gen_pool_add_virt() failure path Dan Williams
@ 2019-03-29 17:24   ` Bjorn Helgaas
  0 siblings, 0 replies; 11+ messages in thread
From: Bjorn Helgaas @ 2019-03-29 17:24 UTC (permalink / raw)
  To: Dan Williams
  Cc: akpm, Logan Gunthorpe, Ira Weiny, Christoph Hellwig, linux-mm,
	linux-pci, linux-nvdimm, linux-kernel

On Fri, Mar 29, 2019 at 08:27:39AM -0700, Dan Williams wrote:
> The pci_p2pdma_add_resource() implementation immediately frees the pgmap
> if gen_pool_add_virt() fails. However, that means that when @dev
> triggers a devres release, devm_memremap_pages_release() will crash
> trying to access the freed @pgmap.
> 
> Use the new devm_memunmap_pages() to manually free the mapping in the
> error path.
> 
> Fixes: 52916982af48 ("PCI/P2PDMA: Support peer-to-peer memory")
> Cc: Logan Gunthorpe <logang@deltatee.com>
> Cc: Ira Weiny <ira.weiny@intel.com>
> Cc: Bjorn Helgaas <bhelgaas@google.com>
> Cc: Christoph Hellwig <hch@lst.de>
> Signed-off-by: Dan Williams <dan.j.williams@intel.com>

Especially if you run "git log --oneline drivers/pci/p2pdma.c" and make
yours match :),

Acked-by: Bjorn Helgaas <bhelgaas@google.com>

> ---
>  drivers/pci/p2pdma.c |    4 +++-
>  1 file changed, 3 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
> index c52298d76e64..595a534bd749 100644
> --- a/drivers/pci/p2pdma.c
> +++ b/drivers/pci/p2pdma.c
> @@ -208,13 +208,15 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size,
>  			pci_bus_address(pdev, bar) + offset,
>  			resource_size(&pgmap->res), dev_to_node(&pdev->dev));
>  	if (error)
> -		goto pgmap_free;
> +		goto pages_free;
>  
>  	pci_info(pdev, "added peer-to-peer DMA memory %pR\n",
>  		 &pgmap->res);
>  
>  	return 0;
>  
> +pages_free:
> +	devm_memunmap_pages(&pdev->dev, pgmap);
>  pgmap_free:
>  	devm_kfree(&pdev->dev, pgmap);
>  	return error;
> 

* Re: [PATCH 5/6] pci/p2pdma: Track pgmap references per resource, not globally
  2019-03-29 15:27 ` [PATCH 5/6] pci/p2pdma: Track pgmap references per resource, not globally Dan Williams
@ 2019-03-29 17:50   ` Logan Gunthorpe
  2019-03-29 19:32     ` Dan Williams
  0 siblings, 1 reply; 11+ messages in thread
From: Logan Gunthorpe @ 2019-03-29 17:50 UTC (permalink / raw)
  To: Dan Williams, akpm
  Cc: Bjorn Helgaas, Christoph Hellwig, linux-mm, linux-pci,
	linux-nvdimm, linux-kernel

Thanks Dan, this is great. I think the changes in this series are
cleaner and more understandable than the patch set I had sent earlier.

However, I found a couple minor issues with this patch:

On 2019-03-29 9:27 a.m., Dan Williams wrote:
>  static void pci_p2pdma_release(void *data)
>  {
>  	struct pci_dev *pdev = data;
> @@ -103,12 +110,12 @@ static void pci_p2pdma_release(void *data)
>  	if (!pdev->p2pdma)
>  		return;
>  
> -	wait_for_completion(&pdev->p2pdma->devmap_ref_done);
> -	percpu_ref_exit(&pdev->p2pdma->devmap_ref);
> +	/* Flush and disable pci_alloc_p2p_mem() */
> +	pdev->p2pdma = NULL;
> +	synchronize_rcu();
>  
>  	gen_pool_destroy(pdev->p2pdma->pool);

I missed this on my initial review, but it became obvious when I tried
to test the series: this is a NULL dereference, since pdev->p2pdma was
set to NULL a few lines up.

When I fix this by storing p2pdma in a local variable, the patch set
works and never seems to crash when I hot remove p2pdma memory.
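
Roughly, what that local-variable fix looks like (sketched against the
hunk below, not a formal patch):

    static void pci_p2pdma_release(void *data)
    {
            struct pci_dev *pdev = data;
            struct pci_p2pdma *p2pdma = pdev->p2pdma;

            if (!p2pdma)
                    return;

            /* Flush and disable pci_alloc_p2p_mem() */
            pdev->p2pdma = NULL;
            synchronize_rcu();

            gen_pool_destroy(p2pdma->pool);
            sysfs_remove_group(&pdev->dev.kobj, &p2pmem_group);
    }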

>  void *pci_alloc_p2pmem(struct pci_dev *pdev, size_t size)
>  {
> -	void *ret;
> +	void *ret = NULL;
> +	struct percpu_ref *ref;
>  
> +	rcu_read_lock();
>  	if (unlikely(!pdev->p2pdma))
> -		return NULL;

Using RCU here makes sense to me; however, I expect we should be using
the proper rcu_assign_pointer(), rcu_dereference(), and __rcu tag with
pdev->p2pdma, if only to better document what's being protected by the
new RCU calls.

Logan

* Re: [PATCH 5/6] pci/p2pdma: Track pgmap references per resource, not globally
  2019-03-29 17:50   ` Logan Gunthorpe
@ 2019-03-29 19:32     ` Dan Williams
  0 siblings, 0 replies; 11+ messages in thread
From: Dan Williams @ 2019-03-29 19:32 UTC (permalink / raw)
  To: Logan Gunthorpe
  Cc: Andrew Morton, Bjorn Helgaas, Christoph Hellwig, Linux MM,
	linux-pci, linux-nvdimm, Linux Kernel Mailing List

On Fri, Mar 29, 2019 at 10:50 AM Logan Gunthorpe <logang@deltatee.com> wrote:
>
> Thanks Dan, this is great. I think the changes in this series are
> cleaner and more understandable than the patch set I had sent earlier.
>
> However, I found a couple minor issues with this patch:
>
> On 2019-03-29 9:27 a.m., Dan Williams wrote:
> >  static void pci_p2pdma_release(void *data)
> >  {
> >       struct pci_dev *pdev = data;
> > @@ -103,12 +110,12 @@ static void pci_p2pdma_release(void *data)
> >       if (!pdev->p2pdma)
> >               return;
> >
> > -     wait_for_completion(&pdev->p2pdma->devmap_ref_done);
> > -     percpu_ref_exit(&pdev->p2pdma->devmap_ref);
> > +     /* Flush and disable pci_alloc_p2p_mem() */
> > +     pdev->p2pdma = NULL;
> > +     synchronize_rcu();
> >
> >       gen_pool_destroy(pdev->p2pdma->pool);
>
> I missed this on my initial review, but it became obvious when I tried
> to test the series: this is a NULL dereference, since pdev->p2pdma was
> set to NULL a few lines up.

Ah, yup.

> When I fix this by storing p2pdma in a local variable, the patch set
> works and never seems to crash when I hot remove p2pdma memory.

Great!

>
> >  void *pci_alloc_p2pmem(struct pci_dev *pdev, size_t size)
> >  {
> > -     void *ret;
> > +     void *ret = NULL;
> > +     struct percpu_ref *ref;
> >
> > +     rcu_read_lock();
> >       if (unlikely(!pdev->p2pdma))
> > -             return NULL;
>
> Using RCU here makes sense to me; however, I expect we should be using
> the proper rcu_assign_pointer(), rcu_dereference(), and __rcu tag with
> pdev->p2pdma, if only to better document what's being protected by the
> new RCU calls.

I think we can just add a comment, because those helpers are for cases
where the RCU-protected pointer is allowed to race the teardown. In this
case we're using RCU just as a barrier to force the NULL check to
resolve.
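
Something along these lines, as a sketch of the comment idea (wording
is illustrative):

    /*
     * RCU is only used as a barrier here: pci_p2pdma_release() clears
     * pdev->p2pdma and then calls synchronize_rcu() before destroying
     * the pool, so any reader that saw a non-NULL pointer has left
     * this section before the teardown proceeds.
     */
    rcu_read_lock();
    if (unlikely(!pdev->p2pdma))
            goto out;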
