linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Dan Williams <dan.j.williams@intel.com>
To: akpm@linux-foundation.org
Cc: linux-mm@kvack.org, "Jérôme Glisse" <jglisse@redhat.com>,
	"Logan Gunthorpe" <logang@deltatee.com>,
	linux-kernel@vger.kernel.org,
	"Kirill Shutemov" <kirill.shutemov@linux.intel.com>
Subject: [PATCH] mm, zone_device: replace {get, put}_zone_device_page() with a single reference
Date: Wed, 26 Apr 2017 17:55:31 -0700	[thread overview]
Message-ID: <149325431313.40660.7404075559824162131.stgit@dwillia2-desk3.amr.corp.intel.com> (raw)
In-Reply-To: <20170423233125.nehmgtzldgi25niy@node.shutemov.name>

Kirill points out that the calls to {get,put}_dev_pagemap() can be
removed from the mm fast path if we take a single get_dev_pagemap()
reference to signify that the page is alive and use the final put of the
page to drop that reference.

This does require some care to make sure that any waits for the
percpu_ref to drop to zero occur *after* devm_memremap_pages_release(),
since it now maintains its own elevated reference.

Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jérôme Glisse <jglisse@redhat.com>
Cc: Logan Gunthorpe <logang@deltatee.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Suggested-by: Kirill Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---

This patch might fix the regression that we found with the conversion to
generic get_user_pages_fast() in the x86/mm branch pending for 4.12
(commit 2947ba054a4d "x86/mm/gup: Switch GUP to the generic
get_user_page_fast() implementation"). I'll test tomorrow, but in case
someone can give it a try before I wake up, here's an early version.

 drivers/dax/pmem.c    |    2 +-
 drivers/nvdimm/pmem.c |   13 +++++++++++--
 include/linux/mm.h    |   14 --------------
 kernel/memremap.c     |   22 +++++++++-------------
 mm/swap.c             |   10 ++++++++++
 5 files changed, 31 insertions(+), 30 deletions(-)

diff --git a/drivers/dax/pmem.c b/drivers/dax/pmem.c
index d4ca19bd74eb..9f2a0b4fd801 100644
--- a/drivers/dax/pmem.c
+++ b/drivers/dax/pmem.c
@@ -43,6 +43,7 @@ static void dax_pmem_percpu_exit(void *data)
 	struct dax_pmem *dax_pmem = to_dax_pmem(ref);
 
 	dev_dbg(dax_pmem->dev, "%s\n", __func__);
+	wait_for_completion(&dax_pmem->cmp);
 	percpu_ref_exit(ref);
 }
 
@@ -53,7 +54,6 @@ static void dax_pmem_percpu_kill(void *data)
 
 	dev_dbg(dax_pmem->dev, "%s\n", __func__);
 	percpu_ref_kill(ref);
-	wait_for_completion(&dax_pmem->cmp);
 }
 
 static int dax_pmem_probe(struct device *dev)
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 3b3dab73d741..6be0c1253fcd 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -25,6 +25,7 @@
 #include <linux/badblocks.h>
 #include <linux/memremap.h>
 #include <linux/vmalloc.h>
+#include <linux/blk-mq.h>
 #include <linux/pfn_t.h>
 #include <linux/slab.h>
 #include <linux/pmem.h>
@@ -243,6 +244,11 @@ static void pmem_release_queue(void *q)
 	blk_cleanup_queue(q);
 }
 
+static void pmem_freeze_queue(void *q)
+{
+	blk_mq_freeze_queue_start(q);
+}
+
 static void pmem_release_disk(void *__pmem)
 {
 	struct pmem_device *pmem = __pmem;
@@ -301,6 +307,9 @@ static int pmem_attach_disk(struct device *dev,
 	if (!q)
 		return -ENOMEM;
 
+	if (devm_add_action_or_reset(dev, pmem_release_queue, q))
+		return -ENOMEM;
+
 	pmem->pfn_flags = PFN_DEV;
 	if (is_nd_pfn(dev)) {
 		addr = devm_memremap_pages(dev, &pfn_res, &q->q_usage_counter,
@@ -320,10 +329,10 @@ static int pmem_attach_disk(struct device *dev,
 				pmem->size, ARCH_MEMREMAP_PMEM);
 
 	/*
-	 * At release time the queue must be dead before
+	 * At release time the queue must be frozen before
 	 * devm_memremap_pages is unwound
 	 */
-	if (devm_add_action_or_reset(dev, pmem_release_queue, q))
+	if (devm_add_action_or_reset(dev, pmem_freeze_queue, q))
 		return -ENOMEM;
 
 	if (IS_ERR(addr))
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 00a8fa7e366a..ce17b35257ac 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -758,19 +758,11 @@ static inline enum zone_type page_zonenum(const struct page *page)
 }
 
 #ifdef CONFIG_ZONE_DEVICE
-void get_zone_device_page(struct page *page);
-void put_zone_device_page(struct page *page);
 static inline bool is_zone_device_page(const struct page *page)
 {
 	return page_zonenum(page) == ZONE_DEVICE;
 }
 #else
-static inline void get_zone_device_page(struct page *page)
-{
-}
-static inline void put_zone_device_page(struct page *page)
-{
-}
 static inline bool is_zone_device_page(const struct page *page)
 {
 	return false;
@@ -786,9 +778,6 @@ static inline void get_page(struct page *page)
 	 */
 	VM_BUG_ON_PAGE(page_ref_count(page) <= 0, page);
 	page_ref_inc(page);
-
-	if (unlikely(is_zone_device_page(page)))
-		get_zone_device_page(page);
 }
 
 static inline void put_page(struct page *page)
@@ -797,9 +786,6 @@ static inline void put_page(struct page *page)
 
 	if (put_page_testzero(page))
 		__put_page(page);
-
-	if (unlikely(is_zone_device_page(page)))
-		put_zone_device_page(page);
 }
 
 #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 07e85e5229da..5316efdde083 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -182,18 +182,6 @@ struct page_map {
 	struct vmem_altmap altmap;
 };
 
-void get_zone_device_page(struct page *page)
-{
-	percpu_ref_get(page->pgmap->ref);
-}
-EXPORT_SYMBOL(get_zone_device_page);
-
-void put_zone_device_page(struct page *page)
-{
-	put_dev_pagemap(page->pgmap);
-}
-EXPORT_SYMBOL(put_zone_device_page);
-
 static void pgmap_radix_release(struct resource *res)
 {
 	resource_size_t key, align_start, align_size, align_end;
@@ -237,6 +225,10 @@ static void devm_memremap_pages_release(struct device *dev, void *data)
 	struct resource *res = &page_map->res;
 	resource_size_t align_start, align_size;
 	struct dev_pagemap *pgmap = &page_map->pgmap;
+	unsigned long pfn;
+
+	for_each_device_pfn(pfn, page_map)
+		put_page(pfn_to_page(pfn));
 
 	if (percpu_ref_tryget_live(pgmap->ref)) {
 		dev_WARN(dev, "%s: page mapping is still live!\n", __func__);
@@ -277,7 +269,10 @@ struct dev_pagemap *find_dev_pagemap(resource_size_t phys)
  *
  * Notes:
  * 1/ @ref must be 'live' on entry and 'dead' before devm_memunmap_pages() time
- *    (or devm release event).
+ *    (or devm release event). The expected order of events is that @ref has
+ *    been through percpu_ref_kill() before devm_memremap_pages_release(). The
+ *    wait for the completion of kill and percpu_ref_exit() must occur after
+ *    devm_memremap_pages_release().
  *
  * 2/ @res is expected to be a host memory range that could feasibly be
  *    treated as a "System RAM" range, i.e. not a device mmio range, but
@@ -379,6 +374,7 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
 		 */
 		list_del(&page->lru);
 		page->pgmap = pgmap;
+		percpu_ref_get(ref);
 	}
 	devres_add(dev, page_map);
 	return __va(res->start);
diff --git a/mm/swap.c b/mm/swap.c
index 5dabf444d724..01267dda6668 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -97,6 +97,16 @@ static void __put_compound_page(struct page *page)
 
 void __put_page(struct page *page)
 {
+	if (is_zone_device_page(page)) {
+		put_dev_pagemap(page->pgmap);
+
+		/*
+		 * The page belongs to the device, do not return it to
+		 * the page allocator.
+		 */
+		return;
+	}
+
 	if (unlikely(PageCompound(page)))
 		__put_compound_page(page);
 	else

  parent reply	other threads:[~2017-04-27  1:01 UTC|newest]

Thread overview: 48+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2017-04-20 21:46 [tip:x86/mm] x86/mm/gup: Switch GUP to the generic get_user_page_fast() implementation Dan Williams
2017-04-21 14:16 ` Kirill A. Shutemov
2017-04-21 19:30   ` Dan Williams
2017-04-23  9:52     ` [PATCH] Revert "x86/mm/gup: Switch GUP to the generic get_user_page_fast() implementation" Ingo Molnar
2017-04-23 23:31 ` get_zone_device_page() in get_page() and page_cache_get_speculative() Kirill A. Shutemov
2017-04-24 17:23   ` Dan Williams
2017-04-24 17:30     ` Kirill A. Shutemov
2017-04-24 17:47       ` Dan Williams
2017-04-24 18:01         ` Kirill A. Shutemov
2017-04-24 18:25           ` Kirill A. Shutemov
2017-04-24 18:41             ` Dan Williams
2017-04-25 13:19               ` Kirill A. Shutemov
2017-04-25 16:44                 ` Dan Williams
2017-04-27  0:55   ` Dan Williams [this message]
2017-04-27  8:33     ` [PATCH] mm, zone_device: replace {get, put}_zone_device_page() with a single reference Kirill A. Shutemov
2017-04-28  6:39       ` Ingo Molnar
2017-04-28  8:14         ` [PATCH] mm, zone_device: Replace " Kirill A. Shutemov
2017-04-28 17:23         ` [PATCH v2] mm, zone_device: replace " Dan Williams
2017-04-28 17:34           ` Jerome Glisse
2017-04-28 17:41             ` Dan Williams
2017-04-28 18:00               ` Jerome Glisse
2017-04-28 19:02                 ` Dan Williams
2017-04-28 19:16                   ` Jerome Glisse
2017-04-28 19:22                     ` Dan Williams
2017-04-28 19:33                       ` Jerome Glisse
2017-04-29 10:17                         ` Kirill A. Shutemov
2017-04-30 23:14                           ` Jerome Glisse
2017-05-01  1:42                             ` Dan Williams
2017-05-01  1:54                               ` Jerome Glisse
2017-05-01  2:40                                 ` Dan Williams
2017-05-01  3:48                             ` Logan Gunthorpe
2017-05-01 10:23                             ` Kirill A. Shutemov
2017-05-01 13:55                               ` Jerome Glisse
2017-05-01 20:19                                 ` Dan Williams
2017-05-01 20:32                                   ` Jerome Glisse
2017-05-02 11:37                                 ` Kirill A. Shutemov
2017-05-02 13:22                                   ` Jerome Glisse
2017-04-29 14:18           ` Ingo Molnar
2017-05-01  2:45             ` Dan Williams
2017-05-01  7:12               ` Ingo Molnar
2017-05-01  9:33                 ` Kirill A. Shutemov
2017-05-01  8:28           ` [tip:x86/mm] mm, zone_device: Replace {get, put}_zone_device_page() with a single reference to fix pmem crash tip-bot for Dan Williams
2017-04-27 16:11     ` [PATCH] mm, zone_device: replace {get, put}_zone_device_page() with a single reference Logan Gunthorpe
2017-04-27 16:14       ` Dan Williams
2017-04-27 16:33         ` Logan Gunthorpe
2017-04-27 16:38           ` Dan Williams
2017-04-27 16:45             ` Logan Gunthorpe
2017-04-27 16:46               ` Dan Williams

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=149325431313.40660.7404075559824162131.stgit@dwillia2-desk3.amr.corp.intel.com \
    --to=dan.j.williams@intel.com \
    --cc=akpm@linux-foundation.org \
    --cc=jglisse@redhat.com \
    --cc=kirill.shutemov@linux.intel.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=logang@deltatee.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).