All of lore.kernel.org
 help / color / mirror / Atom feed
From: Joao Martins <joao.m.martins@oracle.com>
To: linux-mm@kvack.org
Cc: Dan Williams <dan.j.williams@intel.com>,
	Vishal Verma <vishal.l.verma@intel.com>,
	Dave Jiang <dave.jiang@intel.com>,
	Naoya Horiguchi <naoya.horiguchi@nec.com>,
	Matthew Wilcox <willy@infradead.org>,
	Jason Gunthorpe <jgg@ziepe.ca>,
	John Hubbard <jhubbard@nvidia.com>,
	Jane Chu <jane.chu@oracle.com>,
	Muchun Song <songmuchun@bytedance.com>,
	Mike Kravetz <mike.kravetz@oracle.com>,
	Andrew Morton <akpm@linux-foundation.org>,
	Jonathan Corbet <corbet@lwn.net>, Christoph Hellwig <hch@lst.de>,
	nvdimm@lists.linux.dev, linux-doc@vger.kernel.org,
	Joao Martins <joao.m.martins@oracle.com>
Subject: [PATCH v6 04/10] mm/memremap: add ZONE_DEVICE support for compound pages
Date: Wed, 24 Nov 2021 19:09:59 +0000	[thread overview]
Message-ID: <20211124191005.20783-5-joao.m.martins@oracle.com> (raw)
In-Reply-To: <20211124191005.20783-1-joao.m.martins@oracle.com>

Add a new @vmemmap_shift property for struct dev_pagemap which specifies that a
devmap is composed of a set of compound pages of order @vmemmap_shift, instead of
base pages. When a compound page devmap is requested, all but the first
page are initialised as tail pages instead of order-0 pages.

For certain ZONE_DEVICE users like device-dax which have a fixed page size,
this creates an opportunity to optimize GUP and GUP-fast walkers, treating
it the same way as THP or hugetlb pages.

Additionally, commit 7118fc2906e2 ("hugetlb: address ref count racing in
prep_compound_gigantic_page") removed set_page_count() because the
setting of page ref count to zero was redundant. devmap pages don't come
from page allocator though and only head page refcount is used for
compound pages, hence initialize tail page count to zero.

Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
---
 include/linux/memremap.h | 11 +++++++++++
 mm/memremap.c            | 12 ++++++------
 mm/page_alloc.c          | 38 +++++++++++++++++++++++++++++++++++++-
 3 files changed, 54 insertions(+), 7 deletions(-)

diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index 119f130ef8f1..aaf85bda093b 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -99,6 +99,11 @@ struct dev_pagemap_ops {
  * @done: completion for @internal_ref
  * @type: memory type: see MEMORY_* in memory_hotplug.h
  * @flags: PGMAP_* flags to specify defailed behavior
+ * @vmemmap_shift: structural definition of how the vmemmap page metadata
+ *      is populated, specifically the metadata page order.
+ *	A zero value (default) uses base pages as the vmemmap metadata
+ *	representation. A bigger value will set up compound struct pages
+ *	of the requested order value.
  * @ops: method table
  * @owner: an opaque pointer identifying the entity that manages this
  *	instance.  Used by various helpers to make sure that no
@@ -114,6 +119,7 @@ struct dev_pagemap {
 	struct completion done;
 	enum memory_type type;
 	unsigned int flags;
+	unsigned long vmemmap_shift;
 	const struct dev_pagemap_ops *ops;
 	void *owner;
 	int nr_range;
@@ -130,6 +136,11 @@ static inline struct vmem_altmap *pgmap_altmap(struct dev_pagemap *pgmap)
 	return NULL;
 }
 
+static inline unsigned long pgmap_vmemmap_nr(struct dev_pagemap *pgmap)
+{
+	return 1 << pgmap->vmemmap_shift;
+}
+
 #ifdef CONFIG_ZONE_DEVICE
 bool pfn_zone_device_reserved(unsigned long pfn);
 void *memremap_pages(struct dev_pagemap *pgmap, int nid);
diff --git a/mm/memremap.c b/mm/memremap.c
index 84de22c14567..3afa246eb1ab 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -102,11 +102,11 @@ static unsigned long pfn_end(struct dev_pagemap *pgmap, int range_id)
 	return (range->start + range_len(range)) >> PAGE_SHIFT;
 }
 
-static unsigned long pfn_next(unsigned long pfn)
+static unsigned long pfn_next(struct dev_pagemap *pgmap, unsigned long pfn)
 {
-	if (pfn % 1024 == 0)
+	if (pfn % (1024 << pgmap->vmemmap_shift))
 		cond_resched();
-	return pfn + 1;
+	return pfn + pgmap_vmemmap_nr(pgmap);
 }
 
 /*
@@ -130,7 +130,7 @@ bool pfn_zone_device_reserved(unsigned long pfn)
 }
 
 #define for_each_device_pfn(pfn, map, i) \
-	for (pfn = pfn_first(map, i); pfn < pfn_end(map, i); pfn = pfn_next(pfn))
+	for (pfn = pfn_first(map, i); pfn < pfn_end(map, i); pfn = pfn_next(map, pfn))
 
 static void dev_pagemap_kill(struct dev_pagemap *pgmap)
 {
@@ -315,8 +315,8 @@ static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params,
 	memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
 				PHYS_PFN(range->start),
 				PHYS_PFN(range_len(range)), pgmap);
-	percpu_ref_get_many(pgmap->ref, pfn_end(pgmap, range_id)
-			- pfn_first(pgmap, range_id));
+	percpu_ref_get_many(pgmap->ref, (pfn_end(pgmap, range_id)
+			- pfn_first(pgmap, range_id)) >> pgmap->vmemmap_shift);
 	return 0;
 
 err_add_memory:
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index f7f33c83222f..ea537839816e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -6605,6 +6605,35 @@ static void __ref __init_zone_device_page(struct page *page, unsigned long pfn,
 	}
 }
 
+static void __ref memmap_init_compound(struct page *head,
+				       unsigned long head_pfn,
+				       unsigned long zone_idx, int nid,
+				       struct dev_pagemap *pgmap,
+				       unsigned long nr_pages)
+{
+	unsigned long pfn, end_pfn = head_pfn + nr_pages;
+	unsigned int order = pgmap->vmemmap_shift;
+
+	__SetPageHead(head);
+	for (pfn = head_pfn + 1; pfn < end_pfn; pfn++) {
+		struct page *page = pfn_to_page(pfn);
+
+		__init_zone_device_page(page, pfn, zone_idx, nid, pgmap);
+		prep_compound_tail(head, pfn - head_pfn);
+		set_page_count(page, 0);
+
+		/*
+		 * The first tail page stores compound_mapcount_ptr() and
+		 * compound_order() and the second tail page stores
+		 * compound_pincount_ptr(). Call prep_compound_head() after
+		 * the first and second tail pages have been initialized to
+		 * not have the data overwritten.
+		 */
+		if (pfn == head_pfn + 2)
+			prep_compound_head(head, order);
+	}
+}
+
 void __ref memmap_init_zone_device(struct zone *zone,
 				   unsigned long start_pfn,
 				   unsigned long nr_pages,
@@ -6613,6 +6642,7 @@ void __ref memmap_init_zone_device(struct zone *zone,
 	unsigned long pfn, end_pfn = start_pfn + nr_pages;
 	struct pglist_data *pgdat = zone->zone_pgdat;
 	struct vmem_altmap *altmap = pgmap_altmap(pgmap);
+	unsigned int pfns_per_compound = pgmap_vmemmap_nr(pgmap);
 	unsigned long zone_idx = zone_idx(zone);
 	unsigned long start = jiffies;
 	int nid = pgdat->node_id;
@@ -6630,10 +6660,16 @@ void __ref memmap_init_zone_device(struct zone *zone,
 		nr_pages = end_pfn - start_pfn;
 	}
 
-	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
+	for (pfn = start_pfn; pfn < end_pfn; pfn += pfns_per_compound) {
 		struct page *page = pfn_to_page(pfn);
 
 		__init_zone_device_page(page, pfn, zone_idx, nid, pgmap);
+
+		if (pfns_per_compound == 1)
+			continue;
+
+		memmap_init_compound(page, pfn, zone_idx, nid, pgmap,
+				     pfns_per_compound);
 	}
 
 	pr_info("%s initialised %lu pages in %ums\n", __func__,
-- 
2.17.2


  parent reply	other threads:[~2021-11-24 19:10 UTC|newest]

Thread overview: 22+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-11-24 19:09 [PATCH v6 00/10] mm, device-dax: Introduce compound pages in devmap Joao Martins
2021-11-24 19:09 ` [PATCH v6 01/10] memory-failure: fetch compound_head after pgmap_pfn_valid() Joao Martins
2021-11-24 19:09 ` [PATCH v6 02/10] mm/page_alloc: split prep_compound_page into head and tail subparts Joao Martins
2021-11-24 19:09 ` [PATCH v6 03/10] mm/page_alloc: refactor memmap_init_zone_device() page init Joao Martins
2021-11-24 19:09 ` Joao Martins [this message]
2021-11-25  6:11   ` [PATCH v6 04/10] mm/memremap: add ZONE_DEVICE support for compound pages Christoph Hellwig
2021-11-25 11:35     ` Joao Martins
2021-11-24 19:10 ` [PATCH v6 05/10] device-dax: use ALIGN() for determining pgoff Joao Martins
2021-11-24 19:10 ` [PATCH v6 06/10] device-dax: use struct_size() Joao Martins
2021-11-24 19:10 ` [PATCH v6 07/10] device-dax: ensure dev_dax->pgmap is valid for dynamic devices Joao Martins
2021-11-24 19:10 ` [PATCH v6 08/10] device-dax: factor out page mapping initialization Joao Martins
2021-11-24 19:10 ` [PATCH v6 09/10] device-dax: set mapping prior to vmf_insert_pfn{,_pmd,pud}() Joao Martins
2021-11-25 11:42   ` Joao Martins
2021-11-26 18:39     ` Joao Martins
2021-11-29  7:32       ` Christoph Hellwig
2021-11-29 15:49         ` Joao Martins
2021-11-29 16:48           ` Christoph Hellwig
2021-11-29 17:20           ` Joao Martins
2021-11-25 18:02   ` [PATCH v6 09/10] device-dax: set mapping prior to vmf_insert_pfn{, _pmd, pud}() kernel test robot
2021-11-24 19:10 ` [PATCH v6 10/10] device-dax: compound devmap support Joao Martins
2021-11-24 22:30 ` [PATCH v6 00/10] mm, device-dax: Introduce compound pages in devmap Dan Williams
2021-11-24 22:41   ` Andrew Morton

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20211124191005.20783-5-joao.m.martins@oracle.com \
    --to=joao.m.martins@oracle.com \
    --cc=akpm@linux-foundation.org \
    --cc=corbet@lwn.net \
    --cc=dan.j.williams@intel.com \
    --cc=dave.jiang@intel.com \
    --cc=hch@lst.de \
    --cc=jane.chu@oracle.com \
    --cc=jgg@ziepe.ca \
    --cc=jhubbard@nvidia.com \
    --cc=linux-doc@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=mike.kravetz@oracle.com \
    --cc=naoya.horiguchi@nec.com \
    --cc=nvdimm@lists.linux.dev \
    --cc=songmuchun@bytedance.com \
    --cc=vishal.l.verma@intel.com \
    --cc=willy@infradead.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.