Linux-mm Archive on lore.kernel.org
 help / color / Atom feed
* [PATCH 0/3] make pfn walker support ZONE_DEVICE
@ 2019-11-08  0:08 Toshiki Fukasawa
  2019-11-08  0:08 ` [PATCH 1/3] procfs: refactor kpage_*_read() in fs/proc/page.c Toshiki Fukasawa
                   ` (3 more replies)
  0 siblings, 4 replies; 10+ messages in thread
From: Toshiki Fukasawa @ 2019-11-08  0:08 UTC (permalink / raw)
  To: linux-mm, dan.j.williams
  Cc: linux-kernel, akpm, mhocko, adobriyan, hch, longman, sfr, mst,
	cai, Naoya Horiguchi, Junichi Nomura

This patch set tries to make pfn walker support ZONE_DEVICE.
This idea is from the TODO in below patch:

  commit aad5f69bc161af489dbb5934868bd347282f0764
  Author: David Hildenbrand <david@redhat.com>
  Date:   Fri Oct 18 20:19:20 2019 -0700

	fs/proc/page.c: don't access uninitialized memmaps in fs/proc/page.c

pfn walker's ZONE_DEVICE support requires capability to identify
that a memmap has been initialized. The uninitialized cases are 
as follows:

	a) pages reserved for ZONE_DEVICE driver
	b) pages currently initializing

This patch set solves both of them.

Toshiki Fukasawa (3):
  procfs: refactor kpage_*_read() in fs/proc/page.c
  mm: Introduce subsection_dev_map
  mm: make pfn walker support ZONE_DEVICE

 fs/proc/page.c           | 155 ++++++++++++++++++++---------------------------
 include/linux/memremap.h |   6 ++
 include/linux/mmzone.h   |  19 ++++++
 mm/memremap.c            |  31 ++++++++++
 mm/sparse.c              |  32 ++++++++++
 5 files changed, 154 insertions(+), 89 deletions(-)

-- 
1.8.3.1



^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCH 1/3] procfs: refactor kpage_*_read() in fs/proc/page.c
  2019-11-08  0:08 [PATCH 0/3] make pfn walker support ZONE_DEVICE Toshiki Fukasawa
@ 2019-11-08  0:08 ` Toshiki Fukasawa
  2019-11-08  0:08 ` [PATCH 2/3] mm: Introduce subsection_dev_map Toshiki Fukasawa
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 10+ messages in thread
From: Toshiki Fukasawa @ 2019-11-08  0:08 UTC (permalink / raw)
  To: linux-mm, dan.j.williams
  Cc: linux-kernel, akpm, mhocko, adobriyan, hch, longman, sfr, mst,
	cai, Naoya Horiguchi, Junichi Nomura

kpagecount_read(), kpageflags_read(), and kpagecgroup_read()
have duplicate code. This patch moves it into a common function.

Signed-off-by: Toshiki Fukasawa <t-fukasawa@vx.jp.nec.com>
---
 fs/proc/page.c | 133 +++++++++++++++++++++------------------------------------
 1 file changed, 48 insertions(+), 85 deletions(-)

diff --git a/fs/proc/page.c b/fs/proc/page.c
index 7c952ee..a49b638 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -21,20 +21,19 @@
 #define KPMMASK (KPMSIZE - 1)
 #define KPMBITS (KPMSIZE * BITS_PER_BYTE)
 
-/* /proc/kpagecount - an array exposing page counts
- *
- * Each entry is a u64 representing the corresponding
- * physical page count.
+typedef u64 (*read_page_data_fn_t)(struct page *page);
+
+/*
+ * This is general function to read various data on pages.
  */
-static ssize_t kpagecount_read(struct file *file, char __user *buf,
-			     size_t count, loff_t *ppos)
+static ssize_t kpage_common_read(struct file *file, char __user *buf,
+		size_t count, loff_t *ppos, read_page_data_fn_t read_fn)
 {
 	u64 __user *out = (u64 __user *)buf;
 	struct page *ppage;
 	unsigned long src = *ppos;
 	unsigned long pfn;
 	ssize_t ret = 0;
-	u64 pcount;
 
 	pfn = src / KPMSIZE;
 	count = min_t(size_t, count, (max_pfn * KPMSIZE) - src);
@@ -48,12 +47,7 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf,
 		 */
 		ppage = pfn_to_online_page(pfn);
 
-		if (!ppage || PageSlab(ppage) || page_has_type(ppage))
-			pcount = 0;
-		else
-			pcount = page_mapcount(ppage);
-
-		if (put_user(pcount, out)) {
+		if (put_user(read_fn(ppage), out)) {
 			ret = -EFAULT;
 			break;
 		}
@@ -71,6 +65,30 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf,
 	return ret;
 }
 
+/* /proc/kpagecount - an array exposing page counts
+ *
+ * Each entry is a u64 representing the corresponding
+ * physical page count.
+ */
+
+static u64 page_count_data(struct page *page)
+{
+	u64 pcount;
+
+	if (!page || PageSlab(page) || page_has_type(page))
+		pcount = 0;
+	else
+		pcount = page_mapcount(page);
+
+	return pcount;
+}
+
+static ssize_t kpagecount_read(struct file *file, char __user *buf,
+			     size_t count, loff_t *ppos)
+{
+	return kpage_common_read(file, buf, count, ppos, page_count_data);
+}
+
 static const struct file_operations proc_kpagecount_operations = {
 	.llseek = mem_lseek,
 	.read = kpagecount_read,
@@ -203,43 +221,15 @@ u64 stable_page_flags(struct page *page)
 	return u;
 };
 
+static u64 page_flags_data(struct page *page)
+{
+	return stable_page_flags(page);
+}
+
 static ssize_t kpageflags_read(struct file *file, char __user *buf,
 			     size_t count, loff_t *ppos)
 {
-	u64 __user *out = (u64 __user *)buf;
-	struct page *ppage;
-	unsigned long src = *ppos;
-	unsigned long pfn;
-	ssize_t ret = 0;
-
-	pfn = src / KPMSIZE;
-	count = min_t(unsigned long, count, (max_pfn * KPMSIZE) - src);
-	if (src & KPMMASK || count & KPMMASK)
-		return -EINVAL;
-
-	while (count > 0) {
-		/*
-		 * TODO: ZONE_DEVICE support requires to identify
-		 * memmaps that were actually initialized.
-		 */
-		ppage = pfn_to_online_page(pfn);
-
-		if (put_user(stable_page_flags(ppage), out)) {
-			ret = -EFAULT;
-			break;
-		}
-
-		pfn++;
-		out++;
-		count -= KPMSIZE;
-
-		cond_resched();
-	}
-
-	*ppos += (char __user *)out - buf;
-	if (!ret)
-		ret = (char __user *)out - buf;
-	return ret;
+	return kpage_common_read(file, buf, count, ppos, page_flags_data);
 }
 
 static const struct file_operations proc_kpageflags_operations = {
@@ -248,49 +238,22 @@ static ssize_t kpageflags_read(struct file *file, char __user *buf,
 };
 
 #ifdef CONFIG_MEMCG
-static ssize_t kpagecgroup_read(struct file *file, char __user *buf,
-				size_t count, loff_t *ppos)
+static u64 page_cgroup_data(struct page *page)
 {
-	u64 __user *out = (u64 __user *)buf;
-	struct page *ppage;
-	unsigned long src = *ppos;
-	unsigned long pfn;
-	ssize_t ret = 0;
 	u64 ino;
 
-	pfn = src / KPMSIZE;
-	count = min_t(unsigned long, count, (max_pfn * KPMSIZE) - src);
-	if (src & KPMMASK || count & KPMMASK)
-		return -EINVAL;
-
-	while (count > 0) {
-		/*
-		 * TODO: ZONE_DEVICE support requires to identify
-		 * memmaps that were actually initialized.
-		 */
-		ppage = pfn_to_online_page(pfn);
-
-		if (ppage)
-			ino = page_cgroup_ino(ppage);
-		else
-			ino = 0;
-
-		if (put_user(ino, out)) {
-			ret = -EFAULT;
-			break;
-		}
-
-		pfn++;
-		out++;
-		count -= KPMSIZE;
+	if (page)
+		ino = page_cgroup_ino(page);
+	else
+		ino = 0;
 
-		cond_resched();
-	}
+	return ino;
+}
 
-	*ppos += (char __user *)out - buf;
-	if (!ret)
-		ret = (char __user *)out - buf;
-	return ret;
+static ssize_t kpagecgroup_read(struct file *file, char __user *buf,
+				size_t count, loff_t *ppos)
+{
+	return kpage_common_read(file, buf, count, ppos, page_cgroup_data);
 }
 
 static const struct file_operations proc_kpagecgroup_operations = {
-- 
1.8.3.1



^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCH 2/3] mm: Introduce subsection_dev_map
  2019-11-08  0:08 [PATCH 0/3] make pfn walker support ZONE_DEVICE Toshiki Fukasawa
  2019-11-08  0:08 ` [PATCH 1/3] procfs: refactor kpage_*_read() in fs/proc/page.c Toshiki Fukasawa
@ 2019-11-08  0:08 ` Toshiki Fukasawa
  2019-11-08 19:13   ` Dan Williams
  2019-11-08  0:08 ` [PATCH 3/3] mm: make pfn walker support ZONE_DEVICE Toshiki Fukasawa
  2019-11-08  9:18 ` [PATCH 0/3] " Michal Hocko
  3 siblings, 1 reply; 10+ messages in thread
From: Toshiki Fukasawa @ 2019-11-08  0:08 UTC (permalink / raw)
  To: linux-mm, dan.j.williams
  Cc: linux-kernel, akpm, mhocko, adobriyan, hch, longman, sfr, mst,
	cai, Naoya Horiguchi, Junichi Nomura

Currently, there is no way to identify pfn on ZONE_DEVICE.
Identifying pfn on system memory can be done by using a
section-level flag. On the other hand, identifying pfn on
ZONE_DEVICE requires a subsection-level flag since ZONE_DEVICE
can be created in units of subsections.

This patch introduces a new bitmap subsection_dev_map so that
we can identify pfn on ZONE_DEVICE.

Also, subsection_dev_map is used to prove that struct pages
included in the subsection have been initialized since it is
set after memmap_init_zone_device(). We can avoid accessing
pages currently being initialized by checking subsection_dev_map.

Signed-off-by: Toshiki Fukasawa <t-fukasawa@vx.jp.nec.com>
---
 include/linux/mmzone.h | 19 +++++++++++++++++++
 mm/memremap.c          |  2 ++
 mm/sparse.c            | 32 ++++++++++++++++++++++++++++++++
 3 files changed, 53 insertions(+)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index bda2028..11376c4 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1174,11 +1174,17 @@ static inline unsigned long section_nr_to_pfn(unsigned long sec)
 
 struct mem_section_usage {
 	DECLARE_BITMAP(subsection_map, SUBSECTIONS_PER_SECTION);
+#ifdef CONFIG_ZONE_DEVICE
+	DECLARE_BITMAP(subsection_dev_map, SUBSECTIONS_PER_SECTION);
+#endif
 	/* See declaration of similar field in struct zone */
 	unsigned long pageblock_flags[0];
 };
 
 void subsection_map_init(unsigned long pfn, unsigned long nr_pages);
+#ifdef CONFIG_ZONE_DEVICE
+void subsections_mark_device(unsigned long start_pfn, unsigned long size);
+#endif
 
 struct page;
 struct page_ext;
@@ -1367,6 +1373,19 @@ static inline int pfn_present(unsigned long pfn)
 	return present_section(__nr_to_section(pfn_to_section_nr(pfn)));
 }
 
+static inline int pfn_zone_device(unsigned long pfn)
+{
+#ifdef CONFIG_ZONE_DEVICE
+	if (pfn_valid(pfn)) {
+		struct mem_section *ms = __pfn_to_section(pfn);
+		int idx = subsection_map_index(pfn);
+
+		return test_bit(idx, ms->usage->subsection_dev_map);
+	}
+#endif
+	return 0;
+}
+
 /*
  * These are _only_ used during initialisation, therefore they
  * can use __initdata ...  They could have names to indicate
diff --git a/mm/memremap.c b/mm/memremap.c
index 03ccbdf..8a97fd4 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -303,6 +303,8 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid)
 	memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
 				PHYS_PFN(res->start),
 				PHYS_PFN(resource_size(res)), pgmap);
+	subsections_mark_device(PHYS_PFN(res->start),
+				PHYS_PFN(resource_size(res)));
 	percpu_ref_get_many(pgmap->ref, pfn_end(pgmap) - pfn_first(pgmap));
 	return __va(res->start);
 
diff --git a/mm/sparse.c b/mm/sparse.c
index f6891c1..a3fc9e0a 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -603,6 +603,31 @@ void __init sparse_init(void)
 	vmemmap_populate_print_last();
 }
 
+#ifdef CONFIG_ZONE_DEVICE
+void subsections_mark_device(unsigned long start_pfn, unsigned long size)
+{
+	struct mem_section *ms;
+	unsigned long *dev_map;
+	unsigned long sec, start_sec, end_sec, pfns;
+
+	start_sec = pfn_to_section_nr(start_pfn);
+	end_sec = pfn_to_section_nr(start_pfn + size - 1);
+	for (sec = start_sec; sec <= end_sec;
+	     sec++, start_pfn += pfns, size -= pfns) {
+		pfns = min(size, PAGES_PER_SECTION
+			- (start_pfn & ~PAGE_SECTION_MASK));
+		if (WARN_ON(!valid_section_nr(sec)))
+			continue;
+		ms = __pfn_to_section(start_pfn);
+		if (!ms->usage)
+			continue;
+
+		dev_map = &ms->usage->subsection_dev_map[0];
+		subsection_mask_set(dev_map, start_pfn, pfns);
+	}
+}
+#endif
+
 #ifdef CONFIG_MEMORY_HOTPLUG
 
 /* Mark all memory sections within the pfn range as online */
@@ -782,7 +807,14 @@ static void section_deactivate(unsigned long pfn, unsigned long nr_pages,
 		memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
 		ms->section_mem_map = sparse_encode_mem_map(NULL, section_nr);
 	}
+#ifdef CONFIG_ZONE_DEVICE
+	/* deactivation of a partial section on ZONE_DEVICE */
+	if (ms->usage) {
+		unsigned long *dev_map = &ms->usage->subsection_dev_map[0];
 
+		bitmap_andnot(dev_map, dev_map, map, SUBSECTIONS_PER_SECTION);
+	}
+#endif
 	if (section_is_early && memmap)
 		free_map_bootmem(memmap);
 	else
-- 
1.8.3.1



^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCH 3/3] mm: make pfn walker support ZONE_DEVICE
  2019-11-08  0:08 [PATCH 0/3] make pfn walker support ZONE_DEVICE Toshiki Fukasawa
  2019-11-08  0:08 ` [PATCH 1/3] procfs: refactor kpage_*_read() in fs/proc/page.c Toshiki Fukasawa
  2019-11-08  0:08 ` [PATCH 2/3] mm: Introduce subsection_dev_map Toshiki Fukasawa
@ 2019-11-08  0:08 ` Toshiki Fukasawa
  2019-11-09 17:08   ` kbuild test robot
  2019-11-09 19:14   ` kbuild test robot
  2019-11-08  9:18 ` [PATCH 0/3] " Michal Hocko
  3 siblings, 2 replies; 10+ messages in thread
From: Toshiki Fukasawa @ 2019-11-08  0:08 UTC (permalink / raw)
  To: linux-mm, dan.j.williams
  Cc: linux-kernel, akpm, mhocko, adobriyan, hch, longman, sfr, mst,
	cai, Naoya Horiguchi, Junichi Nomura

This patch allows pfn walker to read pages on ZONE_DEVICE.
There are the following notes:

	a) The reserved pages indicated by vmem_altmap->reserve
	   are uninitialized, so it must be skipped to read.
	b) To get vmem_altmap, we need to use get_dev_pagemap(),
	   but doing it for all pfns is too slow.

This patch solves both of them. Since vmem_altmap could reserve
only first few pages, we can reduce the number of checks by
counting sequential valid pages.

Signed-off-by: Toshiki Fukasawa <t-fukasawa@vx.jp.nec.com>
---
 fs/proc/page.c           | 22 ++++++++++++++++++----
 include/linux/memremap.h |  6 ++++++
 mm/memremap.c            | 29 +++++++++++++++++++++++++++++
 3 files changed, 53 insertions(+), 4 deletions(-)

diff --git a/fs/proc/page.c b/fs/proc/page.c
index a49b638..b6241ea 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -33,6 +33,7 @@ static ssize_t kpage_common_read(struct file *file, char __user *buf,
 	struct page *ppage;
 	unsigned long src = *ppos;
 	unsigned long pfn;
+	unsigned long valid_pages = 0;
 	ssize_t ret = 0;
 
 	pfn = src / KPMSIZE;
@@ -41,11 +42,24 @@ static ssize_t kpage_common_read(struct file *file, char __user *buf,
 		return -EINVAL;
 
 	while (count > 0) {
-		/*
-		 * TODO: ZONE_DEVICE support requires to identify
-		 * memmaps that were actually initialized.
-		 */
 		ppage = pfn_to_online_page(pfn);
+		if (!ppage && pfn_zone_device(pfn)) {
+			/*
+			 * Skip to read first few uninitialized pages on
+			 * ZONE_DEVICE. And count valid pages starting
+			 * with the pfn so that minimize the number of
+			 * calls to nr_valid_pages_zone_device().
+			 */
+			if (!valid_pages)
+				valid_pages = nr_valid_pages_zone_device(pfn);
+			if (valid_pages) {
+				ppage = pfn_to_page(pfn);
+				valid_pages--;
+			}
+		} else if (valid_pages) {
+			/* ZONE_DEVICE has been hot removed */
+			valid_pages = 0;
+		}
 
 		if (put_user(read_fn(ppage), out)) {
 			ret = -EFAULT;
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index 6fefb09..d111ae3 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -123,6 +123,7 @@ static inline struct vmem_altmap *pgmap_altmap(struct dev_pagemap *pgmap)
 }
 
 #ifdef CONFIG_ZONE_DEVICE
+unsigned long nr_valid_pages_zone_device(unsigned long pfn);
 void *memremap_pages(struct dev_pagemap *pgmap, int nid);
 void memunmap_pages(struct dev_pagemap *pgmap);
 void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap);
@@ -133,6 +134,11 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
 unsigned long vmem_altmap_offset(struct vmem_altmap *altmap);
 void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns);
 #else
+static inline unsigned long nr_valid_pages_zone_device(unsigned long pfn)
+{
+	return 0;
+}
+
 static inline void *devm_memremap_pages(struct device *dev,
 		struct dev_pagemap *pgmap)
 {
diff --git a/mm/memremap.c b/mm/memremap.c
index 8a97fd4..307c73e 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -73,6 +73,35 @@ static unsigned long pfn_next(unsigned long pfn)
 	return pfn + 1;
 }
 
+/*
+ * This returns the number of sequential valid pages starting from @pfn
+ * on ZONE_DEVICE. The invalid pages reserved by driver is first few
+ * pages on ZONE_DEVICE.
+ */
+unsigned long nr_valid_pages_zone_device(unsigned long pfn)
+{
+	struct dev_pagemap *pgmap;
+	struct vmem_altmap *altmap;
+	unsigned long pages;
+
+	pgmap = get_dev_pagemap(pfn, NULL);
+	if (!pgmap)
+		return 0;
+	altmap = pgmap_altmap(pgmap);
+	if (altmap && pfn < (altmap->base_pfn + altmap->reserve))
+		pages = 0;
+	else
+		/*
+		 * PHYS_PFN(pgmap->res.end) is end pfn on pgmap
+		 * (not start pfn on next mapping).
+		 */
+		pages = PHYS_PFN(pgmap->res.end) - pfn + 1;
+
+	put_dev_pagemap(pgmap);
+
+	return pages;
+}
+
 #define for_each_device_pfn(pfn, map) \
 	for (pfn = pfn_first(map); pfn < pfn_end(map); pfn = pfn_next(pfn))
 
-- 
1.8.3.1



^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 0/3] make pfn walker support ZONE_DEVICE
  2019-11-08  0:08 [PATCH 0/3] make pfn walker support ZONE_DEVICE Toshiki Fukasawa
                   ` (2 preceding siblings ...)
  2019-11-08  0:08 ` [PATCH 3/3] mm: make pfn walker support ZONE_DEVICE Toshiki Fukasawa
@ 2019-11-08  9:18 ` " Michal Hocko
  2019-11-11  8:00   ` Toshiki Fukasawa
  3 siblings, 1 reply; 10+ messages in thread
From: Michal Hocko @ 2019-11-08  9:18 UTC (permalink / raw)
  To: Toshiki Fukasawa
  Cc: linux-mm, dan.j.williams, linux-kernel, akpm, adobriyan, hch,
	longman, sfr, mst, cai, Naoya Horiguchi, Junichi Nomura

On Fri 08-11-19 00:08:03, Toshiki Fukasawa wrote:
> This patch set tries to make pfn walker support ZONE_DEVICE.
> This idea is from the TODO in below patch:
> 
>   commit aad5f69bc161af489dbb5934868bd347282f0764
>   Author: David Hildenbrand <david@redhat.com>
>   Date:   Fri Oct 18 20:19:20 2019 -0700
> 
> 	fs/proc/page.c: don't access uninitialized memmaps in fs/proc/page.c
> 
> pfn walker's ZONE_DEVICE support requires capability to identify
> that a memmap has been initialized. The uninitialized cases are 
> as follows:
> 
> 	a) pages reserved for ZONE_DEVICE driver
> 	b) pages currently initializing
> 
> This patch set solves both of them.

Why do we want this? What is the usecase?

> 
> Toshiki Fukasawa (3):
>   procfs: refactor kpage_*_read() in fs/proc/page.c
>   mm: Introduce subsection_dev_map
>   mm: make pfn walker support ZONE_DEVICE
> 
>  fs/proc/page.c           | 155 ++++++++++++++++++++---------------------------
>  include/linux/memremap.h |   6 ++
>  include/linux/mmzone.h   |  19 ++++++
>  mm/memremap.c            |  31 ++++++++++
>  mm/sparse.c              |  32 ++++++++++
>  5 files changed, 154 insertions(+), 89 deletions(-)
> 
> -- 
> 1.8.3.1
> 

-- 
Michal Hocko
SUSE Labs


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 2/3] mm: Introduce subsection_dev_map
  2019-11-08  0:08 ` [PATCH 2/3] mm: Introduce subsection_dev_map Toshiki Fukasawa
@ 2019-11-08 19:13   ` Dan Williams
  0 siblings, 0 replies; 10+ messages in thread
From: Dan Williams @ 2019-11-08 19:13 UTC (permalink / raw)
  To: Toshiki Fukasawa
  Cc: linux-mm, linux-kernel, akpm, mhocko, adobriyan, hch, longman,
	sfr, mst, cai, Naoya Horiguchi, Junichi Nomura

On Thu, Nov 7, 2019 at 4:15 PM Toshiki Fukasawa
<t-fukasawa@vx.jp.nec.com> wrote:
>
> Currently, there is no way to identify pfn on ZONE_DEVICE.
> Identifying pfn on system memory can be done by using a
> section-level flag. On the other hand, identifying pfn on
> ZONE_DEVICE requires a subsection-level flag since ZONE_DEVICE
> can be created in units of subsections.
>
> This patch introduces a new bitmap subsection_dev_map so that
> we can identify pfn on ZONE_DEVICE.
>
> Also, subsection_dev_map is used to prove that struct pages
> included in the subsection have been initialized since it is
> set after memmap_init_zone_device(). We can avoid accessing
> pages currently being initialized by checking subsection_dev_map.
>
> Signed-off-by: Toshiki Fukasawa <t-fukasawa@vx.jp.nec.com>
> ---
>  include/linux/mmzone.h | 19 +++++++++++++++++++
>  mm/memremap.c          |  2 ++
>  mm/sparse.c            | 32 ++++++++++++++++++++++++++++++++
>  3 files changed, 53 insertions(+)
>
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index bda2028..11376c4 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -1174,11 +1174,17 @@ static inline unsigned long section_nr_to_pfn(unsigned long sec)
>
>  struct mem_section_usage {
>         DECLARE_BITMAP(subsection_map, SUBSECTIONS_PER_SECTION);
> +#ifdef CONFIG_ZONE_DEVICE
> +       DECLARE_BITMAP(subsection_dev_map, SUBSECTIONS_PER_SECTION);
> +#endif

Hi Toshiki,

There is currently an effort to remove the PageReserved() flag as some
code is using that to detect ZONE_DEVICE. In reviewing those patches
we realized that what many code paths want is to detect online memory.
So instead of a subsection_dev_map add a subsection_online_map. That
way pfn_to_online_page() can reliably avoid ZONE_DEVICE ranges. I
otherwise question the use case for pfn_walkers to return pages for
ZONE_DEVICE pages, I think the skip behavior when pfn_to_online_page()
== false is the right behavior.


>         /* See declaration of similar field in struct zone */
>         unsigned long pageblock_flags[0];
>  };
>
>  void subsection_map_init(unsigned long pfn, unsigned long nr_pages);
> +#ifdef CONFIG_ZONE_DEVICE
> +void subsections_mark_device(unsigned long start_pfn, unsigned long size);
> +#endif
>
>  struct page;
>  struct page_ext;
> @@ -1367,6 +1373,19 @@ static inline int pfn_present(unsigned long pfn)
>         return present_section(__nr_to_section(pfn_to_section_nr(pfn)));
>  }
>
> +static inline int pfn_zone_device(unsigned long pfn)
> +{
> +#ifdef CONFIG_ZONE_DEVICE
> +       if (pfn_valid(pfn)) {
> +               struct mem_section *ms = __pfn_to_section(pfn);
> +               int idx = subsection_map_index(pfn);
> +
> +               return test_bit(idx, ms->usage->subsection_dev_map);
> +       }
> +#endif
> +       return 0;
> +}
> +
>  /*
>   * These are _only_ used during initialisation, therefore they
>   * can use __initdata ...  They could have names to indicate
> diff --git a/mm/memremap.c b/mm/memremap.c
> index 03ccbdf..8a97fd4 100644
> --- a/mm/memremap.c
> +++ b/mm/memremap.c
> @@ -303,6 +303,8 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid)
>         memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
>                                 PHYS_PFN(res->start),
>                                 PHYS_PFN(resource_size(res)), pgmap);
> +       subsections_mark_device(PHYS_PFN(res->start),
> +                               PHYS_PFN(resource_size(res)));
>         percpu_ref_get_many(pgmap->ref, pfn_end(pgmap) - pfn_first(pgmap));
>         return __va(res->start);
>
> diff --git a/mm/sparse.c b/mm/sparse.c
> index f6891c1..a3fc9e0a 100644
> --- a/mm/sparse.c
> +++ b/mm/sparse.c
> @@ -603,6 +603,31 @@ void __init sparse_init(void)
>         vmemmap_populate_print_last();
>  }
>
> +#ifdef CONFIG_ZONE_DEVICE
> +void subsections_mark_device(unsigned long start_pfn, unsigned long size)
> +{
> +       struct mem_section *ms;
> +       unsigned long *dev_map;
> +       unsigned long sec, start_sec, end_sec, pfns;
> +
> +       start_sec = pfn_to_section_nr(start_pfn);
> +       end_sec = pfn_to_section_nr(start_pfn + size - 1);
> +       for (sec = start_sec; sec <= end_sec;
> +            sec++, start_pfn += pfns, size -= pfns) {
> +               pfns = min(size, PAGES_PER_SECTION
> +                       - (start_pfn & ~PAGE_SECTION_MASK));
> +               if (WARN_ON(!valid_section_nr(sec)))
> +                       continue;
> +               ms = __pfn_to_section(start_pfn);
> +               if (!ms->usage)
> +                       continue;
> +
> +               dev_map = &ms->usage->subsection_dev_map[0];
> +               subsection_mask_set(dev_map, start_pfn, pfns);
> +       }
> +}
> +#endif
> +
>  #ifdef CONFIG_MEMORY_HOTPLUG
>
>  /* Mark all memory sections within the pfn range as online */
> @@ -782,7 +807,14 @@ static void section_deactivate(unsigned long pfn, unsigned long nr_pages,
>                 memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
>                 ms->section_mem_map = sparse_encode_mem_map(NULL, section_nr);
>         }
> +#ifdef CONFIG_ZONE_DEVICE
> +       /* deactivation of a partial section on ZONE_DEVICE */
> +       if (ms->usage) {
> +               unsigned long *dev_map = &ms->usage->subsection_dev_map[0];
>
> +               bitmap_andnot(dev_map, dev_map, map, SUBSECTIONS_PER_SECTION);
> +       }
> +#endif
>         if (section_is_early && memmap)
>                 free_map_bootmem(memmap);
>         else
> --
> 1.8.3.1
>


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 3/3] mm: make pfn walker support ZONE_DEVICE
  2019-11-08  0:08 ` [PATCH 3/3] mm: make pfn walker support ZONE_DEVICE Toshiki Fukasawa
@ 2019-11-09 17:08   ` kbuild test robot
  2019-11-09 19:14   ` kbuild test robot
  1 sibling, 0 replies; 10+ messages in thread
From: kbuild test robot @ 2019-11-09 17:08 UTC (permalink / raw)
  To: Toshiki Fukasawa
  Cc: kbuild-all, linux-mm, dan.j.williams, linux-kernel, akpm, mhocko,
	adobriyan, hch, longman, sfr, mst, cai, Naoya Horiguchi,
	Junichi Nomura

[-- Attachment #1: Type: text/plain, Size: 3187 bytes --]

Hi Toshiki,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on linus/master]
[also build test ERROR on v5.4-rc6]
[cannot apply to next-20191108]
[if your patch is applied to the wrong git tree, please drop us a note to help
improve the system. BTW, we also suggest to use '--base' option to specify the
base tree in git format-patch, please see https://stackoverflow.com/a/37406982]

url:    https://github.com/0day-ci/linux/commits/Toshiki-Fukasawa/make-pfn-walker-support-ZONE_DEVICE/20191110-000508
base:   https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git 0058b0a506e40d9a2c62015fe92eb64a44d78cd9
config: i386-defconfig (attached as .config)
compiler: gcc-7 (Debian 7.4.0-14) 7.4.0
reproduce:
        # save the attached .config to linux build tree
        make ARCH=i386 

If you fix the issue, kindly add following tag
Reported-by: kbuild test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   fs/proc/page.c: In function 'kpage_common_read':
>> fs/proc/page.c:46:17: error: implicit declaration of function 'pfn_zone_device'; did you mean 'bus_find_device'? [-Werror=implicit-function-declaration]
      if (!ppage && pfn_zone_device(pfn)) {
                    ^~~~~~~~~~~~~~~
                    bus_find_device
   cc1: some warnings being treated as errors

vim +46 fs/proc/page.c

    25	
    26	/*
    27	 * This is general function to read various data on pages.
    28	 */
    29	static ssize_t kpage_common_read(struct file *file, char __user *buf,
    30			size_t count, loff_t *ppos, read_page_data_fn_t read_fn)
    31	{
    32		u64 __user *out = (u64 __user *)buf;
    33		struct page *ppage;
    34		unsigned long src = *ppos;
    35		unsigned long pfn;
    36		unsigned long valid_pages = 0;
    37		ssize_t ret = 0;
    38	
    39		pfn = src / KPMSIZE;
    40		count = min_t(size_t, count, (max_pfn * KPMSIZE) - src);
    41		if (src & KPMMASK || count & KPMMASK)
    42			return -EINVAL;
    43	
    44		while (count > 0) {
    45			ppage = pfn_to_online_page(pfn);
  > 46			if (!ppage && pfn_zone_device(pfn)) {
    47				/*
    48				 * Skip to read first few uninitialized pages on
    49				 * ZONE_DEVICE. And count valid pages starting
    50				 * with the pfn so that minimize the number of
    51				 * calls to nr_valid_pages_zone_device().
    52				 */
    53				if (!valid_pages)
    54					valid_pages = nr_valid_pages_zone_device(pfn);
    55				if (valid_pages) {
    56					ppage = pfn_to_page(pfn);
    57					valid_pages--;
    58				}
    59			} else if (valid_pages) {
    60				/* ZONE_DEVICE has been hot removed */
    61				valid_pages = 0;
    62			}
    63	
    64			if (put_user(read_fn(ppage), out)) {
    65				ret = -EFAULT;
    66				break;
    67			}
    68	
    69			pfn++;
    70			out++;
    71			count -= KPMSIZE;
    72	
    73			cond_resched();
    74		}
    75	
    76		*ppos += (char __user *)out - buf;
    77		if (!ret)
    78			ret = (char __user *)out - buf;
    79		return ret;
    80	}
    81	

---
0-DAY kernel test infrastructure                 Open Source Technology Center
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 28148 bytes --]

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 3/3] mm: make pfn walker support ZONE_DEVICE
  2019-11-08  0:08 ` [PATCH 3/3] mm: make pfn walker support ZONE_DEVICE Toshiki Fukasawa
  2019-11-09 17:08   ` kbuild test robot
@ 2019-11-09 19:14   ` kbuild test robot
  1 sibling, 0 replies; 10+ messages in thread
From: kbuild test robot @ 2019-11-09 19:14 UTC (permalink / raw)
  To: Toshiki Fukasawa
  Cc: kbuild-all, linux-mm, dan.j.williams, linux-kernel, akpm, mhocko,
	adobriyan, hch, longman, sfr, mst, cai, Naoya Horiguchi,
	Junichi Nomura

[-- Attachment #1: Type: text/plain, Size: 5542 bytes --]

Hi Toshiki,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on linus/master]
[cannot apply to v5.4-rc6 next-20191108]
[if your patch is applied to the wrong git tree, please drop us a note to help
improve the system. BTW, we also suggest to use '--base' option to specify the
base tree in git format-patch, please see https://stackoverflow.com/a/37406982]

url:    https://github.com/0day-ci/linux/commits/Toshiki-Fukasawa/make-pfn-walker-support-ZONE_DEVICE/20191110-000508
base:   https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git 0058b0a506e40d9a2c62015fe92eb64a44d78cd9
config: i386-randconfig-b002-201945 (attached as .config)
compiler: gcc-7 (Debian 7.4.0-14) 7.4.0
reproduce:
        # save the attached .config to linux build tree
        make ARCH=i386 

If you fix the issue, kindly add following tag
Reported-by: kbuild test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   fs/proc/page.c: In function 'kpage_common_read':
>> fs/proc/page.c:46:17: error: implicit declaration of function 'pfn_zone_device'; did you mean 'pgprot_device'? [-Werror=implicit-function-declaration]
      if (!ppage && pfn_zone_device(pfn)) {
                    ^~~~~~~~~~~~~~~
                    pgprot_device
   Cyclomatic Complexity 5 include/linux/compiler.h:__read_once_size
   Cyclomatic Complexity 1 include/linux/kasan-checks.h:kasan_check_read
   Cyclomatic Complexity 1 arch/x86/include/asm/bitops.h:constant_test_bit
   Cyclomatic Complexity 1 arch/x86/include/asm/bitops.h:variable_test_bit
   Cyclomatic Complexity 1 arch/x86/include/asm/atomic.h:arch_atomic_read
   Cyclomatic Complexity 1 include/asm-generic/atomic-instrumented.h:atomic_read
   Cyclomatic Complexity 1 include/linux/page-flags.h:PageTail
   Cyclomatic Complexity 3 include/linux/page-flags.h:PageCompound
   Cyclomatic Complexity 1 include/linux/page-flags.h:PageLRU
   Cyclomatic Complexity 1 include/linux/page-flags.h:PageSlab
   Cyclomatic Complexity 1 include/linux/page-flags.h:PageSwapCache
   Cyclomatic Complexity 1 include/linux/page-flags.h:PageAnon
   Cyclomatic Complexity 1 include/linux/page-flags.h:PageKsm
   Cyclomatic Complexity 1 include/linux/page-flags.h:PageHead
   Cyclomatic Complexity 1 include/linux/page-flags.h:PageTransCompound
   Cyclomatic Complexity 1 include/linux/page-flags.h:page_has_type
   Cyclomatic Complexity 1 include/linux/page-flags.h:PageBuddy
   Cyclomatic Complexity 1 include/linux/page-flags.h:PageOffline
   Cyclomatic Complexity 1 include/linux/page-flags.h:PageTable
   Cyclomatic Complexity 1 include/linux/memremap.h:nr_valid_pages_zone_device
   Cyclomatic Complexity 1 include/linux/sched.h:_cond_resched
   Cyclomatic Complexity 1 include/asm-generic/pgtable.h:is_zero_pfn
   Cyclomatic Complexity 1 include/linux/huge_mm.h:is_huge_zero_page
   Cyclomatic Complexity 1 fs/proc/page.c:kpf_copy_bit
   Cyclomatic Complexity 2 include/asm-generic/bitops-instrumented.h:test_bit
   Cyclomatic Complexity 2 include/linux/page_idle.h:page_is_idle
   Cyclomatic Complexity 2 include/linux/page-flags.h:compound_head
   Cyclomatic Complexity 1 include/linux/page_ref.h:page_count
   Cyclomatic Complexity 1 fs/proc/page.c:proc_page_init
   Cyclomatic Complexity 12 fs/proc/page.c:kpage_common_read
   Cyclomatic Complexity 1 fs/proc/page.c:kpageflags_read
   Cyclomatic Complexity 1 fs/proc/page.c:kpagecount_read
   Cyclomatic Complexity 2 include/linux/mm.h:page_mapcount
   Cyclomatic Complexity 4 fs/proc/page.c:page_count_data
   Cyclomatic Complexity 23 fs/proc/page.c:stable_page_flags
   Cyclomatic Complexity 1 fs/proc/page.c:page_flags_data
   cc1: some warnings being treated as errors

vim +46 fs/proc/page.c

    25	
    26	/*
    27	 * This is general function to read various data on pages.
    28	 */
    29	static ssize_t kpage_common_read(struct file *file, char __user *buf,
    30			size_t count, loff_t *ppos, read_page_data_fn_t read_fn)
    31	{
    32		u64 __user *out = (u64 __user *)buf;
    33		struct page *ppage;
    34		unsigned long src = *ppos;
    35		unsigned long pfn;
    36		unsigned long valid_pages = 0;
    37		ssize_t ret = 0;
    38	
    39		pfn = src / KPMSIZE;
    40		count = min_t(size_t, count, (max_pfn * KPMSIZE) - src);
    41		if (src & KPMMASK || count & KPMMASK)
    42			return -EINVAL;
    43	
    44		while (count > 0) {
    45			ppage = pfn_to_online_page(pfn);
  > 46			if (!ppage && pfn_zone_device(pfn)) {
    47				/*
    48				 * Skip to read first few uninitialized pages on
    49				 * ZONE_DEVICE. And count valid pages starting
    50				 * with the pfn so that minimize the number of
    51				 * calls to nr_valid_pages_zone_device().
    52				 */
    53				if (!valid_pages)
    54					valid_pages = nr_valid_pages_zone_device(pfn);
    55				if (valid_pages) {
    56					ppage = pfn_to_page(pfn);
    57					valid_pages--;
    58				}
    59			} else if (valid_pages) {
    60				/* ZONE_DEVICE has been hot removed */
    61				valid_pages = 0;
    62			}
    63	
    64			if (put_user(read_fn(ppage), out)) {
    65				ret = -EFAULT;
    66				break;
    67			}
    68	
    69			pfn++;
    70			out++;
    71			count -= KPMSIZE;
    72	
    73			cond_resched();
    74		}
    75	
    76		*ppos += (char __user *)out - buf;
    77		if (!ret)
    78			ret = (char __user *)out - buf;
    79		return ret;
    80	}
    81	

---
0-DAY kernel test infrastructure                 Open Source Technology Center
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 28178 bytes --]

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 0/3] make pfn walker support ZONE_DEVICE
  2019-11-08  9:18 ` [PATCH 0/3] " Michal Hocko
@ 2019-11-11  8:00   ` Toshiki Fukasawa
  2019-11-11 16:23     ` Dan Williams
  0 siblings, 1 reply; 10+ messages in thread
From: Toshiki Fukasawa @ 2019-11-11  8:00 UTC (permalink / raw)
  To: Michal Hocko, dan.j.williams
  Cc: linux-mm, linux-kernel, akpm, adobriyan, hch, longman, sfr, mst,
	cai, Naoya Horiguchi, Junichi Nomura

On 2019/11/08 18:18, Michal Hocko wrote:
> On Fri 08-11-19 00:08:03, Toshiki Fukasawa wrote:
>> This patch set tries to make pfn walker support ZONE_DEVICE.
>> This idea is from the TODO in below patch:
>>
>>    commit aad5f69bc161af489dbb5934868bd347282f0764
>>    Author: David Hildenbrand <david@redhat.com>
>>    Date:   Fri Oct 18 20:19:20 2019 -0700
>>
>> 	fs/proc/page.c: don't access uninitialized memmaps in fs/proc/page.c
>>
>> pfn walker's ZONE_DEVICE support requires capability to identify
>> that a memmap has been initialized. The uninitialized cases are
>> as follows:
>>
>> 	a) pages reserved for ZONE_DEVICE driver
>> 	b) pages currently initializing
>>
>> This patch set solves both of them.
> 
> Why do we want this? What is the usecase?

We are writing a test program for hwpoison, which is a use case.
Without this patch, we can't see the HWPOISON flag on the
ZONE_DEVICE page.

Thanks,
Toshiki Fukasawa


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 0/3] make pfn walker support ZONE_DEVICE
  2019-11-11  8:00   ` Toshiki Fukasawa
@ 2019-11-11 16:23     ` Dan Williams
  0 siblings, 0 replies; 10+ messages in thread
From: Dan Williams @ 2019-11-11 16:23 UTC (permalink / raw)
  To: Toshiki Fukasawa
  Cc: Michal Hocko, linux-mm, linux-kernel, akpm, adobriyan, hch,
	longman, sfr, mst, cai, Naoya Horiguchi, Junichi Nomura

On Mon, Nov 11, 2019 at 12:01 AM Toshiki Fukasawa
<t-fukasawa@vx.jp.nec.com> wrote:
>
> On 2019/11/08 18:18, Michal Hocko wrote:
> > On Fri 08-11-19 00:08:03, Toshiki Fukasawa wrote:
> >> This patch set tries to make pfn walker support ZONE_DEVICE.
> >> This idea is from the TODO in below patch:
> >>
> >>    commit aad5f69bc161af489dbb5934868bd347282f0764
> >>    Author: David Hildenbrand <david@redhat.com>
> >>    Date:   Fri Oct 18 20:19:20 2019 -0700
> >>
> >>      fs/proc/page.c: don't access uninitialized memmaps in fs/proc/page.c
> >>
> >> pfn walker's ZONE_DEVICE support requires capability to identify
> >> that a memmap has been initialized. The uninitialized cases are
> >> as follows:
> >>
> >>      a) pages reserved for ZONE_DEVICE driver
> >>      b) pages currently initializing
> >>
> >> This patch set solves both of them.
> >
> > Why do we want this? What is the usecase?
>
> We are writing a test program for hwpoison, which is a use case.
> Without this patch, we can't see the HWPOISON flag on the
> ZONE_DEVICE page.

I'm not sure that's a goal that's a worthwhile goal. That hwpoison
flag has specific meaning for the System RAM case where the page is
going to be marked offline. For the pmem case the nvdimm core tracks
'badblocks'. I did attempt to use PageHWPoison to track which pmem
pages had been marked UC to prevent speculative consumption, but that
implementation has been found to collide with lookup_memtype() so I'm
looking ot replace it with something that consults with the pmem
driver.


^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, back to index

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-11-08  0:08 [PATCH 0/3] make pfn walker support ZONE_DEVICE Toshiki Fukasawa
2019-11-08  0:08 ` [PATCH 1/3] procfs: refactor kpage_*_read() in fs/proc/page.c Toshiki Fukasawa
2019-11-08  0:08 ` [PATCH 2/3] mm: Introduce subsection_dev_map Toshiki Fukasawa
2019-11-08 19:13   ` Dan Williams
2019-11-08  0:08 ` [PATCH 3/3] mm: make pfn walker support ZONE_DEVICE Toshiki Fukasawa
2019-11-09 17:08   ` kbuild test robot
2019-11-09 19:14   ` kbuild test robot
2019-11-08  9:18 ` [PATCH 0/3] " Michal Hocko
2019-11-11  8:00   ` Toshiki Fukasawa
2019-11-11 16:23     ` Dan Williams

Linux-mm Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/linux-mm/0 linux-mm/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 linux-mm linux-mm/ https://lore.kernel.org/linux-mm \
		linux-mm@kvack.org
	public-inbox-index linux-mm

Example config snippet for mirrors

Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.kvack.linux-mm


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git