linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 0/3] make pfn walker support ZONE_DEVICE
@ 2019-11-08  0:08 Toshiki Fukasawa
  2019-11-08  0:08 ` [PATCH 1/3] procfs: refactor kpage_*_read() in fs/proc/page.c Toshiki Fukasawa
                   ` (3 more replies)
  0 siblings, 4 replies; 22+ messages in thread
From: Toshiki Fukasawa @ 2019-11-08  0:08 UTC (permalink / raw)
  To: linux-mm, dan.j.williams
  Cc: linux-kernel, akpm, mhocko, adobriyan, hch, longman, sfr, mst,
	cai, Naoya Horiguchi, Junichi Nomura

This patch set tries to make pfn walker support ZONE_DEVICE.
This idea is from the TODO in below patch:

  commit aad5f69bc161af489dbb5934868bd347282f0764
  Author: David Hildenbrand <david@redhat.com>
  Date:   Fri Oct 18 20:19:20 2019 -0700

	fs/proc/page.c: don't access uninitialized memmaps in fs/proc/page.c

pfn walker's ZONE_DEVICE support requires capability to identify
that a memmap has been initialized. The uninitialized cases are 
as follows:

	a) pages reserved for ZONE_DEVICE driver
	b) pages currently initializing

This patch set solves both of them.

Toshiki Fukasawa (3):
  procfs: refactor kpage_*_read() in fs/proc/page.c
  mm: Introduce subsection_dev_map
  mm: make pfn walker support ZONE_DEVICE

 fs/proc/page.c           | 155 ++++++++++++++++++++---------------------------
 include/linux/memremap.h |   6 ++
 include/linux/mmzone.h   |  19 ++++++
 mm/memremap.c            |  31 ++++++++++
 mm/sparse.c              |  32 ++++++++++
 5 files changed, 154 insertions(+), 89 deletions(-)

-- 
1.8.3.1



^ permalink raw reply	[flat|nested] 22+ messages in thread

* [PATCH 1/3] procfs: refactor kpage_*_read() in fs/proc/page.c
  2019-11-08  0:08 [PATCH 0/3] make pfn walker support ZONE_DEVICE Toshiki Fukasawa
@ 2019-11-08  0:08 ` Toshiki Fukasawa
  2019-11-08  0:08 ` [PATCH 2/3] mm: Introduce subsection_dev_map Toshiki Fukasawa
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 22+ messages in thread
From: Toshiki Fukasawa @ 2019-11-08  0:08 UTC (permalink / raw)
  To: linux-mm, dan.j.williams
  Cc: linux-kernel, akpm, mhocko, adobriyan, hch, longman, sfr, mst,
	cai, Naoya Horiguchi, Junichi Nomura

kpagecount_read(), kpageflags_read(), and kpagecgroup_read()
have duplicate code. This patch moves it into a common function.

Signed-off-by: Toshiki Fukasawa <t-fukasawa@vx.jp.nec.com>
---
 fs/proc/page.c | 133 +++++++++++++++++++++------------------------------------
 1 file changed, 48 insertions(+), 85 deletions(-)

diff --git a/fs/proc/page.c b/fs/proc/page.c
index 7c952ee..a49b638 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -21,20 +21,19 @@
 #define KPMMASK (KPMSIZE - 1)
 #define KPMBITS (KPMSIZE * BITS_PER_BYTE)
 
-/* /proc/kpagecount - an array exposing page counts
- *
- * Each entry is a u64 representing the corresponding
- * physical page count.
+typedef u64 (*read_page_data_fn_t)(struct page *page);
+
+/*
+ * This is general function to read various data on pages.
  */
-static ssize_t kpagecount_read(struct file *file, char __user *buf,
-			     size_t count, loff_t *ppos)
+static ssize_t kpage_common_read(struct file *file, char __user *buf,
+		size_t count, loff_t *ppos, read_page_data_fn_t read_fn)
 {
 	u64 __user *out = (u64 __user *)buf;
 	struct page *ppage;
 	unsigned long src = *ppos;
 	unsigned long pfn;
 	ssize_t ret = 0;
-	u64 pcount;
 
 	pfn = src / KPMSIZE;
 	count = min_t(size_t, count, (max_pfn * KPMSIZE) - src);
@@ -48,12 +47,7 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf,
 		 */
 		ppage = pfn_to_online_page(pfn);
 
-		if (!ppage || PageSlab(ppage) || page_has_type(ppage))
-			pcount = 0;
-		else
-			pcount = page_mapcount(ppage);
-
-		if (put_user(pcount, out)) {
+		if (put_user(read_fn(ppage), out)) {
 			ret = -EFAULT;
 			break;
 		}
@@ -71,6 +65,30 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf,
 	return ret;
 }
 
+/* /proc/kpagecount - an array exposing page counts
+ *
+ * Each entry is a u64 representing the corresponding
+ * physical page count.
+ */
+
+static u64 page_count_data(struct page *page)
+{
+	u64 pcount;
+
+	if (!page || PageSlab(page) || page_has_type(page))
+		pcount = 0;
+	else
+		pcount = page_mapcount(page);
+
+	return pcount;
+}
+
+static ssize_t kpagecount_read(struct file *file, char __user *buf,
+			     size_t count, loff_t *ppos)
+{
+	return kpage_common_read(file, buf, count, ppos, page_count_data);
+}
+
 static const struct file_operations proc_kpagecount_operations = {
 	.llseek = mem_lseek,
 	.read = kpagecount_read,
@@ -203,43 +221,15 @@ u64 stable_page_flags(struct page *page)
 	return u;
 };
 
+static u64 page_flags_data(struct page *page)
+{
+	return stable_page_flags(page);
+}
+
 static ssize_t kpageflags_read(struct file *file, char __user *buf,
 			     size_t count, loff_t *ppos)
 {
-	u64 __user *out = (u64 __user *)buf;
-	struct page *ppage;
-	unsigned long src = *ppos;
-	unsigned long pfn;
-	ssize_t ret = 0;
-
-	pfn = src / KPMSIZE;
-	count = min_t(unsigned long, count, (max_pfn * KPMSIZE) - src);
-	if (src & KPMMASK || count & KPMMASK)
-		return -EINVAL;
-
-	while (count > 0) {
-		/*
-		 * TODO: ZONE_DEVICE support requires to identify
-		 * memmaps that were actually initialized.
-		 */
-		ppage = pfn_to_online_page(pfn);
-
-		if (put_user(stable_page_flags(ppage), out)) {
-			ret = -EFAULT;
-			break;
-		}
-
-		pfn++;
-		out++;
-		count -= KPMSIZE;
-
-		cond_resched();
-	}
-
-	*ppos += (char __user *)out - buf;
-	if (!ret)
-		ret = (char __user *)out - buf;
-	return ret;
+	return kpage_common_read(file, buf, count, ppos, page_flags_data);
 }
 
 static const struct file_operations proc_kpageflags_operations = {
@@ -248,49 +238,22 @@ static ssize_t kpageflags_read(struct file *file, char __user *buf,
 };
 
 #ifdef CONFIG_MEMCG
-static ssize_t kpagecgroup_read(struct file *file, char __user *buf,
-				size_t count, loff_t *ppos)
+static u64 page_cgroup_data(struct page *page)
 {
-	u64 __user *out = (u64 __user *)buf;
-	struct page *ppage;
-	unsigned long src = *ppos;
-	unsigned long pfn;
-	ssize_t ret = 0;
 	u64 ino;
 
-	pfn = src / KPMSIZE;
-	count = min_t(unsigned long, count, (max_pfn * KPMSIZE) - src);
-	if (src & KPMMASK || count & KPMMASK)
-		return -EINVAL;
-
-	while (count > 0) {
-		/*
-		 * TODO: ZONE_DEVICE support requires to identify
-		 * memmaps that were actually initialized.
-		 */
-		ppage = pfn_to_online_page(pfn);
-
-		if (ppage)
-			ino = page_cgroup_ino(ppage);
-		else
-			ino = 0;
-
-		if (put_user(ino, out)) {
-			ret = -EFAULT;
-			break;
-		}
-
-		pfn++;
-		out++;
-		count -= KPMSIZE;
+	if (page)
+		ino = page_cgroup_ino(page);
+	else
+		ino = 0;
 
-		cond_resched();
-	}
+	return ino;
+}
 
-	*ppos += (char __user *)out - buf;
-	if (!ret)
-		ret = (char __user *)out - buf;
-	return ret;
+static ssize_t kpagecgroup_read(struct file *file, char __user *buf,
+				size_t count, loff_t *ppos)
+{
+	return kpage_common_read(file, buf, count, ppos, page_cgroup_data);
 }
 
 static const struct file_operations proc_kpagecgroup_operations = {
-- 
1.8.3.1



^ permalink raw reply related	[flat|nested] 22+ messages in thread

* [PATCH 2/3] mm: Introduce subsection_dev_map
  2019-11-08  0:08 [PATCH 0/3] make pfn walker support ZONE_DEVICE Toshiki Fukasawa
  2019-11-08  0:08 ` [PATCH 1/3] procfs: refactor kpage_*_read() in fs/proc/page.c Toshiki Fukasawa
@ 2019-11-08  0:08 ` Toshiki Fukasawa
  2019-11-08 19:13   ` Dan Williams
  2019-11-08  0:08 ` [PATCH 3/3] mm: make pfn walker support ZONE_DEVICE Toshiki Fukasawa
  2019-11-08  9:18 ` [PATCH 0/3] " Michal Hocko
  3 siblings, 1 reply; 22+ messages in thread
From: Toshiki Fukasawa @ 2019-11-08  0:08 UTC (permalink / raw)
  To: linux-mm, dan.j.williams
  Cc: linux-kernel, akpm, mhocko, adobriyan, hch, longman, sfr, mst,
	cai, Naoya Horiguchi, Junichi Nomura

Currently, there is no way to identify pfn on ZONE_DEVICE.
Identifying pfn on system memory can be done by using a
section-level flag. On the other hand, identifying pfn on
ZONE_DEVICE requires a subsection-level flag since ZONE_DEVICE
can be created in units of subsections.

This patch introduces a new bitmap subsection_dev_map so that
we can identify pfn on ZONE_DEVICE.

Also, subsection_dev_map is used to prove that struct pages
included in the subsection have been initialized since it is
set after memmap_init_zone_device(). We can avoid accessing
pages currently being initialized by checking subsection_dev_map.

Signed-off-by: Toshiki Fukasawa <t-fukasawa@vx.jp.nec.com>
---
 include/linux/mmzone.h | 19 +++++++++++++++++++
 mm/memremap.c          |  2 ++
 mm/sparse.c            | 32 ++++++++++++++++++++++++++++++++
 3 files changed, 53 insertions(+)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index bda2028..11376c4 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1174,11 +1174,17 @@ static inline unsigned long section_nr_to_pfn(unsigned long sec)
 
 struct mem_section_usage {
 	DECLARE_BITMAP(subsection_map, SUBSECTIONS_PER_SECTION);
+#ifdef CONFIG_ZONE_DEVICE
+	DECLARE_BITMAP(subsection_dev_map, SUBSECTIONS_PER_SECTION);
+#endif
 	/* See declaration of similar field in struct zone */
 	unsigned long pageblock_flags[0];
 };
 
 void subsection_map_init(unsigned long pfn, unsigned long nr_pages);
+#ifdef CONFIG_ZONE_DEVICE
+void subsections_mark_device(unsigned long start_pfn, unsigned long size);
+#endif
 
 struct page;
 struct page_ext;
@@ -1367,6 +1373,19 @@ static inline int pfn_present(unsigned long pfn)
 	return present_section(__nr_to_section(pfn_to_section_nr(pfn)));
 }
 
+static inline int pfn_zone_device(unsigned long pfn)
+{
+#ifdef CONFIG_ZONE_DEVICE
+	if (pfn_valid(pfn)) {
+		struct mem_section *ms = __pfn_to_section(pfn);
+		int idx = subsection_map_index(pfn);
+
+		return test_bit(idx, ms->usage->subsection_dev_map);
+	}
+#endif
+	return 0;
+}
+
 /*
  * These are _only_ used during initialisation, therefore they
  * can use __initdata ...  They could have names to indicate
diff --git a/mm/memremap.c b/mm/memremap.c
index 03ccbdf..8a97fd4 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -303,6 +303,8 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid)
 	memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
 				PHYS_PFN(res->start),
 				PHYS_PFN(resource_size(res)), pgmap);
+	subsections_mark_device(PHYS_PFN(res->start),
+				PHYS_PFN(resource_size(res)));
 	percpu_ref_get_many(pgmap->ref, pfn_end(pgmap) - pfn_first(pgmap));
 	return __va(res->start);
 
diff --git a/mm/sparse.c b/mm/sparse.c
index f6891c1..a3fc9e0a 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -603,6 +603,31 @@ void __init sparse_init(void)
 	vmemmap_populate_print_last();
 }
 
+#ifdef CONFIG_ZONE_DEVICE
+void subsections_mark_device(unsigned long start_pfn, unsigned long size)
+{
+	struct mem_section *ms;
+	unsigned long *dev_map;
+	unsigned long sec, start_sec, end_sec, pfns;
+
+	start_sec = pfn_to_section_nr(start_pfn);
+	end_sec = pfn_to_section_nr(start_pfn + size - 1);
+	for (sec = start_sec; sec <= end_sec;
+	     sec++, start_pfn += pfns, size -= pfns) {
+		pfns = min(size, PAGES_PER_SECTION
+			- (start_pfn & ~PAGE_SECTION_MASK));
+		if (WARN_ON(!valid_section_nr(sec)))
+			continue;
+		ms = __pfn_to_section(start_pfn);
+		if (!ms->usage)
+			continue;
+
+		dev_map = &ms->usage->subsection_dev_map[0];
+		subsection_mask_set(dev_map, start_pfn, pfns);
+	}
+}
+#endif
+
 #ifdef CONFIG_MEMORY_HOTPLUG
 
 /* Mark all memory sections within the pfn range as online */
@@ -782,7 +807,14 @@ static void section_deactivate(unsigned long pfn, unsigned long nr_pages,
 		memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
 		ms->section_mem_map = sparse_encode_mem_map(NULL, section_nr);
 	}
+#ifdef CONFIG_ZONE_DEVICE
+	/* deactivation of a partial section on ZONE_DEVICE */
+	if (ms->usage) {
+		unsigned long *dev_map = &ms->usage->subsection_dev_map[0];
 
+		bitmap_andnot(dev_map, dev_map, map, SUBSECTIONS_PER_SECTION);
+	}
+#endif
 	if (section_is_early && memmap)
 		free_map_bootmem(memmap);
 	else
-- 
1.8.3.1



^ permalink raw reply related	[flat|nested] 22+ messages in thread

* [PATCH 3/3] mm: make pfn walker support ZONE_DEVICE
  2019-11-08  0:08 [PATCH 0/3] make pfn walker support ZONE_DEVICE Toshiki Fukasawa
  2019-11-08  0:08 ` [PATCH 1/3] procfs: refactor kpage_*_read() in fs/proc/page.c Toshiki Fukasawa
  2019-11-08  0:08 ` [PATCH 2/3] mm: Introduce subsection_dev_map Toshiki Fukasawa
@ 2019-11-08  0:08 ` Toshiki Fukasawa
  2019-11-09 17:08   ` kbuild test robot
  2019-11-09 19:14   ` kbuild test robot
  2019-11-08  9:18 ` [PATCH 0/3] " Michal Hocko
  3 siblings, 2 replies; 22+ messages in thread
From: Toshiki Fukasawa @ 2019-11-08  0:08 UTC (permalink / raw)
  To: linux-mm, dan.j.williams
  Cc: linux-kernel, akpm, mhocko, adobriyan, hch, longman, sfr, mst,
	cai, Naoya Horiguchi, Junichi Nomura

This patch allows pfn walker to read pages on ZONE_DEVICE.
There are the following notes:

	a) The reserved pages indicated by vmem_altmap->reserve
	   are uninitialized, so it must be skipped to read.
	b) To get vmem_altmap, we need to use get_dev_pagemap(),
	   but doing it for all pfns is too slow.

This patch solves both of them. Since vmem_altmap could reserve
only first few pages, we can reduce the number of checks by
counting sequential valid pages.

Signed-off-by: Toshiki Fukasawa <t-fukasawa@vx.jp.nec.com>
---
 fs/proc/page.c           | 22 ++++++++++++++++++----
 include/linux/memremap.h |  6 ++++++
 mm/memremap.c            | 29 +++++++++++++++++++++++++++++
 3 files changed, 53 insertions(+), 4 deletions(-)

diff --git a/fs/proc/page.c b/fs/proc/page.c
index a49b638..b6241ea 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -33,6 +33,7 @@ static ssize_t kpage_common_read(struct file *file, char __user *buf,
 	struct page *ppage;
 	unsigned long src = *ppos;
 	unsigned long pfn;
+	unsigned long valid_pages = 0;
 	ssize_t ret = 0;
 
 	pfn = src / KPMSIZE;
@@ -41,11 +42,24 @@ static ssize_t kpage_common_read(struct file *file, char __user *buf,
 		return -EINVAL;
 
 	while (count > 0) {
-		/*
-		 * TODO: ZONE_DEVICE support requires to identify
-		 * memmaps that were actually initialized.
-		 */
 		ppage = pfn_to_online_page(pfn);
+		if (!ppage && pfn_zone_device(pfn)) {
+			/*
+			 * Skip to read first few uninitialized pages on
+			 * ZONE_DEVICE. And count valid pages starting
+			 * with the pfn so that minimize the number of
+			 * calls to nr_valid_pages_zone_device().
+			 */
+			if (!valid_pages)
+				valid_pages = nr_valid_pages_zone_device(pfn);
+			if (valid_pages) {
+				ppage = pfn_to_page(pfn);
+				valid_pages--;
+			}
+		} else if (valid_pages) {
+			/* ZONE_DEVICE has been hot removed */
+			valid_pages = 0;
+		}
 
 		if (put_user(read_fn(ppage), out)) {
 			ret = -EFAULT;
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index 6fefb09..d111ae3 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -123,6 +123,7 @@ static inline struct vmem_altmap *pgmap_altmap(struct dev_pagemap *pgmap)
 }
 
 #ifdef CONFIG_ZONE_DEVICE
+unsigned long nr_valid_pages_zone_device(unsigned long pfn);
 void *memremap_pages(struct dev_pagemap *pgmap, int nid);
 void memunmap_pages(struct dev_pagemap *pgmap);
 void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap);
@@ -133,6 +134,11 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
 unsigned long vmem_altmap_offset(struct vmem_altmap *altmap);
 void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns);
 #else
+static inline unsigned long nr_valid_pages_zone_device(unsigned long pfn)
+{
+	return 0;
+}
+
 static inline void *devm_memremap_pages(struct device *dev,
 		struct dev_pagemap *pgmap)
 {
diff --git a/mm/memremap.c b/mm/memremap.c
index 8a97fd4..307c73e 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -73,6 +73,35 @@ static unsigned long pfn_next(unsigned long pfn)
 	return pfn + 1;
 }
 
+/*
+ * This returns the number of sequential valid pages starting from @pfn
+ * on ZONE_DEVICE. The invalid pages reserved by driver is first few
+ * pages on ZONE_DEVICE.
+ */
+unsigned long nr_valid_pages_zone_device(unsigned long pfn)
+{
+	struct dev_pagemap *pgmap;
+	struct vmem_altmap *altmap;
+	unsigned long pages;
+
+	pgmap = get_dev_pagemap(pfn, NULL);
+	if (!pgmap)
+		return 0;
+	altmap = pgmap_altmap(pgmap);
+	if (altmap && pfn < (altmap->base_pfn + altmap->reserve))
+		pages = 0;
+	else
+		/*
+		 * PHYS_PFN(pgmap->res.end) is end pfn on pgmap
+		 * (not start pfn on next mapping).
+		 */
+		pages = PHYS_PFN(pgmap->res.end) - pfn + 1;
+
+	put_dev_pagemap(pgmap);
+
+	return pages;
+}
+
 #define for_each_device_pfn(pfn, map) \
 	for (pfn = pfn_first(map); pfn < pfn_end(map); pfn = pfn_next(pfn))
 
-- 
1.8.3.1



^ permalink raw reply related	[flat|nested] 22+ messages in thread

* Re: [PATCH 0/3] make pfn walker support ZONE_DEVICE
  2019-11-08  0:08 [PATCH 0/3] make pfn walker support ZONE_DEVICE Toshiki Fukasawa
                   ` (2 preceding siblings ...)
  2019-11-08  0:08 ` [PATCH 3/3] mm: make pfn walker support ZONE_DEVICE Toshiki Fukasawa
@ 2019-11-08  9:18 ` Michal Hocko
  2019-11-11  8:00   ` Toshiki Fukasawa
  3 siblings, 1 reply; 22+ messages in thread
From: Michal Hocko @ 2019-11-08  9:18 UTC (permalink / raw)
  To: Toshiki Fukasawa
  Cc: linux-mm, dan.j.williams, linux-kernel, akpm, adobriyan, hch,
	longman, sfr, mst, cai, Naoya Horiguchi, Junichi Nomura

On Fri 08-11-19 00:08:03, Toshiki Fukasawa wrote:
> This patch set tries to make pfn walker support ZONE_DEVICE.
> This idea is from the TODO in below patch:
> 
>   commit aad5f69bc161af489dbb5934868bd347282f0764
>   Author: David Hildenbrand <david@redhat.com>
>   Date:   Fri Oct 18 20:19:20 2019 -0700
> 
> 	fs/proc/page.c: don't access uninitialized memmaps in fs/proc/page.c
> 
> pfn walker's ZONE_DEVICE support requires capability to identify
> that a memmap has been initialized. The uninitialized cases are 
> as follows:
> 
> 	a) pages reserved for ZONE_DEVICE driver
> 	b) pages currently initializing
> 
> This patch set solves both of them.

Why do we want this? What is the usecase?

> 
> Toshiki Fukasawa (3):
>   procfs: refactor kpage_*_read() in fs/proc/page.c
>   mm: Introduce subsection_dev_map
>   mm: make pfn walker support ZONE_DEVICE
> 
>  fs/proc/page.c           | 155 ++++++++++++++++++++---------------------------
>  include/linux/memremap.h |   6 ++
>  include/linux/mmzone.h   |  19 ++++++
>  mm/memremap.c            |  31 ++++++++++
>  mm/sparse.c              |  32 ++++++++++
>  5 files changed, 154 insertions(+), 89 deletions(-)
> 
> -- 
> 1.8.3.1
> 

-- 
Michal Hocko
SUSE Labs


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 2/3] mm: Introduce subsection_dev_map
  2019-11-08  0:08 ` [PATCH 2/3] mm: Introduce subsection_dev_map Toshiki Fukasawa
@ 2019-11-08 19:13   ` Dan Williams
  2019-11-13 18:51     ` David Hildenbrand
  0 siblings, 1 reply; 22+ messages in thread
From: Dan Williams @ 2019-11-08 19:13 UTC (permalink / raw)
  To: Toshiki Fukasawa
  Cc: linux-mm, linux-kernel, akpm, mhocko, adobriyan, hch, longman,
	sfr, mst, cai, Naoya Horiguchi, Junichi Nomura

On Thu, Nov 7, 2019 at 4:15 PM Toshiki Fukasawa
<t-fukasawa@vx.jp.nec.com> wrote:
>
> Currently, there is no way to identify pfn on ZONE_DEVICE.
> Identifying pfn on system memory can be done by using a
> section-level flag. On the other hand, identifying pfn on
> ZONE_DEVICE requires a subsection-level flag since ZONE_DEVICE
> can be created in units of subsections.
>
> This patch introduces a new bitmap subsection_dev_map so that
> we can identify pfn on ZONE_DEVICE.
>
> Also, subsection_dev_map is used to prove that struct pages
> included in the subsection have been initialized since it is
> set after memmap_init_zone_device(). We can avoid accessing
> pages currently being initialized by checking subsection_dev_map.
>
> Signed-off-by: Toshiki Fukasawa <t-fukasawa@vx.jp.nec.com>
> ---
>  include/linux/mmzone.h | 19 +++++++++++++++++++
>  mm/memremap.c          |  2 ++
>  mm/sparse.c            | 32 ++++++++++++++++++++++++++++++++
>  3 files changed, 53 insertions(+)
>
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index bda2028..11376c4 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -1174,11 +1174,17 @@ static inline unsigned long section_nr_to_pfn(unsigned long sec)
>
>  struct mem_section_usage {
>         DECLARE_BITMAP(subsection_map, SUBSECTIONS_PER_SECTION);
> +#ifdef CONFIG_ZONE_DEVICE
> +       DECLARE_BITMAP(subsection_dev_map, SUBSECTIONS_PER_SECTION);
> +#endif

Hi Toshiki,

There is currently an effort to remove the PageReserved() flag as some
code is using that to detect ZONE_DEVICE. In reviewing those patches
we realized that what many code paths want is to detect online memory.
So instead of a subsection_dev_map add a subsection_online_map. That
way pfn_to_online_page() can reliably avoid ZONE_DEVICE ranges. I
otherwise question the use case for pfn_walkers to return pages for
ZONE_DEVICE pages, I think the skip behavior when pfn_to_online_page()
== false is the right behavior.


>         /* See declaration of similar field in struct zone */
>         unsigned long pageblock_flags[0];
>  };
>
>  void subsection_map_init(unsigned long pfn, unsigned long nr_pages);
> +#ifdef CONFIG_ZONE_DEVICE
> +void subsections_mark_device(unsigned long start_pfn, unsigned long size);
> +#endif
>
>  struct page;
>  struct page_ext;
> @@ -1367,6 +1373,19 @@ static inline int pfn_present(unsigned long pfn)
>         return present_section(__nr_to_section(pfn_to_section_nr(pfn)));
>  }
>
> +static inline int pfn_zone_device(unsigned long pfn)
> +{
> +#ifdef CONFIG_ZONE_DEVICE
> +       if (pfn_valid(pfn)) {
> +               struct mem_section *ms = __pfn_to_section(pfn);
> +               int idx = subsection_map_index(pfn);
> +
> +               return test_bit(idx, ms->usage->subsection_dev_map);
> +       }
> +#endif
> +       return 0;
> +}
> +
>  /*
>   * These are _only_ used during initialisation, therefore they
>   * can use __initdata ...  They could have names to indicate
> diff --git a/mm/memremap.c b/mm/memremap.c
> index 03ccbdf..8a97fd4 100644
> --- a/mm/memremap.c
> +++ b/mm/memremap.c
> @@ -303,6 +303,8 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid)
>         memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
>                                 PHYS_PFN(res->start),
>                                 PHYS_PFN(resource_size(res)), pgmap);
> +       subsections_mark_device(PHYS_PFN(res->start),
> +                               PHYS_PFN(resource_size(res)));
>         percpu_ref_get_many(pgmap->ref, pfn_end(pgmap) - pfn_first(pgmap));
>         return __va(res->start);
>
> diff --git a/mm/sparse.c b/mm/sparse.c
> index f6891c1..a3fc9e0a 100644
> --- a/mm/sparse.c
> +++ b/mm/sparse.c
> @@ -603,6 +603,31 @@ void __init sparse_init(void)
>         vmemmap_populate_print_last();
>  }
>
> +#ifdef CONFIG_ZONE_DEVICE
> +void subsections_mark_device(unsigned long start_pfn, unsigned long size)
> +{
> +       struct mem_section *ms;
> +       unsigned long *dev_map;
> +       unsigned long sec, start_sec, end_sec, pfns;
> +
> +       start_sec = pfn_to_section_nr(start_pfn);
> +       end_sec = pfn_to_section_nr(start_pfn + size - 1);
> +       for (sec = start_sec; sec <= end_sec;
> +            sec++, start_pfn += pfns, size -= pfns) {
> +               pfns = min(size, PAGES_PER_SECTION
> +                       - (start_pfn & ~PAGE_SECTION_MASK));
> +               if (WARN_ON(!valid_section_nr(sec)))
> +                       continue;
> +               ms = __pfn_to_section(start_pfn);
> +               if (!ms->usage)
> +                       continue;
> +
> +               dev_map = &ms->usage->subsection_dev_map[0];
> +               subsection_mask_set(dev_map, start_pfn, pfns);
> +       }
> +}
> +#endif
> +
>  #ifdef CONFIG_MEMORY_HOTPLUG
>
>  /* Mark all memory sections within the pfn range as online */
> @@ -782,7 +807,14 @@ static void section_deactivate(unsigned long pfn, unsigned long nr_pages,
>                 memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
>                 ms->section_mem_map = sparse_encode_mem_map(NULL, section_nr);
>         }
> +#ifdef CONFIG_ZONE_DEVICE
> +       /* deactivation of a partial section on ZONE_DEVICE */
> +       if (ms->usage) {
> +               unsigned long *dev_map = &ms->usage->subsection_dev_map[0];
>
> +               bitmap_andnot(dev_map, dev_map, map, SUBSECTIONS_PER_SECTION);
> +       }
> +#endif
>         if (section_is_early && memmap)
>                 free_map_bootmem(memmap);
>         else
> --
> 1.8.3.1
>


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 3/3] mm: make pfn walker support ZONE_DEVICE
  2019-11-08  0:08 ` [PATCH 3/3] mm: make pfn walker support ZONE_DEVICE Toshiki Fukasawa
@ 2019-11-09 17:08   ` kbuild test robot
  2019-11-09 19:14   ` kbuild test robot
  1 sibling, 0 replies; 22+ messages in thread
From: kbuild test robot @ 2019-11-09 17:08 UTC (permalink / raw)
  To: Toshiki Fukasawa
  Cc: kbuild-all, linux-mm, dan.j.williams, linux-kernel, akpm, mhocko,
	adobriyan, hch, longman, sfr, mst, cai, Naoya Horiguchi,
	Junichi Nomura

[-- Attachment #1: Type: text/plain, Size: 3187 bytes --]

Hi Toshiki,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on linus/master]
[also build test ERROR on v5.4-rc6]
[cannot apply to next-20191108]
[if your patch is applied to the wrong git tree, please drop us a note to help
improve the system. BTW, we also suggest to use '--base' option to specify the
base tree in git format-patch, please see https://stackoverflow.com/a/37406982]

url:    https://github.com/0day-ci/linux/commits/Toshiki-Fukasawa/make-pfn-walker-support-ZONE_DEVICE/20191110-000508
base:   https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git 0058b0a506e40d9a2c62015fe92eb64a44d78cd9
config: i386-defconfig (attached as .config)
compiler: gcc-7 (Debian 7.4.0-14) 7.4.0
reproduce:
        # save the attached .config to linux build tree
        make ARCH=i386 

If you fix the issue, kindly add following tag
Reported-by: kbuild test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   fs/proc/page.c: In function 'kpage_common_read':
>> fs/proc/page.c:46:17: error: implicit declaration of function 'pfn_zone_device'; did you mean 'bus_find_device'? [-Werror=implicit-function-declaration]
      if (!ppage && pfn_zone_device(pfn)) {
                    ^~~~~~~~~~~~~~~
                    bus_find_device
   cc1: some warnings being treated as errors

vim +46 fs/proc/page.c

    25	
    26	/*
    27	 * This is general function to read various data on pages.
    28	 */
    29	static ssize_t kpage_common_read(struct file *file, char __user *buf,
    30			size_t count, loff_t *ppos, read_page_data_fn_t read_fn)
    31	{
    32		u64 __user *out = (u64 __user *)buf;
    33		struct page *ppage;
    34		unsigned long src = *ppos;
    35		unsigned long pfn;
    36		unsigned long valid_pages = 0;
    37		ssize_t ret = 0;
    38	
    39		pfn = src / KPMSIZE;
    40		count = min_t(size_t, count, (max_pfn * KPMSIZE) - src);
    41		if (src & KPMMASK || count & KPMMASK)
    42			return -EINVAL;
    43	
    44		while (count > 0) {
    45			ppage = pfn_to_online_page(pfn);
  > 46			if (!ppage && pfn_zone_device(pfn)) {
    47				/*
    48				 * Skip to read first few uninitialized pages on
    49				 * ZONE_DEVICE. And count valid pages starting
    50				 * with the pfn so that minimize the number of
    51				 * calls to nr_valid_pages_zone_device().
    52				 */
    53				if (!valid_pages)
    54					valid_pages = nr_valid_pages_zone_device(pfn);
    55				if (valid_pages) {
    56					ppage = pfn_to_page(pfn);
    57					valid_pages--;
    58				}
    59			} else if (valid_pages) {
    60				/* ZONE_DEVICE has been hot removed */
    61				valid_pages = 0;
    62			}
    63	
    64			if (put_user(read_fn(ppage), out)) {
    65				ret = -EFAULT;
    66				break;
    67			}
    68	
    69			pfn++;
    70			out++;
    71			count -= KPMSIZE;
    72	
    73			cond_resched();
    74		}
    75	
    76		*ppos += (char __user *)out - buf;
    77		if (!ret)
    78			ret = (char __user *)out - buf;
    79		return ret;
    80	}
    81	

---
0-DAY kernel test infrastructure                 Open Source Technology Center
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 28148 bytes --]

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 3/3] mm: make pfn walker support ZONE_DEVICE
  2019-11-08  0:08 ` [PATCH 3/3] mm: make pfn walker support ZONE_DEVICE Toshiki Fukasawa
  2019-11-09 17:08   ` kbuild test robot
@ 2019-11-09 19:14   ` kbuild test robot
  1 sibling, 0 replies; 22+ messages in thread
From: kbuild test robot @ 2019-11-09 19:14 UTC (permalink / raw)
  To: Toshiki Fukasawa
  Cc: kbuild-all, linux-mm, dan.j.williams, linux-kernel, akpm, mhocko,
	adobriyan, hch, longman, sfr, mst, cai, Naoya Horiguchi,
	Junichi Nomura

[-- Attachment #1: Type: text/plain, Size: 5542 bytes --]

Hi Toshiki,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on linus/master]
[cannot apply to v5.4-rc6 next-20191108]
[if your patch is applied to the wrong git tree, please drop us a note to help
improve the system. BTW, we also suggest to use '--base' option to specify the
base tree in git format-patch, please see https://stackoverflow.com/a/37406982]

url:    https://github.com/0day-ci/linux/commits/Toshiki-Fukasawa/make-pfn-walker-support-ZONE_DEVICE/20191110-000508
base:   https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git 0058b0a506e40d9a2c62015fe92eb64a44d78cd9
config: i386-randconfig-b002-201945 (attached as .config)
compiler: gcc-7 (Debian 7.4.0-14) 7.4.0
reproduce:
        # save the attached .config to linux build tree
        make ARCH=i386 

If you fix the issue, kindly add following tag
Reported-by: kbuild test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   fs/proc/page.c: In function 'kpage_common_read':
>> fs/proc/page.c:46:17: error: implicit declaration of function 'pfn_zone_device'; did you mean 'pgprot_device'? [-Werror=implicit-function-declaration]
      if (!ppage && pfn_zone_device(pfn)) {
                    ^~~~~~~~~~~~~~~
                    pgprot_device
   Cyclomatic Complexity 5 include/linux/compiler.h:__read_once_size
   Cyclomatic Complexity 1 include/linux/kasan-checks.h:kasan_check_read
   Cyclomatic Complexity 1 arch/x86/include/asm/bitops.h:constant_test_bit
   Cyclomatic Complexity 1 arch/x86/include/asm/bitops.h:variable_test_bit
   Cyclomatic Complexity 1 arch/x86/include/asm/atomic.h:arch_atomic_read
   Cyclomatic Complexity 1 include/asm-generic/atomic-instrumented.h:atomic_read
   Cyclomatic Complexity 1 include/linux/page-flags.h:PageTail
   Cyclomatic Complexity 3 include/linux/page-flags.h:PageCompound
   Cyclomatic Complexity 1 include/linux/page-flags.h:PageLRU
   Cyclomatic Complexity 1 include/linux/page-flags.h:PageSlab
   Cyclomatic Complexity 1 include/linux/page-flags.h:PageSwapCache
   Cyclomatic Complexity 1 include/linux/page-flags.h:PageAnon
   Cyclomatic Complexity 1 include/linux/page-flags.h:PageKsm
   Cyclomatic Complexity 1 include/linux/page-flags.h:PageHead
   Cyclomatic Complexity 1 include/linux/page-flags.h:PageTransCompound
   Cyclomatic Complexity 1 include/linux/page-flags.h:page_has_type
   Cyclomatic Complexity 1 include/linux/page-flags.h:PageBuddy
   Cyclomatic Complexity 1 include/linux/page-flags.h:PageOffline
   Cyclomatic Complexity 1 include/linux/page-flags.h:PageTable
   Cyclomatic Complexity 1 include/linux/memremap.h:nr_valid_pages_zone_device
   Cyclomatic Complexity 1 include/linux/sched.h:_cond_resched
   Cyclomatic Complexity 1 include/asm-generic/pgtable.h:is_zero_pfn
   Cyclomatic Complexity 1 include/linux/huge_mm.h:is_huge_zero_page
   Cyclomatic Complexity 1 fs/proc/page.c:kpf_copy_bit
   Cyclomatic Complexity 2 include/asm-generic/bitops-instrumented.h:test_bit
   Cyclomatic Complexity 2 include/linux/page_idle.h:page_is_idle
   Cyclomatic Complexity 2 include/linux/page-flags.h:compound_head
   Cyclomatic Complexity 1 include/linux/page_ref.h:page_count
   Cyclomatic Complexity 1 fs/proc/page.c:proc_page_init
   Cyclomatic Complexity 12 fs/proc/page.c:kpage_common_read
   Cyclomatic Complexity 1 fs/proc/page.c:kpageflags_read
   Cyclomatic Complexity 1 fs/proc/page.c:kpagecount_read
   Cyclomatic Complexity 2 include/linux/mm.h:page_mapcount
   Cyclomatic Complexity 4 fs/proc/page.c:page_count_data
   Cyclomatic Complexity 23 fs/proc/page.c:stable_page_flags
   Cyclomatic Complexity 1 fs/proc/page.c:page_flags_data
   cc1: some warnings being treated as errors

vim +46 fs/proc/page.c

    25	
    26	/*
    27	 * This is general function to read various data on pages.
    28	 */
    29	static ssize_t kpage_common_read(struct file *file, char __user *buf,
    30			size_t count, loff_t *ppos, read_page_data_fn_t read_fn)
    31	{
    32		u64 __user *out = (u64 __user *)buf;
    33		struct page *ppage;
    34		unsigned long src = *ppos;
    35		unsigned long pfn;
    36		unsigned long valid_pages = 0;
    37		ssize_t ret = 0;
    38	
    39		pfn = src / KPMSIZE;
    40		count = min_t(size_t, count, (max_pfn * KPMSIZE) - src);
    41		if (src & KPMMASK || count & KPMMASK)
    42			return -EINVAL;
    43	
    44		while (count > 0) {
    45			ppage = pfn_to_online_page(pfn);
  > 46			if (!ppage && pfn_zone_device(pfn)) {
    47				/*
    48				 * Skip to read first few uninitialized pages on
    49				 * ZONE_DEVICE. And count valid pages starting
    50				 * with the pfn so that minimize the number of
    51				 * calls to nr_valid_pages_zone_device().
    52				 */
    53				if (!valid_pages)
    54					valid_pages = nr_valid_pages_zone_device(pfn);
    55				if (valid_pages) {
    56					ppage = pfn_to_page(pfn);
    57					valid_pages--;
    58				}
    59			} else if (valid_pages) {
    60				/* ZONE_DEVICE has been hot removed */
    61				valid_pages = 0;
    62			}
    63	
    64			if (put_user(read_fn(ppage), out)) {
    65				ret = -EFAULT;
    66				break;
    67			}
    68	
    69			pfn++;
    70			out++;
    71			count -= KPMSIZE;
    72	
    73			cond_resched();
    74		}
    75	
    76		*ppos += (char __user *)out - buf;
    77		if (!ret)
    78			ret = (char __user *)out - buf;
    79		return ret;
    80	}
    81	

---
0-DAY kernel test infrastructure                 Open Source Technology Center
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 28178 bytes --]

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 0/3] make pfn walker support ZONE_DEVICE
  2019-11-08  9:18 ` [PATCH 0/3] " Michal Hocko
@ 2019-11-11  8:00   ` Toshiki Fukasawa
  2019-11-11 16:23     ` Dan Williams
  0 siblings, 1 reply; 22+ messages in thread
From: Toshiki Fukasawa @ 2019-11-11  8:00 UTC (permalink / raw)
  To: Michal Hocko, dan.j.williams
  Cc: linux-mm, linux-kernel, akpm, adobriyan, hch, longman, sfr, mst,
	cai, Naoya Horiguchi, Junichi Nomura

On 2019/11/08 18:18, Michal Hocko wrote:
> On Fri 08-11-19 00:08:03, Toshiki Fukasawa wrote:
>> This patch set tries to make pfn walker support ZONE_DEVICE.
>> This idea is from the TODO in below patch:
>>
>>    commit aad5f69bc161af489dbb5934868bd347282f0764
>>    Author: David Hildenbrand <david@redhat.com>
>>    Date:   Fri Oct 18 20:19:20 2019 -0700
>>
>> 	fs/proc/page.c: don't access uninitialized memmaps in fs/proc/page.c
>>
>> pfn walker's ZONE_DEVICE support requires capability to identify
>> that a memmap has been initialized. The uninitialized cases are
>> as follows:
>>
>> 	a) pages reserved for ZONE_DEVICE driver
>> 	b) pages currently initializing
>>
>> This patch set solves both of them.
> 
> Why do we want this? What is the usecase?

We are writing a test program for hwpoison, which is a use case.
Without this patch, we can't see the HWPOISON flag on the
ZONE_DEVICE page.

Thanks,
Toshiki Fukasawa


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 0/3] make pfn walker support ZONE_DEVICE
  2019-11-11  8:00   ` Toshiki Fukasawa
@ 2019-11-11 16:23     ` Dan Williams
  0 siblings, 0 replies; 22+ messages in thread
From: Dan Williams @ 2019-11-11 16:23 UTC (permalink / raw)
  To: Toshiki Fukasawa
  Cc: Michal Hocko, linux-mm, linux-kernel, akpm, adobriyan, hch,
	longman, sfr, mst, cai, Naoya Horiguchi, Junichi Nomura

On Mon, Nov 11, 2019 at 12:01 AM Toshiki Fukasawa
<t-fukasawa@vx.jp.nec.com> wrote:
>
> On 2019/11/08 18:18, Michal Hocko wrote:
> > On Fri 08-11-19 00:08:03, Toshiki Fukasawa wrote:
> >> This patch set tries to make pfn walker support ZONE_DEVICE.
> >> This idea is from the TODO in below patch:
> >>
> >>    commit aad5f69bc161af489dbb5934868bd347282f0764
> >>    Author: David Hildenbrand <david@redhat.com>
> >>    Date:   Fri Oct 18 20:19:20 2019 -0700
> >>
> >>      fs/proc/page.c: don't access uninitialized memmaps in fs/proc/page.c
> >>
> >> pfn walker's ZONE_DEVICE support requires capability to identify
> >> that a memmap has been initialized. The uninitialized cases are
> >> as follows:
> >>
> >>      a) pages reserved for ZONE_DEVICE driver
> >>      b) pages currently initializing
> >>
> >> This patch set solves both of them.
> >
> > Why do we want this? What is the usecase?
>
> We are writing a test program for hwpoison, which is a use case.
> Without this patch, we can't see the HWPOISON flag on the
> ZONE_DEVICE page.

I'm not sure that's a goal that's a worthwhile goal. That hwpoison
flag has specific meaning for the System RAM case where the page is
going to be marked offline. For the pmem case the nvdimm core tracks
'badblocks'. I did attempt to use PageHWPoison to track which pmem
pages had been marked UC to prevent speculative consumption, but that
implementation has been found to collide with lookup_memtype() so I'm
looking ot replace it with something that consults with the pmem
driver.


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 2/3] mm: Introduce subsection_dev_map
  2019-11-08 19:13   ` Dan Williams
@ 2019-11-13 18:51     ` David Hildenbrand
  2019-11-13 19:06       ` Dan Williams
  0 siblings, 1 reply; 22+ messages in thread
From: David Hildenbrand @ 2019-11-13 18:51 UTC (permalink / raw)
  To: Dan Williams, Toshiki Fukasawa
  Cc: linux-mm, linux-kernel, akpm, mhocko, adobriyan, hch, longman,
	sfr, mst, cai, Naoya Horiguchi, Junichi Nomura

On 08.11.19 20:13, Dan Williams wrote:
> On Thu, Nov 7, 2019 at 4:15 PM Toshiki Fukasawa
> <t-fukasawa@vx.jp.nec.com> wrote:
>>
>> Currently, there is no way to identify pfn on ZONE_DEVICE.
>> Identifying pfn on system memory can be done by using a
>> section-level flag. On the other hand, identifying pfn on
>> ZONE_DEVICE requires a subsection-level flag since ZONE_DEVICE
>> can be created in units of subsections.
>>
>> This patch introduces a new bitmap subsection_dev_map so that
>> we can identify pfn on ZONE_DEVICE.
>>
>> Also, subsection_dev_map is used to prove that struct pages
>> included in the subsection have been initialized since it is
>> set after memmap_init_zone_device(). We can avoid accessing
>> pages currently being initialized by checking subsection_dev_map.
>>
>> Signed-off-by: Toshiki Fukasawa <t-fukasawa@vx.jp.nec.com>
>> ---
>>   include/linux/mmzone.h | 19 +++++++++++++++++++
>>   mm/memremap.c          |  2 ++
>>   mm/sparse.c            | 32 ++++++++++++++++++++++++++++++++
>>   3 files changed, 53 insertions(+)
>>
>> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
>> index bda2028..11376c4 100644
>> --- a/include/linux/mmzone.h
>> +++ b/include/linux/mmzone.h
>> @@ -1174,11 +1174,17 @@ static inline unsigned long section_nr_to_pfn(unsigned long sec)
>>
>>   struct mem_section_usage {
>>          DECLARE_BITMAP(subsection_map, SUBSECTIONS_PER_SECTION);
>> +#ifdef CONFIG_ZONE_DEVICE
>> +       DECLARE_BITMAP(subsection_dev_map, SUBSECTIONS_PER_SECTION);
>> +#endif
> 
> Hi Toshiki,
> 
> There is currently an effort to remove the PageReserved() flag as some
> code is using that to detect ZONE_DEVICE. In reviewing those patches
> we realized that what many code paths want is to detect online memory.
> So instead of a subsection_dev_map add a subsection_online_map. That
> way pfn_to_online_page() can reliably avoid ZONE_DEVICE ranges. I
> otherwise question the use case for pfn_walkers to return pages for
> ZONE_DEVICE pages, I think the skip behavior when pfn_to_online_page()
> == false is the right behavior.

To be more precise, I recommended an subsection_active_map, to indicate 
which memmaps were fully initialized and can safely be touched (e.g., to 
read the zone/nid). This map would also be set when the devmem memmaps 
were initialized (race between adding memory/growing the section and 
initializing the memmap).

See

https://lkml.org/lkml/2019/10/10/87

and

https://www.spinics.net/lists/linux-driver-devel/msg130012.html

I dislike a map that is specific to ZONE_DEVICE or (currently) 
!ZONE_DEVICE. I rather want an indication "this memmap is safe to 
touch". As discussed along the mentioned threads, we can combine this 
later with RCU to handle some races that are currently possible.

-- 

Thanks,

David / dhildenb



^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 2/3] mm: Introduce subsection_dev_map
  2019-11-13 18:51     ` David Hildenbrand
@ 2019-11-13 19:06       ` Dan Williams
  2019-11-13 19:53         ` David Hildenbrand
  0 siblings, 1 reply; 22+ messages in thread
From: Dan Williams @ 2019-11-13 19:06 UTC (permalink / raw)
  To: David Hildenbrand
  Cc: Toshiki Fukasawa, linux-mm, linux-kernel, akpm, mhocko,
	adobriyan, hch, longman, sfr, mst, cai, Naoya Horiguchi,
	Junichi Nomura

On Wed, Nov 13, 2019 at 10:51 AM David Hildenbrand <david@redhat.com> wrote:
>
> On 08.11.19 20:13, Dan Williams wrote:
> > On Thu, Nov 7, 2019 at 4:15 PM Toshiki Fukasawa
> > <t-fukasawa@vx.jp.nec.com> wrote:
> >>
> >> Currently, there is no way to identify pfn on ZONE_DEVICE.
> >> Identifying pfn on system memory can be done by using a
> >> section-level flag. On the other hand, identifying pfn on
> >> ZONE_DEVICE requires a subsection-level flag since ZONE_DEVICE
> >> can be created in units of subsections.
> >>
> >> This patch introduces a new bitmap subsection_dev_map so that
> >> we can identify pfn on ZONE_DEVICE.
> >>
> >> Also, subsection_dev_map is used to prove that struct pages
> >> included in the subsection have been initialized since it is
> >> set after memmap_init_zone_device(). We can avoid accessing
> >> pages currently being initialized by checking subsection_dev_map.
> >>
> >> Signed-off-by: Toshiki Fukasawa <t-fukasawa@vx.jp.nec.com>
> >> ---
> >>   include/linux/mmzone.h | 19 +++++++++++++++++++
> >>   mm/memremap.c          |  2 ++
> >>   mm/sparse.c            | 32 ++++++++++++++++++++++++++++++++
> >>   3 files changed, 53 insertions(+)
> >>
> >> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> >> index bda2028..11376c4 100644
> >> --- a/include/linux/mmzone.h
> >> +++ b/include/linux/mmzone.h
> >> @@ -1174,11 +1174,17 @@ static inline unsigned long section_nr_to_pfn(unsigned long sec)
> >>
> >>   struct mem_section_usage {
> >>          DECLARE_BITMAP(subsection_map, SUBSECTIONS_PER_SECTION);
> >> +#ifdef CONFIG_ZONE_DEVICE
> >> +       DECLARE_BITMAP(subsection_dev_map, SUBSECTIONS_PER_SECTION);
> >> +#endif
> >
> > Hi Toshiki,
> >
> > There is currently an effort to remove the PageReserved() flag as some
> > code is using that to detect ZONE_DEVICE. In reviewing those patches
> > we realized that what many code paths want is to detect online memory.
> > So instead of a subsection_dev_map add a subsection_online_map. That
> > way pfn_to_online_page() can reliably avoid ZONE_DEVICE ranges. I
> > otherwise question the use case for pfn_walkers to return pages for
> > ZONE_DEVICE pages, I think the skip behavior when pfn_to_online_page()
> > == false is the right behavior.
>
> To be more precise, I recommended an subsection_active_map, to indicate
> which memmaps were fully initialized and can safely be touched (e.g., to
> read the zone/nid). This map would also be set when the devmem memmaps
> were initialized (race between adding memory/growing the section and
> initializing the memmap).
>
> See
>
> https://lkml.org/lkml/2019/10/10/87
>
> and
>
> https://www.spinics.net/lists/linux-driver-devel/msg130012.html

I'm still struggling to understand the motivation of distinguishing
"active" as something distinct from "online". As long as the "online"
granularity is improved from sections down to subsections then most
code paths are good to go. The others can use get_devpagemap() to
check for ZONE_DEVICE in a race free manner as they currently do.

> I dislike a map that is specific to ZONE_DEVICE or (currently)
> !ZONE_DEVICE. I rather want an indication "this memmap is safe to
> touch". As discussed along the mentioned threads, we can combine this
> later with RCU to handle some races that are currently possible.

The rcu protection is independent of the pfn_active vs pfn_online
distinction afaics.


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 2/3] mm: Introduce subsection_dev_map
  2019-11-13 19:06       ` Dan Williams
@ 2019-11-13 19:53         ` David Hildenbrand
  2019-11-13 20:10           ` Dan Williams
  0 siblings, 1 reply; 22+ messages in thread
From: David Hildenbrand @ 2019-11-13 19:53 UTC (permalink / raw)
  To: Dan Williams
  Cc: David Hildenbrand, Toshiki Fukasawa, linux-mm, linux-kernel,
	akpm, mhocko, adobriyan, hch, longman, sfr, mst, cai,
	Naoya Horiguchi, Junichi Nomura



> Am 13.11.2019 um 20:06 schrieb Dan Williams <dan.j.williams@intel.com>:
> 
> On Wed, Nov 13, 2019 at 10:51 AM David Hildenbrand <david@redhat.com> wrote:
>> 
>>> On 08.11.19 20:13, Dan Williams wrote:
>>> On Thu, Nov 7, 2019 at 4:15 PM Toshiki Fukasawa
>>> <t-fukasawa@vx.jp.nec.com> wrote:
>>>> 
>>>> Currently, there is no way to identify pfn on ZONE_DEVICE.
>>>> Identifying pfn on system memory can be done by using a
>>>> section-level flag. On the other hand, identifying pfn on
>>>> ZONE_DEVICE requires a subsection-level flag since ZONE_DEVICE
>>>> can be created in units of subsections.
>>>> 
>>>> This patch introduces a new bitmap subsection_dev_map so that
>>>> we can identify pfn on ZONE_DEVICE.
>>>> 
>>>> Also, subsection_dev_map is used to prove that struct pages
>>>> included in the subsection have been initialized since it is
>>>> set after memmap_init_zone_device(). We can avoid accessing
>>>> pages currently being initialized by checking subsection_dev_map.
>>>> 
>>>> Signed-off-by: Toshiki Fukasawa <t-fukasawa@vx.jp.nec.com>
>>>> ---
>>>>  include/linux/mmzone.h | 19 +++++++++++++++++++
>>>>  mm/memremap.c          |  2 ++
>>>>  mm/sparse.c            | 32 ++++++++++++++++++++++++++++++++
>>>>  3 files changed, 53 insertions(+)
>>>> 
>>>> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
>>>> index bda2028..11376c4 100644
>>>> --- a/include/linux/mmzone.h
>>>> +++ b/include/linux/mmzone.h
>>>> @@ -1174,11 +1174,17 @@ static inline unsigned long section_nr_to_pfn(unsigned long sec)
>>>> 
>>>>  struct mem_section_usage {
>>>>         DECLARE_BITMAP(subsection_map, SUBSECTIONS_PER_SECTION);
>>>> +#ifdef CONFIG_ZONE_DEVICE
>>>> +       DECLARE_BITMAP(subsection_dev_map, SUBSECTIONS_PER_SECTION);
>>>> +#endif
>>> 
>>> Hi Toshiki,
>>> 
>>> There is currently an effort to remove the PageReserved() flag as some
>>> code is using that to detect ZONE_DEVICE. In reviewing those patches
>>> we realized that what many code paths want is to detect online memory.
>>> So instead of a subsection_dev_map add a subsection_online_map. That
>>> way pfn_to_online_page() can reliably avoid ZONE_DEVICE ranges. I
>>> otherwise question the use case for pfn_walkers to return pages for
>>> ZONE_DEVICE pages, I think the skip behavior when pfn_to_online_page()
>>> == false is the right behavior.
>> 
>> To be more precise, I recommended an subsection_active_map, to indicate
>> which memmaps were fully initialized and can safely be touched (e.g., to
>> read the zone/nid). This map would also be set when the devmem memmaps
>> were initialized (race between adding memory/growing the section and
>> initializing the memmap).
>> 
>> See
>> 
>> https://lkml.org/lkml/2019/10/10/87
>> 
>> and
>> 
>> https://www.spinics.net/lists/linux-driver-devel/msg130012.html
> 
> I'm still struggling to understand the motivation of distinguishing
> "active" as something distinct from "online". As long as the "online"
> granularity is improved from sections down to subsections then most
> code paths are good to go. The others can use get_devpagemap() to
> check for ZONE_DEVICE in a race free manner as they currently do.

I thought we wanted to unify access if we don’t really care about the zone as in most pfn walkers - E.g., for zone shrinking. Anyhow, a subsection online map would be a good start, we can reuse that later for ZONE_DEVICE as well.

> 
>> I dislike a map that is specific to ZONE_DEVICE or (currently)
>> !ZONE_DEVICE. I rather want an indication "this memmap is safe to
>> touch". As discussed along the mentioned threads, we can combine this
>> later with RCU to handle some races that are currently possible.
> 
> The rcu protection is independent of the pfn_active vs pfn_online
> distinction afaics.

It’s one part of the bigger picture IMHO.

> 



^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 2/3] mm: Introduce subsection_dev_map
  2019-11-13 19:53         ` David Hildenbrand
@ 2019-11-13 20:10           ` Dan Williams
  2019-11-13 20:23             ` David Hildenbrand
  0 siblings, 1 reply; 22+ messages in thread
From: Dan Williams @ 2019-11-13 20:10 UTC (permalink / raw)
  To: David Hildenbrand
  Cc: Toshiki Fukasawa, linux-mm, linux-kernel, akpm, mhocko,
	adobriyan, hch, longman, sfr, mst, cai, Naoya Horiguchi,
	Junichi Nomura

On Wed, Nov 13, 2019 at 11:53 AM David Hildenbrand <david@redhat.com> wrote:
>
>
>
> > Am 13.11.2019 um 20:06 schrieb Dan Williams <dan.j.williams@intel.com>:
> >
> > On Wed, Nov 13, 2019 at 10:51 AM David Hildenbrand <david@redhat.com> wrote:
> >>
> >>> On 08.11.19 20:13, Dan Williams wrote:
> >>> On Thu, Nov 7, 2019 at 4:15 PM Toshiki Fukasawa
> >>> <t-fukasawa@vx.jp.nec.com> wrote:
> >>>>
> >>>> Currently, there is no way to identify pfn on ZONE_DEVICE.
> >>>> Identifying pfn on system memory can be done by using a
> >>>> section-level flag. On the other hand, identifying pfn on
> >>>> ZONE_DEVICE requires a subsection-level flag since ZONE_DEVICE
> >>>> can be created in units of subsections.
> >>>>
> >>>> This patch introduces a new bitmap subsection_dev_map so that
> >>>> we can identify pfn on ZONE_DEVICE.
> >>>>
> >>>> Also, subsection_dev_map is used to prove that struct pages
> >>>> included in the subsection have been initialized since it is
> >>>> set after memmap_init_zone_device(). We can avoid accessing
> >>>> pages currently being initialized by checking subsection_dev_map.
> >>>>
> >>>> Signed-off-by: Toshiki Fukasawa <t-fukasawa@vx.jp.nec.com>
> >>>> ---
> >>>>  include/linux/mmzone.h | 19 +++++++++++++++++++
> >>>>  mm/memremap.c          |  2 ++
> >>>>  mm/sparse.c            | 32 ++++++++++++++++++++++++++++++++
> >>>>  3 files changed, 53 insertions(+)
> >>>>
> >>>> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> >>>> index bda2028..11376c4 100644
> >>>> --- a/include/linux/mmzone.h
> >>>> +++ b/include/linux/mmzone.h
> >>>> @@ -1174,11 +1174,17 @@ static inline unsigned long section_nr_to_pfn(unsigned long sec)
> >>>>
> >>>>  struct mem_section_usage {
> >>>>         DECLARE_BITMAP(subsection_map, SUBSECTIONS_PER_SECTION);
> >>>> +#ifdef CONFIG_ZONE_DEVICE
> >>>> +       DECLARE_BITMAP(subsection_dev_map, SUBSECTIONS_PER_SECTION);
> >>>> +#endif
> >>>
> >>> Hi Toshiki,
> >>>
> >>> There is currently an effort to remove the PageReserved() flag as some
> >>> code is using that to detect ZONE_DEVICE. In reviewing those patches
> >>> we realized that what many code paths want is to detect online memory.
> >>> So instead of a subsection_dev_map add a subsection_online_map. That
> >>> way pfn_to_online_page() can reliably avoid ZONE_DEVICE ranges. I
> >>> otherwise question the use case for pfn_walkers to return pages for
> >>> ZONE_DEVICE pages, I think the skip behavior when pfn_to_online_page()
> >>> == false is the right behavior.
> >>
> >> To be more precise, I recommended an subsection_active_map, to indicate
> >> which memmaps were fully initialized and can safely be touched (e.g., to
> >> read the zone/nid). This map would also be set when the devmem memmaps
> >> were initialized (race between adding memory/growing the section and
> >> initializing the memmap).
> >>
> >> See
> >>
> >> https://lkml.org/lkml/2019/10/10/87
> >>
> >> and
> >>
> >> https://www.spinics.net/lists/linux-driver-devel/msg130012.html
> >
> > I'm still struggling to understand the motivation of distinguishing
> > "active" as something distinct from "online". As long as the "online"
> > granularity is improved from sections down to subsections then most
> > code paths are good to go. The others can use get_devpagemap() to
> > check for ZONE_DEVICE in a race free manner as they currently do.
>
> I thought we wanted to unify access if we don’t really care about the zone as in most pfn walkers - E.g., for zone shrinking.

Agree, when the zone does not matter, which is most cases, then
pfn_online() and pfn_valid() are sufficient.

> Anyhow, a subsection online map would be a good start, we can reuse that later for ZONE_DEVICE as well.

Cool, good to go with me sending a patch to introduce pfn_online() and
a corresponding subsection_map for the same?


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 2/3] mm: Introduce subsection_dev_map
  2019-11-13 20:10           ` Dan Williams
@ 2019-11-13 20:23             ` David Hildenbrand
  2019-11-13 20:40               ` David Hildenbrand
  0 siblings, 1 reply; 22+ messages in thread
From: David Hildenbrand @ 2019-11-13 20:23 UTC (permalink / raw)
  To: Dan Williams
  Cc: David Hildenbrand, Toshiki Fukasawa, linux-mm, linux-kernel,
	akpm, mhocko, adobriyan, hch, longman, sfr, mst, cai,
	Naoya Horiguchi, Junichi Nomura



> Am 13.11.2019 um 21:10 schrieb Dan Williams <dan.j.williams@intel.com>:
> 
> On Wed, Nov 13, 2019 at 11:53 AM David Hildenbrand <david@redhat.com> wrote:
>> 
>> 
>> 
>>>> Am 13.11.2019 um 20:06 schrieb Dan Williams <dan.j.williams@intel.com>:
>>> 
>>> On Wed, Nov 13, 2019 at 10:51 AM David Hildenbrand <david@redhat.com> wrote:
>>>> 
>>>>> On 08.11.19 20:13, Dan Williams wrote:
>>>>> On Thu, Nov 7, 2019 at 4:15 PM Toshiki Fukasawa
>>>>> <t-fukasawa@vx.jp.nec.com> wrote:
>>>>>> 
>>>>>> Currently, there is no way to identify pfn on ZONE_DEVICE.
>>>>>> Identifying pfn on system memory can be done by using a
>>>>>> section-level flag. On the other hand, identifying pfn on
>>>>>> ZONE_DEVICE requires a subsection-level flag since ZONE_DEVICE
>>>>>> can be created in units of subsections.
>>>>>> 
>>>>>> This patch introduces a new bitmap subsection_dev_map so that
>>>>>> we can identify pfn on ZONE_DEVICE.
>>>>>> 
>>>>>> Also, subsection_dev_map is used to prove that struct pages
>>>>>> included in the subsection have been initialized since it is
>>>>>> set after memmap_init_zone_device(). We can avoid accessing
>>>>>> pages currently being initialized by checking subsection_dev_map.
>>>>>> 
>>>>>> Signed-off-by: Toshiki Fukasawa <t-fukasawa@vx.jp.nec.com>
>>>>>> ---
>>>>>> include/linux/mmzone.h | 19 +++++++++++++++++++
>>>>>> mm/memremap.c          |  2 ++
>>>>>> mm/sparse.c            | 32 ++++++++++++++++++++++++++++++++
>>>>>> 3 files changed, 53 insertions(+)
>>>>>> 
>>>>>> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
>>>>>> index bda2028..11376c4 100644
>>>>>> --- a/include/linux/mmzone.h
>>>>>> +++ b/include/linux/mmzone.h
>>>>>> @@ -1174,11 +1174,17 @@ static inline unsigned long section_nr_to_pfn(unsigned long sec)
>>>>>> 
>>>>>> struct mem_section_usage {
>>>>>>        DECLARE_BITMAP(subsection_map, SUBSECTIONS_PER_SECTION);
>>>>>> +#ifdef CONFIG_ZONE_DEVICE
>>>>>> +       DECLARE_BITMAP(subsection_dev_map, SUBSECTIONS_PER_SECTION);
>>>>>> +#endif
>>>>> 
>>>>> Hi Toshiki,
>>>>> 
>>>>> There is currently an effort to remove the PageReserved() flag as some
>>>>> code is using that to detect ZONE_DEVICE. In reviewing those patches
>>>>> we realized that what many code paths want is to detect online memory.
>>>>> So instead of a subsection_dev_map add a subsection_online_map. That
>>>>> way pfn_to_online_page() can reliably avoid ZONE_DEVICE ranges. I
>>>>> otherwise question the use case for pfn_walkers to return pages for
>>>>> ZONE_DEVICE pages, I think the skip behavior when pfn_to_online_page()
>>>>> == false is the right behavior.
>>>> 
>>>> To be more precise, I recommended an subsection_active_map, to indicate
>>>> which memmaps were fully initialized and can safely be touched (e.g., to
>>>> read the zone/nid). This map would also be set when the devmem memmaps
>>>> were initialized (race between adding memory/growing the section and
>>>> initializing the memmap).
>>>> 
>>>> See
>>>> 
>>>> https://lkml.org/lkml/2019/10/10/87
>>>> 
>>>> and
>>>> 
>>>> https://www.spinics.net/lists/linux-driver-devel/msg130012.html
>>> 
>>> I'm still struggling to understand the motivation of distinguishing
>>> "active" as something distinct from "online". As long as the "online"
>>> granularity is improved from sections down to subsections then most
>>> code paths are good to go. The others can use get_devpagemap() to
>>> check for ZONE_DEVICE in a race free manner as they currently do.
>> 
>> I thought we wanted to unify access if we don’t really care about the zone as in most pfn walkers - E.g., for zone shrinking.
> 
> Agree, when the zone does not matter, which is most cases, then
> pfn_online() and pfn_valid() are sufficient.
> 
>> Anyhow, a subsection online map would be a good start, we can reuse that later for ZONE_DEVICE as well.
> 
> Cool, good to go with me sending a patch to introduce pfn_online() and
> a corresponding subsection_map for the same?

Yeah, let‘s see how this turns out and if we‘re on the same page. Thanks!

> 



^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 2/3] mm: Introduce subsection_dev_map
  2019-11-13 20:23             ` David Hildenbrand
@ 2019-11-13 20:40               ` David Hildenbrand
  2019-11-13 21:11                 ` Dan Williams
  0 siblings, 1 reply; 22+ messages in thread
From: David Hildenbrand @ 2019-11-13 20:40 UTC (permalink / raw)
  To: Dan Williams
  Cc: Toshiki Fukasawa, linux-mm, linux-kernel, akpm, mhocko,
	adobriyan, hch, longman, sfr, mst, cai, Naoya Horiguchi,
	Junichi Nomura



> Am 13.11.2019 um 21:23 schrieb David Hildenbrand <david@redhat.com>:
> 
> 
> 
>>> Am 13.11.2019 um 21:10 schrieb Dan Williams <dan.j.williams@intel.com>:
>>> 
>>> On Wed, Nov 13, 2019 at 11:53 AM David Hildenbrand <david@redhat.com> wrote:
>>> 
>>> 
>>> 
>>>>> Am 13.11.2019 um 20:06 schrieb Dan Williams <dan.j.williams@intel.com>:
>>>> 
>>>> On Wed, Nov 13, 2019 at 10:51 AM David Hildenbrand <david@redhat.com> wrote:
>>>>> 
>>>>>> On 08.11.19 20:13, Dan Williams wrote:
>>>>>> On Thu, Nov 7, 2019 at 4:15 PM Toshiki Fukasawa
>>>>>> <t-fukasawa@vx.jp.nec.com> wrote:
>>>>>>> 
>>>>>>> Currently, there is no way to identify pfn on ZONE_DEVICE.
>>>>>>> Identifying pfn on system memory can be done by using a
>>>>>>> section-level flag. On the other hand, identifying pfn on
>>>>>>> ZONE_DEVICE requires a subsection-level flag since ZONE_DEVICE
>>>>>>> can be created in units of subsections.
>>>>>>> 
>>>>>>> This patch introduces a new bitmap subsection_dev_map so that
>>>>>>> we can identify pfn on ZONE_DEVICE.
>>>>>>> 
>>>>>>> Also, subsection_dev_map is used to prove that struct pages
>>>>>>> included in the subsection have been initialized since it is
>>>>>>> set after memmap_init_zone_device(). We can avoid accessing
>>>>>>> pages currently being initialized by checking subsection_dev_map.
>>>>>>> 
>>>>>>> Signed-off-by: Toshiki Fukasawa <t-fukasawa@vx.jp.nec.com>
>>>>>>> ---
>>>>>>> include/linux/mmzone.h | 19 +++++++++++++++++++
>>>>>>> mm/memremap.c          |  2 ++
>>>>>>> mm/sparse.c            | 32 ++++++++++++++++++++++++++++++++
>>>>>>> 3 files changed, 53 insertions(+)
>>>>>>> 
>>>>>>> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
>>>>>>> index bda2028..11376c4 100644
>>>>>>> --- a/include/linux/mmzone.h
>>>>>>> +++ b/include/linux/mmzone.h
>>>>>>> @@ -1174,11 +1174,17 @@ static inline unsigned long section_nr_to_pfn(unsigned long sec)
>>>>>>> 
>>>>>>> struct mem_section_usage {
>>>>>>>       DECLARE_BITMAP(subsection_map, SUBSECTIONS_PER_SECTION);
>>>>>>> +#ifdef CONFIG_ZONE_DEVICE
>>>>>>> +       DECLARE_BITMAP(subsection_dev_map, SUBSECTIONS_PER_SECTION);
>>>>>>> +#endif
>>>>>> 
>>>>>> Hi Toshiki,
>>>>>> 
>>>>>> There is currently an effort to remove the PageReserved() flag as some
>>>>>> code is using that to detect ZONE_DEVICE. In reviewing those patches
>>>>>> we realized that what many code paths want is to detect online memory.
>>>>>> So instead of a subsection_dev_map add a subsection_online_map. That
>>>>>> way pfn_to_online_page() can reliably avoid ZONE_DEVICE ranges. I
>>>>>> otherwise question the use case for pfn_walkers to return pages for
>>>>>> ZONE_DEVICE pages, I think the skip behavior when pfn_to_online_page()
>>>>>> == false is the right behavior.
>>>>> 
>>>>> To be more precise, I recommended an subsection_active_map, to indicate
>>>>> which memmaps were fully initialized and can safely be touched (e.g., to
>>>>> read the zone/nid). This map would also be set when the devmem memmaps
>>>>> were initialized (race between adding memory/growing the section and
>>>>> initializing the memmap).
>>>>> 
>>>>> See
>>>>> 
>>>>> https://lkml.org/lkml/2019/10/10/87
>>>>> 
>>>>> and
>>>>> 
>>>>> https://www.spinics.net/lists/linux-driver-devel/msg130012.html
>>>> 
>>>> I'm still struggling to understand the motivation of distinguishing
>>>> "active" as something distinct from "online". As long as the "online"
>>>> granularity is improved from sections down to subsections then most
>>>> code paths are good to go. The others can use get_devpagemap() to
>>>> check for ZONE_DEVICE in a race free manner as they currently do.
>>> 
>>> I thought we wanted to unify access if we don’t really care about the zone as in most pfn walkers - E.g., for zone shrinking.
>> 
>> Agree, when the zone does not matter, which is most cases, then
>> pfn_online() and pfn_valid() are sufficient.

Oh, and just to clarify why I proposed pfn_active(): The issue right now is that a PFN that is valid but not online could be offline memory (memmap not initialized) or ZONE_DEVICE. That‘s why I wanted to have a way to detect if a memmap was initialized, independent of the zone. That‘s important for generic PFN walkers.

>> 
>>> Anyhow, a subsection online map would be a good start, we can reuse that later for ZONE_DEVICE as well.
>> 
>> Cool, good to go with me sending a patch to introduce pfn_online() and
>> a corresponding subsection_map for the same?
> 
> Yeah, let‘s see how this turns out and if we‘re on the same page. Thanks!
> 
>> 
> 



^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 2/3] mm: Introduce subsection_dev_map
  2019-11-13 20:40               ` David Hildenbrand
@ 2019-11-13 21:11                 ` Dan Williams
  2019-11-13 21:22                   ` David Hildenbrand
  0 siblings, 1 reply; 22+ messages in thread
From: Dan Williams @ 2019-11-13 21:11 UTC (permalink / raw)
  To: David Hildenbrand
  Cc: Toshiki Fukasawa, linux-mm, linux-kernel, akpm, mhocko,
	adobriyan, hch, longman, sfr, mst, cai, Naoya Horiguchi,
	Junichi Nomura

On Wed, Nov 13, 2019 at 12:40 PM David Hildenbrand <david@redhat.com> wrote:
[..]
> >>>> I'm still struggling to understand the motivation of distinguishing
> >>>> "active" as something distinct from "online". As long as the "online"
> >>>> granularity is improved from sections down to subsections then most
> >>>> code paths are good to go. The others can use get_devpagemap() to
> >>>> check for ZONE_DEVICE in a race free manner as they currently do.
> >>>
> >>> I thought we wanted to unify access if we don’t really care about the zone as in most pfn walkers - E.g., for zone shrinking.
> >>
> >> Agree, when the zone does not matter, which is most cases, then
> >> pfn_online() and pfn_valid() are sufficient.
>
> Oh, and just to clarify why I proposed pfn_active(): The issue right now is that a PFN that is valid but not online could be offline memory (memmap not initialized) or ZONE_DEVICE. That‘s why I wanted to have a way to detect if a memmap was initialized, independent of the zone. That‘s important for generic PFN walkers.

That's what I was debating with Toshiki [1], whether there is a real
example of needing to distinguish ZONE_DEVICE from offline memory in a
pfn walker. The proposed use case in this patch set of being able to
set hwpoison on ZONE_DEVICE pages does not seem like a good idea to
me. My suspicion is that this is a common theme and others are looking
to do these types page manipulations that only make sense for online
memory. If that is the case then treating ZONE_DEVICE as offline seems
the right direction.

[1]: https://lore.kernel.org/linux-mm/CAPcyv4joVDwiL21PPyJ7E_mMFR2SL3QTi09VMtfxb_W+-1p8vQ@mail.gmail.com/


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 2/3] mm: Introduce subsection_dev_map
  2019-11-13 21:11                 ` Dan Williams
@ 2019-11-13 21:22                   ` David Hildenbrand
  2019-11-13 21:26                     ` Dan Williams
  0 siblings, 1 reply; 22+ messages in thread
From: David Hildenbrand @ 2019-11-13 21:22 UTC (permalink / raw)
  To: Dan Williams
  Cc: David Hildenbrand, Toshiki Fukasawa, linux-mm, linux-kernel,
	akpm, mhocko, adobriyan, hch, longman, sfr, mst, cai,
	Naoya Horiguchi, Junichi Nomura, Oscar Salvador



> Am 13.11.2019 um 22:12 schrieb Dan Williams <dan.j.williams@intel.com>:
> 
> On Wed, Nov 13, 2019 at 12:40 PM David Hildenbrand <david@redhat.com> wrote:
> [..]
>>>>>> I'm still struggling to understand the motivation of distinguishing
>>>>>> "active" as something distinct from "online". As long as the "online"
>>>>>> granularity is improved from sections down to subsections then most
>>>>>> code paths are good to go. The others can use get_devpagemap() to
>>>>>> check for ZONE_DEVICE in a race free manner as they currently do.
>>>>> 
>>>>> I thought we wanted to unify access if we don’t really care about the zone as in most pfn walkers - E.g., for zone shrinking.
>>>> 
>>>> Agree, when the zone does not matter, which is most cases, then
>>>> pfn_online() and pfn_valid() are sufficient.
>> 
>> Oh, and just to clarify why I proposed pfn_active(): The issue right now is that a PFN that is valid but not online could be offline memory (memmap not initialized) or ZONE_DEVICE. That‘s why I wanted to have a way to detect if a memmap was initialized, independent of the zone. That‘s important for generic PFN walkers.
> 
> That's what I was debating with Toshiki [1], whether there is a real
> example of needing to distinguish ZONE_DEVICE from offline memory in a
> pfn walker. The proposed use case in this patch set of being able to
> set hwpoison on ZONE_DEVICE pages does not seem like a good idea to
> me. My suspicion is that this is a common theme and others are looking
> to do these types page manipulations that only make sense for online
> memory. If that is the case then treating ZONE_DEVICE as offline seems
> the right direction.

Right. At least it would be nice to have for zone shrinking - not sure about the other walkers. We would have to special-case ZONE_DEVICE handling there.

But as I said, a subsection map for online memory is a good start, especially to fix pfn_to_online_page(). Also, I think this might be a very good thing to have for Oscars memmap-on-memory work (I have a plan in my head I can discuss with Oscar once he has time to work on that again).


> 
> [1]: https://lore.kernel.org/linux-mm/CAPcyv4joVDwiL21PPyJ7E_mMFR2SL3QTi09VMtfxb_W+-1p8vQ@mail.gmail.com/
> 



^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 2/3] mm: Introduce subsection_dev_map
  2019-11-13 21:22                   ` David Hildenbrand
@ 2019-11-13 21:26                     ` Dan Williams
  2019-11-14 23:36                       ` Toshiki Fukasawa
  0 siblings, 1 reply; 22+ messages in thread
From: Dan Williams @ 2019-11-13 21:26 UTC (permalink / raw)
  To: David Hildenbrand
  Cc: Toshiki Fukasawa, linux-mm, linux-kernel, akpm, mhocko,
	adobriyan, hch, longman, sfr, mst, cai, Naoya Horiguchi,
	Junichi Nomura, Oscar Salvador

On Wed, Nov 13, 2019 at 1:22 PM David Hildenbrand <david@redhat.com> wrote:
>
>
>
> > Am 13.11.2019 um 22:12 schrieb Dan Williams <dan.j.williams@intel.com>:
> >
> > On Wed, Nov 13, 2019 at 12:40 PM David Hildenbrand <david@redhat.com> wrote:
> > [..]
> >>>>>> I'm still struggling to understand the motivation of distinguishing
> >>>>>> "active" as something distinct from "online". As long as the "online"
> >>>>>> granularity is improved from sections down to subsections then most
> >>>>>> code paths are good to go. The others can use get_devpagemap() to
> >>>>>> check for ZONE_DEVICE in a race free manner as they currently do.
> >>>>>
> >>>>> I thought we wanted to unify access if we don’t really care about the zone as in most pfn walkers - E.g., for zone shrinking.
> >>>>
> >>>> Agree, when the zone does not matter, which is most cases, then
> >>>> pfn_online() and pfn_valid() are sufficient.
> >>
> >> Oh, and just to clarify why I proposed pfn_active(): The issue right now is that a PFN that is valid but not online could be offline memory (memmap not initialized) or ZONE_DEVICE. That‘s why I wanted to have a way to detect if a memmap was initialized, independent of the zone. That‘s important for generic PFN walkers.
> >
> > That's what I was debating with Toshiki [1], whether there is a real
> > example of needing to distinguish ZONE_DEVICE from offline memory in a
> > pfn walker. The proposed use case in this patch set of being able to
> > set hwpoison on ZONE_DEVICE pages does not seem like a good idea to
> > me. My suspicion is that this is a common theme and others are looking
> > to do these types page manipulations that only make sense for online
> > memory. If that is the case then treating ZONE_DEVICE as offline seems
> > the right direction.
>
> Right. At least it would be nice to have for zone shrinking - not sure about the other walkers. We would have to special-case ZONE_DEVICE handling there.
>

I think that's ok... It's already zone aware code whereas pfn walkers
are zone unaware and should stay that way if at all possible.

> But as I said, a subsection map for online memory is a good start, especially to fix pfn_to_online_page(). Also, I think this might be a very good thing to have for Oscars memmap-on-memory work (I have a plan in my head I can discuss with Oscar once he has time to work on that again).

Ok, I'll keep an eye out.


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 2/3] mm: Introduce subsection_dev_map
  2019-11-13 21:26                     ` Dan Williams
@ 2019-11-14 23:36                       ` Toshiki Fukasawa
  2019-11-15  0:46                         ` David Hildenbrand
  0 siblings, 1 reply; 22+ messages in thread
From: Toshiki Fukasawa @ 2019-11-14 23:36 UTC (permalink / raw)
  To: Dan Williams, David Hildenbrand
  Cc: linux-mm, linux-kernel, akpm, mhocko, adobriyan, hch, longman,
	sfr, mst, cai, Naoya Horiguchi, Junichi Nomura, Oscar Salvador

On 2019/11/14 6:26, Dan Williams wrote:
> On Wed, Nov 13, 2019 at 1:22 PM David Hildenbrand <david@redhat.com> wrote:
>>
>>
>>
>>> Am 13.11.2019 um 22:12 schrieb Dan Williams <dan.j.williams@intel.com>:
>>>
>>> On Wed, Nov 13, 2019 at 12:40 PM David Hildenbrand <david@redhat.com> wrote:
>>> [..]
>>>>>>>> I'm still struggling to understand the motivation of distinguishing
>>>>>>>> "active" as something distinct from "online". As long as the "online"
>>>>>>>> granularity is improved from sections down to subsections then most
>>>>>>>> code paths are good to go. The others can use get_devpagemap() to
>>>>>>>> check for ZONE_DEVICE in a race free manner as they currently do.
>>>>>>>
>>>>>>> I thought we wanted to unify access if we don’t really care about the zone as in most pfn walkers - E.g., for zone shrinking.
>>>>>>
>>>>>> Agree, when the zone does not matter, which is most cases, then
>>>>>> pfn_online() and pfn_valid() are sufficient.
>>>>
>>>> Oh, and just to clarify why I proposed pfn_active(): The issue right now is that a PFN that is valid but not online could be offline memory (memmap not initialized) or ZONE_DEVICE. That‘s why I wanted to have a way to detect if a memmap was initialized, independent of the zone. That‘s important for generic PFN walkers.
>>>
>>> That's what I was debating with Toshiki [1], whether there is a real
>>> example of needing to distinguish ZONE_DEVICE from offline memory in a
>>> pfn walker. The proposed use case in this patch set of being able to
>>> set hwpoison on ZONE_DEVICE pages does not seem like a good idea to
>>> me. My suspicion is that this is a common theme and others are looking
>>> to do these types page manipulations that only make sense for online
>>> memory. If that is the case then treating ZONE_DEVICE as offline seems
>>> the right direction.
>>
>> Right. At least it would be nice to have for zone shrinking - not sure about the other walkers. We would have to special-case ZONE_DEVICE handling there.
>>
> 
> I think that's ok... It's already zone aware code whereas pfn walkers
> are zone unaware and should stay that way if at all possible.
> 
>> But as I said, a subsection map for online memory is a good start, especially to fix pfn_to_online_page(). Also, I think this might be a very good thing to have for Oscars memmap-on-memory work (I have a plan in my head I can discuss with Oscar once he has time to work on that again).
> 
> Ok, I'll keep an eye out.

I understand your point. Thanks!

By the way, I found another problem about ZONE_DEVICE, which
is race between memmap initialization and zone shrinking.

Iteration of create and destroy namespace causes the panic as below:

[   41.207694] kernel BUG at mm/page_alloc.c:535!
[   41.208109] invalid opcode: 0000 [#1] SMP PTI
[   41.208508] CPU: 7 PID: 2766 Comm: ndctl Not tainted 5.4.0-rc4 #6
[   41.209064] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.11.0-0-g63451fca13-prebuilt.qemu-project.org 04/01/2014
[   41.210175] RIP: 0010:set_pfnblock_flags_mask+0x95/0xf0
[   41.210643] Code: 04 41 83 e2 3c 48 8d 04 a8 48 c1 e0 07 48 03 04 dd e0 59 55 bb 48 8b 58 68 48 39 da 73 0e 48 c7 c6 70 ac 11 bb e8 1b b2 fd ff <0f> 0b 48 03 58 78 48 39 da 73 e9 49 01 ca b9 3f 00 00 00 4f 8d 0c
[   41.212354] RSP: 0018:ffffac0d41557c80 EFLAGS: 00010246
[   41.212821] RAX: 000000000000004a RBX: 0000000000244a00 RCX: 0000000000000000
[   41.213459] RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffffffffbb1197dc
[   41.214100] RBP: 000000000000000c R08: 0000000000000439 R09: 0000000000000059
[   41.214736] R10: 0000000000000000 R11: ffffac0d41557b08 R12: ffff8be475ea72b0
[   41.215376] R13: 000000000000fa00 R14: 0000000000250000 R15: 00000000fffc0bb5
[   41.216008] FS:  00007f30862ab600(0000) GS:ffff8be57bc40000(0000) knlGS:0000000000000000
[   41.216771] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   41.217299] CR2: 000055e824d0d508 CR3: 0000000231dac000 CR4: 00000000000006e0
[   41.217934] Call Trace:
[   41.218225]  memmap_init_zone_device+0x165/0x17c
[   41.218642]  memremap_pages+0x4c1/0x540
[   41.218989]  devm_memremap_pages+0x1d/0x60
[   41.219367]  pmem_attach_disk+0x16b/0x600 [nd_pmem]
[   41.219804]  ? devm_nsio_enable+0xb8/0xe0
[   41.220172]  nvdimm_bus_probe+0x69/0x1c0
[   41.220526]  really_probe+0x1c2/0x3e0
[   41.220856]  driver_probe_device+0xb4/0x100
[   41.221238]  device_driver_attach+0x4f/0x60
[   41.221611]  bind_store+0xc9/0x110
[   41.221919]  kernfs_fop_write+0x116/0x190
[   41.222326]  vfs_write+0xa5/0x1a0
[   41.222626]  ksys_write+0x59/0xd0
[   41.222927]  do_syscall_64+0x5b/0x180
[   41.223264]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
[   41.223714] RIP: 0033:0x7f30865d0ed8
[   41.224037] Code: 89 02 48 c7 c0 ff ff ff ff eb b3 0f 1f 80 00 00 00 00 f3 0f 1e fa 48 8d 05 45 78 0d 00 8b 00 85 c0 75 17 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 58 c3 0f 1f 80 00 00 00 00 41 54 49 89 d4 55
[   41.225920] RSP: 002b:00007fffe5d30a78 EFLAGS: 00000246 ORIG_RAX: 0000000000000001
[   41.226608] RAX: ffffffffffffffda RBX: 000055e824d07f40 RCX: 00007f30865d0ed8
[   41.227242] RDX: 0000000000000007 RSI: 000055e824d07f40 RDI: 0000000000000004
[   41.227870] RBP: 0000000000000007 R08: 0000000000000007 R09: 0000000000000006
[   41.228753] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000004
[   41.229419] R13: 00007f30862ab528 R14: 0000000000000001 R15: 000055e824d07f40

While creating a namespace and initializing memmap, if you destroy the namespace
and shrink the zone, it will initialize the memmap outside the zone and
trigger VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page) in
set_pfnblock_flags_mask().

Thanks,
Toshiki Fukasawa

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 2/3] mm: Introduce subsection_dev_map
  2019-11-14 23:36                       ` Toshiki Fukasawa
@ 2019-11-15  0:46                         ` David Hildenbrand
  2019-11-15  2:57                           ` Toshiki Fukasawa
  0 siblings, 1 reply; 22+ messages in thread
From: David Hildenbrand @ 2019-11-15  0:46 UTC (permalink / raw)
  To: Toshiki Fukasawa
  Cc: Dan Williams, David Hildenbrand, linux-mm, linux-kernel, akpm,
	mhocko, adobriyan, hch, longman, sfr, mst, cai, Naoya Horiguchi,
	Junichi Nomura, Oscar Salvador



> Am 15.11.2019 um 00:42 schrieb Toshiki Fukasawa <t-fukasawa@vx.jp.nec.com>:
> 
> On 2019/11/14 6:26, Dan Williams wrote:
>>> On Wed, Nov 13, 2019 at 1:22 PM David Hildenbrand <david@redhat.com> wrote:
>>> 
>>> 
>>> 
>>>> Am 13.11.2019 um 22:12 schrieb Dan Williams <dan.j.williams@intel.com>:
>>>> 
>>>> On Wed, Nov 13, 2019 at 12:40 PM David Hildenbrand <david@redhat.com> wrote:
>>>> [..]
>>>>>>>>> I'm still struggling to understand the motivation of distinguishing
>>>>>>>>> "active" as something distinct from "online". As long as the "online"
>>>>>>>>> granularity is improved from sections down to subsections then most
>>>>>>>>> code paths are good to go. The others can use get_devpagemap() to
>>>>>>>>> check for ZONE_DEVICE in a race free manner as they currently do.
>>>>>>>> 
>>>>>>>> I thought we wanted to unify access if we don’t really care about the zone as in most pfn walkers - E.g., for zone shrinking.
>>>>>>> 
>>>>>>> Agree, when the zone does not matter, which is most cases, then
>>>>>>> pfn_online() and pfn_valid() are sufficient.
>>>>> 
>>>>> Oh, and just to clarify why I proposed pfn_active(): The issue right now is that a PFN that is valid but not online could be offline memory (memmap not initialized) or ZONE_DEVICE. That‘s why I wanted to have a way to detect if a memmap was initialized, independent of the zone. That‘s important for generic PFN walkers.
>>>> 
>>>> That's what I was debating with Toshiki [1], whether there is a real
>>>> example of needing to distinguish ZONE_DEVICE from offline memory in a
>>>> pfn walker. The proposed use case in this patch set of being able to
>>>> set hwpoison on ZONE_DEVICE pages does not seem like a good idea to
>>>> me. My suspicion is that this is a common theme and others are looking
>>>> to do these types page manipulations that only make sense for online
>>>> memory. If that is the case then treating ZONE_DEVICE as offline seems
>>>> the right direction.
>>> 
>>> Right. At least it would be nice to have for zone shrinking - not sure about the other walkers. We would have to special-case ZONE_DEVICE handling there.
>>> 
>> 
>> I think that's ok... It's already zone aware code whereas pfn walkers
>> are zone unaware and should stay that way if at all possible.
>> 
>>> But as I said, a subsection map for online memory is a good start, especially to fix pfn_to_online_page(). Also, I think this might be a very good thing to have for Oscars memmap-on-memory work (I have a plan in my head I can discuss with Oscar once he has time to work on that again).
>> 
>> Ok, I'll keep an eye out.
> 
> I understand your point. Thanks!
> 
> By the way, I found another problem about ZONE_DEVICE, which
> is race between memmap initialization and zone shrinking.
> 
> Iteration of create and destroy namespace causes the panic as below:
> 
> [   41.207694] kernel BUG at mm/page_alloc.c:535!
> [   41.208109] invalid opcode: 0000 [#1] SMP PTI
> [   41.208508] CPU: 7 PID: 2766 Comm: ndctl Not tainted 5.4.0-rc4 #6
> [   41.209064] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.11.0-0-g63451fca13-prebuilt.qemu-project.org 04/01/2014
> [   41.210175] RIP: 0010:set_pfnblock_flags_mask+0x95/0xf0
> [   41.210643] Code: 04 41 83 e2 3c 48 8d 04 a8 48 c1 e0 07 48 03 04 dd e0 59 55 bb 48 8b 58 68 48 39 da 73 0e 48 c7 c6 70 ac 11 bb e8 1b b2 fd ff <0f> 0b 48 03 58 78 48 39 da 73 e9 49 01 ca b9 3f 00 00 00 4f 8d 0c
> [   41.212354] RSP: 0018:ffffac0d41557c80 EFLAGS: 00010246
> [   41.212821] RAX: 000000000000004a RBX: 0000000000244a00 RCX: 0000000000000000
> [   41.213459] RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffffffffbb1197dc
> [   41.214100] RBP: 000000000000000c R08: 0000000000000439 R09: 0000000000000059
> [   41.214736] R10: 0000000000000000 R11: ffffac0d41557b08 R12: ffff8be475ea72b0
> [   41.215376] R13: 000000000000fa00 R14: 0000000000250000 R15: 00000000fffc0bb5
> [   41.216008] FS:  00007f30862ab600(0000) GS:ffff8be57bc40000(0000) knlGS:0000000000000000
> [   41.216771] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [   41.217299] CR2: 000055e824d0d508 CR3: 0000000231dac000 CR4: 00000000000006e0
> [   41.217934] Call Trace:
> [   41.218225]  memmap_init_zone_device+0x165/0x17c
> [   41.218642]  memremap_pages+0x4c1/0x540
> [   41.218989]  devm_memremap_pages+0x1d/0x60
> [   41.219367]  pmem_attach_disk+0x16b/0x600 [nd_pmem]
> [   41.219804]  ? devm_nsio_enable+0xb8/0xe0
> [   41.220172]  nvdimm_bus_probe+0x69/0x1c0
> [   41.220526]  really_probe+0x1c2/0x3e0
> [   41.220856]  driver_probe_device+0xb4/0x100
> [   41.221238]  device_driver_attach+0x4f/0x60
> [   41.221611]  bind_store+0xc9/0x110
> [   41.221919]  kernfs_fop_write+0x116/0x190
> [   41.222326]  vfs_write+0xa5/0x1a0
> [   41.222626]  ksys_write+0x59/0xd0
> [   41.222927]  do_syscall_64+0x5b/0x180
> [   41.223264]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
> [   41.223714] RIP: 0033:0x7f30865d0ed8
> [   41.224037] Code: 89 02 48 c7 c0 ff ff ff ff eb b3 0f 1f 80 00 00 00 00 f3 0f 1e fa 48 8d 05 45 78 0d 00 8b 00 85 c0 75 17 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 58 c3 0f 1f 80 00 00 00 00 41 54 49 89 d4 55
> [   41.225920] RSP: 002b:00007fffe5d30a78 EFLAGS: 00000246 ORIG_RAX: 0000000000000001
> [   41.226608] RAX: ffffffffffffffda RBX: 000055e824d07f40 RCX: 00007f30865d0ed8
> [   41.227242] RDX: 0000000000000007 RSI: 000055e824d07f40 RDI: 0000000000000004
> [   41.227870] RBP: 0000000000000007 R08: 0000000000000007 R09: 0000000000000006
> [   41.228753] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000004
> [   41.229419] R13: 00007f30862ab528 R14: 0000000000000001 R15: 000055e824d07f40
> 
> While creating a namespace and initializing memmap, if you destroy the namespace
> and shrink the zone, it will initialize the memmap outside the zone and
> trigger VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page) in
> set_pfnblock_flags_mask().

Does that happen with -next as well? There, we currently don‘t shrink the ZONE_DEVICE zone anymore.

> 
> Thanks,
> Toshiki Fukasawa



^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 2/3] mm: Introduce subsection_dev_map
  2019-11-15  0:46                         ` David Hildenbrand
@ 2019-11-15  2:57                           ` Toshiki Fukasawa
  0 siblings, 0 replies; 22+ messages in thread
From: Toshiki Fukasawa @ 2019-11-15  2:57 UTC (permalink / raw)
  To: David Hildenbrand
  Cc: Dan Williams, linux-mm, linux-kernel, akpm, mhocko, adobriyan,
	hch, longman, sfr, mst, cai, Naoya Horiguchi, Junichi Nomura,
	Oscar Salvador

On 2019/11/15 9:46, David Hildenbrand wrote:
> 
> 
>> Am 15.11.2019 um 00:42 schrieb Toshiki Fukasawa <t-fukasawa@vx.jp.nec.com>:
>>
>> On 2019/11/14 6:26, Dan Williams wrote:
>>>> On Wed, Nov 13, 2019 at 1:22 PM David Hildenbrand <david@redhat.com> wrote:
>>>>
>>>>
>>>>
>>>>> Am 13.11.2019 um 22:12 schrieb Dan Williams <dan.j.williams@intel.com>:
>>>>>
>>>>> On Wed, Nov 13, 2019 at 12:40 PM David Hildenbrand <david@redhat.com> wrote:
>>>>> [..]
>>>>>>>>>> I'm still struggling to understand the motivation of distinguishing
>>>>>>>>>> "active" as something distinct from "online". As long as the "online"
>>>>>>>>>> granularity is improved from sections down to subsections then most
>>>>>>>>>> code paths are good to go. The others can use get_devpagemap() to
>>>>>>>>>> check for ZONE_DEVICE in a race free manner as they currently do.
>>>>>>>>>
>>>>>>>>> I thought we wanted to unify access if we don’t really care about the zone as in most pfn walkers - E.g., for zone shrinking.
>>>>>>>>
>>>>>>>> Agree, when the zone does not matter, which is most cases, then
>>>>>>>> pfn_online() and pfn_valid() are sufficient.
>>>>>>
>>>>>> Oh, and just to clarify why I proposed pfn_active(): The issue right now is that a PFN that is valid but not online could be offline memory (memmap not initialized) or ZONE_DEVICE. That‘s why I wanted to have a way to detect if a memmap was initialized, independent of the zone. That‘s important for generic PFN walkers.
>>>>>
>>>>> That's what I was debating with Toshiki [1], whether there is a real
>>>>> example of needing to distinguish ZONE_DEVICE from offline memory in a
>>>>> pfn walker. The proposed use case in this patch set of being able to
>>>>> set hwpoison on ZONE_DEVICE pages does not seem like a good idea to
>>>>> me. My suspicion is that this is a common theme and others are looking
>>>>> to do these types page manipulations that only make sense for online
>>>>> memory. If that is the case then treating ZONE_DEVICE as offline seems
>>>>> the right direction.
>>>>
>>>> Right. At least it would be nice to have for zone shrinking - not sure about the other walkers. We would have to special-case ZONE_DEVICE handling there.
>>>>
>>>
>>> I think that's ok... It's already zone aware code whereas pfn walkers
>>> are zone unaware and should stay that way if at all possible.
>>>
>>>> But as I said, a subsection map for online memory is a good start, especially to fix pfn_to_online_page(). Also, I think this might be a very good thing to have for Oscars memmap-on-memory work (I have a plan in my head I can discuss with Oscar once he has time to work on that again).
>>>
>>> Ok, I'll keep an eye out.
>>
>> I understand your point. Thanks!
>>
>> By the way, I found another problem about ZONE_DEVICE, which
>> is race between memmap initialization and zone shrinking.
>>
>> Iteration of create and destroy namespace causes the panic as below:
>>
>> [   41.207694] kernel BUG at mm/page_alloc.c:535!
>> [   41.208109] invalid opcode: 0000 [#1] SMP PTI
>> [   41.208508] CPU: 7 PID: 2766 Comm: ndctl Not tainted 5.4.0-rc4 #6
>> [   41.209064] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.11.0-0-g63451fca13-prebuilt.qemu-project.org 04/01/2014
>> [   41.210175] RIP: 0010:set_pfnblock_flags_mask+0x95/0xf0
>> [   41.210643] Code: 04 41 83 e2 3c 48 8d 04 a8 48 c1 e0 07 48 03 04 dd e0 59 55 bb 48 8b 58 68 48 39 da 73 0e 48 c7 c6 70 ac 11 bb e8 1b b2 fd ff <0f> 0b 48 03 58 78 48 39 da 73 e9 49 01 ca b9 3f 00 00 00 4f 8d 0c
>> [   41.212354] RSP: 0018:ffffac0d41557c80 EFLAGS: 00010246
>> [   41.212821] RAX: 000000000000004a RBX: 0000000000244a00 RCX: 0000000000000000
>> [   41.213459] RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffffffffbb1197dc
>> [   41.214100] RBP: 000000000000000c R08: 0000000000000439 R09: 0000000000000059
>> [   41.214736] R10: 0000000000000000 R11: ffffac0d41557b08 R12: ffff8be475ea72b0
>> [   41.215376] R13: 000000000000fa00 R14: 0000000000250000 R15: 00000000fffc0bb5
>> [   41.216008] FS:  00007f30862ab600(0000) GS:ffff8be57bc40000(0000) knlGS:0000000000000000
>> [   41.216771] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>> [   41.217299] CR2: 000055e824d0d508 CR3: 0000000231dac000 CR4: 00000000000006e0
>> [   41.217934] Call Trace:
>> [   41.218225]  memmap_init_zone_device+0x165/0x17c
>> [   41.218642]  memremap_pages+0x4c1/0x540
>> [   41.218989]  devm_memremap_pages+0x1d/0x60
>> [   41.219367]  pmem_attach_disk+0x16b/0x600 [nd_pmem]
>> [   41.219804]  ? devm_nsio_enable+0xb8/0xe0
>> [   41.220172]  nvdimm_bus_probe+0x69/0x1c0
>> [   41.220526]  really_probe+0x1c2/0x3e0
>> [   41.220856]  driver_probe_device+0xb4/0x100
>> [   41.221238]  device_driver_attach+0x4f/0x60
>> [   41.221611]  bind_store+0xc9/0x110
>> [   41.221919]  kernfs_fop_write+0x116/0x190
>> [   41.222326]  vfs_write+0xa5/0x1a0
>> [   41.222626]  ksys_write+0x59/0xd0
>> [   41.222927]  do_syscall_64+0x5b/0x180
>> [   41.223264]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
>> [   41.223714] RIP: 0033:0x7f30865d0ed8
>> [   41.224037] Code: 89 02 48 c7 c0 ff ff ff ff eb b3 0f 1f 80 00 00 00 00 f3 0f 1e fa 48 8d 05 45 78 0d 00 8b 00 85 c0 75 17 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 58 c3 0f 1f 80 00 00 00 00 41 54 49 89 d4 55
>> [   41.225920] RSP: 002b:00007fffe5d30a78 EFLAGS: 00000246 ORIG_RAX: 0000000000000001
>> [   41.226608] RAX: ffffffffffffffda RBX: 000055e824d07f40 RCX: 00007f30865d0ed8
>> [   41.227242] RDX: 0000000000000007 RSI: 000055e824d07f40 RDI: 0000000000000004
>> [   41.227870] RBP: 0000000000000007 R08: 0000000000000007 R09: 0000000000000006
>> [   41.228753] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000004
>> [   41.229419] R13: 00007f30862ab528 R14: 0000000000000001 R15: 000055e824d07f40
>>
>> While creating a namespace and initializing memmap, if you destroy the namespace
>> and shrink the zone, it will initialize the memmap outside the zone and
>> trigger VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page) in
>> set_pfnblock_flags_mask().
> 
> Does that happen with -next as well? There, we currently don‘t shrink the ZONE_DEVICE zone anymore.

I confirmed the patch. The panic doesn't occur with linux-next kernel.
https://lore.kernel.org/linux-mm/20191006085646.5768-6-david@redhat.com/

Thank you for your information.

Thanks,
Toshiki Fukasawa

^ permalink raw reply	[flat|nested] 22+ messages in thread

end of thread, other threads:[~2019-11-15  2:59 UTC | newest]

Thread overview: 22+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-11-08  0:08 [PATCH 0/3] make pfn walker support ZONE_DEVICE Toshiki Fukasawa
2019-11-08  0:08 ` [PATCH 1/3] procfs: refactor kpage_*_read() in fs/proc/page.c Toshiki Fukasawa
2019-11-08  0:08 ` [PATCH 2/3] mm: Introduce subsection_dev_map Toshiki Fukasawa
2019-11-08 19:13   ` Dan Williams
2019-11-13 18:51     ` David Hildenbrand
2019-11-13 19:06       ` Dan Williams
2019-11-13 19:53         ` David Hildenbrand
2019-11-13 20:10           ` Dan Williams
2019-11-13 20:23             ` David Hildenbrand
2019-11-13 20:40               ` David Hildenbrand
2019-11-13 21:11                 ` Dan Williams
2019-11-13 21:22                   ` David Hildenbrand
2019-11-13 21:26                     ` Dan Williams
2019-11-14 23:36                       ` Toshiki Fukasawa
2019-11-15  0:46                         ` David Hildenbrand
2019-11-15  2:57                           ` Toshiki Fukasawa
2019-11-08  0:08 ` [PATCH 3/3] mm: make pfn walker support ZONE_DEVICE Toshiki Fukasawa
2019-11-09 17:08   ` kbuild test robot
2019-11-09 19:14   ` kbuild test robot
2019-11-08  9:18 ` [PATCH 0/3] " Michal Hocko
2019-11-11  8:00   ` Toshiki Fukasawa
2019-11-11 16:23     ` Dan Williams

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).