linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH RFC 1/8] mm/memory_hotplug: Revert "mm/memory_hotplug: optimize memory hotplug"
       [not found] <20180413131632.1413-1-david@redhat.com>
@ 2018-04-13 13:16 ` David Hildenbrand
  2018-04-13 13:16 ` [PATCH RFC 2/8] mm: introduce PG_offline David Hildenbrand
                   ` (6 subsequent siblings)
  7 siblings, 0 replies; 23+ messages in thread
From: David Hildenbrand @ 2018-04-13 13:16 UTC (permalink / raw)
  To: linux-mm
  Cc: David Hildenbrand, Greg Kroah-Hartman, Andrew Morton,
	Pavel Tatashin, Ingo Molnar, Thomas Gleixner, Michal Hocko,
	Vlastimil Babka, Dan Williams, Reza Arbab, Mel Gorman,
	Tetsuo Handa, Johannes Weiner, Kirill A. Shutemov, Dave Hansen,
	David Rientjes, Arnd Bergmann, open list

Conflicts with the possibility to online sub-section chunks. Revert it
for now.

Signed-off-by: David Hildenbrand <david@redhat.com>
---
 drivers/base/node.c    |  2 --
 include/linux/memory.h |  1 -
 mm/memory_hotplug.c    | 27 +++++++++++++++++++--------
 mm/page_alloc.c        | 28 ++++++++++++++++++----------
 mm/sparse.c            |  8 +-------
 5 files changed, 38 insertions(+), 28 deletions(-)

diff --git a/drivers/base/node.c b/drivers/base/node.c
index 7a3a580821e0..92b00a7e6a02 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -407,8 +407,6 @@ int register_mem_sect_under_node(struct memory_block *mem_blk, int nid,
 
 	if (!mem_blk)
 		return -EFAULT;
-
-	mem_blk->nid = nid;
 	if (!node_online(nid))
 		return 0;
 
diff --git a/include/linux/memory.h b/include/linux/memory.h
index 31ca3e28b0eb..9f8cd856ca1e 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -33,7 +33,6 @@ struct memory_block {
 	void *hw;			/* optional pointer to fw/hw data */
 	int (*phys_callback)(struct memory_block *);
 	struct device dev;
-	int nid;			/* NID for this memory block */
 };
 
 int arch_get_memory_phys_device(unsigned long start_pfn);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index f74826cdceea..d4474781c799 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -250,6 +250,7 @@ static int __meminit __add_section(int nid, unsigned long phys_start_pfn,
 		struct vmem_altmap *altmap, bool want_memblock)
 {
 	int ret;
+	int i;
 
 	if (pfn_valid(phys_start_pfn))
 		return -EEXIST;
@@ -258,6 +259,23 @@ static int __meminit __add_section(int nid, unsigned long phys_start_pfn,
 	if (ret < 0)
 		return ret;
 
+	/*
+	 * Make all the pages reserved so that nobody will stumble over half
+	 * initialized state.
+	 * FIXME: We also have to associate it with a node because page_to_nid
+	 * relies on having page with the proper node.
+	 */
+	for (i = 0; i < PAGES_PER_SECTION; i++) {
+		unsigned long pfn = phys_start_pfn + i;
+		struct page *page;
+		if (!pfn_valid(pfn))
+			continue;
+
+		page = pfn_to_page(pfn);
+		set_page_node(page, nid);
+		SetPageReserved(page);
+	}
+
 	if (!want_memblock)
 		return 0;
 
@@ -891,15 +909,8 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
 	int nid;
 	int ret;
 	struct memory_notify arg;
-	struct memory_block *mem;
-
-	/*
-	 * We can't use pfn_to_nid() because nid might be stored in struct page
-	 * which is not yet initialized. Instead, we find nid from memory block.
-	 */
-	mem = find_memory_block(__pfn_to_section(pfn));
-	nid = mem->nid;
 
+	nid = pfn_to_nid(pfn);
 	/* associate pfn range with the zone */
 	zone = move_pfn_range(online_type, nid, pfn, nr_pages);
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 905db9d7962f..647c8c6dd4d1 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1172,9 +1172,10 @@ static void free_one_page(struct zone *zone,
 }
 
 static void __meminit __init_single_page(struct page *page, unsigned long pfn,
-				unsigned long zone, int nid)
+				unsigned long zone, int nid, bool zero)
 {
-	mm_zero_struct_page(page);
+	if (zero)
+		mm_zero_struct_page(page);
 	set_page_links(page, zone, nid, pfn);
 	init_page_count(page);
 	page_mapcount_reset(page);
@@ -1188,6 +1189,12 @@ static void __meminit __init_single_page(struct page *page, unsigned long pfn,
 #endif
 }
 
+static void __meminit __init_single_pfn(unsigned long pfn, unsigned long zone,
+					int nid, bool zero)
+{
+	return __init_single_page(pfn_to_page(pfn), pfn, zone, nid, zero);
+}
+
 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
 static void __meminit init_reserved_page(unsigned long pfn)
 {
@@ -1206,7 +1213,7 @@ static void __meminit init_reserved_page(unsigned long pfn)
 		if (pfn >= zone->zone_start_pfn && pfn < zone_end_pfn(zone))
 			break;
 	}
-	__init_single_page(pfn_to_page(pfn), pfn, zid, nid);
+	__init_single_pfn(pfn, zid, nid, true);
 }
 #else
 static inline void init_reserved_page(unsigned long pfn)
@@ -1523,7 +1530,7 @@ static unsigned long  __init deferred_init_pages(int nid, int zid,
 		} else {
 			page++;
 		}
-		__init_single_page(page, pfn, zid, nid);
+		__init_single_page(page, pfn, zid, nid, true);
 		nr_pages++;
 	}
 	return (nr_pages);
@@ -5460,7 +5467,6 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
 	pg_data_t *pgdat = NODE_DATA(nid);
 	unsigned long pfn;
 	unsigned long nr_initialised = 0;
-	struct page *page;
 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
 	struct memblock_region *r = NULL, *tmp;
 #endif
@@ -5513,11 +5519,6 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
 #endif
 
 not_early:
-		page = pfn_to_page(pfn);
-		__init_single_page(page, pfn, zone, nid);
-		if (context == MEMMAP_HOTPLUG)
-			SetPageReserved(page);
-
 		/*
 		 * Mark the block movable so that blocks are reserved for
 		 * movable at startup. This will force kernel allocations
@@ -5534,8 +5535,15 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
 		 * because this is done early in sparse_add_one_section
 		 */
 		if (!(pfn & (pageblock_nr_pages - 1))) {
+			struct page *page = pfn_to_page(pfn);
+
+			__init_single_page(page, pfn, zone, nid,
+					context != MEMMAP_HOTPLUG);
 			set_pageblock_migratetype(page, MIGRATE_MOVABLE);
 			cond_resched();
+		} else {
+			__init_single_pfn(pfn, zone, nid,
+					context != MEMMAP_HOTPLUG);
 		}
 	}
 }
diff --git a/mm/sparse.c b/mm/sparse.c
index 62eef264a7bd..58cab483e81b 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -779,13 +779,7 @@ int __meminit sparse_add_one_section(struct pglist_data *pgdat,
 		goto out;
 	}
 
-#ifdef CONFIG_DEBUG_VM
-	/*
-	 * Poison uninitialized struct pages in order to catch invalid flags
-	 * combinations.
-	 */
-	memset(memmap, PAGE_POISON_PATTERN, sizeof(struct page) * PAGES_PER_SECTION);
-#endif
+	memset(memmap, 0, sizeof(struct page) * PAGES_PER_SECTION);
 
 	section_mark_present(ms);
 
-- 
2.14.3

^ permalink raw reply related	[flat|nested] 23+ messages in thread

* [PATCH RFC 2/8] mm: introduce PG_offline
       [not found] <20180413131632.1413-1-david@redhat.com>
  2018-04-13 13:16 ` [PATCH RFC 1/8] mm/memory_hotplug: Revert "mm/memory_hotplug: optimize memory hotplug" David Hildenbrand
@ 2018-04-13 13:16 ` David Hildenbrand
  2018-04-13 13:40   ` Michal Hocko
                     ` (2 more replies)
  2018-04-13 13:16 ` [PATCH RFC 3/8] mm: use PG_offline in online/offlining code David Hildenbrand
                   ` (5 subsequent siblings)
  7 siblings, 3 replies; 23+ messages in thread
From: David Hildenbrand @ 2018-04-13 13:16 UTC (permalink / raw)
  To: linux-mm
  Cc: David Hildenbrand, Steven Rostedt, Ingo Molnar, Andrew Morton,
	Michal Hocko, Huang Ying, Greg Kroah-Hartman, Pavel Tatashin,
	Miles Chen, Vlastimil Babka, Mel Gorman, Rik van Riel,
	James Hogan, Levin, Alexander (Sasha Levin),
	open list

online_pages()/offline_pages() theoretically allows us to work on
sub-section sizes. This is especially relevant in the context of
virtualization. It e.g. allows us to add/remove memory to Linux in a VM in
4MB chunks.

While the whole section is marked as online/offline, we have to know
the state of each page. E.g. to not read memory that is not online
during kexec() or to properly mark a section as offline as soon as all
contained pages are offline.

Signed-off-by: David Hildenbrand <david@redhat.com>
---
 include/linux/page-flags.h     | 10 ++++++++++
 include/trace/events/mmflags.h |  9 ++++++++-
 2 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index e34a27727b9a..8ebc4bad7824 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -49,6 +49,9 @@
  * PG_hwpoison indicates that a page got corrupted in hardware and contains
  * data with incorrect ECC bits that triggered a machine check. Accessing is
  * not safe since it may cause another machine check. Don't touch!
+ *
+ * PG_offline indicates that a page is offline and the backing storage
+ * might already have been removed (virtualization). Don't touch!
  */
 
 /*
@@ -100,6 +103,9 @@ enum pageflags {
 #if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT)
 	PG_young,
 	PG_idle,
+#endif
+#ifdef CONFIG_MEMORY_HOTPLUG
+	PG_offline,		/* Page is offline. Don't touch */
 #endif
 	__NR_PAGEFLAGS,
 
@@ -381,6 +387,10 @@ TESTCLEARFLAG(Young, young, PF_ANY)
 PAGEFLAG(Idle, idle, PF_ANY)
 #endif
 
+#ifdef CONFIG_MEMORY_HOTPLUG
+PAGEFLAG(Offline, offline, PF_ANY)
+#endif
+
 /*
  * On an anonymous page mapped into a user virtual memory area,
  * page->mapping points to its anon_vma, not to a struct address_space;
diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h
index a81cffb76d89..14c31209e34a 100644
--- a/include/trace/events/mmflags.h
+++ b/include/trace/events/mmflags.h
@@ -79,6 +79,12 @@
 #define IF_HAVE_PG_IDLE(flag,string)
 #endif
 
+#ifdef CONFIG_MEMORY_HOTPLUG
+#define IF_HAVE_PG_OFFLINE(flag,string) ,{1UL << flag, string}
+#else
+#define IF_HAVE_PG_OFFLINE(flag,string)
+#endif
+
 #define __def_pageflag_names						\
 	{1UL << PG_locked,		"locked"	},		\
 	{1UL << PG_waiters,		"waiters"	},		\
@@ -104,7 +110,8 @@ IF_HAVE_PG_MLOCK(PG_mlocked,		"mlocked"	)		\
 IF_HAVE_PG_UNCACHED(PG_uncached,	"uncached"	)		\
 IF_HAVE_PG_HWPOISON(PG_hwpoison,	"hwpoison"	)		\
 IF_HAVE_PG_IDLE(PG_young,		"young"		)		\
-IF_HAVE_PG_IDLE(PG_idle,		"idle"		)
+IF_HAVE_PG_IDLE(PG_idle,		"idle"		)		\
+IF_HAVE_PG_OFFLINE(PG_offline,		"offline"	)
 
 #define show_page_flags(flags)						\
 	(flags) ? __print_flags(flags, "|",				\
-- 
2.14.3

^ permalink raw reply related	[flat|nested] 23+ messages in thread

* [PATCH RFC 3/8] mm: use PG_offline in online/offlining code
       [not found] <20180413131632.1413-1-david@redhat.com>
  2018-04-13 13:16 ` [PATCH RFC 1/8] mm/memory_hotplug: Revert "mm/memory_hotplug: optimize memory hotplug" David Hildenbrand
  2018-04-13 13:16 ` [PATCH RFC 2/8] mm: introduce PG_offline David Hildenbrand
@ 2018-04-13 13:16 ` David Hildenbrand
  2018-04-13 13:31 ` [PATCH RFC 4/8] kdump: expose PG_offline David Hildenbrand
                   ` (4 subsequent siblings)
  7 siblings, 0 replies; 23+ messages in thread
From: David Hildenbrand @ 2018-04-13 13:16 UTC (permalink / raw)
  To: linux-mm
  Cc: Michal Hocko, Stephen Hemminger, David Hildenbrand, Tetsuo Handa,
	Haiyang Zhang, Pavel Tatashin, Reza Arbab, open list,
	Thomas Gleixner, Johannes Weiner,
	open list:Hyper-V CORE AND DRIVERS, Andrew Morton, Mel Gorman,
	Dan Williams, Vlastimil Babka

Let's mark all offline pages with PG_offline. We'll continue to mark
them reserved.

Signed-off-by: David Hildenbrand <david@redhat.com>
---
 drivers/hv/hv_balloon.c |  2 +-
 mm/memory_hotplug.c     | 10 ++++++----
 mm/page_alloc.c         |  5 ++++-
 3 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c
index b3e9f13f8bc3..04d98d9b6191 100644
--- a/drivers/hv/hv_balloon.c
+++ b/drivers/hv/hv_balloon.c
@@ -893,7 +893,7 @@ static unsigned long handle_pg_range(unsigned long pg_start,
 			 * backed previously) online too.
 			 */
 			if (start_pfn > has->start_pfn &&
-			    !PageReserved(pfn_to_page(start_pfn - 1)))
+			    !PageOffline(pfn_to_page(start_pfn - 1)))
 				hv_bring_pgs_online(has, start_pfn, pgs_ol);
 
 		}
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index d4474781c799..3a8d56476233 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -260,8 +260,8 @@ static int __meminit __add_section(int nid, unsigned long phys_start_pfn,
 		return ret;
 
 	/*
-	 * Make all the pages reserved so that nobody will stumble over half
-	 * initialized state.
+	 * Make all the pages offline and reserved so that nobody will stumble
+	 * over half initialized state.
 	 * FIXME: We also have to associate it with a node because page_to_nid
 	 * relies on having page with the proper node.
 	 */
@@ -274,6 +274,7 @@ static int __meminit __add_section(int nid, unsigned long phys_start_pfn,
 		page = pfn_to_page(pfn);
 		set_page_node(page, nid);
 		SetPageReserved(page);
+		SetPageOffline(page);
 	}
 
 	if (!want_memblock)
@@ -669,6 +670,7 @@ EXPORT_SYMBOL_GPL(__online_page_increment_counters);
 
 void __online_page_free(struct page *page)
 {
+	ClearPageOffline(page);
 	__free_reserved_page(page);
 }
 EXPORT_SYMBOL_GPL(__online_page_free);
@@ -687,7 +689,7 @@ static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
 	unsigned long onlined_pages = *(unsigned long *)arg;
 	struct page *page;
 
-	if (PageReserved(pfn_to_page(start_pfn)))
+	if (PageOffline(pfn_to_page(start_pfn)))
 		for (i = 0; i < nr_pages; i++) {
 			page = pfn_to_page(start_pfn + i);
 			(*online_page_callback)(page);
@@ -1437,7 +1439,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
 }
 
 /*
- * remove from free_area[] and mark all as Reserved.
+ * remove from free_area[] and mark all as Reserved and Offline.
  */
 static int
 offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages,
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 647c8c6dd4d1..2e5dcfdb0908 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -8030,6 +8030,7 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
 		if (unlikely(!PageBuddy(page) && PageHWPoison(page))) {
 			pfn++;
 			SetPageReserved(page);
+			SetPageOffline(page);
 			continue;
 		}
 
@@ -8043,8 +8044,10 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
 		list_del(&page->lru);
 		rmv_page_order(page);
 		zone->free_area[order].nr_free--;
-		for (i = 0; i < (1 << order); i++)
+		for (i = 0; i < (1 << order); i++) {
 			SetPageReserved((page+i));
+			SetPageOffline(page + i);
+		}
 		pfn += (1 << order);
 	}
 	spin_unlock_irqrestore(&zone->lock, flags);
-- 
2.14.3

_______________________________________________
devel mailing list
devel@linuxdriverproject.org
http://driverdev.linuxdriverproject.org/mailman/listinfo/driverdev-devel

^ permalink raw reply related	[flat|nested] 23+ messages in thread

* [PATCH RFC 4/8] kdump: expose PG_offline
       [not found] <20180413131632.1413-1-david@redhat.com>
                   ` (2 preceding siblings ...)
  2018-04-13 13:16 ` [PATCH RFC 3/8] mm: use PG_offline in online/offlining code David Hildenbrand
@ 2018-04-13 13:31 ` David Hildenbrand
  2018-04-13 13:33 ` [PATCH RFC 5/8] mm: only mark section offline when all pages are offline David Hildenbrand
                   ` (3 subsequent siblings)
  7 siblings, 0 replies; 23+ messages in thread
From: David Hildenbrand @ 2018-04-13 13:31 UTC (permalink / raw)
  To: linux-mm
  Cc: Dave Young, Andrew Morton, Baoquan He, Hari Bathini,
	Kirill A . Shutemov, Marc-Andre Lureau, linux-kernel,
	David Hildenbrand

This allows user space to skip pages that are offline when dumping. This is
especially relevant when dealing with pages that have been unplugged in
the context of virtualization, and their backing storage has already
been freed.

Signed-off-by: David Hildenbrand <david@redhat.com>
---
 kernel/crash_core.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index a93590cdd9e1..d6f21b19aeb3 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -463,6 +463,9 @@ static int __init crash_save_vmcoreinfo_init(void)
 #ifdef CONFIG_HUGETLB_PAGE
 	VMCOREINFO_NUMBER(HUGETLB_PAGE_DTOR);
 #endif
+#ifdef CONFIG_MEMORY_HOTPLUG
+	VMCOREINFO_NUMBER(PG_offline);
+#endif
 
 	arch_crash_save_vmcoreinfo();
 	update_vmcoreinfo_note();
-- 
2.14.3

^ permalink raw reply related	[flat|nested] 23+ messages in thread

* [PATCH RFC 5/8] mm: only mark section offline when all pages are offline
       [not found] <20180413131632.1413-1-david@redhat.com>
                   ` (3 preceding siblings ...)
  2018-04-13 13:31 ` [PATCH RFC 4/8] kdump: expose PG_offline David Hildenbrand
@ 2018-04-13 13:33 ` David Hildenbrand
  2018-04-13 13:33 ` [PATCH RFC 6/8] mm: offline_pages() is also limited by MAX_ORDER David Hildenbrand
                   ` (2 subsequent siblings)
  7 siblings, 0 replies; 23+ messages in thread
From: David Hildenbrand @ 2018-04-13 13:33 UTC (permalink / raw)
  To: linux-mm
  Cc: David Hildenbrand, Andrew Morton, Michal Hocko, Vlastimil Babka,
	Mel Gorman, Pavel Tatashin, Johannes Weiner, Tetsuo Handa,
	Ingo Molnar, Kirill A. Shutemov, Greg Kroah-Hartman, Dave Hansen,
	David Rientjes, Arnd Bergmann, open list

If any page is still online, the section should stay online.

Signed-off-by: David Hildenbrand <david@redhat.com>
---
 mm/page_alloc.c |  2 +-
 mm/sparse.c     | 25 ++++++++++++++++++++++++-
 2 files changed, 25 insertions(+), 2 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2e5dcfdb0908..ae9023da2ca2 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -8013,7 +8013,6 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
 			break;
 	if (pfn == end_pfn)
 		return;
-	offline_mem_sections(pfn, end_pfn);
 	zone = page_zone(pfn_to_page(pfn));
 	spin_lock_irqsave(&zone->lock, flags);
 	pfn = start_pfn;
@@ -8051,6 +8050,7 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
 		pfn += (1 << order);
 	}
 	spin_unlock_irqrestore(&zone->lock, flags);
+	offline_mem_sections(start_pfn, end_pfn);
 }
 #endif
 
diff --git a/mm/sparse.c b/mm/sparse.c
index 58cab483e81b..44978cb18fed 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -623,7 +623,27 @@ void online_mem_sections(unsigned long start_pfn, unsigned long end_pfn)
 }
 
 #ifdef CONFIG_MEMORY_HOTREMOVE
-/* Mark all memory sections within the pfn range as online */
+static bool all_pages_in_section_offline(unsigned long section_nr)
+{
+	unsigned long pfn = section_nr_to_pfn(section_nr);
+	struct page *page;
+	int i;
+
+	for (i = 0; i < PAGES_PER_SECTION; i++, pfn++) {
+		if (!pfn_valid(pfn))
+			continue;
+
+		page = pfn_to_page(pfn);
+		if (!PageOffline(page))
+			return false;
+	}
+	return true;
+}
+
+/*
+ * Mark all memory sections within the pfn range as offline (if all pages
+ * of a memory section are already offline)
+ */
 void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn)
 {
 	unsigned long pfn;
@@ -639,6 +659,9 @@ void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn)
 		if (WARN_ON(!valid_section_nr(section_nr)))
 			continue;
 
+		if (!all_pages_in_section_offline(section_nr))
+			continue;
+
 		ms = __nr_to_section(section_nr);
 		ms->section_mem_map &= ~SECTION_IS_ONLINE;
 	}
-- 
2.14.3

^ permalink raw reply related	[flat|nested] 23+ messages in thread

* [PATCH RFC 6/8] mm: offline_pages() is also limited by MAX_ORDER
       [not found] <20180413131632.1413-1-david@redhat.com>
                   ` (4 preceding siblings ...)
  2018-04-13 13:33 ` [PATCH RFC 5/8] mm: only mark section offline when all pages are offline David Hildenbrand
@ 2018-04-13 13:33 ` David Hildenbrand
  2018-04-13 13:33 ` [PATCH RFC 7/8] mm: allow to control onlining/offlining of memory by a driver David Hildenbrand
  2018-04-13 13:33 ` [PATCH RFC 8/8] mm: export more functions used to online/offline memory David Hildenbrand
  7 siblings, 0 replies; 23+ messages in thread
From: David Hildenbrand @ 2018-04-13 13:33 UTC (permalink / raw)
  To: linux-mm
  Cc: David Hildenbrand, Andrew Morton, Michal Hocko, Vlastimil Babka,
	Dan Williams, Reza Arbab, Pavel Tatashin, Thomas Gleixner,
	open list

Page blocks might contain references to the next page block. So
a page block cannot be offlined independently. E.g. on x86: page block
size is 2MB, MAX_ORDER -1 (10) allows 4MB allocations.
-> Right now, __offline_isolated_pages() will mark pages in the following
page block as reserved.

Let document offline_pages() while at it.

Signed-off-by: David Hildenbrand <david@redhat.com>
---
 mm/memory_hotplug.c | 22 ++++++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 3a8d56476233..1d6054edc241 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1598,11 +1598,14 @@ static int __ref __offline_pages(unsigned long start_pfn,
 	struct zone *zone;
 	struct memory_notify arg;
 
-	/* at least, alignment against pageblock is necessary */
 	if (!IS_ALIGNED(start_pfn, pageblock_nr_pages))
 		return -EINVAL;
+	if (!IS_ALIGNED(start_pfn, (1 << (MAX_ORDER - 1))))
+		return -EINVAL;
 	if (!IS_ALIGNED(end_pfn, pageblock_nr_pages))
 		return -EINVAL;
+	if (!IS_ALIGNED(end_pfn, (1 << (MAX_ORDER - 1))))
+		return -EINVAL;
 	/* This makes hotplug much easier...and readable.
 	   we assume this for now. .*/
 	if (!test_pages_in_a_zone(start_pfn, end_pfn, &valid_start, &valid_end))
@@ -1699,7 +1702,22 @@ static int __ref __offline_pages(unsigned long start_pfn,
 	return ret;
 }
 
-/* Must be protected by mem_hotplug_begin() or a device_lock */
+/**
+ * offline_pages - offline pages in a given range (that are currently online)
+ * @start_pfn: start pfn of the memory range
+ * @nr_pages: the number of pages
+ *
+ * This function tries to offline the given pages. The alignment/size that
+ * can be used is max(pageblock_nr_pages, 1 << (MAX_ORDER - 1)).
+ *
+ * Returns 0 if sucessful, -EBUSY if the pages cannot be offlined and
+ * -EINVAL if start_pfn/nr_pages is not properly aligned or not in a zone.
+ * -EINTR is returned if interrupted by a signal.
+ *
+ * Bad things will happen if pages are already offline.
+ *
+ * Must be protected by mem_hotplug_begin() or a device_lock
+ */
 int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
 {
 	return __offline_pages(start_pfn, start_pfn + nr_pages);
-- 
2.14.3

^ permalink raw reply related	[flat|nested] 23+ messages in thread

* [PATCH RFC 7/8] mm: allow to control onlining/offlining of memory by a driver
       [not found] <20180413131632.1413-1-david@redhat.com>
                   ` (5 preceding siblings ...)
  2018-04-13 13:33 ` [PATCH RFC 6/8] mm: offline_pages() is also limited by MAX_ORDER David Hildenbrand
@ 2018-04-13 13:33 ` David Hildenbrand
  2018-04-13 15:59   ` Michal Hocko
  2018-04-13 13:33 ` [PATCH RFC 8/8] mm: export more functions used to online/offline memory David Hildenbrand
  7 siblings, 1 reply; 23+ messages in thread
From: David Hildenbrand @ 2018-04-13 13:33 UTC (permalink / raw)
  To: linux-mm
  Cc: David Hildenbrand, Greg Kroah-Hartman, Boris Ostrovsky,
	Juergen Gross, Andrew Morton, Pavel Tatashin, Ingo Molnar,
	Michal Hocko, Vlastimil Babka, Dan Williams, Joonsoo Kim,
	Reza Arbab, Thomas Gleixner, open list,
	moderated list:XEN HYPERVISOR INTERFACE

Some devices (esp. paravirtualized) might want to control
- when to online/offline a memory block
- how to online memory (MOVABLE/NORMAL)
- in which granularity to online/offline memory

So let's add a new flag "driver_managed" and disallow to change the
state by user space. Device onlining/offlining will still work, however
the memory will not be actually onlined/offlined. That has to be handled
by the device driver that owns the memory.

Signed-off-by: David Hildenbrand <david@redhat.com>
---
 drivers/base/memory.c          | 22 ++++++++++++++--------
 drivers/xen/balloon.c          |  2 +-
 include/linux/memory.h         |  1 +
 include/linux/memory_hotplug.h |  4 +++-
 mm/memory_hotplug.c            | 34 ++++++++++++++++++++++++++++++++--
 5 files changed, 51 insertions(+), 12 deletions(-)

diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index bffe8616bd55..3b8616551561 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -231,27 +231,28 @@ static bool pages_correctly_probed(unsigned long start_pfn)
  * Must already be protected by mem_hotplug_begin().
  */
 static int
-memory_block_action(unsigned long phys_index, unsigned long action, int online_type)
+memory_block_action(struct memory_block *mem, unsigned long action)
 {
-	unsigned long start_pfn;
+	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
 	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
-	int ret;
+	int ret = 0;
 
-	start_pfn = section_nr_to_pfn(phys_index);
+	if (mem->driver_managed)
+		return 0;
 
 	switch (action) {
 	case MEM_ONLINE:
 		if (!pages_correctly_probed(start_pfn))
 			return -EBUSY;
 
-		ret = online_pages(start_pfn, nr_pages, online_type);
+		ret = online_pages(start_pfn, nr_pages, mem->online_type);
 		break;
 	case MEM_OFFLINE:
 		ret = offline_pages(start_pfn, nr_pages);
 		break;
 	default:
 		WARN(1, KERN_WARNING "%s(%ld, %ld) unknown action: "
-		     "%ld\n", __func__, phys_index, action, action);
+		     "%ld\n", __func__, mem->start_section_nr, action, action);
 		ret = -EINVAL;
 	}
 
@@ -269,8 +270,7 @@ static int memory_block_change_state(struct memory_block *mem,
 	if (to_state == MEM_OFFLINE)
 		mem->state = MEM_GOING_OFFLINE;
 
-	ret = memory_block_action(mem->start_section_nr, to_state,
-				mem->online_type);
+	ret = memory_block_action(mem, to_state);
 
 	mem->state = ret ? from_state_req : to_state;
 
@@ -350,6 +350,11 @@ store_mem_state(struct device *dev,
 	 */
 	mem_hotplug_begin();
 
+	if (mem->driver_managed) {
+		ret = -EINVAL;
+		goto out;
+	}
+
 	switch (online_type) {
 	case MMOP_ONLINE_KERNEL:
 	case MMOP_ONLINE_MOVABLE:
@@ -364,6 +369,7 @@ store_mem_state(struct device *dev,
 		ret = -EINVAL; /* should never happen */
 	}
 
+out:
 	mem_hotplug_done();
 err:
 	unlock_device_hotplug();
diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c
index 065f0b607373..89981d573c06 100644
--- a/drivers/xen/balloon.c
+++ b/drivers/xen/balloon.c
@@ -401,7 +401,7 @@ static enum bp_state reserve_additional_memory(void)
 	 * callers drop the mutex before trying again.
 	 */
 	mutex_unlock(&balloon_mutex);
-	rc = add_memory_resource(nid, resource, memhp_auto_online);
+	rc = add_memory_resource(nid, resource, memhp_auto_online, false);
 	mutex_lock(&balloon_mutex);
 
 	if (rc) {
diff --git a/include/linux/memory.h b/include/linux/memory.h
index 9f8cd856ca1e..018c5e5ecde1 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -29,6 +29,7 @@ struct memory_block {
 	unsigned long state;		/* serialized by the dev->lock */
 	int section_count;		/* serialized by mem_sysfs_mutex */
 	int online_type;		/* for passing data to online routine */
+	bool driver_managed;		/* driver handles online/offline */
 	int phys_device;		/* to which fru does this belong? */
 	void *hw;			/* optional pointer to fw/hw data */
 	int (*phys_callback)(struct memory_block *);
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index e0e49b5b1ee1..46c6ceb1110d 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -320,7 +320,9 @@ static inline void remove_memory(int nid, u64 start, u64 size) {}
 extern int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
 		void *arg, int (*func)(struct memory_block *, void *));
 extern int add_memory(int nid, u64 start, u64 size);
-extern int add_memory_resource(int nid, struct resource *resource, bool online);
+extern int add_memory_driver_managed(int nid, u64 start, u64 size);
+extern int add_memory_resource(int nid, struct resource *resource, bool online,
+			       bool driver_managed);
 extern int arch_add_memory(int nid, u64 start, u64 size,
 		struct vmem_altmap *altmap, bool want_memblock);
 extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 1d6054edc241..ac14ea772792 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1108,8 +1108,15 @@ static int online_memory_block(struct memory_block *mem, void *arg)
 	return device_online(&mem->dev);
 }
 
+static int mark_memory_block_driver_managed(struct memory_block *mem, void *arg)
+{
+	mem->driver_managed = true;
+	return 0;
+}
+
 /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
-int __ref add_memory_resource(int nid, struct resource *res, bool online)
+int __ref add_memory_resource(int nid, struct resource *res, bool online,
+			      bool driver_managed)
 {
 	u64 start, size;
 	pg_data_t *pgdat = NULL;
@@ -1117,6 +1124,9 @@ int __ref add_memory_resource(int nid, struct resource *res, bool online)
 	bool new_node;
 	int ret;
 
+	if (online && driver_managed)
+		return -EINVAL;
+
 	start = res->start;
 	size = resource_size(res);
 
@@ -1188,6 +1198,9 @@ int __ref add_memory_resource(int nid, struct resource *res, bool online)
 	if (online)
 		walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1),
 				  NULL, online_memory_block);
+	else if (driver_managed)
+		walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1),
+				  NULL, mark_memory_block_driver_managed);
 
 	goto out;
 
@@ -1212,13 +1225,30 @@ int __ref add_memory(int nid, u64 start, u64 size)
 	if (IS_ERR(res))
 		return PTR_ERR(res);
 
-	ret = add_memory_resource(nid, res, memhp_auto_online);
+	ret = add_memory_resource(nid, res, memhp_auto_online, false);
 	if (ret < 0)
 		release_memory_resource(res);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(add_memory);
 
+int __ref add_memory_driver_managed(int nid, u64 start, u64 size)
+{
+	struct resource *res;
+	int ret;
+
+	res = register_memory_resource(start, size);
+	if (IS_ERR(res))
+		return PTR_ERR(res);
+
+	ret = add_memory_resource(nid, res, false, true);
+	if (ret < 0)
+		release_memory_resource(res);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(add_memory_driver_managed);
+
+
 #ifdef CONFIG_MEMORY_HOTREMOVE
 /*
  * A free page on the buddy free lists (not the per-cpu lists) has PageBuddy
-- 
2.14.3

^ permalink raw reply related	[flat|nested] 23+ messages in thread

* [PATCH RFC 8/8] mm: export more functions used to online/offline memory
       [not found] <20180413131632.1413-1-david@redhat.com>
                   ` (6 preceding siblings ...)
  2018-04-13 13:33 ` [PATCH RFC 7/8] mm: allow to control onlining/offlining of memory by a driver David Hildenbrand
@ 2018-04-13 13:33 ` David Hildenbrand
  7 siblings, 0 replies; 23+ messages in thread
From: David Hildenbrand @ 2018-04-13 13:33 UTC (permalink / raw)
  To: linux-mm
  Cc: David Hildenbrand, Andrew Morton, Michal Hocko, Vlastimil Babka,
	Dan Williams, Pavel Tatashin, Thomas Gleixner, open list

Kernel modules that want to control how/when memory is onlined/offlined
need these functions.

Signed-off-by: David Hildenbrand <david@redhat.com>
---
 mm/memory_hotplug.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index ac14ea772792..3c374d308cf4 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -979,6 +979,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
 	memory_notify(MEM_CANCEL_ONLINE, &arg);
 	return ret;
 }
+EXPORT_SYMBOL(online_pages);
 #endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */
 
 static void reset_node_present_pages(pg_data_t *pgdat)
@@ -1296,6 +1297,7 @@ bool is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
 	/* All pageblocks in the memory block are likely to be hot-removable */
 	return true;
 }
+EXPORT_SYMBOL(is_mem_section_removable);
 
 /*
  * Confirm all pages in a range [start, end) belong to the same zone.
@@ -1752,6 +1754,7 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
 {
 	return __offline_pages(start_pfn, start_pfn + nr_pages);
 }
+EXPORT_SYMBOL(offline_pages);
 #endif /* CONFIG_MEMORY_HOTREMOVE */
 
 /**
@@ -1802,6 +1805,7 @@ int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
 
 	return 0;
 }
+EXPORT_SYMBOL(walk_memory_range);
 
 #ifdef CONFIG_MEMORY_HOTREMOVE
 static int check_memblock_offlined_cb(struct memory_block *mem, void *arg)
-- 
2.14.3

^ permalink raw reply related	[flat|nested] 23+ messages in thread

* Re: [PATCH RFC 2/8] mm: introduce PG_offline
  2018-04-13 13:16 ` [PATCH RFC 2/8] mm: introduce PG_offline David Hildenbrand
@ 2018-04-13 13:40   ` Michal Hocko
  2018-04-13 13:46     ` David Hildenbrand
  2018-04-17 11:50     ` David Hildenbrand
  2018-04-13 17:11   ` Matthew Wilcox
  2018-04-20  7:30   ` David Hildenbrand
  2 siblings, 2 replies; 23+ messages in thread
From: Michal Hocko @ 2018-04-13 13:40 UTC (permalink / raw)
  To: David Hildenbrand
  Cc: linux-mm, Steven Rostedt, Ingo Molnar, Andrew Morton, Huang Ying,
	Greg Kroah-Hartman, Pavel Tatashin, Miles Chen, Vlastimil Babka,
	Mel Gorman, Rik van Riel, James Hogan, Levin,
	Alexander (Sasha Levin),
	open list

On Fri 13-04-18 15:16:26, David Hildenbrand wrote:
> online_pages()/offline_pages() theoretically allows us to work on
> sub-section sizes. This is especially relevant in the context of
> virtualization. It e.g. allows us to add/remove memory to Linux in a VM in
> 4MB chunks.

Well, theoretically possible but this would require a lot of auditing
because the hotplug and per section assumption is quite a spread one.

> While the whole section is marked as online/offline, we have to know
> the state of each page. E.g. to not read memory that is not online
> during kexec() or to properly mark a section as offline as soon as all
> contained pages are offline.

But you cannot use a page flag for that, I am afraid. Page flags are
extremely scarce resource. I haven't looked at the rest of the series
but _if_ we have a bit spare which I am not really sure about then you
should prove there are no other ways around this.
 
> Signed-off-by: David Hildenbrand <david@redhat.com>
-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH RFC 2/8] mm: introduce PG_offline
  2018-04-13 13:40   ` Michal Hocko
@ 2018-04-13 13:46     ` David Hildenbrand
  2018-04-17 11:50     ` David Hildenbrand
  1 sibling, 0 replies; 23+ messages in thread
From: David Hildenbrand @ 2018-04-13 13:46 UTC (permalink / raw)
  To: Michal Hocko
  Cc: linux-mm, Steven Rostedt, Ingo Molnar, Andrew Morton, Huang Ying,
	Greg Kroah-Hartman, Pavel Tatashin, Miles Chen, Vlastimil Babka,
	Mel Gorman, Rik van Riel, James Hogan, Levin,
	Alexander (Sasha Levin),
	open list

On 13.04.2018 15:40, Michal Hocko wrote:
> On Fri 13-04-18 15:16:26, David Hildenbrand wrote:
>> online_pages()/offline_pages() theoretically allows us to work on
>> sub-section sizes. This is especially relevant in the context of
>> virtualization. It e.g. allows us to add/remove memory to Linux in a VM in
>> 4MB chunks.
> 
> Well, theoretically possible but this would require a lot of auditing
> because the hotplug and per section assumption is quite a spread one.

Indeed. But besides changing section sizes / size of memory blocks this
seems to be the only way to do it. (btw, I think Windows allows to add
1MB chunks - e.g. 1MB DIMMs)

But as these pages "belong to nobody" nobody (besides kdump) should dare
to access the content, although the section is online.

> 
>> While the whole section is marked as online/offline, we have to know
>> the state of each page. E.g. to not read memory that is not online
>> during kexec() or to properly mark a section as offline as soon as all
>> contained pages are offline.
> 
> But you cannot use a page flag for that, I am afraid. Page flags are
> extremely scarce resource. I haven't looked at the rest of the series
> but _if_ we have a bit spare which I am not really sure about then you
> should prove there are no other ways around this.

Open for suggestions. We could remember per segment/memory block which
parts are online/offline and use that to decide if a section can go offline.

However: kdump will also have to (easily) know which pages are offline,
so it can skip reading them. (see the other patch)

>  
>> Signed-off-by: David Hildenbrand <david@redhat.com>


-- 

Thanks,

David / dhildenb

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH RFC 7/8] mm: allow to control onlining/offlining of memory by a driver
  2018-04-13 13:33 ` [PATCH RFC 7/8] mm: allow to control onlining/offlining of memory by a driver David Hildenbrand
@ 2018-04-13 15:59   ` Michal Hocko
  2018-04-13 16:32     ` David Hildenbrand
  0 siblings, 1 reply; 23+ messages in thread
From: Michal Hocko @ 2018-04-13 15:59 UTC (permalink / raw)
  To: David Hildenbrand
  Cc: linux-mm, Greg Kroah-Hartman, Boris Ostrovsky, Juergen Gross,
	Andrew Morton, Pavel Tatashin, Ingo Molnar, Vlastimil Babka,
	Dan Williams, Joonsoo Kim, Reza Arbab, Thomas Gleixner,
	open list, moderated list:XEN HYPERVISOR INTERFACE

On Fri 13-04-18 15:33:28, David Hildenbrand wrote:
> Some devices (esp. paravirtualized) might want to control
> - when to online/offline a memory block
> - how to online memory (MOVABLE/NORMAL)
> - in which granularity to online/offline memory
> 
> So let's add a new flag "driver_managed" and disallow to change the
> state by user space. Device onlining/offlining will still work, however
> the memory will not be actually onlined/offlined. That has to be handled
> by the device driver that owns the memory.

Is there any reason to create the memblock sysfs interface to this
memory at all? ZONE_DEVICE mem hotplug users currently do not do that
and manage the memory themselves. It seems you want to achieve the same
thing, no?
-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH RFC 7/8] mm: allow to control onlining/offlining of memory by a driver
  2018-04-13 15:59   ` Michal Hocko
@ 2018-04-13 16:32     ` David Hildenbrand
  0 siblings, 0 replies; 23+ messages in thread
From: David Hildenbrand @ 2018-04-13 16:32 UTC (permalink / raw)
  To: Michal Hocko
  Cc: linux-mm, Greg Kroah-Hartman, Boris Ostrovsky, Juergen Gross,
	Andrew Morton, Pavel Tatashin, Ingo Molnar, Vlastimil Babka,
	Dan Williams, Joonsoo Kim, Reza Arbab, Thomas Gleixner,
	open list, moderated list:XEN HYPERVISOR INTERFACE

On 13.04.2018 17:59, Michal Hocko wrote:
> On Fri 13-04-18 15:33:28, David Hildenbrand wrote:
>> Some devices (esp. paravirtualized) might want to control
>> - when to online/offline a memory block
>> - how to online memory (MOVABLE/NORMAL)
>> - in which granularity to online/offline memory
>>
>> So let's add a new flag "driver_managed" and disallow to change the
>> state by user space. Device onlining/offlining will still work, however
>> the memory will not be actually onlined/offlined. That has to be handled
>> by the device driver that owns the memory.
> 
> Is there any reason to create the memblock sysfs interface to this
> memory at all? ZONE_DEVICE mem hotplug users currently do not do that
> and manage the memory themselves. It seems you want to achieve the same
> thing, no?
> 

Yes, I think so, namely kdump. We have to retrigger kexec() whenever a
memory block is added/removed. udev events are sent for that reason when
a memory block is created/deleted. And I think this is not done for
ZONE_DEVICE devices, or am I wrong?

-- 

Thanks,

David / dhildenb

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH RFC 2/8] mm: introduce PG_offline
  2018-04-13 13:16 ` [PATCH RFC 2/8] mm: introduce PG_offline David Hildenbrand
  2018-04-13 13:40   ` Michal Hocko
@ 2018-04-13 17:11   ` Matthew Wilcox
  2018-04-16  8:31     ` David Hildenbrand
  2018-04-21 16:52     ` Vlastimil Babka
  2018-04-20  7:30   ` David Hildenbrand
  2 siblings, 2 replies; 23+ messages in thread
From: Matthew Wilcox @ 2018-04-13 17:11 UTC (permalink / raw)
  To: David Hildenbrand
  Cc: linux-mm, Steven Rostedt, Ingo Molnar, Andrew Morton,
	Michal Hocko, Huang Ying, Greg Kroah-Hartman, Pavel Tatashin,
	Miles Chen, Vlastimil Babka, Mel Gorman, Rik van Riel,
	James Hogan, Levin, Alexander (Sasha Levin),
	open list

On Fri, Apr 13, 2018 at 03:16:26PM +0200, David Hildenbrand wrote:
> online_pages()/offline_pages() theoretically allows us to work on
> sub-section sizes. This is especially relevant in the context of
> virtualization. It e.g. allows us to add/remove memory to Linux in a VM in
> 4MB chunks.
> 
> While the whole section is marked as online/offline, we have to know
> the state of each page. E.g. to not read memory that is not online
> during kexec() or to properly mark a section as offline as soon as all
> contained pages are offline.

Can you not use PG_reserved for this purpose?

> + * PG_offline indicates that a page is offline and the backing storage
> + * might already have been removed (virtualization). Don't touch!

 * PG_reserved is set for special pages, which can never be swapped out. Some
 * of them might not even exist...

They seem pretty congruent to me.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH RFC 2/8] mm: introduce PG_offline
  2018-04-13 17:11   ` Matthew Wilcox
@ 2018-04-16  8:31     ` David Hildenbrand
  2018-04-21 16:52     ` Vlastimil Babka
  1 sibling, 0 replies; 23+ messages in thread
From: David Hildenbrand @ 2018-04-16  8:31 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: linux-mm, Steven Rostedt, Ingo Molnar, Andrew Morton,
	Michal Hocko, Huang Ying, Greg Kroah-Hartman, Pavel Tatashin,
	Miles Chen, Vlastimil Babka, Mel Gorman, Rik van Riel,
	James Hogan, Levin, Alexander (Sasha Levin),
	open list

On 13.04.2018 19:11, Matthew Wilcox wrote:
> On Fri, Apr 13, 2018 at 03:16:26PM +0200, David Hildenbrand wrote:
>> online_pages()/offline_pages() theoretically allows us to work on
>> sub-section sizes. This is especially relevant in the context of
>> virtualization. It e.g. allows us to add/remove memory to Linux in a VM in
>> 4MB chunks.
>>
>> While the whole section is marked as online/offline, we have to know
>> the state of each page. E.g. to not read memory that is not online
>> during kexec() or to properly mark a section as offline as soon as all
>> contained pages are offline.
> 
> Can you not use PG_reserved for this purpose?
> 
>> + * PG_offline indicates that a page is offline and the backing storage
>> + * might already have been removed (virtualization). Don't touch!
> 
>  * PG_reserved is set for special pages, which can never be swapped out. Some
>  * of them might not even exist...
> 
> They seem pretty congruent to me.
> 

Can we really go ahead and make dump tools exclude any PG_reserved page
from a memory dump? While it might be true for ballooned pages, I doubt
that this assumption holds in general. ("cannot be swapped out" doesn't
imply "content should never be read/dumped")


I need PG_offline right now for two reasons:

1. Make kdump skip these pages (like PG_hwpoison), because they might
not even be readable anymore as the hypervisor might have restricted
memory access completely.

2. Detect when all pages of a memory section are offline, so we can mark
the section as offline and eventually remove it.


A clear point speaking against using PG_reserved for 2. is the following
simple example.

Let's assume we use virtio-balloon and inflated some chunk of memory in
a section (let's say 4MB). Now we offline (using the new driver) all
other chunks in a section, except the memory allocated by
virtio-balloon. We would suddenly mark the section as offline and
eventually remove it. This is of course very bad.


I think using PG_reserved for 1. is wrong. PG_reserved is usually used
for pages _after_ coming from an allocator. Using PG_reserved for 2.
will not work.


An ugly way for 2. would be, remembering for each section which pages
are actually online, but I would like to avoid that, especially as it
only solves part of a problem.

-- 

Thanks,

David / dhildenb

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH RFC 2/8] mm: introduce PG_offline
  2018-04-13 13:40   ` Michal Hocko
  2018-04-13 13:46     ` David Hildenbrand
@ 2018-04-17 11:50     ` David Hildenbrand
  1 sibling, 0 replies; 23+ messages in thread
From: David Hildenbrand @ 2018-04-17 11:50 UTC (permalink / raw)
  To: Michal Hocko
  Cc: linux-mm, Steven Rostedt, Ingo Molnar, Andrew Morton, Huang Ying,
	Greg Kroah-Hartman, Pavel Tatashin, Miles Chen, Vlastimil Babka,
	Mel Gorman, Rik van Riel, James Hogan, Levin,
	Alexander (Sasha Levin),
	open list

On 13.04.2018 15:40, Michal Hocko wrote:
> On Fri 13-04-18 15:16:26, David Hildenbrand wrote:
>> online_pages()/offline_pages() theoretically allows us to work on
>> sub-section sizes. This is especially relevant in the context of
>> virtualization. It e.g. allows us to add/remove memory to Linux in a VM in
>> 4MB chunks.
> 
> Well, theoretically possible but this would require a lot of auditing
> because the hotplug and per section assumption is quite a spread one.
> 
>> While the whole section is marked as online/offline, we have to know
>> the state of each page. E.g. to not read memory that is not online
>> during kexec() or to properly mark a section as offline as soon as all
>> contained pages are offline.
> 
> But you cannot use a page flag for that, I am afraid. Page flags are
> extremely scarce resource. I haven't looked at the rest of the series
> but _if_ we have a bit spare which I am not really sure about then you
> should prove there are no other ways around this.

BTW, looking at the possible layouts of page->flags, I don't think it
will be a problem adding this flag. Especially if we compile this flag
only if really needed.

We could glue this flag to CONFIG_MEMORY_HOTPLUG_SUBSECTION to something
like that, that will be set when our new driver is compiled. So this
would not affect anybody just wanting to use ordinary DIMM based hotplug
(CONFIG_MEMORY_HOTPLUG).

But I am open for other suggestions. I don't think PG_reserved is the
right thing to use. And storing for each section which parts are
online/offline is also something I would like to avoid.

-- 

Thanks,

David / dhildenb

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH RFC 2/8] mm: introduce PG_offline
  2018-04-13 13:16 ` [PATCH RFC 2/8] mm: introduce PG_offline David Hildenbrand
  2018-04-13 13:40   ` Michal Hocko
  2018-04-13 17:11   ` Matthew Wilcox
@ 2018-04-20  7:30   ` David Hildenbrand
  2 siblings, 0 replies; 23+ messages in thread
From: David Hildenbrand @ 2018-04-20  7:30 UTC (permalink / raw)
  To: linux-mm
  Cc: Steven Rostedt, Ingo Molnar, Andrew Morton, Michal Hocko,
	Huang Ying, Greg Kroah-Hartman, Pavel Tatashin, Miles Chen,
	Vlastimil Babka, Mel Gorman, Rik van Riel, James Hogan, Levin,
	Alexander (Sasha Levin),
	open list

On 13.04.2018 15:16, David Hildenbrand wrote:
> online_pages()/offline_pages() theoretically allows us to work on
> sub-section sizes. This is especially relevant in the context of
> virtualization. It e.g. allows us to add/remove memory to Linux in a VM in
> 4MB chunks.
> 
> While the whole section is marked as online/offline, we have to know
> the state of each page. E.g. to not read memory that is not online
> during kexec() or to properly mark a section as offline as soon as all
> contained pages are offline.
> 
> Signed-off-by: David Hildenbrand <david@redhat.com>
> ---
>  include/linux/page-flags.h     | 10 ++++++++++
>  include/trace/events/mmflags.h |  9 ++++++++-
>  2 files changed, 18 insertions(+), 1 deletion(-)
> 
> diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
> index e34a27727b9a..8ebc4bad7824 100644
> --- a/include/linux/page-flags.h
> +++ b/include/linux/page-flags.h
> @@ -49,6 +49,9 @@
>   * PG_hwpoison indicates that a page got corrupted in hardware and contains
>   * data with incorrect ECC bits that triggered a machine check. Accessing is
>   * not safe since it may cause another machine check. Don't touch!
> + *
> + * PG_offline indicates that a page is offline and the backing storage
> + * might already have been removed (virtualization). Don't touch!
>   */
>  
>  /*
> @@ -100,6 +103,9 @@ enum pageflags {
>  #if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT)
>  	PG_young,
>  	PG_idle,
> +#endif
> +#ifdef CONFIG_MEMORY_HOTPLUG
> +	PG_offline,		/* Page is offline. Don't touch */
>  #endif
>  	__NR_PAGEFLAGS,
>  
> @@ -381,6 +387,10 @@ TESTCLEARFLAG(Young, young, PF_ANY)
>  PAGEFLAG(Idle, idle, PF_ANY)
>  #endif
>  
> +#ifdef CONFIG_MEMORY_HOTPLUG
> +PAGEFLAG(Offline, offline, PF_ANY)
> +#endif
> +
>  /*
>   * On an anonymous page mapped into a user virtual memory area,
>   * page->mapping points to its anon_vma, not to a struct address_space;
> diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h
> index a81cffb76d89..14c31209e34a 100644
> --- a/include/trace/events/mmflags.h
> +++ b/include/trace/events/mmflags.h
> @@ -79,6 +79,12 @@
>  #define IF_HAVE_PG_IDLE(flag,string)
>  #endif
>  
> +#ifdef CONFIG_MEMORY_HOTPLUG
> +#define IF_HAVE_PG_OFFLINE(flag,string) ,{1UL << flag, string}
> +#else
> +#define IF_HAVE_PG_OFFLINE(flag,string)
> +#endif
> +
>  #define __def_pageflag_names						\
>  	{1UL << PG_locked,		"locked"	},		\
>  	{1UL << PG_waiters,		"waiters"	},		\
> @@ -104,7 +110,8 @@ IF_HAVE_PG_MLOCK(PG_mlocked,		"mlocked"	)		\
>  IF_HAVE_PG_UNCACHED(PG_uncached,	"uncached"	)		\
>  IF_HAVE_PG_HWPOISON(PG_hwpoison,	"hwpoison"	)		\
>  IF_HAVE_PG_IDLE(PG_young,		"young"		)		\
> -IF_HAVE_PG_IDLE(PG_idle,		"idle"		)
> +IF_HAVE_PG_IDLE(PG_idle,		"idle"		)		\
> +IF_HAVE_PG_OFFLINE(PG_offline,		"offline"	)
>  
>  #define show_page_flags(flags)						\
>  	(flags) ? __print_flags(flags, "|",				\
> 

I am thinking right now of gluing this to CONFIG_MEMORY_PG_OFFLINE and
allowing it to be used also for ordinary balloon drivers. It is then
basically a way to signal using kdump "don't dump this page, the content
is either invalid or not even accessible".

-- 

Thanks,

David / dhildenb

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH RFC 2/8] mm: introduce PG_offline
  2018-04-13 17:11   ` Matthew Wilcox
  2018-04-16  8:31     ` David Hildenbrand
@ 2018-04-21 16:52     ` Vlastimil Babka
  2018-04-22  3:01       ` Matthew Wilcox
  1 sibling, 1 reply; 23+ messages in thread
From: Vlastimil Babka @ 2018-04-21 16:52 UTC (permalink / raw)
  To: Matthew Wilcox, David Hildenbrand
  Cc: linux-mm, Steven Rostedt, Ingo Molnar, Andrew Morton,
	Michal Hocko, Huang Ying, Greg Kroah-Hartman, Pavel Tatashin,
	Miles Chen, Mel Gorman, Rik van Riel, James Hogan, Levin,
	Alexander (Sasha Levin),
	open list

On 04/13/2018 07:11 PM, Matthew Wilcox wrote:
> On Fri, Apr 13, 2018 at 03:16:26PM +0200, David Hildenbrand wrote:
>> online_pages()/offline_pages() theoretically allows us to work on
>> sub-section sizes. This is especially relevant in the context of
>> virtualization. It e.g. allows us to add/remove memory to Linux in a VM in
>> 4MB chunks.
>>
>> While the whole section is marked as online/offline, we have to know
>> the state of each page. E.g. to not read memory that is not online
>> during kexec() or to properly mark a section as offline as soon as all
>> contained pages are offline.
> 
> Can you not use PG_reserved for this purpose?

Sounds like your newly introduced "page types" could be useful here? I
don't suppose those offline pages would be using mapcount which is
aliased there?

>> + * PG_offline indicates that a page is offline and the backing storage
>> + * might already have been removed (virtualization). Don't touch!
> 
>  * PG_reserved is set for special pages, which can never be swapped out. Some
>  * of them might not even exist...
> 
> They seem pretty congruent to me.
> 

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH RFC 2/8] mm: introduce PG_offline
  2018-04-21 16:52     ` Vlastimil Babka
@ 2018-04-22  3:01       ` Matthew Wilcox
  2018-04-22  8:17         ` David Hildenbrand
  0 siblings, 1 reply; 23+ messages in thread
From: Matthew Wilcox @ 2018-04-22  3:01 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: David Hildenbrand, linux-mm, Steven Rostedt, Ingo Molnar,
	Andrew Morton, Michal Hocko, Huang Ying, Greg Kroah-Hartman,
	Pavel Tatashin, Miles Chen, Mel Gorman, Rik van Riel,
	James Hogan, Levin, Alexander (Sasha Levin),
	open list

On Sat, Apr 21, 2018 at 06:52:18PM +0200, Vlastimil Babka wrote:
> On 04/13/2018 07:11 PM, Matthew Wilcox wrote:
> > On Fri, Apr 13, 2018 at 03:16:26PM +0200, David Hildenbrand wrote:
> >> online_pages()/offline_pages() theoretically allows us to work on
> >> sub-section sizes. This is especially relevant in the context of
> >> virtualization. It e.g. allows us to add/remove memory to Linux in a VM in
> >> 4MB chunks.
> >>
> >> While the whole section is marked as online/offline, we have to know
> >> the state of each page. E.g. to not read memory that is not online
> >> during kexec() or to properly mark a section as offline as soon as all
> >> contained pages are offline.
> > 
> > Can you not use PG_reserved for this purpose?
> 
> Sounds like your newly introduced "page types" could be useful here? I
> don't suppose those offline pages would be using mapcount which is
> aliased there?

Oh, that's a good point!  Yes, this is a perfect use for page_type.
We have something like twenty bits available there.

Now you've got me thinking that we can move PG_hwpoison and PG_reserved
to be page_type flags too.  That'll take us from 23 to 21 bits (on 32-bit,
with PG_UNCACHED)

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH RFC 2/8] mm: introduce PG_offline
  2018-04-22  3:01       ` Matthew Wilcox
@ 2018-04-22  8:17         ` David Hildenbrand
  2018-04-22 14:02           ` Matthew Wilcox
  0 siblings, 1 reply; 23+ messages in thread
From: David Hildenbrand @ 2018-04-22  8:17 UTC (permalink / raw)
  To: Matthew Wilcox, Vlastimil Babka
  Cc: linux-mm, Steven Rostedt, Ingo Molnar, Andrew Morton,
	Michal Hocko, Huang Ying, Greg Kroah-Hartman, Pavel Tatashin,
	Miles Chen, Mel Gorman, Rik van Riel, James Hogan, Levin,
	Alexander (Sasha Levin),
	open list

On 22.04.2018 05:01, Matthew Wilcox wrote:
> On Sat, Apr 21, 2018 at 06:52:18PM +0200, Vlastimil Babka wrote:
>> On 04/13/2018 07:11 PM, Matthew Wilcox wrote:
>>> On Fri, Apr 13, 2018 at 03:16:26PM +0200, David Hildenbrand wrote:
>>>> online_pages()/offline_pages() theoretically allows us to work on
>>>> sub-section sizes. This is especially relevant in the context of
>>>> virtualization. It e.g. allows us to add/remove memory to Linux in a VM in
>>>> 4MB chunks.
>>>>
>>>> While the whole section is marked as online/offline, we have to know
>>>> the state of each page. E.g. to not read memory that is not online
>>>> during kexec() or to properly mark a section as offline as soon as all
>>>> contained pages are offline.
>>>
>>> Can you not use PG_reserved for this purpose?
>>
>> Sounds like your newly introduced "page types" could be useful here? I
>> don't suppose those offline pages would be using mapcount which is
>> aliased there?
> 
> Oh, that's a good point!  Yes, this is a perfect use for page_type.
> We have something like twenty bits available there.
> 
> Now you've got me thinking that we can move PG_hwpoison and PG_reserved
> to be page_type flags too.  That'll take us from 23 to 21 bits (on 32-bit,
> with PG_UNCACHED)
> 

Some things to clarify here. I modified the current RFC to also allow
PG_offline on allocated (ballooned) pages (e.g. virtio-balloon).

kdump based dump tools can then easily identify which pages are not to
be dumped (either because the content is invalid or not accessible).

I previously stated that ballooned pages would be marked as PG_reserved,
which is not true (at least not for virtio-balloon). However this allows
me to detect if all pages in a section are offline by looking at
(PG_reserved && PG_offline). So I can actually tell if a page is marked
as offline and allocated or really offline.


1. The location (not the number!) of PG_hwpoison is basically ABI and
cannot be changed. Moving it around will most probably break dump tools.
(see kernel/crash_core.c)

2. Exposing PG_offline via kdump will make it ABI as well. And we don't
want any complicated validity checks ("is the bit valid or not?"),
because that would imply having to make these bits ABI as well. So
having PG_offline just like PG_hwpoison part of page_flags is the right
thing to do. (see patch nr 4)

3. For determining if all pages of a section are offline (see patch nr
5), I will have to be able to check 1. PG_offline and 2. PG_reserved on
any page. Will this be possible by moving e.g. PG_reserved to page
types? (especially if some field is suddenly aliased?)


I was wondering if we could reuse PG_hwpoison to mark "this page is not
to be read by dump tools" and store the real reason (page offline/page
has an hw error/page is part of a balloon ...) somewhere else (page
types?). But I am not sure if changing the semantic of PG_hwpoison
(visible to dump tools) is okay, and if we can then always "read out"
the type (especially: when is the page type field valid and can be used?)

Thanks!

-- 

Thanks,

David / dhildenb

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH RFC 2/8] mm: introduce PG_offline
  2018-04-22  8:17         ` David Hildenbrand
@ 2018-04-22 14:02           ` Matthew Wilcox
  2018-04-22 15:13             ` David Hildenbrand
  0 siblings, 1 reply; 23+ messages in thread
From: Matthew Wilcox @ 2018-04-22 14:02 UTC (permalink / raw)
  To: David Hildenbrand
  Cc: Vlastimil Babka, linux-mm, Steven Rostedt, Ingo Molnar,
	Andrew Morton, Michal Hocko, Huang Ying, Greg Kroah-Hartman,
	Pavel Tatashin, Miles Chen, Mel Gorman, Rik van Riel,
	James Hogan, Levin, Alexander (Sasha Levin),
	open list

On Sun, Apr 22, 2018 at 10:17:31AM +0200, David Hildenbrand wrote:
> On 22.04.2018 05:01, Matthew Wilcox wrote:
> > On Sat, Apr 21, 2018 at 06:52:18PM +0200, Vlastimil Babka wrote:
> >> Sounds like your newly introduced "page types" could be useful here? I
> >> don't suppose those offline pages would be using mapcount which is
> >> aliased there?
> > 
> > Oh, that's a good point!  Yes, this is a perfect use for page_type.
> > We have something like twenty bits available there.
> > 
> > Now you've got me thinking that we can move PG_hwpoison and PG_reserved
> > to be page_type flags too.  That'll take us from 23 to 21 bits (on 32-bit,
> > with PG_UNCACHED)
> 
> Some things to clarify here. I modified the current RFC to also allow
> PG_offline on allocated (ballooned) pages (e.g. virtio-balloon).
> 
> kdump based dump tools can then easily identify which pages are not to
> be dumped (either because the content is invalid or not accessible).
> 
> I previously stated that ballooned pages would be marked as PG_reserved,
> which is not true (at least not for virtio-balloon). However this allows
> me to detect if all pages in a section are offline by looking at
> (PG_reserved && PG_offline). So I can actually tell if a page is marked
> as offline and allocated or really offline.
> 
> 
> 1. The location (not the number!) of PG_hwpoison is basically ABI and
> cannot be changed. Moving it around will most probably break dump tools.
> (see kernel/crash_core.c)

It's not ABI.  It already changed after 4.9 when PG_waiters was introduced
by commit 62906027091f.

> 2. Exposing PG_offline via kdump will make it ABI as well. And we don't
> want any complicated validity checks ("is the bit valid or not?"),
> because that would imply having to make these bits ABI as well. So
> having PG_offline just like PG_hwpoison part of page_flags is the right
> thing to do. (see patch nr 4)
> 
> 3. For determining if all pages of a section are offline (see patch nr
> 5), I will have to be able to check 1. PG_offline and 2. PG_reserved on
> any page. Will this be possible by moving e.g. PG_reserved to page
> types? (especially if some field is suddenly aliased?)

It's possible to tell whether the field is in use as mapcount or
page_types; mapcount should always be non-negative, and page_types
reserves a few bits to detect under/overflow of mapcount.  The slab/slob
users of the field will also be positive uses.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH RFC 2/8] mm: introduce PG_offline
  2018-04-22 14:02           ` Matthew Wilcox
@ 2018-04-22 15:13             ` David Hildenbrand
  2018-04-29 21:08               ` Michal Hocko
  0 siblings, 1 reply; 23+ messages in thread
From: David Hildenbrand @ 2018-04-22 15:13 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: Vlastimil Babka, linux-mm, Steven Rostedt, Ingo Molnar,
	Andrew Morton, Michal Hocko, Huang Ying, Greg Kroah-Hartman,
	Pavel Tatashin, Miles Chen, Mel Gorman, Rik van Riel,
	James Hogan, Levin, Alexander (Sasha Levin),
	open list

On 22.04.2018 16:02, Matthew Wilcox wrote:
> On Sun, Apr 22, 2018 at 10:17:31AM +0200, David Hildenbrand wrote:
>> On 22.04.2018 05:01, Matthew Wilcox wrote:
>>> On Sat, Apr 21, 2018 at 06:52:18PM +0200, Vlastimil Babka wrote:
>>>> Sounds like your newly introduced "page types" could be useful here? I
>>>> don't suppose those offline pages would be using mapcount which is
>>>> aliased there?
>>>
>>> Oh, that's a good point!  Yes, this is a perfect use for page_type.
>>> We have something like twenty bits available there.
>>>
>>> Now you've got me thinking that we can move PG_hwpoison and PG_reserved
>>> to be page_type flags too.  That'll take us from 23 to 21 bits (on 32-bit,
>>> with PG_UNCACHED)
>>
>> Some things to clarify here. I modified the current RFC to also allow
>> PG_offline on allocated (ballooned) pages (e.g. virtio-balloon).
>>
>> kdump based dump tools can then easily identify which pages are not to
>> be dumped (either because the content is invalid or not accessible).
>>
>> I previously stated that ballooned pages would be marked as PG_reserved,
>> which is not true (at least not for virtio-balloon). However this allows
>> me to detect if all pages in a section are offline by looking at
>> (PG_reserved && PG_offline). So I can actually tell if a page is marked
>> as offline and allocated or really offline.
>>
>>
>> 1. The location (not the number!) of PG_hwpoison is basically ABI and
>> cannot be changed. Moving it around will most probably break dump tools.
>> (see kernel/crash_core.c)
> 
> It's not ABI.  It already changed after 4.9 when PG_waiters was introduced
> by commit 62906027091f.

It is, please have a look at the file I pointed you to.

We export the *value* of PG_hwpoison in the ELF file, therefore the
*value* can change, but the *location* (page_flags, mapcount, whatever)
must not change. Or am I missing something here? I don't think we can
move PG_hwpoison that easily.

Also, I can read "For pages that are never mapped to userspace,
page->mapcount may be used for storing extra information about page
type" - is that true for PG_hwpoison/PG_reserved? I am skeptical.

And we need something similar for PG_offline, because it will become
ABI. (I can see that PAGE_BUDDY_MAPCOUNT_VALUE is also exported in an
ELF file, so maybe a new page type might work for marking a page offline
- but I have to look at the details first tomorrow)

> 
>> 2. Exposing PG_offline via kdump will make it ABI as well. And we don't
>> want any complicated validity checks ("is the bit valid or not?"),
>> because that would imply having to make these bits ABI as well. So
>> having PG_offline just like PG_hwpoison part of page_flags is the right
>> thing to do. (see patch nr 4)
>>
>> 3. For determining if all pages of a section are offline (see patch nr
>> 5), I will have to be able to check 1. PG_offline and 2. PG_reserved on
>> any page. Will this be possible by moving e.g. PG_reserved to page
>> types? (especially if some field is suddenly aliased?)
> 
> It's possible to tell whether the field is in use as mapcount or
> page_types; mapcount should always be non-negative, and page_types
> reserves a few bits to detect under/overflow of mapcount.  The slab/slob
> users of the field will also be positive uses.
> 

Thanks for the info!

-- 

Thanks,

David / dhildenb

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH RFC 2/8] mm: introduce PG_offline
  2018-04-22 15:13             ` David Hildenbrand
@ 2018-04-29 21:08               ` Michal Hocko
  2018-04-30  6:31                 ` David Hildenbrand
  0 siblings, 1 reply; 23+ messages in thread
From: Michal Hocko @ 2018-04-29 21:08 UTC (permalink / raw)
  To: David Hildenbrand
  Cc: Matthew Wilcox, Vlastimil Babka, linux-mm, Steven Rostedt,
	Ingo Molnar, Andrew Morton, Huang Ying, Greg Kroah-Hartman,
	Pavel Tatashin, Miles Chen, Mel Gorman, Rik van Riel,
	James Hogan, Levin, Alexander (Sasha Levin),
	open list

On Sun 22-04-18 17:13:52, David Hildenbrand wrote:
> On 22.04.2018 16:02, Matthew Wilcox wrote:
> > On Sun, Apr 22, 2018 at 10:17:31AM +0200, David Hildenbrand wrote:
> >> On 22.04.2018 05:01, Matthew Wilcox wrote:
> >>> On Sat, Apr 21, 2018 at 06:52:18PM +0200, Vlastimil Babka wrote:
> >>>> Sounds like your newly introduced "page types" could be useful here? I
> >>>> don't suppose those offline pages would be using mapcount which is
> >>>> aliased there?
> >>>
> >>> Oh, that's a good point!  Yes, this is a perfect use for page_type.
> >>> We have something like twenty bits available there.
> >>>
> >>> Now you've got me thinking that we can move PG_hwpoison and PG_reserved
> >>> to be page_type flags too.  That'll take us from 23 to 21 bits (on 32-bit,
> >>> with PG_UNCACHED)
> >>
> >> Some things to clarify here. I modified the current RFC to also allow
> >> PG_offline on allocated (ballooned) pages (e.g. virtio-balloon).
> >>
> >> kdump based dump tools can then easily identify which pages are not to
> >> be dumped (either because the content is invalid or not accessible).
> >>
> >> I previously stated that ballooned pages would be marked as PG_reserved,
> >> which is not true (at least not for virtio-balloon). However this allows
> >> me to detect if all pages in a section are offline by looking at
> >> (PG_reserved && PG_offline). So I can actually tell if a page is marked
> >> as offline and allocated or really offline.
> >>
> >>
> >> 1. The location (not the number!) of PG_hwpoison is basically ABI and
> >> cannot be changed. Moving it around will most probably break dump tools.
> >> (see kernel/crash_core.c)
> > 
> > It's not ABI.  It already changed after 4.9 when PG_waiters was introduced
> > by commit 62906027091f.
> 
> It is, please have a look at the file I pointed you to.
> 
> We export the *value* of PG_hwpoison in the ELF file, therefore the
> *value* can change, but the *location* (page_flags, mapcount, whatever)
> must not change. Or am I missing something here? I don't think we can
> move PG_hwpoison that easily.
> 
> Also, I can read "For pages that are never mapped to userspace,
> page->mapcount may be used for storing extra information about page
> type" - is that true for PG_hwpoison/PG_reserved? I am skeptical.
> 
> And we need something similar for PG_offline, because it will become
> ABI. (I can see that PAGE_BUDDY_MAPCOUNT_VALUE is also exported in an
> ELF file, so maybe a new page type might work for marking a page offline
> - but I have to look at the details first tomorrow)

Wait wait wait. Who is relying on this? Kdump? Page flags have always
been an internal implementation detail and _nobody_ outside of the
kernel should ever rely on the specific value. Well, kdump has been
cheating but that is because kdump is inherently tight to a specific
kernel implementation but that doesn't make it a stable ABI IMHO.
Restricting the kernel internals because of a debugging tool would be
quite insane.
-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH RFC 2/8] mm: introduce PG_offline
  2018-04-29 21:08               ` Michal Hocko
@ 2018-04-30  6:31                 ` David Hildenbrand
  0 siblings, 0 replies; 23+ messages in thread
From: David Hildenbrand @ 2018-04-30  6:31 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Matthew Wilcox, Vlastimil Babka, linux-mm, Steven Rostedt,
	Ingo Molnar, Andrew Morton, Huang Ying, Greg Kroah-Hartman,
	Pavel Tatashin, Miles Chen, Mel Gorman, Rik van Riel,
	James Hogan, Levin, Alexander (Sasha Levin),
	open list

On 29.04.2018 23:08, Michal Hocko wrote:
> On Sun 22-04-18 17:13:52, David Hildenbrand wrote:
>> On 22.04.2018 16:02, Matthew Wilcox wrote:
>>> On Sun, Apr 22, 2018 at 10:17:31AM +0200, David Hildenbrand wrote:
>>>> On 22.04.2018 05:01, Matthew Wilcox wrote:
>>>>> On Sat, Apr 21, 2018 at 06:52:18PM +0200, Vlastimil Babka wrote:
>>>>>> Sounds like your newly introduced "page types" could be useful here? I
>>>>>> don't suppose those offline pages would be using mapcount which is
>>>>>> aliased there?
>>>>>
>>>>> Oh, that's a good point!  Yes, this is a perfect use for page_type.
>>>>> We have something like twenty bits available there.
>>>>>
>>>>> Now you've got me thinking that we can move PG_hwpoison and PG_reserved
>>>>> to be page_type flags too.  That'll take us from 23 to 21 bits (on 32-bit,
>>>>> with PG_UNCACHED)
>>>>
>>>> Some things to clarify here. I modified the current RFC to also allow
>>>> PG_offline on allocated (ballooned) pages (e.g. virtio-balloon).
>>>>
>>>> kdump based dump tools can then easily identify which pages are not to
>>>> be dumped (either because the content is invalid or not accessible).
>>>>
>>>> I previously stated that ballooned pages would be marked as PG_reserved,
>>>> which is not true (at least not for virtio-balloon). However this allows
>>>> me to detect if all pages in a section are offline by looking at
>>>> (PG_reserved && PG_offline). So I can actually tell if a page is marked
>>>> as offline and allocated or really offline.
>>>>
>>>>
>>>> 1. The location (not the number!) of PG_hwpoison is basically ABI and
>>>> cannot be changed. Moving it around will most probably break dump tools.
>>>> (see kernel/crash_core.c)
>>>
>>> It's not ABI.  It already changed after 4.9 when PG_waiters was introduced
>>> by commit 62906027091f.
>>
>> It is, please have a look at the file I pointed you to.
>>
>> We export the *value* of PG_hwpoison in the ELF file, therefore the
>> *value* can change, but the *location* (page_flags, mapcount, whatever)
>> must not change. Or am I missing something here? I don't think we can
>> move PG_hwpoison that easily.
>>
>> Also, I can read "For pages that are never mapped to userspace,
>> page->mapcount may be used for storing extra information about page
>> type" - is that true for PG_hwpoison/PG_reserved? I am skeptical.
>>
>> And we need something similar for PG_offline, because it will become
>> ABI. (I can see that PAGE_BUDDY_MAPCOUNT_VALUE is also exported in an
>> ELF file, so maybe a new page type might work for marking a page offline
>> - but I have to look at the details first tomorrow)
> 
> Wait wait wait. Who is relying on this? Kdump? Page flags have always
> been an internal implementation detail and _nobody_ outside of the
> kernel should ever rely on the specific value. Well, kdump has been
> cheating but that is because kdump is inherently tight to a specific
> kernel implementation but that doesn't make it a stable ABI IMHO.
> Restricting the kernel internals because of a debugging tool would be
> quite insane.
> 

kdump tools (makedumptool) don't rely on any specific value or assume
anything.

Using the example of musing PG_hwpoison to mapcount:

If it sees PG_hwpoison:
- it knows the right bit number to use
- it knows the kernel uses it

If it doesn't see PG_hwpoison (in the ELF info) anymore:
- it cannot exclude poisoned pages anymore, potentially crashing the
system during a dump


If you have a better fitting name for "requires a interlocked update
with tools to keep it working" than ABI, please let me know :)

Anyhow, I have a new prototype based on PAGE_OFFLINE_MAPCOUNT_VALUE that
I will share briefly.

-- 

Thanks,

David / dhildenb

^ permalink raw reply	[flat|nested] 23+ messages in thread

end of thread, other threads:[~2018-04-30  6:31 UTC | newest]

Thread overview: 23+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <20180413131632.1413-1-david@redhat.com>
2018-04-13 13:16 ` [PATCH RFC 1/8] mm/memory_hotplug: Revert "mm/memory_hotplug: optimize memory hotplug" David Hildenbrand
2018-04-13 13:16 ` [PATCH RFC 2/8] mm: introduce PG_offline David Hildenbrand
2018-04-13 13:40   ` Michal Hocko
2018-04-13 13:46     ` David Hildenbrand
2018-04-17 11:50     ` David Hildenbrand
2018-04-13 17:11   ` Matthew Wilcox
2018-04-16  8:31     ` David Hildenbrand
2018-04-21 16:52     ` Vlastimil Babka
2018-04-22  3:01       ` Matthew Wilcox
2018-04-22  8:17         ` David Hildenbrand
2018-04-22 14:02           ` Matthew Wilcox
2018-04-22 15:13             ` David Hildenbrand
2018-04-29 21:08               ` Michal Hocko
2018-04-30  6:31                 ` David Hildenbrand
2018-04-20  7:30   ` David Hildenbrand
2018-04-13 13:16 ` [PATCH RFC 3/8] mm: use PG_offline in online/offlining code David Hildenbrand
2018-04-13 13:31 ` [PATCH RFC 4/8] kdump: expose PG_offline David Hildenbrand
2018-04-13 13:33 ` [PATCH RFC 5/8] mm: only mark section offline when all pages are offline David Hildenbrand
2018-04-13 13:33 ` [PATCH RFC 6/8] mm: offline_pages() is also limited by MAX_ORDER David Hildenbrand
2018-04-13 13:33 ` [PATCH RFC 7/8] mm: allow to control onlining/offlining of memory by a driver David Hildenbrand
2018-04-13 15:59   ` Michal Hocko
2018-04-13 16:32     ` David Hildenbrand
2018-04-13 13:33 ` [PATCH RFC 8/8] mm: export more functions used to online/offline memory David Hildenbrand

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).