linuxppc-dev.lists.ozlabs.org archive mirror
 help / color / mirror / Atom feed
* [PATCH RFC] mm: Fix memory corruption caused by deferred page initialization
@ 2016-03-25 16:05 Gavin Shan
  2016-03-26  9:47 ` [RFC] " Michael Ellerman
  0 siblings, 1 reply; 9+ messages in thread
From: Gavin Shan @ 2016-03-25 16:05 UTC (permalink / raw)
  To: linux-mm; +Cc: linuxppc-dev, zhlcindy, mpe, mgorman, Gavin Shan

During deferred page initialization, the pages are moved from memblock
or bootmem to buddy allocator without checking they were reserved. Those
reserved pages can be reallocated to somebody else by buddy/slab allocator.
It leads to memory corruption and potential kernel crash eventually.

This fixes above issue by:

   * Deferred releasing bootmem bitmap until the completion of deferred
     page initialization.
   * Implements __reserved_bootmem_region() to check if the specified
     page is reserved by memblock or bootmem during the deferred
     page initialization. The pages won't be released to buddy allocator
     if they are reserved.
   * In free_all_bootmem_core(), @cur is set to node's starting PFN and
     that's incorrect. It's fixed as well.

With this applied, the IBM's Power8 box boots up without reserved issues
with all possible combinations of NO_BOOTMEM and DEFERRED_STRUCT_PAGE_INIT.

Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
---
 include/linux/bootmem.h |  2 ++
 mm/bootmem.c            | 45 +++++++++++++++++++++++++++++++++-----
 mm/nobootmem.c          |  6 +++++
 mm/page_alloc.c         | 58 ++++++++++++++++++++++++++++++++++++++++++++++++-
 4 files changed, 104 insertions(+), 7 deletions(-)

diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
index 35b22f9..a64f378 100644
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -78,6 +78,8 @@ extern int reserve_bootmem_node(pg_data_t *pgdat,
 				unsigned long size,
 				int flags);
 
+extern bool __reserved_bootmem_region(unsigned long base,
+				      unsigned long size);
 extern void *__alloc_bootmem(unsigned long size,
 			     unsigned long align,
 			     unsigned long goal);
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 0aa7dda..eaf13b0 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -172,7 +172,7 @@ void __init free_bootmem_late(unsigned long physaddr, unsigned long size)
 static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
 {
 	struct page *page;
-	unsigned long *map, start, end, pages, cur, count = 0;
+	unsigned long *map, start, end, cur, count = 0;
 
 	if (!bdata->node_bootmem_map)
 		return 0;
@@ -229,14 +229,21 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
 		}
 	}
 
-	cur = bdata->node_min_pfn;
+	/*
+	 * The information tracked by bootmem bitmap can be released when
+	 * deferred page initialization is disabled. Otherwise, we have
+	 * to release it right after deferred page initialization
+	 */
+#ifndef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+	cur = PFN_DOWN(virt_to_phys(bdata->node_bootmem_map));
 	page = virt_to_page(bdata->node_bootmem_map);
-	pages = bdata->node_low_pfn - bdata->node_min_pfn;
-	pages = bootmem_bootmap_pages(pages);
-	count += pages;
-	while (pages--)
+	end = bdata->node_low_pfn - bdata->node_min_pfn;
+	end = bootmem_bootmap_pages(end);
+	count += end;
+	while (end--)
 		__free_pages_bootmem(page++, cur++, 0);
 	bdata->node_bootmem_map = NULL;
+#endif
 
 	bdebug("nid=%td released=%lx\n", bdata - bootmem_node_data, count);
 
@@ -497,6 +504,32 @@ static unsigned long __init align_off(struct bootmem_data *bdata,
 	return ALIGN(base + off, align) - base;
 }
 
+bool __init __reserved_bootmem_region(unsigned long base, unsigned long size)
+{
+	struct pglist_data *pgdat;
+	struct bootmem_data *bdata;
+	unsigned long pfn, start, end, idx;
+	int nid;
+
+	start = PFN_DOWN(base);
+	end = PFN_UP(base + size);
+	for (pfn = start; pfn < end; pfn++) {
+		nid = early_pfn_to_nid(pfn);
+		pgdat = NODE_DATA(nid);
+		bdata = pgdat ? pgdat->bdata : NULL;
+		if (!bdata ||
+		    pfn < bdata->node_min_pfn ||
+		    pfn > bdata->node_low_pfn)
+			continue;
+
+		idx = pfn - bdata->node_min_pfn;
+		if (test_bit(idx, bdata->node_bootmem_map))
+			return true;
+	}
+
+	return false;
+}
+
 static void * __init alloc_bootmem_bdata(struct bootmem_data *bdata,
 					unsigned long size, unsigned long align,
 					unsigned long goal, unsigned long limit)
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index bd05a70..70bca8d2 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -33,6 +33,12 @@ unsigned long min_low_pfn;
 unsigned long max_pfn;
 unsigned long long max_possible_pfn;
 
+bool __init __reserved_bootmem_region(unsigned long base,
+				      unsigned long size)
+{
+	return memblock_is_region_reserved(base, size);
+}
+
 static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
 					u64 goal, u64 limit)
 {
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a762be5..9ca9546 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1227,6 +1227,57 @@ static void __init deferred_free_range(struct page *page,
 		__free_pages_boot_core(page, pfn, 0);
 }
 
+#ifndef CONFIG_NO_BOOTMEM
+static unsigned long __init deferred_free_bootmem_bitmap(int nid)
+{
+	struct pglist_data *pgdat = NODE_DATA(nid);
+	struct bootmem_data *bdata = pgdat->bdata;
+	struct zone *zone;
+	struct page *page;
+	unsigned long pfn, cur, pages, count;
+	int zid;
+
+	if (!bdata || !bdata->node_bootmem_map)
+		return 0UL;
+
+	pfn = PFN_DOWN(virt_to_phys(bdata->node_bootmem_map));
+	page = virt_to_page(bdata->node_bootmem_map);
+	bdata->node_bootmem_map = NULL;
+	pages = bdata->node_low_pfn - bdata->node_min_pfn;
+	pages = bootmem_bootmap_pages(pages);
+
+	/*
+	 * We won't lose much performance to release pages one by one
+	 * as the amount of reserved memory for bootmem bitmap is usually
+	 * very small
+	 */
+	for (count = 0UL, cur = 0UL; cur < pages; cur++) {
+		for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+			zone = &pgdat->node_zones[zid];
+			if (!zone->spanned_pages)
+				continue;
+
+			if (pfn >= zone->zone_start_pfn &&
+			    pfn < zone->zone_start_pfn + zone->spanned_pages)
+				break;
+		}
+
+		if (zid < MAX_NR_ZONES) {
+			pr_info("%s: nid#%d, %s, 0x%lx\n",
+				__func__, nid, zone_names[zid], pfn);
+			__init_single_page(page, pfn, zid, nid);
+			__free_pages_boot_core(page, pfn, 0);
+			count++;
+		}
+
+		page++;
+		pfn++;
+	}
+
+	return count;
+}
+#endif /* !CONFIG_NO_BOOTMEM */
+
 /* Completion tracking for deferred_init_memmap() threads */
 static atomic_t pgdat_init_n_undone __initdata;
 static __initdata DECLARE_COMPLETION(pgdat_init_all_done_comp);
@@ -1301,7 +1352,9 @@ static int __init deferred_init_memmap(void *data)
 				}
 			}
 
-			if (!meminit_pfn_in_nid(pfn, nid, &nid_init_state)) {
+			if (!meminit_pfn_in_nid(pfn, nid, &nid_init_state) ||
+			    __reserved_bootmem_region(PFN_PHYS(pfn),
+						      PAGE_SIZE)) {
 				page = NULL;
 				goto free_range;
 			}
@@ -1350,6 +1403,9 @@ free_range:
 	/* Sanity check that the next zone really is unpopulated */
 	WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone));
 
+#ifndef CONFIG_NO_BOOTMEM
+	nr_pages += deferred_free_bootmem_bitmap(nid);
+#endif
 	pr_info("node %d initialised, %lu pages in %ums\n", nid, nr_pages,
 					jiffies_to_msecs(jiffies - start));
 
-- 
2.1.0

^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [RFC] mm: Fix memory corruption caused by deferred page initialization
  2016-03-25 16:05 [PATCH RFC] mm: Fix memory corruption caused by deferred page initialization Gavin Shan
@ 2016-03-26  9:47 ` Michael Ellerman
  2016-03-26 13:37   ` Gavin Shan
  0 siblings, 1 reply; 9+ messages in thread
From: Michael Ellerman @ 2016-03-26  9:47 UTC (permalink / raw)
  To: Gavin Shan, linux-mm; +Cc: linuxppc-dev, mgorman, zhlcindy, Gavin Shan

Hi Gavin,

On Fri, 2016-25-03 at 16:05:29 UTC, Gavin Shan wrote:
> During deferred page initialization, the pages are moved from memblock
> or bootmem to buddy allocator without checking they were reserved. Those
> reserved pages can be reallocated to somebody else by buddy/slab allocator.
> It leads to memory corruption and potential kernel crash eventually.

Can you give me a bit more detail on what the bug is?

I haven't seen any issues on my systems, but I realise now I haven't enabled
DEFERRED_STRUCT_PAGE_INIT - I assumed it was enabled by default.

How did this get tested before submission?

> This fixes above issue by:
> 
>    * Deferred releasing bootmem bitmap until the completion of deferred
>      page initialization.

As I said in my other mail, we don't support bootmem anymore. So please resend
with just the non-bootmem fixes.

cheers

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [RFC] mm: Fix memory corruption caused by deferred page initialization
  2016-03-26  9:47 ` [RFC] " Michael Ellerman
@ 2016-03-26 13:37   ` Gavin Shan
  2016-03-27 13:48     ` Gavin Shan
                       ` (2 more replies)
  0 siblings, 3 replies; 9+ messages in thread
From: Gavin Shan @ 2016-03-26 13:37 UTC (permalink / raw)
  To: Michael Ellerman; +Cc: Gavin Shan, linux-mm, linuxppc-dev, mgorman, zhlcindy

On Sat, Mar 26, 2016 at 08:47:17PM +1100, Michael Ellerman wrote:
>Hi Gavin,
>
>On Fri, 2016-25-03 at 16:05:29 UTC, Gavin Shan wrote:
>> During deferred page initialization, the pages are moved from memblock
>> or bootmem to buddy allocator without checking they were reserved. Those
>> reserved pages can be reallocated to somebody else by buddy/slab allocator.
>> It leads to memory corruption and potential kernel crash eventually.
>
>Can you give me a bit more detail on what the bug is?
>
>I haven't seen any issues on my systems, but I realise now I haven't enabled
>DEFERRED_STRUCT_PAGE_INIT - I assumed it was enabled by default.
>
>How did this get tested before submission?
>

Michael, I have to reply with same context in another thread in case 
somebody else wants to understand more: Li, who is in the cc list, is
backporting deferred page initialization (CONFIG_DEFERRED_STRUCT_PAGE_INIT)
from upstream kernel to RHEL 7.2 or 7.3 kernel (3.10.0-357.el7). RHEL kernel
has (!CONFIG_NO_BOOTMEM && CONFIG_DEFERRED_STRUCT_PAGE_INIT), meaning
bootmem is enabled. She eventually runs into kernel crash and I jumped
in to help understanding the root cause.

There are two related kernel config options: ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT
and DEFERRED_STRUCT_PAGE_INIT. The former is enabled on PPC by default.
The latter isn't enabled by default.

There are two test cases I had:

- With (!CONFIG_NO_BOOTMEM && CONFIG_DEFERRED_STRUCT_PAGE_INIT)
on PowerNV platform, upstream kernel (4.5.rc7) and additional patch to support
bootmem as it was removed on powerpc a while ago.

- With (CONFIG_NO_BOOTMEM && CONFIG_DEFERRED_STRUCT_PAGE_INIT) on PowerNV platform,
upstream kernel (4.5.rc7), I dumped the reserved memblock regions and added printk
in function deferred_init_memmap() to check if memblock reserved PFN 0x1fff80 (one
page in memblock reserved region#31, refer to the below kernel log) is released
to buddy allocator or not when doing deferred page struct initialization. I did
see that PFN is released to buddy allocator at that time. However, I didn't see
kernel crash and it would be luck and the current deferred page struct initialization
implementation: The pages in region [0, 2GB] except the memblock reserved ones are
presented to buddy allocator at early stage. It's not deferred. So for the pages in
[0, 2GB], we don't have consistency issue between memblock and buddy allocator.
The pages in region [2GB ...] are all presented to buddy allocator despite they're
reserved in memblock or not. It ensures the kernel text section isn't corrupted
and we're lucky not seeing program interrupt because of illegal instruction.

Below is the kernel log I got from the printk:

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
 index a762be5..7039bc5 100644
 --- a/mm/page_alloc.c
 +++ b/mm/page_alloc.c
 @@ -1307,6 +1307,9 @@ static int __init deferred_init_memmap(void *data)
                         }
  
                         /* Minimise pfn page lookups and scheduler checks */
 +                       if (pfn == 0x1fff80)
 +                               pr_info("===> %s: Free PFN 0x%lx\n", __func__, pfn);


[    0.000000] Linux version 4.5.0-11790-g4fab991-dirty (gwshan@gwshan) (gcc version 4.9.3 (Buildroot 2016.02-rc2-00093-g5ea3bce) ) #423 SMP Sat Mar 26 23:24:15 AEDT 2016
	:
[    0.000000] Zone ranges:
[    0.000000]   DMA      [mem 0x0000000000000000-0x0000001fffffffff]
[    0.000000]   DMA32    empty
[    0.000000]   Normal   empty
[    0.000000] Movable zone start for each node
[    0.000000] Early memory node ranges
[    0.000000]   node   0: [mem 0x0000000000000000-0x0000000fffffffff]
[    0.000000]   node   8: [mem 0x0000001000000000-0x0000001fffffffff]
        :
[    0.492855] Brought up 160 CPUs
[    0.493047] Node 0 CPUs: 0-79
[    0.493098] Node 8 CPUs: 80-159
[    0.525746] ===> deferred_init_memmap: Free PFN 0x1fff80	<<<< In memblock reserved region#31
[    0.525764] node 0 initialised, 1014458 pages in 30ms
[    0.526005] node 8 initialised, 1012540 pages in 30ms
        :
[    8.973599] Dumping memblock [memory]
[    8.973630]    [0000] [0000000000000000 - 0x0000001000000000, 0000001000000000] [0000000000000000] [0]
[    8.973698]    [0001] [0000001000000000 - 0x0000002000000000, 0000001000000000] [0000000000000000] [8]
[    8.973766] Dumping memblock [reserved]
[    8.973797]    [0000] [0000000000000000 - 0x0000000001540000, 0000000001540000] [0000000000000000] [256]
[    8.973866]    [0001] [000000000fc40000 - 0x000000000fcb0000, 0000000000070000] [0000000000000000] [256]
[    8.973935]    [0002] [000000000fe80000 - 0x000000000fea0000, 0000000000020000] [0000000000000000] [256]
[    8.974004]    [0003] [0000000030000000 - 0x0000000032de0000, 0000000002de0000] [0000000000000000] [256]
[    8.974073]    [0004] [0000000039c00000 - 0x000000003a780200, 0000000000b80200] [0000000000000000] [256]
[    8.974144]    [0005] [000000003fb00000 - 0x0000000040000000, 0000000000500000] [0000000000000000] [256]
[    8.974213]    [0006] [000000007ffe0000 - 0x000000007fffddff, 000000000001ddff] [0000000000000000] [256]
[    8.974281]    [0007] [0000000ff9c00000 - 0x0000000fff000000, 0000000005400000] [0000000000000000] [256]
[    8.974351]    [0008] [0000000ffffa8000 - 0x0000000ffffd0000, 0000000000028000] [0000000000000000] [256]
[    8.974419]    [0009] [0000000ffffde300 - 0x0000001000c40200, 0000000000c61f00] [0000000000000000] [256]
[    8.974489]    [0010] [0000001ff0000000 - 0x0000001ff8000000, 0000000008000000] [0000000000000000] [256]
[    8.974556]    [0011] [0000001ff9000000 - 0x0000001ffb000000, 0000000002000000] [0000000000000000] [256]
[    8.974624]    [0012] [0000001ffd260000 - 0x0000001fff000000, 0000000001da0000] [0000000000000000] [256]
[    8.974692]    [0013] [0000001fff15b780 - 0x0000001fff15b7f0, 0000000000000070] [0000000000000000] [256]
[    8.974760]    [0014] [0000001fff15b800 - 0x0000001fff15b910, 0000000000000110] [0000000000000000] [256]
[    8.974828]    [0015] [0000001fff15b980 - 0x0000001fff15c108, 0000000000000788] [0000000000000000] [256]
[    8.974904]    [0016] [0000001fff15c180 - 0x0000001fff15c188, 0000000000000008] [0000000000000000] [256]
[    8.974974]    [0017] [0000001fff174200 - 0x0000001fff17c223, 0000000000008023] [0000000000000000] [256]
[    8.975042]    [0018] [0000001fff17c280 - 0x0000001fff17c2a3, 0000000000000023] [0000000000000000] [256]
[    8.975110]    [0019] [0000001fff17c300 - 0x0000001fff1a5ba0, 00000000000298a0] [0000000000000000] [256]
[    8.975178]    [0020] [0000001fff1a5c00 - 0x0000001fff1b8148, 0000000000012548] [0000000000000000] [256]
[    8.975247]    [0021] [0000001fff1b8180 - 0x0000001fff1c86a0, 0000000000010520] [0000000000000000] [256]
[    8.975315]    [0022] [0000001fff1c8700 - 0x0000001fff1dac48, 0000000000012548] [0000000000000000] [256]
[    8.975385]    [0023] [0000001fff1dac80 - 0x0000001fff1eb0a0, 0000000000010420] [0000000000000000] [256]
[    8.975454]    [0024] [0000001fff1eb100 - 0x0000001fff1fd3c8, 00000000000122c8] [0000000000000000] [256]
[    8.975522]    [0025] [0000001fff1fd400 - 0x0000001fff20d820, 0000000000010420] [0000000000000000] [256]
[    8.975592]    [0026] [0000001fff20d880 - 0x0000001fff21fb48, 00000000000122c8] [0000000000000000] [256]
[    8.975660]    [0027] [0000001fff21fb80 - 0x0000001fff22ffa0, 0000000000010420] [0000000000000000] [256]
[    8.975727]    [0028] [0000001fff230000 - 0x0000001fff2422c8, 00000000000122c8] [0000000000000000] [256]
[    8.975795]    [0029] [0000001fff242300 - 0x0000001fff764b23, 0000000000522823] [0000000000000000] [256]
[    8.975864]    [0030] [0000001fff764b48 - 0x0000001fff7ffffc, 000000000009b4b4] [0000000000000000] [256]
[    8.975932]    [0031] [0000001fff800000 - 0x0000002000000000, 0000000000800000] [0000000000000000] [256]

>> This fixes above issue by:
>> 
>>    * Deferred releasing bootmem bitmap until the completion of deferred
>>      page initialization.
>
>As I said in my other mail, we don't support bootmem anymore. So please resend
>with just the non-bootmem fixes.
>

I think this patch is generic one. I guess bootmem might be supported on other
platforms other than PPC? If that's the case, it would be fine to have the code
fixing the bootmem bitmap if you agree. If you want me to split the patch into
two for bootmem and memblock cases separately, I can do it absolutely. Please
let me know your preference :-)

Thanks,
Gavin

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [RFC] mm: Fix memory corruption caused by deferred page initialization
  2016-03-26 13:37   ` Gavin Shan
@ 2016-03-27 13:48     ` Gavin Shan
  2016-03-31  2:27       ` Gavin Shan
  2016-03-28 14:20     ` Aneesh Kumar K.V
  2016-03-29  5:13     ` Li Zhang
  2 siblings, 1 reply; 9+ messages in thread
From: Gavin Shan @ 2016-03-27 13:48 UTC (permalink / raw)
  To: Gavin Shan; +Cc: Michael Ellerman, linux-mm, linuxppc-dev, mgorman, zhlcindy

On Sun, Mar 27, 2016 at 12:37:09AM +1100, Gavin Shan wrote:
>On Sat, Mar 26, 2016 at 08:47:17PM +1100, Michael Ellerman wrote:
>>Hi Gavin,
>>
>>On Fri, 2016-25-03 at 16:05:29 UTC, Gavin Shan wrote:
>>> During deferred page initialization, the pages are moved from memblock
>>> or bootmem to buddy allocator without checking they were reserved. Those
>>> reserved pages can be reallocated to somebody else by buddy/slab allocator.
>>> It leads to memory corruption and potential kernel crash eventually.
>>
>>Can you give me a bit more detail on what the bug is?
>>
>>I haven't seen any issues on my systems, but I realise now I haven't enabled
>>DEFERRED_STRUCT_PAGE_INIT - I assumed it was enabled by default.
>>
>>How did this get tested before submission?
>>
>
>Michael, I have to reply with same context in another thread in case 
>somebody else wants to understand more: Li, who is in the cc list, is
>backporting deferred page initialization (CONFIG_DEFERRED_STRUCT_PAGE_INIT)
>from upstream kernel to RHEL 7.2 or 7.3 kernel (3.10.0-357.el7). RHEL kernel
>has (!CONFIG_NO_BOOTMEM && CONFIG_DEFERRED_STRUCT_PAGE_INIT), meaning
>bootmem is enabled. She eventually runs into kernel crash and I jumped
>in to help understanding the root cause.
>
>There're two related kernel config options: ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT
>and DEFERRED_STRUCT_PAGE_INIT. The former one is enabled on PPC by default.
>The later one isn't enabled by default.
>
>There are two test cases I had:
>
>- With (!CONFIG_NO_BOOTMEM && CONFIG_DEFERRED_STRUCT_PAGE_INIT)
>on PowerNV platform, upstream kernel (4.5.rc7) and additional patch to support
>bootmem as it was removed on powerpc a while ago.
>
>- With (CONFIG_NO_BOOTMEM && CONFIG_DEFERRED_STRUCT_PAGE_INIT) on PowerNV platform,
>upstream kernel (4.5.rc7), I dumped the reserved memblock regions and added printk
>in function deferred_init_memmap() to check if memblock reserved PFN 0x1fff80 (one
>page in memblock reserved region#31, refer to the below kernel log) is released
>to buddy allocator or not when doing deferred page struct initialization. I did
>see that PFN is released to buddy allocator at that time. However, I didn't see
>kernel crash and it would be luck and the current deferred page struct initialization
>implementation: The pages in region [0, 2GB] except the memblock reserved ones are
>presented to buddy allocator at early stage. It's not deferred. So for the pages in
>[0, 2GB], we don't have consistency issue between memblock and buddy allocator.
>The pages in region [2GB ...] are all presented to buddy allocator despite they're
>reserved in memblock or not. It ensures the kernel text section isn't corrupted
>and we're lucky not seeing program interrupt because of illegal instruction.
>

After more debugging, it turns out that Michael is correct: we don't have a problem
when CONFIG_NO_BOOTMEM=y. In that case, the memblock-reserved page frames in [2G ...]
are marked as reserved at an early stage (as the function calls below reveal). During
the deferred initialization stage, those reserved pages won't be released to the buddy
allocator:

- Below function calls mark reserved pages according to memblock reserved regions:
  init/main.c::start_kernel()
  init/main.c::mm_init()
  arch/powerpc/mm/mem.c::mem_init()
  nobootmem.c::free_all_bootmem()            <-> bootmem.c::free_all_bootmem() on !CONFIG_NO_BOOTMEM
  nobootmem.c::free_low_memory_core_early()
  nobootmem.c::reserve_bootmem_region()

- In page_alloc.c::deferred_init_memmap(), the reserved pages aren't released
  to buddy allocator with below check:

                        if (page->flags) {
                                VM_BUG_ON(page_zone(page) != zone);
                                goto free_range;
                        }


So the issue only exists when CONFIG_NO_BOOTMEM=n. An alternative fix would be
similar to what we already have with CONFIG_NO_BOOTMEM=y: at an early stage, initialize
all page structs for bootmem-reserved pages and mark them with PG_reserved. I'm
not sure it's worth fixing, since we won't support bootmem, as Michael mentioned.

Thanks,
Gavin

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [RFC] mm: Fix memory corruption caused by deferred page initialization
  2016-03-26 13:37   ` Gavin Shan
  2016-03-27 13:48     ` Gavin Shan
@ 2016-03-28 14:20     ` Aneesh Kumar K.V
  2016-03-29  5:13     ` Li Zhang
  2 siblings, 0 replies; 9+ messages in thread
From: Aneesh Kumar K.V @ 2016-03-28 14:20 UTC (permalink / raw)
  To: Gavin Shan, Michael Ellerman
  Cc: Gavin Shan, linux-mm, linuxppc-dev, mgorman, zhlcindy

Gavin Shan <gwshan@linux.vnet.ibm.com> writes:

> [ text/plain ]
> On Sat, Mar 26, 2016 at 08:47:17PM +1100, Michael Ellerman wrote:
>>Hi Gavin,
>>
>>On Fri, 2016-25-03 at 16:05:29 UTC, Gavin Shan wrote:
>>> During deferred page initialization, the pages are moved from memblock
>>> or bootmem to buddy allocator without checking they were reserved. Those
>>> reserved pages can be reallocated to somebody else by buddy/slab allocator.
>>> It leads to memory corruption and potential kernel crash eventually.
>>
>>Can you give me a bit more detail on what the bug is?
>>
>>I haven't seen any issues on my systems, but I realise now I haven't enabled
>>DEFERRED_STRUCT_PAGE_INIT - I assumed it was enabled by default.
>>
>>How did this get tested before submission?
>
.....

> I think this patch is generic one. I guess bootmem might be supported on other
> platforms other than PPC? If that's the case, it would be fine to have the code
> fixing the bootmem bitmap if you agree. If you want me to split the patch into
> two for bootmem and memblock cases separately, I can do it absolutely. Please
> let me know your preference :-)
>

IMHO it would be simpler if you split this into two patches. Also,
avoid doing variable renames in the patch.

-aneesh

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [RFC] mm: Fix memory corruption caused by deferred page initialization
  2016-03-26 13:37   ` Gavin Shan
  2016-03-27 13:48     ` Gavin Shan
  2016-03-28 14:20     ` Aneesh Kumar K.V
@ 2016-03-29  5:13     ` Li Zhang
  2 siblings, 0 replies; 9+ messages in thread
From: Li Zhang @ 2016-03-29  5:13 UTC (permalink / raw)
  To: Gavin Shan; +Cc: Michael Ellerman, linux-mm, linuxppc-dev, Li Zhang, mgorman

The problem was found while backporting parallel (deferred) page initialisation to
RHEL 7.2, where CONFIG_NO_BOOTMEM=n. If an older kernel is to work well with
parallel page initialisation, it still needs a bootmem fix like the one in this patch.

There are some other potential bugs with parallel page initialisation in RHEL 7.2.
I will continue to look into them.

Thanks.

On Sat, Mar 26, 2016 at 9:37 PM, Gavin Shan <gwshan@linux.vnet.ibm.com> wrote:
> On Sat, Mar 26, 2016 at 08:47:17PM +1100, Michael Ellerman wrote:
>>Hi Gavin,
>>
>>On Fri, 2016-25-03 at 16:05:29 UTC, Gavin Shan wrote:
>>> During deferred page initialization, the pages are moved from memblock
>>> or bootmem to buddy allocator without checking they were reserved. Those
>>> reserved pages can be reallocated to somebody else by buddy/slab allocator.
>>> It leads to memory corruption and potential kernel crash eventually.
>>
>>Can you give me a bit more detail on what the bug is?
>>
>>I haven't seen any issues on my systems, but I realise now I haven't enabled
>>DEFERRED_STRUCT_PAGE_INIT - I assumed it was enabled by default.
>>
>>How did this get tested before submission?
>>
>
> Michael, I have to reply with same context in another thread in case
> somebody else wants to understand more: Li, who is in the cc list, is
> backporting deferred page initialization (CONFIG_DEFERRED_STRUCT_PAGE_INIT)
> from upstream kernel to RHEL 7.2 or 7.3 kernel (3.10.0-357.el7). RHEL kernel
> has (!CONFIG_NO_BOOTMEM && CONFIG_DEFERRED_STRUCT_PAGE_INIT), meaning
> bootmem is enabled. She eventually runs into kernel crash and I jumped
> in to help understanding the root cause.
>
> There're two related kernel config options: ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT
> and DEFERRED_STRUCT_PAGE_INIT. The former one is enabled on PPC by default.
> The later one isn't enabled by default.
>
> There are two test cases I had:
>
> - With (!CONFIG_NO_BOOTMEM && CONFIG_DEFERRED_STRUCT_PAGE_INIT)
> on PowerNV platform, upstream kernel (4.5.rc7) and additional patch to support
> bootmem as it was removed on powerpc a while ago.
>
> - With (CONFIG_NO_BOOTMEM && CONFIG_DEFERRED_STRUCT_PAGE_INIT) on PowerNV platform,
> upstream kernel (4.5.rc7), I dumped the reserved memblock regions and added printk
> in function deferred_init_memmap() to check if memblock reserved PFN 0x1fff80 (one
> page in memblock reserved region#31, refer to the below kernel log) is released
> to buddy allocator or not when doing deferred page struct initialization. I did
> see that PFN is released to buddy allocator at that time. However, I didn't see
> kernel crash and it would be luck and the current deferred page struct initialization
> implementation: The pages in region [0, 2GB] except the memblock reserved ones are
> presented to buddy allocator at early stage. It's not deferred. So for the pages in
> [0, 2GB], we don't have consistency issue between memblock and buddy allocator.
> The pages in region [2GB ...] are all presented to buddy allocator despite they're
> reserved in memblock or not. It ensures the kernel text section isn't corrupted
> and we're lucky not seeing program interrupt because of illegal instruction.
>
> Below is the kernel log I got from the printk:
>
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>  index a762be5..7039bc5 100644
>  --- a/mm/page_alloc.c
>  +++ b/mm/page_alloc.c
>  @@ -1307,6 +1307,9 @@ static int __init deferred_init_memmap(void *data)
>                          }
>
>                          /* Minimise pfn page lookups and scheduler checks */
>  +                       if (pfn == 0x1fff80)
>  +                               pr_info("===> %s: Free PFN 0x%lx\n", __func__, pfn);
>
>
> [    0.000000] Linux version 4.5.0-11790-g4fab991-dirty (gwshan@gwshan) (gcc version 4.9.3 (Buildroot 2016.02-rc2-00093-g5ea3bce) ) #423 SMP Sat Mar 26 23:24:15 AEDT 2016
>         :
> [    0.000000] Zone ranges:
> [    0.000000]   DMA      [mem 0x0000000000000000-0x0000001fffffffff]
> [    0.000000]   DMA32    empty
> [    0.000000]   Normal   empty
> [    0.000000] Movable zone start for each node
> [    0.000000] Early memory node ranges
> [    0.000000]   node   0: [mem 0x0000000000000000-0x0000000fffffffff]
> [    0.000000]   node   8: [mem 0x0000001000000000-0x0000001fffffffff]
>         :
> [    0.492855] Brought up 160 CPUs
> [    0.493047] Node 0 CPUs: 0-79
> [    0.493098] Node 8 CPUs: 80-159
> [    0.525746] ===> deferred_init_memmap: Free PFN 0x1fff80     <<<< In memblock reserved region#31
> [    0.525764] node 0 initialised, 1014458 pages in 30ms
> [    0.526005] node 8 initialised, 1012540 pages in 30ms
>         :
> [    8.973599] Dumping memblock [memory]
> [    8.973630]    [0000] [0000000000000000 - 0x0000001000000000, 0000001000000000] [0000000000000000] [0]
> [    8.973698]    [0001] [0000001000000000 - 0x0000002000000000, 0000001000000000] [0000000000000000] [8]
> [    8.973766] Dumping memblock [reserved]
> [    8.973797]    [0000] [0000000000000000 - 0x0000000001540000, 0000000001540000] [0000000000000000] [256]
> [    8.973866]    [0001] [000000000fc40000 - 0x000000000fcb0000, 0000000000070000] [0000000000000000] [256]
> [    8.973935]    [0002] [000000000fe80000 - 0x000000000fea0000, 0000000000020000] [0000000000000000] [256]
> [    8.974004]    [0003] [0000000030000000 - 0x0000000032de0000, 0000000002de0000] [0000000000000000] [256]
> [    8.974073]    [0004] [0000000039c00000 - 0x000000003a780200, 0000000000b80200] [0000000000000000] [256]
> [    8.974144]    [0005] [000000003fb00000 - 0x0000000040000000, 0000000000500000] [0000000000000000] [256]
> [    8.974213]    [0006] [000000007ffe0000 - 0x000000007fffddff, 000000000001ddff] [0000000000000000] [256]
> [    8.974281]    [0007] [0000000ff9c00000 - 0x0000000fff000000, 0000000005400000] [0000000000000000] [256]
> [    8.974351]    [0008] [0000000ffffa8000 - 0x0000000ffffd0000, 0000000000028000] [0000000000000000] [256]
> [    8.974419]    [0009] [0000000ffffde300 - 0x0000001000c40200, 0000000000c61f00] [0000000000000000] [256]
> [    8.974489]    [0010] [0000001ff0000000 - 0x0000001ff8000000, 0000000008000000] [0000000000000000] [256]
> [    8.974556]    [0011] [0000001ff9000000 - 0x0000001ffb000000, 0000000002000000] [0000000000000000] [256]
> [    8.974624]    [0012] [0000001ffd260000 - 0x0000001fff000000, 0000000001da0000] [0000000000000000] [256]
> [    8.974692]    [0013] [0000001fff15b780 - 0x0000001fff15b7f0, 0000000000000070] [0000000000000000] [256]
> [    8.974760]    [0014] [0000001fff15b800 - 0x0000001fff15b910, 0000000000000110] [0000000000000000] [256]
> [    8.974828]    [0015] [0000001fff15b980 - 0x0000001fff15c108, 0000000000000788] [0000000000000000] [256]
> [    8.974904]    [0016] [0000001fff15c180 - 0x0000001fff15c188, 0000000000000008] [0000000000000000] [256]
> [    8.974974]    [0017] [0000001fff174200 - 0x0000001fff17c223, 0000000000008023] [0000000000000000] [256]
> [    8.975042]    [0018] [0000001fff17c280 - 0x0000001fff17c2a3, 0000000000000023] [0000000000000000] [256]
> [    8.975110]    [0019] [0000001fff17c300 - 0x0000001fff1a5ba0, 00000000000298a0] [0000000000000000] [256]
> [    8.975178]    [0020] [0000001fff1a5c00 - 0x0000001fff1b8148, 0000000000012548] [0000000000000000] [256]
> [    8.975247]    [0021] [0000001fff1b8180 - 0x0000001fff1c86a0, 0000000000010520] [0000000000000000] [256]
> [    8.975315]    [0022] [0000001fff1c8700 - 0x0000001fff1dac48, 0000000000012548] [0000000000000000] [256]
> [    8.975385]    [0023] [0000001fff1dac80 - 0x0000001fff1eb0a0, 0000000000010420] [0000000000000000] [256]
> [    8.975454]    [0024] [0000001fff1eb100 - 0x0000001fff1fd3c8, 00000000000122c8] [0000000000000000] [256]
> [    8.975522]    [0025] [0000001fff1fd400 - 0x0000001fff20d820, 0000000000010420] [0000000000000000] [256]
> [    8.975592]    [0026] [0000001fff20d880 - 0x0000001fff21fb48, 00000000000122c8] [0000000000000000] [256]
> [    8.975660]    [0027] [0000001fff21fb80 - 0x0000001fff22ffa0, 0000000000010420] [0000000000000000] [256]
> [    8.975727]    [0028] [0000001fff230000 - 0x0000001fff2422c8, 00000000000122c8] [0000000000000000] [256]
> [    8.975795]    [0029] [0000001fff242300 - 0x0000001fff764b23, 0000000000522823] [0000000000000000] [256]
> [    8.975864]    [0030] [0000001fff764b48 - 0x0000001fff7ffffc, 000000000009b4b4] [0000000000000000] [256]
> [    8.975932]    [0031] [0000001fff800000 - 0x0000002000000000, 0000000000800000] [0000000000000000] [256]
>
>>> This fixes above issue by:
>>>
>>>    * Deferred releasing bootmem bitmap until the completion of deferred
>>>      page initialization.
>>
>>As I said in my other mail, we don't support bootmem anymore. So please resend
>>with just the non-bootmem fixes.
>>
>
> I think this patch is generic one. I guess bootmem might be supported on other
> platforms other than PPC? If that's the case, it would be fine to have the code
> fixing the bootmem bitmap if you agree. If you want me to split the patch into
> two for bootmem and memblock cases separately, I can do it absolutely. Please
> let me know your preference :-)
>
> Thanks,
> Gavin
>
> _______________________________________________
> Linuxppc-dev mailing list
> Linuxppc-dev@lists.ozlabs.org
> https://lists.ozlabs.org/listinfo/linuxppc-dev



-- 

Best Regards
-Li

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [RFC] mm: Fix memory corruption caused by deferred page initialization
  2016-03-27 13:48     ` Gavin Shan
@ 2016-03-31  2:27       ` Gavin Shan
  2016-04-04  8:39         ` Mel Gorman
  0 siblings, 1 reply; 9+ messages in thread
From: Gavin Shan @ 2016-03-31  2:27 UTC (permalink / raw)
  To: Gavin Shan; +Cc: Michael Ellerman, linux-mm, linuxppc-dev, mgorman, zhlcindy

On Mon, Mar 28, 2016 at 12:48:27AM +1100, Gavin Shan wrote:
>On Sun, Mar 27, 2016 at 12:37:09AM +1100, Gavin Shan wrote:
>>On Sat, Mar 26, 2016 at 08:47:17PM +1100, Michael Ellerman wrote:
>>>Hi Gavin,
>>>
>>>On Fri, 2016-25-03 at 16:05:29 UTC, Gavin Shan wrote:
>>>> During deferred page initialization, the pages are moved from memblock
>>>> or bootmem to buddy allocator without checking they were reserved. Those
>>>> reserved pages can be reallocated to somebody else by buddy/slab allocator.
>>>> It leads to memory corruption and potential kernel crash eventually.
>>>
>>>Can you give me a bit more detail on what the bug is?
>>>
>>>I haven't seen any issues on my systems, but I realise now I haven't enabled
>>>DEFERRED_STRUCT_PAGE_INIT - I assumed it was enabled by default.
>>>
>>>How did this get tested before submission?
>>>
>>
>>Michael, I have to reply with same context in another thread in case 
>>somebody else wants to understand more: Li, who is in the cc list, is
>>backporting deferred page initialization (CONFIG_DEFERRED_STRUCT_PAGE_INIT)
>>from upstream kernel to RHEL 7.2 or 7.3 kernel (3.10.0-357.el7). RHEL kernel
>>has (!CONFIG_NO_BOOTMEM && CONFIG_DEFERRED_STRUCT_PAGE_INIT), meaning
>>bootmem is enabled. She eventually runs into kernel crash and I jumped
>>in to help understanding the root cause.
>>
>>There're two related kernel config options: ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT
>>and DEFERRED_STRUCT_PAGE_INIT. The former one is enabled on PPC by default.
>>The latter one isn't enabled by default.
>>
>>There are two test cases I had:
>>
>>- With (!CONFIG_NO_BOOTMEM && CONFIG_DEFERRED_STRUCT_PAGE_INIT)
>>on PowerNV platform, upstream kernel (4.5.rc7) and additional patch to support
>>bootmem as it was removed on powerpc a while ago.
>>
>>- With (CONFIG_NO_BOOTMEM && CONFIG_DEFERRED_STRUCT_PAGE_INIT) on PowerNV platform,
>>upstream kernel (4.5.rc7), I dumped the reserved memblock regions and added printk
>>in function deferred_init_memmap() to check if memblock reserved PFN 0x1fff80 (one
>>page in memblock reserved region#31, refer to the below kernel log) is released
>>to buddy allocator or not when doing deferred page struct initialization. I did
>>see that PFN is released to buddy allocator at that time. However, I didn't see
>>a kernel crash, which is down to luck and to the current deferred page struct initialization
>>implementation: The pages in region [0, 2GB] except the memblock reserved ones are
>>presented to buddy allocator at early stage. It's not deferred. So for the pages in
>>[0, 2GB], we don't have consistency issue between memblock and buddy allocator.
>>The pages in region [2GB ...] are all presented to buddy allocator despite they're
>>reserved in memblock or not. It ensures the kernel text section isn't corrupted
>>and we're lucky not seeing program interrupt because of illegal instruction.
>>
>
>After more debugging, it turns out that Michael is correct: we don't have a problem
>when CONFIG_NO_BOOTMEM=y. In that case, the page frames in [2G ...] are marked as
>reserved at an early stage (as the function calls below reveal). During the deferred
>initialization stage, those reserved pages won't be released to buddy allocator:
>
>- Below function calls mark reserved pages according to memblock reserved regions:
>  init/main.c::start_kernel()
>  init/main.c::mm_init()
>  arch/powerpc/mm/mem.c::mem_init()
>  nobootmem.c::free_all_bootmem()            <-> bootmem.c::free_all_bootmem() on !CONFIG_NO_BOOTMEM
>  nobootmem.c::free_low_memory_core_early()
>  nobootmem.c::reserve_bootmem_region()
>
>- In page_alloc.c::deferred_init_memmap(), the reserved pages aren't released
>  to buddy allocator with below check:
>
>                        if (page->flags) {
>                                VM_BUG_ON(page_zone(page) != zone);
>                                goto free_range;
>                        }
>
>
>So the issue only exists when CONFIG_NO_BOOTMEM=n. The alternative fix would
>be similar to what we have on CONFIG_NO_BOOTMEM: In early stage, all page structs
>for bootmem reserved pages are initialized and marked with PG_reserved. I'm
>not sure it's worth fixing as we won't support bootmem, as Michael mentioned.
>

Mel, could you please confirm if we need a fix on !CONFIG_NO_BOOTMEM? If we need,
I'll respin and send a patch for review.

Thanks,
Gavin

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [RFC] mm: Fix memory corruption caused by deferred page initialization
  2016-03-31  2:27       ` Gavin Shan
@ 2016-04-04  8:39         ` Mel Gorman
  2016-04-04 11:24           ` Gavin Shan
  0 siblings, 1 reply; 9+ messages in thread
From: Mel Gorman @ 2016-04-04  8:39 UTC (permalink / raw)
  To: Gavin Shan; +Cc: Michael Ellerman, linux-mm, linuxppc-dev, zhlcindy

On Thu, Mar 31, 2016 at 01:27:34PM +1100, Gavin Shan wrote:
> >So the issue only exists when CONFIG_NO_BOOTMEM=n. The alternative fix would
> >be similar to what we have on CONFIG_NO_BOOTMEM: In early stage, all page structs
> >for bootmem reserved pages are initialized and marked with PG_reserved. I'm
> >not sure it's worth fixing as we won't support bootmem, as Michael mentioned.
> >
> 
> Mel, could you please confirm if we need a fix on !CONFIG_NO_BOOTMEM? If we need,
> I'll respin and send a patch for review.
> 

Given that !CONFIG_NO_BOOTMEM is not supported and bootmem is meant to be
slowly retiring, I would suggest instead making deferred memory init
depend on NO_BOOTMEM. 

-- 
Mel Gorman
SUSE Labs

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [RFC] mm: Fix memory corruption caused by deferred page initialization
  2016-04-04  8:39         ` Mel Gorman
@ 2016-04-04 11:24           ` Gavin Shan
  0 siblings, 0 replies; 9+ messages in thread
From: Gavin Shan @ 2016-04-04 11:24 UTC (permalink / raw)
  To: Mel Gorman; +Cc: Gavin Shan, Michael Ellerman, linux-mm, linuxppc-dev, zhlcindy

On Mon, Apr 04, 2016 at 09:39:39AM +0100, Mel Gorman wrote:
>On Thu, Mar 31, 2016 at 01:27:34PM +1100, Gavin Shan wrote:
>> >So the issue only exists when CONFIG_NO_BOOTMEM=n. The alternative fix would
>> >be similar to what we have on CONFIG_NO_BOOTMEM: In early stage, all page structs
>> >for bootmem reserved pages are initialized and marked with PG_reserved. I'm
>> >not sure it's worth fixing as we won't support bootmem, as Michael mentioned.
>> >
>> 
>> Mel, could you please confirm if we need a fix on !CONFIG_NO_BOOTMEM? If we need,
>> I'll respin and send a patch for review.
>> 
>
>Given that !CONFIG_NO_BOOTMEM is not supported and bootmem is meant to be
>slowly retiring, I would suggest instead making deferred memory init
>depend on NO_BOOTMEM. 
>

Thanks for confirming, Mel. Having the simplest fix for this issue would be
the best strategy. I'll send a follow-up patch to address it.

Thanks,
Gavin

>-- 
>Mel Gorman
>SUSE Labs
>

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2016-04-04 11:25 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-03-25 16:05 [PATCH RFC] mm: Fix memory corruption caused by deferred page initialization Gavin Shan
2016-03-26  9:47 ` [RFC] " Michael Ellerman
2016-03-26 13:37   ` Gavin Shan
2016-03-27 13:48     ` Gavin Shan
2016-03-31  2:27       ` Gavin Shan
2016-04-04  8:39         ` Mel Gorman
2016-04-04 11:24           ` Gavin Shan
2016-03-28 14:20     ` Aneesh Kumar K.V
2016-03-29  5:13     ` Li Zhang

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).