All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] mm, x86: no zeroing of hugetlbfs pages at boot
@ 2013-03-06 21:50 ` Cliff Wickman
  0 siblings, 0 replies; 26+ messages in thread
From: Cliff Wickman @ 2013-03-06 21:50 UTC (permalink / raw)
  To: linux-kernel; +Cc: linux-mm, x86, wli

From: Cliff Wickman <cpw@sgi.com>

Allocating a large number of 1GB hugetlbfs pages at boot takes a
very long time. 

Large system sites would at times like to allocate a very large amount of
memory as 1GB pages.  They would put this on the kernel boot line:
   default_hugepagesz=1G hugepagesz=1G hugepages=4096
[Dynamic allocation of 1G pages is not an option, as zone pages only go
 up to MAX_ORDER, and MAX_ORDER cannot exceed the section size.]

Each page is zeroed as it is allocated, and all allocation is done by
cpu 0, as this path is early in boot:
      start_kernel
        kernel_init
          do_pre_smp_initcalls
            hugetlb_init
              hugetlb_init_hstates
                hugetlb_hstate_alloc_pages

Zeroing remote (offnode) memory proceeds at ~1GB/sec (and most memory is
offnode on large numa systems).
This estimate is approximate (it depends on core frequency & number of hops
to remote memory) but should be within a factor of 2 on most systems.
A benchmark attempting to reserve 1TB as 1GB pages would thus require
~1000 seconds of boot time just for this allocation.  32TB would take 8 hours.

I propose passing a flag to the early allocator to indicate that no zeroing
of a page should be done.  The 'no zeroing' flag would have to be passed
down this code path:

  hugetlb_hstate_alloc_pages
    alloc_bootmem_huge_page
      __alloc_bootmem_node_nopanic NO_ZERO  (nobootmem.c)
        __alloc_memory_core_early  NO_ZERO
	  if (!(flags & NO_ZERO))
            memset(ptr, 0, size);

Or this path if CONFIG_NO_BOOTMEM is not set:

  hugetlb_hstate_alloc_pages
    alloc_bootmem_huge_page
      __alloc_bootmem_node_nopanic  NO_ZERO  (bootmem.c)
        alloc_bootmem_core          NO_ZERO
	  if (!(flags & NO_ZERO))
            memset(region, 0, size);
        __alloc_bootmem_nopanic     NO_ZERO
          ___alloc_bootmem_nopanic  NO_ZERO
            alloc_bootmem_core      NO_ZERO
	      if (!(flags & NO_ZERO))
                memset(region, 0, size);

Signed-off-by: Cliff Wickman <cpw@sgi.com>

---
 arch/x86/kernel/setup_percpu.c |    4 ++--
 include/linux/bootmem.h        |   23 ++++++++++++++++-------
 mm/bootmem.c                   |   12 +++++++-----
 mm/hugetlb.c                   |    3 ++-
 mm/nobootmem.c                 |   41 +++++++++++++++++++++++------------------
 mm/page_cgroup.c               |    2 +-
 mm/sparse.c                    |    2 +-
 7 files changed, 52 insertions(+), 35 deletions(-)

Index: linux/include/linux/bootmem.h
===================================================================
--- linux.orig/include/linux/bootmem.h
+++ linux/include/linux/bootmem.h
@@ -8,6 +8,11 @@
 #include <asm/dma.h>
 
 /*
+ * allocation flags
+ */
+#define NO_ZERO		0x00000001
+
+/*
  *  simple boot-time physical memory area allocator.
  */
 
@@ -79,7 +84,8 @@ extern void *__alloc_bootmem(unsigned lo
 			     unsigned long goal);
 extern void *__alloc_bootmem_nopanic(unsigned long size,
 				     unsigned long align,
-				     unsigned long goal);
+				     unsigned long goal,
+				     u32 flags);
 extern void *__alloc_bootmem_node(pg_data_t *pgdat,
 				  unsigned long size,
 				  unsigned long align,
@@ -91,12 +97,14 @@ void *__alloc_bootmem_node_high(pg_data_
 extern void *__alloc_bootmem_node_nopanic(pg_data_t *pgdat,
 				  unsigned long size,
 				  unsigned long align,
-				  unsigned long goal);
+				  unsigned long goal,
+				  u32 flags);
 void *___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
 				  unsigned long size,
 				  unsigned long align,
 				  unsigned long goal,
-				  unsigned long limit);
+				  unsigned long limit,
+				  u32 flags);
 extern void *__alloc_bootmem_low(unsigned long size,
 				 unsigned long align,
 				 unsigned long goal);
@@ -120,19 +128,20 @@ extern void *__alloc_bootmem_low_node(pg
 #define alloc_bootmem_align(x, align) \
 	__alloc_bootmem(x, align, BOOTMEM_LOW_LIMIT)
 #define alloc_bootmem_nopanic(x) \
-	__alloc_bootmem_nopanic(x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT)
+	__alloc_bootmem_nopanic(x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT, 0)
 #define alloc_bootmem_pages(x) \
 	__alloc_bootmem(x, PAGE_SIZE, BOOTMEM_LOW_LIMIT)
 #define alloc_bootmem_pages_nopanic(x) \
-	__alloc_bootmem_nopanic(x, PAGE_SIZE, BOOTMEM_LOW_LIMIT)
+	__alloc_bootmem_nopanic(x, PAGE_SIZE, BOOTMEM_LOW_LIMIT, 0)
 #define alloc_bootmem_node(pgdat, x) \
 	__alloc_bootmem_node(pgdat, x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT)
 #define alloc_bootmem_node_nopanic(pgdat, x) \
-	__alloc_bootmem_node_nopanic(pgdat, x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT)
+	__alloc_bootmem_node_nopanic(pgdat, x, SMP_CACHE_BYTES, \
+				     BOOTMEM_LOW_LIMIT, 0)
 #define alloc_bootmem_pages_node(pgdat, x) \
 	__alloc_bootmem_node(pgdat, x, PAGE_SIZE, BOOTMEM_LOW_LIMIT)
 #define alloc_bootmem_pages_node_nopanic(pgdat, x) \
-	__alloc_bootmem_node_nopanic(pgdat, x, PAGE_SIZE, BOOTMEM_LOW_LIMIT)
+	__alloc_bootmem_node_nopanic(pgdat, x, PAGE_SIZE, BOOTMEM_LOW_LIMIT, 0)
 
 #define alloc_bootmem_low(x) \
 	__alloc_bootmem_low(x, SMP_CACHE_BYTES, 0)
Index: linux/arch/x86/kernel/setup_percpu.c
===================================================================
--- linux.orig/arch/x86/kernel/setup_percpu.c
+++ linux/arch/x86/kernel/setup_percpu.c
@@ -104,14 +104,14 @@ static void * __init pcpu_alloc_bootmem(
 	void *ptr;
 
 	if (!node_online(node) || !NODE_DATA(node)) {
-		ptr = __alloc_bootmem_nopanic(size, align, goal);
+		ptr = __alloc_bootmem_nopanic(size, align, goal, 0);
 		pr_info("cpu %d has no node %d or node-local memory\n",
 			cpu, node);
 		pr_debug("per cpu data for cpu%d %lu bytes at %016lx\n",
 			 cpu, size, __pa(ptr));
 	} else {
 		ptr = __alloc_bootmem_node_nopanic(NODE_DATA(node),
-						   size, align, goal);
+						   size, align, goal, 0);
 		pr_debug("per cpu data for cpu%d %lu bytes on node%d at %016lx\n",
 			 cpu, size, node, __pa(ptr));
 	}
Index: linux/mm/nobootmem.c
===================================================================
--- linux.orig/mm/nobootmem.c
+++ linux/mm/nobootmem.c
@@ -33,7 +33,7 @@ unsigned long min_low_pfn;
 unsigned long max_pfn;
 
 static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
-					u64 goal, u64 limit)
+					u64 goal, u64 limit, u32 flags)
 {
 	void *ptr;
 	u64 addr;
@@ -46,7 +46,8 @@ static void * __init __alloc_memory_core
 		return NULL;
 
 	ptr = phys_to_virt(addr);
-	memset(ptr, 0, size);
+	if (!(flags & NO_ZERO))
+		memset(ptr, 0, size);
 	memblock_reserve(addr, size);
 	/*
 	 * The min_count is set to 0 so that bootmem allocated blocks
@@ -208,7 +209,8 @@ void __init free_bootmem(unsigned long a
 static void * __init ___alloc_bootmem_nopanic(unsigned long size,
 					unsigned long align,
 					unsigned long goal,
-					unsigned long limit)
+					unsigned long limit,
+					u32 flags)
 {
 	void *ptr;
 
@@ -217,7 +219,8 @@ static void * __init ___alloc_bootmem_no
 
 restart:
 
-	ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal, limit);
+	ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal,
+					limit, 0);
 
 	if (ptr)
 		return ptr;
@@ -244,17 +247,17 @@ restart:
  * Returns NULL on failure.
  */
 void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
-					unsigned long goal)
+					unsigned long goal, u32 flags)
 {
 	unsigned long limit = -1UL;
 
-	return ___alloc_bootmem_nopanic(size, align, goal, limit);
+	return ___alloc_bootmem_nopanic(size, align, goal, limit, flags);
 }
 
 static void * __init ___alloc_bootmem(unsigned long size, unsigned long align,
-					unsigned long goal, unsigned long limit)
+			unsigned long goal, unsigned long limit, u32 flags)
 {
-	void *mem = ___alloc_bootmem_nopanic(size, align, goal, limit);
+	void *mem = ___alloc_bootmem_nopanic(size, align, goal, limit, flags);
 
 	if (mem)
 		return mem;
@@ -284,25 +287,26 @@ void * __init __alloc_bootmem(unsigned l
 {
 	unsigned long limit = -1UL;
 
-	return ___alloc_bootmem(size, align, goal, limit);
+	return ___alloc_bootmem(size, align, goal, limit, 0);
 }
 
 void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
 						   unsigned long size,
 						   unsigned long align,
 						   unsigned long goal,
-						   unsigned long limit)
+						   unsigned long limit,
+						   u32 flags)
 {
 	void *ptr;
 
 again:
 	ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
-					goal, limit);
+					goal, limit, flags);
 	if (ptr)
 		return ptr;
 
 	ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align,
-					goal, limit);
+					goal, limit, flags);
 	if (ptr)
 		return ptr;
 
@@ -315,12 +319,13 @@ again:
 }
 
 void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
-				   unsigned long align, unsigned long goal)
+			unsigned long align, unsigned long goal, u32 flags)
 {
 	if (WARN_ON_ONCE(slab_is_available()))
 		return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
 
-	return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
+	return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal,
+			0, flags);
 }
 
 void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
@@ -329,7 +334,7 @@ void * __init ___alloc_bootmem_node(pg_d
 {
 	void *ptr;
 
-	ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, limit);
+	ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, limit, 0);
 	if (ptr)
 		return ptr;
 
@@ -354,7 +359,7 @@ void * __init ___alloc_bootmem_node(pg_d
  * The function panics if the request can not be satisfied.
  */
 void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
-				   unsigned long align, unsigned long goal)
+			unsigned long align, unsigned long goal)
 {
 	if (WARN_ON_ONCE(slab_is_available()))
 		return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
@@ -388,7 +393,7 @@ void * __init __alloc_bootmem_node_high(
 void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
 				  unsigned long goal)
 {
-	return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT);
+	return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT, 0);
 }
 
 void * __init __alloc_bootmem_low_nopanic(unsigned long size,
@@ -396,7 +401,7 @@ void * __init __alloc_bootmem_low_nopani
 					  unsigned long goal)
 {
 	return ___alloc_bootmem_nopanic(size, align, goal,
-					ARCH_LOW_ADDRESS_LIMIT);
+					ARCH_LOW_ADDRESS_LIMIT, 0);
 }
 
 /**
Index: linux/mm/sparse.c
===================================================================
--- linux.orig/mm/sparse.c
+++ linux/mm/sparse.c
@@ -281,7 +281,7 @@ sparse_early_usemaps_alloc_pgdat_section
 	nid = early_pfn_to_nid(goal >> PAGE_SHIFT);
 again:
 	p = ___alloc_bootmem_node_nopanic(NODE_DATA(nid), size,
-					  SMP_CACHE_BYTES, goal, limit);
+					  SMP_CACHE_BYTES, goal, limit, 0);
 	if (!p && limit) {
 		limit = 0;
 		goto again;
Index: linux/mm/hugetlb.c
===================================================================
--- linux.orig/mm/hugetlb.c
+++ linux/mm/hugetlb.c
@@ -1188,7 +1188,8 @@ int __weak alloc_bootmem_huge_page(struc
 		addr = __alloc_bootmem_node_nopanic(
 				NODE_DATA(hstate_next_node_to_alloc(h,
 						&node_states[N_MEMORY])),
-				huge_page_size(h), huge_page_size(h), 0);
+				huge_page_size(h), huge_page_size(h),
+				0, NO_ZERO);
 
 		if (addr) {
 			/*
Index: linux/mm/bootmem.c
===================================================================
--- linux.orig/mm/bootmem.c
+++ linux/mm/bootmem.c
@@ -660,7 +660,7 @@ restart:
  * Returns NULL on failure.
  */
 void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
-					unsigned long goal)
+					unsigned long goal, u32 flags)
 {
 	unsigned long limit = 0;
 
@@ -705,7 +705,8 @@ void * __init __alloc_bootmem(unsigned l
 
 void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
 				unsigned long size, unsigned long align,
-				unsigned long goal, unsigned long limit)
+				unsigned long goal, unsigned long limit,
+				u32 flags)
 {
 	void *ptr;
 
@@ -734,12 +735,13 @@ again:
 }
 
 void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
-				   unsigned long align, unsigned long goal)
+			unsigned long align, unsigned long goal, u32 flags)
 {
 	if (WARN_ON_ONCE(slab_is_available()))
 		return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
 
-	return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
+	return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal,
+					     0, flags);
 }
 
 void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
@@ -748,7 +750,7 @@ void * __init ___alloc_bootmem_node(pg_d
 {
 	void *ptr;
 
-	ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
+	ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0, 0);
 	if (ptr)
 		return ptr;
 
Index: linux/mm/page_cgroup.c
===================================================================
--- linux.orig/mm/page_cgroup.c
+++ linux/mm/page_cgroup.c
@@ -55,7 +55,7 @@ static int __init alloc_node_page_cgroup
 	table_size = sizeof(struct page_cgroup) * nr_pages;
 
 	base = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
-			table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
+			table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS), 0);
 	if (!base)
 		return -ENOMEM;
 	NODE_DATA(nid)->node_page_cgroup = base;

^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH] mm, x86: no zeroing of hugetlbfs pages at boot
@ 2013-03-06 21:50 ` Cliff Wickman
  0 siblings, 0 replies; 26+ messages in thread
From: Cliff Wickman @ 2013-03-06 21:50 UTC (permalink / raw)
  To: linux-kernel; +Cc: linux-mm, x86, wli

From: Cliff Wickman <cpw@sgi.com>

Allocating a large number of 1GB hugetlbfs pages at boot takes a
very long time. 

Large system sites would at times like to allocate a very large amount of
memory as 1GB pages.  They would put this on the kernel boot line:
   default_hugepagesz=1G hugepagesz=1G hugepages=4096
[Dynamic allocation of 1G pages is not an option, as zone pages only go
 up to MAX_ORDER, and MAX_ORDER cannot exceed the section size.]

Each page is zeroed as it is allocated, and all allocation is done by
cpu 0, as this path is early in boot:
      start_kernel
        kernel_init
          do_pre_smp_initcalls
            hugetlb_init
              hugetlb_init_hstates
                hugetlb_hstate_alloc_pages

Zeroing remote (offnode) memory proceeds at ~1GB/sec (and most memory is
offnode on large numa systems).
This estimate is approximate (it depends on core frequency & number of hops
to remote memory) but should be within a factor of 2 on most systems.
A benchmark attempting to reserve 1TB as 1GB pages would thus require
~1000 seconds of boot time just for this allocation.  32TB would take 8 hours.

I propose passing a flag to the early allocator to indicate that no zeroing
of a page should be done.  The 'no zeroing' flag would have to be passed
down this code path:

  hugetlb_hstate_alloc_pages
    alloc_bootmem_huge_page
      __alloc_bootmem_node_nopanic NO_ZERO  (nobootmem.c)
        __alloc_memory_core_early  NO_ZERO
	  if (!(flags & NO_ZERO))
            memset(ptr, 0, size);

Or this path if CONFIG_NO_BOOTMEM is not set:

  hugetlb_hstate_alloc_pages
    alloc_bootmem_huge_page
      __alloc_bootmem_node_nopanic  NO_ZERO  (bootmem.c)
        alloc_bootmem_core          NO_ZERO
	  if (!(flags & NO_ZERO))
            memset(region, 0, size);
        __alloc_bootmem_nopanic     NO_ZERO
          ___alloc_bootmem_nopanic  NO_ZERO
            alloc_bootmem_core      NO_ZERO
	      if (!(flags & NO_ZERO))
                memset(region, 0, size);

Signed-off-by: Cliff Wickman <cpw@sgi.com>

---
 arch/x86/kernel/setup_percpu.c |    4 ++--
 include/linux/bootmem.h        |   23 ++++++++++++++++-------
 mm/bootmem.c                   |   12 +++++++-----
 mm/hugetlb.c                   |    3 ++-
 mm/nobootmem.c                 |   41 +++++++++++++++++++++++------------------
 mm/page_cgroup.c               |    2 +-
 mm/sparse.c                    |    2 +-
 7 files changed, 52 insertions(+), 35 deletions(-)

Index: linux/include/linux/bootmem.h
===================================================================
--- linux.orig/include/linux/bootmem.h
+++ linux/include/linux/bootmem.h
@@ -8,6 +8,11 @@
 #include <asm/dma.h>
 
 /*
+ * allocation flags
+ */
+#define NO_ZERO		0x00000001
+
+/*
  *  simple boot-time physical memory area allocator.
  */
 
@@ -79,7 +84,8 @@ extern void *__alloc_bootmem(unsigned lo
 			     unsigned long goal);
 extern void *__alloc_bootmem_nopanic(unsigned long size,
 				     unsigned long align,
-				     unsigned long goal);
+				     unsigned long goal,
+				     u32 flags);
 extern void *__alloc_bootmem_node(pg_data_t *pgdat,
 				  unsigned long size,
 				  unsigned long align,
@@ -91,12 +97,14 @@ void *__alloc_bootmem_node_high(pg_data_
 extern void *__alloc_bootmem_node_nopanic(pg_data_t *pgdat,
 				  unsigned long size,
 				  unsigned long align,
-				  unsigned long goal);
+				  unsigned long goal,
+				  u32 flags);
 void *___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
 				  unsigned long size,
 				  unsigned long align,
 				  unsigned long goal,
-				  unsigned long limit);
+				  unsigned long limit,
+				  u32 flags);
 extern void *__alloc_bootmem_low(unsigned long size,
 				 unsigned long align,
 				 unsigned long goal);
@@ -120,19 +128,20 @@ extern void *__alloc_bootmem_low_node(pg
 #define alloc_bootmem_align(x, align) \
 	__alloc_bootmem(x, align, BOOTMEM_LOW_LIMIT)
 #define alloc_bootmem_nopanic(x) \
-	__alloc_bootmem_nopanic(x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT)
+	__alloc_bootmem_nopanic(x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT, 0)
 #define alloc_bootmem_pages(x) \
 	__alloc_bootmem(x, PAGE_SIZE, BOOTMEM_LOW_LIMIT)
 #define alloc_bootmem_pages_nopanic(x) \
-	__alloc_bootmem_nopanic(x, PAGE_SIZE, BOOTMEM_LOW_LIMIT)
+	__alloc_bootmem_nopanic(x, PAGE_SIZE, BOOTMEM_LOW_LIMIT, 0)
 #define alloc_bootmem_node(pgdat, x) \
 	__alloc_bootmem_node(pgdat, x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT)
 #define alloc_bootmem_node_nopanic(pgdat, x) \
-	__alloc_bootmem_node_nopanic(pgdat, x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT)
+	__alloc_bootmem_node_nopanic(pgdat, x, SMP_CACHE_BYTES, \
+				     BOOTMEM_LOW_LIMIT, 0)
 #define alloc_bootmem_pages_node(pgdat, x) \
 	__alloc_bootmem_node(pgdat, x, PAGE_SIZE, BOOTMEM_LOW_LIMIT)
 #define alloc_bootmem_pages_node_nopanic(pgdat, x) \
-	__alloc_bootmem_node_nopanic(pgdat, x, PAGE_SIZE, BOOTMEM_LOW_LIMIT)
+	__alloc_bootmem_node_nopanic(pgdat, x, PAGE_SIZE, BOOTMEM_LOW_LIMIT, 0)
 
 #define alloc_bootmem_low(x) \
 	__alloc_bootmem_low(x, SMP_CACHE_BYTES, 0)
Index: linux/arch/x86/kernel/setup_percpu.c
===================================================================
--- linux.orig/arch/x86/kernel/setup_percpu.c
+++ linux/arch/x86/kernel/setup_percpu.c
@@ -104,14 +104,14 @@ static void * __init pcpu_alloc_bootmem(
 	void *ptr;
 
 	if (!node_online(node) || !NODE_DATA(node)) {
-		ptr = __alloc_bootmem_nopanic(size, align, goal);
+		ptr = __alloc_bootmem_nopanic(size, align, goal, 0);
 		pr_info("cpu %d has no node %d or node-local memory\n",
 			cpu, node);
 		pr_debug("per cpu data for cpu%d %lu bytes at %016lx\n",
 			 cpu, size, __pa(ptr));
 	} else {
 		ptr = __alloc_bootmem_node_nopanic(NODE_DATA(node),
-						   size, align, goal);
+						   size, align, goal, 0);
 		pr_debug("per cpu data for cpu%d %lu bytes on node%d at %016lx\n",
 			 cpu, size, node, __pa(ptr));
 	}
Index: linux/mm/nobootmem.c
===================================================================
--- linux.orig/mm/nobootmem.c
+++ linux/mm/nobootmem.c
@@ -33,7 +33,7 @@ unsigned long min_low_pfn;
 unsigned long max_pfn;
 
 static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
-					u64 goal, u64 limit)
+					u64 goal, u64 limit, u32 flags)
 {
 	void *ptr;
 	u64 addr;
@@ -46,7 +46,8 @@ static void * __init __alloc_memory_core
 		return NULL;
 
 	ptr = phys_to_virt(addr);
-	memset(ptr, 0, size);
+	if (!(flags & NO_ZERO))
+		memset(ptr, 0, size);
 	memblock_reserve(addr, size);
 	/*
 	 * The min_count is set to 0 so that bootmem allocated blocks
@@ -208,7 +209,8 @@ void __init free_bootmem(unsigned long a
 static void * __init ___alloc_bootmem_nopanic(unsigned long size,
 					unsigned long align,
 					unsigned long goal,
-					unsigned long limit)
+					unsigned long limit,
+					u32 flags)
 {
 	void *ptr;
 
@@ -217,7 +219,8 @@ static void * __init ___alloc_bootmem_no
 
 restart:
 
-	ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal, limit);
+	ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal,
+					limit, 0);
 
 	if (ptr)
 		return ptr;
@@ -244,17 +247,17 @@ restart:
  * Returns NULL on failure.
  */
 void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
-					unsigned long goal)
+					unsigned long goal, u32 flags)
 {
 	unsigned long limit = -1UL;
 
-	return ___alloc_bootmem_nopanic(size, align, goal, limit);
+	return ___alloc_bootmem_nopanic(size, align, goal, limit, flags);
 }
 
 static void * __init ___alloc_bootmem(unsigned long size, unsigned long align,
-					unsigned long goal, unsigned long limit)
+			unsigned long goal, unsigned long limit, u32 flags)
 {
-	void *mem = ___alloc_bootmem_nopanic(size, align, goal, limit);
+	void *mem = ___alloc_bootmem_nopanic(size, align, goal, limit, flags);
 
 	if (mem)
 		return mem;
@@ -284,25 +287,26 @@ void * __init __alloc_bootmem(unsigned l
 {
 	unsigned long limit = -1UL;
 
-	return ___alloc_bootmem(size, align, goal, limit);
+	return ___alloc_bootmem(size, align, goal, limit, 0);
 }
 
 void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
 						   unsigned long size,
 						   unsigned long align,
 						   unsigned long goal,
-						   unsigned long limit)
+						   unsigned long limit,
+						   u32 flags)
 {
 	void *ptr;
 
 again:
 	ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
-					goal, limit);
+					goal, limit, flags);
 	if (ptr)
 		return ptr;
 
 	ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align,
-					goal, limit);
+					goal, limit, flags);
 	if (ptr)
 		return ptr;
 
@@ -315,12 +319,13 @@ again:
 }
 
 void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
-				   unsigned long align, unsigned long goal)
+			unsigned long align, unsigned long goal, u32 flags)
 {
 	if (WARN_ON_ONCE(slab_is_available()))
 		return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
 
-	return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
+	return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal,
+			0, flags);
 }
 
 void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
@@ -329,7 +334,7 @@ void * __init ___alloc_bootmem_node(pg_d
 {
 	void *ptr;
 
-	ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, limit);
+	ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, limit, 0);
 	if (ptr)
 		return ptr;
 
@@ -354,7 +359,7 @@ void * __init ___alloc_bootmem_node(pg_d
  * The function panics if the request can not be satisfied.
  */
 void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
-				   unsigned long align, unsigned long goal)
+			unsigned long align, unsigned long goal)
 {
 	if (WARN_ON_ONCE(slab_is_available()))
 		return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
@@ -388,7 +393,7 @@ void * __init __alloc_bootmem_node_high(
 void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
 				  unsigned long goal)
 {
-	return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT);
+	return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT, 0);
 }
 
 void * __init __alloc_bootmem_low_nopanic(unsigned long size,
@@ -396,7 +401,7 @@ void * __init __alloc_bootmem_low_nopani
 					  unsigned long goal)
 {
 	return ___alloc_bootmem_nopanic(size, align, goal,
-					ARCH_LOW_ADDRESS_LIMIT);
+					ARCH_LOW_ADDRESS_LIMIT, 0);
 }
 
 /**
Index: linux/mm/sparse.c
===================================================================
--- linux.orig/mm/sparse.c
+++ linux/mm/sparse.c
@@ -281,7 +281,7 @@ sparse_early_usemaps_alloc_pgdat_section
 	nid = early_pfn_to_nid(goal >> PAGE_SHIFT);
 again:
 	p = ___alloc_bootmem_node_nopanic(NODE_DATA(nid), size,
-					  SMP_CACHE_BYTES, goal, limit);
+					  SMP_CACHE_BYTES, goal, limit, 0);
 	if (!p && limit) {
 		limit = 0;
 		goto again;
Index: linux/mm/hugetlb.c
===================================================================
--- linux.orig/mm/hugetlb.c
+++ linux/mm/hugetlb.c
@@ -1188,7 +1188,8 @@ int __weak alloc_bootmem_huge_page(struc
 		addr = __alloc_bootmem_node_nopanic(
 				NODE_DATA(hstate_next_node_to_alloc(h,
 						&node_states[N_MEMORY])),
-				huge_page_size(h), huge_page_size(h), 0);
+				huge_page_size(h), huge_page_size(h),
+				0, NO_ZERO);
 
 		if (addr) {
 			/*
Index: linux/mm/bootmem.c
===================================================================
--- linux.orig/mm/bootmem.c
+++ linux/mm/bootmem.c
@@ -660,7 +660,7 @@ restart:
  * Returns NULL on failure.
  */
 void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
-					unsigned long goal)
+					unsigned long goal, u32 flags)
 {
 	unsigned long limit = 0;
 
@@ -705,7 +705,8 @@ void * __init __alloc_bootmem(unsigned l
 
 void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
 				unsigned long size, unsigned long align,
-				unsigned long goal, unsigned long limit)
+				unsigned long goal, unsigned long limit,
+				u32 flags)
 {
 	void *ptr;
 
@@ -734,12 +735,13 @@ again:
 }
 
 void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
-				   unsigned long align, unsigned long goal)
+			unsigned long align, unsigned long goal, u32 flags)
 {
 	if (WARN_ON_ONCE(slab_is_available()))
 		return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
 
-	return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
+	return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal,
+					     0, flags);
 }
 
 void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
@@ -748,7 +750,7 @@ void * __init ___alloc_bootmem_node(pg_d
 {
 	void *ptr;
 
-	ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
+	ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0, 0);
 	if (ptr)
 		return ptr;
 
Index: linux/mm/page_cgroup.c
===================================================================
--- linux.orig/mm/page_cgroup.c
+++ linux/mm/page_cgroup.c
@@ -55,7 +55,7 @@ static int __init alloc_node_page_cgroup
 	table_size = sizeof(struct page_cgroup) * nr_pages;
 
 	base = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
-			table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
+			table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS), 0);
 	if (!base)
 		return -ENOMEM;
 	NODE_DATA(nid)->node_page_cgroup = base;

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org">email@kvack.org</a>

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] mm, x86: no zeroing of hugetlbfs pages at boot
  2013-03-06 21:50 ` Cliff Wickman
@ 2013-03-10  5:55   ` Hillf Danton
  -1 siblings, 0 replies; 26+ messages in thread
From: Hillf Danton @ 2013-03-10  5:55 UTC (permalink / raw)
  To: Cliff Wickman; +Cc: linux-kernel, linux-mm, x86, wli

On Thu, Mar 7, 2013 at 5:50 AM, Cliff Wickman <cpw@sgi.com> wrote:
> From: Cliff Wickman <cpw@sgi.com>
>
> Allocating a large number of 1GB hugetlbfs pages at boot takes a
> very long time.
>
> Large system sites would at times like to allocate a very large amount of
> memory as 1GB pages.  They would put this on the kernel boot line:
>    default_hugepagesz=1G hugepagesz=1G hugepages=4096
> [Dynamic allocation of 1G pages is not an option, as zone pages only go
>  up to MAX_ORDER, and MAX_ORDER cannot exceed the section size.]
>
> Each page is zeroed as it is allocated, and all allocation is done by
> cpu 0, as this path is early in boot:
>       start_kernel
>         kernel_init
>           do_pre_smp_initcalls
>             hugetlb_init
>               hugetlb_init_hstates
>                 hugetlb_hstate_alloc_pages
>
> Zeroing remote (offnode) memory takes ~1GB/sec (and most memory is offnode
> on large numa systems).
> This estimate is approximate (it depends on core frequency & number of hops
> to remote memory) but should be within a factor of 2 on most systems.
> A benchmark attempting to reserve a TB for 1GB pages would thus require
> ~1000 seconds of boot time just for this allocating.  32TB would take 8 hours.
>
> I propose passing a flag to the early allocator to indicate that no zeroing
> of a page should be done.  The 'no zeroing' flag would have to be passed
> down this code path:
>

FYI: huge pages are cleared just after being allocated; see, for instance,
clear_huge_page() in hugetlb_no_page()

Hillf
>   hugetlb_hstate_alloc_pages
>     alloc_bootmem_huge_page
>       __alloc_bootmem_node_nopanic NO_ZERO  (nobootmem.c)
>         __alloc_memory_core_early  NO_ZERO
>           if (!(flags & NO_ZERO))
>             memset(ptr, 0, size);
>
> Or this path if CONFIG_NO_BOOTMEM is not set:
>
>   hugetlb_hstate_alloc_pages
>     alloc_bootmem_huge_page
>       __alloc_bootmem_node_nopanic  NO_ZERO  (bootmem.c)
>         alloc_bootmem_core          NO_ZERO
>           if (!(flags & NO_ZERO))
>             memset(region, 0, size);
>         __alloc_bootmem_nopanic     NO_ZERO
>           ___alloc_bootmem_nopanic  NO_ZERO
>             alloc_bootmem_core      NO_ZERO
>               if (!(flags & NO_ZERO))
>                 memset(region, 0, size);
>
> Signed-off-by: Cliff Wickman <cpw@sgi.com>
>
> ---
>  arch/x86/kernel/setup_percpu.c |    4 ++--
>  include/linux/bootmem.h        |   23 ++++++++++++++++-------
>  mm/bootmem.c                   |   12 +++++++-----
>  mm/hugetlb.c                   |    3 ++-
>  mm/nobootmem.c                 |   41 +++++++++++++++++++++++------------------
>  mm/page_cgroup.c               |    2 +-
>  mm/sparse.c                    |    2 +-
>  7 files changed, 52 insertions(+), 35 deletions(-)
>
> Index: linux/include/linux/bootmem.h
> ===================================================================
> --- linux.orig/include/linux/bootmem.h
> +++ linux/include/linux/bootmem.h
> @@ -8,6 +8,11 @@
>  #include <asm/dma.h>
>
>  /*
> + * allocation flags
> + */
> +#define NO_ZERO                0x00000001
> +
> +/*
>   *  simple boot-time physical memory area allocator.
>   */
>
> @@ -79,7 +84,8 @@ extern void *__alloc_bootmem(unsigned lo
>                              unsigned long goal);
>  extern void *__alloc_bootmem_nopanic(unsigned long size,
>                                      unsigned long align,
> -                                    unsigned long goal);
> +                                    unsigned long goal,
> +                                    u32 flags);
>  extern void *__alloc_bootmem_node(pg_data_t *pgdat,
>                                   unsigned long size,
>                                   unsigned long align,
> @@ -91,12 +97,14 @@ void *__alloc_bootmem_node_high(pg_data_
>  extern void *__alloc_bootmem_node_nopanic(pg_data_t *pgdat,
>                                   unsigned long size,
>                                   unsigned long align,
> -                                 unsigned long goal);
> +                                 unsigned long goal,
> +                                 u32 flags);
>  void *___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
>                                   unsigned long size,
>                                   unsigned long align,
>                                   unsigned long goal,
> -                                 unsigned long limit);
> +                                 unsigned long limit,
> +                                 u32 flags);
>  extern void *__alloc_bootmem_low(unsigned long size,
>                                  unsigned long align,
>                                  unsigned long goal);
> @@ -120,19 +128,20 @@ extern void *__alloc_bootmem_low_node(pg
>  #define alloc_bootmem_align(x, align) \
>         __alloc_bootmem(x, align, BOOTMEM_LOW_LIMIT)
>  #define alloc_bootmem_nopanic(x) \
> -       __alloc_bootmem_nopanic(x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT)
> +       __alloc_bootmem_nopanic(x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT, 0)
>  #define alloc_bootmem_pages(x) \
>         __alloc_bootmem(x, PAGE_SIZE, BOOTMEM_LOW_LIMIT)
>  #define alloc_bootmem_pages_nopanic(x) \
> -       __alloc_bootmem_nopanic(x, PAGE_SIZE, BOOTMEM_LOW_LIMIT)
> +       __alloc_bootmem_nopanic(x, PAGE_SIZE, BOOTMEM_LOW_LIMIT, 0)
>  #define alloc_bootmem_node(pgdat, x) \
>         __alloc_bootmem_node(pgdat, x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT)
>  #define alloc_bootmem_node_nopanic(pgdat, x) \
> -       __alloc_bootmem_node_nopanic(pgdat, x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT)
> +       __alloc_bootmem_node_nopanic(pgdat, x, SMP_CACHE_BYTES, \
> +                                    BOOTMEM_LOW_LIMIT, 0)
>  #define alloc_bootmem_pages_node(pgdat, x) \
>         __alloc_bootmem_node(pgdat, x, PAGE_SIZE, BOOTMEM_LOW_LIMIT)
>  #define alloc_bootmem_pages_node_nopanic(pgdat, x) \
> -       __alloc_bootmem_node_nopanic(pgdat, x, PAGE_SIZE, BOOTMEM_LOW_LIMIT)
> +       __alloc_bootmem_node_nopanic(pgdat, x, PAGE_SIZE, BOOTMEM_LOW_LIMIT, 0)
>
>  #define alloc_bootmem_low(x) \
>         __alloc_bootmem_low(x, SMP_CACHE_BYTES, 0)
> Index: linux/arch/x86/kernel/setup_percpu.c
> ===================================================================
> --- linux.orig/arch/x86/kernel/setup_percpu.c
> +++ linux/arch/x86/kernel/setup_percpu.c
> @@ -104,14 +104,14 @@ static void * __init pcpu_alloc_bootmem(
>         void *ptr;
>
>         if (!node_online(node) || !NODE_DATA(node)) {
> -               ptr = __alloc_bootmem_nopanic(size, align, goal);
> +               ptr = __alloc_bootmem_nopanic(size, align, goal, 0);
>                 pr_info("cpu %d has no node %d or node-local memory\n",
>                         cpu, node);
>                 pr_debug("per cpu data for cpu%d %lu bytes at %016lx\n",
>                          cpu, size, __pa(ptr));
>         } else {
>                 ptr = __alloc_bootmem_node_nopanic(NODE_DATA(node),
> -                                                  size, align, goal);
> +                                                  size, align, goal, 0);
>                 pr_debug("per cpu data for cpu%d %lu bytes on node%d at %016lx\n",
>                          cpu, size, node, __pa(ptr));
>         }
> Index: linux/mm/nobootmem.c
> ===================================================================
> --- linux.orig/mm/nobootmem.c
> +++ linux/mm/nobootmem.c
> @@ -33,7 +33,7 @@ unsigned long min_low_pfn;
>  unsigned long max_pfn;
>
>  static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
> -                                       u64 goal, u64 limit)
> +                                       u64 goal, u64 limit, u32 flags)
>  {
>         void *ptr;
>         u64 addr;
> @@ -46,7 +46,8 @@ static void * __init __alloc_memory_core
>                 return NULL;
>
>         ptr = phys_to_virt(addr);
> -       memset(ptr, 0, size);
> +       if (!(flags & NO_ZERO))
> +               memset(ptr, 0, size);
>         memblock_reserve(addr, size);
>         /*
>          * The min_count is set to 0 so that bootmem allocated blocks
> @@ -208,7 +209,8 @@ void __init free_bootmem(unsigned long a
>  static void * __init ___alloc_bootmem_nopanic(unsigned long size,
>                                         unsigned long align,
>                                         unsigned long goal,
> -                                       unsigned long limit)
> +                                       unsigned long limit,
> +                                       u32 flags)
>  {
>         void *ptr;
>
> @@ -217,7 +219,8 @@ static void * __init ___alloc_bootmem_no
>
>  restart:
>
> -       ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal, limit);
> +       ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal,
> +                                       limit, 0);
>
>         if (ptr)
>                 return ptr;
> @@ -244,17 +247,17 @@ restart:
>   * Returns NULL on failure.
>   */
>  void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
> -                                       unsigned long goal)
> +                                       unsigned long goal, u32 flags)
>  {
>         unsigned long limit = -1UL;
>
> -       return ___alloc_bootmem_nopanic(size, align, goal, limit);
> +       return ___alloc_bootmem_nopanic(size, align, goal, limit, flags);
>  }
>
>  static void * __init ___alloc_bootmem(unsigned long size, unsigned long align,
> -                                       unsigned long goal, unsigned long limit)
> +                       unsigned long goal, unsigned long limit, u32 flags)
>  {
> -       void *mem = ___alloc_bootmem_nopanic(size, align, goal, limit);
> +       void *mem = ___alloc_bootmem_nopanic(size, align, goal, limit, flags);
>
>         if (mem)
>                 return mem;
> @@ -284,25 +287,26 @@ void * __init __alloc_bootmem(unsigned l
>  {
>         unsigned long limit = -1UL;
>
> -       return ___alloc_bootmem(size, align, goal, limit);
> +       return ___alloc_bootmem(size, align, goal, limit, 0);
>  }
>
>  void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
>                                                    unsigned long size,
>                                                    unsigned long align,
>                                                    unsigned long goal,
> -                                                  unsigned long limit)
> +                                                  unsigned long limit,
> +                                                  u32 flags)
>  {
>         void *ptr;
>
>  again:
>         ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
> -                                       goal, limit);
> +                                       goal, limit, flags);
>         if (ptr)
>                 return ptr;
>
>         ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align,
> -                                       goal, limit);
> +                                       goal, limit, flags);
>         if (ptr)
>                 return ptr;
>
> @@ -315,12 +319,13 @@ again:
>  }
>
>  void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
> -                                  unsigned long align, unsigned long goal)
> +                       unsigned long align, unsigned long goal, u32 flags)
>  {
>         if (WARN_ON_ONCE(slab_is_available()))
>                 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
>
> -       return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
> +       return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal,
> +                       0, flags);
>  }
>
>  void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
> @@ -329,7 +334,7 @@ void * __init ___alloc_bootmem_node(pg_d
>  {
>         void *ptr;
>
> -       ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, limit);
> +       ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, limit, 0);
>         if (ptr)
>                 return ptr;
>
> @@ -354,7 +359,7 @@ void * __init ___alloc_bootmem_node(pg_d
>   * The function panics if the request can not be satisfied.
>   */
>  void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
> -                                  unsigned long align, unsigned long goal)
> +                       unsigned long align, unsigned long goal)
>  {
>         if (WARN_ON_ONCE(slab_is_available()))
>                 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
> @@ -388,7 +393,7 @@ void * __init __alloc_bootmem_node_high(
>  void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
>                                   unsigned long goal)
>  {
> -       return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT);
> +       return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT, 0);
>  }
>
>  void * __init __alloc_bootmem_low_nopanic(unsigned long size,
> @@ -396,7 +401,7 @@ void * __init __alloc_bootmem_low_nopani
>                                           unsigned long goal)
>  {
>         return ___alloc_bootmem_nopanic(size, align, goal,
> -                                       ARCH_LOW_ADDRESS_LIMIT);
> +                                       ARCH_LOW_ADDRESS_LIMIT, 0);
>  }
>
>  /**
> Index: linux/mm/sparse.c
> ===================================================================
> --- linux.orig/mm/sparse.c
> +++ linux/mm/sparse.c
> @@ -281,7 +281,7 @@ sparse_early_usemaps_alloc_pgdat_section
>         nid = early_pfn_to_nid(goal >> PAGE_SHIFT);
>  again:
>         p = ___alloc_bootmem_node_nopanic(NODE_DATA(nid), size,
> -                                         SMP_CACHE_BYTES, goal, limit);
> +                                         SMP_CACHE_BYTES, goal, limit, 0);
>         if (!p && limit) {
>                 limit = 0;
>                 goto again;
> Index: linux/mm/hugetlb.c
> ===================================================================
> --- linux.orig/mm/hugetlb.c
> +++ linux/mm/hugetlb.c
> @@ -1188,7 +1188,8 @@ int __weak alloc_bootmem_huge_page(struc
>                 addr = __alloc_bootmem_node_nopanic(
>                                 NODE_DATA(hstate_next_node_to_alloc(h,
>                                                 &node_states[N_MEMORY])),
> -                               huge_page_size(h), huge_page_size(h), 0);
> +                               huge_page_size(h), huge_page_size(h),
> +                               0, NO_ZERO);
>
>                 if (addr) {
>                         /*
> Index: linux/mm/bootmem.c
> ===================================================================
> --- linux.orig/mm/bootmem.c
> +++ linux/mm/bootmem.c
> @@ -660,7 +660,7 @@ restart:
>   * Returns NULL on failure.
>   */
>  void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
> -                                       unsigned long goal)
> +                                       unsigned long goal, u32 flags)
>  {
>         unsigned long limit = 0;
>
> @@ -705,7 +705,8 @@ void * __init __alloc_bootmem(unsigned l
>
>  void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
>                                 unsigned long size, unsigned long align,
> -                               unsigned long goal, unsigned long limit)
> +                               unsigned long goal, unsigned long limit,
> +                               u32 flags)
>  {
>         void *ptr;
>
> @@ -734,12 +735,13 @@ again:
>  }
>
>  void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
> -                                  unsigned long align, unsigned long goal)
> +                       unsigned long align, unsigned long goal, u32 flags)
>  {
>         if (WARN_ON_ONCE(slab_is_available()))
>                 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
>
> -       return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
> +       return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal,
> +                                            0, flags);
>  }
>
>  void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
> @@ -748,7 +750,7 @@ void * __init ___alloc_bootmem_node(pg_d
>  {
>         void *ptr;
>
> -       ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
> +       ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0, 0);
>         if (ptr)
>                 return ptr;
>
> Index: linux/mm/page_cgroup.c
> ===================================================================
> --- linux.orig/mm/page_cgroup.c
> +++ linux/mm/page_cgroup.c
> @@ -55,7 +55,7 @@ static int __init alloc_node_page_cgroup
>         table_size = sizeof(struct page_cgroup) * nr_pages;
>
>         base = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
> -                       table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
> +                       table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS), 0);
>         if (!base)
>                 return -ENOMEM;
>         NODE_DATA(nid)->node_page_cgroup = base;
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
>
>

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] mm, x86: no zeroing of hugetlbfs pages at boot
@ 2013-03-10  5:55   ` Hillf Danton
  0 siblings, 0 replies; 26+ messages in thread
From: Hillf Danton @ 2013-03-10  5:55 UTC (permalink / raw)
  To: Cliff Wickman; +Cc: linux-kernel, linux-mm, x86, wli

On Thu, Mar 7, 2013 at 5:50 AM, Cliff Wickman <cpw@sgi.com> wrote:
> From: Cliff Wickman <cpw@sgi.com>
>
> Allocating a large number of 1GB hugetlbfs pages at boot takes a
> very long time.
>
> Large system sites would at times like to allocate a very large amount of
> memory as 1GB pages.  They would put this on the kernel boot line:
>    default_hugepagesz=1G hugepagesz=1G hugepages=4096
> [Dynamic allocation of 1G pages is not an option, as zone pages only go
>  up to MAX_ORDER, and MAX_ORDER cannot exceed the section size.]
>
> Each page is zeroed as it is allocated, and all allocation is done by
> cpu 0, as this path is early in boot:
>       start_kernel
>         kernel_init
>           do_pre_smp_initcalls
>             hugetlb_init
>               hugetlb_init_hstates
>                 hugetlb_hstate_alloc_pages
>
> Zeroing remote (offnode) memory takes ~1GB/sec (and most memory is offnode
> on large numa systems).
> This estimate is approximate (it depends on core frequency & number of hops
> to remote memory) but should be within a factor of 2 on most systems.
> A benchmark attempting to reserve a TB for 1GB pages would thus require
> ~1000 seconds of boot time just for this allocating.  32TB would take 8 hours.
>
> I propose passing a flag to the early allocator to indicate that no zeroing
> of a page should be done.  The 'no zeroing' flag would have to be passed
> down this code path:
>

FYI: huge pages are cleared just after being allocated; see, for instance,
clear_huge_page() in hugetlb_no_page()

Hillf
>   hugetlb_hstate_alloc_pages
>     alloc_bootmem_huge_page
>       __alloc_bootmem_node_nopanic NO_ZERO  (nobootmem.c)
>         __alloc_memory_core_early  NO_ZERO
>           if (!(flags & NO_ZERO))
>             memset(ptr, 0, size);
>
> Or this path if CONFIG_NO_BOOTMEM is not set:
>
>   hugetlb_hstate_alloc_pages
>     alloc_bootmem_huge_page
>       __alloc_bootmem_node_nopanic  NO_ZERO  (bootmem.c)
>         alloc_bootmem_core          NO_ZERO
>           if (!(flags & NO_ZERO))
>             memset(region, 0, size);
>         __alloc_bootmem_nopanic     NO_ZERO
>           ___alloc_bootmem_nopanic  NO_ZERO
>             alloc_bootmem_core      NO_ZERO
>               if (!(flags & NO_ZERO))
>                 memset(region, 0, size);
>
> Signed-off-by: Cliff Wickman <cpw@sgi.com>
>
> ---
>  arch/x86/kernel/setup_percpu.c |    4 ++--
>  include/linux/bootmem.h        |   23 ++++++++++++++++-------
>  mm/bootmem.c                   |   12 +++++++-----
>  mm/hugetlb.c                   |    3 ++-
>  mm/nobootmem.c                 |   41 +++++++++++++++++++++++------------------
>  mm/page_cgroup.c               |    2 +-
>  mm/sparse.c                    |    2 +-
>  7 files changed, 52 insertions(+), 35 deletions(-)
>
> Index: linux/include/linux/bootmem.h
> ===================================================================
> --- linux.orig/include/linux/bootmem.h
> +++ linux/include/linux/bootmem.h
> @@ -8,6 +8,11 @@
>  #include <asm/dma.h>
>
>  /*
> + * allocation flags
> + */
> +#define NO_ZERO                0x00000001
> +
> +/*
>   *  simple boot-time physical memory area allocator.
>   */
>
> @@ -79,7 +84,8 @@ extern void *__alloc_bootmem(unsigned lo
>                              unsigned long goal);
>  extern void *__alloc_bootmem_nopanic(unsigned long size,
>                                      unsigned long align,
> -                                    unsigned long goal);
> +                                    unsigned long goal,
> +                                    u32 flags);
>  extern void *__alloc_bootmem_node(pg_data_t *pgdat,
>                                   unsigned long size,
>                                   unsigned long align,
> @@ -91,12 +97,14 @@ void *__alloc_bootmem_node_high(pg_data_
>  extern void *__alloc_bootmem_node_nopanic(pg_data_t *pgdat,
>                                   unsigned long size,
>                                   unsigned long align,
> -                                 unsigned long goal);
> +                                 unsigned long goal,
> +                                 u32 flags);
>  void *___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
>                                   unsigned long size,
>                                   unsigned long align,
>                                   unsigned long goal,
> -                                 unsigned long limit);
> +                                 unsigned long limit,
> +                                 u32 flags);
>  extern void *__alloc_bootmem_low(unsigned long size,
>                                  unsigned long align,
>                                  unsigned long goal);
> @@ -120,19 +128,20 @@ extern void *__alloc_bootmem_low_node(pg
>  #define alloc_bootmem_align(x, align) \
>         __alloc_bootmem(x, align, BOOTMEM_LOW_LIMIT)
>  #define alloc_bootmem_nopanic(x) \
> -       __alloc_bootmem_nopanic(x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT)
> +       __alloc_bootmem_nopanic(x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT, 0)
>  #define alloc_bootmem_pages(x) \
>         __alloc_bootmem(x, PAGE_SIZE, BOOTMEM_LOW_LIMIT)
>  #define alloc_bootmem_pages_nopanic(x) \
> -       __alloc_bootmem_nopanic(x, PAGE_SIZE, BOOTMEM_LOW_LIMIT)
> +       __alloc_bootmem_nopanic(x, PAGE_SIZE, BOOTMEM_LOW_LIMIT, 0)
>  #define alloc_bootmem_node(pgdat, x) \
>         __alloc_bootmem_node(pgdat, x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT)
>  #define alloc_bootmem_node_nopanic(pgdat, x) \
> -       __alloc_bootmem_node_nopanic(pgdat, x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT)
> +       __alloc_bootmem_node_nopanic(pgdat, x, SMP_CACHE_BYTES, \
> +                                    BOOTMEM_LOW_LIMIT, 0)
>  #define alloc_bootmem_pages_node(pgdat, x) \
>         __alloc_bootmem_node(pgdat, x, PAGE_SIZE, BOOTMEM_LOW_LIMIT)
>  #define alloc_bootmem_pages_node_nopanic(pgdat, x) \
> -       __alloc_bootmem_node_nopanic(pgdat, x, PAGE_SIZE, BOOTMEM_LOW_LIMIT)
> +       __alloc_bootmem_node_nopanic(pgdat, x, PAGE_SIZE, BOOTMEM_LOW_LIMIT, 0)
>
>  #define alloc_bootmem_low(x) \
>         __alloc_bootmem_low(x, SMP_CACHE_BYTES, 0)
> Index: linux/arch/x86/kernel/setup_percpu.c
> ===================================================================
> --- linux.orig/arch/x86/kernel/setup_percpu.c
> +++ linux/arch/x86/kernel/setup_percpu.c
> @@ -104,14 +104,14 @@ static void * __init pcpu_alloc_bootmem(
>         void *ptr;
>
>         if (!node_online(node) || !NODE_DATA(node)) {
> -               ptr = __alloc_bootmem_nopanic(size, align, goal);
> +               ptr = __alloc_bootmem_nopanic(size, align, goal, 0);
>                 pr_info("cpu %d has no node %d or node-local memory\n",
>                         cpu, node);
>                 pr_debug("per cpu data for cpu%d %lu bytes at %016lx\n",
>                          cpu, size, __pa(ptr));
>         } else {
>                 ptr = __alloc_bootmem_node_nopanic(NODE_DATA(node),
> -                                                  size, align, goal);
> +                                                  size, align, goal, 0);
>                 pr_debug("per cpu data for cpu%d %lu bytes on node%d at %016lx\n",
>                          cpu, size, node, __pa(ptr));
>         }
> Index: linux/mm/nobootmem.c
> ===================================================================
> --- linux.orig/mm/nobootmem.c
> +++ linux/mm/nobootmem.c
> @@ -33,7 +33,7 @@ unsigned long min_low_pfn;
>  unsigned long max_pfn;
>
>  static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
> -                                       u64 goal, u64 limit)
> +                                       u64 goal, u64 limit, u32 flags)
>  {
>         void *ptr;
>         u64 addr;
> @@ -46,7 +46,8 @@ static void * __init __alloc_memory_core
>                 return NULL;
>
>         ptr = phys_to_virt(addr);
> -       memset(ptr, 0, size);
> +       if (!(flags & NO_ZERO))
> +               memset(ptr, 0, size);
>         memblock_reserve(addr, size);
>         /*
>          * The min_count is set to 0 so that bootmem allocated blocks
> @@ -208,7 +209,8 @@ void __init free_bootmem(unsigned long a
>  static void * __init ___alloc_bootmem_nopanic(unsigned long size,
>                                         unsigned long align,
>                                         unsigned long goal,
> -                                       unsigned long limit)
> +                                       unsigned long limit,
> +                                       u32 flags)
>  {
>         void *ptr;
>
> @@ -217,7 +219,8 @@ static void * __init ___alloc_bootmem_no
>
>  restart:
>
> -       ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal, limit);
> +       ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal,
> +                                       limit, 0);
>
>         if (ptr)
>                 return ptr;
> @@ -244,17 +247,17 @@ restart:
>   * Returns NULL on failure.
>   */
>  void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
> -                                       unsigned long goal)
> +                                       unsigned long goal, u32 flags)
>  {
>         unsigned long limit = -1UL;
>
> -       return ___alloc_bootmem_nopanic(size, align, goal, limit);
> +       return ___alloc_bootmem_nopanic(size, align, goal, limit, flags);
>  }
>
>  static void * __init ___alloc_bootmem(unsigned long size, unsigned long align,
> -                                       unsigned long goal, unsigned long limit)
> +                       unsigned long goal, unsigned long limit, u32 flags)
>  {
> -       void *mem = ___alloc_bootmem_nopanic(size, align, goal, limit);
> +       void *mem = ___alloc_bootmem_nopanic(size, align, goal, limit, flags);
>
>         if (mem)
>                 return mem;
> @@ -284,25 +287,26 @@ void * __init __alloc_bootmem(unsigned l
>  {
>         unsigned long limit = -1UL;
>
> -       return ___alloc_bootmem(size, align, goal, limit);
> +       return ___alloc_bootmem(size, align, goal, limit, 0);
>  }
>
>  void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
>                                                    unsigned long size,
>                                                    unsigned long align,
>                                                    unsigned long goal,
> -                                                  unsigned long limit)
> +                                                  unsigned long limit,
> +                                                  u32 flags)
>  {
>         void *ptr;
>
>  again:
>         ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
> -                                       goal, limit);
> +                                       goal, limit, flags);
>         if (ptr)
>                 return ptr;
>
>         ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align,
> -                                       goal, limit);
> +                                       goal, limit, flags);
>         if (ptr)
>                 return ptr;
>
> @@ -315,12 +319,13 @@ again:
>  }
>
>  void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
> -                                  unsigned long align, unsigned long goal)
> +                       unsigned long align, unsigned long goal, u32 flags)
>  {
>         if (WARN_ON_ONCE(slab_is_available()))
>                 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
>
> -       return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
> +       return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal,
> +                       0, flags);
>  }
>
>  void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
> @@ -329,7 +334,7 @@ void * __init ___alloc_bootmem_node(pg_d
>  {
>         void *ptr;
>
> -       ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, limit);
> +       ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, limit, 0);
>         if (ptr)
>                 return ptr;
>
> @@ -354,7 +359,7 @@ void * __init ___alloc_bootmem_node(pg_d
>   * The function panics if the request can not be satisfied.
>   */
>  void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
> -                                  unsigned long align, unsigned long goal)
> +                       unsigned long align, unsigned long goal)
>  {
>         if (WARN_ON_ONCE(slab_is_available()))
>                 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
> @@ -388,7 +393,7 @@ void * __init __alloc_bootmem_node_high(
>  void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
>                                   unsigned long goal)
>  {
> -       return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT);
> +       return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT, 0);
>  }
>
>  void * __init __alloc_bootmem_low_nopanic(unsigned long size,
> @@ -396,7 +401,7 @@ void * __init __alloc_bootmem_low_nopani
>                                           unsigned long goal)
>  {
>         return ___alloc_bootmem_nopanic(size, align, goal,
> -                                       ARCH_LOW_ADDRESS_LIMIT);
> +                                       ARCH_LOW_ADDRESS_LIMIT, 0);
>  }
>
>  /**
> Index: linux/mm/sparse.c
> ===================================================================
> --- linux.orig/mm/sparse.c
> +++ linux/mm/sparse.c
> @@ -281,7 +281,7 @@ sparse_early_usemaps_alloc_pgdat_section
>         nid = early_pfn_to_nid(goal >> PAGE_SHIFT);
>  again:
>         p = ___alloc_bootmem_node_nopanic(NODE_DATA(nid), size,
> -                                         SMP_CACHE_BYTES, goal, limit);
> +                                         SMP_CACHE_BYTES, goal, limit, 0);
>         if (!p && limit) {
>                 limit = 0;
>                 goto again;
> Index: linux/mm/hugetlb.c
> ===================================================================
> --- linux.orig/mm/hugetlb.c
> +++ linux/mm/hugetlb.c
> @@ -1188,7 +1188,8 @@ int __weak alloc_bootmem_huge_page(struc
>                 addr = __alloc_bootmem_node_nopanic(
>                                 NODE_DATA(hstate_next_node_to_alloc(h,
>                                                 &node_states[N_MEMORY])),
> -                               huge_page_size(h), huge_page_size(h), 0);
> +                               huge_page_size(h), huge_page_size(h),
> +                               0, NO_ZERO);
>
>                 if (addr) {
>                         /*
> Index: linux/mm/bootmem.c
> ===================================================================
> --- linux.orig/mm/bootmem.c
> +++ linux/mm/bootmem.c
> @@ -660,7 +660,7 @@ restart:
>   * Returns NULL on failure.
>   */
>  void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
> -                                       unsigned long goal)
> +                                       unsigned long goal, u32 flags)
>  {
>         unsigned long limit = 0;
>
> @@ -705,7 +705,8 @@ void * __init __alloc_bootmem(unsigned l
>
>  void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
>                                 unsigned long size, unsigned long align,
> -                               unsigned long goal, unsigned long limit)
> +                               unsigned long goal, unsigned long limit,
> +                               u32 flags)
>  {
>         void *ptr;
>
> @@ -734,12 +735,13 @@ again:
>  }
>
>  void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
> -                                  unsigned long align, unsigned long goal)
> +                       unsigned long align, unsigned long goal, u32 flags)
>  {
>         if (WARN_ON_ONCE(slab_is_available()))
>                 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
>
> -       return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
> +       return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal,
> +                                            0, flags);
>  }
>
>  void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
> @@ -748,7 +750,7 @@ void * __init ___alloc_bootmem_node(pg_d
>  {
>         void *ptr;
>
> -       ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
> +       ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0, 0);
>         if (ptr)
>                 return ptr;
>
> Index: linux/mm/page_cgroup.c
> ===================================================================
> --- linux.orig/mm/page_cgroup.c
> +++ linux/mm/page_cgroup.c
> @@ -55,7 +55,7 @@ static int __init alloc_node_page_cgroup
>         table_size = sizeof(struct page_cgroup) * nr_pages;
>
>         base = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
> -                       table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
> +                       table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS), 0);
>         if (!base)
>                 return -ENOMEM;
>         NODE_DATA(nid)->node_page_cgroup = base;
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
>
>

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org"> dont@kvack.org </a>

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] mm, x86: no zeroing of hugetlbfs pages at boot
  2013-03-10  5:55   ` Hillf Danton
@ 2013-03-11 12:32     ` Cliff Wickman
  -1 siblings, 0 replies; 26+ messages in thread
From: Cliff Wickman @ 2013-03-11 12:32 UTC (permalink / raw)
  To: Hillf Danton; +Cc: linux-kernel, linux-mm, x86, wli

On Sun, Mar 10, 2013 at 01:55:10PM +0800, Hillf Danton wrote:
> On Thu, Mar 7, 2013 at 5:50 AM, Cliff Wickman <cpw@sgi.com> wrote:
> > From: Cliff Wickman <cpw@sgi.com>
> >
> > Allocating a large number of 1GB hugetlbfs pages at boot takes a
> > very long time.
> >
> > Large system sites would at times like to allocate a very large amount of
> > memory as 1GB pages.  They would put this on the kernel boot line:
> >    default_hugepagesz=1G hugepagesz=1G hugepages=4096
> > [Dynamic allocation of 1G pages is not an option, as zone pages only go
> >  up to MAX_ORDER, and MAX_ORDER cannot exceed the section size.]
> >
> > Each page is zeroed as it is allocated, and all allocation is done by
> > cpu 0, as this path is early in boot:
> >       start_kernel
> >         kernel_init
> >           do_pre_smp_initcalls
> >             hugetlb_init
> >               hugetlb_init_hstates
> >                 hugetlb_hstate_alloc_pages
> >
> > Zeroing remote (offnode) memory takes ~1GB/sec (and most memory is offnode
> > on large numa systems).
> > This estimate is approximate (it depends on core frequency & number of hops
> > to remote memory) but should be within a factor of 2 on most systems.
> > A benchmark attempting to reserve a TB for 1GB pages would thus require
> > ~1000 seconds of boot time just for this allocation.  32TB would take 8 hours.
> >
> > I propose passing a flag to the early allocator to indicate that no zeroing
> > of a page should be done.  The 'no zeroing' flag would have to be passed
> > down this code path:
> >
> 
> FYI: huge pages are cleared just after allocated, for instance,
> clear_huge_page() in hugetlb_no_page()
> 
> Hillf

Yes, I should have added that comment to the changelog.  And because
this is true there is no need to clear a huge page at boot time.

-Cliff
> >   hugetlb_hstate_alloc_pages
> >     alloc_bootmem_huge_page
> >       __alloc_bootmem_node_nopanic NO_ZERO  (nobootmem.c)
> >         __alloc_memory_core_early  NO_ZERO
> >           if (!(flags & NO_ZERO))
> >             memset(ptr, 0, size);
> >
> > Or this path if CONFIG_NO_BOOTMEM is not set:
> >
> >   hugetlb_hstate_alloc_pages
> >     alloc_bootmem_huge_page
> >       __alloc_bootmem_node_nopanic  NO_ZERO  (bootmem.c)
> >         alloc_bootmem_core          NO_ZERO
> >           if (!(flags & NO_ZERO))
> >             memset(region, 0, size);
> >         __alloc_bootmem_nopanic     NO_ZERO
> >           ___alloc_bootmem_nopanic  NO_ZERO
> >             alloc_bootmem_core      NO_ZERO
> >               if (!(flags & NO_ZERO))
> >                 memset(region, 0, size);
> >
> > Signed-off-by: Cliff Wickman <cpw@sgi.com>
> >
> > ---
> >  arch/x86/kernel/setup_percpu.c |    4 ++--
> >  include/linux/bootmem.h        |   23 ++++++++++++++++-------
> >  mm/bootmem.c                   |   12 +++++++-----
> >  mm/hugetlb.c                   |    3 ++-
> >  mm/nobootmem.c                 |   41 +++++++++++++++++++++++------------------
> >  mm/page_cgroup.c               |    2 +-
> >  mm/sparse.c                    |    2 +-
> >  7 files changed, 52 insertions(+), 35 deletions(-)
> >
> > Index: linux/include/linux/bootmem.h
> > ===================================================================
> > --- linux.orig/include/linux/bootmem.h
> > +++ linux/include/linux/bootmem.h
> > @@ -8,6 +8,11 @@
> >  #include <asm/dma.h>
> >
> >  /*
> > + * allocation flags
> > + */
> > +#define NO_ZERO                0x00000001
> > +
> > +/*
> >   *  simple boot-time physical memory area allocator.
> >   */
> >
> > @@ -79,7 +84,8 @@ extern void *__alloc_bootmem(unsigned lo
> >                              unsigned long goal);
> >  extern void *__alloc_bootmem_nopanic(unsigned long size,
> >                                      unsigned long align,
> > -                                    unsigned long goal);
> > +                                    unsigned long goal,
> > +                                    u32 flags);
> >  extern void *__alloc_bootmem_node(pg_data_t *pgdat,
> >                                   unsigned long size,
> >                                   unsigned long align,
> > @@ -91,12 +97,14 @@ void *__alloc_bootmem_node_high(pg_data_
> >  extern void *__alloc_bootmem_node_nopanic(pg_data_t *pgdat,
> >                                   unsigned long size,
> >                                   unsigned long align,
> > -                                 unsigned long goal);
> > +                                 unsigned long goal,
> > +                                 u32 flags);
> >  void *___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
> >                                   unsigned long size,
> >                                   unsigned long align,
> >                                   unsigned long goal,
> > -                                 unsigned long limit);
> > +                                 unsigned long limit,
> > +                                 u32 flags);
> >  extern void *__alloc_bootmem_low(unsigned long size,
> >                                  unsigned long align,
> >                                  unsigned long goal);
> > @@ -120,19 +128,20 @@ extern void *__alloc_bootmem_low_node(pg
> >  #define alloc_bootmem_align(x, align) \
> >         __alloc_bootmem(x, align, BOOTMEM_LOW_LIMIT)
> >  #define alloc_bootmem_nopanic(x) \
> > -       __alloc_bootmem_nopanic(x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT)
> > +       __alloc_bootmem_nopanic(x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT, 0)
> >  #define alloc_bootmem_pages(x) \
> >         __alloc_bootmem(x, PAGE_SIZE, BOOTMEM_LOW_LIMIT)
> >  #define alloc_bootmem_pages_nopanic(x) \
> > -       __alloc_bootmem_nopanic(x, PAGE_SIZE, BOOTMEM_LOW_LIMIT)
> > +       __alloc_bootmem_nopanic(x, PAGE_SIZE, BOOTMEM_LOW_LIMIT, 0)
> >  #define alloc_bootmem_node(pgdat, x) \
> >         __alloc_bootmem_node(pgdat, x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT)
> >  #define alloc_bootmem_node_nopanic(pgdat, x) \
> > -       __alloc_bootmem_node_nopanic(pgdat, x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT)
> > +       __alloc_bootmem_node_nopanic(pgdat, x, SMP_CACHE_BYTES, \
> > +                                    BOOTMEM_LOW_LIMIT, 0)
> >  #define alloc_bootmem_pages_node(pgdat, x) \
> >         __alloc_bootmem_node(pgdat, x, PAGE_SIZE, BOOTMEM_LOW_LIMIT)
> >  #define alloc_bootmem_pages_node_nopanic(pgdat, x) \
> > -       __alloc_bootmem_node_nopanic(pgdat, x, PAGE_SIZE, BOOTMEM_LOW_LIMIT)
> > +       __alloc_bootmem_node_nopanic(pgdat, x, PAGE_SIZE, BOOTMEM_LOW_LIMIT, 0)
> >
> >  #define alloc_bootmem_low(x) \
> >         __alloc_bootmem_low(x, SMP_CACHE_BYTES, 0)
> > Index: linux/arch/x86/kernel/setup_percpu.c
> > ===================================================================
> > --- linux.orig/arch/x86/kernel/setup_percpu.c
> > +++ linux/arch/x86/kernel/setup_percpu.c
> > @@ -104,14 +104,14 @@ static void * __init pcpu_alloc_bootmem(
> >         void *ptr;
> >
> >         if (!node_online(node) || !NODE_DATA(node)) {
> > -               ptr = __alloc_bootmem_nopanic(size, align, goal);
> > +               ptr = __alloc_bootmem_nopanic(size, align, goal, 0);
> >                 pr_info("cpu %d has no node %d or node-local memory\n",
> >                         cpu, node);
> >                 pr_debug("per cpu data for cpu%d %lu bytes at %016lx\n",
> >                          cpu, size, __pa(ptr));
> >         } else {
> >                 ptr = __alloc_bootmem_node_nopanic(NODE_DATA(node),
> > -                                                  size, align, goal);
> > +                                                  size, align, goal, 0);
> >                 pr_debug("per cpu data for cpu%d %lu bytes on node%d at %016lx\n",
> >                          cpu, size, node, __pa(ptr));
> >         }
> > Index: linux/mm/nobootmem.c
> > ===================================================================
> > --- linux.orig/mm/nobootmem.c
> > +++ linux/mm/nobootmem.c
> > @@ -33,7 +33,7 @@ unsigned long min_low_pfn;
> >  unsigned long max_pfn;
> >
> >  static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
> > -                                       u64 goal, u64 limit)
> > +                                       u64 goal, u64 limit, u32 flags)
> >  {
> >         void *ptr;
> >         u64 addr;
> > @@ -46,7 +46,8 @@ static void * __init __alloc_memory_core
> >                 return NULL;
> >
> >         ptr = phys_to_virt(addr);
> > -       memset(ptr, 0, size);
> > +       if (!(flags & NO_ZERO))
> > +               memset(ptr, 0, size);
> >         memblock_reserve(addr, size);
> >         /*
> >          * The min_count is set to 0 so that bootmem allocated blocks
> > @@ -208,7 +209,8 @@ void __init free_bootmem(unsigned long a
> >  static void * __init ___alloc_bootmem_nopanic(unsigned long size,
> >                                         unsigned long align,
> >                                         unsigned long goal,
> > -                                       unsigned long limit)
> > +                                       unsigned long limit,
> > +                                       u32 flags)
> >  {
> >         void *ptr;
> >
> > @@ -217,7 +219,8 @@ static void * __init ___alloc_bootmem_no
> >
> >  restart:
> >
> > -       ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal, limit);
> > +       ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal,
> > +                                       limit, 0);
> >
> >         if (ptr)
> >                 return ptr;
> > @@ -244,17 +247,17 @@ restart:
> >   * Returns NULL on failure.
> >   */
> >  void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
> > -                                       unsigned long goal)
> > +                                       unsigned long goal, u32 flags)
> >  {
> >         unsigned long limit = -1UL;
> >
> > -       return ___alloc_bootmem_nopanic(size, align, goal, limit);
> > +       return ___alloc_bootmem_nopanic(size, align, goal, limit, flags);
> >  }
> >
> >  static void * __init ___alloc_bootmem(unsigned long size, unsigned long align,
> > -                                       unsigned long goal, unsigned long limit)
> > +                       unsigned long goal, unsigned long limit, u32 flags)
> >  {
> > -       void *mem = ___alloc_bootmem_nopanic(size, align, goal, limit);
> > +       void *mem = ___alloc_bootmem_nopanic(size, align, goal, limit, flags);
> >
> >         if (mem)
> >                 return mem;
> > @@ -284,25 +287,26 @@ void * __init __alloc_bootmem(unsigned l
> >  {
> >         unsigned long limit = -1UL;
> >
> > -       return ___alloc_bootmem(size, align, goal, limit);
> > +       return ___alloc_bootmem(size, align, goal, limit, 0);
> >  }
> >
> >  void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
> >                                                    unsigned long size,
> >                                                    unsigned long align,
> >                                                    unsigned long goal,
> > -                                                  unsigned long limit)
> > +                                                  unsigned long limit,
> > +                                                  u32 flags)
> >  {
> >         void *ptr;
> >
> >  again:
> >         ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
> > -                                       goal, limit);
> > +                                       goal, limit, flags);
> >         if (ptr)
> >                 return ptr;
> >
> >         ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align,
> > -                                       goal, limit);
> > +                                       goal, limit, flags);
> >         if (ptr)
> >                 return ptr;
> >
> > @@ -315,12 +319,13 @@ again:
> >  }
> >
> >  void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
> > -                                  unsigned long align, unsigned long goal)
> > +                       unsigned long align, unsigned long goal, u32 flags)
> >  {
> >         if (WARN_ON_ONCE(slab_is_available()))
> >                 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
> >
> > -       return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
> > +       return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal,
> > +                       0, flags);
> >  }
> >
> >  void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
> > @@ -329,7 +334,7 @@ void * __init ___alloc_bootmem_node(pg_d
> >  {
> >         void *ptr;
> >
> > -       ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, limit);
> > +       ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, limit, 0);
> >         if (ptr)
> >                 return ptr;
> >
> > @@ -354,7 +359,7 @@ void * __init ___alloc_bootmem_node(pg_d
> >   * The function panics if the request can not be satisfied.
> >   */
> >  void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
> > -                                  unsigned long align, unsigned long goal)
> > +                       unsigned long align, unsigned long goal)
> >  {
> >         if (WARN_ON_ONCE(slab_is_available()))
> >                 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
> > @@ -388,7 +393,7 @@ void * __init __alloc_bootmem_node_high(
> >  void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
> >                                   unsigned long goal)
> >  {
> > -       return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT);
> > +       return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT, 0);
> >  }
> >
> >  void * __init __alloc_bootmem_low_nopanic(unsigned long size,
> > @@ -396,7 +401,7 @@ void * __init __alloc_bootmem_low_nopani
> >                                           unsigned long goal)
> >  {
> >         return ___alloc_bootmem_nopanic(size, align, goal,
> > -                                       ARCH_LOW_ADDRESS_LIMIT);
> > +                                       ARCH_LOW_ADDRESS_LIMIT, 0);
> >  }
> >
> >  /**
> > Index: linux/mm/sparse.c
> > ===================================================================
> > --- linux.orig/mm/sparse.c
> > +++ linux/mm/sparse.c
> > @@ -281,7 +281,7 @@ sparse_early_usemaps_alloc_pgdat_section
> >         nid = early_pfn_to_nid(goal >> PAGE_SHIFT);
> >  again:
> >         p = ___alloc_bootmem_node_nopanic(NODE_DATA(nid), size,
> > -                                         SMP_CACHE_BYTES, goal, limit);
> > +                                         SMP_CACHE_BYTES, goal, limit, 0);
> >         if (!p && limit) {
> >                 limit = 0;
> >                 goto again;
> > Index: linux/mm/hugetlb.c
> > ===================================================================
> > --- linux.orig/mm/hugetlb.c
> > +++ linux/mm/hugetlb.c
> > @@ -1188,7 +1188,8 @@ int __weak alloc_bootmem_huge_page(struc
> >                 addr = __alloc_bootmem_node_nopanic(
> >                                 NODE_DATA(hstate_next_node_to_alloc(h,
> >                                                 &node_states[N_MEMORY])),
> > -                               huge_page_size(h), huge_page_size(h), 0);
> > +                               huge_page_size(h), huge_page_size(h),
> > +                               0, NO_ZERO);
> >
> >                 if (addr) {
> >                         /*
> > Index: linux/mm/bootmem.c
> > ===================================================================
> > --- linux.orig/mm/bootmem.c
> > +++ linux/mm/bootmem.c
> > @@ -660,7 +660,7 @@ restart:
> >   * Returns NULL on failure.
> >   */
> >  void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
> > -                                       unsigned long goal)
> > +                                       unsigned long goal, u32 flags)
> >  {
> >         unsigned long limit = 0;
> >
> > @@ -705,7 +705,8 @@ void * __init __alloc_bootmem(unsigned l
> >
> >  void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
> >                                 unsigned long size, unsigned long align,
> > -                               unsigned long goal, unsigned long limit)
> > +                               unsigned long goal, unsigned long limit,
> > +                               u32 flags)
> >  {
> >         void *ptr;
> >
> > @@ -734,12 +735,13 @@ again:
> >  }
> >
> >  void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
> > -                                  unsigned long align, unsigned long goal)
> > +                       unsigned long align, unsigned long goal, u32 flags)
> >  {
> >         if (WARN_ON_ONCE(slab_is_available()))
> >                 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
> >
> > -       return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
> > +       return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal,
> > +                                            0, flags);
> >  }
> >
> >  void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
> > @@ -748,7 +750,7 @@ void * __init ___alloc_bootmem_node(pg_d
> >  {
> >         void *ptr;
> >
> > -       ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
> > +       ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0, 0);
> >         if (ptr)
> >                 return ptr;
> >
> > Index: linux/mm/page_cgroup.c
> > ===================================================================
> > --- linux.orig/mm/page_cgroup.c
> > +++ linux/mm/page_cgroup.c
> > @@ -55,7 +55,7 @@ static int __init alloc_node_page_cgroup
> >         table_size = sizeof(struct page_cgroup) * nr_pages;
> >
> >         base = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
> > -                       table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
> > +                       table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS), 0);
> >         if (!base)
> >                 return -ENOMEM;
> >         NODE_DATA(nid)->node_page_cgroup = base;
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > Please read the FAQ at  http://www.tux.org/lkml/
> >
> >

-- 
Cliff Wickman
SGI
cpw@sgi.com
(651) 683-3824

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] mm, x86: no zeroing of hugetlbfs pages at boot
@ 2013-03-11 12:32     ` Cliff Wickman
  0 siblings, 0 replies; 26+ messages in thread
From: Cliff Wickman @ 2013-03-11 12:32 UTC (permalink / raw)
  To: Hillf Danton; +Cc: linux-kernel, linux-mm, x86, wli

On Sun, Mar 10, 2013 at 01:55:10PM +0800, Hillf Danton wrote:
> On Thu, Mar 7, 2013 at 5:50 AM, Cliff Wickman <cpw@sgi.com> wrote:
> > From: Cliff Wickman <cpw@sgi.com>
> >
> > Allocating a large number of 1GB hugetlbfs pages at boot takes a
> > very long time.
> >
> > Large system sites would at times like to allocate a very large amount of
> > memory as 1GB pages.  They would put this on the kernel boot line:
> >    default_hugepagesz=1G hugepagesz=1G hugepages=4096
> > [Dynamic allocation of 1G pages is not an option, as zone pages only go
> >  up to MAX_ORDER, and MAX_ORDER cannot exceed the section size.]
> >
> > Each page is zeroed as it is allocated, and all allocation is done by
> > cpu 0, as this path is early in boot:
> >       start_kernel
> >         kernel_init
> >           do_pre_smp_initcalls
> >             hugetlb_init
> >               hugetlb_init_hstates
> >                 hugetlb_hstate_alloc_pages
> >
> > Zeroing remote (offnode) memory takes ~1GB/sec (and most memory is offnode
> > on large numa systems).
> > This estimate is approximate (it depends on core frequency & number of hops
> > to remote memory) but should be within a factor of 2 on most systems.
> > A benchmark attempting to reserve a TB for 1GB pages would thus require
> > ~1000 seconds of boot time just for this allocation.  32TB would take 8 hours.
> >
> > I propose passing a flag to the early allocator to indicate that no zeroing
> > of a page should be done.  The 'no zeroing' flag would have to be passed
> > down this code path:
> >
> 
> FYI: huge pages are cleared just after allocated, for instance,
> clear_huge_page() in hugetlb_no_page()
> 
> Hillf

Yes, I should have added that comment to the changelog.  And because
this is true there is no need to clear a huge page at boot time.

-Cliff
> >   hugetlb_hstate_alloc_pages
> >     alloc_bootmem_huge_page
> >       __alloc_bootmem_node_nopanic NO_ZERO  (nobootmem.c)
> >         __alloc_memory_core_early  NO_ZERO
> >           if (!(flags & NO_ZERO))
> >             memset(ptr, 0, size);
> >
> > Or this path if CONFIG_NO_BOOTMEM is not set:
> >
> >   hugetlb_hstate_alloc_pages
> >     alloc_bootmem_huge_page
> >       __alloc_bootmem_node_nopanic  NO_ZERO  (bootmem.c)
> >         alloc_bootmem_core          NO_ZERO
> >           if (!(flags & NO_ZERO))
> >             memset(region, 0, size);
> >         __alloc_bootmem_nopanic     NO_ZERO
> >           ___alloc_bootmem_nopanic  NO_ZERO
> >             alloc_bootmem_core      NO_ZERO
> >               if (!(flags & NO_ZERO))
> >                 memset(region, 0, size);
> >
> > Signed-off-by: Cliff Wickman <cpw@sgi.com>
> >
> > ---
> >  arch/x86/kernel/setup_percpu.c |    4 ++--
> >  include/linux/bootmem.h        |   23 ++++++++++++++++-------
> >  mm/bootmem.c                   |   12 +++++++-----
> >  mm/hugetlb.c                   |    3 ++-
> >  mm/nobootmem.c                 |   41 +++++++++++++++++++++++------------------
> >  mm/page_cgroup.c               |    2 +-
> >  mm/sparse.c                    |    2 +-
> >  7 files changed, 52 insertions(+), 35 deletions(-)
> >
> > Index: linux/include/linux/bootmem.h
> > ===================================================================
> > --- linux.orig/include/linux/bootmem.h
> > +++ linux/include/linux/bootmem.h
> > @@ -8,6 +8,11 @@
> >  #include <asm/dma.h>
> >
> >  /*
> > + * allocation flags
> > + */
> > +#define NO_ZERO                0x00000001
> > +
> > +/*
> >   *  simple boot-time physical memory area allocator.
> >   */
> >
> > @@ -79,7 +84,8 @@ extern void *__alloc_bootmem(unsigned lo
> >                              unsigned long goal);
> >  extern void *__alloc_bootmem_nopanic(unsigned long size,
> >                                      unsigned long align,
> > -                                    unsigned long goal);
> > +                                    unsigned long goal,
> > +                                    u32 flags);
> >  extern void *__alloc_bootmem_node(pg_data_t *pgdat,
> >                                   unsigned long size,
> >                                   unsigned long align,
> > @@ -91,12 +97,14 @@ void *__alloc_bootmem_node_high(pg_data_
> >  extern void *__alloc_bootmem_node_nopanic(pg_data_t *pgdat,
> >                                   unsigned long size,
> >                                   unsigned long align,
> > -                                 unsigned long goal);
> > +                                 unsigned long goal,
> > +                                 u32 flags);
> >  void *___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
> >                                   unsigned long size,
> >                                   unsigned long align,
> >                                   unsigned long goal,
> > -                                 unsigned long limit);
> > +                                 unsigned long limit,
> > +                                 u32 flags);
> >  extern void *__alloc_bootmem_low(unsigned long size,
> >                                  unsigned long align,
> >                                  unsigned long goal);
> > @@ -120,19 +128,20 @@ extern void *__alloc_bootmem_low_node(pg
> >  #define alloc_bootmem_align(x, align) \
> >         __alloc_bootmem(x, align, BOOTMEM_LOW_LIMIT)
> >  #define alloc_bootmem_nopanic(x) \
> > -       __alloc_bootmem_nopanic(x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT)
> > +       __alloc_bootmem_nopanic(x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT, 0)
> >  #define alloc_bootmem_pages(x) \
> >         __alloc_bootmem(x, PAGE_SIZE, BOOTMEM_LOW_LIMIT)
> >  #define alloc_bootmem_pages_nopanic(x) \
> > -       __alloc_bootmem_nopanic(x, PAGE_SIZE, BOOTMEM_LOW_LIMIT)
> > +       __alloc_bootmem_nopanic(x, PAGE_SIZE, BOOTMEM_LOW_LIMIT, 0)
> >  #define alloc_bootmem_node(pgdat, x) \
> >         __alloc_bootmem_node(pgdat, x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT)
> >  #define alloc_bootmem_node_nopanic(pgdat, x) \
> > -       __alloc_bootmem_node_nopanic(pgdat, x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT)
> > +       __alloc_bootmem_node_nopanic(pgdat, x, SMP_CACHE_BYTES, \
> > +                                    BOOTMEM_LOW_LIMIT, 0)
> >  #define alloc_bootmem_pages_node(pgdat, x) \
> >         __alloc_bootmem_node(pgdat, x, PAGE_SIZE, BOOTMEM_LOW_LIMIT)
> >  #define alloc_bootmem_pages_node_nopanic(pgdat, x) \
> > -       __alloc_bootmem_node_nopanic(pgdat, x, PAGE_SIZE, BOOTMEM_LOW_LIMIT)
> > +       __alloc_bootmem_node_nopanic(pgdat, x, PAGE_SIZE, BOOTMEM_LOW_LIMIT, 0)
> >
> >  #define alloc_bootmem_low(x) \
> >         __alloc_bootmem_low(x, SMP_CACHE_BYTES, 0)
> > Index: linux/arch/x86/kernel/setup_percpu.c
> > ===================================================================
> > --- linux.orig/arch/x86/kernel/setup_percpu.c
> > +++ linux/arch/x86/kernel/setup_percpu.c
> > @@ -104,14 +104,14 @@ static void * __init pcpu_alloc_bootmem(
> >         void *ptr;
> >
> >         if (!node_online(node) || !NODE_DATA(node)) {
> > -               ptr = __alloc_bootmem_nopanic(size, align, goal);
> > +               ptr = __alloc_bootmem_nopanic(size, align, goal, 0);
> >                 pr_info("cpu %d has no node %d or node-local memory\n",
> >                         cpu, node);
> >                 pr_debug("per cpu data for cpu%d %lu bytes at %016lx\n",
> >                          cpu, size, __pa(ptr));
> >         } else {
> >                 ptr = __alloc_bootmem_node_nopanic(NODE_DATA(node),
> > -                                                  size, align, goal);
> > +                                                  size, align, goal, 0);
> >                 pr_debug("per cpu data for cpu%d %lu bytes on node%d at %016lx\n",
> >                          cpu, size, node, __pa(ptr));
> >         }
> > Index: linux/mm/nobootmem.c
> > ===================================================================
> > --- linux.orig/mm/nobootmem.c
> > +++ linux/mm/nobootmem.c
> > @@ -33,7 +33,7 @@ unsigned long min_low_pfn;
> >  unsigned long max_pfn;
> >
> >  static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
> > -                                       u64 goal, u64 limit)
> > +                                       u64 goal, u64 limit, u32 flags)
> >  {
> >         void *ptr;
> >         u64 addr;
> > @@ -46,7 +46,8 @@ static void * __init __alloc_memory_core
> >                 return NULL;
> >
> >         ptr = phys_to_virt(addr);
> > -       memset(ptr, 0, size);
> > +       if (!(flags & NO_ZERO))
> > +               memset(ptr, 0, size);
> >         memblock_reserve(addr, size);
> >         /*
> >          * The min_count is set to 0 so that bootmem allocated blocks
> > @@ -208,7 +209,8 @@ void __init free_bootmem(unsigned long a
> >  static void * __init ___alloc_bootmem_nopanic(unsigned long size,
> >                                         unsigned long align,
> >                                         unsigned long goal,
> > -                                       unsigned long limit)
> > +                                       unsigned long limit,
> > +                                       u32 flags)
> >  {
> >         void *ptr;
> >
> > @@ -217,7 +219,8 @@ static void * __init ___alloc_bootmem_no
> >
> >  restart:
> >
> > -       ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal, limit);
> > +       ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal,
> > +                                       limit, 0);
> >
> >         if (ptr)
> >                 return ptr;
> > @@ -244,17 +247,17 @@ restart:
> >   * Returns NULL on failure.
> >   */
> >  void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
> > -                                       unsigned long goal)
> > +                                       unsigned long goal, u32 flags)
> >  {
> >         unsigned long limit = -1UL;
> >
> > -       return ___alloc_bootmem_nopanic(size, align, goal, limit);
> > +       return ___alloc_bootmem_nopanic(size, align, goal, limit, flags);
> >  }
> >
> >  static void * __init ___alloc_bootmem(unsigned long size, unsigned long align,
> > -                                       unsigned long goal, unsigned long limit)
> > +                       unsigned long goal, unsigned long limit, u32 flags)
> >  {
> > -       void *mem = ___alloc_bootmem_nopanic(size, align, goal, limit);
> > +       void *mem = ___alloc_bootmem_nopanic(size, align, goal, limit, flags);
> >
> >         if (mem)
> >                 return mem;
> > @@ -284,25 +287,26 @@ void * __init __alloc_bootmem(unsigned l
> >  {
> >         unsigned long limit = -1UL;
> >
> > -       return ___alloc_bootmem(size, align, goal, limit);
> > +       return ___alloc_bootmem(size, align, goal, limit, 0);
> >  }
> >
> >  void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
> >                                                    unsigned long size,
> >                                                    unsigned long align,
> >                                                    unsigned long goal,
> > -                                                  unsigned long limit)
> > +                                                  unsigned long limit,
> > +                                                  u32 flags)
> >  {
> >         void *ptr;
> >
> >  again:
> >         ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
> > -                                       goal, limit);
> > +                                       goal, limit, flags);
> >         if (ptr)
> >                 return ptr;
> >
> >         ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align,
> > -                                       goal, limit);
> > +                                       goal, limit, flags);
> >         if (ptr)
> >                 return ptr;
> >
> > @@ -315,12 +319,13 @@ again:
> >  }
> >
> >  void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
> > -                                  unsigned long align, unsigned long goal)
> > +                       unsigned long align, unsigned long goal, u32 flags)
> >  {
> >         if (WARN_ON_ONCE(slab_is_available()))
> >                 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
> >
> > -       return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
> > +       return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal,
> > +                       0, flags);
> >  }
> >
> >  void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
> > @@ -329,7 +334,7 @@ void * __init ___alloc_bootmem_node(pg_d
> >  {
> >         void *ptr;
> >
> > -       ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, limit);
> > +       ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, limit, 0);
> >         if (ptr)
> >                 return ptr;
> >
> > @@ -354,7 +359,7 @@ void * __init ___alloc_bootmem_node(pg_d
> >   * The function panics if the request can not be satisfied.
> >   */
> >  void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
> > -                                  unsigned long align, unsigned long goal)
> > +                       unsigned long align, unsigned long goal)
> >  {
> >         if (WARN_ON_ONCE(slab_is_available()))
> >                 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
> > @@ -388,7 +393,7 @@ void * __init __alloc_bootmem_node_high(
> >  void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
> >                                   unsigned long goal)
> >  {
> > -       return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT);
> > +       return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT, 0);
> >  }
> >
> >  void * __init __alloc_bootmem_low_nopanic(unsigned long size,
> > @@ -396,7 +401,7 @@ void * __init __alloc_bootmem_low_nopani
> >                                           unsigned long goal)
> >  {
> >         return ___alloc_bootmem_nopanic(size, align, goal,
> > -                                       ARCH_LOW_ADDRESS_LIMIT);
> > +                                       ARCH_LOW_ADDRESS_LIMIT, 0);
> >  }
> >
> >  /**
> > Index: linux/mm/sparse.c
> > ===================================================================
> > --- linux.orig/mm/sparse.c
> > +++ linux/mm/sparse.c
> > @@ -281,7 +281,7 @@ sparse_early_usemaps_alloc_pgdat_section
> >         nid = early_pfn_to_nid(goal >> PAGE_SHIFT);
> >  again:
> >         p = ___alloc_bootmem_node_nopanic(NODE_DATA(nid), size,
> > -                                         SMP_CACHE_BYTES, goal, limit);
> > +                                         SMP_CACHE_BYTES, goal, limit, 0);
> >         if (!p && limit) {
> >                 limit = 0;
> >                 goto again;
> > Index: linux/mm/hugetlb.c
> > ===================================================================
> > --- linux.orig/mm/hugetlb.c
> > +++ linux/mm/hugetlb.c
> > @@ -1188,7 +1188,8 @@ int __weak alloc_bootmem_huge_page(struc
> >                 addr = __alloc_bootmem_node_nopanic(
> >                                 NODE_DATA(hstate_next_node_to_alloc(h,
> >                                                 &node_states[N_MEMORY])),
> > -                               huge_page_size(h), huge_page_size(h), 0);
> > +                               huge_page_size(h), huge_page_size(h),
> > +                               0, NO_ZERO);
> >
> >                 if (addr) {
> >                         /*
> > Index: linux/mm/bootmem.c
> > ===================================================================
> > --- linux.orig/mm/bootmem.c
> > +++ linux/mm/bootmem.c
> > @@ -660,7 +660,7 @@ restart:
> >   * Returns NULL on failure.
> >   */
> >  void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
> > -                                       unsigned long goal)
> > +                                       unsigned long goal, u32 flags)
> >  {
> >         unsigned long limit = 0;
> >
> > @@ -705,7 +705,8 @@ void * __init __alloc_bootmem(unsigned l
> >
> >  void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
> >                                 unsigned long size, unsigned long align,
> > -                               unsigned long goal, unsigned long limit)
> > +                               unsigned long goal, unsigned long limit,
> > +                               u32 flags)
> >  {
> >         void *ptr;
> >
> > @@ -734,12 +735,13 @@ again:
> >  }
> >
> >  void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
> > -                                  unsigned long align, unsigned long goal)
> > +                       unsigned long align, unsigned long goal, u32 flags)
> >  {
> >         if (WARN_ON_ONCE(slab_is_available()))
> >                 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
> >
> > -       return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
> > +       return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal,
> > +                                            0, flags);
> >  }
> >
> >  void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
> > @@ -748,7 +750,7 @@ void * __init ___alloc_bootmem_node(pg_d
> >  {
> >         void *ptr;
> >
> > -       ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
> > +       ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0, 0);
> >         if (ptr)
> >                 return ptr;
> >
> > Index: linux/mm/page_cgroup.c
> > ===================================================================
> > --- linux.orig/mm/page_cgroup.c
> > +++ linux/mm/page_cgroup.c
> > @@ -55,7 +55,7 @@ static int __init alloc_node_page_cgroup
> >         table_size = sizeof(struct page_cgroup) * nr_pages;
> >
> >         base = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
> > -                       table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
> > +                       table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS), 0);
> >         if (!base)
> >                 return -ENOMEM;
> >         NODE_DATA(nid)->node_page_cgroup = base;
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > Please read the FAQ at  http://www.tux.org/lkml/
> >
> >

-- 
Cliff Wickman
SGI
cpw@sgi.com
(651) 683-3824

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] mm, x86: no zeroing of hugetlbfs pages at boot
  2013-03-06 21:50 ` Cliff Wickman
@ 2013-03-14  8:51   ` Michal Hocko
  -1 siblings, 0 replies; 26+ messages in thread
From: Michal Hocko @ 2013-03-14  8:51 UTC (permalink / raw)
  To: Cliff Wickman; +Cc: linux-kernel, linux-mm, x86, wli

On Wed 06-03-13 15:50:20, Cliff Wickman wrote:
[...]
> I propose passing a flag to the early allocator to indicate that no zeroing
> of a page should be done.  The 'no zeroing' flag would have to be passed
> down this code path:
> 
>   hugetlb_hstate_alloc_pages
>     alloc_bootmem_huge_page
>       __alloc_bootmem_node_nopanic NO_ZERO  (nobootmem.c)
>         __alloc_memory_core_early  NO_ZERO
> 	  if (!(flags & NO_ZERO))
>             memset(ptr, 0, size);
> 
> Or this path if CONFIG_NO_BOOTMEM is not set:
> 
>   hugetlb_hstate_alloc_pages
>     alloc_bootmem_huge_page
>       __alloc_bootmem_node_nopanic  NO_ZERO  (bootmem.c)
>         alloc_bootmem_core          NO_ZERO
> 	  if (!(flags & NO_ZERO))
>             memset(region, 0, size);
>         __alloc_bootmem_nopanic     NO_ZERO
>           ___alloc_bootmem_nopanic  NO_ZERO
>             alloc_bootmem_core      NO_ZERO
> 	      if (!(flags & NO_ZERO))
>                 memset(region, 0, size);

Yes, the patch makes sense. I just think it makes unnecessary churn.
Can we just add __alloc_bootmem_node_nopanic_nozero and hide the flag
downwards the call chain so that we do not have to touch all
__alloc_bootmem_node_nopanic callers?

Thanks

> Signed-off-by: Cliff Wickman <cpw@sgi.com>
> 
> ---
>  arch/x86/kernel/setup_percpu.c |    4 ++--
>  include/linux/bootmem.h        |   23 ++++++++++++++++-------
>  mm/bootmem.c                   |   12 +++++++-----
>  mm/hugetlb.c                   |    3 ++-
>  mm/nobootmem.c                 |   41 +++++++++++++++++++++++------------------
>  mm/page_cgroup.c               |    2 +-
>  mm/sparse.c                    |    2 +-
>  7 files changed, 52 insertions(+), 35 deletions(-)
> 
[...]
-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] mm, x86: no zeroing of hugetlbfs pages at boot
@ 2013-03-14  8:51   ` Michal Hocko
  0 siblings, 0 replies; 26+ messages in thread
From: Michal Hocko @ 2013-03-14  8:51 UTC (permalink / raw)
  To: Cliff Wickman; +Cc: linux-kernel, linux-mm, x86, wli

On Wed 06-03-13 15:50:20, Cliff Wickman wrote:
[...]
> I propose passing a flag to the early allocator to indicate that no zeroing
> of a page should be done.  The 'no zeroing' flag would have to be passed
> down this code path:
> 
>   hugetlb_hstate_alloc_pages
>     alloc_bootmem_huge_page
>       __alloc_bootmem_node_nopanic NO_ZERO  (nobootmem.c)
>         __alloc_memory_core_early  NO_ZERO
> 	  if (!(flags & NO_ZERO))
>             memset(ptr, 0, size);
> 
> Or this path if CONFIG_NO_BOOTMEM is not set:
> 
>   hugetlb_hstate_alloc_pages
>     alloc_bootmem_huge_page
>       __alloc_bootmem_node_nopanic  NO_ZERO  (bootmem.c)
>         alloc_bootmem_core          NO_ZERO
> 	  if (!(flags & NO_ZERO))
>             memset(region, 0, size);
>         __alloc_bootmem_nopanic     NO_ZERO
>           ___alloc_bootmem_nopanic  NO_ZERO
>             alloc_bootmem_core      NO_ZERO
> 	      if (!(flags & NO_ZERO))
>                 memset(region, 0, size);

Yes, the patch makes sense. I just think it makes unnecessary churn.
Can we just add __alloc_bootmem_node_nopanic_nozero and hide the flag
downwards the call chain so that we do not have to touch all
__alloc_bootmem_node_nopanic callers?

Thanks

> Signed-off-by: Cliff Wickman <cpw@sgi.com>
> 
> ---
>  arch/x86/kernel/setup_percpu.c |    4 ++--
>  include/linux/bootmem.h        |   23 ++++++++++++++++-------
>  mm/bootmem.c                   |   12 +++++++-----
>  mm/hugetlb.c                   |    3 ++-
>  mm/nobootmem.c                 |   41 +++++++++++++++++++++++------------------
>  mm/page_cgroup.c               |    2 +-
>  mm/sparse.c                    |    2 +-
>  7 files changed, 52 insertions(+), 35 deletions(-)
> 
[...]
-- 
Michal Hocko
SUSE Labs

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH] mm, x86: Do not zero hugetlbfs pages at boot. -v2
  2013-03-14  8:51   ` Michal Hocko
@ 2013-04-03  2:43     ` Robin Holt
  -1 siblings, 0 replies; 26+ messages in thread
From: Robin Holt @ 2013-04-03  2:43 UTC (permalink / raw)
  To: Michal Hocko, Cliff Wickman; +Cc: linux-kernel, linux-mm, x86, wli

Reserving a large number of 1GB hugetlbfs pages at boot takes a very
long time due to the pages being memset to 0 during the reservation.
This is unneeded as the pages will be zeroed by clear_huge_page() when
being allocated by the user.

Large system sites would at times like to allocate a very large amount
of memory as 1GB pages.  They would put this on the kernel boot line:
   default_hugepagesz=1G hugepagesz=1G hugepages=4096
[Dynamic allocation of 1G pages is not an option, as zone pages only go
 up to MAX_ORDER, and MAX_ORDER cannot exceed the section size.]

Each page is zeroed as it is allocated, and all allocation is done by
cpu 0, as this path is early in boot:
      start_kernel
        kernel_init
          do_pre_smp_initcalls
            hugetlb_init
              hugetlb_init_hstates
                hugetlb_hstate_alloc_pages

Zeroing remote (offnode) memory takes ~1GB/sec (and most memory is offnode
on large numa systems).  This estimate is approximate (it depends on
core frequency & number of hops to remote memory) but should be within
a factor of 2 on most systems.  A benchmark attempting to reserve a TB
for 1GB pages would thus require ~1000 seconds of boot time just for
this allocation.  32TB would take 8 hours.

Signed-off-by: Robin Holt <holt@sgi.com>
To: Cliff Wickman <cpw@sgi.com>
To: Michal Hocko <mhocko@suse.cz>
Cc: lkml <linux-kernel@vger.kernel.org>
Cc: Linux mm <linux-mm@kvack.org>
Cc: x86 Maintainers <x86@kernel.org>
---

Changes since -v1
 - Reworked to remove the special NO_ZERO flag and push that down further
   in the call chain.

Note: I compiled this only with a .config which specified
CONFIG_NO_BOOTMEM (x86_64).  I have not tried a config which uses a
bootmem allocator.

 include/linux/bootmem.h |  8 +++++++-
 mm/bootmem.c            | 21 +++++++++++++++++----
 mm/hugetlb.c            |  2 +-
 mm/nobootmem.c          | 37 +++++++++++++++++++++++++++----------
 mm/sparse.c             |  2 +-
 5 files changed, 53 insertions(+), 17 deletions(-)

diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
index cdc3bab..04563fc 100644
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -92,11 +92,17 @@ extern void *__alloc_bootmem_node_nopanic(pg_data_t *pgdat,
 				  unsigned long size,
 				  unsigned long align,
 				  unsigned long goal);
+extern void *__alloc_bootmem_node_nopanic_notzeroed(
+				  pg_data_t *pgdat,
+				  unsigned long size,
+				  unsigned long align,
+				  unsigned long goal);
 void *___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
 				  unsigned long size,
 				  unsigned long align,
 				  unsigned long goal,
-				  unsigned long limit);
+				  unsigned long limit,
+				  int zeroed);
 extern void *__alloc_bootmem_low(unsigned long size,
 				 unsigned long align,
 				 unsigned long goal);
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 2b0bcb0..b2e4027 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -705,12 +705,16 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align,
 
 void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
 				unsigned long size, unsigned long align,
-				unsigned long goal, unsigned long limit)
+				unsigned long goal, unsigned long limit,
+				int zeroed)
 {
 	void *ptr;
 
 	if (WARN_ON_ONCE(slab_is_available()))
-		return kzalloc(size, GFP_NOWAIT);
+		if (zeroed)
+			return kzalloc(size, GFP_NOWAIT);
+		else
+			return kmalloc(size, GFP_NOWAIT);
 again:
 
 	/* do not panic in alloc_bootmem_bdata() */
@@ -733,13 +737,22 @@ again:
 	return NULL;
 }
 
+void * __init __alloc_bootmem_node_nopanic_notzeroed(pg_data_t *pgdat, unsigned long size,
+				   unsigned long align, unsigned long goal)
+{
+	if (WARN_ON_ONCE(slab_is_available()))
+		return kmalloc_node(size, GFP_NOWAIT, pgdat->node_id);
+
+	return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0, 0);
+}
+
 void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
 				   unsigned long align, unsigned long goal)
 {
 	if (WARN_ON_ONCE(slab_is_available()))
 		return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
 
-	return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
+	return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0, 1);
 }
 
 void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
@@ -748,7 +761,7 @@ void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
 {
 	void *ptr;
 
-	ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
+	ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0, 1);
 	if (ptr)
 		return ptr;
 
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index ca9a7c6..7683f6a 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1185,7 +1185,7 @@ int __weak alloc_bootmem_huge_page(struct hstate *h)
 	while (nr_nodes) {
 		void *addr;
 
-		addr = __alloc_bootmem_node_nopanic(
+		addr = __alloc_bootmem_node_nopanic_notzeroed(
 				NODE_DATA(hstate_next_node_to_alloc(h,
 						&node_states[N_MEMORY])),
 				huge_page_size(h), huge_page_size(h), 0);
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 5e07d36..342511b 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -32,8 +32,8 @@ unsigned long max_low_pfn;
 unsigned long min_low_pfn;
 unsigned long max_pfn;
 
-static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
-					u64 goal, u64 limit)
+static void * __init ___alloc_memory_core_early(int nid, u64 size, u64 align,
+					u64 goal, u64 limit, int zeroed)
 {
 	void *ptr;
 	u64 addr;
@@ -46,7 +46,8 @@ static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
 		return NULL;
 
 	ptr = phys_to_virt(addr);
-	memset(ptr, 0, size);
+	if (zeroed)
+		memset(ptr, 0, size);
 	memblock_reserve(addr, size);
 	/*
 	 * The min_count is set to 0 so that bootmem allocated blocks
@@ -56,6 +57,12 @@ static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
 	return ptr;
 }
 
+static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
+					u64 goal, u64 limit)
+{
+	return ___alloc_memory_core_early(nid, size, align, goal, limit, 1);
+}
+
 /*
  * free_bootmem_late - free bootmem pages directly to page allocator
  * @addr: starting address of the range
@@ -291,18 +298,19 @@ void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
 						   unsigned long size,
 						   unsigned long align,
 						   unsigned long goal,
-						   unsigned long limit)
+						   unsigned long limit,
+						   int zeroed)
 {
 	void *ptr;
 
 again:
-	ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
-					goal, limit);
+	ptr = ___alloc_memory_core_early(pgdat->node_id, size, align,
+					goal, limit, zeroed);
 	if (ptr)
 		return ptr;
 
-	ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align,
-					goal, limit);
+	ptr = ___alloc_memory_core_early(MAX_NUMNODES, size, align,
+					goal, limit, zeroed);
 	if (ptr)
 		return ptr;
 
@@ -314,13 +322,22 @@ again:
 	return NULL;
 }
 
+void * __init __alloc_bootmem_node_nopanic_notzeroed(pg_data_t *pgdat, unsigned long size,
+				   unsigned long align, unsigned long goal)
+{
+	if (WARN_ON_ONCE(slab_is_available()))
+		return kmalloc_node(size, GFP_NOWAIT, pgdat->node_id);
+
+	return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0, 0);
+}
+
 void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
 				   unsigned long align, unsigned long goal)
 {
 	if (WARN_ON_ONCE(slab_is_available()))
 		return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
 
-	return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
+	return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0, 1);
 }
 
 void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
@@ -329,7 +346,7 @@ void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
 {
 	void *ptr;
 
-	ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, limit);
+	ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, limit, 1);
 	if (ptr)
 		return ptr;
 
diff --git a/mm/sparse.c b/mm/sparse.c
index 7ca6dc8..8a1c5ad 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -281,7 +281,7 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
 	nid = early_pfn_to_nid(goal >> PAGE_SHIFT);
 again:
 	p = ___alloc_bootmem_node_nopanic(NODE_DATA(nid), size,
-					  SMP_CACHE_BYTES, goal, limit);
+					  SMP_CACHE_BYTES, goal, limit, 1);
 	if (!p && limit) {
 		limit = 0;
 		goto again;
-- 
1.8.1.2


^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [PATCH] mm, x86: Do not zero hugetlbfs pages at boot. -v2
@ 2013-04-03  2:43     ` Robin Holt
  0 siblings, 0 replies; 26+ messages in thread
From: Robin Holt @ 2013-04-03  2:43 UTC (permalink / raw)
  To: Michal Hocko, Cliff Wickman; +Cc: linux-kernel, linux-mm, x86, wli

Reserving a large number of 1GB hugetlbfs pages at boot takes a very
long time due to the pages being memset to 0 during the reservation.
This is unneeded as the pages will be zeroed by clear_huge_page() when
being allocated by the user.

Large system sites would at times like to allocate a very large amount
of memory as 1GB pages.  They would put this on the kernel boot line:
   default_hugepagesz=1G hugepagesz=1G hugepages=4096
[Dynamic allocation of 1G pages is not an option, as zone pages only go
 up to MAX_ORDER, and MAX_ORDER cannot exceed the section size.]

Each page is zeroed as it is allocated, and all allocation is done by
cpu 0, as this path is early in boot:
      start_kernel
        kernel_init
          do_pre_smp_initcalls
            hugetlb_init
              hugetlb_init_hstates
                hugetlb_hstate_alloc_pages

Zeroing remote (offnode) memory takes ~1GB/sec (and most memory is offnode
on large numa systems).  This estimate is approximate (it depends on
core frequency & number of hops to remote memory) but should be within
a factor of 2 on most systems.  A benchmark attempting to reserve a TB
for 1GB pages would thus require ~1000 seconds of boot time just for
this allocation.  32TB would take 8 hours.

Signed-off-by: Robin Holt <holt@sgi.com>
To: Cliff Wickman <cpw@sgi.com>
To: Michal Hocko <mhocko@suse.cz>
Cc: lkml <linux-kernel@vger.kernel.org>
Cc: Linux mm <linux-mm@kvack.org>
Cc: x86 Maintainers <x86@kernel.org>
---

Changes since -v1
 - Reworked to remove the special NO_ZERO flag and push that down further
   in the call chain.

Note: I compiled this only with a .config which specified
CONFIG_NO_BOOTMEM (x86_64).  I have not tried a config which uses a
bootmem allocator.

 include/linux/bootmem.h |  8 +++++++-
 mm/bootmem.c            | 21 +++++++++++++++++----
 mm/hugetlb.c            |  2 +-
 mm/nobootmem.c          | 37 +++++++++++++++++++++++++++----------
 mm/sparse.c             |  2 +-
 5 files changed, 53 insertions(+), 17 deletions(-)

diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
index cdc3bab..04563fc 100644
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -92,11 +92,17 @@ extern void *__alloc_bootmem_node_nopanic(pg_data_t *pgdat,
 				  unsigned long size,
 				  unsigned long align,
 				  unsigned long goal);
+extern void *__alloc_bootmem_node_nopanic_notzeroed(
+				  pg_data_t *pgdat,
+				  unsigned long size,
+				  unsigned long align,
+				  unsigned long goal);
 void *___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
 				  unsigned long size,
 				  unsigned long align,
 				  unsigned long goal,
-				  unsigned long limit);
+				  unsigned long limit,
+				  int zeroed);
 extern void *__alloc_bootmem_low(unsigned long size,
 				 unsigned long align,
 				 unsigned long goal);
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 2b0bcb0..b2e4027 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -705,12 +705,16 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align,
 
 void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
 				unsigned long size, unsigned long align,
-				unsigned long goal, unsigned long limit)
+				unsigned long goal, unsigned long limit,
+				int zeroed)
 {
 	void *ptr;
 
 	if (WARN_ON_ONCE(slab_is_available()))
-		return kzalloc(size, GFP_NOWAIT);
+		if (zeroed)
+			return kzalloc(size, GFP_NOWAIT);
+		else
+			return kmalloc(size, GFP_NOWAIT);
 again:
 
 	/* do not panic in alloc_bootmem_bdata() */
@@ -733,13 +737,22 @@ again:
 	return NULL;
 }
 
+void * __init __alloc_bootmem_node_nopanic_notzeroed(pg_data_t *pgdat, unsigned long size,
+				   unsigned long align, unsigned long goal)
+{
+	if (WARN_ON_ONCE(slab_is_available()))
+		return kmalloc_node(size, GFP_NOWAIT, pgdat->node_id);
+
+	return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0, 0);
+}
+
 void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
 				   unsigned long align, unsigned long goal)
 {
 	if (WARN_ON_ONCE(slab_is_available()))
 		return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
 
-	return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
+	return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0, 1);
 }
 
 void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
@@ -748,7 +761,7 @@ void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
 {
 	void *ptr;
 
-	ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
+	ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0, 1);
 	if (ptr)
 		return ptr;
 
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index ca9a7c6..7683f6a 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1185,7 +1185,7 @@ int __weak alloc_bootmem_huge_page(struct hstate *h)
 	while (nr_nodes) {
 		void *addr;
 
-		addr = __alloc_bootmem_node_nopanic(
+		addr = __alloc_bootmem_node_nopanic_notzeroed(
 				NODE_DATA(hstate_next_node_to_alloc(h,
 						&node_states[N_MEMORY])),
 				huge_page_size(h), huge_page_size(h), 0);
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 5e07d36..342511b 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -32,8 +32,8 @@ unsigned long max_low_pfn;
 unsigned long min_low_pfn;
 unsigned long max_pfn;
 
-static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
-					u64 goal, u64 limit)
+static void * __init ___alloc_memory_core_early(int nid, u64 size, u64 align,
+					u64 goal, u64 limit, int zeroed)
 {
 	void *ptr;
 	u64 addr;
@@ -46,7 +46,8 @@ static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
 		return NULL;
 
 	ptr = phys_to_virt(addr);
-	memset(ptr, 0, size);
+	if (zeroed)
+		memset(ptr, 0, size);
 	memblock_reserve(addr, size);
 	/*
 	 * The min_count is set to 0 so that bootmem allocated blocks
@@ -56,6 +57,12 @@ static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
 	return ptr;
 }
 
+static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
+					u64 goal, u64 limit)
+{
+	return ___alloc_memory_core_early(nid, size, align, goal, limit, 1);
+}
+
 /*
  * free_bootmem_late - free bootmem pages directly to page allocator
  * @addr: starting address of the range
@@ -291,18 +298,19 @@ void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
 						   unsigned long size,
 						   unsigned long align,
 						   unsigned long goal,
-						   unsigned long limit)
+						   unsigned long limit,
+						   int zeroed)
 {
 	void *ptr;
 
 again:
-	ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
-					goal, limit);
+	ptr = ___alloc_memory_core_early(pgdat->node_id, size, align,
+					goal, limit, zeroed);
 	if (ptr)
 		return ptr;
 
-	ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align,
-					goal, limit);
+	ptr = ___alloc_memory_core_early(MAX_NUMNODES, size, align,
+					goal, limit, zeroed);
 	if (ptr)
 		return ptr;
 
@@ -314,13 +322,22 @@ again:
 	return NULL;
 }
 
+void * __init __alloc_bootmem_node_nopanic_notzeroed(pg_data_t *pgdat, unsigned long size,
+				   unsigned long align, unsigned long goal)
+{
+	if (WARN_ON_ONCE(slab_is_available()))
+		return kmalloc_node(size, GFP_NOWAIT, pgdat->node_id);
+
+	return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0, 0);
+}
+
 void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
 				   unsigned long align, unsigned long goal)
 {
 	if (WARN_ON_ONCE(slab_is_available()))
 		return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
 
-	return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
+	return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0, 1);
 }
 
 void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
@@ -329,7 +346,7 @@ void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
 {
 	void *ptr;
 
-	ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, limit);
+	ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, limit, 1);
 	if (ptr)
 		return ptr;
 
diff --git a/mm/sparse.c b/mm/sparse.c
index 7ca6dc8..8a1c5ad 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -281,7 +281,7 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
 	nid = early_pfn_to_nid(goal >> PAGE_SHIFT);
 again:
 	p = ___alloc_bootmem_node_nopanic(NODE_DATA(nid), size,
-					  SMP_CACHE_BYTES, goal, limit);
+					  SMP_CACHE_BYTES, goal, limit, 1);
 	if (!p && limit) {
 		limit = 0;
 		goto again;
-- 
1.8.1.2

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* Re: [PATCH] mm, x86: Do not zero hugetlbfs pages at boot. -v2
  2013-04-03  2:43     ` Robin Holt
@ 2013-04-03 14:00       ` Michal Hocko
  -1 siblings, 0 replies; 26+ messages in thread
From: Michal Hocko @ 2013-04-03 14:00 UTC (permalink / raw)
  To: Robin Holt; +Cc: Cliff Wickman, linux-kernel, linux-mm, x86, wli

On Tue 02-04-13 21:43:44, Robin Holt wrote:
[...]
> diff --git a/mm/bootmem.c b/mm/bootmem.c
> index 2b0bcb0..b2e4027 100644
> --- a/mm/bootmem.c
> +++ b/mm/bootmem.c
> @@ -705,12 +705,16 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align,
>  
>  void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
>  				unsigned long size, unsigned long align,
> -				unsigned long goal, unsigned long limit)
> +				unsigned long goal, unsigned long limit,
> +				int zeroed)
>  {
>  	void *ptr;
>  
>  	if (WARN_ON_ONCE(slab_is_available()))
> -		return kzalloc(size, GFP_NOWAIT);
> +		if (zeroed)
> +			return kzalloc(size, GFP_NOWAIT);
> +		else
> +			return kmalloc(size, GFP_NOWAIT);
>  again:
>  
>  	/* do not panic in alloc_bootmem_bdata() */

You need to update alloc_bootmem_bdata and alloc_bootmem_core as well.
Otherwise this is a no-op for early allocations when slab is not
available which is the case unless something is broken.

[...]
-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] mm, x86: Do not zero hugetlbfs pages at boot. -v2
@ 2013-04-03 14:00       ` Michal Hocko
  0 siblings, 0 replies; 26+ messages in thread
From: Michal Hocko @ 2013-04-03 14:00 UTC (permalink / raw)
  To: Robin Holt; +Cc: Cliff Wickman, linux-kernel, linux-mm, x86, wli

On Tue 02-04-13 21:43:44, Robin Holt wrote:
[...]
> diff --git a/mm/bootmem.c b/mm/bootmem.c
> index 2b0bcb0..b2e4027 100644
> --- a/mm/bootmem.c
> +++ b/mm/bootmem.c
> @@ -705,12 +705,16 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align,
>  
>  void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
>  				unsigned long size, unsigned long align,
> -				unsigned long goal, unsigned long limit)
> +				unsigned long goal, unsigned long limit,
> +				int zeroed)
>  {
>  	void *ptr;
>  
>  	if (WARN_ON_ONCE(slab_is_available()))
> -		return kzalloc(size, GFP_NOWAIT);
> +		if (zeroed)
> +			return kzalloc(size, GFP_NOWAIT);
> +		else
> +			return kmalloc(size, GFP_NOWAIT);
>  again:
>  
>  	/* do not panic in alloc_bootmem_bdata() */

You need to update alloc_bootmem_bdata and alloc_bootmem_core as well.
Otherwise this is a no-op for early allocations when slab is not
available which is the case unless something is broken.

[...]
-- 
Michal Hocko
SUSE Labs

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] mm, x86: Do not zero hugetlbfs pages at boot. -v2
  2013-04-03  2:43     ` Robin Holt
@ 2013-04-03 14:02       ` Michal Hocko
  -1 siblings, 0 replies; 26+ messages in thread
From: Michal Hocko @ 2013-04-03 14:02 UTC (permalink / raw)
  To: Robin Holt; +Cc: Cliff Wickman, linux-kernel, linux-mm, x86, wli

On Tue 02-04-13 21:43:44, Robin Holt wrote:
[...]
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index ca9a7c6..7683f6a 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -1185,7 +1185,7 @@ int __weak alloc_bootmem_huge_page(struct hstate *h)
>  	while (nr_nodes) {
>  		void *addr;
>  
> -		addr = __alloc_bootmem_node_nopanic(
> +		addr = __alloc_bootmem_node_nopanic_notzeroed(
>  				NODE_DATA(hstate_next_node_to_alloc(h,
>  						&node_states[N_MEMORY])),
>  				huge_page_size(h), huge_page_size(h), 0);

Ohh, and powerpc seems to have its own opinion on how to allocate huge
pages. See arch/powerpc/mm/hugetlbpage.c

-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] mm, x86: Do not zero hugetlbfs pages at boot. -v2
@ 2013-04-03 14:02       ` Michal Hocko
  0 siblings, 0 replies; 26+ messages in thread
From: Michal Hocko @ 2013-04-03 14:02 UTC (permalink / raw)
  To: Robin Holt; +Cc: Cliff Wickman, linux-kernel, linux-mm, x86, wli

On Tue 02-04-13 21:43:44, Robin Holt wrote:
[...]
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index ca9a7c6..7683f6a 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -1185,7 +1185,7 @@ int __weak alloc_bootmem_huge_page(struct hstate *h)
>  	while (nr_nodes) {
>  		void *addr;
>  
> -		addr = __alloc_bootmem_node_nopanic(
> +		addr = __alloc_bootmem_node_nopanic_notzeroed(
>  				NODE_DATA(hstate_next_node_to_alloc(h,
>  						&node_states[N_MEMORY])),
>  				huge_page_size(h), huge_page_size(h), 0);

Ohh, and powerpc seems to have its own opinion how to allocate huge
pages. See arch/powerpc/mm/hugetlbpage.c

-- 
Michal Hocko
SUSE Labs

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org">email@kvack.org</a>

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] mm, x86: Do not zero hugetlbfs pages at boot. -v2
  2013-04-03 14:02       ` Michal Hocko
@ 2013-04-03 17:00         ` Robin Holt
  -1 siblings, 0 replies; 26+ messages in thread
From: Robin Holt @ 2013-04-03 17:00 UTC (permalink / raw)
  To: Michal Hocko; +Cc: Robin Holt, Cliff Wickman, linux-kernel, linux-mm, x86, wli

On Wed, Apr 03, 2013 at 04:02:47PM +0200, Michal Hocko wrote:
> On Tue 02-04-13 21:43:44, Robin Holt wrote:
> [...]
> > diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> > index ca9a7c6..7683f6a 100644
> > --- a/mm/hugetlb.c
> > +++ b/mm/hugetlb.c
> > @@ -1185,7 +1185,7 @@ int __weak alloc_bootmem_huge_page(struct hstate *h)
> >  	while (nr_nodes) {
> >  		void *addr;
> >  
> > -		addr = __alloc_bootmem_node_nopanic(
> > +		addr = __alloc_bootmem_node_nopanic_notzeroed(
> >  				NODE_DATA(hstate_next_node_to_alloc(h,
> >  						&node_states[N_MEMORY])),
> >  				huge_page_size(h), huge_page_size(h), 0);
> 
> Ohh, and powerpc seems to have its own opinion how to allocate huge
> pages. See arch/powerpc/mm/hugetlbpage.c

Do I need to address their allocations?  Can I leave that part of the
changes as something powerpc can address if they are affected by this?

Robin

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] mm, x86: Do not zero hugetlbfs pages at boot. -v2
@ 2013-04-03 17:00         ` Robin Holt
  0 siblings, 0 replies; 26+ messages in thread
From: Robin Holt @ 2013-04-03 17:00 UTC (permalink / raw)
  To: Michal Hocko; +Cc: Robin Holt, Cliff Wickman, linux-kernel, linux-mm, x86, wli

On Wed, Apr 03, 2013 at 04:02:47PM +0200, Michal Hocko wrote:
> On Tue 02-04-13 21:43:44, Robin Holt wrote:
> [...]
> > diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> > index ca9a7c6..7683f6a 100644
> > --- a/mm/hugetlb.c
> > +++ b/mm/hugetlb.c
> > @@ -1185,7 +1185,7 @@ int __weak alloc_bootmem_huge_page(struct hstate *h)
> >  	while (nr_nodes) {
> >  		void *addr;
> >  
> > -		addr = __alloc_bootmem_node_nopanic(
> > +		addr = __alloc_bootmem_node_nopanic_notzeroed(
> >  				NODE_DATA(hstate_next_node_to_alloc(h,
> >  						&node_states[N_MEMORY])),
> >  				huge_page_size(h), huge_page_size(h), 0);
> 
> Ohh, and powerpc seems to have its own opinion how to allocate huge
> pages. See arch/powerpc/mm/hugetlbpage.c

Do I need to address their allocations?  Can I leave that part of the
changes as something powerpc can address if they are affected by this?

Robin

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org">email@kvack.org</a>

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] mm, x86: Do not zero hugetlbfs pages at boot. -v2
  2013-04-03 14:00       ` Michal Hocko
@ 2013-04-03 17:21         ` Robin Holt
  -1 siblings, 0 replies; 26+ messages in thread
From: Robin Holt @ 2013-04-03 17:21 UTC (permalink / raw)
  To: Michal Hocko; +Cc: Robin Holt, Cliff Wickman, linux-kernel, linux-mm, x86, wli

On Wed, Apr 03, 2013 at 04:00:49PM +0200, Michal Hocko wrote:
> On Tue 02-04-13 21:43:44, Robin Holt wrote:
> [...]
> > diff --git a/mm/bootmem.c b/mm/bootmem.c
> > index 2b0bcb0..b2e4027 100644
> > --- a/mm/bootmem.c
> > +++ b/mm/bootmem.c
> > @@ -705,12 +705,16 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align,
> >  
> >  void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
> >  				unsigned long size, unsigned long align,
> > -				unsigned long goal, unsigned long limit)
> > +				unsigned long goal, unsigned long limit,
> > +				int zeroed)
> >  {
> >  	void *ptr;
> >  
> >  	if (WARN_ON_ONCE(slab_is_available()))
> > -		return kzalloc(size, GFP_NOWAIT);
> > +		if (zeroed)
> > +			return kzalloc(size, GFP_NOWAIT);
> > +		else
> > +			return kmalloc(size, GFP_NOWAIT);
> >  again:
> >  
> >  	/* do not panic in alloc_bootmem_bdata() */
> 
> You need to update alloc_bootmem_bdata and alloc_bootmem_core as well.
> Otherwise this is a no-op for early allocations when slab is not
> available which is the case unless something is broken.

Michal,

Does this do what you would expect?  I compiled this for ia64, but I
have not tested it at all.

Robin

---
 mm/bootmem.c | 30 +++++++++++++++++++-----------
 1 file changed, 19 insertions(+), 11 deletions(-)

diff --git a/mm/bootmem.c b/mm/bootmem.c
index b2e4027..350e0ab 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -497,7 +497,8 @@ static unsigned long __init align_off(struct bootmem_data *bdata,
 
 static void * __init alloc_bootmem_bdata(struct bootmem_data *bdata,
 					unsigned long size, unsigned long align,
-					unsigned long goal, unsigned long limit)
+					unsigned long goal, unsigned long limit,
+					int zeroed)
 {
 	unsigned long fallback = 0;
 	unsigned long min, max, start, sidx, midx, step;
@@ -584,7 +585,8 @@ find_block:
 
 		region = phys_to_virt(PFN_PHYS(bdata->node_min_pfn) +
 				start_off);
-		memset(region, 0, size);
+		if (zeroed)
+			memset(region, 0, size);
 		/*
 		 * The min_count is set to 0 so that bootmem allocated blocks
 		 * are never reported as leaks.
@@ -605,13 +607,18 @@ find_block:
 static void * __init alloc_bootmem_core(unsigned long size,
 					unsigned long align,
 					unsigned long goal,
-					unsigned long limit)
+					unsigned long limit,
+					int zeroed)
 {
 	bootmem_data_t *bdata;
 	void *region;
 
-	if (WARN_ON_ONCE(slab_is_available()))
-		return kzalloc(size, GFP_NOWAIT);
+	if (WARN_ON_ONCE(slab_is_available())) {
+		if (zeroed)
+			return kzalloc(size, GFP_NOWAIT);
+		else
+			return kmalloc(size, GFP_NOWAIT);
+	}
 
 	list_for_each_entry(bdata, &bdata_list, list) {
 		if (goal && bdata->node_low_pfn <= PFN_DOWN(goal))
@@ -619,7 +626,7 @@ static void * __init alloc_bootmem_core(unsigned long size,
 		if (limit && bdata->node_min_pfn >= PFN_DOWN(limit))
 			break;
 
-		region = alloc_bootmem_bdata(bdata, size, align, goal, limit);
+		region = alloc_bootmem_bdata(bdata, size, align, goal, limit, zeroed);
 		if (region)
 			return region;
 	}
@@ -635,7 +642,7 @@ static void * __init ___alloc_bootmem_nopanic(unsigned long size,
 	void *ptr;
 
 restart:
-	ptr = alloc_bootmem_core(size, align, goal, limit);
+	ptr = alloc_bootmem_core(size, align, goal, limit, 1);
 	if (ptr)
 		return ptr;
 	if (goal) {
@@ -710,22 +717,23 @@ void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
 {
 	void *ptr;
 
-	if (WARN_ON_ONCE(slab_is_available()))
+	if (WARN_ON_ONCE(slab_is_available())) {
 		if (zeroed)
 			return kzalloc(size, GFP_NOWAIT);
 		else
 			return kmalloc(size, GFP_NOWAIT);
+	}
 again:
 
 	/* do not panic in alloc_bootmem_bdata() */
 	if (limit && goal + size > limit)
 		limit = 0;
 
-	ptr = alloc_bootmem_bdata(pgdat->bdata, size, align, goal, limit);
+	ptr = alloc_bootmem_bdata(pgdat->bdata, size, align, goal, limit, zeroed);
 	if (ptr)
 		return ptr;
 
-	ptr = alloc_bootmem_core(size, align, goal, limit);
+	ptr = alloc_bootmem_core(size, align, goal, limit, zeroed);
 	if (ptr)
 		return ptr;
 
@@ -813,7 +821,7 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
 
 		new_goal = MAX_DMA32_PFN << PAGE_SHIFT;
 		ptr = alloc_bootmem_bdata(pgdat->bdata, size, align,
-						 new_goal, 0);
+						 new_goal, 0, 1);
 		if (ptr)
 			return ptr;
 	}
-- 
1.8.1.2


^ permalink raw reply related	[flat|nested] 26+ messages in thread

* Re: [PATCH] mm, x86: Do not zero hugetlbfs pages at boot. -v2
@ 2013-04-03 17:21         ` Robin Holt
  0 siblings, 0 replies; 26+ messages in thread
From: Robin Holt @ 2013-04-03 17:21 UTC (permalink / raw)
  To: Michal Hocko; +Cc: Robin Holt, Cliff Wickman, linux-kernel, linux-mm, x86, wli

On Wed, Apr 03, 2013 at 04:00:49PM +0200, Michal Hocko wrote:
> On Tue 02-04-13 21:43:44, Robin Holt wrote:
> [...]
> > diff --git a/mm/bootmem.c b/mm/bootmem.c
> > index 2b0bcb0..b2e4027 100644
> > --- a/mm/bootmem.c
> > +++ b/mm/bootmem.c
> > @@ -705,12 +705,16 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align,
> >  
> >  void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
> >  				unsigned long size, unsigned long align,
> > -				unsigned long goal, unsigned long limit)
> > +				unsigned long goal, unsigned long limit,
> > +				int zeroed)
> >  {
> >  	void *ptr;
> >  
> >  	if (WARN_ON_ONCE(slab_is_available()))
> > -		return kzalloc(size, GFP_NOWAIT);
> > +		if (zeroed)
> > +			return kzalloc(size, GFP_NOWAIT);
> > +		else
> > +			return kmalloc(size, GFP_NOWAIT);
> >  again:
> >  
> >  	/* do not panic in alloc_bootmem_bdata() */
> 
> You need to update alloc_bootmem_bdata and alloc_bootmem_core as well.
> Otherwise this is a no-op for early allocations when slab is not
> available which is the case unless something is broken.

Michal,

Does this do what you would expect?  I compiled this for ia64, but I
have not tested it at all.

Robin

---
 mm/bootmem.c | 30 +++++++++++++++++++-----------
 1 file changed, 19 insertions(+), 11 deletions(-)

diff --git a/mm/bootmem.c b/mm/bootmem.c
index b2e4027..350e0ab 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -497,7 +497,8 @@ static unsigned long __init align_off(struct bootmem_data *bdata,
 
 static void * __init alloc_bootmem_bdata(struct bootmem_data *bdata,
 					unsigned long size, unsigned long align,
-					unsigned long goal, unsigned long limit)
+					unsigned long goal, unsigned long limit,
+					int zeroed)
 {
 	unsigned long fallback = 0;
 	unsigned long min, max, start, sidx, midx, step;
@@ -584,7 +585,8 @@ find_block:
 
 		region = phys_to_virt(PFN_PHYS(bdata->node_min_pfn) +
 				start_off);
-		memset(region, 0, size);
+		if (zeroed)
+			memset(region, 0, size);
 		/*
 		 * The min_count is set to 0 so that bootmem allocated blocks
 		 * are never reported as leaks.
@@ -605,13 +607,18 @@ find_block:
 static void * __init alloc_bootmem_core(unsigned long size,
 					unsigned long align,
 					unsigned long goal,
-					unsigned long limit)
+					unsigned long limit,
+					int zeroed)
 {
 	bootmem_data_t *bdata;
 	void *region;
 
-	if (WARN_ON_ONCE(slab_is_available()))
-		return kzalloc(size, GFP_NOWAIT);
+	if (WARN_ON_ONCE(slab_is_available())) {
+		if (zeroed)
+			return kzalloc(size, GFP_NOWAIT);
+		else
+			return kmalloc(size, GFP_NOWAIT);
+	}
 
 	list_for_each_entry(bdata, &bdata_list, list) {
 		if (goal && bdata->node_low_pfn <= PFN_DOWN(goal))
@@ -619,7 +626,7 @@ static void * __init alloc_bootmem_core(unsigned long size,
 		if (limit && bdata->node_min_pfn >= PFN_DOWN(limit))
 			break;
 
-		region = alloc_bootmem_bdata(bdata, size, align, goal, limit);
+		region = alloc_bootmem_bdata(bdata, size, align, goal, limit, zeroed);
 		if (region)
 			return region;
 	}
@@ -635,7 +642,7 @@ static void * __init ___alloc_bootmem_nopanic(unsigned long size,
 	void *ptr;
 
 restart:
-	ptr = alloc_bootmem_core(size, align, goal, limit);
+	ptr = alloc_bootmem_core(size, align, goal, limit, 1);
 	if (ptr)
 		return ptr;
 	if (goal) {
@@ -710,22 +717,23 @@ void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
 {
 	void *ptr;
 
-	if (WARN_ON_ONCE(slab_is_available()))
+	if (WARN_ON_ONCE(slab_is_available())) {
 		if (zeroed)
 			return kzalloc(size, GFP_NOWAIT);
 		else
 			return kmalloc(size, GFP_NOWAIT);
+	}
 again:
 
 	/* do not panic in alloc_bootmem_bdata() */
 	if (limit && goal + size > limit)
 		limit = 0;
 
-	ptr = alloc_bootmem_bdata(pgdat->bdata, size, align, goal, limit);
+	ptr = alloc_bootmem_bdata(pgdat->bdata, size, align, goal, limit, zeroed);
 	if (ptr)
 		return ptr;
 
-	ptr = alloc_bootmem_core(size, align, goal, limit);
+	ptr = alloc_bootmem_core(size, align, goal, limit, zeroed);
 	if (ptr)
 		return ptr;
 
@@ -813,7 +821,7 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
 
 		new_goal = MAX_DMA32_PFN << PAGE_SHIFT;
 		ptr = alloc_bootmem_bdata(pgdat->bdata, size, align,
-						 new_goal, 0);
+						 new_goal, 0, 1);
 		if (ptr)
 			return ptr;
 	}
-- 
1.8.1.2

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org">email@kvack.org</a>

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* Re: [PATCH] mm, x86: no zeroing of hugetlbfs pages at boot
  2013-03-06 21:50 ` Cliff Wickman
@ 2013-04-04  0:17   ` Simon Jeons
  -1 siblings, 0 replies; 26+ messages in thread
From: Simon Jeons @ 2013-04-04  0:17 UTC (permalink / raw)
  To: Cliff Wickman; +Cc: linux-kernel, linux-mm, x86, wli

On 03/07/2013 05:50 AM, Cliff Wickman wrote:
> From: Cliff Wickman <cpw@sgi.com>
>
> Allocating a large number of 1GB hugetlbfs pages at boot takes a
> very long time.
>
> Large system sites would at times like to allocate a very large amount of
> memory as 1GB pages.  They would put this on the kernel boot line:
>     default_hugepagesz=1G hugepagesz=1G hugepages=4096
> [Dynamic allocation of 1G pages is not an option, as zone pages only go
>   up to MAX_ORDER, and MAX_ORDER cannot exceed the section size.]
>
> Each page is zeroed as it is allocated, and all allocation is done by
> cpu 0, as this path is early in boot:

How do you confirm they are done by cpu 0? Is it just that cpu 0 is the only one working during boot?

>        start_kernel
>          kernel_init
>            do_pre_smp_initcalls
>              hugetlb_init
>                hugetlb_init_hstates
>                  hugetlb_hstate_alloc_pages
>
> Zeroing remote (offnode) memory takes ~1GB/sec (and most memory is offnode
> on large numa systems).
> This estimate is approximate (it depends on core frequency & number of hops
> to remote memory) but should be within a factor of 2 on most systems.
> A benchmark attempting to reserve a TB for 1GB pages would thus require
> ~1000 seconds of boot time just for this allocating.  32TB would take 8 hours.
>
> I propose passing a flag to the early allocator to indicate that no zeroing
> of a page should be done.  The 'no zeroing' flag would have to be passed
> down this code path:
>
>    hugetlb_hstate_alloc_pages
>      alloc_bootmem_huge_page
>        __alloc_bootmem_node_nopanic NO_ZERO  (nobootmem.c)
>          __alloc_memory_core_early  NO_ZERO
> 	  if (!(flags & NO_ZERO))
>              memset(ptr, 0, size);
>
> Or this path if CONFIG_NO_BOOTMEM is not set:
>
>    hugetlb_hstate_alloc_pages
>      alloc_bootmem_huge_page
>        __alloc_bootmem_node_nopanic  NO_ZERO  (bootmem.c)
>          alloc_bootmem_core          NO_ZERO
> 	  if (!(flags & NO_ZERO))
>              memset(region, 0, size);
>          __alloc_bootmem_nopanic     NO_ZERO
>            ___alloc_bootmem_nopanic  NO_ZERO
>              alloc_bootmem_core      NO_ZERO
> 	      if (!(flags & NO_ZERO))
>                  memset(region, 0, size);
>
> Signed-off-by: Cliff Wickman <cpw@sgi.com>
>
> ---
>   arch/x86/kernel/setup_percpu.c |    4 ++--
>   include/linux/bootmem.h        |   23 ++++++++++++++++-------
>   mm/bootmem.c                   |   12 +++++++-----
>   mm/hugetlb.c                   |    3 ++-
>   mm/nobootmem.c                 |   41 +++++++++++++++++++++++------------------
>   mm/page_cgroup.c               |    2 +-
>   mm/sparse.c                    |    2 +-
>   7 files changed, 52 insertions(+), 35 deletions(-)
>
> Index: linux/include/linux/bootmem.h
> ===================================================================
> --- linux.orig/include/linux/bootmem.h
> +++ linux/include/linux/bootmem.h
> @@ -8,6 +8,11 @@
>   #include <asm/dma.h>
>   
>   /*
> + * allocation flags
> + */
> +#define NO_ZERO		0x00000001
> +
> +/*
>    *  simple boot-time physical memory area allocator.
>    */
>   
> @@ -79,7 +84,8 @@ extern void *__alloc_bootmem(unsigned lo
>   			     unsigned long goal);
>   extern void *__alloc_bootmem_nopanic(unsigned long size,
>   				     unsigned long align,
> -				     unsigned long goal);
> +				     unsigned long goal,
> +				     u32 flags);
>   extern void *__alloc_bootmem_node(pg_data_t *pgdat,
>   				  unsigned long size,
>   				  unsigned long align,
> @@ -91,12 +97,14 @@ void *__alloc_bootmem_node_high(pg_data_
>   extern void *__alloc_bootmem_node_nopanic(pg_data_t *pgdat,
>   				  unsigned long size,
>   				  unsigned long align,
> -				  unsigned long goal);
> +				  unsigned long goal,
> +				  u32 flags);
>   void *___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
>   				  unsigned long size,
>   				  unsigned long align,
>   				  unsigned long goal,
> -				  unsigned long limit);
> +				  unsigned long limit,
> +				  u32 flags);
>   extern void *__alloc_bootmem_low(unsigned long size,
>   				 unsigned long align,
>   				 unsigned long goal);
> @@ -120,19 +128,20 @@ extern void *__alloc_bootmem_low_node(pg
>   #define alloc_bootmem_align(x, align) \
>   	__alloc_bootmem(x, align, BOOTMEM_LOW_LIMIT)
>   #define alloc_bootmem_nopanic(x) \
> -	__alloc_bootmem_nopanic(x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT)
> +	__alloc_bootmem_nopanic(x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT, 0)
>   #define alloc_bootmem_pages(x) \
>   	__alloc_bootmem(x, PAGE_SIZE, BOOTMEM_LOW_LIMIT)
>   #define alloc_bootmem_pages_nopanic(x) \
> -	__alloc_bootmem_nopanic(x, PAGE_SIZE, BOOTMEM_LOW_LIMIT)
> +	__alloc_bootmem_nopanic(x, PAGE_SIZE, BOOTMEM_LOW_LIMIT, 0)
>   #define alloc_bootmem_node(pgdat, x) \
>   	__alloc_bootmem_node(pgdat, x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT)
>   #define alloc_bootmem_node_nopanic(pgdat, x) \
> -	__alloc_bootmem_node_nopanic(pgdat, x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT)
> +	__alloc_bootmem_node_nopanic(pgdat, x, SMP_CACHE_BYTES, \
> +				     BOOTMEM_LOW_LIMIT, 0)
>   #define alloc_bootmem_pages_node(pgdat, x) \
>   	__alloc_bootmem_node(pgdat, x, PAGE_SIZE, BOOTMEM_LOW_LIMIT)
>   #define alloc_bootmem_pages_node_nopanic(pgdat, x) \
> -	__alloc_bootmem_node_nopanic(pgdat, x, PAGE_SIZE, BOOTMEM_LOW_LIMIT)
> +	__alloc_bootmem_node_nopanic(pgdat, x, PAGE_SIZE, BOOTMEM_LOW_LIMIT, 0)
>   
>   #define alloc_bootmem_low(x) \
>   	__alloc_bootmem_low(x, SMP_CACHE_BYTES, 0)
> Index: linux/arch/x86/kernel/setup_percpu.c
> ===================================================================
> --- linux.orig/arch/x86/kernel/setup_percpu.c
> +++ linux/arch/x86/kernel/setup_percpu.c
> @@ -104,14 +104,14 @@ static void * __init pcpu_alloc_bootmem(
>   	void *ptr;
>   
>   	if (!node_online(node) || !NODE_DATA(node)) {
> -		ptr = __alloc_bootmem_nopanic(size, align, goal);
> +		ptr = __alloc_bootmem_nopanic(size, align, goal, 0);
>   		pr_info("cpu %d has no node %d or node-local memory\n",
>   			cpu, node);
>   		pr_debug("per cpu data for cpu%d %lu bytes at %016lx\n",
>   			 cpu, size, __pa(ptr));
>   	} else {
>   		ptr = __alloc_bootmem_node_nopanic(NODE_DATA(node),
> -						   size, align, goal);
> +						   size, align, goal, 0);
>   		pr_debug("per cpu data for cpu%d %lu bytes on node%d at %016lx\n",
>   			 cpu, size, node, __pa(ptr));
>   	}
> Index: linux/mm/nobootmem.c
> ===================================================================
> --- linux.orig/mm/nobootmem.c
> +++ linux/mm/nobootmem.c
> @@ -33,7 +33,7 @@ unsigned long min_low_pfn;
>   unsigned long max_pfn;
>   
>   static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
> -					u64 goal, u64 limit)
> +					u64 goal, u64 limit, u32 flags)
>   {
>   	void *ptr;
>   	u64 addr;
> @@ -46,7 +46,8 @@ static void * __init __alloc_memory_core
>   		return NULL;
>   
>   	ptr = phys_to_virt(addr);
> -	memset(ptr, 0, size);
> +	if (!(flags & NO_ZERO))
> +		memset(ptr, 0, size);
>   	memblock_reserve(addr, size);
>   	/*
>   	 * The min_count is set to 0 so that bootmem allocated blocks
> @@ -208,7 +209,8 @@ void __init free_bootmem(unsigned long a
>   static void * __init ___alloc_bootmem_nopanic(unsigned long size,
>   					unsigned long align,
>   					unsigned long goal,
> -					unsigned long limit)
> +					unsigned long limit,
> +					u32 flags)
>   {
>   	void *ptr;
>   
> @@ -217,7 +219,8 @@ static void * __init ___alloc_bootmem_no
>   
>   restart:
>   
> -	ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal, limit);
> +	ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal,
> +					limit, 0);
>   
>   	if (ptr)
>   		return ptr;
> @@ -244,17 +247,17 @@ restart:
>    * Returns NULL on failure.
>    */
>   void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
> -					unsigned long goal)
> +					unsigned long goal, u32 flags)
>   {
>   	unsigned long limit = -1UL;
>   
> -	return ___alloc_bootmem_nopanic(size, align, goal, limit);
> +	return ___alloc_bootmem_nopanic(size, align, goal, limit, flags);
>   }
>   
>   static void * __init ___alloc_bootmem(unsigned long size, unsigned long align,
> -					unsigned long goal, unsigned long limit)
> +			unsigned long goal, unsigned long limit, u32 flags)
>   {
> -	void *mem = ___alloc_bootmem_nopanic(size, align, goal, limit);
> +	void *mem = ___alloc_bootmem_nopanic(size, align, goal, limit, flags);
>   
>   	if (mem)
>   		return mem;
> @@ -284,25 +287,26 @@ void * __init __alloc_bootmem(unsigned l
>   {
>   	unsigned long limit = -1UL;
>   
> -	return ___alloc_bootmem(size, align, goal, limit);
> +	return ___alloc_bootmem(size, align, goal, limit, 0);
>   }
>   
>   void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
>   						   unsigned long size,
>   						   unsigned long align,
>   						   unsigned long goal,
> -						   unsigned long limit)
> +						   unsigned long limit,
> +						   u32 flags)
>   {
>   	void *ptr;
>   
>   again:
>   	ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
> -					goal, limit);
> +					goal, limit, flags);
>   	if (ptr)
>   		return ptr;
>   
>   	ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align,
> -					goal, limit);
> +					goal, limit, flags);
>   	if (ptr)
>   		return ptr;
>   
> @@ -315,12 +319,13 @@ again:
>   }
>   
>   void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
> -				   unsigned long align, unsigned long goal)
> +			unsigned long align, unsigned long goal, u32 flags)
>   {
>   	if (WARN_ON_ONCE(slab_is_available()))
>   		return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
>   
> -	return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
> +	return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal,
> +			0, flags);
>   }
>   
>   void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
> @@ -329,7 +334,7 @@ void * __init ___alloc_bootmem_node(pg_d
>   {
>   	void *ptr;
>   
> -	ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, limit);
> +	ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, limit, 0);
>   	if (ptr)
>   		return ptr;
>   
> @@ -354,7 +359,7 @@ void * __init ___alloc_bootmem_node(pg_d
>    * The function panics if the request can not be satisfied.
>    */
>   void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
> -				   unsigned long align, unsigned long goal)
> +			unsigned long align, unsigned long goal)
>   {
>   	if (WARN_ON_ONCE(slab_is_available()))
>   		return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
> @@ -388,7 +393,7 @@ void * __init __alloc_bootmem_node_high(
>   void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
>   				  unsigned long goal)
>   {
> -	return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT);
> +	return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT, 0);
>   }
>   
>   void * __init __alloc_bootmem_low_nopanic(unsigned long size,
> @@ -396,7 +401,7 @@ void * __init __alloc_bootmem_low_nopani
>   					  unsigned long goal)
>   {
>   	return ___alloc_bootmem_nopanic(size, align, goal,
> -					ARCH_LOW_ADDRESS_LIMIT);
> +					ARCH_LOW_ADDRESS_LIMIT, 0);
>   }
>   
>   /**
> Index: linux/mm/sparse.c
> ===================================================================
> --- linux.orig/mm/sparse.c
> +++ linux/mm/sparse.c
> @@ -281,7 +281,7 @@ sparse_early_usemaps_alloc_pgdat_section
>   	nid = early_pfn_to_nid(goal >> PAGE_SHIFT);
>   again:
>   	p = ___alloc_bootmem_node_nopanic(NODE_DATA(nid), size,
> -					  SMP_CACHE_BYTES, goal, limit);
> +					  SMP_CACHE_BYTES, goal, limit, 0);
>   	if (!p && limit) {
>   		limit = 0;
>   		goto again;
> Index: linux/mm/hugetlb.c
> ===================================================================
> --- linux.orig/mm/hugetlb.c
> +++ linux/mm/hugetlb.c
> @@ -1188,7 +1188,8 @@ int __weak alloc_bootmem_huge_page(struc
>   		addr = __alloc_bootmem_node_nopanic(
>   				NODE_DATA(hstate_next_node_to_alloc(h,
>   						&node_states[N_MEMORY])),
> -				huge_page_size(h), huge_page_size(h), 0);
> +				huge_page_size(h), huge_page_size(h),
> +				0, NO_ZERO);
>   
>   		if (addr) {
>   			/*
> Index: linux/mm/bootmem.c
> ===================================================================
> --- linux.orig/mm/bootmem.c
> +++ linux/mm/bootmem.c
> @@ -660,7 +660,7 @@ restart:
>    * Returns NULL on failure.
>    */
>   void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
> -					unsigned long goal)
> +					unsigned long goal, u32 flags)
>   {
>   	unsigned long limit = 0;
>   
> @@ -705,7 +705,8 @@ void * __init __alloc_bootmem(unsigned l
>   
>   void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
>   				unsigned long size, unsigned long align,
> -				unsigned long goal, unsigned long limit)
> +				unsigned long goal, unsigned long limit,
> +				u32 flags)
>   {
>   	void *ptr;
>   
> @@ -734,12 +735,13 @@ again:
>   }
>   
>   void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
> -				   unsigned long align, unsigned long goal)
> +			unsigned long align, unsigned long goal, u32 flags)
>   {
>   	if (WARN_ON_ONCE(slab_is_available()))
>   		return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
>   
> -	return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
> +	return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal,
> +					     0, flags);
>   }
>   
>   void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
> @@ -748,7 +750,7 @@ void * __init ___alloc_bootmem_node(pg_d
>   {
>   	void *ptr;
>   
> -	ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
> +	ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0, 0);
>   	if (ptr)
>   		return ptr;
>   
> Index: linux/mm/page_cgroup.c
> ===================================================================
> --- linux.orig/mm/page_cgroup.c
> +++ linux/mm/page_cgroup.c
> @@ -55,7 +55,7 @@ static int __init alloc_node_page_cgroup
>   	table_size = sizeof(struct page_cgroup) * nr_pages;
>   
>   	base = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
> -			table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
> +			table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS), 0);
>   	if (!base)
>   		return -ENOMEM;
>   	NODE_DATA(nid)->node_page_cgroup = base;
>
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href="mailto:dont@kvack.org">email@kvack.org</a>


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] mm, x86: no zeroing of hugetlbfs pages at boot
@ 2013-04-04  0:17   ` Simon Jeons
  0 siblings, 0 replies; 26+ messages in thread
From: Simon Jeons @ 2013-04-04  0:17 UTC (permalink / raw)
  To: Cliff Wickman; +Cc: linux-kernel, linux-mm, x86, wli

On 03/07/2013 05:50 AM, Cliff Wickman wrote:
> From: Cliff Wickman <cpw@sgi.com>
>
> Allocating a large number of 1GB hugetlbfs pages at boot takes a
> very long time.
>
> Large system sites would at times like to allocate a very large amount of
> memory as 1GB pages.  They would put this on the kernel boot line:
>     default_hugepagesz=1G hugepagesz=1G hugepages=4096
> [Dynamic allocation of 1G pages is not an option, as zone pages only go
>   up to MAX_ORDER, and MAX_ORDER cannot exceed the section size.]
>
> Each page is zeroed as it is allocated, and all allocation is done by
> cpu 0, as this path is early in boot:

How do you confirm they are done by cpu 0? Is it that only cpu 0 is running at this point in boot?

>        start_kernel
>          kernel_init
>            do_pre_smp_initcalls
>              hugetlb_init
>                hugetlb_init_hstates
>                  hugetlb_hstate_alloc_pages
>
> Zeroing remote (offnode) memory takes ~1GB/sec (and most memory is offnode
> on large numa systems).
> This estimate is approximate (it depends on core frequency & number of hops
> to remote memory) but should be within a factor of 2 on most systems.
> A benchmark attempting to reserve a TB for 1GB pages would thus require
> ~1000 seconds of boot time just for this allocating.  32TB would take 8 hours.
>
> I propose passing a flag to the early allocator to indicate that no zeroing
> of a page should be done.  The 'no zeroing' flag would have to be passed
> down this code path:
>
>    hugetlb_hstate_alloc_pages
>      alloc_bootmem_huge_page
>        __alloc_bootmem_node_nopanic NO_ZERO  (nobootmem.c)
>          __alloc_memory_core_early  NO_ZERO
> 	  if (!(flags & NO_ZERO))
>              memset(ptr, 0, size);
>
> Or this path if CONFIG_NO_BOOTMEM is not set:
>
>    hugetlb_hstate_alloc_pages
>      alloc_bootmem_huge_page
>        __alloc_bootmem_node_nopanic  NO_ZERO  (bootmem.c)
>          alloc_bootmem_core          NO_ZERO
> 	  if (!(flags & NO_ZERO))
>              memset(region, 0, size);
>          __alloc_bootmem_nopanic     NO_ZERO
>            ___alloc_bootmem_nopanic  NO_ZERO
>              alloc_bootmem_core      NO_ZERO
> 	      if (!(flags & NO_ZERO))
>                  memset(region, 0, size);
>
> Signed-off-by: Cliff Wickman <cpw@sgi.com>
>
> ---
>   arch/x86/kernel/setup_percpu.c |    4 ++--
>   include/linux/bootmem.h        |   23 ++++++++++++++++-------
>   mm/bootmem.c                   |   12 +++++++-----
>   mm/hugetlb.c                   |    3 ++-
>   mm/nobootmem.c                 |   41 +++++++++++++++++++++++------------------
>   mm/page_cgroup.c               |    2 +-
>   mm/sparse.c                    |    2 +-
>   7 files changed, 52 insertions(+), 35 deletions(-)
>
> Index: linux/include/linux/bootmem.h
> ===================================================================
> --- linux.orig/include/linux/bootmem.h
> +++ linux/include/linux/bootmem.h
> @@ -8,6 +8,11 @@
>   #include <asm/dma.h>
>   
>   /*
> + * allocation flags
> + */
> +#define NO_ZERO		0x00000001
> +
> +/*
>    *  simple boot-time physical memory area allocator.
>    */
>   
> @@ -79,7 +84,8 @@ extern void *__alloc_bootmem(unsigned lo
>   			     unsigned long goal);
>   extern void *__alloc_bootmem_nopanic(unsigned long size,
>   				     unsigned long align,
> -				     unsigned long goal);
> +				     unsigned long goal,
> +				     u32 flags);
>   extern void *__alloc_bootmem_node(pg_data_t *pgdat,
>   				  unsigned long size,
>   				  unsigned long align,
> @@ -91,12 +97,14 @@ void *__alloc_bootmem_node_high(pg_data_
>   extern void *__alloc_bootmem_node_nopanic(pg_data_t *pgdat,
>   				  unsigned long size,
>   				  unsigned long align,
> -				  unsigned long goal);
> +				  unsigned long goal,
> +				  u32 flags);
>   void *___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
>   				  unsigned long size,
>   				  unsigned long align,
>   				  unsigned long goal,
> -				  unsigned long limit);
> +				  unsigned long limit,
> +				  u32 flags);
>   extern void *__alloc_bootmem_low(unsigned long size,
>   				 unsigned long align,
>   				 unsigned long goal);
> @@ -120,19 +128,20 @@ extern void *__alloc_bootmem_low_node(pg
>   #define alloc_bootmem_align(x, align) \
>   	__alloc_bootmem(x, align, BOOTMEM_LOW_LIMIT)
>   #define alloc_bootmem_nopanic(x) \
> -	__alloc_bootmem_nopanic(x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT)
> +	__alloc_bootmem_nopanic(x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT, 0)
>   #define alloc_bootmem_pages(x) \
>   	__alloc_bootmem(x, PAGE_SIZE, BOOTMEM_LOW_LIMIT)
>   #define alloc_bootmem_pages_nopanic(x) \
> -	__alloc_bootmem_nopanic(x, PAGE_SIZE, BOOTMEM_LOW_LIMIT)
> +	__alloc_bootmem_nopanic(x, PAGE_SIZE, BOOTMEM_LOW_LIMIT, 0)
>   #define alloc_bootmem_node(pgdat, x) \
>   	__alloc_bootmem_node(pgdat, x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT)
>   #define alloc_bootmem_node_nopanic(pgdat, x) \
> -	__alloc_bootmem_node_nopanic(pgdat, x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT)
> +	__alloc_bootmem_node_nopanic(pgdat, x, SMP_CACHE_BYTES, \
> +				     BOOTMEM_LOW_LIMIT, 0)
>   #define alloc_bootmem_pages_node(pgdat, x) \
>   	__alloc_bootmem_node(pgdat, x, PAGE_SIZE, BOOTMEM_LOW_LIMIT)
>   #define alloc_bootmem_pages_node_nopanic(pgdat, x) \
> -	__alloc_bootmem_node_nopanic(pgdat, x, PAGE_SIZE, BOOTMEM_LOW_LIMIT)
> +	__alloc_bootmem_node_nopanic(pgdat, x, PAGE_SIZE, BOOTMEM_LOW_LIMIT, 0)
>   
>   #define alloc_bootmem_low(x) \
>   	__alloc_bootmem_low(x, SMP_CACHE_BYTES, 0)
> Index: linux/arch/x86/kernel/setup_percpu.c
> ===================================================================
> --- linux.orig/arch/x86/kernel/setup_percpu.c
> +++ linux/arch/x86/kernel/setup_percpu.c
> @@ -104,14 +104,14 @@ static void * __init pcpu_alloc_bootmem(
>   	void *ptr;
>   
>   	if (!node_online(node) || !NODE_DATA(node)) {
> -		ptr = __alloc_bootmem_nopanic(size, align, goal);
> +		ptr = __alloc_bootmem_nopanic(size, align, goal, 0);
>   		pr_info("cpu %d has no node %d or node-local memory\n",
>   			cpu, node);
>   		pr_debug("per cpu data for cpu%d %lu bytes at %016lx\n",
>   			 cpu, size, __pa(ptr));
>   	} else {
>   		ptr = __alloc_bootmem_node_nopanic(NODE_DATA(node),
> -						   size, align, goal);
> +						   size, align, goal, 0);
>   		pr_debug("per cpu data for cpu%d %lu bytes on node%d at %016lx\n",
>   			 cpu, size, node, __pa(ptr));
>   	}
> Index: linux/mm/nobootmem.c
> ===================================================================
> --- linux.orig/mm/nobootmem.c
> +++ linux/mm/nobootmem.c
> @@ -33,7 +33,7 @@ unsigned long min_low_pfn;
>   unsigned long max_pfn;
>   
>   static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
> -					u64 goal, u64 limit)
> +					u64 goal, u64 limit, u32 flags)
>   {
>   	void *ptr;
>   	u64 addr;
> @@ -46,7 +46,8 @@ static void * __init __alloc_memory_core
>   		return NULL;
>   
>   	ptr = phys_to_virt(addr);
> -	memset(ptr, 0, size);
> +	if (!(flags & NO_ZERO))
> +		memset(ptr, 0, size);
>   	memblock_reserve(addr, size);
>   	/*
>   	 * The min_count is set to 0 so that bootmem allocated blocks
> @@ -208,7 +209,8 @@ void __init free_bootmem(unsigned long a
>   static void * __init ___alloc_bootmem_nopanic(unsigned long size,
>   					unsigned long align,
>   					unsigned long goal,
> -					unsigned long limit)
> +					unsigned long limit,
> +					u32 flags)
>   {
>   	void *ptr;
>   
> @@ -217,7 +219,8 @@ static void * __init ___alloc_bootmem_no
>   
>   restart:
>   
> -	ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal, limit);
> +	ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal,
> +					limit, 0);
>   
>   	if (ptr)
>   		return ptr;
> @@ -244,17 +247,17 @@ restart:
>    * Returns NULL on failure.
>    */
>   void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
> -					unsigned long goal)
> +					unsigned long goal, u32 flags)
>   {
>   	unsigned long limit = -1UL;
>   
> -	return ___alloc_bootmem_nopanic(size, align, goal, limit);
> +	return ___alloc_bootmem_nopanic(size, align, goal, limit, flags);
>   }
>   
>   static void * __init ___alloc_bootmem(unsigned long size, unsigned long align,
> -					unsigned long goal, unsigned long limit)
> +			unsigned long goal, unsigned long limit, u32 flags)
>   {
> -	void *mem = ___alloc_bootmem_nopanic(size, align, goal, limit);
> +	void *mem = ___alloc_bootmem_nopanic(size, align, goal, limit, flags);
>   
>   	if (mem)
>   		return mem;
> @@ -284,25 +287,26 @@ void * __init __alloc_bootmem(unsigned l
>   {
>   	unsigned long limit = -1UL;
>   
> -	return ___alloc_bootmem(size, align, goal, limit);
> +	return ___alloc_bootmem(size, align, goal, limit, 0);
>   }
>   
>   void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
>   						   unsigned long size,
>   						   unsigned long align,
>   						   unsigned long goal,
> -						   unsigned long limit)
> +						   unsigned long limit,
> +						   u32 flags)
>   {
>   	void *ptr;
>   
>   again:
>   	ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
> -					goal, limit);
> +					goal, limit, flags);
>   	if (ptr)
>   		return ptr;
>   
>   	ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align,
> -					goal, limit);
> +					goal, limit, flags);
>   	if (ptr)
>   		return ptr;
>   
> @@ -315,12 +319,13 @@ again:
>   }
>   
>   void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
> -				   unsigned long align, unsigned long goal)
> +			unsigned long align, unsigned long goal, u32 flags)
>   {
>   	if (WARN_ON_ONCE(slab_is_available()))
>   		return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
>   
> -	return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
> +	return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal,
> +			0, flags);
>   }
>   
>   void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
> @@ -329,7 +334,7 @@ void * __init ___alloc_bootmem_node(pg_d
>   {
>   	void *ptr;
>   
> -	ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, limit);
> +	ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, limit, 0);
>   	if (ptr)
>   		return ptr;
>   
> @@ -354,7 +359,7 @@ void * __init ___alloc_bootmem_node(pg_d
>    * The function panics if the request can not be satisfied.
>    */
>   void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
> -				   unsigned long align, unsigned long goal)
> +			unsigned long align, unsigned long goal)
>   {
>   	if (WARN_ON_ONCE(slab_is_available()))
>   		return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
> @@ -388,7 +393,7 @@ void * __init __alloc_bootmem_node_high(
>   void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
>   				  unsigned long goal)
>   {
> -	return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT);
> +	return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT, 0);
>   }
>   
>   void * __init __alloc_bootmem_low_nopanic(unsigned long size,
> @@ -396,7 +401,7 @@ void * __init __alloc_bootmem_low_nopani
>   					  unsigned long goal)
>   {
>   	return ___alloc_bootmem_nopanic(size, align, goal,
> -					ARCH_LOW_ADDRESS_LIMIT);
> +					ARCH_LOW_ADDRESS_LIMIT, 0);
>   }
>   
>   /**
> Index: linux/mm/sparse.c
> ===================================================================
> --- linux.orig/mm/sparse.c
> +++ linux/mm/sparse.c
> @@ -281,7 +281,7 @@ sparse_early_usemaps_alloc_pgdat_section
>   	nid = early_pfn_to_nid(goal >> PAGE_SHIFT);
>   again:
>   	p = ___alloc_bootmem_node_nopanic(NODE_DATA(nid), size,
> -					  SMP_CACHE_BYTES, goal, limit);
> +					  SMP_CACHE_BYTES, goal, limit, 0);
>   	if (!p && limit) {
>   		limit = 0;
>   		goto again;
> Index: linux/mm/hugetlb.c
> ===================================================================
> --- linux.orig/mm/hugetlb.c
> +++ linux/mm/hugetlb.c
> @@ -1188,7 +1188,8 @@ int __weak alloc_bootmem_huge_page(struc
>   		addr = __alloc_bootmem_node_nopanic(
>   				NODE_DATA(hstate_next_node_to_alloc(h,
>   						&node_states[N_MEMORY])),
> -				huge_page_size(h), huge_page_size(h), 0);
> +				huge_page_size(h), huge_page_size(h),
> +				0, NO_ZERO);
>   
>   		if (addr) {
>   			/*
> Index: linux/mm/bootmem.c
> ===================================================================
> --- linux.orig/mm/bootmem.c
> +++ linux/mm/bootmem.c
> @@ -660,7 +660,7 @@ restart:
>    * Returns NULL on failure.
>    */
>   void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
> -					unsigned long goal)
> +					unsigned long goal, u32 flags)
>   {
>   	unsigned long limit = 0;
>   
> @@ -705,7 +705,8 @@ void * __init __alloc_bootmem(unsigned l
>   
>   void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
>   				unsigned long size, unsigned long align,
> -				unsigned long goal, unsigned long limit)
> +				unsigned long goal, unsigned long limit,
> +				u32 flags)
>   {
>   	void *ptr;
>   
> @@ -734,12 +735,13 @@ again:
>   }
>   
>   void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
> -				   unsigned long align, unsigned long goal)
> +			unsigned long align, unsigned long goal, u32 flags)
>   {
>   	if (WARN_ON_ONCE(slab_is_available()))
>   		return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
>   
> -	return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
> +	return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal,
> +					     0, flags);
>   }
>   
>   void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
> @@ -748,7 +750,7 @@ void * __init ___alloc_bootmem_node(pg_d
>   {
>   	void *ptr;
>   
> -	ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
> +	ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0, 0);
>   	if (ptr)
>   		return ptr;
>   
> Index: linux/mm/page_cgroup.c
> ===================================================================
> --- linux.orig/mm/page_cgroup.c
> +++ linux/mm/page_cgroup.c
> @@ -55,7 +55,7 @@ static int __init alloc_node_page_cgroup
>   	table_size = sizeof(struct page_cgroup) * nr_pages;
>   
>   	base = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
> -			table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
> +			table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS), 0);
>   	if (!base)
>   		return -ENOMEM;
>   	NODE_DATA(nid)->node_page_cgroup = base;
>
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href="mailto:dont@kvack.org">email@kvack.org</a>

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org">email@kvack.org</a>

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] mm, x86: Do not zero hugetlbfs pages at boot. -v2
  2013-04-03 17:00         ` Robin Holt
@ 2013-04-04  8:08           ` Michal Hocko
  -1 siblings, 0 replies; 26+ messages in thread
From: Michal Hocko @ 2013-04-04  8:08 UTC (permalink / raw)
  To: Robin Holt; +Cc: Cliff Wickman, linux-kernel, linux-mm, x86, wli

On Wed 03-04-13 12:00:12, Robin Holt wrote:
> On Wed, Apr 03, 2013 at 04:02:47PM +0200, Michal Hocko wrote:
> > On Tue 02-04-13 21:43:44, Robin Holt wrote:
> > [...]
> > > diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> > > index ca9a7c6..7683f6a 100644
> > > --- a/mm/hugetlb.c
> > > +++ b/mm/hugetlb.c
> > > @@ -1185,7 +1185,7 @@ int __weak alloc_bootmem_huge_page(struct hstate *h)
> > >  	while (nr_nodes) {
> > >  		void *addr;
> > >  
> > > -		addr = __alloc_bootmem_node_nopanic(
> > > +		addr = __alloc_bootmem_node_nopanic_notzeroed(
> > >  				NODE_DATA(hstate_next_node_to_alloc(h,
> > >  						&node_states[N_MEMORY])),
> > >  				huge_page_size(h), huge_page_size(h), 0);
> > 
> > Ohh, and powerpc seems to have its own opinion how to allocate huge
> > pages. See arch/powerpc/mm/hugetlbpage.c
> 
> Do I need to address their allocations?  Can I leave that part of the
> changes as something powerpc can address if they are affected by this?

I mentioned powerpc basically because I encountered it as the only
alternative implementation of alloc_bootmem_huge_page. I haven't checked
how it does the job and now that I am looking closer it uses memblock
allocator so it would need a separate fix.
I guess you are right saying that this should be handled when the need
arises.

-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] mm, x86: Do not zero hugetlbfs pages at boot. -v2
@ 2013-04-04  8:08           ` Michal Hocko
  0 siblings, 0 replies; 26+ messages in thread
From: Michal Hocko @ 2013-04-04  8:08 UTC (permalink / raw)
  To: Robin Holt; +Cc: Cliff Wickman, linux-kernel, linux-mm, x86, wli

On Wed 03-04-13 12:00:12, Robin Holt wrote:
> On Wed, Apr 03, 2013 at 04:02:47PM +0200, Michal Hocko wrote:
> > On Tue 02-04-13 21:43:44, Robin Holt wrote:
> > [...]
> > > diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> > > index ca9a7c6..7683f6a 100644
> > > --- a/mm/hugetlb.c
> > > +++ b/mm/hugetlb.c
> > > @@ -1185,7 +1185,7 @@ int __weak alloc_bootmem_huge_page(struct hstate *h)
> > >  	while (nr_nodes) {
> > >  		void *addr;
> > >  
> > > -		addr = __alloc_bootmem_node_nopanic(
> > > +		addr = __alloc_bootmem_node_nopanic_notzeroed(
> > >  				NODE_DATA(hstate_next_node_to_alloc(h,
> > >  						&node_states[N_MEMORY])),
> > >  				huge_page_size(h), huge_page_size(h), 0);
> > 
> > Ohh, and powerpc seems to have its own opinion how to allocate huge
> > pages. See arch/powerpc/mm/hugetlbpage.c
> 
> Do I need to address their allocations?  Can I leave that part of the
> changes as something powerpc can address if they are affected by this?

I mentioned powerpc basically because I encountered it as the only
alternative implementation of alloc_bootmem_huge_page. I haven't checked
how it does the job and now that I am looking closer it uses memblock
allocator so it would need a separate fix.
I guess you are right saying that this should be handled when the need
arises.

-- 
Michal Hocko
SUSE Labs

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org">email@kvack.org</a>

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] mm, x86: Do not zero hugetlbfs pages at boot. -v2
  2013-04-03 17:21         ` Robin Holt
@ 2013-04-04  8:17           ` Michal Hocko
  -1 siblings, 0 replies; 26+ messages in thread
From: Michal Hocko @ 2013-04-04  8:17 UTC (permalink / raw)
  To: Robin Holt; +Cc: Cliff Wickman, linux-kernel, linux-mm, x86, wli

On Wed 03-04-13 12:21:32, Robin Holt wrote:
> On Wed, Apr 03, 2013 at 04:00:49PM +0200, Michal Hocko wrote:
> > On Tue 02-04-13 21:43:44, Robin Holt wrote:
> > [...]
> > > diff --git a/mm/bootmem.c b/mm/bootmem.c
> > > index 2b0bcb0..b2e4027 100644
> > > --- a/mm/bootmem.c
> > > +++ b/mm/bootmem.c
> > > @@ -705,12 +705,16 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align,
> > >  
> > >  void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
> > >  				unsigned long size, unsigned long align,
> > > -				unsigned long goal, unsigned long limit)
> > > +				unsigned long goal, unsigned long limit,
> > > +				int zeroed)
> > >  {
> > >  	void *ptr;
> > >  
> > >  	if (WARN_ON_ONCE(slab_is_available()))
> > > -		return kzalloc(size, GFP_NOWAIT);
> > > +		if (zeroed)
> > > +			return kzalloc(size, GFP_NOWAIT);
> > > +		else
> > > +			return kmalloc(size, GFP_NOWAIT);
> > >  again:
> > >  
> > >  	/* do not panic in alloc_bootmem_bdata() */
> > 
> > You need to update alloc_bootmem_bdata and alloc_bootmem_core as well.
> > Otherwise this is a no-op for early allocations when slab is not
> > available which is the case unless something is broken.
> 
> Michal,
> 
> Does this do what you would expect?  

yes, it looks right when I quickly glanced over it. I haven't checked
deeply yet. I would suggest reposting and adding more *bootmem people
into CC (e.g. Johannes Weiner, Yinghai Lu, Tejun Heo and maybe others).

> I compiled this for ia64, but I have not tested it at all.
> 
> Robin
> 
> ---
>  mm/bootmem.c | 30 +++++++++++++++++++-----------
>  1 file changed, 19 insertions(+), 11 deletions(-)
> 
> diff --git a/mm/bootmem.c b/mm/bootmem.c
> index b2e4027..350e0ab 100644
> --- a/mm/bootmem.c
> +++ b/mm/bootmem.c
> @@ -497,7 +497,8 @@ static unsigned long __init align_off(struct bootmem_data *bdata,
>  
>  static void * __init alloc_bootmem_bdata(struct bootmem_data *bdata,
>  					unsigned long size, unsigned long align,
> -					unsigned long goal, unsigned long limit)
> +					unsigned long goal, unsigned long limit,
> +					int zeroed)
>  {
>  	unsigned long fallback = 0;
>  	unsigned long min, max, start, sidx, midx, step;
> @@ -584,7 +585,8 @@ find_block:
>  
>  		region = phys_to_virt(PFN_PHYS(bdata->node_min_pfn) +
>  				start_off);
> -		memset(region, 0, size);
> +		if (zeroed)
> +			memset(region, 0, size);
>  		/*
>  		 * The min_count is set to 0 so that bootmem allocated blocks
>  		 * are never reported as leaks.
> @@ -605,13 +607,18 @@ find_block:
>  static void * __init alloc_bootmem_core(unsigned long size,
>  					unsigned long align,
>  					unsigned long goal,
> -					unsigned long limit)
> +					unsigned long limit,
> +					int zeroed)
>  {
>  	bootmem_data_t *bdata;
>  	void *region;
>  
> -	if (WARN_ON_ONCE(slab_is_available()))
> -		return kzalloc(size, GFP_NOWAIT);
> +	if (WARN_ON_ONCE(slab_is_available())) {
> +		if (zeroed)
> +			return kzalloc(size, GFP_NOWAIT);
> +		else
> +			return kmalloc(size, GFP_NOWAIT);
> +	}
>  
>  	list_for_each_entry(bdata, &bdata_list, list) {
>  		if (goal && bdata->node_low_pfn <= PFN_DOWN(goal))
> @@ -619,7 +626,7 @@ static void * __init alloc_bootmem_core(unsigned long size,
>  		if (limit && bdata->node_min_pfn >= PFN_DOWN(limit))
>  			break;
>  
> -		region = alloc_bootmem_bdata(bdata, size, align, goal, limit);
> +		region = alloc_bootmem_bdata(bdata, size, align, goal, limit, zeroed);
>  		if (region)
>  			return region;
>  	}
> @@ -635,7 +642,7 @@ static void * __init ___alloc_bootmem_nopanic(unsigned long size,
>  	void *ptr;
>  
>  restart:
> -	ptr = alloc_bootmem_core(size, align, goal, limit);
> +	ptr = alloc_bootmem_core(size, align, goal, limit, 1);
>  	if (ptr)
>  		return ptr;
>  	if (goal) {
> @@ -710,22 +717,23 @@ void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
>  {
>  	void *ptr;
>  
> -	if (WARN_ON_ONCE(slab_is_available()))
> +	if (WARN_ON_ONCE(slab_is_available())) {
>  		if (zeroed)
>  			return kzalloc(size, GFP_NOWAIT);
>  		else
>  			return kmalloc(size, GFP_NOWAIT);
> +	}
>  again:
>  
>  	/* do not panic in alloc_bootmem_bdata() */
>  	if (limit && goal + size > limit)
>  		limit = 0;
>  
> -	ptr = alloc_bootmem_bdata(pgdat->bdata, size, align, goal, limit);
> +	ptr = alloc_bootmem_bdata(pgdat->bdata, size, align, goal, limit, zeroed);
>  	if (ptr)
>  		return ptr;
>  
> -	ptr = alloc_bootmem_core(size, align, goal, limit);
> +	ptr = alloc_bootmem_core(size, align, goal, limit, zeroed);
>  	if (ptr)
>  		return ptr;
>  
> @@ -813,7 +821,7 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
>  
>  		new_goal = MAX_DMA32_PFN << PAGE_SHIFT;
>  		ptr = alloc_bootmem_bdata(pgdat->bdata, size, align,
> -						 new_goal, 0);
> +						 new_goal, 0, 1);
>  		if (ptr)
>  			return ptr;
>  	}
> -- 
> 1.8.1.2
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/

-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] mm, x86: Do not zero hugetlbfs pages at boot. -v2
@ 2013-04-04  8:17           ` Michal Hocko
  0 siblings, 0 replies; 26+ messages in thread
From: Michal Hocko @ 2013-04-04  8:17 UTC (permalink / raw)
  To: Robin Holt; +Cc: Cliff Wickman, linux-kernel, linux-mm, x86, wli

On Wed 03-04-13 12:21:32, Robin Holt wrote:
> On Wed, Apr 03, 2013 at 04:00:49PM +0200, Michal Hocko wrote:
> > On Tue 02-04-13 21:43:44, Robin Holt wrote:
> > [...]
> > > diff --git a/mm/bootmem.c b/mm/bootmem.c
> > > index 2b0bcb0..b2e4027 100644
> > > --- a/mm/bootmem.c
> > > +++ b/mm/bootmem.c
> > > @@ -705,12 +705,16 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align,
> > >  
> > >  void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
> > >  				unsigned long size, unsigned long align,
> > > -				unsigned long goal, unsigned long limit)
> > > +				unsigned long goal, unsigned long limit,
> > > +				int zeroed)
> > >  {
> > >  	void *ptr;
> > >  
> > >  	if (WARN_ON_ONCE(slab_is_available()))
> > > -		return kzalloc(size, GFP_NOWAIT);
> > > +		if (zeroed)
> > > +			return kzalloc(size, GFP_NOWAIT);
> > > +		else
> > > +			return kmalloc(size, GFP_NOWAIT);
> > >  again:
> > >  
> > >  	/* do not panic in alloc_bootmem_bdata() */
> > 
> > You need to update alloc_bootmem_bdata and alloc_bootmem_core as well.
> > Otherwise this is a no-op for early allocations when slab is not
> > available which is the case unless something is broken.
> 
> Michal,
> 
> Does this do what you would expect?  

yes, it looks right when I quickly glanced over it. I haven't checked
deeply yet. I would suggest reposting and adding more *bootmem people
into CC (e.g. Johannes Weiner, Yinghai Lu, Tejun Heo and maybe others).

> I compiled this for ia64, but I have not tested it at all.
> 
> Robin
> 
> ---
>  mm/bootmem.c | 30 +++++++++++++++++++-----------
>  1 file changed, 19 insertions(+), 11 deletions(-)
> 
> diff --git a/mm/bootmem.c b/mm/bootmem.c
> index b2e4027..350e0ab 100644
> --- a/mm/bootmem.c
> +++ b/mm/bootmem.c
> @@ -497,7 +497,8 @@ static unsigned long __init align_off(struct bootmem_data *bdata,
>  
>  static void * __init alloc_bootmem_bdata(struct bootmem_data *bdata,
>  					unsigned long size, unsigned long align,
> -					unsigned long goal, unsigned long limit)
> +					unsigned long goal, unsigned long limit,
> +					int zeroed)
>  {
>  	unsigned long fallback = 0;
>  	unsigned long min, max, start, sidx, midx, step;
> @@ -584,7 +585,8 @@ find_block:
>  
>  		region = phys_to_virt(PFN_PHYS(bdata->node_min_pfn) +
>  				start_off);
> -		memset(region, 0, size);
> +		if (zeroed)
> +			memset(region, 0, size);
>  		/*
>  		 * The min_count is set to 0 so that bootmem allocated blocks
>  		 * are never reported as leaks.
> @@ -605,13 +607,18 @@ find_block:
>  static void * __init alloc_bootmem_core(unsigned long size,
>  					unsigned long align,
>  					unsigned long goal,
> -					unsigned long limit)
> +					unsigned long limit,
> +					int zeroed)
>  {
>  	bootmem_data_t *bdata;
>  	void *region;
>  
> -	if (WARN_ON_ONCE(slab_is_available()))
> -		return kzalloc(size, GFP_NOWAIT);
> +	if (WARN_ON_ONCE(slab_is_available())) {
> +		if (zeroed)
> +			return kzalloc(size, GFP_NOWAIT);
> +		else
> +			return kmalloc(size, GFP_NOWAIT);
> +	}
>  
>  	list_for_each_entry(bdata, &bdata_list, list) {
>  		if (goal && bdata->node_low_pfn <= PFN_DOWN(goal))
> @@ -619,7 +626,7 @@ static void * __init alloc_bootmem_core(unsigned long size,
>  		if (limit && bdata->node_min_pfn >= PFN_DOWN(limit))
>  			break;
>  
> -		region = alloc_bootmem_bdata(bdata, size, align, goal, limit);
> +		region = alloc_bootmem_bdata(bdata, size, align, goal, limit, zeroed);
>  		if (region)
>  			return region;
>  	}
> @@ -635,7 +642,7 @@ static void * __init ___alloc_bootmem_nopanic(unsigned long size,
>  	void *ptr;
>  
>  restart:
> -	ptr = alloc_bootmem_core(size, align, goal, limit);
> +	ptr = alloc_bootmem_core(size, align, goal, limit, 1);
>  	if (ptr)
>  		return ptr;
>  	if (goal) {
> @@ -710,22 +717,23 @@ void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
>  {
>  	void *ptr;
>  
> -	if (WARN_ON_ONCE(slab_is_available()))
> +	if (WARN_ON_ONCE(slab_is_available())) {
>  		if (zeroed)
>  			return kzalloc(size, GFP_NOWAIT);
>  		else
>  			return kmalloc(size, GFP_NOWAIT);
> +	}
>  again:
>  
>  	/* do not panic in alloc_bootmem_bdata() */
>  	if (limit && goal + size > limit)
>  		limit = 0;
>  
> -	ptr = alloc_bootmem_bdata(pgdat->bdata, size, align, goal, limit);
> +	ptr = alloc_bootmem_bdata(pgdat->bdata, size, align, goal, limit, zeroed);
>  	if (ptr)
>  		return ptr;
>  
> -	ptr = alloc_bootmem_core(size, align, goal, limit);
> +	ptr = alloc_bootmem_core(size, align, goal, limit, zeroed);
>  	if (ptr)
>  		return ptr;
>  
> @@ -813,7 +821,7 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
>  
>  		new_goal = MAX_DMA32_PFN << PAGE_SHIFT;
>  		ptr = alloc_bootmem_bdata(pgdat->bdata, size, align,
> -						 new_goal, 0);
> +						 new_goal, 0, 1);
>  		if (ptr)
>  			return ptr;
>  	}
> -- 
> 1.8.1.2
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/

-- 
Michal Hocko
SUSE Labs

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org">email@kvack.org</a>

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] mm, x86: no zeroing of hugetlbfs pages at boot
  2013-04-04  0:17   ` Simon Jeons
@ 2013-04-04 12:16     ` Cliff Wickman
  -1 siblings, 0 replies; 26+ messages in thread
From: Cliff Wickman @ 2013-04-04 12:16 UTC (permalink / raw)
  To: Simon Jeons; +Cc: linux-kernel, linux-mm, x86, wli

On Thu, Apr 04, 2013 at 08:17:08AM +0800, Simon Jeons wrote:
> On 03/07/2013 05:50 AM, Cliff Wickman wrote:
>> From: Cliff Wickman <cpw@sgi.com>
>>
>> Allocating a large number of 1GB hugetlbfs pages at boot takes a
>> very long time.
>>
>> Large system sites would at times like to allocate a very large amount of
>> memory as 1GB pages.  They would put this on the kernel boot line:
>>     default_hugepagesz=1G hugepagesz=1G hugepages=4096
>> [Dynamic allocation of 1G pages is not an option, as zone pages only go
>>   up to MAX_ORDER, and MAX_ORDER cannot exceed the section size.]
>>
>> Each page is zeroed as it is allocated, and all allocation is done by
>> cpu 0, as this path is early in boot:
>
> How do you confirm they are done by cpu 0? Does just cpu 0 work during boot?

Yes, in kernel_init() you see the call to do_pre_smp_initcalls() just
before the call to smp_init().  It is smp_init() that starts the other
cpus.  They don't come out of reset until then.

>>        start_kernel
>>          kernel_init
>>            do_pre_smp_initcalls
>>              hugetlb_init
>>                hugetlb_init_hstates
>>                  hugetlb_hstate_alloc_pages
>>
>> Zeroing remote (offnode) memory takes ~1GB/sec (and most memory is offnode
>> on large numa systems).
>> This estimate is approximate (it depends on core frequency & number of hops
>> to remote memory) but should be within a factor of 2 on most systems.
>> A benchmark attempting to reserve a TB for 1GB pages would thus require
>> ~1000 seconds of boot time just for this allocating.  32TB would take 8 hours.
>>
>> I propose passing a flag to the early allocator to indicate that no zeroing
>> of a page should be done.  The 'no zeroing' flag would have to be passed
>> down this code path:
>>
>>    hugetlb_hstate_alloc_pages
>>      alloc_bootmem_huge_page
>>        __alloc_bootmem_node_nopanic NO_ZERO  (nobootmem.c)
>>          __alloc_memory_core_early  NO_ZERO
>> 	  if (!(flags & NO_ZERO))
>>              memset(ptr, 0, size);
>>
>> Or this path if CONFIG_NO_BOOTMEM is not set:
>>
>>    hugetlb_hstate_alloc_pages
>>      alloc_bootmem_huge_page
>>        __alloc_bootmem_node_nopanic  NO_ZERO  (bootmem.c)
>>          alloc_bootmem_core          NO_ZERO
>> 	  if (!(flags & NO_ZERO))
>>              memset(region, 0, size);
>>          __alloc_bootmem_nopanic     NO_ZERO
>>            ___alloc_bootmem_nopanic  NO_ZERO
>>              alloc_bootmem_core      NO_ZERO
>> 	      if (!(flags & NO_ZERO))
>>                  memset(region, 0, size);
>>
>> Signed-off-by: Cliff Wickman <cpw@sgi.com>
>>
>> ---
>>   arch/x86/kernel/setup_percpu.c |    4 ++--
>>   include/linux/bootmem.h        |   23 ++++++++++++++++-------
>>   mm/bootmem.c                   |   12 +++++++-----
>>   mm/hugetlb.c                   |    3 ++-
>>   mm/nobootmem.c                 |   41 +++++++++++++++++++++++------------------
>>   mm/page_cgroup.c               |    2 +-
>>   mm/sparse.c                    |    2 +-
>>   7 files changed, 52 insertions(+), 35 deletions(-)
>>
>> Index: linux/include/linux/bootmem.h
>> ===================================================================
>> --- linux.orig/include/linux/bootmem.h
>> +++ linux/include/linux/bootmem.h
>> @@ -8,6 +8,11 @@
>>   #include <asm/dma.h>
>>     /*
>> + * allocation flags
>> + */
>> +#define NO_ZERO		0x00000001
>> +
>> +/*
>>    *  simple boot-time physical memory area allocator.
>>    */
>>   @@ -79,7 +84,8 @@ extern void *__alloc_bootmem(unsigned lo
>>   			     unsigned long goal);
>>   extern void *__alloc_bootmem_nopanic(unsigned long size,
>>   				     unsigned long align,
>> -				     unsigned long goal);
>> +				     unsigned long goal,
>> +				     u32 flags);
>>   extern void *__alloc_bootmem_node(pg_data_t *pgdat,
>>   				  unsigned long size,
>>   				  unsigned long align,
>> @@ -91,12 +97,14 @@ void *__alloc_bootmem_node_high(pg_data_
>>   extern void *__alloc_bootmem_node_nopanic(pg_data_t *pgdat,
>>   				  unsigned long size,
>>   				  unsigned long align,
>> -				  unsigned long goal);
>> +				  unsigned long goal,
>> +				  u32 flags);
>>   void *___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
>>   				  unsigned long size,
>>   				  unsigned long align,
>>   				  unsigned long goal,
>> -				  unsigned long limit);
>> +				  unsigned long limit,
>> +				  u32 flags);
>>   extern void *__alloc_bootmem_low(unsigned long size,
>>   				 unsigned long align,
>>   				 unsigned long goal);
>> @@ -120,19 +128,20 @@ extern void *__alloc_bootmem_low_node(pg
>>   #define alloc_bootmem_align(x, align) \
>>   	__alloc_bootmem(x, align, BOOTMEM_LOW_LIMIT)
>>   #define alloc_bootmem_nopanic(x) \
>> -	__alloc_bootmem_nopanic(x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT)
>> +	__alloc_bootmem_nopanic(x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT, 0)
>>   #define alloc_bootmem_pages(x) \
>>   	__alloc_bootmem(x, PAGE_SIZE, BOOTMEM_LOW_LIMIT)
>>   #define alloc_bootmem_pages_nopanic(x) \
>> -	__alloc_bootmem_nopanic(x, PAGE_SIZE, BOOTMEM_LOW_LIMIT)
>> +	__alloc_bootmem_nopanic(x, PAGE_SIZE, BOOTMEM_LOW_LIMIT, 0)
>>   #define alloc_bootmem_node(pgdat, x) \
>>   	__alloc_bootmem_node(pgdat, x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT)
>>   #define alloc_bootmem_node_nopanic(pgdat, x) \
>> -	__alloc_bootmem_node_nopanic(pgdat, x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT)
>> +	__alloc_bootmem_node_nopanic(pgdat, x, SMP_CACHE_BYTES, \
>> +				     BOOTMEM_LOW_LIMIT, 0)
>>   #define alloc_bootmem_pages_node(pgdat, x) \
>>   	__alloc_bootmem_node(pgdat, x, PAGE_SIZE, BOOTMEM_LOW_LIMIT)
>>   #define alloc_bootmem_pages_node_nopanic(pgdat, x) \
>> -	__alloc_bootmem_node_nopanic(pgdat, x, PAGE_SIZE, BOOTMEM_LOW_LIMIT)
>> +	__alloc_bootmem_node_nopanic(pgdat, x, PAGE_SIZE, BOOTMEM_LOW_LIMIT, 0)
>>     #define alloc_bootmem_low(x) \
>>   	__alloc_bootmem_low(x, SMP_CACHE_BYTES, 0)
>> Index: linux/arch/x86/kernel/setup_percpu.c
>> ===================================================================
>> --- linux.orig/arch/x86/kernel/setup_percpu.c
>> +++ linux/arch/x86/kernel/setup_percpu.c
>> @@ -104,14 +104,14 @@ static void * __init pcpu_alloc_bootmem(
>>   	void *ptr;
>>     	if (!node_online(node) || !NODE_DATA(node)) {
>> -		ptr = __alloc_bootmem_nopanic(size, align, goal);
>> +		ptr = __alloc_bootmem_nopanic(size, align, goal, 0);
>>   		pr_info("cpu %d has no node %d or node-local memory\n",
>>   			cpu, node);
>>   		pr_debug("per cpu data for cpu%d %lu bytes at %016lx\n",
>>   			 cpu, size, __pa(ptr));
>>   	} else {
>>   		ptr = __alloc_bootmem_node_nopanic(NODE_DATA(node),
>> -						   size, align, goal);
>> +						   size, align, goal, 0);
>>   		pr_debug("per cpu data for cpu%d %lu bytes on node%d at %016lx\n",
>>   			 cpu, size, node, __pa(ptr));
>>   	}
>> Index: linux/mm/nobootmem.c
>> ===================================================================
>> --- linux.orig/mm/nobootmem.c
>> +++ linux/mm/nobootmem.c
>> @@ -33,7 +33,7 @@ unsigned long min_low_pfn;
>>   unsigned long max_pfn;
>>     static void * __init __alloc_memory_core_early(int nid, u64 size, 
>> u64 align,
>> -					u64 goal, u64 limit)
>> +					u64 goal, u64 limit, u32 flags)
>>   {
>>   	void *ptr;
>>   	u64 addr;
>> @@ -46,7 +46,8 @@ static void * __init __alloc_memory_core
>>   		return NULL;
>>     	ptr = phys_to_virt(addr);
>> -	memset(ptr, 0, size);
>> +	if (!(flags & NO_ZERO))
>> +		memset(ptr, 0, size);
>>   	memblock_reserve(addr, size);
>>   	/*
>>   	 * The min_count is set to 0 so that bootmem allocated blocks
>> @@ -208,7 +209,8 @@ void __init free_bootmem(unsigned long a
>>   static void * __init ___alloc_bootmem_nopanic(unsigned long size,
>>   					unsigned long align,
>>   					unsigned long goal,
>> -					unsigned long limit)
>> +					unsigned long limit,
>> +					u32 flags)
>>   {
>>   	void *ptr;
>>   @@ -217,7 +219,8 @@ static void * __init ___alloc_bootmem_no
>>     restart:
>>   -	ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal, 
>> limit);
>> +	ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal,
>> +					limit, 0);
>>     	if (ptr)
>>   		return ptr;
>> @@ -244,17 +247,17 @@ restart:
>>    * Returns NULL on failure.
>>    */
>>   void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
>> -					unsigned long goal)
>> +					unsigned long goal, u32 flags)
>>   {
>>   	unsigned long limit = -1UL;
>>   -	return ___alloc_bootmem_nopanic(size, align, goal, limit);
>> +	return ___alloc_bootmem_nopanic(size, align, goal, limit, flags);
>>   }
>>     static void * __init ___alloc_bootmem(unsigned long size, unsigned 
>> long align,
>> -					unsigned long goal, unsigned long limit)
>> +			unsigned long goal, unsigned long limit, u32 flags)
>>   {
>> -	void *mem = ___alloc_bootmem_nopanic(size, align, goal, limit);
>> +	void *mem = ___alloc_bootmem_nopanic(size, align, goal, limit, flags);
>>     	if (mem)
>>   		return mem;
>> @@ -284,25 +287,26 @@ void * __init __alloc_bootmem(unsigned l
>>   {
>>   	unsigned long limit = -1UL;
>>   -	return ___alloc_bootmem(size, align, goal, limit);
>> +	return ___alloc_bootmem(size, align, goal, limit, 0);
>>   }
>>     void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
>>   						   unsigned long size,
>>   						   unsigned long align,
>>   						   unsigned long goal,
>> -						   unsigned long limit)
>> +						   unsigned long limit,
>> +						   u32 flags)
>>   {
>>   	void *ptr;
>>     again:
>>   	ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
>> -					goal, limit);
>> +					goal, limit, flags);
>>   	if (ptr)
>>   		return ptr;
>>     	ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align,
>> -					goal, limit);
>> +					goal, limit, flags);
>>   	if (ptr)
>>   		return ptr;
>>   @@ -315,12 +319,13 @@ again:
>>   }
>>     void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, 
>> unsigned long size,
>> -				   unsigned long align, unsigned long goal)
>> +			unsigned long align, unsigned long goal, u32 flags)
>>   {
>>   	if (WARN_ON_ONCE(slab_is_available()))
>>   		return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
>>   -	return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
>> +	return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal,
>> +			0, flags);
>>   }
>>     void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long 
>> size,
>> @@ -329,7 +334,7 @@ void * __init ___alloc_bootmem_node(pg_d
>>   {
>>   	void *ptr;
>>   -	ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 
>> limit);
>> +	ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, limit, 0);
>>   	if (ptr)
>>   		return ptr;
>>   @@ -354,7 +359,7 @@ void * __init ___alloc_bootmem_node(pg_d
>>    * The function panics if the request can not be satisfied.
>>    */
>>   void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
>> -				   unsigned long align, unsigned long goal)
>> +			unsigned long align, unsigned long goal)
>>   {
>>   	if (WARN_ON_ONCE(slab_is_available()))
>>   		return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
>> @@ -388,7 +393,7 @@ void * __init __alloc_bootmem_node_high(
>>   void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
>>   				  unsigned long goal)
>>   {
>> -	return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT);
>> +	return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT, 0);
>>   }
>>     void * __init __alloc_bootmem_low_nopanic(unsigned long size,
>> @@ -396,7 +401,7 @@ void * __init __alloc_bootmem_low_nopani
>>   					  unsigned long goal)
>>   {
>>   	return ___alloc_bootmem_nopanic(size, align, goal,
>> -					ARCH_LOW_ADDRESS_LIMIT);
>> +					ARCH_LOW_ADDRESS_LIMIT, 0);
>>   }
>>     /**
>> Index: linux/mm/sparse.c
>> ===================================================================
>> --- linux.orig/mm/sparse.c
>> +++ linux/mm/sparse.c
>> @@ -281,7 +281,7 @@ sparse_early_usemaps_alloc_pgdat_section
>>   	nid = early_pfn_to_nid(goal >> PAGE_SHIFT);
>>   again:
>>   	p = ___alloc_bootmem_node_nopanic(NODE_DATA(nid), size,
>> -					  SMP_CACHE_BYTES, goal, limit);
>> +					  SMP_CACHE_BYTES, goal, limit, 0);
>>   	if (!p && limit) {
>>   		limit = 0;
>>   		goto again;
>> Index: linux/mm/hugetlb.c
>> ===================================================================
>> --- linux.orig/mm/hugetlb.c
>> +++ linux/mm/hugetlb.c
>> @@ -1188,7 +1188,8 @@ int __weak alloc_bootmem_huge_page(struc
>>   		addr = __alloc_bootmem_node_nopanic(
>>   				NODE_DATA(hstate_next_node_to_alloc(h,
>>   						&node_states[N_MEMORY])),
>> -				huge_page_size(h), huge_page_size(h), 0);
>> +				huge_page_size(h), huge_page_size(h),
>> +				0, NO_ZERO);
>>     		if (addr) {
>>   			/*
>> Index: linux/mm/bootmem.c
>> ===================================================================
>> --- linux.orig/mm/bootmem.c
>> +++ linux/mm/bootmem.c
>> @@ -660,7 +660,7 @@ restart:
>>    * Returns NULL on failure.
>>    */
>>   void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
>> -					unsigned long goal)
>> +					unsigned long goal, u32 flags)
>>   {
>>   	unsigned long limit = 0;
>>   @@ -705,7 +705,8 @@ void * __init __alloc_bootmem(unsigned l
>>     void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
>>   				unsigned long size, unsigned long align,
>> -				unsigned long goal, unsigned long limit)
>> +				unsigned long goal, unsigned long limit,
>> +				u32 flags)
>>   {
>>   	void *ptr;
>>   @@ -734,12 +735,13 @@ again:
>>   }
>>     void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, 
>> unsigned long size,
>> -				   unsigned long align, unsigned long goal)
>> +			unsigned long align, unsigned long goal, u32 flags)
>>   {
>>   	if (WARN_ON_ONCE(slab_is_available()))
>>   		return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
>>   -	return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
>> +	return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal,
>> +					     0, flags);
>>   }
>>     void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long 
>> size,
>> @@ -748,7 +750,7 @@ void * __init ___alloc_bootmem_node(pg_d
>>   {
>>   	void *ptr;
>>   -	ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
>> +	ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0, 0);
>>   	if (ptr)
>>   		return ptr;
>>   Index: linux/mm/page_cgroup.c
>> ===================================================================
>> --- linux.orig/mm/page_cgroup.c
>> +++ linux/mm/page_cgroup.c
>> @@ -55,7 +55,7 @@ static int __init alloc_node_page_cgroup
>>   	table_size = sizeof(struct page_cgroup) * nr_pages;
>>     	base = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
>> -			table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
>> +			table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS), 0);
>>   	if (!base)
>>   		return -ENOMEM;
>>   	NODE_DATA(nid)->node_page_cgroup = base;
>>
>> --
>> To unsubscribe, send a message with 'unsubscribe linux-mm' in
>> the body to majordomo@kvack.org.  For more info on Linux MM,
>> see: http://www.linux-mm.org/ .
>> Don't email: <a href="mailto:dont@kvack.org">email@kvack.org</a>

-- 
Cliff Wickman
SGI
cpw@sgi.com
(651) 683-3824

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] mm, x86: no zeroing of hugetlbfs pages at boot
@ 2013-04-04 12:16     ` Cliff Wickman
  0 siblings, 0 replies; 26+ messages in thread
From: Cliff Wickman @ 2013-04-04 12:16 UTC (permalink / raw)
  To: Simon Jeons; +Cc: linux-kernel, linux-mm, x86, wli

On Thu, Apr 04, 2013 at 08:17:08AM +0800, Simon Jeons wrote:
> On 03/07/2013 05:50 AM, Cliff Wickman wrote:
>> From: Cliff Wickman <cpw@sgi.com>
>>
>> Allocating a large number of 1GB hugetlbfs pages at boot takes a
>> very long time.
>>
>> Large system sites would at times like to allocate a very large amount of
>> memory as 1GB pages.  They would put this on the kernel boot line:
>>     default_hugepagesz=1G hugepagesz=1G hugepages=4096
>> [Dynamic allocation of 1G pages is not an option, as zone pages only go
>>   up to MAX_ORDER, and MAX_ORDER cannot exceed the section size.]
>>
>> Each page is zeroed as it is allocated, and all allocation is done by
>> cpu 0, as this path is early in boot:
>
> How do you confirm they are done by cpu 0? Does just cpu 0 work during boot?

Yes, in kernel_init() you see the call to do_pre_smp_initcalls() just
before the call to smp_init().  It is smp_init() that starts the other
cpus.  They don't come out of reset until then.

>>        start_kernel
>>          kernel_init
>>            do_pre_smp_initcalls
>>              hugetlb_init
>>                hugetlb_init_hstates
>>                  hugetlb_hstate_alloc_pages
>>
>> Zeroing remote (offnode) memory takes ~1GB/sec (and most memory is offnode
>> on large numa systems).
>> This estimate is approximate (it depends on core frequency & number of hops
>> to remote memory) but should be within a factor of 2 on most systems.
>> A benchmark attempting to reserve a TB for 1GB pages would thus require
>> ~1000 seconds of boot time just for this allocating.  32TB would take 8 hours.
>>
>> I propose passing a flag to the early allocator to indicate that no zeroing
>> of a page should be done.  The 'no zeroing' flag would have to be passed
>> down this code path:
>>
>>    hugetlb_hstate_alloc_pages
>>      alloc_bootmem_huge_page
>>        __alloc_bootmem_node_nopanic NO_ZERO  (nobootmem.c)
>>          __alloc_memory_core_early  NO_ZERO
>> 	  if (!(flags & NO_ZERO))
>>              memset(ptr, 0, size);
>>
>> Or this path if CONFIG_NO_BOOTMEM is not set:
>>
>>    hugetlb_hstate_alloc_pages
>>      alloc_bootmem_huge_page
>>        __alloc_bootmem_node_nopanic  NO_ZERO  (bootmem.c)
>>          alloc_bootmem_core          NO_ZERO
>> 	  if (!(flags & NO_ZERO))
>>              memset(region, 0, size);
>>          __alloc_bootmem_nopanic     NO_ZERO
>>            ___alloc_bootmem_nopanic  NO_ZERO
>>              alloc_bootmem_core      NO_ZERO
>> 	      if (!(flags & NO_ZERO))
>>                  memset(region, 0, size);
>>
>> Signed-off-by: Cliff Wickman <cpw@sgi.com>
>>
>> ---
>>   arch/x86/kernel/setup_percpu.c |    4 ++--
>>   include/linux/bootmem.h        |   23 ++++++++++++++++-------
>>   mm/bootmem.c                   |   12 +++++++-----
>>   mm/hugetlb.c                   |    3 ++-
>>   mm/nobootmem.c                 |   41 +++++++++++++++++++++++------------------
>>   mm/page_cgroup.c               |    2 +-
>>   mm/sparse.c                    |    2 +-
>>   7 files changed, 52 insertions(+), 35 deletions(-)
>>
>> Index: linux/include/linux/bootmem.h
>> ===================================================================
>> --- linux.orig/include/linux/bootmem.h
>> +++ linux/include/linux/bootmem.h
>> @@ -8,6 +8,11 @@
>>   #include <asm/dma.h>
>>     /*
>> + * allocation flags
>> + */
>> +#define NO_ZERO		0x00000001
>> +
>> +/*
>>    *  simple boot-time physical memory area allocator.
>>    */
>>   @@ -79,7 +84,8 @@ extern void *__alloc_bootmem(unsigned lo
>>   			     unsigned long goal);
>>   extern void *__alloc_bootmem_nopanic(unsigned long size,
>>   				     unsigned long align,
>> -				     unsigned long goal);
>> +				     unsigned long goal,
>> +				     u32 flags);
>>   extern void *__alloc_bootmem_node(pg_data_t *pgdat,
>>   				  unsigned long size,
>>   				  unsigned long align,
>> @@ -91,12 +97,14 @@ void *__alloc_bootmem_node_high(pg_data_
>>   extern void *__alloc_bootmem_node_nopanic(pg_data_t *pgdat,
>>   				  unsigned long size,
>>   				  unsigned long align,
>> -				  unsigned long goal);
>> +				  unsigned long goal,
>> +				  u32 flags);
>>   void *___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
>>   				  unsigned long size,
>>   				  unsigned long align,
>>   				  unsigned long goal,
>> -				  unsigned long limit);
>> +				  unsigned long limit,
>> +				  u32 flags);
>>   extern void *__alloc_bootmem_low(unsigned long size,
>>   				 unsigned long align,
>>   				 unsigned long goal);
>> @@ -120,19 +128,20 @@ extern void *__alloc_bootmem_low_node(pg
>>   #define alloc_bootmem_align(x, align) \
>>   	__alloc_bootmem(x, align, BOOTMEM_LOW_LIMIT)
>>   #define alloc_bootmem_nopanic(x) \
>> -	__alloc_bootmem_nopanic(x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT)
>> +	__alloc_bootmem_nopanic(x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT, 0)
>>   #define alloc_bootmem_pages(x) \
>>   	__alloc_bootmem(x, PAGE_SIZE, BOOTMEM_LOW_LIMIT)
>>   #define alloc_bootmem_pages_nopanic(x) \
>> -	__alloc_bootmem_nopanic(x, PAGE_SIZE, BOOTMEM_LOW_LIMIT)
>> +	__alloc_bootmem_nopanic(x, PAGE_SIZE, BOOTMEM_LOW_LIMIT, 0)
>>   #define alloc_bootmem_node(pgdat, x) \
>>   	__alloc_bootmem_node(pgdat, x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT)
>>   #define alloc_bootmem_node_nopanic(pgdat, x) \
>> -	__alloc_bootmem_node_nopanic(pgdat, x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT)
>> +	__alloc_bootmem_node_nopanic(pgdat, x, SMP_CACHE_BYTES, \
>> +				     BOOTMEM_LOW_LIMIT, 0)
>>   #define alloc_bootmem_pages_node(pgdat, x) \
>>   	__alloc_bootmem_node(pgdat, x, PAGE_SIZE, BOOTMEM_LOW_LIMIT)
>>   #define alloc_bootmem_pages_node_nopanic(pgdat, x) \
>> -	__alloc_bootmem_node_nopanic(pgdat, x, PAGE_SIZE, BOOTMEM_LOW_LIMIT)
>> +	__alloc_bootmem_node_nopanic(pgdat, x, PAGE_SIZE, BOOTMEM_LOW_LIMIT, 0)
>>     #define alloc_bootmem_low(x) \
>>   	__alloc_bootmem_low(x, SMP_CACHE_BYTES, 0)
>> Index: linux/arch/x86/kernel/setup_percpu.c
>> ===================================================================
>> --- linux.orig/arch/x86/kernel/setup_percpu.c
>> +++ linux/arch/x86/kernel/setup_percpu.c
>> @@ -104,14 +104,14 @@ static void * __init pcpu_alloc_bootmem(
>>   	void *ptr;
>>     	if (!node_online(node) || !NODE_DATA(node)) {
>> -		ptr = __alloc_bootmem_nopanic(size, align, goal);
>> +		ptr = __alloc_bootmem_nopanic(size, align, goal, 0);
>>   		pr_info("cpu %d has no node %d or node-local memory\n",
>>   			cpu, node);
>>   		pr_debug("per cpu data for cpu%d %lu bytes at %016lx\n",
>>   			 cpu, size, __pa(ptr));
>>   	} else {
>>   		ptr = __alloc_bootmem_node_nopanic(NODE_DATA(node),
>> -						   size, align, goal);
>> +						   size, align, goal, 0);
>>   		pr_debug("per cpu data for cpu%d %lu bytes on node%d at %016lx\n",
>>   			 cpu, size, node, __pa(ptr));
>>   	}
>> Index: linux/mm/nobootmem.c
>> ===================================================================
>> --- linux.orig/mm/nobootmem.c
>> +++ linux/mm/nobootmem.c
>> @@ -33,7 +33,7 @@ unsigned long min_low_pfn;
>>   unsigned long max_pfn;
>>     static void * __init __alloc_memory_core_early(int nid, u64 size, 
>> u64 align,
>> -					u64 goal, u64 limit)
>> +					u64 goal, u64 limit, u32 flags)
>>   {
>>   	void *ptr;
>>   	u64 addr;
>> @@ -46,7 +46,8 @@ static void * __init __alloc_memory_core
>>   		return NULL;
>>     	ptr = phys_to_virt(addr);
>> -	memset(ptr, 0, size);
>> +	if (!(flags & NO_ZERO))
>> +		memset(ptr, 0, size);
>>   	memblock_reserve(addr, size);
>>   	/*
>>   	 * The min_count is set to 0 so that bootmem allocated blocks
>> @@ -208,7 +209,8 @@ void __init free_bootmem(unsigned long a
>>   static void * __init ___alloc_bootmem_nopanic(unsigned long size,
>>   					unsigned long align,
>>   					unsigned long goal,
>> -					unsigned long limit)
>> +					unsigned long limit,
>> +					u32 flags)
>>   {
>>   	void *ptr;
>>   @@ -217,7 +219,8 @@ static void * __init ___alloc_bootmem_no
>>     restart:
>>   -	ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal, 
>> limit);
>> +	ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal,
>> +					limit, 0);
>>     	if (ptr)
>>   		return ptr;
>> @@ -244,17 +247,17 @@ restart:
>>    * Returns NULL on failure.
>>    */
>>   void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
>> -					unsigned long goal)
>> +					unsigned long goal, u32 flags)
>>   {
>>   	unsigned long limit = -1UL;
>>   -	return ___alloc_bootmem_nopanic(size, align, goal, limit);
>> +	return ___alloc_bootmem_nopanic(size, align, goal, limit, flags);
>>   }
>>     static void * __init ___alloc_bootmem(unsigned long size, unsigned 
>> long align,
>> -					unsigned long goal, unsigned long limit)
>> +			unsigned long goal, unsigned long limit, u32 flags)
>>   {
>> -	void *mem = ___alloc_bootmem_nopanic(size, align, goal, limit);
>> +	void *mem = ___alloc_bootmem_nopanic(size, align, goal, limit, flags);
>>     	if (mem)
>>   		return mem;
>> @@ -284,25 +287,26 @@ void * __init __alloc_bootmem(unsigned l
>>   {
>>   	unsigned long limit = -1UL;
>>   -	return ___alloc_bootmem(size, align, goal, limit);
>> +	return ___alloc_bootmem(size, align, goal, limit, 0);
>>   }
>>     void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
>>   						   unsigned long size,
>>   						   unsigned long align,
>>   						   unsigned long goal,
>> -						   unsigned long limit)
>> +						   unsigned long limit,
>> +						   u32 flags)
>>   {
>>   	void *ptr;
>>     again:
>>   	ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
>> -					goal, limit);
>> +					goal, limit, flags);
>>   	if (ptr)
>>   		return ptr;
>>     	ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align,
>> -					goal, limit);
>> +					goal, limit, flags);
>>   	if (ptr)
>>   		return ptr;
>>   @@ -315,12 +319,13 @@ again:
>>   }
>>     void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, 
>> unsigned long size,
>> -				   unsigned long align, unsigned long goal)
>> +			unsigned long align, unsigned long goal, u32 flags)
>>   {
>>   	if (WARN_ON_ONCE(slab_is_available()))
>>   		return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
>>   -	return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
>> +	return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal,
>> +			0, flags);
>>   }
>>     void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long 
>> size,
>> @@ -329,7 +334,7 @@ void * __init ___alloc_bootmem_node(pg_d
>>   {
>>   	void *ptr;
>>   -	ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 
>> limit);
>> +	ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, limit, 0);
>>   	if (ptr)
>>   		return ptr;
>>   @@ -354,7 +359,7 @@ void * __init ___alloc_bootmem_node(pg_d
>>    * The function panics if the request can not be satisfied.
>>    */
>>   void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
>> -				   unsigned long align, unsigned long goal)
>> +			unsigned long align, unsigned long goal)
>>   {
>>   	if (WARN_ON_ONCE(slab_is_available()))
>>   		return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
>> @@ -388,7 +393,7 @@ void * __init __alloc_bootmem_node_high(
>>   void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
>>   				  unsigned long goal)
>>   {
>> -	return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT);
>> +	return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT, 0);
>>   }
>>     void * __init __alloc_bootmem_low_nopanic(unsigned long size,
>> @@ -396,7 +401,7 @@ void * __init __alloc_bootmem_low_nopani
>>   					  unsigned long goal)
>>   {
>>   	return ___alloc_bootmem_nopanic(size, align, goal,
>> -					ARCH_LOW_ADDRESS_LIMIT);
>> +					ARCH_LOW_ADDRESS_LIMIT, 0);
>>   }
>>     /**
>> Index: linux/mm/sparse.c
>> ===================================================================
>> --- linux.orig/mm/sparse.c
>> +++ linux/mm/sparse.c
>> @@ -281,7 +281,7 @@ sparse_early_usemaps_alloc_pgdat_section
>>   	nid = early_pfn_to_nid(goal >> PAGE_SHIFT);
>>   again:
>>   	p = ___alloc_bootmem_node_nopanic(NODE_DATA(nid), size,
>> -					  SMP_CACHE_BYTES, goal, limit);
>> +					  SMP_CACHE_BYTES, goal, limit, 0);
>>   	if (!p && limit) {
>>   		limit = 0;
>>   		goto again;
>> Index: linux/mm/hugetlb.c
>> ===================================================================
>> --- linux.orig/mm/hugetlb.c
>> +++ linux/mm/hugetlb.c
>> @@ -1188,7 +1188,8 @@ int __weak alloc_bootmem_huge_page(struc
>>   		addr = __alloc_bootmem_node_nopanic(
>>   				NODE_DATA(hstate_next_node_to_alloc(h,
>>   						&node_states[N_MEMORY])),
>> -				huge_page_size(h), huge_page_size(h), 0);
>> +				huge_page_size(h), huge_page_size(h),
>> +				0, NO_ZERO);
>>     		if (addr) {
>>   			/*
>> Index: linux/mm/bootmem.c
>> ===================================================================
>> --- linux.orig/mm/bootmem.c
>> +++ linux/mm/bootmem.c
>> @@ -660,7 +660,7 @@ restart:
>>    * Returns NULL on failure.
>>    */
>>   void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
>> -					unsigned long goal)
>> +					unsigned long goal, u32 flags)
>>   {
>>   	unsigned long limit = 0;
>>   @@ -705,7 +705,8 @@ void * __init __alloc_bootmem(unsigned l
>>     void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
>>   				unsigned long size, unsigned long align,
>> -				unsigned long goal, unsigned long limit)
>> +				unsigned long goal, unsigned long limit,
>> +				u32 flags)
>>   {
>>   	void *ptr;
>>   @@ -734,12 +735,13 @@ again:
>>   }
>>     void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, 
>> unsigned long size,
>> -				   unsigned long align, unsigned long goal)
>> +			unsigned long align, unsigned long goal, u32 flags)
>>   {
>>   	if (WARN_ON_ONCE(slab_is_available()))
>>   		return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
>>   -	return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
>> +	return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal,
>> +					     0, flags);
>>   }
>>     void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long 
>> size,
>> @@ -748,7 +750,7 @@ void * __init ___alloc_bootmem_node(pg_d
>>   {
>>   	void *ptr;
>>   -	ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
>> +	ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0, 0);
>>   	if (ptr)
>>   		return ptr;
>>   Index: linux/mm/page_cgroup.c
>> ===================================================================
>> --- linux.orig/mm/page_cgroup.c
>> +++ linux/mm/page_cgroup.c
>> @@ -55,7 +55,7 @@ static int __init alloc_node_page_cgroup
>>   	table_size = sizeof(struct page_cgroup) * nr_pages;
>>     	base = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
>> -			table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
>> +			table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS), 0);
>>   	if (!base)
>>   		return -ENOMEM;
>>   	NODE_DATA(nid)->node_page_cgroup = base;
>>
>> --
>> To unsubscribe, send a message with 'unsubscribe linux-mm' in
>> the body to majordomo@kvack.org.  For more info on Linux MM,
>> see: http://www.linux-mm.org/ .
>> Don't email: <a href="mailto:dont@kvack.org">email@kvack.org</a>

-- 
Cliff Wickman
SGI
cpw@sgi.com
(651) 683-3824

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org">email@kvack.org</a>

^ permalink raw reply	[flat|nested] 26+ messages in thread

end of thread, other threads:[~2013-04-04 12:16 UTC | newest]

Thread overview: 26+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2013-03-06 21:50 [PATCH] mm, x86: no zeroing of hugetlbfs pages at boot Cliff Wickman
2013-03-06 21:50 ` Cliff Wickman
2013-03-10  5:55 ` Hillf Danton
2013-03-10  5:55   ` Hillf Danton
2013-03-11 12:32   ` Cliff Wickman
2013-03-11 12:32     ` Cliff Wickman
2013-03-14  8:51 ` Michal Hocko
2013-03-14  8:51   ` Michal Hocko
2013-04-03  2:43   ` [PATCH] mm, x86: Do not zero hugetlbfs pages at boot. -v2 Robin Holt
2013-04-03  2:43     ` Robin Holt
2013-04-03 14:00     ` Michal Hocko
2013-04-03 14:00       ` Michal Hocko
2013-04-03 17:21       ` Robin Holt
2013-04-03 17:21         ` Robin Holt
2013-04-04  8:17         ` Michal Hocko
2013-04-04  8:17           ` Michal Hocko
2013-04-03 14:02     ` Michal Hocko
2013-04-03 14:02       ` Michal Hocko
2013-04-03 17:00       ` Robin Holt
2013-04-03 17:00         ` Robin Holt
2013-04-04  8:08         ` Michal Hocko
2013-04-04  8:08           ` Michal Hocko
2013-04-04  0:17 ` [PATCH] mm, x86: no zeroing of hugetlbfs pages at boot Simon Jeons
2013-04-04  0:17   ` Simon Jeons
2013-04-04 12:16   ` Cliff Wickman
2013-04-04 12:16     ` Cliff Wickman

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.