All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] Simple brk allocator for very early allocations
@ 2009-02-28  1:51 Jeremy Fitzhardinge
  2009-02-28  1:51 ` [PATCH] x86: add brk allocation for very, " Jeremy Fitzhardinge
                   ` (5 more replies)
  0 siblings, 6 replies; 43+ messages in thread
From: Jeremy Fitzhardinge @ 2009-02-28  1:51 UTC (permalink / raw)
  To: H. Peter Anvin; +Cc: the arch/x86 maintainers, Linux Kernel Mailing List


This series adds a very simple brk-like allocator for very early
allocations.  By default it extends the bss segment, starting at _end.

This is used to allocate x86-32's initial head_32.S pagetable, removing
init_pg_table_start/end and pg0, replacing them with brk allocations.

dmi_alloc() is also changed to use extend_brk.

	J

^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCH] x86: add brk allocation for very, very early allocations
  2009-02-28  1:51 [PATCH] Simple brk allocator for very early allocations Jeremy Fitzhardinge
@ 2009-02-28  1:51 ` Jeremy Fitzhardinge
  2009-02-28  1:51 ` [PATCH] x86: reserve brk earlier Jeremy Fitzhardinge
                   ` (4 subsequent siblings)
  5 siblings, 0 replies; 43+ messages in thread
From: Jeremy Fitzhardinge @ 2009-02-28  1:51 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: the arch/x86 maintainers, Linux Kernel Mailing List, Jeremy Fitzhardinge

From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>

Add a brk()-like allocator which effectively extends the bss
in order to allow very early code to do dynamic allocations.
This is better than using statically allocated arrays for
data in subsystems which may never get used.

The amount of space available depends on how much the initial
kernel mappings have covered, and so is fairly limited.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
---
 arch/x86/include/asm/setup.h |    5 +++++
 arch/x86/kernel/head32.c     |    2 ++
 arch/x86/kernel/head64.c     |    2 ++
 arch/x86/kernel/setup.c      |   30 +++++++++++++++++++++++++-----
 arch/x86/mm/pageattr.c       |    5 +++--
 arch/x86/xen/enlighten.c     |    3 +++
 6 files changed, 40 insertions(+), 7 deletions(-)

diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index 05c6f6b..fd9b420 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -100,6 +100,11 @@ extern struct boot_params boot_params;
  */
 #define LOWMEMSIZE()	(0x9f000)
 
+/* exceedingly early brk-like allocator */
+extern unsigned long _brk_start, _brk_end;
+void init_brk(unsigned long start);
+void *extend_brk(size_t size, size_t align);
+
 #ifdef __i386__
 
 void __init i386_start_kernel(void);
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index ac108d1..fa9ae31 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -34,6 +34,8 @@ void __init i386_start_kernel(void)
 
 	reserve_ebda_region();
 
+	init_brk((unsigned long)__va(init_pg_tables_end));
+
 	/*
 	 * At this point everything still needed from the boot loader
 	 * or BIOS or kernel text should be early reserved or marked not
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index f5b2722..4b29802 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -91,6 +91,8 @@ void __init x86_64_start_kernel(char * real_mode_data)
 	if (console_loglevel == 10)
 		early_printk("Kernel alive\n");
 
+	init_brk((unsigned long)&_end);
+
 	x86_64_start_reservations(real_mode_data);
 }
 
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 4c54bc0..6a21423 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -113,6 +113,7 @@
 #endif
 
 unsigned int boot_cpu_id __read_mostly;
+__initdata unsigned long _brk_start, _brk_end;
 
 #ifdef CONFIG_X86_64
 int default_cpu_present_to_apicid(int mps_cpu)
@@ -335,6 +336,26 @@ static void __init relocate_initrd(void)
 }
 #endif
 
+void __init init_brk(unsigned long brk)
+{
+	_brk_start = _brk_end = brk;
+}
+
+void * __init extend_brk(size_t size, size_t align)
+{
+	size_t mask = align - 1;
+	void *ret;
+
+	BUG_ON(align & mask);
+
+	_brk_end = (_brk_end + mask) & ~mask;
+
+	ret = (void *)_brk_end;
+	_brk_end += size;
+
+	return ret;
+}
+
 static void __init reserve_initrd(void)
 {
 	u64 ramdisk_image = boot_params.hdr.ramdisk_image;
@@ -715,11 +736,7 @@ void __init setup_arch(char **cmdline_p)
 	init_mm.start_code = (unsigned long) _text;
 	init_mm.end_code = (unsigned long) _etext;
 	init_mm.end_data = (unsigned long) _edata;
-#ifdef CONFIG_X86_32
-	init_mm.brk = init_pg_tables_end + PAGE_OFFSET;
-#else
-	init_mm.brk = (unsigned long) &_end;
-#endif
+	init_mm.brk = _brk_end;
 
 	code_resource.start = virt_to_phys(_text);
 	code_resource.end = virt_to_phys(_etext)-1;
@@ -881,6 +898,9 @@ void __init setup_arch(char **cmdline_p)
 	acpi_numa_init();
 #endif
 
+	if (_brk_end > _brk_start)
+		reserve_early(__pa(_brk_start), __pa(_brk_end), "BRK");
+
 	initmem_init(0, max_pfn);
 
 #ifdef CONFIG_ACPI_SLEEP
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 8253bc9..5b75188 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -16,6 +16,7 @@
 #include <asm/processor.h>
 #include <asm/tlbflush.h>
 #include <asm/sections.h>
+#include <asm/setup.h>
 #include <asm/uaccess.h>
 #include <asm/pgalloc.h>
 #include <asm/proto.h>
@@ -95,7 +96,7 @@ static inline unsigned long highmap_start_pfn(void)
 
 static inline unsigned long highmap_end_pfn(void)
 {
-	return __pa(roundup((unsigned long)_end, PMD_SIZE)) >> PAGE_SHIFT;
+	return __pa(roundup(_brk_end, PMD_SIZE)) >> PAGE_SHIFT;
 }
 
 #endif
@@ -700,7 +701,7 @@ static int cpa_process_alias(struct cpa_data *cpa)
 	 * No need to redo, when the primary call touched the high
 	 * mapping already:
 	 */
-	if (within(vaddr, (unsigned long) _text, (unsigned long) _end))
+	if (within(vaddr, (unsigned long) _text, _brk_end))
 		return 0;
 
 	/*
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index c52f403..a51e595 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -948,6 +948,9 @@ asmlinkage void __init xen_start_kernel(void)
 
 	init_mm.pgd = pgd;
 
+	/* Set up very early brk allocator after Xen pagetables */
+	init_brk(xen_start_info->pt_base + xen_start_info->nr_pt_frames * PAGE_SIZE);
+
 	/* keep using Xen gdt for now; no urgent need to change it */
 
 	pv_info.kernel_rpl = 1;
-- 
1.6.0.6


^ permalink raw reply related	[flat|nested] 43+ messages in thread

* [PATCH] x86: reserve brk earlier
  2009-02-28  1:51 [PATCH] Simple brk allocator for very early allocations Jeremy Fitzhardinge
  2009-02-28  1:51 ` [PATCH] x86: add brk allocation for very, " Jeremy Fitzhardinge
@ 2009-02-28  1:51 ` Jeremy Fitzhardinge
  2009-02-28  1:51 ` [PATCH] x86-32: use brk segment for allocating initial kernel pagetable Jeremy Fitzhardinge
                   ` (3 subsequent siblings)
  5 siblings, 0 replies; 43+ messages in thread
From: Jeremy Fitzhardinge @ 2009-02-28  1:51 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: the arch/x86 maintainers, Linux Kernel Mailing List, Jeremy Fitzhardinge

From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>

We need to reserve the brk segment before doing the pagetable
construction, so that it doesn't allocate the same memory.

Also, pre-initialize brk to _end (but still leave init_brk for cases
where that's not correct, like i386 (for now) and Xen).

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
---
 arch/x86/kernel/head64.c |    2 --
 arch/x86/kernel/setup.c  |   16 ++++++++++++----
 2 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 4b29802..f5b2722 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -91,8 +91,6 @@ void __init x86_64_start_kernel(char * real_mode_data)
 	if (console_loglevel == 10)
 		early_printk("Kernel alive\n");
 
-	init_brk((unsigned long)&_end);
-
 	x86_64_start_reservations(real_mode_data);
 }
 
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 6a21423..c246dc4 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -113,7 +113,8 @@
 #endif
 
 unsigned int boot_cpu_id __read_mostly;
-__initdata unsigned long _brk_start, _brk_end;
+__initdata unsigned long _brk_start = (unsigned long)&_end;
+__initdata unsigned long _brk_end = (unsigned long)&_end;
 
 #ifdef CONFIG_X86_64
 int default_cpu_present_to_apicid(int mps_cpu)
@@ -356,6 +357,14 @@ void * __init extend_brk(size_t size, size_t align)
 	return ret;
 }
 
+static void __init reserve_brk(void)
+{
+	if (_brk_end > _brk_start)
+		reserve_early(__pa(_brk_start), __pa(_brk_end), "BRK");
+
+	_brk_end = _brk_start = 0xdeadb00c + POISON_POINTER_DELTA;
+}
+
 static void __init reserve_initrd(void)
 {
 	u64 ramdisk_image = boot_params.hdr.ramdisk_image;
@@ -856,6 +865,8 @@ void __init setup_arch(char **cmdline_p)
 	setup_bios_corruption_check();
 #endif
 
+	reserve_brk();
+
 	/* max_pfn_mapped is updated here */
 	max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT);
 	max_pfn_mapped = max_low_pfn_mapped;
@@ -898,9 +909,6 @@ void __init setup_arch(char **cmdline_p)
 	acpi_numa_init();
 #endif
 
-	if (_brk_end > _brk_start)
-		reserve_early(__pa(_brk_start), __pa(_brk_end), "BRK");
-
 	initmem_init(0, max_pfn);
 
 #ifdef CONFIG_ACPI_SLEEP
-- 
1.6.0.6


^ permalink raw reply related	[flat|nested] 43+ messages in thread

* [PATCH] x86-32: use brk segment for allocating initial kernel pagetable
  2009-02-28  1:51 [PATCH] Simple brk allocator for very early allocations Jeremy Fitzhardinge
  2009-02-28  1:51 ` [PATCH] x86: add brk allocation for very, " Jeremy Fitzhardinge
  2009-02-28  1:51 ` [PATCH] x86: reserve brk earlier Jeremy Fitzhardinge
@ 2009-02-28  1:51 ` Jeremy Fitzhardinge
  2009-02-28  7:02   ` Yinghai Lu
  2009-02-28  1:51 ` [PATCH] x86: use brk allocation for DMI Jeremy Fitzhardinge
                   ` (2 subsequent siblings)
  5 siblings, 1 reply; 43+ messages in thread
From: Jeremy Fitzhardinge @ 2009-02-28  1:51 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: the arch/x86 maintainers, Linux Kernel Mailing List, Jeremy Fitzhardinge

From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>

Rather than having special purpose init_pg_table_start/end variables
to delimit the kernel pagetable built by head_32.S, just use the brk
mechanism to extend the bss for the new pagetable.

This patch removes init_pg_table_start/end and pg0, defines __brk_base
(which is page-aligned and immediately follows _end), initializes
the brk region to start there, and uses it for the 32-bit pagetable.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
---
 arch/x86/include/asm/pgtable_32.h |    3 ---
 arch/x86/include/asm/setup.h      |    3 ---
 arch/x86/kernel/head32.c          |    5 -----
 arch/x86/kernel/head_32.S         |   14 +++++++-------
 arch/x86/kernel/setup.c           |   12 ++++--------
 arch/x86/kernel/vmlinux_32.lds.S  |    3 +--
 arch/x86/kernel/vmlinux_64.lds.S  |    2 ++
 arch/x86/lguest/boot.c            |    8 --------
 arch/x86/xen/mmu.c                |    8 +++++---
 9 files changed, 19 insertions(+), 39 deletions(-)

diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h
index 97612fc..31bd120 100644
--- a/arch/x86/include/asm/pgtable_32.h
+++ b/arch/x86/include/asm/pgtable_32.h
@@ -42,9 +42,6 @@ extern void set_pmd_pfn(unsigned long, unsigned long, pgprot_t);
  */
 #undef TEST_ACCESS_OK
 
-/* The boot page tables (all created as a single array) */
-extern unsigned long pg0[];
-
 #ifdef CONFIG_X86_PAE
 # include <asm/pgtable-3level.h>
 #else
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index fd9b420..2ca00a9 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -110,9 +110,6 @@ void *extend_brk(size_t size, size_t align);
 void __init i386_start_kernel(void);
 extern void probe_roms(void);
 
-extern unsigned long init_pg_tables_start;
-extern unsigned long init_pg_tables_end;
-
 #else
 void __init x86_64_start_kernel(char *real_mode);
 void __init x86_64_start_reservations(char *real_mode_data);
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index fa9ae31..b6caa9e 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -29,13 +29,8 @@ void __init i386_start_kernel(void)
 		reserve_early(ramdisk_image, ramdisk_end, "RAMDISK");
 	}
 #endif
-	reserve_early(init_pg_tables_start, init_pg_tables_end,
-			"INIT_PG_TABLE");
-
 	reserve_ebda_region();
 
-	init_brk((unsigned long)__va(init_pg_tables_end));
-
 	/*
 	 * At this point everything still needed from the boot loader
 	 * or BIOS or kernel text should be early reserved or marked not
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 6219259..d243437 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -167,7 +167,7 @@ num_subarch_entries = (. - subarch_entries) / 4
 /*
  * Initialize page tables.  This creates a PDE and a set of page
  * tables, which are located immediately beyond _end.  The variable
- * init_pg_tables_end is set up to point to the first "safe" location.
+ * _brk_end is set up to point to the first "safe" location.
  * Mappings are created both at virtual address 0 (identity mapping)
  * and PAGE_OFFSET for up to _end+sizeof(page tables)+INIT_MAP_BEYOND_END.
  *
@@ -190,8 +190,7 @@ default_entry:
 
 	xorl %ebx,%ebx				/* %ebx is kept at zero */
 
-	movl $pa(pg0), %edi
-	movl %edi, pa(init_pg_tables_start)
+	movl $pa(__brk_base), %edi
 	movl $pa(swapper_pg_pmd), %edx
 	movl $PTE_IDENT_ATTR, %eax
 10:
@@ -216,7 +215,8 @@ default_entry:
 	cmpl %ebp,%eax
 	jb 10b
 1:
-	movl %edi,pa(init_pg_tables_end)
+	addl $__PAGE_OFFSET, %edi
+	movl %edi, pa(_brk_end)
 	shrl $12, %eax
 	movl %eax, pa(max_pfn_mapped)
 
@@ -227,8 +227,7 @@ default_entry:
 
 page_pde_offset = (__PAGE_OFFSET >> 20);
 
-	movl $pa(pg0), %edi
-	movl %edi, pa(init_pg_tables_start)
+	movl $pa(__brk_base), %edi
 	movl $pa(swapper_pg_dir), %edx
 	movl $PTE_IDENT_ATTR, %eax
 10:
@@ -249,7 +248,8 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
 	leal (INIT_MAP_BEYOND_END+PTE_IDENT_ATTR)(%edi),%ebp
 	cmpl %ebp,%eax
 	jb 10b
-	movl %edi,pa(init_pg_tables_end)
+	addl $__PAGE_OFFSET, %edi
+	movl %edi, pa(_brk_end)
 	shrl $12, %eax
 	movl %eax, pa(max_pfn_mapped)
 
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index c246dc4..ed02176 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -113,8 +113,10 @@
 #endif
 
 unsigned int boot_cpu_id __read_mostly;
-__initdata unsigned long _brk_start = (unsigned long)&_end;
-__initdata unsigned long _brk_end = (unsigned long)&_end;
+
+extern char __brk_base[];
+__initdata unsigned long _brk_start = (unsigned long)__brk_base;
+__initdata unsigned long _brk_end = (unsigned long)&__brk_base;
 
 #ifdef CONFIG_X86_64
 int default_cpu_present_to_apicid(int mps_cpu)
@@ -160,12 +162,6 @@ static struct resource bss_resource = {
 
 
 #ifdef CONFIG_X86_32
-/* This value is set up by the early boot code to point to the value
-   immediately after the boot time page tables.  It contains a *physical*
-   address, and must not be in the .bss segment! */
-unsigned long init_pg_tables_start __initdata = ~0UL;
-unsigned long init_pg_tables_end __initdata = ~0UL;
-
 static struct resource video_ram_resource = {
 	.name	= "Video RAM area",
 	.start	= 0xa0000,
diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S
index 0d86096..2f772dc 100644
--- a/arch/x86/kernel/vmlinux_32.lds.S
+++ b/arch/x86/kernel/vmlinux_32.lds.S
@@ -190,9 +190,8 @@ SECTIONS
 	. = ALIGN(4);
 	__bss_stop = .;
   	_end = . ;
-	/* This is where the kernel creates the early boot page tables */
 	. = ALIGN(PAGE_SIZE);
-	pg0 = . ;
+	__brk_base = . ;
   }
 
   /* Sections to be discarded */
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
index fbfced6..4878198 100644
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ b/arch/x86/kernel/vmlinux_64.lds.S
@@ -245,6 +245,8 @@ SECTIONS
   __bss_stop = .;
 
   _end = . ;
+  . = ALIGN(PAGE_SIZE);
+  __brk_base = . ;
 
   /* Sections to be discarded */
   /DISCARD/ : {
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index f3a5305..ba2c2da 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -1051,14 +1051,6 @@ __init void lguest_init(void)
 	 * lguest_init() where the rest of the fairly chaotic boot setup
 	 * occurs. */
 
-	/* The native boot code sets up initial page tables immediately after
-	 * the kernel itself, and sets init_pg_tables_end so they're not
-	 * clobbered.  The Launcher places our initial pagetables somewhere at
-	 * the top of our physical memory, so we don't need extra space: set
-	 * init_pg_tables_end to the end of the kernel. */
-	init_pg_tables_start = __pa(pg0);
-	init_pg_tables_end = __pa(pg0);
-
 	/* As described in head_32.S, we map the first 128M of memory. */
 	max_pfn_mapped = (128*1024*1024) >> PAGE_SHIFT;
 
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 319bd40..cf16959 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1716,9 +1716,11 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
 {
 	pmd_t *kernel_pmd;
 
-	init_pg_tables_start = __pa(pgd);
-	init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE;
-	max_pfn_mapped = PFN_DOWN(init_pg_tables_end + 512*1024);
+	/*
+	 * We already set _brk_start to point to the end of the
+	 * Xen-provided pagetables.
+	 */
+	max_pfn_mapped = PFN_DOWN(_brk_start + 512*1024);
 
 	kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
 	memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
-- 
1.6.0.6


^ permalink raw reply related	[flat|nested] 43+ messages in thread

* [PATCH] x86: use brk allocation for DMI
  2009-02-28  1:51 [PATCH] Simple brk allocator for very early allocations Jeremy Fitzhardinge
                   ` (2 preceding siblings ...)
  2009-02-28  1:51 ` [PATCH] x86-32: use brk segment for allocating initial kernel pagetable Jeremy Fitzhardinge
@ 2009-02-28  1:51 ` Jeremy Fitzhardinge
  2009-02-28  1:51 ` [PATCH] x86: leave _brk_end defined Jeremy Fitzhardinge
  2009-02-28  5:23 ` [PATCH] Simple brk allocator for very early allocations Andrew Morton
  5 siblings, 0 replies; 43+ messages in thread
From: Jeremy Fitzhardinge @ 2009-02-28  1:51 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: the arch/x86 maintainers, Linux Kernel Mailing List, Jeremy Fitzhardinge

From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>

Use extend_brk() to allocate memory for DMI rather than having an
ad-hoc allocator.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
---
 arch/x86/include/asm/dmi.h |   14 ++------------
 arch/x86/kernel/setup.c    |    6 ------
 2 files changed, 2 insertions(+), 18 deletions(-)

diff --git a/arch/x86/include/asm/dmi.h b/arch/x86/include/asm/dmi.h
index bc68212..aa32f7e 100644
--- a/arch/x86/include/asm/dmi.h
+++ b/arch/x86/include/asm/dmi.h
@@ -2,21 +2,11 @@
 #define _ASM_X86_DMI_H
 
 #include <asm/io.h>
+#include <asm/setup.h>
 
-#define DMI_MAX_DATA 2048
-
-extern int dmi_alloc_index;
-extern char dmi_alloc_data[DMI_MAX_DATA];
-
-/* This is so early that there is no good way to allocate dynamic memory.
-   Allocate data in an BSS array. */
 static inline void *dmi_alloc(unsigned len)
 {
-	int idx = dmi_alloc_index;
-	if ((dmi_alloc_index + len) > DMI_MAX_DATA)
-		return NULL;
-	dmi_alloc_index += len;
-	return dmi_alloc_data + idx;
+	return extend_brk(len, sizeof(int));
 }
 
 /* Use early IO mappings for DMI because it's initialized early */
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index ed02176..1d4fe19 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -215,12 +215,6 @@ unsigned long mmu_cr4_features = X86_CR4_PAE;
 int bootloader_type;
 
 /*
- * Early DMI memory
- */
-int dmi_alloc_index;
-char dmi_alloc_data[DMI_MAX_DATA];
-
-/*
  * Setup options
  */
 struct screen_info screen_info;
-- 
1.6.0.6


^ permalink raw reply related	[flat|nested] 43+ messages in thread

* [PATCH] x86: leave _brk_end defined
  2009-02-28  1:51 [PATCH] Simple brk allocator for very early allocations Jeremy Fitzhardinge
                   ` (3 preceding siblings ...)
  2009-02-28  1:51 ` [PATCH] x86: use brk allocation for DMI Jeremy Fitzhardinge
@ 2009-02-28  1:51 ` Jeremy Fitzhardinge
  2009-02-28  5:23 ` [PATCH] Simple brk allocator for very early allocations Andrew Morton
  5 siblings, 0 replies; 43+ messages in thread
From: Jeremy Fitzhardinge @ 2009-02-28  1:51 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: the arch/x86 maintainers, Linux Kernel Mailing List, Jeremy Fitzhardinge

From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>

highmap_end_pfn refers to _brk_end, so leave it as normal data with a
meaningful value.  Make extend_brk() check for a NULL _brk_start to
look for uses after the brk memory has been reserved and locked down.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
---
 arch/x86/kernel/setup.c |    5 +++--
 1 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 1d4fe19..11c4198 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -116,7 +116,7 @@ unsigned int boot_cpu_id __read_mostly;
 
 extern char __brk_base[];
 __initdata unsigned long _brk_start = (unsigned long)__brk_base;
-__initdata unsigned long _brk_end = (unsigned long)&__brk_base;
+unsigned long _brk_end = (unsigned long)&__brk_base;
 
 #ifdef CONFIG_X86_64
 int default_cpu_present_to_apicid(int mps_cpu)
@@ -337,6 +337,7 @@ void * __init extend_brk(size_t size, size_t align)
 	size_t mask = align - 1;
 	void *ret;
 
+	BUG_ON(_brk_start == 0);
 	BUG_ON(align & mask);
 
 	_brk_end = (_brk_end + mask) & ~mask;
@@ -352,7 +353,7 @@ static void __init reserve_brk(void)
 	if (_brk_end > _brk_start)
 		reserve_early(__pa(_brk_start), __pa(_brk_end), "BRK");
 
-	_brk_end = _brk_start = 0xdeadb00c + POISON_POINTER_DELTA;
+	_brk_start = 0;
 }
 
 static void __init reserve_initrd(void)
-- 
1.6.0.6


^ permalink raw reply related	[flat|nested] 43+ messages in thread

* Re: [PATCH] Simple brk allocator for very early allocations
  2009-02-28  1:51 [PATCH] Simple brk allocator for very early allocations Jeremy Fitzhardinge
                   ` (4 preceding siblings ...)
  2009-02-28  1:51 ` [PATCH] x86: leave _brk_end defined Jeremy Fitzhardinge
@ 2009-02-28  5:23 ` Andrew Morton
  2009-02-28  6:30   ` Jeremy Fitzhardinge
  5 siblings, 1 reply; 43+ messages in thread
From: Andrew Morton @ 2009-02-28  5:23 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: H. Peter Anvin, the arch/x86 maintainers, Linux Kernel Mailing List

On Fri, 27 Feb 2009 17:51:17 -0800 Jeremy Fitzhardinge <jeremy@goop.org> wrote:

> This series adds a very simple brk-like allocator for very early
> allocations.  By default it extends the bss segment, starting at _end.
> 
> This is used to allocate x86-32's initial head_32.S pagetable, removing
> init_pg_table_start/end and pg0, replacing them with brk allocations.

Changelog fails to provide a reason for this?

> dmi_alloc() is also changed to use extend_brk.

Seems a large patchset just to clean up DMI a bit ;)

I assume that xen needs this?  domU or dom0?


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH] Simple brk allocator for very early allocations
  2009-02-28  5:23 ` [PATCH] Simple brk allocator for very early allocations Andrew Morton
@ 2009-02-28  6:30   ` Jeremy Fitzhardinge
  0 siblings, 0 replies; 43+ messages in thread
From: Jeremy Fitzhardinge @ 2009-02-28  6:30 UTC (permalink / raw)
  To: Andrew Morton
  Cc: H. Peter Anvin, the arch/x86 maintainers, Linux Kernel Mailing List

Andrew Morton wrote:
> On Fri, 27 Feb 2009 17:51:17 -0800 Jeremy Fitzhardinge <jeremy@goop.org> wrote:
>
>   
>> This series adds a very simple brk-like allocator for very early
>> allocations.  By default it extends the bss segment, starting at _end.
>>
>> This is used to allocate x86-32's initial head_32.S pagetable, removing
>> init_pg_table_start/end and pg0, replacing them with brk allocations.
>>     
>
> Changelog fails to provide a reason for this?
>   

Well, the whole thing is just a generalization of what the 32-bit 
pagetable builder does anyway, to make it more useful.  With the brk 
allocator in place, there's no reason for head_32.S to do it again.

>> dmi_alloc() is also changed to use extend_brk.
>>     
>
> Seems a large patchset just to clean up DMI a bit ;)
>   

A few patches could be folded together.  But it's pretty small really...

> I assume that xen needs this?  domU or dom0?

I can make use of it in Xen to remove a bunch of static arrays.  I'm 
pretty sure there's quite a few places around the kernel which could 
make use of this facility.  My kernel has 900k of bss; how much of that 
is stuff that 1) could be allocated, and 2) not actually being used at 
runtime?  A lot of things which are compile-time sized, hash tables, log 
buffers, etc, could be runtime sized instead.

    J

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH] x86-32: use brk segment for allocating initial kernel  pagetable
  2009-02-28  1:51 ` [PATCH] x86-32: use brk segment for allocating initial kernel pagetable Jeremy Fitzhardinge
@ 2009-02-28  7:02   ` Yinghai Lu
  2009-02-28  7:05     ` J Jeremy Fitzhardinge
  0 siblings, 1 reply; 43+ messages in thread
From: Yinghai Lu @ 2009-02-28  7:02 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: H. Peter Anvin, the arch/x86 maintainers,
	Linux Kernel Mailing List, Jeremy Fitzhardinge

On Fri, Feb 27, 2009 at 5:51 PM, Jeremy Fitzhardinge <jeremy@goop.org> wrote:
> From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
>
> Rather than having special purpose init_pg_table_start/end variables
> to delimit the kernel pagetable built by head_32.S, just use the brk
> mechanism to extend the bss for the new pagetable.
>
> This patch removes init_pg_table_start/end and pg0, defines __brk_base
> (which is page-aligned and immediately follows _end), initializes
> the brk region to start there, and uses it for the 32-bit pagetable.
...

> diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
> index c246dc4..ed02176 100644
> --- a/arch/x86/kernel/setup.c
> +++ b/arch/x86/kernel/setup.c
> @@ -113,8 +113,10 @@
>  #endif
>
>  unsigned int boot_cpu_id __read_mostly;
> -__initdata unsigned long _brk_start = (unsigned long)&_end;
> -__initdata unsigned long _brk_end = (unsigned long)&_end;
> +
> +extern char __brk_base[];
> +__initdata unsigned long _brk_start = (unsigned long)__brk_base;
> +__initdata unsigned long _brk_end = (unsigned long)&__brk_base;

?

YH

^ permalink raw reply	[flat|nested] 43+ messages in thread

* J
  2009-02-28  7:02   ` Yinghai Lu
@ 2009-02-28  7:05     ` Jeremy Fitzhardinge
  2009-02-28  7:15       ` J Ingo Molnar
  2009-02-28  7:30       ` J Yinghai Lu
  0 siblings, 2 replies; 43+ messages in thread
From: Jeremy Fitzhardinge @ 2009-02-28  7:05 UTC (permalink / raw)
  To: Yinghai Lu
  Cc: H. Peter Anvin, the arch/x86 maintainers,
	Linux Kernel Mailing List, Jeremy Fitzhardinge

Yinghai Lu wrote:
> On Fri, Feb 27, 2009 at 5:51 PM, Jeremy Fitzhardinge <jeremy@goop.org> wrote:
>   
>> From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
>>
>> Rather than having special purpose init_pg_table_start/end variables
>> to delimit the kernel pagetable built by head_32.S, just use the brk
>> mechanism to extend the bss for the new pagetable.
>>
>> This patch removes init_pg_table_start/end and pg0, defines __brk_base
>> (which is page-aligned and immediately follows _end), initializes
>> the brk region to start there, and uses it for the 32-bit pagetable.
>>     
> ...
>
>   
>> diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
>> index c246dc4..ed02176 100644
>> --- a/arch/x86/kernel/setup.c
>> +++ b/arch/x86/kernel/setup.c
>> @@ -113,8 +113,10 @@
>>  #endif
>>
>>  unsigned int boot_cpu_id __read_mostly;
>> -__initdata unsigned long _brk_start = (unsigned long)&_end;
>> -__initdata unsigned long _brk_end = (unsigned long)&_end;
>> +
>> +extern char __brk_base[];
>> +__initdata unsigned long _brk_start = (unsigned long)__brk_base;
>> +__initdata unsigned long _brk_end = (unsigned long)&__brk_base;
>>     
>
> ?
>   

What are you asking?  __brk_base is _end rounded up to a page boundary, 
so head_32.S can use it directly for pagetable allocation.  Are you 
flagging the '&' typo?  Something else?

    J

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: J
  2009-02-28  7:05     ` J Jeremy Fitzhardinge
@ 2009-02-28  7:15       ` Ingo Molnar
  2009-02-28  7:39         ` does boot loader check uncompressed kernel size? Yinghai Lu
  2009-02-28  8:17         ` J Jeremy Fitzhardinge
  2009-02-28  7:30       ` J Yinghai Lu
  1 sibling, 2 replies; 43+ messages in thread
From: Ingo Molnar @ 2009-02-28  7:15 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: Yinghai Lu, H. Peter Anvin, the arch/x86 maintainers,
	Linux Kernel Mailing List, Jeremy Fitzhardinge


* Jeremy Fitzhardinge <jeremy@goop.org> wrote:

> Yinghai Lu wrote:
>> On Fri, Feb 27, 2009 at 5:51 PM, Jeremy Fitzhardinge <jeremy@goop.org> wrote:
>>   
>>> From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
>>>
>>> Rather than having special purpose init_pg_table_start/end variables
>>> to delimit the kernel pagetable built by head_32.S, just use the brk
>>> mechanism to extend the bss for the new pagetable.
>>>
>>> This patch removes init_pg_table_start/end and pg0, defines __brk_base
>>> (which is page-aligned and immediately follows _end), initializes
>>> the brk region to start there, and uses it for the 32-bit pagetable.
>>>     
>> ...
>>
>>   
>>> diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
>>> index c246dc4..ed02176 100644
>>> --- a/arch/x86/kernel/setup.c
>>> +++ b/arch/x86/kernel/setup.c
>>> @@ -113,8 +113,10 @@
>>>  #endif
>>>
>>>  unsigned int boot_cpu_id __read_mostly;
>>> -__initdata unsigned long _brk_start = (unsigned long)&_end;
>>> -__initdata unsigned long _brk_end = (unsigned long)&_end;
>>> +
>>> +extern char __brk_base[];
>>> +__initdata unsigned long _brk_start = (unsigned long)__brk_base;
>>> +__initdata unsigned long _brk_end = (unsigned long)&__brk_base;
>>>     
>>
>> ?
>>   
>
> What are you asking?  __brk_base is _end rounded up to a page 
> boundary, so head_32.S can use it directly for pagetable 
> allocation.  Are you flagging the '&' typo?  Something else?

it's not really a typo either (__brk_base and &__brk_base should 
be the same) - just a bit weird looking.

	Ingo

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: J
  2009-02-28  7:05     ` J Jeremy Fitzhardinge
  2009-02-28  7:15       ` J Ingo Molnar
@ 2009-02-28  7:30       ` Yinghai Lu
  1 sibling, 0 replies; 43+ messages in thread
From: Yinghai Lu @ 2009-02-28  7:30 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: H. Peter Anvin, the arch/x86 maintainers,
	Linux Kernel Mailing List, Jeremy Fitzhardinge

Jeremy Fitzhardinge wrote:
> Yinghai Lu wrote:
>> On Fri, Feb 27, 2009 at 5:51 PM, Jeremy Fitzhardinge <jeremy@goop.org>
>> wrote:
>>  
>>> From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
>>>
>>> Rather than having special purpose init_pg_table_start/end variables
>>> to delimit the kernel pagetable built by head_32.S, just use the brk
>>> mechanism to extend the bss for the new pagetable.
>>>
>>> This patch removes init_pg_table_start/end and pg0, defines __brk_base
>>> (which is page-aligned and immediately follows _end), initializes
>>> the brk region to start there, and uses it for the 32-bit pagetable.
>>>     
>> ...
>>
>>  
>>> diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
>>> index c246dc4..ed02176 100644
>>> --- a/arch/x86/kernel/setup.c
>>> +++ b/arch/x86/kernel/setup.c
>>> @@ -113,8 +113,10 @@
>>>  #endif
>>>
>>>  unsigned int boot_cpu_id __read_mostly;
>>> -__initdata unsigned long _brk_start = (unsigned long)&_end;
>>> -__initdata unsigned long _brk_end = (unsigned long)&_end;
>>> +
>>> +extern char __brk_base[];
>>> +__initdata unsigned long _brk_start = (unsigned long)__brk_base;
>>> +__initdata unsigned long _brk_end = (unsigned long)&__brk_base;
>>>     
>>
>> ?
>>   
> 
> What are you asking?  __brk_base is _end rounded up to a page boundary,
> so head_32.S can use it directly for pagetable allocation.  Are you
> flagging the '&' typo?  Something else?

why does start not have '&', but end does?

YH

^ permalink raw reply	[flat|nested] 43+ messages in thread

* does boot loader check uncompressed kernel size?
  2009-02-28  7:15       ` J Ingo Molnar
@ 2009-02-28  7:39         ` Yinghai Lu
  2009-02-28  7:47           ` Cyrill Gorcunov
                             ` (2 more replies)
  2009-02-28  8:17         ` J Jeremy Fitzhardinge
  1 sibling, 3 replies; 43+ messages in thread
From: Yinghai Lu @ 2009-02-28  7:39 UTC (permalink / raw)
  To: Ingo Molnar, H. Peter Anvin
  Cc: Jeremy Fitzhardinge, the arch/x86 maintainers,
	Linux Kernel Mailing List, Jeremy Fitzhardinge

wonder if the boot loader checks the uncompressed size (aka vmlinux size) in the bzImage
before it finds a good position for the bzImage...?

YH

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: does boot loader check uncompressed kernel size?
  2009-02-28  7:39         ` does boot loader check uncompressed kernel size? Yinghai Lu
@ 2009-02-28  7:47           ` Cyrill Gorcunov
  2009-02-28  7:54             ` Yinghai Lu
  2009-02-28  7:52           ` brk patches Yinghai Lu
  2009-02-28  8:07           ` does boot loader check uncompressed kernel size? H. Peter Anvin
  2 siblings, 1 reply; 43+ messages in thread
From: Cyrill Gorcunov @ 2009-02-28  7:47 UTC (permalink / raw)
  To: Yinghai Lu
  Cc: Ingo Molnar, H. Peter Anvin, Jeremy Fitzhardinge,
	the arch/x86 maintainers, Linux Kernel Mailing List,
	Jeremy Fitzhardinge

[Yinghai Lu - Fri, Feb 27, 2009 at 11:39:06PM -0800]
| wonder if boot loader check uncompress size aka vmlinux size in bzImage
| before it find one good position for bzImage...?
| 
| YH
| 

At least for x86-64 in grub-1.96 I didn't find such a check.
Btw, but why should it care? Or you mean something else?

	- Cyrill -

^ permalink raw reply	[flat|nested] 43+ messages in thread

* brk patches..
  2009-02-28  7:39         ` does boot loader check uncompressed kernel size? Yinghai Lu
  2009-02-28  7:47           ` Cyrill Gorcunov
@ 2009-02-28  7:52           ` Yinghai Lu
  2009-02-28  8:08             ` H. Peter Anvin
  2009-02-28  8:17             ` Jeremy Fitzhardinge
  2009-02-28  8:07           ` does boot loader check uncompressed kernel size? H. Peter Anvin
  2 siblings, 2 replies; 43+ messages in thread
From: Yinghai Lu @ 2009-02-28  7:52 UTC (permalink / raw)
  To: Ingo Molnar, H. Peter Anvin
  Cc: Jeremy Fitzhardinge, the arch/x86 maintainers,
	Linux Kernel Mailing List, Jeremy Fitzhardinge

Yinghai Lu wrote:
> wonder if boot loader check uncompress size aka vmlinux size in bzImage
> before it find one good position for bzImage...?
> 

Jeremy's brk patches may break:
1. kexec load 64bit vmlinux on some ram that near the memory hole etc.
blindly to use ram after _end may have some problem
2. coreboot aka linux is using elf (by mkelfImage : vmlinux + initrd), initrd became one section after _end...
could cause initrd get overwrite... by extend _brk

YH

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: does boot loader check uncompressed kernel size?
  2009-02-28  7:47           ` Cyrill Gorcunov
@ 2009-02-28  7:54             ` Yinghai Lu
  2009-02-28  8:08               ` H. Peter Anvin
  0 siblings, 1 reply; 43+ messages in thread
From: Yinghai Lu @ 2009-02-28  7:54 UTC (permalink / raw)
  To: Cyrill Gorcunov
  Cc: Ingo Molnar, H. Peter Anvin, Jeremy Fitzhardinge,
	the arch/x86 maintainers, Linux Kernel Mailing List,
	Jeremy Fitzhardinge

On Fri, Feb 27, 2009 at 11:47 PM, Cyrill Gorcunov <gorcunov@gmail.com> wrote:
> [Yinghai Lu - Fri, Feb 27, 2009 at 11:39:06PM -0800]
> | wonder if boot loader check uncompress size aka vmlinux size in bzImage
> | before it find one good position for bzImage...?
> |
> | YH
> |
>
> At least for x86-64 in grub-1.96 I didn't find such a check.
> Btw, but why should it care? Or you mean something else?

thinking about Jeremy's brk patches that may use ram after _end blindly...

YH

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: does boot loader check uncompressed kernel size?
  2009-02-28  7:39         ` does boot loader check uncompressed kernel size? Yinghai Lu
  2009-02-28  7:47           ` Cyrill Gorcunov
  2009-02-28  7:52           ` brk patches Yinghai Lu
@ 2009-02-28  8:07           ` H. Peter Anvin
  2 siblings, 0 replies; 43+ messages in thread
From: H. Peter Anvin @ 2009-02-28  8:07 UTC (permalink / raw)
  To: Yinghai Lu
  Cc: Ingo Molnar, Jeremy Fitzhardinge, the arch/x86 maintainers,
	Linux Kernel Mailing List, Jeremy Fitzhardinge

Yinghai Lu wrote:
> wonder if boot loader check uncompress size aka vmlinux size in bzImage
> before it find one good position for bzImage...?

Most boot loaders load the bzImage to a single standard address, 0x100000.

	-hpa

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: does boot loader check uncompressed kernel size?
  2009-02-28  7:54             ` Yinghai Lu
@ 2009-02-28  8:08               ` H. Peter Anvin
  2009-02-28 20:42                 ` Yinghai Lu
  0 siblings, 1 reply; 43+ messages in thread
From: H. Peter Anvin @ 2009-02-28  8:08 UTC (permalink / raw)
  To: Yinghai Lu
  Cc: Cyrill Gorcunov, Ingo Molnar, Jeremy Fitzhardinge,
	the arch/x86 maintainers, Linux Kernel Mailing List,
	Jeremy Fitzhardinge

Yinghai Lu wrote:
> On Fri, Feb 27, 2009 at 11:47 PM, Cyrill Gorcunov <gorcunov@gmail.com> wrote:
>> [Yinghai Lu - Fri, Feb 27, 2009 at 11:39:06PM -0800]
>> | wonder if boot loader check uncompress size aka vmlinux size in bzImage
>> | before it find one good position for bzImage...?
>> |
>> | YH
>> |
>>
>> At least for x86-64 in grub-1.96 I didn't find such a check.
>> Btw, but why should it care? Or you mean something else?
> 
> thinking about Jeremy's brk patches that may use ram after _end blindly...
> 

We already do that.  Jeremy's brk patches just formalizes it.

	-hpa


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: brk patches..
  2009-02-28  7:52           ` brk patches Yinghai Lu
@ 2009-02-28  8:08             ` H. Peter Anvin
  2009-02-28  8:17             ` Jeremy Fitzhardinge
  1 sibling, 0 replies; 43+ messages in thread
From: H. Peter Anvin @ 2009-02-28  8:08 UTC (permalink / raw)
  To: Yinghai Lu
  Cc: Ingo Molnar, Jeremy Fitzhardinge, the arch/x86 maintainers,
	Linux Kernel Mailing List, Jeremy Fitzhardinge

Yinghai Lu wrote:
> Yinghai Lu wrote:
>> wonder if boot loader check uncompress size aka vmlinux size in bzImage
>> before it find one good position for bzImage...?
>>
> 
> Jeremy's brk patches may break:
> 1. kexec load 64bit vmlinux on some ram that near the memory hole etc.
> blindly to use ram after _end may have some problem
> 2. coreboot aka linux is using elf (by mkelfImage : vmlinux + initrd), initrd became one section after _end...
> could cause initrd get overwrite... by extend _brk
> 

Both of these are already invalid use.

	-hpa

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: J
  2009-02-28  7:15       ` J Ingo Molnar
  2009-02-28  7:39         ` does boot loader check uncompressed kernel size? Yinghai Lu
@ 2009-02-28  8:17         ` Jeremy Fitzhardinge
  1 sibling, 0 replies; 43+ messages in thread
From: Jeremy Fitzhardinge @ 2009-02-28  8:17 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Yinghai Lu, H. Peter Anvin, the arch/x86 maintainers,
	Linux Kernel Mailing List, Jeremy Fitzhardinge

Ingo Molnar wrote:
>> What are you asking?  __brk_base is _end rounded up to a page 
>> boundary, so head_32.S can use it directly for pagetable 
>> allocation.  Are you flagging the '&' typo?  Something else?
>>     
>
> it's not really a typo either (__brk_base and &__brk_base should 
> be the same) - just a bit weird looking.

It's a typo in that I didn't mean it to come out like that.  But yes, it 
makes no difference.

    J

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: brk patches..
  2009-02-28  7:52           ` brk patches Yinghai Lu
  2009-02-28  8:08             ` H. Peter Anvin
@ 2009-02-28  8:17             ` Jeremy Fitzhardinge
  2009-02-28 20:40               ` Yinghai Lu
  2009-03-01  1:23               ` [PATCH] x86: put initial_pg_tables into bss Yinghai Lu
  1 sibling, 2 replies; 43+ messages in thread
From: Jeremy Fitzhardinge @ 2009-02-28  8:17 UTC (permalink / raw)
  To: Yinghai Lu
  Cc: Ingo Molnar, H. Peter Anvin, the arch/x86 maintainers,
	Linux Kernel Mailing List, Jeremy Fitzhardinge

Yinghai Lu wrote:
> Yinghai Lu wrote:
>   
>> wonder if boot loader check uncompress size aka vmlinux size in bzImage
>> before it find one good position for bzImage...?
>>
>>     
>
> Jeremy's brk patches may break:
> 1. kexec load 64bit vmlinux on some ram that near the memory hole etc.
> blindly to use ram after _end may have some problem
> 2. coreboot aka linux is using elf (by mkelfImage : vmlinux + initrd), initrd became one section after _end...
> could cause initrd get overwrite... by extend _brk
>   

But it's no different from what i386 does now to allocate its initial 
pagetables.  How does this not break now?

    J

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: brk patches..
  2009-02-28  8:17             ` Jeremy Fitzhardinge
@ 2009-02-28 20:40               ` Yinghai Lu
  2009-03-01 23:53                 ` Jeremy Fitzhardinge
  2009-03-01  1:23               ` [PATCH] x86: put initial_pg_tables into bss Yinghai Lu
  1 sibling, 1 reply; 43+ messages in thread
From: Yinghai Lu @ 2009-02-28 20:40 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: Ingo Molnar, H. Peter Anvin, the arch/x86 maintainers,
	Linux Kernel Mailing List, Jeremy Fitzhardinge

On Sat, Feb 28, 2009 at 12:17 AM, Jeremy Fitzhardinge <jeremy@goop.org> wrote:
> Yinghai Lu wrote:
>>
>> Yinghai Lu wrote:
>>
>>>
>>> wonder if boot loader check uncompress size aka vmlinux size in bzImage
>>> before it find one good position for bzImage...?
>>>
>>>
>>
>> Jeremy's brk patches may break:
>> 1. kexec load 64bit vmlinux on some ram that near the memory hole etc.
>> blindly to use ram after _end may have some problem
>> 2. coreboot aka linux is using elf (by mkelfImage : vmlinux + initrd),
>> initrd became one section after _end...
>> could cause initrd get overwrite... by extend _brk
>>
>
> But its no different from what i386 does now to allocate its initial
> pagetables.  How does this not break now?
>

it will try to use the initial page table at first, and if it is not big
enough, it will consult e820 and the other reserved_early areas to
find good positions.

YH

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: does boot loader check uncompressed kernel size?
  2009-02-28  8:08               ` H. Peter Anvin
@ 2009-02-28 20:42                 ` Yinghai Lu
  0 siblings, 0 replies; 43+ messages in thread
From: Yinghai Lu @ 2009-02-28 20:42 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Cyrill Gorcunov, Ingo Molnar, Jeremy Fitzhardinge,
	the arch/x86 maintainers, Linux Kernel Mailing List,
	Jeremy Fitzhardinge

On Sat, Feb 28, 2009 at 12:08 AM, H. Peter Anvin <hpa@zytor.com> wrote:
> Yinghai Lu wrote:
>>
>> On Fri, Feb 27, 2009 at 11:47 PM, Cyrill Gorcunov <gorcunov@gmail.com>
>> wrote:
>>>
>>> [Yinghai Lu - Fri, Feb 27, 2009 at 11:39:06PM -0800]
>>> | wonder if boot loader check uncompress size aka vmlinux size in bzImage
>>> | before it find one good position for bzImage...?
>>> |
>>> | YH
>>> |
>>>
>>> At least for x86-64 in grub-1.96 I didn't find such a check.
>>> Btw, but why should it care? Or you mean something else?
>>
>> thinking about Jeremy's brk patches that may use ram after _end blindly...
>>
>
> We already do that.  Jeremy's brk patches just formalizes it.
>

No, after we extend reserve_early from 64bit to 32bit, we don't use
RAM blindly in early stage.

YH

^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCH] x86: put initial_pg_tables into bss
  2009-02-28  8:17             ` Jeremy Fitzhardinge
  2009-02-28 20:40               ` Yinghai Lu
@ 2009-03-01  1:23               ` Yinghai Lu
  2009-03-01  8:31                 ` [PATCH] x86: put initial_pg_tables into bss -v2 Yinghai Lu
  1 sibling, 1 reply; 43+ messages in thread
From: Yinghai Lu @ 2009-03-01  1:23 UTC (permalink / raw)
  To: Ingo Molnar, H. Peter Anvin, Andrew Morton, Thomas Gleixner
  Cc: Linux Kernel Mailing List, Jeremy Fitzhardinge



Impact: cleanup

Don't use ram after _end blindly for pagetables.
put those pg table into bss

also remove init_pg_tables_start/end tricks all around

Signed-off-by: Yinghai Lu <yinghai@kernel.org>

---
 arch/x86/include/asm/setup.h     |    3 ---
 arch/x86/kernel/head32.c         |    3 ---
 arch/x86/kernel/head_32.S        |   25 +++++++++++--------------
 arch/x86/kernel/setup.c          |    9 ---------
 arch/x86/kernel/vmlinux_32.lds.S |    3 ---
 arch/x86/lguest/boot.c           |    8 --------
 arch/x86/xen/mmu.c               |    4 +---
 7 files changed, 12 insertions(+), 43 deletions(-)

Index: linux-2.6/arch/x86/include/asm/setup.h
===================================================================
--- linux-2.6.orig/arch/x86/include/asm/setup.h
+++ linux-2.6/arch/x86/include/asm/setup.h
@@ -105,9 +105,6 @@ extern struct boot_params boot_params;
 void __init i386_start_kernel(void);
 extern void probe_roms(void);
 
-extern unsigned long init_pg_tables_start;
-extern unsigned long init_pg_tables_end;
-
 #else
 void __init x86_64_start_kernel(char *real_mode);
 void __init x86_64_start_reservations(char *real_mode_data);
Index: linux-2.6/arch/x86/kernel/head32.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/head32.c
+++ linux-2.6/arch/x86/kernel/head32.c
@@ -29,9 +29,6 @@ void __init i386_start_kernel(void)
 		reserve_early(ramdisk_image, ramdisk_end, "RAMDISK");
 	}
 #endif
-	reserve_early(init_pg_tables_start, init_pg_tables_end,
-			"INIT_PG_TABLE");
-
 	reserve_ebda_region();
 
 	/*
Index: linux-2.6/arch/x86/kernel/head_32.S
===================================================================
--- linux-2.6.orig/arch/x86/kernel/head_32.S
+++ linux-2.6/arch/x86/kernel/head_32.S
@@ -73,7 +73,7 @@ PAGE_TABLE_SIZE = (LOW_PAGES / PTRS_PER_
 BOOTBITMAP_SIZE = LOW_PAGES / 8
 ALLOCATOR_SLOP = 4
 
-INIT_MAP_BEYOND_END = BOOTBITMAP_SIZE + (PAGE_TABLE_SIZE + ALLOCATOR_SLOP)*PAGE_SIZE_asm
+INIT_MAP_SIZE = BOOTBITMAP_SIZE + (PAGE_TABLE_SIZE + ALLOCATOR_SLOP)*PAGE_SIZE_asm
 
 /*
  * 32-bit kernel entrypoint; only used by the boot CPU.  On entry,
@@ -166,10 +166,9 @@ num_subarch_entries = (. - subarch_entri
 
 /*
  * Initialize page tables.  This creates a PDE and a set of page
- * tables, which are located immediately beyond _end.  The variable
- * init_pg_tables_end is set up to point to the first "safe" location.
+ * tables, which are located immediately beyond _end. 
  * Mappings are created both at virtual address 0 (identity mapping)
- * and PAGE_OFFSET for up to _end+sizeof(page tables)+INIT_MAP_BEYOND_END.
+ * and PAGE_OFFSET for up to _end
  *
  * Note that the stack is not yet set up!
  */
@@ -191,7 +190,6 @@ default_entry:
 	xorl %ebx,%ebx				/* %ebx is kept at zero */
 
 	movl $pa(pg0), %edi
-	movl %edi, pa(init_pg_tables_start)
 	movl $pa(swapper_pg_pmd), %edx
 	movl $PTE_IDENT_ATTR, %eax
 10:
@@ -209,14 +207,13 @@ default_entry:
 	loop 11b
 
 	/*
-	 * End condition: we must map up to and including INIT_MAP_BEYOND_END
-	 * bytes beyond the end of our own page tables.
+	 * End condition: we must map up to the end.
 	 */
-	leal (INIT_MAP_BEYOND_END+PTE_IDENT_ATTR)(%edi),%ebp
+	movl $pa(_end), %ebp
+	addl PTE_IDENT_ATTR, %ebp
 	cmpl %ebp,%eax
 	jb 10b
 1:
-	movl %edi,pa(init_pg_tables_end)
 	shrl $12, %eax
 	movl %eax, pa(max_pfn_mapped)
 
@@ -228,7 +225,6 @@ default_entry:
 page_pde_offset = (__PAGE_OFFSET >> 20);
 
 	movl $pa(pg0), %edi
-	movl %edi, pa(init_pg_tables_start)
 	movl $pa(swapper_pg_dir), %edx
 	movl $PTE_IDENT_ATTR, %eax
 10:
@@ -242,14 +238,13 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
 	addl $0x1000,%eax
 	loop 11b
 	/*
-	 * End condition: we must map up to and including INIT_MAP_BEYOND_END
-	 * bytes beyond the end of our own page tables; the +0x007 is
+	 * End condition: we must map up to end, the +0x007 is
 	 * the attribute bits
 	 */
-	leal (INIT_MAP_BEYOND_END+PTE_IDENT_ATTR)(%edi),%ebp
+	movl $pa(_end), %ebp
+	addl PTE_IDENT_ATTR, %ebp
 	cmpl %ebp,%eax
 	jb 10b
-	movl %edi,pa(init_pg_tables_end)
 	shrl $12, %eax
 	movl %eax, pa(max_pfn_mapped)
 
@@ -662,6 +657,8 @@ ENTRY(swapper_pg_dir)
 # endif
 	.align PAGE_SIZE_asm		/* needs to be page-sized too */
 #endif
+ENTRY(pg0)
+	.fill INIT_MAP_SIZE,1,0
 
 .data
 ENTRY(stack_start)
Index: linux-2.6/arch/x86/kernel/setup.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/setup.c
+++ linux-2.6/arch/x86/kernel/setup.c
@@ -158,11 +158,6 @@ static struct resource bss_resource = {
 
 
 #ifdef CONFIG_X86_32
-/* This value is set up by the early boot code to point to the value
-   immediately after the boot time page tables.  It contains a *physical*
-   address, and must not be in the .bss segment! */
-unsigned long init_pg_tables_start __initdata = ~0UL;
-unsigned long init_pg_tables_end __initdata = ~0UL;
 
 static struct resource video_ram_resource = {
 	.name	= "Video RAM area",
@@ -715,11 +710,7 @@ void __init setup_arch(char **cmdline_p)
 	init_mm.start_code = (unsigned long) _text;
 	init_mm.end_code = (unsigned long) _etext;
 	init_mm.end_data = (unsigned long) _edata;
-#ifdef CONFIG_X86_32
-	init_mm.brk = init_pg_tables_end + PAGE_OFFSET;
-#else
 	init_mm.brk = (unsigned long) &_end;
-#endif
 
 	code_resource.start = virt_to_phys(_text);
 	code_resource.end = virt_to_phys(_etext)-1;
Index: linux-2.6/arch/x86/kernel/vmlinux_32.lds.S
===================================================================
--- linux-2.6.orig/arch/x86/kernel/vmlinux_32.lds.S
+++ linux-2.6/arch/x86/kernel/vmlinux_32.lds.S
@@ -190,9 +190,6 @@ SECTIONS
 	. = ALIGN(4);
 	__bss_stop = .;
   	_end = . ;
-	/* This is where the kernel creates the early boot page tables */
-	. = ALIGN(PAGE_SIZE);
-	pg0 = . ;
   }
 
   /* Sections to be discarded */
Index: linux-2.6/arch/x86/lguest/boot.c
===================================================================
--- linux-2.6.orig/arch/x86/lguest/boot.c
+++ linux-2.6/arch/x86/lguest/boot.c
@@ -1051,14 +1051,6 @@ __init void lguest_init(void)
 	 * lguest_init() where the rest of the fairly chaotic boot setup
 	 * occurs. */
 
-	/* The native boot code sets up initial page tables immediately after
-	 * the kernel itself, and sets init_pg_tables_end so they're not
-	 * clobbered.  The Launcher places our initial pagetables somewhere at
-	 * the top of our physical memory, so we don't need extra space: set
-	 * init_pg_tables_end to the end of the kernel. */
-	init_pg_tables_start = __pa(pg0);
-	init_pg_tables_end = __pa(pg0);
-
 	/* As described in head_32.S, we map the first 128M of memory. */
 	max_pfn_mapped = (128*1024*1024) >> PAGE_SHIFT;
 
Index: linux-2.6/arch/x86/xen/mmu.c
===================================================================
--- linux-2.6.orig/arch/x86/xen/mmu.c
+++ linux-2.6/arch/x86/xen/mmu.c
@@ -1716,9 +1716,7 @@ __init pgd_t *xen_setup_kernel_pagetable
 {
 	pmd_t *kernel_pmd;
 
-	init_pg_tables_start = __pa(pgd);
-	init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE;
-	max_pfn_mapped = PFN_DOWN(init_pg_tables_end + 512*1024);
+	max_pfn_mapped = PFN_DOWN(__pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE + 512*1024);
 
 	kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
 	memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);

^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCH] x86: put initial_pg_tables into bss -v2
  2009-03-01  1:23               ` [PATCH] x86: put initial_pg_tables into bss Yinghai Lu
@ 2009-03-01  8:31                 ` Yinghai Lu
  2009-03-01  9:20                   ` H. Peter Anvin
  0 siblings, 1 reply; 43+ messages in thread
From: Yinghai Lu @ 2009-03-01  8:31 UTC (permalink / raw)
  To: Ingo Molnar, H. Peter Anvin, Andrew Morton, Thomas Gleixner
  Cc: Linux Kernel Mailing List, Jeremy Fitzhardinge


Impact: cleanup

Don't use ram after _end blindly for pagetables.
put those pg table into bss

also remove init_pg_tables_start/end tricks all around

v2: keep initial page table up to 512M only.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>

---
 arch/x86/include/asm/pgtable_32.h |    3 --
 arch/x86/include/asm/setup.h      |    3 --
 arch/x86/kernel/head32.c          |    3 --
 arch/x86/kernel/head_32.S         |   52 ++++++++++++--------------------------
 arch/x86/kernel/setup.c           |    9 ------
 arch/x86/kernel/vmlinux_32.lds.S  |    3 --
 arch/x86/lguest/boot.c            |    8 -----
 arch/x86/xen/mmu.c                |    4 --
 8 files changed, 18 insertions(+), 67 deletions(-)

Index: linux-2.6/arch/x86/include/asm/setup.h
===================================================================
--- linux-2.6.orig/arch/x86/include/asm/setup.h
+++ linux-2.6/arch/x86/include/asm/setup.h
@@ -105,9 +105,6 @@ extern struct boot_params boot_params;
 void __init i386_start_kernel(void);
 extern void probe_roms(void);
 
-extern unsigned long init_pg_tables_start;
-extern unsigned long init_pg_tables_end;
-
 #else
 void __init x86_64_start_kernel(char *real_mode);
 void __init x86_64_start_reservations(char *real_mode_data);
Index: linux-2.6/arch/x86/kernel/head32.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/head32.c
+++ linux-2.6/arch/x86/kernel/head32.c
@@ -29,9 +29,6 @@ void __init i386_start_kernel(void)
 		reserve_early(ramdisk_image, ramdisk_end, "RAMDISK");
 	}
 #endif
-	reserve_early(init_pg_tables_start, init_pg_tables_end,
-			"INIT_PG_TABLE");
-
 	reserve_ebda_region();
 
 	/*
Index: linux-2.6/arch/x86/kernel/head_32.S
===================================================================
--- linux-2.6.orig/arch/x86/kernel/head_32.S
+++ linux-2.6/arch/x86/kernel/head_32.S
@@ -38,42 +38,27 @@
 #define X86_VENDOR_ID	new_cpu_data+CPUINFO_x86_vendor_id
 
 /*
- * This is how much memory *in addition to the memory covered up to
- * and including _end* we need mapped initially.
- * We need:
- *  - one bit for each possible page, but only in low memory, which means
- *     2^32/4096/8 = 128K worst case (4G/4G split.)
+ * This is how much memory for page table to and including _end
+ * we need mapped initially.
+ * We need for 512M
  *  - enough space to map all low memory, which means
- *     (2^32/4096) / 1024 pages (worst case, non PAE)
- *     (2^32/4096) / 512 + 4 pages (worst case for PAE)
- *  - a few pages for allocator use before the kernel pagetable has
- *     been set up
+ *     (2^29/4096) / 1024 pages (worst case, non PAE)
+ *     (2^29/4096) / 512 + 4 pages (worst case for PAE)
  *
  * Modulo rounding, each megabyte assigned here requires a kilobyte of
  * memory, which is currently unreclaimed.
  *
  * This should be a multiple of a page.
  */
-LOW_PAGES = 1<<(32-PAGE_SHIFT_asm)
-
-/*
- * To preserve the DMA pool in PAGEALLOC kernels, we'll allocate
- * pagetables from above the 16MB DMA limit, so we'll have to set
- * up pagetables 16MB more (worst-case):
- */
-#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK)
-LOW_PAGES = LOW_PAGES + 0x1000000
-#endif
+LOW_PAGES = 1<<(29-PAGE_SHIFT_asm)
 
 #if PTRS_PER_PMD > 1
 PAGE_TABLE_SIZE = (LOW_PAGES / PTRS_PER_PMD) + PTRS_PER_PGD
 #else
 PAGE_TABLE_SIZE = (LOW_PAGES / PTRS_PER_PGD)
 #endif
-BOOTBITMAP_SIZE = LOW_PAGES / 8
-ALLOCATOR_SLOP = 4
 
-INIT_MAP_BEYOND_END = BOOTBITMAP_SIZE + (PAGE_TABLE_SIZE + ALLOCATOR_SLOP)*PAGE_SIZE_asm
+INIT_MAP_SIZE = PAGE_TABLE_SIZE * PAGE_SIZE_asm
 
 /*
  * 32-bit kernel entrypoint; only used by the boot CPU.  On entry,
@@ -166,10 +151,9 @@ num_subarch_entries = (. - subarch_entri
 
 /*
  * Initialize page tables.  This creates a PDE and a set of page
- * tables, which are located immediately beyond _end.  The variable
- * init_pg_tables_end is set up to point to the first "safe" location.
+ * tables, which are located immediately beyond _end. 
  * Mappings are created both at virtual address 0 (identity mapping)
- * and PAGE_OFFSET for up to _end+sizeof(page tables)+INIT_MAP_BEYOND_END.
+ * and PAGE_OFFSET for up to _end
  *
  * Note that the stack is not yet set up!
  */
@@ -191,7 +175,6 @@ default_entry:
 	xorl %ebx,%ebx				/* %ebx is kept at zero */
 
 	movl $pa(pg0), %edi
-	movl %edi, pa(init_pg_tables_start)
 	movl $pa(swapper_pg_pmd), %edx
 	movl $PTE_IDENT_ATTR, %eax
 10:
@@ -209,14 +192,13 @@ default_entry:
 	loop 11b
 
 	/*
-	 * End condition: we must map up to and including INIT_MAP_BEYOND_END
-	 * bytes beyond the end of our own page tables.
+	 * End condition: we must map up to the end.
 	 */
-	leal (INIT_MAP_BEYOND_END+PTE_IDENT_ATTR)(%edi),%ebp
+	movl $pa(_end), %ebp
+	addl PTE_IDENT_ATTR, %ebp
 	cmpl %ebp,%eax
 	jb 10b
 1:
-	movl %edi,pa(init_pg_tables_end)
 	shrl $12, %eax
 	movl %eax, pa(max_pfn_mapped)
 
@@ -228,7 +210,6 @@ default_entry:
 page_pde_offset = (__PAGE_OFFSET >> 20);
 
 	movl $pa(pg0), %edi
-	movl %edi, pa(init_pg_tables_start)
 	movl $pa(swapper_pg_dir), %edx
 	movl $PTE_IDENT_ATTR, %eax
 10:
@@ -242,14 +223,13 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
 	addl $0x1000,%eax
 	loop 11b
 	/*
-	 * End condition: we must map up to and including INIT_MAP_BEYOND_END
-	 * bytes beyond the end of our own page tables; the +0x007 is
+	 * End condition: we must map up to end, the +0x007 is
 	 * the attribute bits
 	 */
-	leal (INIT_MAP_BEYOND_END+PTE_IDENT_ATTR)(%edi),%ebp
+	movl $pa(_end), %ebp
+	addl PTE_IDENT_ATTR, %ebp
 	cmpl %ebp,%eax
 	jb 10b
-	movl %edi,pa(init_pg_tables_end)
 	shrl $12, %eax
 	movl %eax, pa(max_pfn_mapped)
 
@@ -662,6 +642,8 @@ ENTRY(swapper_pg_dir)
 # endif
 	.align PAGE_SIZE_asm		/* needs to be page-sized too */
 #endif
+ENTRY(pg0)
+	.fill INIT_MAP_SIZE,1,0
 
 .data
 ENTRY(stack_start)
Index: linux-2.6/arch/x86/kernel/setup.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/setup.c
+++ linux-2.6/arch/x86/kernel/setup.c
@@ -158,11 +158,6 @@ static struct resource bss_resource = {
 
 
 #ifdef CONFIG_X86_32
-/* This value is set up by the early boot code to point to the value
-   immediately after the boot time page tables.  It contains a *physical*
-   address, and must not be in the .bss segment! */
-unsigned long init_pg_tables_start __initdata = ~0UL;
-unsigned long init_pg_tables_end __initdata = ~0UL;
 
 static struct resource video_ram_resource = {
 	.name	= "Video RAM area",
@@ -715,11 +710,7 @@ void __init setup_arch(char **cmdline_p)
 	init_mm.start_code = (unsigned long) _text;
 	init_mm.end_code = (unsigned long) _etext;
 	init_mm.end_data = (unsigned long) _edata;
-#ifdef CONFIG_X86_32
-	init_mm.brk = init_pg_tables_end + PAGE_OFFSET;
-#else
 	init_mm.brk = (unsigned long) &_end;
-#endif
 
 	code_resource.start = virt_to_phys(_text);
 	code_resource.end = virt_to_phys(_etext)-1;
Index: linux-2.6/arch/x86/kernel/vmlinux_32.lds.S
===================================================================
--- linux-2.6.orig/arch/x86/kernel/vmlinux_32.lds.S
+++ linux-2.6/arch/x86/kernel/vmlinux_32.lds.S
@@ -190,9 +190,6 @@ SECTIONS
 	. = ALIGN(4);
 	__bss_stop = .;
   	_end = . ;
-	/* This is where the kernel creates the early boot page tables */
-	. = ALIGN(PAGE_SIZE);
-	pg0 = . ;
   }
 
   /* Sections to be discarded */
Index: linux-2.6/arch/x86/lguest/boot.c
===================================================================
--- linux-2.6.orig/arch/x86/lguest/boot.c
+++ linux-2.6/arch/x86/lguest/boot.c
@@ -1051,14 +1051,6 @@ __init void lguest_init(void)
 	 * lguest_init() where the rest of the fairly chaotic boot setup
 	 * occurs. */
 
-	/* The native boot code sets up initial page tables immediately after
-	 * the kernel itself, and sets init_pg_tables_end so they're not
-	 * clobbered.  The Launcher places our initial pagetables somewhere at
-	 * the top of our physical memory, so we don't need extra space: set
-	 * init_pg_tables_end to the end of the kernel. */
-	init_pg_tables_start = __pa(pg0);
-	init_pg_tables_end = __pa(pg0);
-
 	/* As described in head_32.S, we map the first 128M of memory. */
 	max_pfn_mapped = (128*1024*1024) >> PAGE_SHIFT;
 
Index: linux-2.6/arch/x86/xen/mmu.c
===================================================================
--- linux-2.6.orig/arch/x86/xen/mmu.c
+++ linux-2.6/arch/x86/xen/mmu.c
@@ -1716,9 +1716,7 @@ __init pgd_t *xen_setup_kernel_pagetable
 {
 	pmd_t *kernel_pmd;
 
-	init_pg_tables_start = __pa(pgd);
-	init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE;
-	max_pfn_mapped = PFN_DOWN(init_pg_tables_end + 512*1024);
+	max_pfn_mapped = PFN_DOWN(__pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE + 512*1024);
 
 	kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
 	memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
Index: linux-2.6/arch/x86/include/asm/pgtable_32.h
===================================================================
--- linux-2.6.orig/arch/x86/include/asm/pgtable_32.h
+++ linux-2.6/arch/x86/include/asm/pgtable_32.h
@@ -42,9 +42,6 @@ extern void set_pmd_pfn(unsigned long, u
  */
 #undef TEST_ACCESS_OK
 
-/* The boot page tables (all created as a single array) */
-extern unsigned long pg0[];
-
 #ifdef CONFIG_X86_PAE
 # include <asm/pgtable-3level.h>
 #else

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH] x86: put initial_pg_tables into bss -v2
  2009-03-01  8:31                 ` [PATCH] x86: put initial_pg_tables into bss -v2 Yinghai Lu
@ 2009-03-01  9:20                   ` H. Peter Anvin
  2009-03-01 17:49                     ` Yinghai Lu
  2009-03-01 18:06                     ` Yinghai Lu
  0 siblings, 2 replies; 43+ messages in thread
From: H. Peter Anvin @ 2009-03-01  9:20 UTC (permalink / raw)
  To: Yinghai Lu
  Cc: Ingo Molnar, Andrew Morton, Thomas Gleixner,
	Linux Kernel Mailing List, Jeremy Fitzhardinge

Yinghai Lu wrote:
> Impact: cleanup
> 
> Don't use ram after _end blindly for pagetables.
> put those pg table into bss
> 
> also remove init_pg_tables_start/end tricks all around
> 
> v2: keep initial page table up to 512M only.
> 

I really, REALLY, don't like this.  This is going right back to the 
situation which we had before the dynamically generated page tables.  We 
now have yet another hardcoded limit, and big chunk of wasted memory in 
case we don't need to allocate it all.

   * Modulo rounding, each megabyte assigned here requires a kilobyte of
   * memory, which is currently unreclaimed.

You're potentially throwing away half a megabyte, which is a major deal 
on a small embedded system!

No, this is garbage.  If you're insisting on getting rid of the brk-like 
allocation patterns, then you have to get an alternative dynamic 
allocator available to the pre-paging code.  Now, there is no reason we 
couldn't execute C code before enabling paging, although the code would 
either have to be PIC or linked at the physical address.

	-hpa

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH] x86: put initial_pg_tables into bss -v2
  2009-03-01  9:20                   ` H. Peter Anvin
@ 2009-03-01 17:49                     ` Yinghai Lu
  2009-03-01 18:06                     ` Yinghai Lu
  1 sibling, 0 replies; 43+ messages in thread
From: Yinghai Lu @ 2009-03-01 17:49 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Ingo Molnar, Andrew Morton, Thomas Gleixner,
	Linux Kernel Mailing List, Jeremy Fitzhardinge

H. Peter Anvin wrote:
> Yinghai Lu wrote:
>> Impact: cleanup
>>
>> Don't use ram after _end blindly for pagetables.
>> put those pg table into bss
>>
>> also remove init_pg_tables_start/end tricks all around
>>
>> v2: keep initial page table up to 512M only.
>>
> 
> I really, REALLY, don't like this.  This is going right back to the
> situation which we had before the dynamically generated page tables.  We
> now have yet another hardcoded limit, and big chunk of wasted memory in
> case we don't need to allocate it all.

that limit is there for 64 too, aka 512M direct mapping. (need 1Mbytes space for page tables)

we can get rid of that limit: with careful link sequence to make sure that code init_memory_mapping and before is some fixed range.

YH

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH] x86: put initial_pg_tables into bss -v2
  2009-03-01  9:20                   ` H. Peter Anvin
  2009-03-01 17:49                     ` Yinghai Lu
@ 2009-03-01 18:06                     ` Yinghai Lu
  2009-03-01 23:29                       ` H. Peter Anvin
  1 sibling, 1 reply; 43+ messages in thread
From: Yinghai Lu @ 2009-03-01 18:06 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Ingo Molnar, Andrew Morton, Thomas Gleixner,
	Linux Kernel Mailing List, Jeremy Fitzhardinge

H. Peter Anvin wrote:
> Yinghai Lu wrote:
>> Impact: cleanup
>>
>> Don't use ram after _end blindly for pagetables.
>> put those pg table into bss
>>
>> also remove init_pg_tables_start/end tricks all around
>>
>> v2: keep initial page table up to 512M only.
...

> 
> No, this is garbage.  If you're insisting on getting rid of the brk-like
> allocation patterns, then you have to get an alternative dynamic
> allocator available to the pre-paging code.  Now, there is no reason we
> couldn't execute C code before enabling paging, although the code would
> either have to be PIC or linked at the physical address.

you can use find_e820_area()/reserve_early() pair to find right position for that.

YH

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH] x86: put initial_pg_tables into bss -v2
  2009-03-01 18:06                     ` Yinghai Lu
@ 2009-03-01 23:29                       ` H. Peter Anvin
  2009-03-02  0:55                         ` Yinghai Lu
  2009-03-09  7:45                         ` [PATCH] x86: put initial_pg_tables into bss -v2 Yinghai Lu
  0 siblings, 2 replies; 43+ messages in thread
From: H. Peter Anvin @ 2009-03-01 23:29 UTC (permalink / raw)
  To: Yinghai Lu
  Cc: Ingo Molnar, Andrew Morton, Thomas Gleixner,
	Linux Kernel Mailing List, Jeremy Fitzhardinge

Yinghai Lu wrote:
> 
>> No, this is garbage.  If you're insisting on getting rid of the brk-like
>> allocation patterns, then you have to get an alternative dynamic
>> allocator available to the pre-paging code.  Now, there is no reason we
>> couldn't execute C code before enabling paging, although the code would
>> either have to be PIC or linked at the physical address.
> 
> you can use find_e820_area()/reserve_early() pair to find right position for that.
> 

This stuff is currently done before paging is enabled, and existing C 
code can't be run as-is.  There are three ways to deal with that:

a) compile some of the code with -fPIC/-fPIE.
b) link some code twice with different offsets.
c) play really ugly games with segments (thus making the virtualization
    guys unhappy.)

Pretty much, these options all suck.  Another option, of course, is to 
generate a fixed amount of page tables just to get us into the C 
environment, generate a new set, *and reclaim the old ones*.  That way 
we're not wasting memory if we're on a small-RAM machine.

It's still really ugly, though.  A much easier and cleaner way would 
seem to be to calculate a far limit on the brk and then marking it as a 
formal (non-alloc) section in the linker script and vmlinux file.  That 
way anything that examines the vmlinux file will see it as an exclusion 
section.  We can (and should) even verify that we don't overflow the brk 
and panic if we do.

	-hpa

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: brk patches..
  2009-02-28 20:40               ` Yinghai Lu
@ 2009-03-01 23:53                 ` Jeremy Fitzhardinge
  2009-03-02  1:02                   ` Yinghai Lu
  0 siblings, 1 reply; 43+ messages in thread
From: Jeremy Fitzhardinge @ 2009-03-01 23:53 UTC (permalink / raw)
  To: Yinghai Lu
  Cc: Ingo Molnar, H. Peter Anvin, the arch/x86 maintainers,
	Linux Kernel Mailing List, Jeremy Fitzhardinge

Yinghai Lu wrote:
>> But its no different from what i386 does now to allocate its initial
>> pagetables.  How does this not break now?
>>
>>     
>
> it will try to use the initial page table at first, and if it is not big
> enough, it will according to e820 and other reserved_early areas to
> find good positions.
>   

head_32.S has no such logic.  It just starts building the kernel 
mappings directly after _end, starting at pg0, and uses as much space as 
it needs.  For a !PSE CPU with a large kernel, that can be quite a lot 
of space.  Only later, when its creating the linear memory mappings, 
does it search around in the e820 tables (which it now has access to) 
for space.

The whole point of the brk segment was to have a way of allocating some 
memory very early, before e820 is even available.  If you really think 
this is dangerous, then we can easily extend the bss in the linker 
script to include the brk memory, and release any leftover when we do 
the normal bootmem freeup.  That would also give us a well-defined upper 
limit on how much brk memory can be allocated; its a bit undefined at 
the moment, as it depends on how much slop there is after the kernel 
mapping.

    J

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH] x86: put initial_pg_tables into bss -v2
  2009-03-01 23:29                       ` H. Peter Anvin
@ 2009-03-02  0:55                         ` Yinghai Lu
  2009-03-09  8:15                           ` [PATCH] x86: put initial_pg_tables into .bss -v4 Yinghai Lu
  2009-03-09  7:45                         ` [PATCH] x86: put initial_pg_tables into bss -v2 Yinghai Lu
  1 sibling, 1 reply; 43+ messages in thread
From: Yinghai Lu @ 2009-03-02  0:55 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Ingo Molnar, Andrew Morton, Thomas Gleixner,
	Linux Kernel Mailing List, Jeremy Fitzhardinge

H. Peter Anvin wrote:
> Yinghai Lu wrote:
>>
>>> No, this is garbage.  If you're insisting on getting rid of the brk-like
>>> allocation patterns, then you have to get an alternative dynamic
>>> allocator available to the pre-paging code.  Now, there is no reason we
>>> couldn't execute C code before enabling paging, although the code would
>>> either have to be PIC or linked at the physical address.
>>
>> you can use find_e820_area()/reserve_early() pair to find right
>> position for that.
>>
> 
> This stuff is currently done before paging is enabled, and existing C
> code can't be run as-is.  There are three ways to deal with that:
> 
> a) compile some of the code with -fPIC/-fPIE.
> b) link some code twice with different offsets.
> c) play really ugly games with segments (thus making the virtualization
>    guys unhappy.)
> 
> Pretty much, these options all suck.  Another option, of course, is to
> generate a fixed amount of page tables just to get us into the C
> environment, generate a new set, *and reclaim the old ones*.  That way
> we're not wasting memory if we're on a small-RAM machine.
> 
> It's still really ugly, though.  A much easier and cleaner way would
> seem to be to calculate a far limit on the brk and then marking it as a
> formal (non-alloc) section in the linker script and vmlinux file.  That
> way anything that examines the vmlinux file will see it as an exclusion
> section.  We can (and should) even verify that we don't overflow the brk
> and panic if we do.
> 
please check

[PATCH] x86: put initial_pg_tables into .data -v3

Impact: cleanup

Don't use ram after _end blindly for pagetables.
put those pg table into .data

also remove init_pg_tables_start/end tricks all around

v2: keep initial page table up to 512M only.
v3: actually it is in .data.page_aligned
    add KERNEL_IMAGE_SIZE for 32bit, so it can be set to some value smaller than 512M
	when installed RAM is smaller than 512M
    initial_pgtable will cover up to KERNEL_IMAGE_SIZE to avoid wasting.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>

---
 arch/x86/Kconfig                     |    9 +++++
 arch/x86/include/asm/page_32_types.h |    9 +++++
 arch/x86/include/asm/pgtable_32.h    |    3 -
 arch/x86/include/asm/setup.h         |    3 -
 arch/x86/kernel/head32.c             |    3 -
 arch/x86/kernel/head_32.S            |   53 ++++++++++++-----------------------
 arch/x86/kernel/setup.c              |    9 -----
 arch/x86/kernel/vmlinux_32.lds.S     |    9 +++--
 arch/x86/lguest/boot.c               |    8 -----
 arch/x86/xen/mmu.c                   |    4 --
 10 files changed, 44 insertions(+), 66 deletions(-)

Index: linux-2.6/arch/x86/include/asm/setup.h
===================================================================
--- linux-2.6.orig/arch/x86/include/asm/setup.h
+++ linux-2.6/arch/x86/include/asm/setup.h
@@ -105,9 +105,6 @@ extern struct boot_params boot_params;
 void __init i386_start_kernel(void);
 extern void probe_roms(void);
 
-extern unsigned long init_pg_tables_start;
-extern unsigned long init_pg_tables_end;
-
 #else
 void __init x86_64_start_kernel(char *real_mode);
 void __init x86_64_start_reservations(char *real_mode_data);
Index: linux-2.6/arch/x86/kernel/head32.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/head32.c
+++ linux-2.6/arch/x86/kernel/head32.c
@@ -29,9 +29,6 @@ void __init i386_start_kernel(void)
 		reserve_early(ramdisk_image, ramdisk_end, "RAMDISK");
 	}
 #endif
-	reserve_early(init_pg_tables_start, init_pg_tables_end,
-			"INIT_PG_TABLE");
-
 	reserve_ebda_region();
 
 	/*
Index: linux-2.6/arch/x86/kernel/head_32.S
===================================================================
--- linux-2.6.orig/arch/x86/kernel/head_32.S
+++ linux-2.6/arch/x86/kernel/head_32.S
@@ -38,42 +38,30 @@
 #define X86_VENDOR_ID	new_cpu_data+CPUINFO_x86_vendor_id
 
 /*
- * This is how much memory *in addition to the memory covered up to
- * and including _end* we need mapped initially.
- * We need:
- *  - one bit for each possible page, but only in low memory, which means
- *     2^32/4096/8 = 128K worst case (4G/4G split.)
+ * This is how much memory for page table to and including _end
+ * we need mapped initially.
  *  - enough space to map all low memory, which means
- *     (2^32/4096) / 1024 pages (worst case, non PAE)
- *     (2^32/4096) / 512 + 4 pages (worst case for PAE)
- *  - a few pages for allocator use before the kernel pagetable has
- *     been set up
+ *     (KERNEL_IMAGE_SIZE/4096) / 1024 pages (worst case, non PAE)
+ *     (KERNEL_IMAGE_SIZE/4096) / 512 + 4 pages (worst case for PAE)
  *
  * Modulo rounding, each megabyte assigned here requires a kilobyte of
  * memory, which is currently unreclaimed.
  *
  * This should be a multiple of a page.
+ *
+ * KERNEL_IMAGE_SIZE should be greater than pa(_end)
+ * and small than max_low_pfn, otherwise will waste some page table entries
  */
-LOW_PAGES = 1<<(32-PAGE_SHIFT_asm)
-
-/*
- * To preserve the DMA pool in PAGEALLOC kernels, we'll allocate
- * pagetables from above the 16MB DMA limit, so we'll have to set
- * up pagetables 16MB more (worst-case):
- */
-#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK)
-LOW_PAGES = LOW_PAGES + 0x1000000
-#endif
+LOW_PAGES = (KERNEL_IMAGE_SIZE + PAGE_SIZE_asm - 1)>>PAGE_SHIFT
 
 #if PTRS_PER_PMD > 1
 PAGE_TABLE_SIZE = (LOW_PAGES / PTRS_PER_PMD) + PTRS_PER_PGD
 #else
 PAGE_TABLE_SIZE = (LOW_PAGES / PTRS_PER_PGD)
 #endif
-BOOTBITMAP_SIZE = LOW_PAGES / 8
 ALLOCATOR_SLOP = 4
 
-INIT_MAP_BEYOND_END = BOOTBITMAP_SIZE + (PAGE_TABLE_SIZE + ALLOCATOR_SLOP)*PAGE_SIZE_asm
+INIT_MAP_SIZE = (PAGE_TABLE_SIZE + ALLOCATOR_SLOP) * PAGE_SIZE_asm
 
 /*
  * 32-bit kernel entrypoint; only used by the boot CPU.  On entry,
@@ -166,10 +154,9 @@ num_subarch_entries = (. - subarch_entri
 
 /*
  * Initialize page tables.  This creates a PDE and a set of page
- * tables, which are located immediately beyond _end.  The variable
- * init_pg_tables_end is set up to point to the first "safe" location.
+ * tables, which are located immediately beyond _end. 
  * Mappings are created both at virtual address 0 (identity mapping)
- * and PAGE_OFFSET for up to _end+sizeof(page tables)+INIT_MAP_BEYOND_END.
+ * and PAGE_OFFSET for up to _end
  *
  * Note that the stack is not yet set up!
  */
@@ -191,7 +178,6 @@ default_entry:
 	xorl %ebx,%ebx				/* %ebx is kept at zero */
 
 	movl $pa(pg0), %edi
-	movl %edi, pa(init_pg_tables_start)
 	movl $pa(swapper_pg_pmd), %edx
 	movl $PTE_IDENT_ATTR, %eax
 10:
@@ -209,14 +195,13 @@ default_entry:
 	loop 11b
 
 	/*
-	 * End condition: we must map up to and including INIT_MAP_BEYOND_END
-	 * bytes beyond the end of our own page tables.
+	 * End condition: we must map up to the end.
 	 */
-	leal (INIT_MAP_BEYOND_END+PTE_IDENT_ATTR)(%edi),%ebp
+	movl $KERNEL_IMAGE_SIZE, %ebp
+	addl PTE_IDENT_ATTR, %ebp
 	cmpl %ebp,%eax
 	jb 10b
 1:
-	movl %edi,pa(init_pg_tables_end)
 	shrl $12, %eax
 	movl %eax, pa(max_pfn_mapped)
 
@@ -228,7 +213,6 @@ default_entry:
 page_pde_offset = (__PAGE_OFFSET >> 20);
 
 	movl $pa(pg0), %edi
-	movl %edi, pa(init_pg_tables_start)
 	movl $pa(swapper_pg_dir), %edx
 	movl $PTE_IDENT_ATTR, %eax
 10:
@@ -242,14 +226,13 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
 	addl $0x1000,%eax
 	loop 11b
 	/*
-	 * End condition: we must map up to and including INIT_MAP_BEYOND_END
-	 * bytes beyond the end of our own page tables; the +0x007 is
+	 * End condition: we must map up to end, the +0x007 is
 	 * the attribute bits
 	 */
-	leal (INIT_MAP_BEYOND_END+PTE_IDENT_ATTR)(%edi),%ebp
+	movl $KERNEL_IMAGE_SIZE, %ebp
+	addl PTE_IDENT_ATTR, %ebp
 	cmpl %ebp,%eax
 	jb 10b
-	movl %edi,pa(init_pg_tables_end)
 	shrl $12, %eax
 	movl %eax, pa(max_pfn_mapped)
 
@@ -662,6 +645,8 @@ ENTRY(swapper_pg_dir)
 # endif
 	.align PAGE_SIZE_asm		/* needs to be page-sized too */
 #endif
+ENTRY(pg0)
+	.fill INIT_MAP_SIZE,1,0
 
 .data
 ENTRY(stack_start)
Index: linux-2.6/arch/x86/kernel/setup.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/setup.c
+++ linux-2.6/arch/x86/kernel/setup.c
@@ -158,11 +158,6 @@ static struct resource bss_resource = {
 
 
 #ifdef CONFIG_X86_32
-/* This value is set up by the early boot code to point to the value
-   immediately after the boot time page tables.  It contains a *physical*
-   address, and must not be in the .bss segment! */
-unsigned long init_pg_tables_start __initdata = ~0UL;
-unsigned long init_pg_tables_end __initdata = ~0UL;
 
 static struct resource video_ram_resource = {
 	.name	= "Video RAM area",
@@ -715,11 +710,7 @@ void __init setup_arch(char **cmdline_p)
 	init_mm.start_code = (unsigned long) _text;
 	init_mm.end_code = (unsigned long) _etext;
 	init_mm.end_data = (unsigned long) _edata;
-#ifdef CONFIG_X86_32
-	init_mm.brk = init_pg_tables_end + PAGE_OFFSET;
-#else
 	init_mm.brk = (unsigned long) &_end;
-#endif
 
 	code_resource.start = virt_to_phys(_text);
 	code_resource.end = virt_to_phys(_etext)-1;
Index: linux-2.6/arch/x86/kernel/vmlinux_32.lds.S
===================================================================
--- linux-2.6.orig/arch/x86/kernel/vmlinux_32.lds.S
+++ linux-2.6/arch/x86/kernel/vmlinux_32.lds.S
@@ -190,9 +190,6 @@ SECTIONS
 	. = ALIGN(4);
 	__bss_stop = .;
   	_end = . ;
-	/* This is where the kernel creates the early boot page tables */
-	. = ALIGN(PAGE_SIZE);
-	pg0 = . ;
   }
 
   /* Sections to be discarded */
@@ -205,6 +202,12 @@ SECTIONS
   DWARF_DEBUG
 }
 
+/*
+ * Build-time check on the image size:
+ */
+ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE),
+	"kernel image bigger than KERNEL_IMAGE_SIZE")
+
 #ifdef CONFIG_KEXEC
 /* Link time checks */
 #include <asm/kexec.h>
Index: linux-2.6/arch/x86/lguest/boot.c
===================================================================
--- linux-2.6.orig/arch/x86/lguest/boot.c
+++ linux-2.6/arch/x86/lguest/boot.c
@@ -1051,14 +1051,6 @@ __init void lguest_init(void)
 	 * lguest_init() where the rest of the fairly chaotic boot setup
 	 * occurs. */
 
-	/* The native boot code sets up initial page tables immediately after
-	 * the kernel itself, and sets init_pg_tables_end so they're not
-	 * clobbered.  The Launcher places our initial pagetables somewhere at
-	 * the top of our physical memory, so we don't need extra space: set
-	 * init_pg_tables_end to the end of the kernel. */
-	init_pg_tables_start = __pa(pg0);
-	init_pg_tables_end = __pa(pg0);
-
 	/* As described in head_32.S, we map the first 128M of memory. */
 	max_pfn_mapped = (128*1024*1024) >> PAGE_SHIFT;
 
Index: linux-2.6/arch/x86/xen/mmu.c
===================================================================
--- linux-2.6.orig/arch/x86/xen/mmu.c
+++ linux-2.6/arch/x86/xen/mmu.c
@@ -1716,9 +1716,7 @@ __init pgd_t *xen_setup_kernel_pagetable
 {
 	pmd_t *kernel_pmd;
 
-	init_pg_tables_start = __pa(pgd);
-	init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE;
-	max_pfn_mapped = PFN_DOWN(init_pg_tables_end + 512*1024);
+	max_pfn_mapped = PFN_DOWN(__pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE + 512*1024);
 
 	kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
 	memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
Index: linux-2.6/arch/x86/include/asm/pgtable_32.h
===================================================================
--- linux-2.6.orig/arch/x86/include/asm/pgtable_32.h
+++ linux-2.6/arch/x86/include/asm/pgtable_32.h
@@ -42,9 +42,6 @@ extern void set_pmd_pfn(unsigned long, u
  */
 #undef TEST_ACCESS_OK
 
-/* The boot page tables (all created as a single array) */
-extern unsigned long pg0[];
-
 #ifdef CONFIG_X86_PAE
 # include <asm/pgtable-3level.h>
 #else
Index: linux-2.6/arch/x86/include/asm/page_32_types.h
===================================================================
--- linux-2.6.orig/arch/x86/include/asm/page_32_types.h
+++ linux-2.6/arch/x86/include/asm/page_32_types.h
@@ -39,6 +39,15 @@
 #define __VIRTUAL_MASK_SHIFT	32
 #endif	/* CONFIG_X86_PAE */
 
+/*
+ * Kernel image size is limited to 512 MB (see in arch/x86/kernel/head_32.S)
+ */
+#ifndef CONFIG_VMLINUX_RAM_SIZE
+# define KERNEL_IMAGE_SIZE	(512 * 1024 * 1024)
+#else
+# define KERNEL_IMAGE_SIZE	CONFIG_VMLINUX_RAM_SIZE
+#endif
+
 #ifndef __ASSEMBLY__
 
 /*
Index: linux-2.6/arch/x86/Kconfig
===================================================================
--- linux-2.6.orig/arch/x86/Kconfig
+++ linux-2.6/arch/x86/Kconfig
@@ -1030,6 +1030,15 @@ config PAGE_OFFSET
 	default 0xC0000000
 	depends on X86_32
 
+config VMLINUX_RAM_SIZE
+	hex "Initial ram size directly mapped"
+	range 0x400000 0x20000000
+	default 0x1000000
+	depends on X86_32 && EMBEDDED
+	---help---
+	  Select ram size that initial page table will cover. for system less 512M ram installed.
+	  the value should be greater than vmlinux and less than 512M
+
 config HIGHMEM
 	def_bool y
 	depends on X86_32 && (HIGHMEM64G || HIGHMEM4G)

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: brk patches..
  2009-03-01 23:53                 ` Jeremy Fitzhardinge
@ 2009-03-02  1:02                   ` Yinghai Lu
  2009-03-02  1:07                     ` H. Peter Anvin
  0 siblings, 1 reply; 43+ messages in thread
From: Yinghai Lu @ 2009-03-02  1:02 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: Ingo Molnar, H. Peter Anvin, the arch/x86 maintainers,
	Linux Kernel Mailing List, Jeremy Fitzhardinge

Jeremy Fitzhardinge wrote:
> Yinghai Lu wrote:
>>> But its no different from what i386 does now to allocate its initial
>>> pagetables.  How does this not break now?
>>>
>>>     
>>
>> it will try to use the initial page table at first, and if it is not big
>> enough, it will according to e820 and other reserved_early areas to
>> find good positions.
>>   
> 
> head_32.S has no such logic.  It just starts building the kernel
> mappings directly after _end, starting at pg0, and uses as much space as
> it needs.  For a !PSE CPU with a large kernel, that can be quite a lot
> of space.  Only later, when its creating the linear memory mappings,
> does it search around in the e820 tables (which it now has access to)
> for space.

yes, we should have some way to make initial_pgtable only cover up to _end
and put that initial_pgtable before _end.

> 
> The whole point of the brk segment was to have a way of allocating some
> memory very early, before e820 is even available.  If you really think
> this is dangerous, then we can easily extend the bss in the linker
> script to include the brk memory, and release any leftover when we do
> the normal bootmem freeup.  That would also give us a well-defined upper
> limit on how much brk memory can be allocated; its a bit undefined at
> the moment, as it depends on how much slop there is after the kernel
> mapping.

hope later boot loader could check vmlinux size in bzImage (according to the uncompressed size)
and find good position in RAM for bzImage.

we should find good position for brk with find_e820_area().

YH

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: brk patches..
  2009-03-02  1:02                   ` Yinghai Lu
@ 2009-03-02  1:07                     ` H. Peter Anvin
  2009-03-02  1:16                       ` Jeremy Fitzhardinge
  0 siblings, 1 reply; 43+ messages in thread
From: H. Peter Anvin @ 2009-03-02  1:07 UTC (permalink / raw)
  To: Yinghai Lu
  Cc: Jeremy Fitzhardinge, Ingo Molnar, the arch/x86 maintainers,
	Linux Kernel Mailing List, Jeremy Fitzhardinge

Yinghai Lu wrote:
> 
> hope later boot loader could check vmlinux size in bzImage (according uncompressed size)
> and find good position in RAM for bzImage.
> 
> we should find good position for brk with find_e820_area().
> 

If the brk is bounded-size on the same order or smaller as the kernel,
we should just mark it as an unallocated (bss) section in the ELF image
and be done with it... there really is no point in trying to be smarter
(we'd be subject to failures to load the kernel proper.)  If the brk is
significantly bigger, then yes, we need to be smarter.  However, that is
not my current understanding of the requirements.

	-hpa

-- 
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel.  I don't speak on their behalf.


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: brk patches..
  2009-03-02  1:07                     ` H. Peter Anvin
@ 2009-03-02  1:16                       ` Jeremy Fitzhardinge
  2009-03-02  1:36                         ` H. Peter Anvin
  0 siblings, 1 reply; 43+ messages in thread
From: Jeremy Fitzhardinge @ 2009-03-02  1:16 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Yinghai Lu, Ingo Molnar, the arch/x86 maintainers,
	Linux Kernel Mailing List, Jeremy Fitzhardinge

H. Peter Anvin wrote:
> If the brk is bounded-size on the same order or smaller as the kernel,
> we should just mark it as an unallocated (bss) section in the ELF image
> and be done with it... there really is no point in trying to be smarter
> (we'd be subject to failures to load the kernel proper.)  If the brk is
> significantly bigger, then yes, we need to be smarter.  However, that is
> not my current understanding of the requirements.
>   

Yes, right.  And in my case I actually need it to generate an 
appropriate e820 table, so adding a dependency on e820 would be circular...

(To be specific: I reshape the guest e820 table so that it doesn't have 
memory in any forbidden areas of the host e820 table.  That may require 
moving the pseudo-physical address of pages into a new overflow e820 
entry, which would also require allocating pages for the p2m radix tree.)

    J

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: brk patches..
  2009-03-02  1:16                       ` Jeremy Fitzhardinge
@ 2009-03-02  1:36                         ` H. Peter Anvin
  2009-03-02  1:54                           ` Jeremy Fitzhardinge
  0 siblings, 1 reply; 43+ messages in thread
From: H. Peter Anvin @ 2009-03-02  1:36 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: Yinghai Lu, Ingo Molnar, the arch/x86 maintainers,
	Linux Kernel Mailing List, Jeremy Fitzhardinge

Jeremy Fitzhardinge wrote:
> 
> (To be specific: I reshape the guest e820 table so that it doesn't have
> memory in any forbidden areas of the host e820 table.  That may require
> moving the pseudo-physical address of pages into a new overflow e820
> entry, which would also require allocating pages for the p2m radix tree.)
> 

Isn't that the domain builder's job?

	-hpa

-- 
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel.  I don't speak on their behalf.


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: brk patches..
  2009-03-02  1:36                         ` H. Peter Anvin
@ 2009-03-02  1:54                           ` Jeremy Fitzhardinge
  2009-03-02  2:12                             ` Yinghai Lu
  0 siblings, 1 reply; 43+ messages in thread
From: Jeremy Fitzhardinge @ 2009-03-02  1:54 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Yinghai Lu, Ingo Molnar, the arch/x86 maintainers,
	Linux Kernel Mailing List, Jeremy Fitzhardinge

H. Peter Anvin wrote:
> Jeremy Fitzhardinge wrote:
>   
>> (To be specific: I reshape the guest e820 table so that it doesn't have
>> memory in any forbidden areas of the host e820 table.  That may require
>> moving the pseudo-physical address of pages into a new overflow e820
>> entry, which would also require allocating pages for the p2m radix tree.)
>>
>>     
>
> Isn't that the domain builder's job?
>   

No.  It just provides a flat memory map from 0-max_pfn.  I want to avoid 
the overlap between pseudo-phys and machine memory so that the resource 
manager doesn't get confused by overlapping ranges, and so that it 
places the pci window in the same place as native.

    J

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: brk patches..
  2009-03-02  1:54                           ` Jeremy Fitzhardinge
@ 2009-03-02  2:12                             ` Yinghai Lu
  0 siblings, 0 replies; 43+ messages in thread
From: Yinghai Lu @ 2009-03-02  2:12 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: H. Peter Anvin, Ingo Molnar, the arch/x86 maintainers,
	Linux Kernel Mailing List, Jeremy Fitzhardinge

Jeremy Fitzhardinge wrote:
> H. Peter Anvin wrote:
>> Jeremy Fitzhardinge wrote:
>>  
>>> (To be specific: I reshape the guest e820 table so that it doesn't have
>>> memory in any forbidden areas of the host e820 table.  That may require
>>> moving the pseudo-physical address of pages into a new overflow e820
>>> entry, which would also require allocating pages for the p2m radix
>>> tree.)
>>>
>>>     
>>
>> Isn't that the domain builder's job?
>>   
> 
> No.  It just provides a flat memory map from 0-max_pfn.  I want to avoid
> the overlap between pseudo-phys and machine memory so that the resource
> manager doesn't get confused by overlapping ranges, and so that it
> places the pci window in the same place as native.

where will those mem allocator needed?

if it is used after finish_e820_parsing(), you could use find_e820_area()

YH

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH] x86: put initial_pg_tables into bss -v2
  2009-03-01 23:29                       ` H. Peter Anvin
  2009-03-02  0:55                         ` Yinghai Lu
@ 2009-03-09  7:45                         ` Yinghai Lu
  1 sibling, 0 replies; 43+ messages in thread
From: Yinghai Lu @ 2009-03-09  7:45 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Ingo Molnar, Andrew Morton, Thomas Gleixner,
	Linux Kernel Mailing List, Jeremy Fitzhardinge

[-- Attachment #1: Type: text/plain, Size: 834 bytes --]

On Sun, Mar 1, 2009 at 4:29 PM, H. Peter Anvin <hpa@zytor.com> wrote:
> It's still really ugly, though.  A much easier and cleaner way would seem to
> be to calculate a far limit on the brk and then marking it as a formal
> (non-alloc) section in the linker script and vmlinux file.  That way
> anything that examines the vmlinux file will see it as an exclusion section.
>  We can (and should) even verify that we don't overflow the brk and panic if
> we do.

please check attached patch.

[PATCH] x86: put initial_pg_tables into .bss -v4

Impact: cleanup

Don't use ram after _end blindly for pagetables. aka init pages is before _end
put those pg table into .bss

v2: keep initial page table up to 512M only.
v4: put initial page tables just before _end

Signed-off-by: Yinghai Lu <yinghai@kernel.org>


YH

[-- Attachment #2: initial_pg_tables_4.patch --]
[-- Type: text/x-diff, Size: 6433 bytes --]

[PATCH] x86: put initial_pg_tables into .bss -v4

Impact: cleanup

Don't use ram after _end blindly for pagetables. aka init pages is before _end
put those pg table into .bss

v2: keep initial page table up to 512M only.
v4: put initial page tables just before _end

Signed-off-by: Yinghai Lu <yinghai@kernel.org>

---
 arch/x86/include/asm/page_32_types.h |    5 +++
 arch/x86/kernel/head32.c             |    3 +
 arch/x86/kernel/head_32.S            |   55 ++++++++++++++---------------------
 arch/x86/kernel/vmlinux_32.lds.S     |   11 ++++++-
 4 files changed, 40 insertions(+), 34 deletions(-)

Index: linux-2.6/arch/x86/kernel/head32.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/head32.c
+++ linux-2.6/arch/x86/kernel/head32.c
@@ -18,7 +18,8 @@ void __init i386_start_kernel(void)
 {
 	reserve_trampoline_memory();
 
-	reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS");
+	reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop),
+			 "TEXT DATA BSS");
 
 #ifdef CONFIG_BLK_DEV_INITRD
 	/* Reserve INITRD */
Index: linux-2.6/arch/x86/kernel/head_32.S
===================================================================
--- linux-2.6.orig/arch/x86/kernel/head_32.S
+++ linux-2.6/arch/x86/kernel/head_32.S
@@ -38,42 +38,30 @@
 #define X86_VENDOR_ID	new_cpu_data+CPUINFO_x86_vendor_id
 
 /*
- * This is how much memory *in addition to the memory covered up to
- * and including _end* we need mapped initially.
- * We need:
- *  - one bit for each possible page, but only in low memory, which means
- *     2^32/4096/8 = 128K worst case (4G/4G split.)
+ * This is how much memory for page table to and including _end
+ * we need mapped initially.
  *  - enough space to map all low memory, which means
- *     (2^32/4096) / 1024 pages (worst case, non PAE)
- *     (2^32/4096) / 512 + 4 pages (worst case for PAE)
- *  - a few pages for allocator use before the kernel pagetable has
- *     been set up
+ *     (KERNEL_IMAGE_SIZE/4096) / 1024 pages (worst case, non PAE)
+ *     (KERNEL_IMAGE_SIZE/4096) / 512 + 4 pages (worst case for PAE)
  *
  * Modulo rounding, each megabyte assigned here requires a kilobyte of
  * memory, which is currently unreclaimed.
  *
  * This should be a multiple of a page.
+ *
+ * KERNEL_IMAGE_SIZE should be greater than pa(_end)
+ * and small than max_low_pfn, otherwise will waste some page table entries
  */
-LOW_PAGES = 1<<(32-PAGE_SHIFT_asm)
-
-/*
- * To preserve the DMA pool in PAGEALLOC kernels, we'll allocate
- * pagetables from above the 16MB DMA limit, so we'll have to set
- * up pagetables 16MB more (worst-case):
- */
-#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK)
-LOW_PAGES = LOW_PAGES + 0x1000000
-#endif
+LOW_PAGES = (KERNEL_IMAGE_SIZE + PAGE_SIZE_asm - 1)>>PAGE_SHIFT
 
 #if PTRS_PER_PMD > 1
 PAGE_TABLE_SIZE = (LOW_PAGES / PTRS_PER_PMD) + PTRS_PER_PGD
 #else
 PAGE_TABLE_SIZE = (LOW_PAGES / PTRS_PER_PGD)
 #endif
-BOOTBITMAP_SIZE = LOW_PAGES / 8
 ALLOCATOR_SLOP = 4
 
-INIT_MAP_BEYOND_END = BOOTBITMAP_SIZE + (PAGE_TABLE_SIZE + ALLOCATOR_SLOP)*PAGE_SIZE_asm
+INIT_MAP_SIZE = (PAGE_TABLE_SIZE + ALLOCATOR_SLOP) * PAGE_SIZE_asm
 
 /*
  * 32-bit kernel entrypoint; only used by the boot CPU.  On entry,
@@ -166,10 +154,9 @@ num_subarch_entries = (. - subarch_entri
 
 /*
  * Initialize page tables.  This creates a PDE and a set of page
- * tables, which are located immediately beyond _end.  The variable
- * init_pg_tables_end is set up to point to the first "safe" location.
+ * tables, which are located immediately beyond _end.
  * Mappings are created both at virtual address 0 (identity mapping)
- * and PAGE_OFFSET for up to _end+sizeof(page tables)+INIT_MAP_BEYOND_END.
+ * and PAGE_OFFSET for up to _end
  *
  * Note that the stack is not yet set up!
  */
@@ -209,14 +196,14 @@ default_entry:
 	loop 11b
 
 	/*
-	 * End condition: we must map up to and including INIT_MAP_BEYOND_END
-	 * bytes beyond the end of our own page tables.
+	 * End condition: we must map up to the end.
 	 */
-	leal (INIT_MAP_BEYOND_END+PTE_IDENT_ATTR)(%edi),%ebp
+	movl $pa(_end), %ebp
+	addl PTE_IDENT_ATTR, %ebp
 	cmpl %ebp,%eax
 	jb 10b
 1:
-	movl %edi,pa(init_pg_tables_end)
+	movl %edi, pa(init_pg_tables_end)
 	shrl $12, %eax
 	movl %eax, pa(max_pfn_mapped)
 
@@ -242,14 +229,14 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
 	addl $0x1000,%eax
 	loop 11b
 	/*
-	 * End condition: we must map up to and including INIT_MAP_BEYOND_END
-	 * bytes beyond the end of our own page tables; the +0x007 is
+	 * End condition: we must map up to end, the +0x007 is
 	 * the attribute bits
 	 */
-	leal (INIT_MAP_BEYOND_END+PTE_IDENT_ATTR)(%edi),%ebp
+	movl $pa(_end), %ebp
+	addl PTE_IDENT_ATTR, %ebp
 	cmpl %ebp,%eax
 	jb 10b
-	movl %edi,pa(init_pg_tables_end)
+	movl %edi, pa(init_pg_tables_end)
 	shrl $12, %eax
 	movl %eax, pa(max_pfn_mapped)
 
@@ -636,6 +623,10 @@ swapper_pg_fixmap:
 	.fill 1024,4,0
 ENTRY(empty_zero_page)
 	.fill 4096,1,0
+
+.section ".bss.extra_page_aligned","wa"
+	.align PAGE_SIZE_asm
+	.fill INIT_MAP_SIZE,1,0
 /*
  * This starts the data section.
  */
Index: linux-2.6/arch/x86/kernel/vmlinux_32.lds.S
===================================================================
--- linux-2.6.orig/arch/x86/kernel/vmlinux_32.lds.S
+++ linux-2.6/arch/x86/kernel/vmlinux_32.lds.S
@@ -189,10 +189,13 @@ SECTIONS
 	*(.bss)
 	. = ALIGN(4);
 	__bss_stop = .;
-  	_end = . ;
+	/* extra_page_aligned must be last one before end*/
 	/* This is where the kernel creates the early boot page tables */
 	. = ALIGN(PAGE_SIZE);
 	pg0 = . ;
+	*(.bss.extra_page_aligned)
+	. = ALIGN(8);
+	_end = . ;
   }
 
   /* Sections to be discarded */
@@ -205,6 +208,12 @@ SECTIONS
   DWARF_DEBUG
 }
 
+/*
+ * Build-time check on the image size:
+ */
+ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE),
+	"kernel image bigger than KERNEL_IMAGE_SIZE")
+
 #ifdef CONFIG_KEXEC
 /* Link time checks */
 #include <asm/kexec.h>
Index: linux-2.6/arch/x86/include/asm/page_32_types.h
===================================================================
--- linux-2.6.orig/arch/x86/include/asm/page_32_types.h
+++ linux-2.6/arch/x86/include/asm/page_32_types.h
@@ -39,6 +39,11 @@
 #define __VIRTUAL_MASK_SHIFT	32
 #endif	/* CONFIG_X86_PAE */
 
+/*
+ * Kernel image size is limited to 512 MB (see in arch/x86/kernel/head_32.S)
+ */
+#define KERNEL_IMAGE_SIZE	(512 * 1024 * 1024)
+
 #ifndef __ASSEMBLY__
 
 /*

^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCH] x86: put initial_pg_tables into .bss -v4
  2009-03-02  0:55                         ` Yinghai Lu
@ 2009-03-09  8:15                           ` Yinghai Lu
  2009-03-09 15:41                             ` H. Peter Anvin
  0 siblings, 1 reply; 43+ messages in thread
From: Yinghai Lu @ 2009-03-09  8:15 UTC (permalink / raw)
  To: H. Peter Anvin, Ingo Molnar, Andrew Morton, Thomas Gleixner,
	Jeremy Fitzhardinge
  Cc: Linux Kernel Mailing List


Impact: cleanup

Don't blindly use RAM after _end for page tables; in other words, the initial
page tables now live before _end — put those page tables into .bss.

v2: keep initial page table up to 512M only.
v4: put initial page tables just before _end

Signed-off-by: Yinghai Lu <yinghai@kernel.org>

---
 arch/x86/include/asm/page_32_types.h |    5 +++
 arch/x86/kernel/head32.c             |    3 +
 arch/x86/kernel/head_32.S            |   55 ++++++++++++++---------------------
 arch/x86/kernel/vmlinux_32.lds.S     |   11 ++++++-
 4 files changed, 40 insertions(+), 34 deletions(-)

Index: linux-2.6/arch/x86/kernel/head32.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/head32.c
+++ linux-2.6/arch/x86/kernel/head32.c
@@ -18,7 +18,8 @@ void __init i386_start_kernel(void)
 {
 	reserve_trampoline_memory();
 
-	reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS");
+	reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop),
+			 "TEXT DATA BSS");
 
 #ifdef CONFIG_BLK_DEV_INITRD
 	/* Reserve INITRD */
Index: linux-2.6/arch/x86/kernel/head_32.S
===================================================================
--- linux-2.6.orig/arch/x86/kernel/head_32.S
+++ linux-2.6/arch/x86/kernel/head_32.S
@@ -38,42 +38,30 @@
 #define X86_VENDOR_ID	new_cpu_data+CPUINFO_x86_vendor_id
 
 /*
- * This is how much memory *in addition to the memory covered up to
- * and including _end* we need mapped initially.
- * We need:
- *  - one bit for each possible page, but only in low memory, which means
- *     2^32/4096/8 = 128K worst case (4G/4G split.)
+ * This is how much memory for page table to and including _end
+ * we need mapped initially.
  *  - enough space to map all low memory, which means
- *     (2^32/4096) / 1024 pages (worst case, non PAE)
- *     (2^32/4096) / 512 + 4 pages (worst case for PAE)
- *  - a few pages for allocator use before the kernel pagetable has
- *     been set up
+ *     (KERNEL_IMAGE_SIZE/4096) / 1024 pages (worst case, non PAE)
+ *     (KERNEL_IMAGE_SIZE/4096) / 512 + 4 pages (worst case for PAE)
  *
  * Modulo rounding, each megabyte assigned here requires a kilobyte of
  * memory, which is currently unreclaimed.
  *
  * This should be a multiple of a page.
+ *
+ * KERNEL_IMAGE_SIZE should be greater than pa(_end)
+ * and small than max_low_pfn, otherwise will waste some page table entries
  */
-LOW_PAGES = 1<<(32-PAGE_SHIFT_asm)
-
-/*
- * To preserve the DMA pool in PAGEALLOC kernels, we'll allocate
- * pagetables from above the 16MB DMA limit, so we'll have to set
- * up pagetables 16MB more (worst-case):
- */
-#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK)
-LOW_PAGES = LOW_PAGES + 0x1000000
-#endif
+LOW_PAGES = (KERNEL_IMAGE_SIZE + PAGE_SIZE_asm - 1)>>PAGE_SHIFT
 
 #if PTRS_PER_PMD > 1
 PAGE_TABLE_SIZE = (LOW_PAGES / PTRS_PER_PMD) + PTRS_PER_PGD
 #else
 PAGE_TABLE_SIZE = (LOW_PAGES / PTRS_PER_PGD)
 #endif
-BOOTBITMAP_SIZE = LOW_PAGES / 8
 ALLOCATOR_SLOP = 4
 
-INIT_MAP_BEYOND_END = BOOTBITMAP_SIZE + (PAGE_TABLE_SIZE + ALLOCATOR_SLOP)*PAGE_SIZE_asm
+INIT_MAP_SIZE = (PAGE_TABLE_SIZE + ALLOCATOR_SLOP) * PAGE_SIZE_asm
 
 /*
  * 32-bit kernel entrypoint; only used by the boot CPU.  On entry,
@@ -166,10 +154,9 @@ num_subarch_entries = (. - subarch_entri
 
 /*
  * Initialize page tables.  This creates a PDE and a set of page
- * tables, which are located immediately beyond _end.  The variable
- * init_pg_tables_end is set up to point to the first "safe" location.
+ * tables, which are located immediately beyond _end.
  * Mappings are created both at virtual address 0 (identity mapping)
- * and PAGE_OFFSET for up to _end+sizeof(page tables)+INIT_MAP_BEYOND_END.
+ * and PAGE_OFFSET for up to _end
  *
  * Note that the stack is not yet set up!
  */
@@ -209,14 +196,14 @@ default_entry:
 	loop 11b
 
 	/*
-	 * End condition: we must map up to and including INIT_MAP_BEYOND_END
-	 * bytes beyond the end of our own page tables.
+	 * End condition: we must map up to the end.
 	 */
-	leal (INIT_MAP_BEYOND_END+PTE_IDENT_ATTR)(%edi),%ebp
+	movl $pa(_end), %ebp
+	addl PTE_IDENT_ATTR, %ebp
 	cmpl %ebp,%eax
 	jb 10b
 1:
-	movl %edi,pa(init_pg_tables_end)
+	movl %edi, pa(init_pg_tables_end)
 	shrl $12, %eax
 	movl %eax, pa(max_pfn_mapped)
 
@@ -242,14 +229,14 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
 	addl $0x1000,%eax
 	loop 11b
 	/*
-	 * End condition: we must map up to and including INIT_MAP_BEYOND_END
-	 * bytes beyond the end of our own page tables; the +0x007 is
+	 * End condition: we must map up to end, the +0x007 is
 	 * the attribute bits
 	 */
-	leal (INIT_MAP_BEYOND_END+PTE_IDENT_ATTR)(%edi),%ebp
+	movl $pa(_end), %ebp
+	addl PTE_IDENT_ATTR, %ebp
 	cmpl %ebp,%eax
 	jb 10b
-	movl %edi,pa(init_pg_tables_end)
+	movl %edi, pa(init_pg_tables_end)
 	shrl $12, %eax
 	movl %eax, pa(max_pfn_mapped)
 
@@ -636,6 +623,10 @@ swapper_pg_fixmap:
 	.fill 1024,4,0
 ENTRY(empty_zero_page)
 	.fill 4096,1,0
+
+.section ".bss.extra_page_aligned","wa"
+	.align PAGE_SIZE_asm
+	.fill INIT_MAP_SIZE,1,0
 /*
  * This starts the data section.
  */
Index: linux-2.6/arch/x86/kernel/vmlinux_32.lds.S
===================================================================
--- linux-2.6.orig/arch/x86/kernel/vmlinux_32.lds.S
+++ linux-2.6/arch/x86/kernel/vmlinux_32.lds.S
@@ -189,10 +189,13 @@ SECTIONS
 	*(.bss)
 	. = ALIGN(4);
 	__bss_stop = .;
-  	_end = . ;
+	/* extra_page_aligned must be last one before end*/
 	/* This is where the kernel creates the early boot page tables */
 	. = ALIGN(PAGE_SIZE);
 	pg0 = . ;
+	*(.bss.extra_page_aligned)
+	. = ALIGN(8);
+	_end = . ;
   }
 
   /* Sections to be discarded */
@@ -205,6 +208,12 @@ SECTIONS
   DWARF_DEBUG
 }
 
+/*
+ * Build-time check on the image size:
+ */
+ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE),
+	"kernel image bigger than KERNEL_IMAGE_SIZE")
+
 #ifdef CONFIG_KEXEC
 /* Link time checks */
 #include <asm/kexec.h>
Index: linux-2.6/arch/x86/include/asm/page_32_types.h
===================================================================
--- linux-2.6.orig/arch/x86/include/asm/page_32_types.h
+++ linux-2.6/arch/x86/include/asm/page_32_types.h
@@ -39,6 +39,11 @@
 #define __VIRTUAL_MASK_SHIFT	32
 #endif	/* CONFIG_X86_PAE */
 
+/*
+ * Kernel image size is limited to 512 MB (see in arch/x86/kernel/head_32.S)
+ */
+#define KERNEL_IMAGE_SIZE	(512 * 1024 * 1024)
+
 #ifndef __ASSEMBLY__
 
 /*

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH] x86: put initial_pg_tables into .bss -v4
  2009-03-09  8:15                           ` [PATCH] x86: put initial_pg_tables into .bss -v4 Yinghai Lu
@ 2009-03-09 15:41                             ` H. Peter Anvin
  2009-03-09 17:35                               ` Yinghai Lu
  0 siblings, 1 reply; 43+ messages in thread
From: H. Peter Anvin @ 2009-03-09 15:41 UTC (permalink / raw)
  To: Yinghai Lu
  Cc: Ingo Molnar, Andrew Morton, Thomas Gleixner, Jeremy Fitzhardinge,
	Linux Kernel Mailing List

Yinghai Lu wrote:
> Impact: cleanup
> 
> Don't use ram after _end blindly for pagetables. aka init pages is before _end
> put those pg table into .bss
> 
> v2: keep initial page table up to 512M only.
> v4: put initial page tables just before _end
> 
> Signed-off-by: Yinghai Lu <yinghai@kernel.org>
> 

I still feel that this is a movement in *EXACTLY* the wrong direction,
as it is deliberately intended to prevent a general allocator for
anything that needs to be dynamic very early on.  I still think that
makes a lot more sense.

	-hpa


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH] x86: put initial_pg_tables into .bss -v4
  2009-03-09 15:41                             ` H. Peter Anvin
@ 2009-03-09 17:35                               ` Yinghai Lu
  2009-03-09 18:28                                 ` H. Peter Anvin
  0 siblings, 1 reply; 43+ messages in thread
From: Yinghai Lu @ 2009-03-09 17:35 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Ingo Molnar, Andrew Morton, Thomas Gleixner, Jeremy Fitzhardinge,
	Linux Kernel Mailing List

H. Peter Anvin wrote:
> Yinghai Lu wrote:
>> Impact: cleanup
>>
>> Don't use ram after _end blindly for pagetables. aka init pages is before _end
>> put those pg table into .bss
>>
>> v2: keep initial page table up to 512M only.
>> v4: put initial page tables just before _end
>>
>> Signed-off-by: Yinghai Lu <yinghai@kernel.org>
>>
> 
> I still feel that this is a movement in *EXACTLY* the wrong direction,
> as it is deliberately intended to prevent a general allocator for
> anything that needs to be dynamic very early on.  I still think that
> makes a lot more sense.

it just estimates the initial_pg_tables size and makes _end a little bigger (by 1MB), so the boot loader can know the correct size of vmlinux, i.e. the uncompressed size of the kernel.

I assume the brk patches could estimate the extra size they need, too.

YH



^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH] x86: put initial_pg_tables into .bss -v4
  2009-03-09 17:35                               ` Yinghai Lu
@ 2009-03-09 18:28                                 ` H. Peter Anvin
  2009-03-11  1:39                                   ` Jeremy Fitzhardinge
  0 siblings, 1 reply; 43+ messages in thread
From: H. Peter Anvin @ 2009-03-09 18:28 UTC (permalink / raw)
  To: Yinghai Lu
  Cc: Ingo Molnar, Andrew Morton, Thomas Gleixner, Jeremy Fitzhardinge,
	Linux Kernel Mailing List

Yinghai Lu wrote:
> 
> it just estimates initial_pg_tables size, and make _end a little bigger (1M), so boot loader could have idea of correct size of vmlinux aka the uncompressed size of in kernel.
> 
> I assume brk patches could estimate the extra size that it needs too.
> 

Yes, and I have made that point several times in this thread already.

I really like the brk interface because it's a clean, general-purpose 
allocator.

We can create a brk segment in vmlinux (and even bound the brk to catch 
overflows, instead of randomly failing) to advertise the presence of the 
brk.

	-hpa


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH] x86: put initial_pg_tables into .bss -v4
  2009-03-09 18:28                                 ` H. Peter Anvin
@ 2009-03-11  1:39                                   ` Jeremy Fitzhardinge
  0 siblings, 0 replies; 43+ messages in thread
From: Jeremy Fitzhardinge @ 2009-03-11  1:39 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Yinghai Lu, Ingo Molnar, Andrew Morton, Thomas Gleixner,
	Linux Kernel Mailing List

H. Peter Anvin wrote:
> Yinghai Lu wrote:
>   
>> it just estimates initial_pg_tables size, and make _end a little bigger (1M), so boot loader could have idea of correct size of vmlinux aka the uncompressed size of in kernel.
>>
>> I assume brk patches could estimate the extra size that it needs too.
>>
>>     
>
> Yes, and I have made that point several times in this thread already.
>
> I really like the brk interface because it's a clean, general-purpose 
> allocator.
>
> We can create a brk segment in vmlinux (and even bound the brk to catch 
> overflows, instead of randomly failing) to advertise the presence of the 
> brk.
>   

I have a patchset to do exactly this in testing; I'll try to mail it out 
later this evening.  It's very similar (including reserving 1MB after the 
end of the bss variables, in the .bss section).

    J

^ permalink raw reply	[flat|nested] 43+ messages in thread

end of thread, other threads:[~2009-03-11  1:39 UTC | newest]

Thread overview: 43+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-02-28  1:51 [PATCH] Simple brk allocator for very early allocations Jeremy Fitzhardinge
2009-02-28  1:51 ` [PATCH] x86: add brk allocation for very, " Jeremy Fitzhardinge
2009-02-28  1:51 ` [PATCH] x86: reserve brk earlier Jeremy Fitzhardinge
2009-02-28  1:51 ` [PATCH] x86-32: use brk segment for allocating initial kernel pagetable Jeremy Fitzhardinge
2009-02-28  7:02   ` Yinghai Lu
2009-02-28  7:05     ` J Jeremy Fitzhardinge
2009-02-28  7:15       ` J Ingo Molnar
2009-02-28  7:39         ` does boot loader check uncompressed kernel size? Yinghai Lu
2009-02-28  7:47           ` Cyrill Gorcunov
2009-02-28  7:54             ` Yinghai Lu
2009-02-28  8:08               ` H. Peter Anvin
2009-02-28 20:42                 ` Yinghai Lu
2009-02-28  7:52           ` brk patches Yinghai Lu
2009-02-28  8:08             ` H. Peter Anvin
2009-02-28  8:17             ` Jeremy Fitzhardinge
2009-02-28 20:40               ` Yinghai Lu
2009-03-01 23:53                 ` Jeremy Fitzhardinge
2009-03-02  1:02                   ` Yinghai Lu
2009-03-02  1:07                     ` H. Peter Anvin
2009-03-02  1:16                       ` Jeremy Fitzhardinge
2009-03-02  1:36                         ` H. Peter Anvin
2009-03-02  1:54                           ` Jeremy Fitzhardinge
2009-03-02  2:12                             ` Yinghai Lu
2009-03-01  1:23               ` [PATCH] x86: put initial_pg_tables into bss Yinghai Lu
2009-03-01  8:31                 ` [PATCH] x86: put initial_pg_tables into bss -v2 Yinghai Lu
2009-03-01  9:20                   ` H. Peter Anvin
2009-03-01 17:49                     ` Yinghai Lu
2009-03-01 18:06                     ` Yinghai Lu
2009-03-01 23:29                       ` H. Peter Anvin
2009-03-02  0:55                         ` Yinghai Lu
2009-03-09  8:15                           ` [PATCH] x86: put initial_pg_tables into .bss -v4 Yinghai Lu
2009-03-09 15:41                             ` H. Peter Anvin
2009-03-09 17:35                               ` Yinghai Lu
2009-03-09 18:28                                 ` H. Peter Anvin
2009-03-11  1:39                                   ` Jeremy Fitzhardinge
2009-03-09  7:45                         ` [PATCH] x86: put initial_pg_tables into bss -v2 Yinghai Lu
2009-02-28  8:07           ` does boot loader check uncompressed kernel size? H. Peter Anvin
2009-02-28  8:17         ` J Jeremy Fitzhardinge
2009-02-28  7:30       ` J Yinghai Lu
2009-02-28  1:51 ` [PATCH] x86: use brk allocation for DMI Jeremy Fitzhardinge
2009-02-28  1:51 ` [PATCH] x86: leave _brk_end defined Jeremy Fitzhardinge
2009-02-28  5:23 ` [PATCH] Simple brk allocator for very early allocations Andrew Morton
2009-02-28  6:30   ` Jeremy Fitzhardinge

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.