All of lore.kernel.org
 help / color / mirror / Atom feed
From: Yinghai Lu <yinghai@kernel.org>
To: Thomas Gleixner <tglx@linutronix.de>, Ingo Molnar <mingo@elte.hu>,
	"H. Peter Anvin" <hpa@zytor.com>, Jacob Shin <jacob.shin@amd.com>,
	Tejun Heo <tj@kernel.org>
Cc: Stefano Stabellini <stefano.stabellini@eu.citrix.com>,
	linux-kernel@vger.kernel.org, Yinghai Lu <yinghai@kernel.org>
Subject: [PATCH 06/19] x86, mm: setup page table in top-down
Date: Thu, 18 Oct 2012 13:50:17 -0700	[thread overview]
Message-ID: <1350593430-24470-10-git-send-email-yinghai@kernel.org> (raw)
In-Reply-To: <1350593430-24470-1-git-send-email-yinghai@kernel.org>

Get pgt_buf early from BRK, and use it to map PMD_SIZE from top at first.
then use mapped pages to map more range below, and keep looping until
all pages get mapped.

alloc_low_page will use page from BRK at first, after that buff is used up,
will use memblock to find and reserve page for page table usage.

At last we could get rid of calculation and find early pgt related code.

-v2: update to after fix_xen change,
     also use MACRO for initial pgt_buf size and add comments with it.
-v3: skip big reserved range in memblock.reserved near end.
-v4: don't need fix_xen change now.

Suggested-by: "H. Peter Anvin" <hpa@zytor.com>
Signed-off-by: Yinghai Lu <yinghai@kernel.org>
---
 arch/x86/include/asm/page_types.h |    1 +
 arch/x86/include/asm/pgtable.h    |    1 +
 arch/x86/kernel/setup.c           |    3 +
 arch/x86/mm/init.c                |  207 ++++++++++--------------------------
 arch/x86/mm/init_32.c             |   17 +++-
 arch/x86/mm/init_64.c             |   17 +++-
 6 files changed, 91 insertions(+), 155 deletions(-)

diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index 54c9787..9f6f3e6 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -45,6 +45,7 @@ extern int devmem_is_allowed(unsigned long pagenr);
 
 extern unsigned long max_low_pfn_mapped;
 extern unsigned long max_pfn_mapped;
+extern unsigned long min_pfn_mapped;
 
 static inline phys_addr_t get_max_mapped(void)
 {
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index dd1a888..6991a3e 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -603,6 +603,7 @@ static inline int pgd_none(pgd_t pgd)
 
 extern int direct_gbpages;
 void init_mem_mapping(void);
+void early_alloc_pgt_buf(void);
 
 /* local pte updates need not use xchg for locking */
 static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep)
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index e72e4c6..73cb7ba 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -124,6 +124,7 @@
  */
 unsigned long max_low_pfn_mapped;
 unsigned long max_pfn_mapped;
+unsigned long min_pfn_mapped;
 
 #ifdef CONFIG_DMI
 RESERVE_BRK(dmi_alloc, 65536);
@@ -897,6 +898,8 @@ void __init setup_arch(char **cmdline_p)
 
 	reserve_ibft_region();
 
+	early_alloc_pgt_buf();
+
 	/*
 	 * Need to conclude brk, before memblock_x86_fill()
 	 *  it could use memblock_find_in_range, could overlap with
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index dbb2916..9ff29c1 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -21,6 +21,21 @@ unsigned long __initdata pgt_buf_start;
 unsigned long __meminitdata pgt_buf_end;
 unsigned long __meminitdata pgt_buf_top;
 
+/* need 4 4k for initial PMD_SIZE, 4k for 0-ISA_END_ADDRESS */
+#define INIT_PGT_BUF_SIZE	(5 * PAGE_SIZE)
+RESERVE_BRK(early_pgt_alloc, INIT_PGT_BUF_SIZE);
+void  __init early_alloc_pgt_buf(void)
+{
+	unsigned long tables = INIT_PGT_BUF_SIZE;
+	phys_addr_t base;
+
+	base = __pa(extend_brk(tables, PAGE_SIZE));
+
+	pgt_buf_start = base >> PAGE_SHIFT;
+	pgt_buf_end = pgt_buf_start;
+	pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT);
+}
+
 int after_bootmem;
 
 int direct_gbpages
@@ -228,105 +243,6 @@ static int __meminit split_mem_range(struct map_range *mr, int nr_range,
 	return nr_range;
 }
 
-static unsigned long __init calculate_table_space_size(unsigned long start,
-					  unsigned long end)
-{
-	unsigned long puds = 0, pmds = 0, ptes = 0, tables;
-	struct map_range mr[NR_RANGE_MR];
-	int nr_range, i;
-
-	pr_info("calculate_table_space_size: [mem %#010lx-%#010lx]\n",
-	       start, end - 1);
-
-	memset(mr, 0, sizeof(mr));
-	nr_range = 0;
-	nr_range = split_mem_range(mr, nr_range, start, end);
-
-	for (i = 0; i < nr_range; i++) {
-		unsigned long range, extra;
-
-		range = mr[i].end - mr[i].start;
-		puds += (range + PUD_SIZE - 1) >> PUD_SHIFT;
-
-		if (mr[i].page_size_mask & (1 << PG_LEVEL_1G)) {
-			extra = range - ((range >> PUD_SHIFT) << PUD_SHIFT);
-			pmds += (extra + PMD_SIZE - 1) >> PMD_SHIFT;
-		} else
-			pmds += (range + PMD_SIZE - 1) >> PMD_SHIFT;
-
-		if (mr[i].page_size_mask & (1 << PG_LEVEL_2M)) {
-			extra = range - ((range >> PMD_SHIFT) << PMD_SHIFT);
-#ifdef CONFIG_X86_32
-			extra += PMD_SIZE;
-#endif
-			/* The first 2/4M doesn't use large pages. */
-			if (mr[i].start < PMD_SIZE)
-				extra += PMD_SIZE - mr[i].start;
-
-			ptes += (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
-		} else
-			ptes += (range + PAGE_SIZE - 1) >> PAGE_SHIFT;
-	}
-
-	tables = roundup(puds * sizeof(pud_t), PAGE_SIZE);
-	tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE);
-	tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE);
-
-#ifdef CONFIG_X86_32
-	/* for fixmap */
-	tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE);
-#endif
-
-	return tables;
-}
-
-static unsigned long __init calculate_all_table_space_size(void)
-{
-	unsigned long start_pfn, end_pfn;
-	unsigned long tables;
-	int i;
-
-	/* the ISA range is always mapped regardless of memory holes */
-	tables = calculate_table_space_size(0, ISA_END_ADDRESS);
-
-	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) {
-		u64 start = start_pfn << PAGE_SHIFT;
-		u64 end = end_pfn << PAGE_SHIFT;
-
-		if (end <= ISA_END_ADDRESS)
-			continue;
-
-		if (start < ISA_END_ADDRESS)
-			start = ISA_END_ADDRESS;
-#ifdef CONFIG_X86_32
-		/* on 32 bit, we only map up to max_low_pfn */
-		if ((start >> PAGE_SHIFT) >= max_low_pfn)
-			continue;
-
-		if ((end >> PAGE_SHIFT) > max_low_pfn)
-			end = max_low_pfn << PAGE_SHIFT;
-#endif
-		tables += calculate_table_space_size(start, end);
-	}
-
-	return tables;
-}
-
-static void __init find_early_table_space(unsigned long start,
-					  unsigned long good_end,
-					  unsigned long tables)
-{
-	phys_addr_t base;
-
-	base = memblock_find_in_range(start, good_end, tables, PAGE_SIZE);
-	if (!base)
-		panic("Cannot find space for the kernel page tables");
-
-	pgt_buf_start = base >> PAGE_SHIFT;
-	pgt_buf_end = pgt_buf_start;
-	pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT);
-}
-
 static struct range pfn_mapped[E820_X_MAX];
 static int nr_pfn_mapped;
 
@@ -391,17 +307,14 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
 }
 
 /*
- * Iterate through E820 memory map and create direct mappings for only E820_RAM
- * regions. We cannot simply create direct mappings for all pfns from
- * [0 to max_low_pfn) and [4GB to max_pfn) because of possible memory holes in
- * high addresses that cannot be marked as UC by fixed/variable range MTRRs.
- * Depending on the alignment of E820 ranges, this may possibly result in using
- * smaller size (i.e. 4K instead of 2M or 1G) page tables.
+ * this one could take range with hole in it
  */
-static void __init init_range_memory_mapping(unsigned long range_start,
+static unsigned long __init init_range_memory_mapping(
+					   unsigned long range_start,
 					   unsigned long range_end)
 {
 	unsigned long start_pfn, end_pfn;
+	unsigned long mapped_ram_size = 0;
 	int i;
 
 	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) {
@@ -421,71 +334,67 @@ static void __init init_range_memory_mapping(unsigned long range_start,
 			end = range_end;
 
 		init_memory_mapping(start, end);
+
+		mapped_ram_size += end - start;
 	}
+
+	return mapped_ram_size;
 }
 
 void __init init_mem_mapping(void)
 {
-	unsigned long tables, good_end, end;
+	unsigned long end, real_end, start, last_start;
+	unsigned long step_size;
+	unsigned long addr;
+	unsigned long mapped_ram_size = 0;
+	unsigned long new_mapped_ram_size;
 
 	probe_page_size_mask();
 
-	/*
-	 * Find space for the kernel direct mapping tables.
-	 *
-	 * Later we should allocate these tables in the local node of the
-	 * memory mapped. Unfortunately this is done currently before the
-	 * nodes are discovered.
-	 */
 #ifdef CONFIG_X86_64
 	end = max_pfn << PAGE_SHIFT;
-	good_end = end;
 #else
 	end = max_low_pfn << PAGE_SHIFT;
-	good_end = max_pfn_mapped << PAGE_SHIFT;
 #endif
-	tables = calculate_all_table_space_size();
-	find_early_table_space(0, good_end, tables);
-	printk(KERN_DEBUG "kernel direct mapping tables up to %#lx @ [mem %#010lx-%#010lx] prealloc\n",
-		end - 1, pgt_buf_start << PAGE_SHIFT,
-		(pgt_buf_top << PAGE_SHIFT) - 1);
 
-	max_pfn_mapped = 0; /* will get exact value next */
 	/* the ISA range is always mapped regardless of memory holes */
 	init_memory_mapping(0, ISA_END_ADDRESS);
-	init_range_memory_mapping(ISA_END_ADDRESS, end);
+
+	/* xen has big range in reserved near end of ram, skip it at first */
+	addr = memblock_find_in_range(ISA_END_ADDRESS, end, PMD_SIZE,
+			 PAGE_SIZE);
+	real_end = addr + PMD_SIZE;
+
+	/* step_size need to be small so pgt_buf from BRK could cover it */
+	step_size = PMD_SIZE;
+	max_pfn_mapped = 0; /* will get exact value next */
+	min_pfn_mapped = real_end >> PAGE_SHIFT;
+	last_start = start = real_end;
+	while (last_start > ISA_END_ADDRESS) {
+		if (last_start > step_size) {
+			start = round_down(last_start - 1, step_size);
+			if (start < ISA_END_ADDRESS)
+				start = ISA_END_ADDRESS;
+		} else
+			start = ISA_END_ADDRESS;
+		new_mapped_ram_size = init_range_memory_mapping(start,
+							last_start);
+		last_start = start;
+		min_pfn_mapped = last_start >> PAGE_SHIFT;
+		if (new_mapped_ram_size > mapped_ram_size)
+			step_size <<= 5;
+		mapped_ram_size += new_mapped_ram_size;
+	}
+
+	if (real_end < end)
+		init_range_memory_mapping(real_end, end);
+
 #ifdef CONFIG_X86_64
 	if (max_pfn > max_low_pfn) {
 		/* can we preseve max_low_pfn ?*/
 		max_low_pfn = max_pfn;
 	}
 #endif
-	/*
-	 * Reserve the kernel pagetable pages we used (pgt_buf_start -
-	 * pgt_buf_end) and free the other ones (pgt_buf_end - pgt_buf_top)
-	 * so that they can be reused for other purposes.
-	 *
-	 * On native it just means calling memblock_reserve, on Xen it also
-	 * means marking RW the pagetable pages that we allocated before
-	 * but that haven't been used.
-	 *
-	 * In fact on xen we mark RO the whole range pgt_buf_start -
-	 * pgt_buf_top, because we have to make sure that when
-	 * init_memory_mapping reaches the pagetable pages area, it maps
-	 * RO all the pagetable pages, including the ones that are beyond
-	 * pgt_buf_end at that time.
-	 */
-	if (pgt_buf_end > pgt_buf_start) {
-		printk(KERN_DEBUG "kernel direct mapping tables up to %#lx @ [mem %#010lx-%#010lx] final\n",
-			end - 1, pgt_buf_start << PAGE_SHIFT,
-			(pgt_buf_end << PAGE_SHIFT) - 1);
-		x86_init.mapping.pagetable_reserve(PFN_PHYS(pgt_buf_start),
-				PFN_PHYS(pgt_buf_end));
-	}
-
-	/* stop the wrong using */
-	pgt_buf_top = 0;
-
 	early_memtest(0, max_pfn_mapped << PAGE_SHIFT);
 }
 
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 27f7fc6..7bb1106 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -61,11 +61,22 @@ bool __read_mostly __vmalloc_start_set = false;
 
 static __init void *alloc_low_page(void)
 {
-	unsigned long pfn = pgt_buf_end++;
+	unsigned long pfn;
 	void *adr;
 
-	if (pfn >= pgt_buf_top)
-		panic("alloc_low_page: ran out of memory");
+	if ((pgt_buf_end + 1) >= pgt_buf_top) {
+		unsigned long ret;
+		if (min_pfn_mapped >= max_pfn_mapped)
+			panic("alloc_low_page: ran out of memory");
+		ret = memblock_find_in_range(min_pfn_mapped << PAGE_SHIFT,
+					max_pfn_mapped << PAGE_SHIFT,
+					PAGE_SIZE, PAGE_SIZE);
+		if (!ret)
+			panic("alloc_low_page: can not alloc memory");
+		memblock_reserve(ret, PAGE_SIZE);
+		pfn = ret >> PAGE_SHIFT;
+	} else
+		pfn = pgt_buf_end++;
 
 	adr = __va(pfn * PAGE_SIZE);
 	clear_page(adr);
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 4898e80..7dfa69b 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -316,7 +316,7 @@ void __init cleanup_highmap(void)
 
 static __ref void *alloc_low_page(unsigned long *phys)
 {
-	unsigned long pfn = pgt_buf_end++;
+	unsigned long pfn;
 	void *adr;
 
 	if (after_bootmem) {
@@ -326,8 +326,19 @@ static __ref void *alloc_low_page(unsigned long *phys)
 		return adr;
 	}
 
-	if (pfn >= pgt_buf_top)
-		panic("alloc_low_page: ran out of memory");
+	if ((pgt_buf_end + 1) >= pgt_buf_top) {
+		unsigned long ret;
+		if (min_pfn_mapped >= max_pfn_mapped)
+			panic("alloc_low_page: ran out of memory");
+		ret = memblock_find_in_range(min_pfn_mapped << PAGE_SHIFT,
+					max_pfn_mapped << PAGE_SHIFT,
+					PAGE_SIZE, PAGE_SIZE);
+		if (!ret)
+			panic("alloc_low_page: can not alloc memory");
+		memblock_reserve(ret, PAGE_SIZE);
+		pfn = ret >> PAGE_SHIFT;
+	} else
+		pfn = pgt_buf_end++;
 
 	adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE);
 	clear_page(adr);
-- 
1.7.7


  parent reply	other threads:[~2012-10-18 20:55 UTC|newest]

Thread overview: 63+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2012-10-18 20:50 [PATCH -v5 00/19] x86: Use BRK to pre mapping page table to make xen happy Yinghai Lu
2012-10-18 20:50 ` [PATCH 1/3] ACPI: Introduce a new acpi handle to determine HID match Yinghai Lu
2012-11-02 12:23   ` Rafael J. Wysocki
2012-11-02 15:03     ` Yinghai Lu
2012-10-18 20:50 ` [PATCH 01/19] x86, mm: Align start address to correct big page size Yinghai Lu
2012-10-22 14:16   ` Konrad Rzeszutek Wilk
2012-10-22 16:31     ` Yinghai Lu
2012-10-18 20:50 ` [PATCH 2/3] PCI: correctly detect ACPI PCI host bridge objects Yinghai Lu
2012-10-26  9:10   ` Bjorn Helgaas
2012-10-26 18:13     ` Yinghai Lu
2012-10-18 20:50 ` [PATCH 02/19] x86, mm: Use big page size for small memory range Yinghai Lu
2012-10-22 14:21   ` Konrad Rzeszutek Wilk
2012-10-22 16:33     ` Yinghai Lu
2012-10-18 20:50 ` [PATCH 3/3] PCI, ACPI: debug print for installation of acpi root bridge's notifier Yinghai Lu
2012-10-18 20:50 ` [PATCH 03/19] x86, mm: Don't clear page table if range is ram Yinghai Lu
2012-10-22 14:28   ` Konrad Rzeszutek Wilk
2012-10-22 16:56     ` Yinghai Lu
2012-10-18 20:50 ` [PATCH 04/19] x86, mm: only keep initial mapping for ram Yinghai Lu
2012-10-22 14:33   ` Konrad Rzeszutek Wilk
2012-10-22 17:43     ` Yinghai Lu
2012-10-18 20:50 ` [PATCH 05/19] x86, mm: Break down init_all_memory_mapping Yinghai Lu
2012-10-18 20:50 ` Yinghai Lu [this message]
2012-10-19 16:24   ` [PATCH 06/19] x86, mm: setup page table in top-down Stefano Stabellini
2012-10-19 16:41     ` Yinghai Lu
2012-10-22 13:19       ` Stefano Stabellini
2012-10-22 18:17         ` Yinghai Lu
2012-10-22 18:22           ` Yinghai Lu
2012-10-23 12:16             ` Stefano Stabellini
2012-10-23 18:47               ` Yinghai Lu
2012-10-23 12:22           ` Stefano Stabellini
2012-10-23 18:37             ` Yinghai Lu
2012-10-24 10:55               ` Stefano Stabellini
2012-10-22 14:14       ` Konrad Rzeszutek Wilk
2012-10-22 15:06   ` Konrad Rzeszutek Wilk
2012-10-22 18:56     ` Yinghai Lu
2012-10-18 20:50 ` [PATCH 07/19] x86, mm: Remove early_memremap workaround for page table accessing on 64bit Yinghai Lu
2012-10-22 15:07   ` Konrad Rzeszutek Wilk
2012-10-22 19:08     ` Yinghai Lu
2012-10-18 20:50 ` [PATCH 08/19] x86, mm: Remove parameter in alloc_low_page for 64bit Yinghai Lu
2012-10-22 15:09   ` Konrad Rzeszutek Wilk
2012-10-22 19:09     ` Yinghai Lu
2012-10-18 20:50 ` [PATCH 09/19] x86, mm: Merge alloc_low_page between 64bit and 32bit Yinghai Lu
2012-10-22 15:11   ` Konrad Rzeszutek Wilk
2012-10-22 19:14     ` Yinghai Lu
2012-10-18 20:50 ` [PATCH 10/19] x86, mm: Move min_pfn_mapped back to mm/init.c Yinghai Lu
2012-10-18 20:50 ` [PATCH 11/19] x86, mm, xen: Remove mapping_pagatable_reserve Yinghai Lu
2012-10-22 15:14   ` Konrad Rzeszutek Wilk
2012-10-22 19:18     ` Yinghai Lu
2012-10-18 20:50 ` [PATCH 12/19] x86, mm: Add alloc_low_pages(num) Yinghai Lu
2012-10-22 15:17   ` Konrad Rzeszutek Wilk
2012-10-22 19:24     ` Yinghai Lu
2012-10-18 20:50 ` [PATCH 13/19] x86, mm: only call early_ioremap_page_table_range_init() once Yinghai Lu
2012-10-22 15:24   ` Konrad Rzeszutek Wilk
2012-10-22 19:40     ` Yinghai Lu
2012-10-23  0:01       ` Yinghai Lu
2012-10-18 20:50 ` [PATCH 14/19] x86, mm: Move back pgt_buf_* to mm/init.c Yinghai Lu
2012-10-18 20:50 ` [PATCH 15/19] x86, mm: Move init_gbpages() out of setup.c Yinghai Lu
2012-10-18 20:50 ` [PATCH 16/19] x86, mm: change low/hignmem_pfn_init to static on 32bit Yinghai Lu
2012-10-18 20:50 ` [PATCH 17/19] x86, mm: Move function declaration into mm_internal.h Yinghai Lu
2012-10-18 20:50 ` [PATCH 18/19] x86, mm: Let "memmap=" take more entries one time Yinghai Lu
2012-10-22 15:19   ` Konrad Rzeszutek Wilk
2012-10-22 19:26     ` Yinghai Lu
2012-10-18 20:50 ` [PATCH 19/19] x86, mm: Add check before clear pte above max_low_pfn on 32bit Yinghai Lu

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1350593430-24470-10-git-send-email-yinghai@kernel.org \
    --to=yinghai@kernel.org \
    --cc=hpa@zytor.com \
    --cc=jacob.shin@amd.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mingo@elte.hu \
    --cc=stefano.stabellini@eu.citrix.com \
    --cc=tglx@linutronix.de \
    --cc=tj@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.