linux-kernel.vger.kernel.org archive mirror
* [patch 0/7] x86, cpa: cpa related changes to be inline with TLB Application note - v2
@ 2008-09-23 21:00 Suresh Siddha
  2008-09-23 21:00 ` [patch 1/7] x86, cpa: rename PTE attribute macros for kernel direct mapping in early boot Suresh Siddha
                   ` (7 more replies)
  0 siblings, 8 replies; 18+ messages in thread
From: Suresh Siddha @ 2008-09-23 21:00 UTC (permalink / raw)
  To: mingo, hpa, tglx, arjan, venkatesh.pallipadi, jeremy; +Cc: linux-kernel

TLB Application note[1] says:
	
"The TLBs may contain both ordinary and large-page translations for a 4-KByte
 range of linear addresses. This may occur if software modifies the paging
 structures so that the page size used for the address range changes. If the
 two translations differ with respect to page frame or attributes (e.g.,
 permissions), processor behavior is undefined and may be implementation 
 specific. The processor may use a page frame or attributes that correspond to
 neither translation; it may improperly set or fail to set the dirty bit in the
 appropriate paging-structure entry.
 
 Such undefined behavior is problematic because prefetches and memory accesses
 that are a result of speculative execution may occur, using the affected range
 of linear addresses. It is also problematic if software (including the software
 modifying the paging structures) is accessing data or executing code in the
 affected range of linear addresses. Software should not write to a
 paging-structure entry in a way that would change, for any linear address,
 both the page size and either the page frame or attributes."

Currently we violate this at:

a. kernel identity mapping, where the large/small pages set up very early in
   boot are split up/merged into large pages, along with attribute changes,
   during the direct memory mapping init.

b. while doing cpa(), where we potentially split a large page and change its
   attributes at the same time.

The following patches fix this behavior.
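
To make the hazard concrete, here is a hypothetical, much simplified sketch in
kernel-style C (not taken from the patches; pmd, pte, pfn and the new_ptes_*
pte pages are placeholders):

	/*
	 * Unsafe: replaces a 2M PAGE_KERNEL_LARGE entry with a pte page whose
	 * 4K entries already carry different attributes (e.g. NX), without an
	 * intervening TLB flush.  Another cpu may still hold the old
	 * large-page translation.
	 */
	pmd_populate_kernel(&init_mm, pmd, new_ptes_with_nx);

	/*
	 * Safe: split keeping the attributes identical to the large mapping,
	 * flush, and only then change the attributes in a separate step.
	 */
	pmd_populate_kernel(&init_mm, pmd, new_ptes_same_attrs);
	flush_tlb_all();
	set_pte(pte, pfn_pte(pfn, PAGE_KERNEL));	/* apply the final attributes */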

[1] http://developer.intel.com/design/processor/applnots/317080.pdf

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
--- 
v2:
 1. Fix the deadlock associated with smp call function and spin_lock_irqsave()
    for pgd_lock.
 2. No alias checks for __set_pages_np()/__set_pages_p() avoiding the large
    text page split from atomic context in CONFIG_DEBUG_PAGEALLOC.
 3. Serialize cpa() for !CONFIG_DEBUG_PAGEALLOC, so that a cpu with stale
    large page TLBs (but small PTEs in memory) won't do a cpa() in parallel
    with some other cpu splitting large pages and changing page attributes
    for a small page.
    



* [patch 1/7] x86, cpa: rename PTE attribute macros for kernel direct mapping in early boot
  2008-09-23 21:00 [patch 0/7] x86, cpa: cpa related changes to be inline with TLB Application note - v2 Suresh Siddha
@ 2008-09-23 21:00 ` Suresh Siddha
  2008-09-23 21:00 ` [patch 2/7] x86, cpa: remove USER permission from the very early identity mapping attribute Suresh Siddha
                   ` (6 subsequent siblings)
  7 siblings, 0 replies; 18+ messages in thread
From: Suresh Siddha @ 2008-09-23 21:00 UTC (permalink / raw)
  To: mingo, hpa, tglx, arjan, venkatesh.pallipadi, jeremy
  Cc: linux-kernel, Suresh Siddha

[-- Attachment #1: rename_init_pte_macros.patch --]
[-- Type: text/plain, Size: 4907 bytes --]

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
---

Index: tip/arch/x86/kernel/head_32.S
===================================================================
--- tip.orig/arch/x86/kernel/head_32.S	2008-09-22 14:01:06.000000000 -0700
+++ tip/arch/x86/kernel/head_32.S	2008-09-22 14:01:07.000000000 -0700
@@ -172,10 +172,6 @@
  *
  * Note that the stack is not yet set up!
  */
-#define PTE_ATTR	0x007		/* PRESENT+RW+USER */
-#define PDE_ATTR	0x067		/* PRESENT+RW+USER+DIRTY+ACCESSED */
-#define PGD_ATTR	0x001		/* PRESENT (no other attributes) */
-
 default_entry:
 #ifdef CONFIG_X86_PAE
 
@@ -196,9 +192,9 @@
 	movl $pa(pg0), %edi
 	movl %edi, pa(init_pg_tables_start)
 	movl $pa(swapper_pg_pmd), %edx
-	movl $PTE_ATTR, %eax
+	movl $PTE_IDENT_ATTR, %eax
 10:
-	leal PDE_ATTR(%edi),%ecx		/* Create PMD entry */
+	leal PDE_IDENT_ATTR(%edi),%ecx		/* Create PMD entry */
 	movl %ecx,(%edx)			/* Store PMD entry */
 						/* Upper half already zero */
 	addl $8,%edx
@@ -215,7 +211,7 @@
 	 * End condition: we must map up to and including INIT_MAP_BEYOND_END
 	 * bytes beyond the end of our own page tables.
 	 */
-	leal (INIT_MAP_BEYOND_END+PTE_ATTR)(%edi),%ebp
+	leal (INIT_MAP_BEYOND_END+PTE_IDENT_ATTR)(%edi),%ebp
 	cmpl %ebp,%eax
 	jb 10b
 1:
@@ -224,7 +220,7 @@
 	movl %eax, pa(max_pfn_mapped)
 
 	/* Do early initialization of the fixmap area */
-	movl $pa(swapper_pg_fixmap)+PDE_ATTR,%eax
+	movl $pa(swapper_pg_fixmap)+PDE_IDENT_ATTR,%eax
 	movl %eax,pa(swapper_pg_pmd+0x1000*KPMDS-8)
 #else	/* Not PAE */
 
@@ -233,9 +229,9 @@
 	movl $pa(pg0), %edi
 	movl %edi, pa(init_pg_tables_start)
 	movl $pa(swapper_pg_dir), %edx
-	movl $PTE_ATTR, %eax
+	movl $PTE_IDENT_ATTR, %eax
 10:
-	leal PDE_ATTR(%edi),%ecx		/* Create PDE entry */
+	leal PDE_IDENT_ATTR(%edi),%ecx		/* Create PDE entry */
 	movl %ecx,(%edx)			/* Store identity PDE entry */
 	movl %ecx,page_pde_offset(%edx)		/* Store kernel PDE entry */
 	addl $4,%edx
@@ -249,7 +245,7 @@
 	 * bytes beyond the end of our own page tables; the +0x007 is
 	 * the attribute bits
 	 */
-	leal (INIT_MAP_BEYOND_END+PTE_ATTR)(%edi),%ebp
+	leal (INIT_MAP_BEYOND_END+PTE_IDENT_ATTR)(%edi),%ebp
 	cmpl %ebp,%eax
 	jb 10b
 	movl %edi,pa(init_pg_tables_end)
@@ -257,7 +253,7 @@
 	movl %eax, pa(max_pfn_mapped)
 
 	/* Do early initialization of the fixmap area */
-	movl $pa(swapper_pg_fixmap)+PDE_ATTR,%eax
+	movl $pa(swapper_pg_fixmap)+PDE_IDENT_ATTR,%eax
 	movl %eax,pa(swapper_pg_dir+0xffc)
 #endif
 	jmp 3f
@@ -634,19 +630,19 @@
 	/* Page-aligned for the benefit of paravirt? */
 	.align PAGE_SIZE_asm
 ENTRY(swapper_pg_dir)
-	.long	pa(swapper_pg_pmd+PGD_ATTR),0		/* low identity map */
+	.long	pa(swapper_pg_pmd+PGD_IDENT_ATTR),0	/* low identity map */
 # if KPMDS == 3
-	.long	pa(swapper_pg_pmd+PGD_ATTR),0
-	.long	pa(swapper_pg_pmd+PGD_ATTR+0x1000),0
-	.long	pa(swapper_pg_pmd+PGD_ATTR+0x2000),0
+	.long	pa(swapper_pg_pmd+PGD_IDENT_ATTR),0
+	.long	pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x1000),0
+	.long	pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x2000),0
 # elif KPMDS == 2
 	.long	0,0
-	.long	pa(swapper_pg_pmd+PGD_ATTR),0
-	.long	pa(swapper_pg_pmd+PGD_ATTR+0x1000),0
+	.long	pa(swapper_pg_pmd+PGD_IDENT_ATTR),0
+	.long	pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x1000),0
 # elif KPMDS == 1
 	.long	0,0
 	.long	0,0
-	.long	pa(swapper_pg_pmd+PGD_ATTR),0
+	.long	pa(swapper_pg_pmd+PGD_IDENT_ATTR),0
 # else
 #  error "Kernel PMDs should be 1, 2 or 3"
 # endif
Index: tip/arch/x86/kernel/head_64.S
===================================================================
--- tip.orig/arch/x86/kernel/head_64.S	2008-09-22 14:01:06.000000000 -0700
+++ tip/arch/x86/kernel/head_64.S	2008-09-22 14:01:07.000000000 -0700
@@ -110,7 +110,7 @@
 	movq	%rdi, %rax
 	shrq	$PMD_SHIFT, %rax
 	andq	$(PTRS_PER_PMD - 1), %rax
-	leaq	__PAGE_KERNEL_LARGE_EXEC(%rdi), %rdx
+	leaq	__PAGE_KERNEL_IDENT_LARGE_EXEC(%rdi), %rdx
 	leaq	level2_spare_pgt(%rip), %rbx
 	movq	%rdx, 0(%rbx, %rax, 8)
 ident_complete:
@@ -374,7 +374,7 @@
 	/* Since I easily can, map the first 1G.
 	 * Don't set NX because code runs from these pages.
 	 */
-	PMDS(0, __PAGE_KERNEL_LARGE_EXEC, PTRS_PER_PMD)
+	PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
 
 NEXT_PAGE(level2_kernel_pgt)
 	/*
Index: tip/include/asm-x86/pgtable.h
===================================================================
--- tip.orig/include/asm-x86/pgtable.h	2008-09-22 14:01:06.000000000 -0700
+++ tip/include/asm-x86/pgtable.h	2008-09-22 14:01:07.000000000 -0700
@@ -142,6 +142,17 @@
 #define __S110	PAGE_SHARED_EXEC
 #define __S111	PAGE_SHARED_EXEC
 
+/*
+ * early identity mapping  pte attrib macros.
+ */
+#ifdef CONFIG_X86_64
+#define __PAGE_KERNEL_IDENT_LARGE_EXEC	__PAGE_KERNEL_LARGE_EXEC
+#else
+#define PTE_IDENT_ATTR	 0x007		/* PRESENT+RW+USER */
+#define PDE_IDENT_ATTR	 0x067		/* PRESENT+RW+USER+DIRTY+ACCESSED */
+#define PGD_IDENT_ATTR	 0x001		/* PRESENT (no other attributes) */
+#endif
+
 #ifndef __ASSEMBLY__
 
 /*

-- 



* [patch 2/7] x86, cpa: remove USER permission from the very early identity mapping attribute
  2008-09-23 21:00 [patch 0/7] x86, cpa: cpa related changes to be inline with TLB Application note - v2 Suresh Siddha
  2008-09-23 21:00 ` [patch 1/7] x86, cpa: rename PTE attribute macros for kernel direct mapping in early boot Suresh Siddha
@ 2008-09-23 21:00 ` Suresh Siddha
  2008-09-23 21:00 ` [patch 3/7] x86, cpa: make the kernel physical mapping initialization a two pass sequence Suresh Siddha
                   ` (5 subsequent siblings)
  7 siblings, 0 replies; 18+ messages in thread
From: Suresh Siddha @ 2008-09-23 21:00 UTC (permalink / raw)
  To: mingo, hpa, tglx, arjan, venkatesh.pallipadi, jeremy
  Cc: linux-kernel, Suresh Siddha

[-- Attachment #1: change_init_pte_mapping.patch --]
[-- Type: text/plain, Size: 1050 bytes --]

Remove USER from the PTE/PDE attributes for the very early identity
mapping. We overwrite these mappings with KERNEL attributes later in the
boot, so there is no need for the USER bit to be set; this is just being
paranoid.

If this breaks something (we don't know the history), then we can simply
drop this change.
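
For reference, a small standalone check of the attribute arithmetic (the bit
values are the standard x86 page-table flag positions; the macro names below
are local to this sketch, not the kernel's):

	#include <assert.h>

	#define PRESENT		0x001
	#define RW		0x002
	#define USER		0x004
	#define ACCESSED	0x020
	#define DIRTY		0x040

	int main(void)
	{
		assert((PRESENT | RW | USER) == 0x007);			/* old PTE_IDENT_ATTR */
		assert((PRESENT | RW) == 0x003);			/* new PTE_IDENT_ATTR */
		assert((PRESENT | RW | USER | ACCESSED | DIRTY) == 0x067); /* old PDE_IDENT_ATTR */
		assert((PRESENT | RW | ACCESSED | DIRTY) == 0x063);	/* new PDE_IDENT_ATTR */
		return 0;
	}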

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
---

Index: tip/include/asm-x86/pgtable.h
===================================================================
--- tip.orig/include/asm-x86/pgtable.h	2008-09-22 15:59:32.000000000 -0700
+++ tip/include/asm-x86/pgtable.h	2008-09-22 15:59:34.000000000 -0700
@@ -148,8 +148,8 @@
 #ifdef CONFIG_X86_64
 #define __PAGE_KERNEL_IDENT_LARGE_EXEC	__PAGE_KERNEL_LARGE_EXEC
 #else
-#define PTE_IDENT_ATTR	 0x007		/* PRESENT+RW+USER */
-#define PDE_IDENT_ATTR	 0x067		/* PRESENT+RW+USER+DIRTY+ACCESSED */
+#define PTE_IDENT_ATTR	 0x003		/* PRESENT+RW */
+#define PDE_IDENT_ATTR	 0x063		/* PRESENT+RW+DIRTY+ACCESSED */
 #define PGD_IDENT_ATTR	 0x001		/* PRESENT (no other attributes) */
 #endif
 

-- 



* [patch 3/7] x86, cpa: make the kernel physical mapping initialization a two pass sequence
  2008-09-23 21:00 [patch 0/7] x86, cpa: cpa related changes to be inline with TLB Application note - v2 Suresh Siddha
  2008-09-23 21:00 ` [patch 1/7] x86, cpa: rename PTE attribute macros for kernel direct mapping in early boot Suresh Siddha
  2008-09-23 21:00 ` [patch 2/7] x86, cpa: remove USER permission from the very early identity mapping attribute Suresh Siddha
@ 2008-09-23 21:00 ` Suresh Siddha
  2008-10-06 20:48   ` Jeremy Fitzhardinge
  2008-09-23 21:00 ` [patch 4/7] x86, cpa: dont use large pages for kernel identity mapping with DEBUG_PAGEALLOC Suresh Siddha
                   ` (4 subsequent siblings)
  7 siblings, 1 reply; 18+ messages in thread
From: Suresh Siddha @ 2008-09-23 21:00 UTC (permalink / raw)
  To: mingo, hpa, tglx, arjan, venkatesh.pallipadi, jeremy
  Cc: linux-kernel, Suresh Siddha

[-- Attachment #1: fix_large_small_page_identity_mappings.patch --]
[-- Type: text/plain, Size: 8950 bytes --]

In the first pass, the kernel physical mapping is set up using large or
small pages, but with the same PTE attributes as those set up by the early
boot code in head_[32|64].S.

After flushing the TLBs, we go through a second pass, which sets up the
direct-mapped PTEs with the appropriate attributes (like NX, GLOBAL, etc.)
that are runtime detectable.

This two pass mechanism conforms to the TLB app note which says:

"Software should not write to a paging-structure entry in a way that would
 change, for any linear address, both the page size and either the page frame
 or attributes."
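
As an illustration only, a much simplified sketch of the two-pass idea (this
is not the actual init_32.c/init_64.c code; pte_for_pfn() is a hypothetical
helper standing in for the page-table walk):

	static void __init identity_map_two_pass(unsigned long start_pfn,
						 unsigned long end_pfn)
	{
		int pass;

		for (pass = 1; pass <= 2; pass++) {
			/* pass 1: same attributes as the early boot mapping;
			 * pass 2: the final attributes (NX, GLOBAL, ...). */
			pgprot_t prot = (pass == 1) ?
				__pgprot(PTE_IDENT_ATTR) : PAGE_KERNEL;
			unsigned long pfn;

			for (pfn = start_pfn; pfn < end_pfn; pfn++)
				set_pte(pte_for_pfn(pfn), pfn_pte(pfn, prot));

			/*
			 * Flush between the passes so no TLB entry created
			 * with the old size/attributes survives into the
			 * attribute change.
			 */
			__flush_tlb_all();
		}
	}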

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
---

Index: tip/arch/x86/mm/init_32.c
===================================================================
--- tip.orig/arch/x86/mm/init_32.c	2008-09-22 15:59:31.000000000 -0700
+++ tip/arch/x86/mm/init_32.c	2008-09-22 15:59:37.000000000 -0700
@@ -196,11 +196,30 @@
 	pgd_t *pgd;
 	pmd_t *pmd;
 	pte_t *pte;
-	unsigned pages_2m = 0, pages_4k = 0;
+	unsigned pages_2m, pages_4k;
+	int mapping_iter;
+
+	/*
+	 * First iteration will setup identity mapping using large/small pages
+	 * based on use_pse, with other attributes same as set by
+	 * the early code in head_32.S
+	 *
+	 * Second iteration will setup the appropriate attributes (NX, GLOBAL..)
+	 * as desired for the kernel identity mapping.
+	 *
+	 * This two pass mechanism conforms to the TLB app note which says:
+	 *
+	 *     "Software should not write to a paging-structure entry in a way
+	 *      that would change, for any linear address, both the page size
+	 *      and either the page frame or attributes."
+	 */
+	mapping_iter = 1;
 
 	if (!cpu_has_pse)
 		use_pse = 0;
 
+repeat:
+	pages_2m = pages_4k = 0;
 	pfn = start_pfn;
 	pgd_idx = pgd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
 	pgd = pgd_base + pgd_idx;
@@ -226,6 +245,13 @@
 			if (use_pse) {
 				unsigned int addr2;
 				pgprot_t prot = PAGE_KERNEL_LARGE;
+				/*
+				 * first pass will use the same initial
+				 * identity mapping attribute + _PAGE_PSE.
+				 */
+				pgprot_t init_prot =
+					__pgprot(PTE_IDENT_ATTR |
+						 _PAGE_PSE);
 
 				addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE +
 					PAGE_OFFSET + PAGE_SIZE-1;
@@ -235,7 +261,10 @@
 					prot = PAGE_KERNEL_LARGE_EXEC;
 
 				pages_2m++;
-				set_pmd(pmd, pfn_pmd(pfn, prot));
+				if (mapping_iter == 1)
+					set_pmd(pmd, pfn_pmd(pfn, init_prot));
+				else
+					set_pmd(pmd, pfn_pmd(pfn, prot));
 
 				pfn += PTRS_PER_PTE;
 				continue;
@@ -247,17 +276,43 @@
 			for (; pte_ofs < PTRS_PER_PTE && pfn < end_pfn;
 			     pte++, pfn++, pte_ofs++, addr += PAGE_SIZE) {
 				pgprot_t prot = PAGE_KERNEL;
+				/*
+				 * first pass will use the same initial
+				 * identity mapping attribute.
+				 */
+				pgprot_t init_prot = __pgprot(PTE_IDENT_ATTR);
 
 				if (is_kernel_text(addr))
 					prot = PAGE_KERNEL_EXEC;
 
 				pages_4k++;
-				set_pte(pte, pfn_pte(pfn, prot));
+				if (mapping_iter == 1)
+					set_pte(pte, pfn_pte(pfn, init_prot));
+				else
+					set_pte(pte, pfn_pte(pfn, prot));
 			}
 		}
 	}
-	update_page_count(PG_LEVEL_2M, pages_2m);
-	update_page_count(PG_LEVEL_4K, pages_4k);
+	if (mapping_iter == 1) {
+		/*
+		 * update direct mapping page count only in the first
+		 * iteration.
+		 */
+		update_page_count(PG_LEVEL_2M, pages_2m);
+		update_page_count(PG_LEVEL_4K, pages_4k);
+
+		/*
+		 * local global flush tlb, which will flush the previous
+		 * mappings present in both small and large page TLB's.
+		 */
+		__flush_tlb_all();
+
+		/*
+		 * Second iteration will set the actual desired PTE attributes.
+		 */
+		mapping_iter = 2;
+		goto repeat;
+	}
 }
 
 /*
Index: tip/arch/x86/mm/init_64.c
===================================================================
--- tip.orig/arch/x86/mm/init_64.c	2008-09-22 15:59:31.000000000 -0700
+++ tip/arch/x86/mm/init_64.c	2008-09-22 15:59:37.000000000 -0700
@@ -323,6 +323,8 @@
 	early_iounmap(adr, PAGE_SIZE);
 }
 
+static int physical_mapping_iter;
+
 static unsigned long __meminit
 phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end)
 {
@@ -343,16 +345,19 @@
 		}
 
 		if (pte_val(*pte))
-			continue;
+			goto repeat_set_pte;
 
 		if (0)
 			printk("   pte=%p addr=%lx pte=%016lx\n",
 			       pte, addr, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL).pte);
+		pages++;
+repeat_set_pte:
 		set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL));
 		last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE;
-		pages++;
 	}
-	update_page_count(PG_LEVEL_4K, pages);
+
+	if (physical_mapping_iter == 1)
+		update_page_count(PG_LEVEL_4K, pages);
 
 	return last_map_addr;
 }
@@ -371,7 +376,6 @@
 {
 	unsigned long pages = 0;
 	unsigned long last_map_addr = end;
-	unsigned long start = address;
 
 	int i = pmd_index(address);
 
@@ -394,15 +398,14 @@
 				last_map_addr = phys_pte_update(pmd, address,
 								end);
 				spin_unlock(&init_mm.page_table_lock);
+				continue;
 			}
-			/* Count entries we're using from level2_ident_pgt */
-			if (start == 0)
-				pages++;
-			continue;
+			goto repeat_set_pte;
 		}
 
 		if (page_size_mask & (1<<PG_LEVEL_2M)) {
 			pages++;
+repeat_set_pte:
 			spin_lock(&init_mm.page_table_lock);
 			set_pte((pte_t *)pmd,
 				pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
@@ -419,7 +422,8 @@
 		pmd_populate_kernel(&init_mm, pmd, __va(pte_phys));
 		spin_unlock(&init_mm.page_table_lock);
 	}
-	update_page_count(PG_LEVEL_2M, pages);
+	if (physical_mapping_iter == 1)
+		update_page_count(PG_LEVEL_2M, pages);
 	return last_map_addr;
 }
 
@@ -458,14 +462,18 @@
 		}
 
 		if (pud_val(*pud)) {
-			if (!pud_large(*pud))
+			if (!pud_large(*pud)) {
 				last_map_addr = phys_pmd_update(pud, addr, end,
 							 page_size_mask);
-			continue;
+				continue;
+			}
+
+			goto repeat_set_pte;
 		}
 
 		if (page_size_mask & (1<<PG_LEVEL_1G)) {
 			pages++;
+repeat_set_pte:
 			spin_lock(&init_mm.page_table_lock);
 			set_pte((pte_t *)pud,
 				pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
@@ -483,7 +491,9 @@
 		spin_unlock(&init_mm.page_table_lock);
 	}
 	__flush_tlb_all();
-	update_page_count(PG_LEVEL_1G, pages);
+
+	if (physical_mapping_iter == 1)
+		update_page_count(PG_LEVEL_1G, pages);
 
 	return last_map_addr;
 }
@@ -547,15 +557,54 @@
 		direct_gbpages = 0;
 }
 
+static int is_kernel(unsigned long pfn)
+{
+	unsigned long pg_addresss = pfn << PAGE_SHIFT;
+
+	if (pg_addresss >= (unsigned long) __pa(_text) &&
+	    pg_addresss <= (unsigned long) __pa(_end))
+		return 1;
+
+	return 0;
+}
+
 static unsigned long __init kernel_physical_mapping_init(unsigned long start,
 						unsigned long end,
 						unsigned long page_size_mask)
 {
 
-	unsigned long next, last_map_addr = end;
+	unsigned long next, last_map_addr;
+	u64 cached_supported_pte_mask = __supported_pte_mask;
+	unsigned long cache_start = start;
+	unsigned long cache_end = end;
+
+	/*
+	 * First iteration will setup identity mapping using large/small pages
+	 * based on page_size_mask, with other attributes same as set by
+	 * the early code in head_64.S
+	 *
+	 * Second iteration will setup the appropriate attributes
+	 * as desired for the kernel identity mapping.
+	 *
+	 * This two pass mechanism conforms to the TLB app note which says:
+	 *
+	 *     "Software should not write to a paging-structure entry in a way
+	 *      that would change, for any linear address, both the page size
+	 *      and either the page frame or attributes."
+	 *
+	 * For now, only difference between very early PTE attributes used in
+	 * head_64.S and here is _PAGE_NX.
+	 */
+	BUILD_BUG_ON((__PAGE_KERNEL_LARGE & ~__PAGE_KERNEL_IDENT_LARGE_EXEC)
+		     != _PAGE_NX);
+	__supported_pte_mask &= ~(_PAGE_NX);
+	physical_mapping_iter = 1;
 
-	start = (unsigned long)__va(start);
-	end = (unsigned long)__va(end);
+repeat:
+	last_map_addr = cache_end;
+
+	start = (unsigned long)__va(cache_start);
+	end = (unsigned long)__va(cache_end);
 
 	for (; start < end; start = next) {
 		pgd_t *pgd = pgd_offset_k(start);
@@ -567,11 +616,21 @@
 			next = end;
 
 		if (pgd_val(*pgd)) {
+			/*
+			 * Static identity mappings will be overwritten
+			 * with run-time mappings. For example, this allows
+			 * the static 0-1GB identity mapping to be mapped
+			 * non-executable with this.
+			 */
+			if (is_kernel(pte_pfn(*((pte_t *) pgd))))
+				goto realloc;
+
 			last_map_addr = phys_pud_update(pgd, __pa(start),
 						 __pa(end), page_size_mask);
 			continue;
 		}
 
+realloc:
 		pud = alloc_low_page(&pud_phys);
 		last_map_addr = phys_pud_init(pud, __pa(start), __pa(next),
 						 page_size_mask);
@@ -581,6 +640,16 @@
 		pgd_populate(&init_mm, pgd, __va(pud_phys));
 		spin_unlock(&init_mm.page_table_lock);
 	}
+	__flush_tlb_all();
+
+	if (physical_mapping_iter == 1) {
+		physical_mapping_iter = 2;
+		/*
+		 * Second iteration will set the actual desired PTE attributes.
+		 */
+		__supported_pte_mask = cached_supported_pte_mask;
+		goto repeat;
+	}
 
 	return last_map_addr;
 }

-- 



* [patch 4/7] x86, cpa: dont use large pages for kernel identity mapping with DEBUG_PAGEALLOC
  2008-09-23 21:00 [patch 0/7] x86, cpa: cpa related changes to be inline with TLB Application note - v2 Suresh Siddha
                   ` (2 preceding siblings ...)
  2008-09-23 21:00 ` [patch 3/7] x86, cpa: make the kernel physical mapping initialization a two pass sequence Suresh Siddha
@ 2008-09-23 21:00 ` Suresh Siddha
  2008-09-23 21:00 ` [patch 5/7] x86, cpa: no need to check alias for __set_pages_p/__set_pages_np Suresh Siddha
                   ` (3 subsequent siblings)
  7 siblings, 0 replies; 18+ messages in thread
From: Suresh Siddha @ 2008-09-23 21:00 UTC (permalink / raw)
  To: mingo, hpa, tglx, arjan, venkatesh.pallipadi, jeremy
  Cc: linux-kernel, Suresh Siddha

[-- Attachment #1: no_lp_identity_mapping_debug_pgalloc.patch --]
[-- Type: text/plain, Size: 4109 bytes --]

Don't use large pages for the kernel identity mapping with DEBUG_PAGEALLOC.
This removes the need to split a large page for the allocated kernel page
from interrupt context.

This will simplify the cpa code (as we no longer do the split from interrupt
context). The cpa code simplification follows in the subsequent patches.

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
---

Index: tip/arch/x86/mm/init_32.c
===================================================================
--- tip.orig/arch/x86/mm/init_32.c	2008-09-22 15:59:37.000000000 -0700
+++ tip/arch/x86/mm/init_32.c	2008-09-22 15:59:39.000000000 -0700
@@ -775,7 +775,7 @@
 	after_init_bootmem = 1;
 }
 
-static void __init find_early_table_space(unsigned long end)
+static void __init find_early_table_space(unsigned long end, int use_pse)
 {
 	unsigned long puds, pmds, ptes, tables, start;
 
@@ -785,7 +785,7 @@
 	pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
 	tables += PAGE_ALIGN(pmds * sizeof(pmd_t));
 
-	if (cpu_has_pse) {
+	if (use_pse) {
 		unsigned long extra;
 
 		extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
@@ -825,12 +825,22 @@
 	pgd_t *pgd_base = swapper_pg_dir;
 	unsigned long start_pfn, end_pfn;
 	unsigned long big_page_start;
+#ifdef CONFIG_DEBUG_PAGEALLOC
+	/*
+	 * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
+	 * This will simplify cpa(), which otherwise needs to support splitting
+	 * large pages into small in interrupt context, etc.
+	 */
+	int use_pse = 0;
+#else
+	int use_pse = cpu_has_pse;
+#endif
 
 	/*
 	 * Find space for the kernel direct mapping tables.
 	 */
 	if (!after_init_bootmem)
-		find_early_table_space(end);
+		find_early_table_space(end, use_pse);
 
 #ifdef CONFIG_X86_PAE
 	set_nx();
@@ -876,7 +886,7 @@
 	end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
 	if (start_pfn < end_pfn)
 		kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn,
-						cpu_has_pse);
+					     use_pse);
 
 	/* tail is not big page alignment ? */
 	start_pfn = end_pfn;
Index: tip/arch/x86/mm/init_64.c
===================================================================
--- tip.orig/arch/x86/mm/init_64.c	2008-09-22 15:59:37.000000000 -0700
+++ tip/arch/x86/mm/init_64.c	2008-09-22 15:59:39.000000000 -0700
@@ -509,13 +509,14 @@
 	return phys_pud_init(pud, addr, end, page_size_mask);
 }
 
-static void __init find_early_table_space(unsigned long end)
+static void __init find_early_table_space(unsigned long end, int use_pse,
+					  int use_gbpages)
 {
 	unsigned long puds, pmds, ptes, tables, start;
 
 	puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
 	tables = roundup(puds * sizeof(pud_t), PAGE_SIZE);
-	if (direct_gbpages) {
+	if (use_gbpages) {
 		unsigned long extra;
 		extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT);
 		pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT;
@@ -523,7 +524,7 @@
 		pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
 	tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE);
 
-	if (cpu_has_pse) {
+	if (use_pse) {
 		unsigned long extra;
 		extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
 		ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
@@ -693,6 +694,7 @@
 
 	struct map_range mr[NR_RANGE_MR];
 	int nr_range, i;
+	int use_pse, use_gbpages;
 
 	printk(KERN_INFO "init_memory_mapping\n");
 
@@ -706,9 +708,21 @@
 	if (!after_bootmem)
 		init_gbpages();
 
-	if (direct_gbpages)
+#ifdef CONFIG_DEBUG_PAGEALLOC
+	/*
+	 * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
+	 * This will simplify cpa(), which otherwise needs to support splitting
+	 * large pages into small in interrupt context, etc.
+	 */
+	use_pse = use_gbpages = 0;
+#else
+	use_pse = cpu_has_pse;
+	use_gbpages = direct_gbpages;
+#endif
+
+	if (use_gbpages)
 		page_size_mask |= 1 << PG_LEVEL_1G;
-	if (cpu_has_pse)
+	if (use_pse)
 		page_size_mask |= 1 << PG_LEVEL_2M;
 
 	memset(mr, 0, sizeof(mr));
@@ -769,7 +783,7 @@
 			 (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
 
 	if (!after_bootmem)
-		find_early_table_space(end);
+		find_early_table_space(end, use_pse, use_gbpages);
 
 	for (i = 0; i < nr_range; i++)
 		last_map_addr = kernel_physical_mapping_init(

-- 



* [patch 5/7] x86, cpa: no need to check alias for __set_pages_p/__set_pages_np
  2008-09-23 21:00 [patch 0/7] x86, cpa: cpa related changes to be inline with TLB Application note - v2 Suresh Siddha
                   ` (3 preceding siblings ...)
  2008-09-23 21:00 ` [patch 4/7] x86, cpa: dont use large pages for kernel identity mapping with DEBUG_PAGEALLOC Suresh Siddha
@ 2008-09-23 21:00 ` Suresh Siddha
  2008-09-23 21:00 ` [patch 6/7] x86, cpa: remove cpa pool code Suresh Siddha
                   ` (2 subsequent siblings)
  7 siblings, 0 replies; 18+ messages in thread
From: Suresh Siddha @ 2008-09-23 21:00 UTC (permalink / raw)
  To: mingo, hpa, tglx, arjan, venkatesh.pallipadi, jeremy
  Cc: linux-kernel, Suresh Siddha

[-- Attachment #1: no_alias_for_np_p.patch --]
[-- Type: text/plain, Size: 2166 bytes --]

No alias checking is needed when setting the present/not-present mapping.
Otherwise, we may need to break large pages for 64-bit kernel text mappings
(this adds complexity, especially if we want to do it from atomic context,
for example with CONFIG_DEBUG_PAGEALLOC). Let's keep it simple!

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
---

Index: tip/arch/x86/mm/pageattr.c
===================================================================
--- tip.orig/arch/x86/mm/pageattr.c	2008-09-23 13:45:59.000000000 -0700
+++ tip/arch/x86/mm/pageattr.c	2008-09-23 13:47:45.000000000 -0700
@@ -1121,7 +1121,13 @@
 				.mask_clr = __pgprot(0),
 				.flags = 0};
 
-	return __change_page_attr_set_clr(&cpa, 1);
+	/*
+	 * No alias checking needed for setting present flag. otherwise,
+	 * we may need to break large pages for 64-bit kernel text
+	 * mappings (this adds to complexity if we want to do this from
+	 * atomic context especially). Let's keep it simple!
+	 */
+	return __change_page_attr_set_clr(&cpa, 0);
 }
 
 static int __set_pages_np(struct page *page, int numpages)
@@ -1133,7 +1139,13 @@
 				.mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW),
 				.flags = 0};
 
-	return __change_page_attr_set_clr(&cpa, 1);
+	/*
+	 * No alias checking needed for setting not present flag. otherwise,
+	 * we may need to break large pages for 64-bit kernel text
+	 * mappings (this adds to complexity if we want to do this from
+	 * atomic context especially). Let's keep it simple!
+	 */
+	return __change_page_attr_set_clr(&cpa, 0);
 }
 
 void kernel_map_pages(struct page *page, int numpages, int enable)
@@ -1153,11 +1165,8 @@
 
 	/*
 	 * The return value is ignored as the calls cannot fail.
-	 * Large pages are kept enabled at boot time, and are
-	 * split up quickly with DEBUG_PAGEALLOC. If a splitup
-	 * fails here (due to temporary memory shortage) no damage
-	 * is done because we just keep the largepage intact up
-	 * to the next attempt when it will likely be split up:
+	 * Large pages for identity mappings are not used at boot time
+	 * and hence no memory allocations during large page split.
 	 */
 	if (enable)
 		__set_pages_p(page, numpages);

-- 



* [patch 6/7] x86, cpa: remove cpa pool code
  2008-09-23 21:00 [patch 0/7] x86, cpa: cpa related changes to be inline with TLB Application note - v2 Suresh Siddha
                   ` (4 preceding siblings ...)
  2008-09-23 21:00 ` [patch 5/7] x86, cpa: no need to check alias for __set_pages_p/__set_pages_np Suresh Siddha
@ 2008-09-23 21:00 ` Suresh Siddha
  2008-09-23 21:00 ` [patch 7/7] x86, cpa: srlz cpa(), global flush tlb after splitting big page and before doing cpa Suresh Siddha
  2008-09-24  8:15 ` [patch 0/7] x86, cpa: cpa related changes to be inline with TLB Application note - v2 Ingo Molnar
  7 siblings, 0 replies; 18+ messages in thread
From: Suresh Siddha @ 2008-09-23 21:00 UTC (permalink / raw)
  To: mingo, hpa, tglx, arjan, venkatesh.pallipadi, jeremy
  Cc: linux-kernel, Suresh Siddha

[-- Attachment #1: cleanup_cpa_pool.patch --]
[-- Type: text/plain, Size: 6107 bytes --]

Interrupt context no longer splits large pages in cpa(), so we can do away
with the cpa memory pool code.

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
---

Index: tip/arch/x86/mm/pageattr.c
===================================================================
--- tip.orig/arch/x86/mm/pageattr.c	2008-09-23 13:47:45.000000000 -0700
+++ tip/arch/x86/mm/pageattr.c	2008-09-23 13:47:51.000000000 -0700
@@ -447,114 +447,17 @@
 	return do_split;
 }
 
-static LIST_HEAD(page_pool);
-static unsigned long pool_size, pool_pages, pool_low;
-static unsigned long pool_used, pool_failed;
-
-static void cpa_fill_pool(struct page **ret)
-{
-	gfp_t gfp = GFP_KERNEL;
-	unsigned long flags;
-	struct page *p;
-
-	/*
-	 * Avoid recursion (on debug-pagealloc) and also signal
-	 * our priority to get to these pagetables:
-	 */
-	if (current->flags & PF_MEMALLOC)
-		return;
-	current->flags |= PF_MEMALLOC;
-
-	/*
-	 * Allocate atomically from atomic contexts:
-	 */
-	if (in_atomic() || irqs_disabled() || debug_pagealloc)
-		gfp =  GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN;
-
-	while (pool_pages < pool_size || (ret && !*ret)) {
-		p = alloc_pages(gfp, 0);
-		if (!p) {
-			pool_failed++;
-			break;
-		}
-		/*
-		 * If the call site needs a page right now, provide it:
-		 */
-		if (ret && !*ret) {
-			*ret = p;
-			continue;
-		}
-		spin_lock_irqsave(&pgd_lock, flags);
-		list_add(&p->lru, &page_pool);
-		pool_pages++;
-		spin_unlock_irqrestore(&pgd_lock, flags);
-	}
-
-	current->flags &= ~PF_MEMALLOC;
-}
-
-#define SHIFT_MB		(20 - PAGE_SHIFT)
-#define ROUND_MB_GB		((1 << 10) - 1)
-#define SHIFT_MB_GB		10
-#define POOL_PAGES_PER_GB	16
-
-void __init cpa_init(void)
-{
-	struct sysinfo si;
-	unsigned long gb;
-
-	si_meminfo(&si);
-	/*
-	 * Calculate the number of pool pages:
-	 *
-	 * Convert totalram (nr of pages) to MiB and round to the next
-	 * GiB. Shift MiB to Gib and multiply the result by
-	 * POOL_PAGES_PER_GB:
-	 */
-	if (debug_pagealloc) {
-		gb = ((si.totalram >> SHIFT_MB) + ROUND_MB_GB) >> SHIFT_MB_GB;
-		pool_size = POOL_PAGES_PER_GB * gb;
-	} else {
-		pool_size = 1;
-	}
-	pool_low = pool_size;
-
-	cpa_fill_pool(NULL);
-	printk(KERN_DEBUG
-	       "CPA: page pool initialized %lu of %lu pages preallocated\n",
-	       pool_pages, pool_size);
-}
-
 static int split_large_page(pte_t *kpte, unsigned long address)
 {
 	unsigned long flags, pfn, pfninc = 1;
 	unsigned int i, level;
 	pte_t *pbase, *tmp;
 	pgprot_t ref_prot;
-	struct page *base;
+	struct page *base = alloc_pages(GFP_KERNEL, 0);
+	if (!base)
+		return -ENOMEM;
 
-	/*
-	 * Get a page from the pool. The pool list is protected by the
-	 * pgd_lock, which we have to take anyway for the split
-	 * operation:
-	 */
 	spin_lock_irqsave(&pgd_lock, flags);
-	if (list_empty(&page_pool)) {
-		spin_unlock_irqrestore(&pgd_lock, flags);
-		base = NULL;
-		cpa_fill_pool(&base);
-		if (!base)
-			return -ENOMEM;
-		spin_lock_irqsave(&pgd_lock, flags);
-	} else {
-		base = list_first_entry(&page_pool, struct page, lru);
-		list_del(&base->lru);
-		pool_pages--;
-
-		if (pool_pages < pool_low)
-			pool_low = pool_pages;
-	}
-
 	/*
 	 * Check for races, another CPU might have split this page
 	 * up for us already:
@@ -611,11 +514,8 @@
 	 * If we dropped out via the lookup_address check under
 	 * pgd_lock then stick the page back into the pool:
 	 */
-	if (base) {
-		list_add(&base->lru, &page_pool);
-		pool_pages++;
-	} else
-		pool_used++;
+	if (base)
+		__free_page(base);
 	spin_unlock_irqrestore(&pgd_lock, flags);
 
 	return 0;
@@ -899,8 +799,6 @@
 		cpa_flush_all(cache);
 
 out:
-	cpa_fill_pool(NULL);
-
 	return ret;
 }
 
@@ -1178,53 +1076,8 @@
 	 * but that can deadlock->flush only current cpu:
 	 */
 	__flush_tlb_all();
-
-	/*
-	 * Try to refill the page pool here. We can do this only after
-	 * the tlb flush.
-	 */
-	cpa_fill_pool(NULL);
 }
 
-#ifdef CONFIG_DEBUG_FS
-static int dpa_show(struct seq_file *m, void *v)
-{
-	seq_puts(m, "DEBUG_PAGEALLOC\n");
-	seq_printf(m, "pool_size     : %lu\n", pool_size);
-	seq_printf(m, "pool_pages    : %lu\n", pool_pages);
-	seq_printf(m, "pool_low      : %lu\n", pool_low);
-	seq_printf(m, "pool_used     : %lu\n", pool_used);
-	seq_printf(m, "pool_failed   : %lu\n", pool_failed);
-
-	return 0;
-}
-
-static int dpa_open(struct inode *inode, struct file *filp)
-{
-	return single_open(filp, dpa_show, NULL);
-}
-
-static const struct file_operations dpa_fops = {
-	.open		= dpa_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
-static int __init debug_pagealloc_proc_init(void)
-{
-	struct dentry *de;
-
-	de = debugfs_create_file("debug_pagealloc", 0600, NULL, NULL,
-				 &dpa_fops);
-	if (!de)
-		return -ENOMEM;
-
-	return 0;
-}
-__initcall(debug_pagealloc_proc_init);
-#endif
-
 #ifdef CONFIG_HIBERNATION
 
 bool kernel_page_present(struct page *page)
Index: tip/arch/x86/mm/init_64.c
===================================================================
--- tip.orig/arch/x86/mm/init_64.c	2008-09-23 13:46:14.000000000 -0700
+++ tip/arch/x86/mm/init_64.c	2008-09-23 13:47:51.000000000 -0700
@@ -944,8 +944,6 @@
 		reservedpages << (PAGE_SHIFT-10),
 		datasize >> 10,
 		initsize >> 10);
-
-	cpa_init();
 }
 
 void free_init_pages(char *what, unsigned long begin, unsigned long end)
Index: tip/arch/x86/mm/init_32.c
===================================================================
--- tip.orig/arch/x86/mm/init_32.c	2008-09-23 13:46:14.000000000 -0700
+++ tip/arch/x86/mm/init_32.c	2008-09-23 13:47:51.000000000 -0700
@@ -1053,7 +1053,6 @@
 	if (boot_cpu_data.wp_works_ok < 0)
 		test_wp_bit();
 
-	cpa_init();
 	save_pg_dir();
 	zap_low_mappings();
 }
Index: tip/include/asm-x86/cacheflush.h
===================================================================
--- tip.orig/include/asm-x86/cacheflush.h	2008-09-23 13:45:48.000000000 -0700
+++ tip/include/asm-x86/cacheflush.h	2008-09-23 13:47:51.000000000 -0700
@@ -99,8 +99,6 @@
 
 void clflush_cache_range(void *addr, unsigned int size);
 
-void cpa_init(void);
-
 #ifdef CONFIG_DEBUG_RODATA
 void mark_rodata_ro(void);
 extern const int rodata_test_data;

-- 



* [patch 7/7] x86, cpa: srlz cpa(), global flush tlb after splitting big page and before doing cpa
  2008-09-23 21:00 [patch 0/7] x86, cpa: cpa related changes to be inline with TLB Application note - v2 Suresh Siddha
                   ` (5 preceding siblings ...)
  2008-09-23 21:00 ` [patch 6/7] x86, cpa: remove cpa pool code Suresh Siddha
@ 2008-09-23 21:00 ` Suresh Siddha
  2008-09-24  8:15 ` [patch 0/7] x86, cpa: cpa related changes to be inline with TLB Application note - v2 Ingo Molnar
  7 siblings, 0 replies; 18+ messages in thread
From: Suresh Siddha @ 2008-09-23 21:00 UTC (permalink / raw)
  To: mingo, hpa, tglx, arjan, venkatesh.pallipadi, jeremy
  Cc: linux-kernel, Suresh Siddha

[-- Attachment #1: introduce_cpa_lock.patch --]
[-- Type: text/plain, Size: 3246 bytes --]

Do a global TLB flush after splitting the large page and before we do the
actual page attribute change in the PTE.

Without this, we violate the TLB application note, which says
    "The TLBs may contain both ordinary and large-page translations for
     a 4-KByte range of linear addresses. This may occur if software
     modifies the paging structures so that the page size used for the
     address range changes. If the two translations differ with respect
     to page frame or attributes (e.g., permissions), processor behavior
     is undefined and may be implementation-specific."

Also serialize cpa() (for !DEBUG_PAGEALLOC, which uses large identity
mappings) using cpa_lock, so that we don't allow any other cpu with stale
large-page TLB entries to change page attributes in parallel with another
cpu splitting a large page entry and changing the attributes.
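
The resulting ordering, as a simplified sketch (the real logic lives in
__change_page_attr()/split_large_page() in the diff below; new_pte stands in
for the PTE carrying the requested attributes):

	if (!debug_pagealloc)
		spin_lock(&cpa_lock);		/* serialize cpa() users */
	split_large_page(kpte, address);	/* new 4K PTEs inherit the large-page attributes */
	flush_tlb_all();			/* no stale large-page TLB entries remain */
	set_pte_atomic(kpte, new_pte);		/* only now change the attributes */
	if (!debug_pagealloc)
		spin_unlock(&cpa_lock);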

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
---

Index: tip/arch/x86/mm/pageattr.c
===================================================================
--- tip.orig/arch/x86/mm/pageattr.c	2008-09-23 13:47:51.000000000 -0700
+++ tip/arch/x86/mm/pageattr.c	2008-09-23 13:47:54.000000000 -0700
@@ -35,6 +35,14 @@
 	int		curpage;
 };
 
+/*
+ * Serialize cpa() (for !DEBUG_PAGEALLOC which uses large identity mappings)
+ * using cpa_lock. So that we don't allow any other cpu, with stale large tlb
+ * entries change the page attribute in parallel to some other cpu
+ * splitting a large page entry along with changing the attribute.
+ */
+static DEFINE_SPINLOCK(cpa_lock);
+
 #define CPA_FLUSHTLB 1
 #define CPA_ARRAY 2
 
@@ -453,7 +461,13 @@
 	unsigned int i, level;
 	pte_t *pbase, *tmp;
 	pgprot_t ref_prot;
-	struct page *base = alloc_pages(GFP_KERNEL, 0);
+	struct page *base;
+
+	if (!debug_pagealloc)
+		spin_unlock(&cpa_lock);
+	base = alloc_pages(GFP_KERNEL, 0);
+	if (!debug_pagealloc)
+		spin_lock(&cpa_lock);
 	if (!base)
 		return -ENOMEM;
 
@@ -594,7 +608,25 @@
 	 */
 	err = split_large_page(kpte, address);
 	if (!err) {
-		cpa->flags |= CPA_FLUSHTLB;
+		/*
+	 	 * Do a global flush tlb after splitting the large page
+	 	 * and before we do the actual change page attribute in the PTE.
+	 	 *
+	 	 * With out this, we violate the TLB application note, that says
+	 	 * "The TLBs may contain both ordinary and large-page
+		 *  translations for a 4-KByte range of linear addresses. This
+		 *  may occur if software modifies the paging structures so that
+		 *  the page size used for the address range changes. If the two
+		 *  translations differ with respect to page frame or attributes
+		 *  (e.g., permissions), processor behavior is undefined and may
+		 *  be implementation-specific."
+	 	 *
+	 	 * We do this global tlb flush inside the cpa_lock, so that we
+		 * don't allow any other cpu, with stale tlb entries change the
+		 * page attribute in parallel, that also falls into the
+		 * just split large page entry.
+	 	 */
+		flush_tlb_all();
 		goto repeat;
 	}
 
@@ -686,7 +718,11 @@
 		if (cpa->flags & CPA_ARRAY)
 			cpa->numpages = 1;
 
+		if (!debug_pagealloc)
+			spin_lock(&cpa_lock);
 		ret = __change_page_attr(cpa, checkalias);
+		if (!debug_pagealloc)
+			spin_unlock(&cpa_lock);
 		if (ret)
 			return ret;
 

-- 



* Re: [patch 0/7] x86, cpa: cpa related changes to be inline with TLB Application note - v2
  2008-09-23 21:00 [patch 0/7] x86, cpa: cpa related changes to be inline with TLB Application note - v2 Suresh Siddha
                   ` (6 preceding siblings ...)
  2008-09-23 21:00 ` [patch 7/7] x86, cpa: srlz cpa(), global flush tlb after splitting big page and before doing cpa Suresh Siddha
@ 2008-09-24  8:15 ` Ingo Molnar
  7 siblings, 0 replies; 18+ messages in thread
From: Ingo Molnar @ 2008-09-24  8:15 UTC (permalink / raw)
  To: Suresh Siddha; +Cc: hpa, tglx, arjan, venkatesh.pallipadi, jeremy, linux-kernel


* Suresh Siddha <suresh.b.siddha@intel.com> wrote:

> v2:
>  1. Fix the deadlock associated with smp call function and spin_lock_irqsave()
>     for pgd_lock.
>  2. No alias checks for __set_pages_np()/__set_pages_p() avoiding the large
>     text page split from atomic context in CONFIG_DEBUG_PAGEALLOC.
>  3. Serialize cpa() for !CONFIG_DEBUG_PAGEALLOC, so that a cpu with stale
>     large page tlb's(but small PTE's in memory) won't do a cpa() in parallel
>     to some other cpu splitting large pages and changing page attribute for
>     a small page.

ok, looks good and the code got simpler as well. I've started testing it 
in tip/master and will push out these updates to tip/x86/pat if they 
pass. (v1 had stability problems)

Thanks,

	Ingo


* Re: [patch 3/7] x86, cpa: make the kernel physical mapping initialization a two pass sequence
  2008-09-23 21:00 ` [patch 3/7] x86, cpa: make the kernel physical mapping initialization a two pass sequence Suresh Siddha
@ 2008-10-06 20:48   ` Jeremy Fitzhardinge
  2008-10-06 23:09     ` Jeremy Fitzhardinge
  2008-10-07  1:58     ` Suresh Siddha
  0 siblings, 2 replies; 18+ messages in thread
From: Jeremy Fitzhardinge @ 2008-10-06 20:48 UTC (permalink / raw)
  To: Suresh Siddha; +Cc: mingo, hpa, tglx, arjan, venkatesh.pallipadi, linux-kernel

Suresh Siddha wrote:
> In the first pass, kernel physical mapping will be setup using large or
> small pages but uses the same PTE attributes as that of the early
> PTE attributes setup by early boot code in head_[32|64].S
>
> After flushing TLB's, we go through the second pass, which setups the
> direct mapped PTE's with the appropriate attributes (like NX, GLOBAL etc)
> which are runtime detectable.
>
> This two pass mechanism conforms to the TLB app note which says:
>
> "Software should not write to a paging-structure entry in a way that would
>  change, for any linear address, both the page size and either the page frame
>  or attributes."
>   

I'd noticed that current tip/master hasn't been booting under Xen, and I 
just got around to bisecting it down to this change.

This patch is causing Xen to fail various pagetable updates because it 
ends up remapping pagetables to RW, which Xen explicitly prohibits (as 
that would allow guests to make arbitrary changes to pagetables, rather 
than have them mediated by the hypervisor).

A few things strike me about this patch:

   1. It's high time we unified the physical memory mapping code, and it
      would have been better to do so before making a change of this
      kind to the code.
   2. The existing code already avoided overwriting a pagetable entry
      unless the page size changed.  Wouldn't it be easier to construct
      the mappings first, using the old code, then do a CPA call to set
      the NX bit appropriately?
   3. The actual implementation is pretty ugly; adding a global variable
      and hopping about with goto does not improve this code.

What are the downsides of not following the TLB app note's advice?  Does 
it cause real failures?  Could we revert this patch and address the 
problem some other way?  Which app note is this, BTW?  The one I have on 
hand, "TLBs, Paging-Structure Caches, and Their Invalidation", Apr 2007, 
does not seem to mention this restriction.

As it is, I suspect it will take a non-trivial amount of work to restore 
Xen with this code in place (touching this code is always non-trivial).  
I haven't looked into it in depth yet, but there are a few standout "bad 
for Xen" pieces of code here.  (And I haven't tested 32-bit yet.)

Quick rules for keeping Xen happy here:

   1. Xen provides its own initial pagetable; the head_64.S one is
      unused when booting under Xen.
   2. Xen requires that any pagetable page must always be mapped RO, so
      we're careful to not replace an existing mapping with a new one,
      in case the existing mapping is a pagetable one.
   3. Xen never uses large pages, and the hypervisor will fail any
      attempt to do so.


> Index: tip/arch/x86/mm/init_64.c
> ===================================================================
> --- tip.orig/arch/x86/mm/init_64.c	2008-09-22 15:59:31.000000000 -0700
> +++ tip/arch/x86/mm/init_64.c	2008-09-22 15:59:37.000000000 -0700
> @@ -323,6 +323,8 @@
>  	early_iounmap(adr, PAGE_SIZE);
>  }
>  
> +static int physical_mapping_iter;
> +
>  static unsigned long __meminit
>  phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end)
>  {
> @@ -343,16 +345,19 @@
>  		}
>  
>  		if (pte_val(*pte))
> -			continue;
> +			goto repeat_set_pte;
>   

This looks troublesome.  The code was explicitly avoiding resetting a 
pte which had already been set.  This change will make it overwrite the 
mapping with PAGE_KERNEL, which will break Xen if the mapping was 
previously RO.
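
Concretely (a simplified side-by-side of this hunk, not exact code):

	/* Before the patch: an already-present entry, which may be a
	 * read-only pagetable mapping under Xen, is left untouched. */
	if (pte_val(*pte))
		continue;
	set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL));

	/* After the patch: a present entry jumps to repeat_set_pte, so on
	 * the second pass the very same entry is rewritten as RW
	 * PAGE_KERNEL, which Xen rejects for pagetable pages. */
	if (pte_val(*pte))
		goto repeat_set_pte;
	pages++;
repeat_set_pte:
	set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL));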

>  
>  		if (0)
>  			printk("   pte=%p addr=%lx pte=%016lx\n",
>  			       pte, addr, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL).pte);
> +		pages++;
> +repeat_set_pte:
>  		set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL));
>  		last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE;
> -		pages++;
>  	}
> -	update_page_count(PG_LEVEL_4K, pages);
> +
> +	if (physical_mapping_iter == 1)
> +		update_page_count(PG_LEVEL_4K, pages);
>  
>  	return last_map_addr;
>  }
> @@ -371,7 +376,6 @@
>  {
>  	unsigned long pages = 0;
>  	unsigned long last_map_addr = end;
> -	unsigned long start = address;
>  
>  	int i = pmd_index(address);
>  
> @@ -394,15 +398,14 @@
>  				last_map_addr = phys_pte_update(pmd, address,
>  								end);
>  				spin_unlock(&init_mm.page_table_lock);
> +				continue;
>  			}
> -			/* Count entries we're using from level2_ident_pgt */
> -			if (start == 0)
> -				pages++;
> -			continue;
> +			goto repeat_set_pte;
>  		}
>  
>  		if (page_size_mask & (1<<PG_LEVEL_2M)) {
>  			pages++;
> +repeat_set_pte:
>  			spin_lock(&init_mm.page_table_lock);
>  			set_pte((pte_t *)pmd,
>  				pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
> @@ -419,7 +422,8 @@
>  		pmd_populate_kernel(&init_mm, pmd, __va(pte_phys));
>  		spin_unlock(&init_mm.page_table_lock);
>  	}
> -	update_page_count(PG_LEVEL_2M, pages);
> +	if (physical_mapping_iter == 1)
> +		update_page_count(PG_LEVEL_2M, pages);
>  	return last_map_addr;
>  }
>  
> @@ -458,14 +462,18 @@
>  		}
>  
>  		if (pud_val(*pud)) {
> -			if (!pud_large(*pud))
> +			if (!pud_large(*pud)) {
>  				last_map_addr = phys_pmd_update(pud, addr, end,
>  							 page_size_mask);
> -			continue;
> +				continue;
> +			}
> +
> +			goto repeat_set_pte;
>  		}
>  
>  		if (page_size_mask & (1<<PG_LEVEL_1G)) {
>  			pages++;
> +repeat_set_pte:
>  			spin_lock(&init_mm.page_table_lock);
>  			set_pte((pte_t *)pud,
>  				pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
> @@ -483,7 +491,9 @@
>  		spin_unlock(&init_mm.page_table_lock);
>  	}
>  	__flush_tlb_all();
> -	update_page_count(PG_LEVEL_1G, pages);
> +
> +	if (physical_mapping_iter == 1)
> +		update_page_count(PG_LEVEL_1G, pages);
>  
>  	return last_map_addr;
>  }
> @@ -547,15 +557,54 @@
>  		direct_gbpages = 0;
>  }
>  
> +static int is_kernel(unsigned long pfn)
> +{
> +	unsigned long pg_addresss = pfn << PAGE_SHIFT;
> +
> +	if (pg_addresss >= (unsigned long) __pa(_text) &&
> +	    pg_addresss <= (unsigned long) __pa(_end))
> +		return 1;
> +
> +	return 0;
> +}
> +
>  static unsigned long __init kernel_physical_mapping_init(unsigned long start,
>  						unsigned long end,
>  						unsigned long page_size_mask)
>  {
>  
> -	unsigned long next, last_map_addr = end;
> +	unsigned long next, last_map_addr;
> +	u64 cached_supported_pte_mask = __supported_pte_mask;
> +	unsigned long cache_start = start;
> +	unsigned long cache_end = end;
> +
> +	/*
> +	 * First iteration will setup identity mapping using large/small pages
> +	 * based on page_size_mask, with other attributes same as set by
> +	 * the early code in head_64.S
>   

We can't assume here that the pagetables we're modifying are necessarily 
the head_64.S ones.

> +	 *
> +	 * Second iteration will setup the appropriate attributes
> +	 * as desired for the kernel identity mapping.
> +	 *
> +	 * This two pass mechanism conforms to the TLB app note which says:
> +	 *
> +	 *     "Software should not write to a paging-structure entry in a way
> +	 *      that would change, for any linear address, both the page size
> +	 *      and either the page frame or attributes."
> +	 *
> +	 * For now, only difference between very early PTE attributes used in
> +	 * head_64.S and here is _PAGE_NX.
> +	 */
> +	BUILD_BUG_ON((__PAGE_KERNEL_LARGE & ~__PAGE_KERNEL_IDENT_LARGE_EXEC)
> +		     != _PAGE_NX);
> +	__supported_pte_mask &= ~(_PAGE_NX);
> +	physical_mapping_iter = 1;
>  
> -	start = (unsigned long)__va(start);
> -	end = (unsigned long)__va(end);
> +repeat:
> +	last_map_addr = cache_end;
> +
> +	start = (unsigned long)__va(cache_start);
> +	end = (unsigned long)__va(cache_end);
>  
>  	for (; start < end; start = next) {
>  		pgd_t *pgd = pgd_offset_k(start);
> @@ -567,11 +616,21 @@
>  			next = end;
>  
>  		if (pgd_val(*pgd)) {
> +			/*
> +			 * Static identity mappings will be overwritten
> +			 * with run-time mappings. For example, this allows
> +			 * the static 0-1GB identity mapping to be mapped
> +			 * non-executable with this.
> +			 */
> +			if (is_kernel(pte_pfn(*((pte_t *) pgd))))
> +				goto realloc;
>   

This is definitely a Xen-breaker, but removing this is not sufficient on 
its own.  Is this actually related to the rest of the patch, or a 
gratuitous throw-in change?

> +
>  			last_map_addr = phys_pud_update(pgd, __pa(start),
>  						 __pa(end), page_size_mask);
>  			continue;
>  		}
>  
> +realloc:
>  		pud = alloc_low_page(&pud_phys);
>  		last_map_addr = phys_pud_init(pud, __pa(start), __pa(next),
>  						 page_size_mask);
> @@ -581,6 +640,16 @@
>  		pgd_populate(&init_mm, pgd, __va(pud_phys));
>  		spin_unlock(&init_mm.page_table_lock);
>  	}
> +	__flush_tlb_all();
> +
> +	if (physical_mapping_iter == 1) {
> +		physical_mapping_iter = 2;
> +		/*
> +		 * Second iteration will set the actual desired PTE attributes.
> +		 */
> +		__supported_pte_mask = cached_supported_pte_mask;
> +		goto repeat;
> +	}
>  
>  	return last_map_addr;
>  }
>
>   

Thanks,
    J


* Re: [patch 3/7] x86, cpa: make the kernel physical mapping initialization a two pass sequence
  2008-10-06 20:48   ` Jeremy Fitzhardinge
@ 2008-10-06 23:09     ` Jeremy Fitzhardinge
  2008-10-07  1:58     ` Suresh Siddha
  1 sibling, 0 replies; 18+ messages in thread
From: Jeremy Fitzhardinge @ 2008-10-06 23:09 UTC (permalink / raw)
  To: Suresh Siddha; +Cc: mingo, hpa, tglx, arjan, venkatesh.pallipadi, linux-kernel

Jeremy Fitzhardinge wrote:
> As it is, I suspect it will take a non-trivial amount of work to 
> restore Xen with this code in place (touching this code is always 
> non-trivial).  I haven't looked into it in depth yet, but there's a 
> few stand out "bad for Xen" pieces of code here.  (And I haven't 
> tested 32-bit yet.)

32-bit Xen is OK with this patch.  Reverting it restores 64-bit Xen.

    J


* Re: [patch 3/7] x86, cpa: make the kernel physical mapping initialization a two pass sequence
  2008-10-06 20:48   ` Jeremy Fitzhardinge
  2008-10-06 23:09     ` Jeremy Fitzhardinge
@ 2008-10-07  1:58     ` Suresh Siddha
  2008-10-07 15:28       ` Jeremy Fitzhardinge
  1 sibling, 1 reply; 18+ messages in thread
From: Suresh Siddha @ 2008-10-07  1:58 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: Siddha, Suresh B, mingo, hpa, tglx, arjan, Pallipadi, Venkatesh,
	linux-kernel

On Mon, Oct 06, 2008 at 01:48:13PM -0700, Jeremy Fitzhardinge wrote:
> Suresh Siddha wrote:
> > In the first pass, kernel physical mapping will be setup using large or
> > small pages but uses the same PTE attributes as that of the early
> > PTE attributes setup by early boot code in head_[32|64].S
> >
> > After flushing TLB's, we go through the second pass, which setups the
> > direct mapped PTE's with the appropriate attributes (like NX, GLOBAL etc)
> > which are runtime detectable.
> >
> > This two pass mechanism conforms to the TLB app note which says:
> >
> > "Software should not write to a paging-structure entry in a way that would
> >  change, for any linear address, both the page size and either the page frame
> >  or attributes."
> >
> 
> I'd noticed that current tip/master hasn't been booting under Xen, and I
> just got around to bisecting it down to this change.
> 
> This patch is causing Xen to fail various pagetable updates because it
> ends up remapping pagetables to RW, which Xen explicitly prohibits (as
> that would allow guests to make arbitrary changes to pagetables, rather
> than have them mediated by the hypervisor).

Jeremy, hi. This dependency is not documented or explicitly called out anywhere
in the mm/init_64.c code. I would have expected to see a big comment near this
kind of code :(

	if (pte_val(*pte))
		continue;

> A few things strike me about this patch:
> 
>    1. It's high time we unified the physical memory mapping code, and it
>       would have been better to do so before making a change of this
>       kind to the code.
>    2. The existing code already avoided overwriting a pagetable entry
>       unless the page size changed. Wouldn't it be easier to construct
>       the mappings first, using the old code, then do a CPA call to set
>       the NX bit appropriately?

It is not just the NX bit that we change. For DEBUG_PAGEALLOC, we want to
use 4k pages instead of large page mappings for the identity mapping
(as this cleans up some of the cpa pool code, avoiding the cpa and hence
the page allocations for splitting the big pages from interrupt context).
In this case we will split the static large page mappings.

>    3. The actual implementation is pretty ugly; adding a global variable
>       and hopping about with goto does not improve this code.

This is very early init code, and I can't do anything fancy like calling
cpa(), which needs the mm to be up and running. Also, cpa() calls on
individual chunks of the entire identity mapping will make the boot slow.

Now that I am aware of this Xen failure, I will try to clean this up in a
better fashion.

> What are the downsides of not following the TLB app note's advice?  Does

The app note says that cpu behavior is undefined. We will probably see more
issues with attribute changes like UC/WB etc.; as far as the other attributes
are concerned, we are being paranoid and wanted to fix all the violations
while we are at it.

> it cause real failures?  Could we revert this patch and address the
> problem some other way?  Which app note is this, BTW?  The one I have on
> hand, "TLBs, Paging-Structure Caches, and Their Invalidation", Apr 2007,
> does not seem to mention this restriction.

http://developer.intel.com/design/processor/applnots/317080.pdf
Section 6 page 26

> As it is, I suspect it will take a non-trivial amount of work to restore

I didn't get much time today to think about this. Let me think a bit
more tonight and I will get back to you tomorrow with a patch fixing
this, or a request to Ingo to revert it (if we revert, we have to revert
the whole patchset; otherwise we will break DEBUG_PAGEALLOC etc.).

> Xen with this code in place (touching this code is always non-trivial).
> I haven't looked into it in depth yet, but there's a few stand out "bad
> for Xen" pieces of code here.  (And I haven't tested 32-bit yet.)
> 
> Quick rules for keeping Xen happy here:
> 
>    1. Xen provides its own initial pagetable; the head_64.S one is
>       unused when booting under Xen.
>    2. Xen requires that any pagetable page must always be mapped RO, so
>       we're careful to not replace an existing mapping with a new one,
>       in case the existing mapping is a pagetable one.
>    3. Xen never uses large pages, and the hypervisor will fail any
>       attempt to do so.

Thanks for this info. Will get back to you tomorrow.

thanks,
suresh


* Re: [patch 3/7] x86, cpa: make the kernel physical mapping initialization a two pass sequence
  2008-10-07  1:58     ` Suresh Siddha
@ 2008-10-07 15:28       ` Jeremy Fitzhardinge
  2008-10-07 20:58         ` Suresh Siddha
  0 siblings, 1 reply; 18+ messages in thread
From: Jeremy Fitzhardinge @ 2008-10-07 15:28 UTC (permalink / raw)
  To: Suresh Siddha; +Cc: mingo, hpa, tglx, arjan, Pallipadi, Venkatesh, linux-kernel

Suresh Siddha wrote:
> Jeremy, hi. This dependency is not documented or explicitly called anywhere
> in the mm/init_64.c code. I would have expected to see a big comment near this
> kind of code :(
>   

Indeed yes.  I've explained it in various places, including commit 
comments, but there should be a comment right there in the code.

> It is not just the NX bit that we change. For DEBUG_PAGEALLOC, we want to
> use 4k pages instead of large page mappings for the kernel identity mapping
> (this lets us clean up some of the cpa pool code, by avoiding cpa, and hence
> the page allocations for splitting big pages, from interrupt context).
> In that case we will split the static large page mappings.
>   

Well, that's OK.  We just need to preserve the original page permissions 
when fragmenting the large mappings.  (This isn't a case that affects 
Xen, because it will already be 4k mappings.)
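
Something along these lines (just a sketch using the variables of
phys_pmd_init()/phys_pte_init(), not the exact hunk): lift the protection
bits off the existing large entry and reuse them for the new 4k PTEs, so
only the page size changes between the old and the new translation:

	/*
	 * Splitting an existing 2M mapping: keep the old protection bits
	 * (minus _PAGE_PSE) so the 4k PTEs differ from the large entry
	 * only in page size, never in attributes or page frame.
	 */
	pgprot_t prot = pte_pgprot(pte_clrhuge(*(pte_t *)pmd));

	for (; addr < end; addr += PAGE_SIZE, pte++)
		set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, prot));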

>>    3. The actual implementation is pretty ugly; adding a global variable
>>       and hopping about with goto does not improve this code.
>>     
>
> This is very early init code and I can't do anything fancy like calling
> cpa(), which needs the mm to be up and running.

Well, is there any urgency to set NX that early?  It might catch some 
early bugs, but there's no urgent need.
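
For illustration only (the helper is the existing CPA interface, but the
range and call site here are made up): the mapping could be built with the
early, executable attributes, and the execute permission stripped later
through the normal CPA path once the mm is up:

	/*
	 * Example: make the direct mapping non-executable after the
	 * pagetables are in place, via the existing CPA helper.
	 * The range below is illustrative only.
	 */
	set_memory_nx((unsigned long)__va(0), max_pfn_mapped);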

> Also, doing cpa() on individual chunks
> for the entire identity mapping would make boot slow.
>   

Really?  Why?  How slow?


>> it cause real failures?  Could we revert this patch and address the
>> problem some other way?  Which app note is this, BTW?  The one I have on
>> hand, "TLBs, Paging-Structure Caches, and Their Invalidation", Apr 2007,
>> does not seem to mention this restriction.
>>     
>
> http://developer.intel.com/design/processor/applnots/317080.pdf
> Section 6 page 26
>   

Ah, OK.  I have the first version of this document which does not 
mention this.  It would be good to explicitly cite this document by name 
in the comments.

>> Xen with this code in place (touching this code is always non-trivial).
>> I haven't looked into it in depth yet, but there's a few stand out "bad
>> for Xen" pieces of code here.  (And I haven't tested 32-bit yet.)
>>
>> Quick rules for keeping Xen happy here:
>>
>>    1. Xen provides its own initial pagetable; the head_64.S one is
>>       unused when booting under Xen.
>>    2. Xen requires that any pagetable page must always be mapped RO, so
>>       we're careful to not replace an existing mapping with a new one,
>>       in case the existing mapping is a pagetable one.
>>    3. Xen never uses large pages, and the hypervisor will fail any
>>       attempt to do so.
>>     
>
> Thanks for this info. Will get back to you tomorrow.
>   

Great.  Also, do you think you'll have a chance to look at unifying the 
32 and 64 bit code (where 32 uses the 64-bit version)?


Thanks,
    J

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [patch 3/7] x86, cpa: make the kernel physical mapping initialization a two pass sequence
  2008-10-07 15:28       ` Jeremy Fitzhardinge
@ 2008-10-07 20:58         ` Suresh Siddha
  2008-10-07 21:33           ` Jeremy Fitzhardinge
  2008-10-08 19:46           ` Jeremy Fitzhardinge
  0 siblings, 2 replies; 18+ messages in thread
From: Suresh Siddha @ 2008-10-07 20:58 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: Siddha, Suresh B, mingo, hpa, tglx, arjan, Pallipadi, Venkatesh,
	linux-kernel, yinghai

On Tue, Oct 07, 2008 at 08:28:08AM -0700, Jeremy Fitzhardinge wrote:
> Well, that's OK.  We just need to preserve the original page permissions
> when fragmenting the large mappings.  (This isn't a case that affects
> Xen, because it will already be 4k mappings.)

Jeremy, can you please check whether the appended patch fixes your issue and
Ack it? It was test-booted on three different 64-bit platforms with and
without DEBUG_PAGEALLOC.

> Great.  Also, do you think you'll have a chance to look at unifying the
> 32 and 64 bit code (where 32 uses the 64-bit version)?

32-bit can't use the 64-bit version. 64-bit uses large pages in the
static mapping set up by head_64.S, while 32-bit uses small mappings.
I am also not sure why 32-bit Xen didn't break. With or without my
recent changes, the 32-bit kernel always does set_pte() and doesn't seem
to care about the previous mappings. So what is special about 32-bit Xen
that avoids this failure? Thanks.

---
From: Suresh Siddha <suresh.b.siddha@intel.com>
Subject: x64, cpa: modify kernel physical mapping initialization to satisfy Xen

Jeremy Fitzhardinge wrote:

> I'd noticed that current tip/master hasn't been booting under Xen, and I
> just got around to bisecting it down to this change.
>
> commit 065ae73c5462d42e9761afb76f2b52965ff45bd6
> Author: Suresh Siddha <suresh.b.siddha@intel.com>
>
>    x86, cpa: make the kernel physical mapping initialization a two pass sequence
>
> This patch is causing Xen to fail various pagetable updates because it
> ends up remapping pagetables to RW, which Xen explicitly prohibits (as
> that would allow guests to make arbitrary changes to pagetables, rather
> than have them mediated by the hypervisor).

Instead of making init a two-pass sequence, we satisfy Intel's TLB
Application note (developer.intel.com/design/processor/applnots/317080.pdf,
Section 6, page 26) by preserving the original page permissions
when fragmenting the large mappings, and by not touching the existing memory
mappings at all (which also satisfies Xen's requirements).

The only open issue is that, on a native Linux kernel, we go back to mapping
the first 0-1GB of the kernel identity mapping as executable (because of the
static mapping set up in head_64.S). We can fix this in a separate
patch if needed.

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
---

Index: tip/arch/x86/mm/init_64.c
===================================================================
--- tip.orig/arch/x86/mm/init_64.c	2008-10-07 13:30:06.000000000 -0700
+++ tip/arch/x86/mm/init_64.c	2008-10-07 13:30:29.000000000 -0700
@@ -323,10 +323,9 @@
 	early_iounmap(adr, PAGE_SIZE);
 }
 
-static int physical_mapping_iter;
-
 static unsigned long __meminit
-phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end)
+phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end,
+	      pgprot_t prot)
 {
 	unsigned pages = 0;
 	unsigned long last_map_addr = end;
@@ -344,35 +343,40 @@
 			break;
 		}
 
+		/*
+		 * We will re-use the existing mapping.
+		 * Xen for example has some special requirements, like mapping
+		 * pagetable pages as RO. So assume that whoever pre-set up
+		 * these mappings knew what they were doing.
+		 */
 		if (pte_val(*pte))
-			goto repeat_set_pte;
+			continue;
 
 		if (0)
 			printk("   pte=%p addr=%lx pte=%016lx\n",
 			       pte, addr, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL).pte);
 		pages++;
-repeat_set_pte:
-		set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL));
+		set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, prot));
 		last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE;
 	}
 
-	if (physical_mapping_iter == 1)
-		update_page_count(PG_LEVEL_4K, pages);
+	update_page_count(PG_LEVEL_4K, pages);
 
 	return last_map_addr;
 }
 
 static unsigned long __meminit
-phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end)
+phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end,
+		pgprot_t prot)
 {
 	pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd);
 
-	return phys_pte_init(pte, address, end);
+	return phys_pte_init(pte, address, end, prot);
 }
 
 static unsigned long __meminit
 phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
-			 unsigned long page_size_mask)
+	      unsigned long page_size_mask, pgprot_t prot)
 {
 	unsigned long pages = 0;
 	unsigned long last_map_addr = end;
@@ -383,6 +387,7 @@
 		unsigned long pte_phys;
 		pmd_t *pmd = pmd_page + pmd_index(address);
 		pte_t *pte;
+		pgprot_t new_prot = prot;
 
 		if (address >= end) {
 			if (!after_bootmem) {
@@ -396,45 +401,58 @@
 			if (!pmd_large(*pmd)) {
 				spin_lock(&init_mm.page_table_lock);
 				last_map_addr = phys_pte_update(pmd, address,
-								end);
+								end, prot);
 				spin_unlock(&init_mm.page_table_lock);
 				continue;
 			}
-			goto repeat_set_pte;
+			/*
+			 * If we are ok with PG_LEVEL_2M mapping, then we will
+			 * use the existing mapping,
+			 *
+			 * Otherwise, we will split the large page mapping but
+			 * use the same existing protection bits except for
+			 * large page, so that we don't violate Intel's TLB
+			 * Application note (317080) which says, while changing
+			 * the page sizes, new and old translations should
+			 * not differ with respect to page frame and
+			 * attributes.
+			 */
+			if (page_size_mask & (1 << PG_LEVEL_2M))
+				continue;
+			new_prot = pte_pgprot(pte_clrhuge(* (pte_t *)pmd));
 		}
 
 		if (page_size_mask & (1<<PG_LEVEL_2M)) {
 			pages++;
-repeat_set_pte:
 			spin_lock(&init_mm.page_table_lock);
 			set_pte((pte_t *)pmd,
-				pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
+				pfn_pte(address >> PAGE_SHIFT,
+					__pgprot(pgprot_val(prot) | _PAGE_PSE)));
 			spin_unlock(&init_mm.page_table_lock);
 			last_map_addr = (address & PMD_MASK) + PMD_SIZE;
 			continue;
 		}
 
 		pte = alloc_low_page(&pte_phys);
-		last_map_addr = phys_pte_init(pte, address, end);
+		last_map_addr = phys_pte_init(pte, address, end, new_prot);
 		unmap_low_page(pte);
 
 		spin_lock(&init_mm.page_table_lock);
 		pmd_populate_kernel(&init_mm, pmd, __va(pte_phys));
 		spin_unlock(&init_mm.page_table_lock);
 	}
-	if (physical_mapping_iter == 1)
-		update_page_count(PG_LEVEL_2M, pages);
+	update_page_count(PG_LEVEL_2M, pages);
 	return last_map_addr;
 }
 
 static unsigned long __meminit
 phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end,
-			 unsigned long page_size_mask)
+		unsigned long page_size_mask, pgprot_t prot)
 {
 	pmd_t *pmd = pmd_offset(pud, 0);
 	unsigned long last_map_addr;
 
-	last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask);
+	last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask, prot);
 	__flush_tlb_all();
 	return last_map_addr;
 }
@@ -451,6 +469,7 @@
 		unsigned long pmd_phys;
 		pud_t *pud = pud_page + pud_index(addr);
 		pmd_t *pmd;
+		pgprot_t prot = PAGE_KERNEL;
 
 		if (addr >= end)
 			break;
@@ -464,16 +483,28 @@
 		if (pud_val(*pud)) {
 			if (!pud_large(*pud)) {
 				last_map_addr = phys_pmd_update(pud, addr, end,
-							 page_size_mask);
+							 page_size_mask, prot);
 				continue;
 			}
-
-			goto repeat_set_pte;
+			/*
+			 * If we are ok with PG_LEVEL_1G mapping, then we will
+			 * use the existing mapping.
+			 *
+			 * Otherwise, we will split the gbpage mapping but use
+			 * the same existing protection  bits except for large
+			 * page, so that we don't violate Intel's TLB
+			 * Application note (317080) which says, while changing
+			 * the page sizes, new and old translations should
+			 * not differ with respect to page frame and
+			 * attributes.
+			 */
+			if (page_size_mask & (1 << PG_LEVEL_1G))
+				continue;
+			prot = pte_pgprot(pte_clrhuge(* (pte_t *)pud));
 		}
 
 		if (page_size_mask & (1<<PG_LEVEL_1G)) {
 			pages++;
-repeat_set_pte:
 			spin_lock(&init_mm.page_table_lock);
 			set_pte((pte_t *)pud,
 				pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
@@ -483,7 +514,8 @@
 		}
 
 		pmd = alloc_low_page(&pmd_phys);
-		last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask);
+		last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask,
+					      prot);
 		unmap_low_page(pmd);
 
 		spin_lock(&init_mm.page_table_lock);
@@ -492,8 +524,7 @@
 	}
 	__flush_tlb_all();
 
-	if (physical_mapping_iter == 1)
-		update_page_count(PG_LEVEL_1G, pages);
+	update_page_count(PG_LEVEL_1G, pages);
 
 	return last_map_addr;
 }
@@ -558,54 +589,15 @@
 		direct_gbpages = 0;
 }
 
-static int is_kernel(unsigned long pfn)
-{
-	unsigned long pg_addresss = pfn << PAGE_SHIFT;
-
-	if (pg_addresss >= (unsigned long) __pa(_text) &&
-	    pg_addresss < (unsigned long) __pa(_end))
-		return 1;
-
-	return 0;
-}
-
 static unsigned long __init kernel_physical_mapping_init(unsigned long start,
 						unsigned long end,
 						unsigned long page_size_mask)
 {
 
-	unsigned long next, last_map_addr;
-	u64 cached_supported_pte_mask = __supported_pte_mask;
-	unsigned long cache_start = start;
-	unsigned long cache_end = end;
-
-	/*
-	 * First iteration will setup identity mapping using large/small pages
-	 * based on page_size_mask, with other attributes same as set by
-	 * the early code in head_64.S
-	 *
-	 * Second iteration will setup the appropriate attributes
-	 * as desired for the kernel identity mapping.
-	 *
-	 * This two pass mechanism conforms to the TLB app note which says:
-	 *
-	 *     "Software should not write to a paging-structure entry in a way
-	 *      that would change, for any linear address, both the page size
-	 *      and either the page frame or attributes."
-	 *
-	 * For now, only difference between very early PTE attributes used in
-	 * head_64.S and here is _PAGE_NX.
-	 */
-	BUILD_BUG_ON((__PAGE_KERNEL_LARGE & ~__PAGE_KERNEL_IDENT_LARGE_EXEC)
-		     != _PAGE_NX);
-	__supported_pte_mask &= ~(_PAGE_NX);
-	physical_mapping_iter = 1;
+	unsigned long next, last_map_addr = end;
 
-repeat:
-	last_map_addr = cache_end;
-
-	start = (unsigned long)__va(cache_start);
-	end = (unsigned long)__va(cache_end);
+	start = (unsigned long)__va(start);
+	end = (unsigned long)__va(end);
 
 	for (; start < end; start = next) {
 		pgd_t *pgd = pgd_offset_k(start);
@@ -617,21 +609,11 @@
 			next = end;
 
 		if (pgd_val(*pgd)) {
-			/*
-			 * Static identity mappings will be overwritten
-			 * with run-time mappings. For example, this allows
-			 * the static 0-1GB identity mapping to be mapped
-			 * non-executable with this.
-			 */
-			if (is_kernel(pte_pfn(*((pte_t *) pgd))))
-				goto realloc;
-
 			last_map_addr = phys_pud_update(pgd, __pa(start),
 						 __pa(end), page_size_mask);
 			continue;
 		}
 
-realloc:
 		pud = alloc_low_page(&pud_phys);
 		last_map_addr = phys_pud_init(pud, __pa(start), __pa(next),
 						 page_size_mask);
@@ -643,15 +625,6 @@
 	}
 	__flush_tlb_all();
 
-	if (physical_mapping_iter == 1) {
-		physical_mapping_iter = 2;
-		/*
-		 * Second iteration will set the actual desired PTE attributes.
-		 */
-		__supported_pte_mask = cached_supported_pte_mask;
-		goto repeat;
-	}
-
 	return last_map_addr;
 }
 

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [patch 3/7] x86, cpa: make the kernel physical mapping initialization a two pass sequence
  2008-10-07 20:58         ` Suresh Siddha
@ 2008-10-07 21:33           ` Jeremy Fitzhardinge
  2008-10-08 19:46           ` Jeremy Fitzhardinge
  1 sibling, 0 replies; 18+ messages in thread
From: Jeremy Fitzhardinge @ 2008-10-07 21:33 UTC (permalink / raw)
  To: Suresh Siddha
  Cc: mingo, hpa, tglx, arjan, Pallipadi, Venkatesh, linux-kernel, yinghai

Suresh Siddha wrote:
> On Tue, Oct 07, 2008 at 08:28:08AM -0700, Jeremy Fitzhardinge wrote:
>   
>> Well, that's OK.  We just need to preserve the original page permissions
>> when fragmenting the large mappings.  (This isn't a case that affects
>> Xen, because it will already be 4k mappings.)
>>     
>
> Jeremy, can you please check whether the appended patch fixes your issue and
> Ack it? It was test-booted on three different 64-bit platforms with and
> without DEBUG_PAGEALLOC.
>   

Will do.

>> Great.  Also, do you think you'll have a chance to look at unifying the
>> 32 and 64 bit code (where 32 uses the 64-bit version)?
>>     
>
> 32-bit can't use the 64-bit version. 64-bit uses large pages in the
> static mapping set up by head_64.S, while 32-bit uses small mappings.
>   

The 64-bit code would obviously need a little bit of modification to 
make it work.

I don't see why 4k vs 2M initial mappings makes a difference.  32-bit 
dynamically sets up a pagetable in head.S.  The physical mapping setup 
can replace those mappings with large pages if it wants to.

Functionally both pieces of code are doing the same thing, and there's 
no reason why they couldn't be unified.

> I am also not sure why 32-bit Xen didn't break. With or without my
> recent changes, the 32-bit kernel always does set_pte() and doesn't seem
> to care about the previous mappings. So what is special about 32-bit Xen
> that avoids this failure? Thanks.
>   

I have a fairly sleazy hack in 32-bit Xen which means that set_pte 
doesn't override the page permissions when doing the physical mapping 
setup.  It's something I'd like to get rid of.

    J

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [patch 3/7] x86, cpa: make the kernel physical mapping initialization a two pass sequence
  2008-10-07 20:58         ` Suresh Siddha
  2008-10-07 21:33           ` Jeremy Fitzhardinge
@ 2008-10-08 19:46           ` Jeremy Fitzhardinge
  2008-10-08 21:08             ` Ingo Molnar
  1 sibling, 1 reply; 18+ messages in thread
From: Jeremy Fitzhardinge @ 2008-10-08 19:46 UTC (permalink / raw)
  To: Suresh Siddha
  Cc: mingo, hpa, tglx, arjan, Pallipadi, Venkatesh, linux-kernel, yinghai

Suresh Siddha wrote:
> On Tue, Oct 07, 2008 at 08:28:08AM -0700, Jeremy Fitzhardinge wrote:
>   
>> Well, that's OK.  We just need to preserve the original page permissions
>> when fragmenting the large mappings.  (This isn't a case that affects
>> Xen, because it will already be 4k mappings.)
>>     
>
> Jeremy, can you please check whether the appended patch fixes your issue and
> Ack it? It was test-booted on three different 64-bit platforms with and
> without DEBUG_PAGEALLOC.
>   

This patch works fine under Xen.  Thanks for the quick fix.

Tested-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>

    J

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [patch 3/7] x86, cpa: make the kernel physical mapping initialization a two pass sequence
  2008-10-08 19:46           ` Jeremy Fitzhardinge
@ 2008-10-08 21:08             ` Ingo Molnar
  0 siblings, 0 replies; 18+ messages in thread
From: Ingo Molnar @ 2008-10-08 21:08 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: Suresh Siddha, hpa, tglx, arjan, Pallipadi, Venkatesh,
	linux-kernel, yinghai


* Jeremy Fitzhardinge <jeremy@goop.org> wrote:

> Suresh Siddha wrote:
>> On Tue, Oct 07, 2008 at 08:28:08AM -0700, Jeremy Fitzhardinge wrote:
>>   
>>> Well, that's OK.  We just need to preserve the original page permissions
>>> when fragmenting the large mappings.  (This isn't a case that affects
>>> Xen, because it will already be 4k mappings.)
>>>     
>>
>> Jeremy, can you please check whether the appended patch fixes your issue and
>> Ack it? It was test-booted on three different 64-bit platforms with and
>> without DEBUG_PAGEALLOC.
>>   
>
> This patch works fine under Xen.  Thanks for the quick fix.
>
> Tested-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>

applied to tip/x86/pat, thanks guys!

If it fails testing for some reason then it might not make the first
v2.6.28 pull request, but it's definitely moving in the right direction,
so the PAT/TLB changes seem v2.6.28 worthy.

	Ingo

^ permalink raw reply	[flat|nested] 18+ messages in thread

* [patch 3/7] x86, cpa: make the kernel physical mapping initialization a two pass sequence
  2008-09-11 20:30 [patch 0/7] x86, cpa: cpa related changes to be inline with TLB Application note Suresh Siddha
@ 2008-09-11 20:30 ` Suresh Siddha
  0 siblings, 0 replies; 18+ messages in thread
From: Suresh Siddha @ 2008-09-11 20:30 UTC (permalink / raw)
  To: mingo, hpa, tglx, arjan; +Cc: linux-kernel, Suresh Siddha

[-- Attachment #1: fix_large_small_page_identity_mappings.patch --]
[-- Type: text/plain, Size: 8944 bytes --]

In the first pass, the kernel physical mapping is set up using large or
small pages, but with the same PTE attributes as those set up by the
early boot code in head_[32|64].S.

After flushing the TLBs, we go through a second pass, which sets up the
direct-mapped PTEs with the appropriate attributes (like NX, GLOBAL etc.)
that are runtime detectable.

This two pass mechanism conforms to the TLB app note which says:

"Software should not write to a paging-structure entry in a way that would
 change, for any linear address, both the page size and either the page frame
 or attributes."

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
---

Index: tip/arch/x86/mm/init_32.c
===================================================================
--- tip.orig/arch/x86/mm/init_32.c	2008-09-11 12:59:35.000000000 -0700
+++ tip/arch/x86/mm/init_32.c	2008-09-11 13:02:34.000000000 -0700
@@ -196,11 +196,30 @@
 	pgd_t *pgd;
 	pmd_t *pmd;
 	pte_t *pte;
-	unsigned pages_2m = 0, pages_4k = 0;
+	unsigned pages_2m, pages_4k;
+	int mapping_iter;
+
+	/*
+	 * First iteration will setup identity mapping using large/small pages
+	 * based on use_pse, with other attributes same as set by
+	 * the early code in head_32.S
+	 *
+	 * Second iteration will setup the appropriate attributes (NX, GLOBAL..)
+	 * as desired for the kernel identity mapping.
+	 *
+	 * This two pass mechanism conforms to the TLB app note which says:
+	 *
+	 *     "Software should not write to a paging-structure entry in a way
+	 *      that would change, for any linear address, both the page size
+	 *      and either the page frame or attributes."
+	 */
+	mapping_iter = 1;
 
 	if (!cpu_has_pse)
 		use_pse = 0;
 
+repeat:
+	pages_2m = pages_4k = 0;
 	pfn = start_pfn;
 	pgd_idx = pgd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
 	pgd = pgd_base + pgd_idx;
@@ -226,6 +245,13 @@
 			if (use_pse) {
 				unsigned int addr2;
 				pgprot_t prot = PAGE_KERNEL_LARGE;
+				/*
+				 * first pass will use the same initial
+				 * identity mapping attribute + _PAGE_PSE.
+				 */
+				pgprot_t init_prot =
+					__pgprot(PTE_IDENT_ATTR |
+						 _PAGE_PSE);
 
 				addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE +
 					PAGE_OFFSET + PAGE_SIZE-1;
@@ -235,7 +261,10 @@
 					prot = PAGE_KERNEL_LARGE_EXEC;
 
 				pages_2m++;
-				set_pmd(pmd, pfn_pmd(pfn, prot));
+				if (mapping_iter == 1)
+					set_pmd(pmd, pfn_pmd(pfn, init_prot));
+				else
+					set_pmd(pmd, pfn_pmd(pfn, prot));
 
 				pfn += PTRS_PER_PTE;
 				continue;
@@ -247,17 +276,43 @@
 			for (; pte_ofs < PTRS_PER_PTE && pfn < end_pfn;
 			     pte++, pfn++, pte_ofs++, addr += PAGE_SIZE) {
 				pgprot_t prot = PAGE_KERNEL;
+				/*
+				 * first pass will use the same initial
+				 * identity mapping attribute.
+				 */
+				pgprot_t init_prot = __pgprot(PTE_IDENT_ATTR);
 
 				if (is_kernel_text(addr))
 					prot = PAGE_KERNEL_EXEC;
 
 				pages_4k++;
-				set_pte(pte, pfn_pte(pfn, prot));
+				if (mapping_iter == 1)
+					set_pte(pte, pfn_pte(pfn, init_prot));
+				else
+					set_pte(pte, pfn_pte(pfn, prot));
 			}
 		}
 	}
-	update_page_count(PG_LEVEL_2M, pages_2m);
-	update_page_count(PG_LEVEL_4K, pages_4k);
+	if (mapping_iter == 1) {
+		/*
+		 * update direct mapping page count only in the first
+		 * iteration.
+		 */
+		update_page_count(PG_LEVEL_2M, pages_2m);
+		update_page_count(PG_LEVEL_4K, pages_4k);
+
+		/*
+		 * local global flush tlb, which will flush the previous
+		 * mappings present in both small and large page TLB's.
+		 */
+		__flush_tlb_all();
+
+		/*
+		 * Second iteration will set the actual desired PTE attributes.
+		 */
+		mapping_iter = 2;
+		goto repeat;
+	}
 }
 
 /*
Index: tip/arch/x86/mm/init_64.c
===================================================================
--- tip.orig/arch/x86/mm/init_64.c	2008-09-11 12:59:35.000000000 -0700
+++ tip/arch/x86/mm/init_64.c	2008-09-11 13:00:04.000000000 -0700
@@ -326,6 +326,8 @@
 	early_iounmap(adr, PAGE_SIZE);
 }
 
+static int physical_mapping_iter;
+
 static unsigned long __meminit
 phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end)
 {
@@ -346,16 +348,19 @@
 		}
 
 		if (pte_val(*pte))
-			continue;
+			goto repeat_set_pte;
 
 		if (0)
 			printk("   pte=%p addr=%lx pte=%016lx\n",
 			       pte, addr, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL).pte);
+		pages++;
+repeat_set_pte:
 		set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL));
 		last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE;
-		pages++;
 	}
-	update_page_count(PG_LEVEL_4K, pages);
+
+	if (physical_mapping_iter == 1)
+		update_page_count(PG_LEVEL_4K, pages);
 
 	return last_map_addr;
 }
@@ -374,7 +379,6 @@
 {
 	unsigned long pages = 0;
 	unsigned long last_map_addr = end;
-	unsigned long start = address;
 
 	int i = pmd_index(address);
 
@@ -397,15 +401,14 @@
 				last_map_addr = phys_pte_update(pmd, address,
 								end);
 				spin_unlock(&init_mm.page_table_lock);
+				continue;
 			}
-			/* Count entries we're using from level2_ident_pgt */
-			if (start == 0)
-				pages++;
-			continue;
+			goto repeat_set_pte;
 		}
 
 		if (page_size_mask & (1<<PG_LEVEL_2M)) {
 			pages++;
+repeat_set_pte:
 			spin_lock(&init_mm.page_table_lock);
 			set_pte((pte_t *)pmd,
 				pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
@@ -422,7 +425,8 @@
 		pmd_populate_kernel(&init_mm, pmd, __va(pte_phys));
 		spin_unlock(&init_mm.page_table_lock);
 	}
-	update_page_count(PG_LEVEL_2M, pages);
+	if (physical_mapping_iter == 1)
+		update_page_count(PG_LEVEL_2M, pages);
 	return last_map_addr;
 }
 
@@ -461,14 +465,18 @@
 		}
 
 		if (pud_val(*pud)) {
-			if (!pud_large(*pud))
+			if (!pud_large(*pud)) {
 				last_map_addr = phys_pmd_update(pud, addr, end,
 							 page_size_mask);
-			continue;
+				continue;
+			}
+
+			goto repeat_set_pte;
 		}
 
 		if (page_size_mask & (1<<PG_LEVEL_1G)) {
 			pages++;
+repeat_set_pte:
 			spin_lock(&init_mm.page_table_lock);
 			set_pte((pte_t *)pud,
 				pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
@@ -486,7 +494,9 @@
 		spin_unlock(&init_mm.page_table_lock);
 	}
 	__flush_tlb_all();
-	update_page_count(PG_LEVEL_1G, pages);
+
+	if (physical_mapping_iter == 1)
+		update_page_count(PG_LEVEL_1G, pages);
 
 	return last_map_addr;
 }
@@ -550,15 +560,54 @@
 		direct_gbpages = 0;
 }
 
+static int is_kernel(unsigned long pfn)
+{
+	unsigned long pg_addresss = pfn << PAGE_SHIFT;
+
+	if (pg_addresss >= (unsigned long) __pa(_text) &&
+	    pg_addresss <= (unsigned long) __pa(_end))
+		return 1;
+
+	return 0;
+}
+
 static unsigned long __init kernel_physical_mapping_init(unsigned long start,
 						unsigned long end,
 						unsigned long page_size_mask)
 {
 
-	unsigned long next, last_map_addr = end;
+	unsigned long next, last_map_addr;
+	u64 cached_supported_pte_mask = __supported_pte_mask;
+	unsigned long cache_start = start;
+	unsigned long cache_end = end;
+
+	/*
+	 * First iteration will setup identity mapping using large/small pages
+	 * based on page_size_mask, with other attributes same as set by
+	 * the early code in head_64.S
+	 *
+	 * Second iteration will setup the appropriate attributes
+	 * as desired for the kernel identity mapping.
+	 *
+	 * This two pass mechanism conforms to the TLB app note which says:
+	 *
+	 *     "Software should not write to a paging-structure entry in a way
+	 *      that would change, for any linear address, both the page size
+	 *      and either the page frame or attributes."
+	 *
+	 * For now, only difference between very early PTE attributes used in
+	 * head_64.S and here is _PAGE_NX.
+	 */
+	BUILD_BUG_ON((__PAGE_KERNEL_LARGE & ~__PAGE_KERNEL_IDENT_LARGE_EXEC)
+		     != _PAGE_NX);
+	__supported_pte_mask &= ~(_PAGE_NX);
+	physical_mapping_iter = 1;
 
-	start = (unsigned long)__va(start);
-	end = (unsigned long)__va(end);
+repeat:
+	last_map_addr = end;
+
+	start = (unsigned long)__va(cache_start);
+	end = (unsigned long)__va(cache_end);
 
 	for (; start < end; start = next) {
 		pgd_t *pgd = pgd_offset_k(start);
@@ -570,11 +619,21 @@
 			next = end;
 
 		if (pgd_val(*pgd)) {
+			/*
+			 * Static identity mappings will be overwritten
+			 * with run-time mappings. For example, this allows
+			 * the static 0-1GB identity mapping to be mapped
+			 * non-executable with this.
+			 */
+			if (is_kernel(pte_pfn(*((pte_t *) pgd))))
+				goto realloc;
+
 			last_map_addr = phys_pud_update(pgd, __pa(start),
 						 __pa(end), page_size_mask);
 			continue;
 		}
 
+realloc:
 		pud = alloc_low_page(&pud_phys);
 		last_map_addr = phys_pud_init(pud, __pa(start), __pa(next),
 						 page_size_mask);
@@ -584,6 +643,16 @@
 		pgd_populate(&init_mm, pgd, __va(pud_phys));
 		spin_unlock(&init_mm.page_table_lock);
 	}
+	__flush_tlb_all();
+
+	if (physical_mapping_iter == 1) {
+		physical_mapping_iter = 2;
+		/*
+		 * Second iteration will set the actual desired PTE attributes.
+		 */
+		__supported_pte_mask = cached_supported_pte_mask;
+		goto repeat;
+	}
 
 	return last_map_addr;
 }

-- 


^ permalink raw reply	[flat|nested] 18+ messages in thread

end of thread, other threads:[~2008-10-08 21:09 UTC | newest]

Thread overview: 18+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2008-09-23 21:00 [patch 0/7] x86, cpa: cpa related changes to be inline with TLB Application note - v2 Suresh Siddha
2008-09-23 21:00 ` [patch 1/7] x86, cpa: rename PTE attribute macros for kernel direct mapping in early boot Suresh Siddha
2008-09-23 21:00 ` [patch 2/7] x86, cpa: remove USER permission from the very early identity mapping attribute Suresh Siddha
2008-09-23 21:00 ` [patch 3/7] x86, cpa: make the kernel physical mapping initialization a two pass sequence Suresh Siddha
2008-10-06 20:48   ` Jeremy Fitzhardinge
2008-10-06 23:09     ` Jeremy Fitzhardinge
2008-10-07  1:58     ` Suresh Siddha
2008-10-07 15:28       ` Jeremy Fitzhardinge
2008-10-07 20:58         ` Suresh Siddha
2008-10-07 21:33           ` Jeremy Fitzhardinge
2008-10-08 19:46           ` Jeremy Fitzhardinge
2008-10-08 21:08             ` Ingo Molnar
2008-09-23 21:00 ` [patch 4/7] x86, cpa: dont use large pages for kernel identity mapping with DEBUG_PAGEALLOC Suresh Siddha
2008-09-23 21:00 ` [patch 5/7] x86, cpa: no need to check alias for __set_pages_p/__set_pages_np Suresh Siddha
2008-09-23 21:00 ` [patch 6/7] x86, cpa: remove cpa pool code Suresh Siddha
2008-09-23 21:00 ` [patch 7/7] x86, cpa: srlz cpa(), global flush tlb after splitting big page and before doing cpa Suresh Siddha
2008-09-24  8:15 ` [patch 0/7] x86, cpa: cpa related changes to be inline with TLB Application note - v2 Ingo Molnar
  -- strict thread matches above, loose matches on Subject: below --
2008-09-11 20:30 [patch 0/7] x86, cpa: cpa related changes to be inline with TLB Application note Suresh Siddha
2008-09-11 20:30 ` [patch 3/7] x86, cpa: make the kernel physical mapping initialization a two pass sequence Suresh Siddha
