linux-kernel.vger.kernel.org archive mirror
* [PATCH] x86: fix ref-counting bug in change_page_attr()
@ 2007-12-13  9:34 Jan Beulich
  2007-12-14  7:12 ` Jeremy Fitzhardinge
  2007-12-17 13:28 ` Ingo Molnar
  0 siblings, 2 replies; 6+ messages in thread
From: Jan Beulich @ 2007-12-13  9:34 UTC (permalink / raw)
  To: tglx, mingo, hpa; +Cc: Andi Kleen, linux-kernel

The reference counting in change_page_attr() broke in two cases: when
the function was called with the default attributes used by the direct
mapping while the page's attributes were already set to that default,
and when the attributes were changed from one non-default value to
another. This led either to premature restoration of a large page or
to missing the opportunity to restore one.

At the same time, make __PHYSICAL_MASK_SHIFT on 64-bit the value it
architecturally ought to have.
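
To make the intended counting rule concrete, here is a toy user-space
model (illustrative only, not kernel code; PTRS_PER_PTE and the
protection values are just placeholders): private_count stands in for
page_private() of a split large page and is supposed to count only the
entries that deviate from the area's reference protection, so it is
adjusted only when an entry actually crosses the default/non-default
boundary.

/*
 * Toy model of the counting rule - not kernel code.
 */
#include <stdio.h>

#define PTRS_PER_PTE 512

static unsigned int prot[PTRS_PER_PTE];   /* per-entry protection       */
static unsigned int ref_prot = 0x163;     /* made-up default protection */
static unsigned int private_count;        /* entries deviating from it  */

static void change_attr(unsigned int idx, unsigned int new_prot)
{
        unsigned int old_prot = prot[idx];

        if (new_prot != ref_prot && old_prot == ref_prot)
                private_count++;          /* default -> non-default */
        else if (new_prot == ref_prot && old_prot != ref_prot)
                private_count--;          /* non-default -> default */
        /* default -> default and non-default -> non-default must not
         * touch the count - this is what the old code got wrong. */
        prot[idx] = new_prot;

        if (private_count == 0)
                printf("no deviating entries left - the large page "
                       "could be restored\n");
}

int main(void)
{
        unsigned int i;

        for (i = 0; i < PTRS_PER_PTE; i++)
                prot[i] = ref_prot;

        change_attr(0, 0x103);            /* count: 0 -> 1 */
        change_attr(0, 0x113);            /* stays 1, no premature revert */
        change_attr(0, ref_prot);         /* count: 1 -> 0, revert possible */
        return 0;
}

With that rule, changing between two non-default values leaves the count
alone, and reasserting the default on a page that already has it cannot
drive the count negative.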

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Cc: Andi Kleen <ak@suse.de>

 arch/x86/mm/ioremap_64.c     |    4 +-
 arch/x86/mm/pageattr_32.c    |   84 +++++++++++++++++++++++++++++--------------
 arch/x86/mm/pageattr_64.c    |   57 +++++++++++++++++++----------
 include/asm-x86/page_32.h    |   10 +++++
 include/asm-x86/page_64.h    |    2 -
 include/asm-x86/pgtable_32.h |    3 +
 include/asm-x86/pgtable_64.h |    4 +-
 7 files changed, 114 insertions(+), 50 deletions(-)

--- linux-2.6.24-rc5/arch/x86/mm/ioremap_64.c	2007-12-12 11:28:18.000000000 +0100
+++ 2.6.24-rc5-x86-change_page_attr/arch/x86/mm/ioremap_64.c	2007-12-04 16:01:11.000000000 +0100
@@ -48,7 +48,7 @@ ioremap_change_attr(unsigned long phys_a
  		 * Must use a address here and not struct page because the phys addr
 		 * can be a in hole between nodes and not have an memmap entry.
 		 */
-		err = change_page_attr_addr(vaddr,npages,__pgprot(__PAGE_KERNEL|flags));
+		err = change_page_attr_addr(vaddr,npages,MAKE_GLOBAL(__PAGE_KERNEL|flags));
 		if (!err)
 			global_flush_tlb();
 	}
@@ -199,7 +199,7 @@ void iounmap(volatile void __iomem *addr
 
 	/* Reset the direct mapping. Can block */
 	if (p->flags >> 20)
-		ioremap_change_attr(p->phys_addr, p->size, 0);
+		ioremap_change_attr(p->phys_addr, get_vm_area_size(p), 0);
 
 	/* Finally remove it */
 	o = remove_vm_area((void *)addr);
--- linux-2.6.24-rc5/arch/x86/mm/pageattr_32.c	2007-12-12 11:28:18.000000000 +0100
+++ 2.6.24-rc5-x86-change_page_attr/arch/x86/mm/pageattr_32.c	2007-12-04 16:01:11.000000000 +0100
@@ -116,24 +116,22 @@ static void set_pmd_pte(pte_t *kpte, uns
 	spin_unlock_irqrestore(&pgd_lock, flags);
 }
 
+static pgprot_t _ref_prot[KERNEL_PGD_PTRS * PTRS_PER_PMD];
+#define ref_prot(addr) _ref_prot[__pa(addr) >> PMD_SHIFT]
+
 /* 
  * No more special protections in this 2/4MB area - revert to a
  * large page again. 
  */
 static inline void revert_page(struct page *kpte_page, unsigned long address)
 {
-	pgprot_t ref_prot;
 	pte_t *linear;
 
-	ref_prot =
-	((address & LARGE_PAGE_MASK) < (unsigned long)&_etext)
-		? PAGE_KERNEL_LARGE_EXEC : PAGE_KERNEL_LARGE;
-
 	linear = (pte_t *)
 		pmd_offset(pud_offset(pgd_offset_k(address), address), address);
 	set_pmd_pte(linear,  address,
-		    pfn_pte((__pa(address) & LARGE_PAGE_MASK) >> PAGE_SHIFT,
-			    ref_prot));
+		    pte_mkhuge(pfn_pte((__pa(address) & LARGE_PAGE_MASK) >> PAGE_SHIFT,
+				       ref_prot(address))));
 }
 
 static inline void save_page(struct page *kpte_page)
@@ -142,12 +140,22 @@ static inline void save_page(struct page
 		list_add(&kpte_page->lru, &df_list);
 }
 
+static inline int pgprot_match(pgprot_t prot1, pgprot_t prot2)
+{
+	return !((pgprot_val(prot1) ^ pgprot_val(prot2))
+#ifdef CONFIG_X86_PAE
+		 & __supported_pte_mask
+#endif
+		 & ~(_PAGE_ACCESSED|_PAGE_DIRTY));
+}
+
 static int
 __change_page_attr(struct page *page, pgprot_t prot)
 { 
 	pte_t *kpte; 
 	unsigned long address;
 	struct page *kpte_page;
+	pgprot_t old_prot, ref_prot;
 
 	BUG_ON(PageHighMem(page));
 	address = (unsigned long)page_address(page);
@@ -159,29 +167,31 @@ __change_page_attr(struct page *page, pg
 	BUG_ON(PageLRU(kpte_page));
 	BUG_ON(PageCompound(kpte_page));
 
-	if (pgprot_val(prot) != pgprot_val(PAGE_KERNEL)) { 
+	old_prot = pte_pgprot(pte_clrhuge(*kpte));
+	ref_prot = ref_prot(address);
+	if (!pgprot_match(prot, ref_prot)) {
 		if (!pte_huge(*kpte)) {
 			set_pte_atomic(kpte, mk_pte(page, prot)); 
 		} else {
-			pgprot_t ref_prot;
-			struct page *split;
-
-			ref_prot =
-			((address & LARGE_PAGE_MASK) < (unsigned long)&_etext)
-				? PAGE_KERNEL_EXEC : PAGE_KERNEL;
-			split = split_large_page(address, prot, ref_prot);
-			if (!split)
+			BUG_ON(!pgprot_match(old_prot, ref_prot));
+			kpte_page = split_large_page(address, prot, ref_prot);
+			if (!kpte_page)
 				return -ENOMEM;
-			set_pmd_pte(kpte,address,mk_pte(split, ref_prot));
-			kpte_page = split;
+			set_pmd_pte(kpte, address,
+				    mk_pte(kpte_page, PAGE_KERNEL_EXEC));
+		}
+		if (!PageReserved(kpte_page)
+		    && pgprot_match(old_prot, ref_prot))
+			page_private(kpte_page)++;
+	} else if (!pgprot_match(ref_prot, old_prot)) {
+		BUG_ON(pte_huge(*kpte));
+		set_pte_atomic(kpte, mk_pte(page, ref_prot));
+		if (!PageReserved(kpte_page)) {
+			BUG_ON(page_private(kpte_page) == 0);
+			page_private(kpte_page)--;
 		}
-		page_private(kpte_page)++;
-	} else if (!pte_huge(*kpte)) {
-		set_pte_atomic(kpte, mk_pte(page, PAGE_KERNEL));
-		BUG_ON(page_private(kpte_page) == 0);
-		page_private(kpte_page)--;
 	} else
-		BUG();
+		return 0;
 
 	/*
 	 * If the pte was reserved, it means it was created at boot
@@ -190,8 +200,17 @@ __change_page_attr(struct page *page, pg
 	 */
 
 	save_page(kpte_page);
-	if (!PageReserved(kpte_page)) {
-		if (cpu_has_pse && (page_private(kpte_page) == 0)) {
+	if (!PageReserved(kpte_page) && cpu_has_pse) {
+		if (page_private(kpte_page) == PTRS_PER_PTE) {
+			unsigned i;
+
+			kpte = page_address(kpte_page);
+			for (i = 0; i < PTRS_PER_PTE; ++i, ++kpte)
+				if (pgprot_match(pte_pgprot(*kpte), prot))
+					page_private(kpte_page)--;
+			ref_prot(address) = prot;
+		}
+		if (page_private(kpte_page) == 0) {
 			paravirt_release_pt(page_to_pfn(kpte_page));
 			revert_page(kpte_page, address);
 		}
@@ -222,8 +241,21 @@ int change_page_attr(struct page *page, 
 	int err = 0; 
 	int i; 
 	unsigned long flags;
+	static char first = 1;
 
 	spin_lock_irqsave(&cpa_lock, flags);
+
+	if (unlikely(first)) {
+		unsigned long addr = PAGE_OFFSET & PMD_MASK;
+
+		/* This must match is_kernel_text(). */
+		for (; addr <= (unsigned long)__init_end; addr += PMD_SIZE)
+			ref_prot(addr) = PAGE_KERNEL_EXEC;
+		for (; addr > PAGE_OFFSET; addr += PMD_SIZE)
+			ref_prot(addr) = PAGE_KERNEL;
+		first = 0;
+	}
+
 	for (i = 0; i < numpages; i++, page++) { 
 		err = __change_page_attr(page, prot);
 		if (err) 
--- linux-2.6.24-rc5/arch/x86/mm/pageattr_64.c	2007-12-12 11:28:18.000000000 +0100
+++ 2.6.24-rc5-x86-change_page_attr/arch/x86/mm/pageattr_64.c	2007-12-04 16:01:11.000000000 +0100
@@ -98,8 +98,14 @@ static inline void save_page(struct page
 		list_add(&fpage->lru, &deferred_pages);
 }
 
+/* protected by init_mm.mmap_sem */
+static pgprot_t kref_prot[] =	{
+	[0 ... (KERNEL_TEXT_SIZE - 1) >> PMD_SHIFT] = PAGE_KERNEL_EXEC
+};
+#define kref_prot(kaddr) kref_prot[((kaddr) - __START_KERNEL_map) >> PMD_SHIFT]
+
 /* 
- * No more special protections in this 2/4MB area - revert to a
+ * No more special protections in this 2MB area - revert to a
  * large page again. 
  */
 static void revert_page(unsigned long address, pgprot_t ref_prot)
@@ -122,55 +128,68 @@ static void revert_page(unsigned long ad
 	set_pte((pte_t *)pmd, large_pte);
 }      
 
+static inline int pgprot_match(pgprot_t prot1, pgprot_t prot2)
+{
+	return !((pgprot_val(prot1) ^ pgprot_val(prot2))
+		 & __supported_pte_mask & ~(_PAGE_ACCESSED|_PAGE_DIRTY));
+}
+
 static int
 __change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot,
 				   pgprot_t ref_prot)
 { 
 	pte_t *kpte; 
 	struct page *kpte_page;
-	pgprot_t ref_prot2;
+	pgprot_t old_prot;
 
 	kpte = lookup_address(address);
 	if (!kpte) return 0;
-	kpte_page = virt_to_page(((unsigned long)kpte) & PAGE_MASK);
+	kpte_page = virt_to_page(kpte);
 	BUG_ON(PageLRU(kpte_page));
 	BUG_ON(PageCompound(kpte_page));
-	if (pgprot_val(prot) != pgprot_val(ref_prot)) { 
+	old_prot = pte_pgprot(pte_clrhuge(*kpte));
+	if (!pgprot_match(prot, ref_prot)) {
 		if (!pte_huge(*kpte)) {
 			set_pte(kpte, pfn_pte(pfn, prot));
 		} else {
- 			/*
-			 * split_large_page will take the reference for this
-			 * change_page_attr on the split page.
- 			 */
-			struct page *split;
-			ref_prot2 = pte_pgprot(pte_clrhuge(*kpte));
-			split = split_large_page(address, prot, ref_prot2);
-			if (!split)
+			BUG_ON(!pgprot_match(old_prot, ref_prot));
+			kpte_page = split_large_page(address, prot, ref_prot);
+			if (!kpte_page)
 				return -ENOMEM;
-			pgprot_val(ref_prot2) &= ~_PAGE_NX;
-			set_pte(kpte, mk_pte(split, ref_prot2));
-			kpte_page = split;
+			set_pte(kpte, mk_pte(kpte_page, PAGE_KERNEL_EXEC));
 		}
-		page_private(kpte_page)++;
-	} else if (!pte_huge(*kpte)) {
+		if (pgprot_match(old_prot, ref_prot))
+			page_private(kpte_page)++;
+	} else if (!pgprot_match(ref_prot, old_prot)) {
+		BUG_ON(pte_huge(*kpte));
 		set_pte(kpte, pfn_pte(pfn, ref_prot));
 		BUG_ON(page_private(kpte_page) == 0);
 		page_private(kpte_page)--;
 	} else
-		BUG();
+		return 0;
 
 	/* on x86-64 the direct mapping set at boot is not using 4k pages */
  	BUG_ON(PageReserved(kpte_page));
 
 	save_page(kpte_page);
+	if (page_private(kpte_page) == PTRS_PER_PTE
+	    && address >= __START_KERNEL_map
+	    && address < __START_KERNEL_map + KERNEL_TEXT_SIZE) {
+		unsigned i;
+
+		kpte = page_address(kpte_page);
+		for (i = 0; i < PTRS_PER_PTE; ++i, ++kpte)
+			if (pgprot_match(pte_pgprot(*kpte), prot))
+				page_private(kpte_page)--;
+		kref_prot(address) = ref_prot = prot;
+	}
 	if (page_private(kpte_page) == 0)
 		revert_page(address, ref_prot);
 	return 0;
 } 
 
 /*
- * Change the page attributes of an page in the linear mapping.
+ * Change the page attributes of a page in the linear mapping.
  *
  * This should be used when a page is mapped with a different caching policy
  * than write-back somewhere - some CPUs do not like it when mappings with
--- linux-2.6.24-rc5/include/asm-x86/page_32.h	2007-12-12 11:29:30.000000000 +0100
+++ 2.6.24-rc5-x86-change_page_attr/include/asm-x86/page_32.h	2007-12-04 16:01:11.000000000 +0100
@@ -6,6 +6,16 @@
 #define PAGE_SIZE	(1UL << PAGE_SHIFT)
 #define PAGE_MASK	(~(PAGE_SIZE-1))
 
+#ifdef CONFIG_X86_PAE
+#define __PHYSICAL_MASK_SHIFT	52
+#define __PHYSICAL_MASK		((1ULL << __PHYSICAL_MASK_SHIFT) - 1)
+#define PHYSICAL_PAGE_MASK	(~(PAGE_SIZE - 1ULL) & __PHYSICAL_MASK)
+#else
+#define __PHYSICAL_MASK_SHIFT	32
+#define __PHYSICAL_MASK		(~0UL)
+#define PHYSICAL_PAGE_MASK	(PAGE_MASK & __PHYSICAL_MASK)
+#endif
+
 #define LARGE_PAGE_MASK (~(LARGE_PAGE_SIZE-1))
 #define LARGE_PAGE_SIZE (1UL << PMD_SHIFT)
 
--- linux-2.6.24-rc5/include/asm-x86/page_64.h	2007-12-12 11:29:30.000000000 +0100
+++ 2.6.24-rc5-x86-change_page_attr/include/asm-x86/page_64.h	2007-12-04 16:01:11.000000000 +0100
@@ -98,7 +98,7 @@ extern unsigned long phys_base;
 #define PAGE_ALIGN(addr)	(((addr)+PAGE_SIZE-1)&PAGE_MASK)
 
 /* See Documentation/x86_64/mm.txt for a description of the memory map. */
-#define __PHYSICAL_MASK_SHIFT	46
+#define __PHYSICAL_MASK_SHIFT	52
 #define __PHYSICAL_MASK		((_AC(1,UL) << __PHYSICAL_MASK_SHIFT) - 1)
 #define __VIRTUAL_MASK_SHIFT	48
 #define __VIRTUAL_MASK		((_AC(1,UL) << __VIRTUAL_MASK_SHIFT) - 1)
--- linux-2.6.24-rc5/include/asm-x86/pgtable_32.h	2007-12-12 11:29:30.000000000 +0100
+++ 2.6.24-rc5-x86-change_page_attr/include/asm-x86/pgtable_32.h	2007-12-04 16:01:11.000000000 +0100
@@ -228,11 +228,14 @@ static inline int pte_file(pte_t pte)		{
 static inline pte_t pte_mkclean(pte_t pte)	{ (pte).pte_low &= ~_PAGE_DIRTY; return pte; }
 static inline pte_t pte_mkold(pte_t pte)	{ (pte).pte_low &= ~_PAGE_ACCESSED; return pte; }
 static inline pte_t pte_wrprotect(pte_t pte)	{ (pte).pte_low &= ~_PAGE_RW; return pte; }
+static inline pte_t pte_clrhuge(pte_t pte)	{ (pte).pte_low &= ~_PAGE_PSE; return pte; }
 static inline pte_t pte_mkdirty(pte_t pte)	{ (pte).pte_low |= _PAGE_DIRTY; return pte; }
 static inline pte_t pte_mkyoung(pte_t pte)	{ (pte).pte_low |= _PAGE_ACCESSED; return pte; }
 static inline pte_t pte_mkwrite(pte_t pte)	{ (pte).pte_low |= _PAGE_RW; return pte; }
 static inline pte_t pte_mkhuge(pte_t pte)	{ (pte).pte_low |= _PAGE_PSE; return pte; }
 
+#define pte_pgprot(pte) (__pgprot(pte_val(pte) & ~PHYSICAL_PAGE_MASK))
+
 #ifdef CONFIG_X86_PAE
 # include <asm/pgtable-3level.h>
 #else
--- linux-2.6.24-rc5/include/asm-x86/pgtable_64.h	2007-12-12 11:29:30.000000000 +0100
+++ 2.6.24-rc5-x86-change_page_attr/include/asm-x86/pgtable_64.h	2007-12-04 16:01:11.000000000 +0100
@@ -344,9 +344,9 @@ static inline int pmd_large(pmd_t pte) {
 #define pfn_pmd(nr,prot) (__pmd(((nr) << PAGE_SHIFT) | pgprot_val(prot)))
 #define pmd_pfn(x)  ((pmd_val(x) & __PHYSICAL_MASK) >> PAGE_SHIFT)
 
-#define pte_to_pgoff(pte) ((pte_val(pte) & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT)
+#define pte_to_pgoff(pte) (pte_val(pte) >> PAGE_SHIFT)
 #define pgoff_to_pte(off) ((pte_t) { ((off) << PAGE_SHIFT) | _PAGE_FILE })
-#define PTE_FILE_MAX_BITS __PHYSICAL_MASK_SHIFT
+#define PTE_FILE_MAX_BITS (64 - PAGE_SHIFT)
 
 /* PTE - Level 1 access. */
 




* Re: [PATCH] x86: fix ref-counting bug in change_page_attr()
  2007-12-13  9:34 [PATCH] x86: fix ref-counting bug in change_page_attr() Jan Beulich
@ 2007-12-14  7:12 ` Jeremy Fitzhardinge
  2007-12-14  8:38   ` Jan Beulich
  2007-12-17 13:28 ` Ingo Molnar
  1 sibling, 1 reply; 6+ messages in thread
From: Jeremy Fitzhardinge @ 2007-12-14  7:12 UTC (permalink / raw)
  To: Jan Beulich; +Cc: tglx, mingo, hpa, Andi Kleen, linux-kernel

Jan Beulich wrote:
> The reference counting in change_page_attr() broke in two cases: when
> the function was called with the default attributes used by the direct
> mapping while the page's attributes were already set to that default,
> and when the attributes were changed from one non-default value to
> another. This led either to premature restoration of a large page or
> to missing the opportunity to restore one.
>
> At the same time, make __PHYSICAL_MASK_SHIFT on 64-bit the value it
> architecturally ought to have.

Could you put this in a separate patch?  I have a bunch of page*.h and
pgtable*.h refactoring patches which will conflict with this.

    J




* Re: [PATCH] x86: fix ref-counting bug in change_page_attr()
  2007-12-14  7:12 ` Jeremy Fitzhardinge
@ 2007-12-14  8:38   ` Jan Beulich
  0 siblings, 0 replies; 6+ messages in thread
From: Jan Beulich @ 2007-12-14  8:38 UTC (permalink / raw)
  To: Jeremy Fitzhardinge; +Cc: tglx, mingo, Andi Kleen, linux-kernel, hpa

>>> Jeremy Fitzhardinge <jeremy@goop.org> 14.12.07 08:12 >>>
>Jan Beulich wrote:
>> The reference counting in change_page_attr() broke in two cases: when
>> the function was called with the default attributes used by the direct
>> mapping while the page's attributes were already set to that default,
>> and when the attributes were changed from one non-default value to
>> another. This led either to premature restoration of a large page or
>> to missing the opportunity to restore one.
>>
>> At the same time, make __PHYSICAL_MASK_SHIFT on 64-bit the value it
>> architecturally ought to have.
>
>Could you put this in a separate patch?  I have a bunch of page*.h and
>pgtable*.h refactoring patches which will conflict with this.

It doesn't seem logical to do so: the patch needs to introduce the definitions
for 32-bit (in order to define pte_pgprot()), and not doing the adjustment
for 64-bit here would mean (a) becoming inconsistent and (b) the pte_pgprot()
there being incorrect. So such a split-out patch would have to be a
prerequisite to this one, which wouldn't help avoid the collisions with
your unification patches.
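
As a rough sketch of point (b) (stand-alone code, not the actual headers;
the protection value 0x163 and the use of physical bit 47 are made up, and
it assumes the 64-bit pte_pgprot() keeps being built from
PHYSICAL_PAGE_MASK): if __PHYSICAL_MASK_SHIFT stayed at 46 while a PTE
carries a physical address using higher bits, those address bits would end
up in the value pte_pgprot() returns.

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12

/* Model of pte & ~PHYSICAL_PAGE_MASK for a given __PHYSICAL_MASK_SHIFT. */
static uint64_t pgprot_of(uint64_t pte, unsigned int phys_shift)
{
        uint64_t phys_page_mask = ((1ULL << phys_shift) - 1)
                                  & ~((1ULL << PAGE_SHIFT) - 1);

        return pte & ~phys_page_mask;
}

int main(void)
{
        uint64_t phys = 1ULL << 47;       /* hypothetical page above 2^46 */
        uint64_t prot = 0x163;            /* made-up protection bits */
        uint64_t pte  = phys | prot;

        /* shift 46: physical address bit 47 leaks into the "pgprot" */
        printf("shift 46: %#llx\n", (unsigned long long)pgprot_of(pte, 46));
        /* shift 52: only the real protection bits remain */
        printf("shift 52: %#llx\n", (unsigned long long)pgprot_of(pte, 52));
        return 0;
}

Raising the shift to 52 (and adding the 32-bit definitions) keeps both
variants consistent, which is why the change belongs in this patch.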

Jan



* Re: [PATCH] x86: fix ref-counting bug in change_page_attr()
  2007-12-13  9:34 [PATCH] x86: fix ref-counting bug in change_page_attr() Jan Beulich
  2007-12-14  7:12 ` Jeremy Fitzhardinge
@ 2007-12-17 13:28 ` Ingo Molnar
  2007-12-17 13:41   ` Jan Beulich
  1 sibling, 1 reply; 6+ messages in thread
From: Ingo Molnar @ 2007-12-17 13:28 UTC (permalink / raw)
  To: Jan Beulich; +Cc: tglx, mingo, hpa, Andi Kleen, linux-kernel, Andrew Morton


* Jan Beulich <jbeulich@novell.com> wrote:

> The reference counting in change_page_attr() broke in two cases: when
> the function was called with the default attributes used by the direct
> mapping while the page's attributes were already set to that default,
> and when the attributes were changed from one non-default value to
> another. This led either to premature restoration of a large page or
> to missing the opportunity to restore one.
> 
> At the same time, make __PHYSICAL_MASK_SHIFT on 64-bit the value it
> architecturally ought to have.

hm, does this patch fix any real bug seen live? (if yes then do you have 
links to it, etc?) It would be a v2.6.24 fix in theory but it looks too 
dangerous for that. So i've queued it up for v2.6.25, for the time 
being.

	Ingo


* Re: [PATCH] x86: fix ref-counting bug in change_page_attr()
  2007-12-17 13:28 ` Ingo Molnar
@ 2007-12-17 13:41   ` Jan Beulich
  2007-12-17 14:34     ` Ingo Molnar
  0 siblings, 1 reply; 6+ messages in thread
From: Jan Beulich @ 2007-12-17 13:41 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: tglx, Andrew Morton, mingo, Andi Kleen, linux-kernel, hpa

>>> Ingo Molnar <mingo@elte.hu> 17.12.07 14:28 >>>
>
>* Jan Beulich <jbeulich@novell.com> wrote:
>
>> The reference counting in change_page_attr() broke in two cases: when
>> the function was called with the default attributes used by the direct
>> mapping while the page's attributes were already set to that default,
>> and when the attributes were changed from one non-default value to
>> another. This led either to premature restoration of a large page or
>> to missing the opportunity to restore one.
>> 
>> At the same time, make __PHYSICAL_MASK_SHIFT on 64-bit the value it
>> architecturally ought to have.
>
>hm, does this patch fix any real bug seen live? (if yes then do you have 
>links to it, etc?) It would be a v2.6.24 fix in theory but it looks too 
>dangerous for that. So i've queued it up for v2.6.25, for the time 
>being.

I had run into the issue with experimental code of mine quite a while back
(i.e. I don't even recall what exactly I was doing back then). After that I
just continued to keep the fix (which I had submitted before, but which
collided with something in Andi's tree back then iirc).

Jan



* Re: [PATCH] x86: fix ref-counting bug in change_page_attr()
  2007-12-17 13:41   ` Jan Beulich
@ 2007-12-17 14:34     ` Ingo Molnar
  0 siblings, 0 replies; 6+ messages in thread
From: Ingo Molnar @ 2007-12-17 14:34 UTC (permalink / raw)
  To: Jan Beulich; +Cc: tglx, Andrew Morton, mingo, Andi Kleen, linux-kernel, hpa


* Jan Beulich <jbeulich@novell.com> wrote:

> > hm, does this patch fix any real bug seen live? (if yes then do you 
> > have links to it, etc?) It would be a v2.6.24 fix in theory but it 
> > looks too dangerous for that. So i've queued it up for v2.6.25, for 
> > the time being.
> 
> I had run into the issue with experimental code of mine quite a while 
> back (i.e. I don't even recall what exactly I was doing back then). 
> After that I just continued to keep the fix (which I had submitted 
> before, but which collided with something in Andi's tree back then 
> iirc).

i'd be inclined to have it in the for-2.6.25 queue then - even plain 
bugfixes in the pageattr code have broken boxes in the past, so this 
isn't something we want to touch so late in the cycle. (We can still 
reprioritize it if a serious regression pops up that this addresses.)

	Ingo

