* [PATCH] x86: fix ref-counting bug in change_page_attr()
@ 2007-12-13 9:34 Jan Beulich
2007-12-14 7:12 ` Jeremy Fitzhardinge
2007-12-17 13:28 ` Ingo Molnar
0 siblings, 2 replies; 6+ messages in thread
From: Jan Beulich @ 2007-12-13 9:34 UTC (permalink / raw)
To: tglx, mingo, hpa; +Cc: Andi Kleen, linux-kernel
When either calling change_page_attr() with the default attributes
pages in the direct mapping have and a page's attributes already were
set to the default or when changing the attributes from one non-default
value to another, the reference counting broke, leading to either
premature restoration of a large page or missing the opportunity to do
so.
At the same time, make __PHYSICAL_MASK_SHIFT on 64-bits the value it
architecturally ought to have.
Signed-off-by: Jan Beulich <jbeulich@novell.com>
Cc: Andi Kleen <ak@suse.de>
arch/x86/mm/ioremap_64.c | 4 +-
arch/x86/mm/pageattr_32.c | 84 +++++++++++++++++++++++++++++--------------
arch/x86/mm/pageattr_64.c | 57 +++++++++++++++++++----------
include/asm-x86/page_32.h | 10 +++++
include/asm-x86/page_64.h | 2 -
include/asm-x86/pgtable_32.h | 3 +
include/asm-x86/pgtable_64.h | 4 +-
7 files changed, 114 insertions(+), 50 deletions(-)
--- linux-2.6.24-rc5/arch/x86/mm/ioremap_64.c 2007-12-12 11:28:18.000000000 +0100
+++ 2.6.24-rc5-x86-change_page_attr/arch/x86/mm/ioremap_64.c 2007-12-04 16:01:11.000000000 +0100
@@ -48,7 +48,7 @@ ioremap_change_attr(unsigned long phys_a
* Must use a address here and not struct page because the phys addr
* can be a in hole between nodes and not have an memmap entry.
*/
- err = change_page_attr_addr(vaddr,npages,__pgprot(__PAGE_KERNEL|flags));
+ err = change_page_attr_addr(vaddr,npages,MAKE_GLOBAL(__PAGE_KERNEL|flags));
if (!err)
global_flush_tlb();
}
@@ -199,7 +199,7 @@ void iounmap(volatile void __iomem *addr
/* Reset the direct mapping. Can block */
if (p->flags >> 20)
- ioremap_change_attr(p->phys_addr, p->size, 0);
+ ioremap_change_attr(p->phys_addr, get_vm_area_size(p), 0);
/* Finally remove it */
o = remove_vm_area((void *)addr);
--- linux-2.6.24-rc5/arch/x86/mm/pageattr_32.c 2007-12-12 11:28:18.000000000 +0100
+++ 2.6.24-rc5-x86-change_page_attr/arch/x86/mm/pageattr_32.c 2007-12-04 16:01:11.000000000 +0100
@@ -116,24 +116,22 @@ static void set_pmd_pte(pte_t *kpte, uns
spin_unlock_irqrestore(&pgd_lock, flags);
}
+static pgprot_t _ref_prot[KERNEL_PGD_PTRS * PTRS_PER_PMD];
+#define ref_prot(addr) _ref_prot[__pa(addr) >> PMD_SHIFT]
+
/*
* No more special protections in this 2/4MB area - revert to a
* large page again.
*/
static inline void revert_page(struct page *kpte_page, unsigned long address)
{
- pgprot_t ref_prot;
pte_t *linear;
- ref_prot =
- ((address & LARGE_PAGE_MASK) < (unsigned long)&_etext)
- ? PAGE_KERNEL_LARGE_EXEC : PAGE_KERNEL_LARGE;
-
linear = (pte_t *)
pmd_offset(pud_offset(pgd_offset_k(address), address), address);
set_pmd_pte(linear, address,
- pfn_pte((__pa(address) & LARGE_PAGE_MASK) >> PAGE_SHIFT,
- ref_prot));
+ pte_mkhuge(pfn_pte((__pa(address) & LARGE_PAGE_MASK) >> PAGE_SHIFT,
+ ref_prot(address))));
}
static inline void save_page(struct page *kpte_page)
@@ -142,12 +140,22 @@ static inline void save_page(struct page
list_add(&kpte_page->lru, &df_list);
}
+static inline int pgprot_match(pgprot_t prot1, pgprot_t prot2)
+{
+ return !((pgprot_val(prot1) ^ pgprot_val(prot2))
+#ifdef CONFIG_X86_PAE
+ & __supported_pte_mask
+#endif
+ & ~(_PAGE_ACCESSED|_PAGE_DIRTY));
+}
+
static int
__change_page_attr(struct page *page, pgprot_t prot)
{
pte_t *kpte;
unsigned long address;
struct page *kpte_page;
+ pgprot_t old_prot, ref_prot;
BUG_ON(PageHighMem(page));
address = (unsigned long)page_address(page);
@@ -159,29 +167,31 @@ __change_page_attr(struct page *page, pg
BUG_ON(PageLRU(kpte_page));
BUG_ON(PageCompound(kpte_page));
- if (pgprot_val(prot) != pgprot_val(PAGE_KERNEL)) {
+ old_prot = pte_pgprot(pte_clrhuge(*kpte));
+ ref_prot = ref_prot(address);
+ if (!pgprot_match(prot, ref_prot)) {
if (!pte_huge(*kpte)) {
set_pte_atomic(kpte, mk_pte(page, prot));
} else {
- pgprot_t ref_prot;
- struct page *split;
-
- ref_prot =
- ((address & LARGE_PAGE_MASK) < (unsigned long)&_etext)
- ? PAGE_KERNEL_EXEC : PAGE_KERNEL;
- split = split_large_page(address, prot, ref_prot);
- if (!split)
+ BUG_ON(!pgprot_match(old_prot, ref_prot));
+ kpte_page = split_large_page(address, prot, ref_prot);
+ if (!kpte_page)
return -ENOMEM;
- set_pmd_pte(kpte,address,mk_pte(split, ref_prot));
- kpte_page = split;
+ set_pmd_pte(kpte, address,
+ mk_pte(kpte_page, PAGE_KERNEL_EXEC));
+ }
+ if (!PageReserved(kpte_page)
+ && pgprot_match(old_prot, ref_prot))
+ page_private(kpte_page)++;
+ } else if (!pgprot_match(ref_prot, old_prot)) {
+ BUG_ON(pte_huge(*kpte));
+ set_pte_atomic(kpte, mk_pte(page, ref_prot));
+ if (!PageReserved(kpte_page)) {
+ BUG_ON(page_private(kpte_page) == 0);
+ page_private(kpte_page)--;
}
- page_private(kpte_page)++;
- } else if (!pte_huge(*kpte)) {
- set_pte_atomic(kpte, mk_pte(page, PAGE_KERNEL));
- BUG_ON(page_private(kpte_page) == 0);
- page_private(kpte_page)--;
} else
- BUG();
+ return 0;
/*
* If the pte was reserved, it means it was created at boot
@@ -190,8 +200,17 @@ __change_page_attr(struct page *page, pg
*/
save_page(kpte_page);
- if (!PageReserved(kpte_page)) {
- if (cpu_has_pse && (page_private(kpte_page) == 0)) {
+ if (!PageReserved(kpte_page) && cpu_has_pse) {
+ if (page_private(kpte_page) == PTRS_PER_PTE) {
+ unsigned i;
+
+ kpte = page_address(kpte_page);
+ for (i = 0; i < PTRS_PER_PTE; ++i, ++kpte)
+ if (pgprot_match(pte_pgprot(*kpte), prot))
+ page_private(kpte_page)--;
+ ref_prot(address) = prot;
+ }
+ if (page_private(kpte_page) == 0) {
paravirt_release_pt(page_to_pfn(kpte_page));
revert_page(kpte_page, address);
}
@@ -222,8 +241,21 @@ int change_page_attr(struct page *page,
int err = 0;
int i;
unsigned long flags;
+ static char first = 1;
spin_lock_irqsave(&cpa_lock, flags);
+
+ if (unlikely(first)) {
+ unsigned long addr = PAGE_OFFSET & PMD_MASK;
+
+ /* This must match is_kernel_text(). */
+ for (; addr <= (unsigned long)__init_end; addr += PMD_SIZE)
+ ref_prot(addr) = PAGE_KERNEL_EXEC;
+ for (; addr > PAGE_OFFSET; addr += PMD_SIZE)
+ ref_prot(addr) = PAGE_KERNEL;
+ first = 0;
+ }
+
for (i = 0; i < numpages; i++, page++) {
err = __change_page_attr(page, prot);
if (err)
--- linux-2.6.24-rc5/arch/x86/mm/pageattr_64.c 2007-12-12 11:28:18.000000000 +0100
+++ 2.6.24-rc5-x86-change_page_attr/arch/x86/mm/pageattr_64.c 2007-12-04 16:01:11.000000000 +0100
@@ -98,8 +98,14 @@ static inline void save_page(struct page
list_add(&fpage->lru, &deferred_pages);
}
+/* protected by init_mm.mmap_sem */
+static pgprot_t kref_prot[] = {
+ [0 ... (KERNEL_TEXT_SIZE - 1) >> PMD_SHIFT] = PAGE_KERNEL_EXEC
+};
+#define kref_prot(kaddr) kref_prot[((kaddr) - __START_KERNEL_map) >> PMD_SHIFT]
+
/*
- * No more special protections in this 2/4MB area - revert to a
+ * No more special protections in this 2MB area - revert to a
* large page again.
*/
static void revert_page(unsigned long address, pgprot_t ref_prot)
@@ -122,55 +128,68 @@ static void revert_page(unsigned long ad
set_pte((pte_t *)pmd, large_pte);
}
+static inline int pgprot_match(pgprot_t prot1, pgprot_t prot2)
+{
+ return !((pgprot_val(prot1) ^ pgprot_val(prot2))
+ & __supported_pte_mask & ~(_PAGE_ACCESSED|_PAGE_DIRTY));
+}
+
static int
__change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot,
pgprot_t ref_prot)
{
pte_t *kpte;
struct page *kpte_page;
- pgprot_t ref_prot2;
+ pgprot_t old_prot;
kpte = lookup_address(address);
if (!kpte) return 0;
- kpte_page = virt_to_page(((unsigned long)kpte) & PAGE_MASK);
+ kpte_page = virt_to_page(kpte);
BUG_ON(PageLRU(kpte_page));
BUG_ON(PageCompound(kpte_page));
- if (pgprot_val(prot) != pgprot_val(ref_prot)) {
+ old_prot = pte_pgprot(pte_clrhuge(*kpte));
+ if (!pgprot_match(prot, ref_prot)) {
if (!pte_huge(*kpte)) {
set_pte(kpte, pfn_pte(pfn, prot));
} else {
- /*
- * split_large_page will take the reference for this
- * change_page_attr on the split page.
- */
- struct page *split;
- ref_prot2 = pte_pgprot(pte_clrhuge(*kpte));
- split = split_large_page(address, prot, ref_prot2);
- if (!split)
+ BUG_ON(!pgprot_match(old_prot, ref_prot));
+ kpte_page = split_large_page(address, prot, ref_prot);
+ if (!kpte_page)
return -ENOMEM;
- pgprot_val(ref_prot2) &= ~_PAGE_NX;
- set_pte(kpte, mk_pte(split, ref_prot2));
- kpte_page = split;
+ set_pte(kpte, mk_pte(kpte_page, PAGE_KERNEL_EXEC));
}
- page_private(kpte_page)++;
- } else if (!pte_huge(*kpte)) {
+ if (pgprot_match(old_prot, ref_prot))
+ page_private(kpte_page)++;
+ } else if (!pgprot_match(ref_prot, old_prot)) {
+ BUG_ON(pte_huge(*kpte));
set_pte(kpte, pfn_pte(pfn, ref_prot));
BUG_ON(page_private(kpte_page) == 0);
page_private(kpte_page)--;
} else
- BUG();
+ return 0;
/* on x86-64 the direct mapping set at boot is not using 4k pages */
BUG_ON(PageReserved(kpte_page));
save_page(kpte_page);
+ if (page_private(kpte_page) == PTRS_PER_PTE
+ && address >= __START_KERNEL_map
+ && address < __START_KERNEL_map + KERNEL_TEXT_SIZE) {
+ unsigned i;
+
+ kpte = page_address(kpte_page);
+ for (i = 0; i < PTRS_PER_PTE; ++i, ++kpte)
+ if (pgprot_match(pte_pgprot(*kpte), prot))
+ page_private(kpte_page)--;
+ kref_prot(address) = ref_prot = prot;
+ }
if (page_private(kpte_page) == 0)
revert_page(address, ref_prot);
return 0;
}
/*
- * Change the page attributes of an page in the linear mapping.
+ * Change the page attributes of a page in the linear mapping.
*
* This should be used when a page is mapped with a different caching policy
* than write-back somewhere - some CPUs do not like it when mappings with
--- linux-2.6.24-rc5/include/asm-x86/page_32.h 2007-12-12 11:29:30.000000000 +0100
+++ 2.6.24-rc5-x86-change_page_attr/include/asm-x86/page_32.h 2007-12-04 16:01:11.000000000 +0100
@@ -6,6 +6,16 @@
#define PAGE_SIZE (1UL << PAGE_SHIFT)
#define PAGE_MASK (~(PAGE_SIZE-1))
+#ifdef CONFIG_X86_PAE
+#define __PHYSICAL_MASK_SHIFT 52
+#define __PHYSICAL_MASK ((1ULL << __PHYSICAL_MASK_SHIFT) - 1)
+#define PHYSICAL_PAGE_MASK (~(PAGE_SIZE - 1ULL) & __PHYSICAL_MASK)
+#else
+#define __PHYSICAL_MASK_SHIFT 32
+#define __PHYSICAL_MASK (~0UL)
+#define PHYSICAL_PAGE_MASK (PAGE_MASK & __PHYSICAL_MASK)
+#endif
+
#define LARGE_PAGE_MASK (~(LARGE_PAGE_SIZE-1))
#define LARGE_PAGE_SIZE (1UL << PMD_SHIFT)
--- linux-2.6.24-rc5/include/asm-x86/page_64.h 2007-12-12 11:29:30.000000000 +0100
+++ 2.6.24-rc5-x86-change_page_attr/include/asm-x86/page_64.h 2007-12-04 16:01:11.000000000 +0100
@@ -98,7 +98,7 @@ extern unsigned long phys_base;
#define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK)
/* See Documentation/x86_64/mm.txt for a description of the memory map. */
-#define __PHYSICAL_MASK_SHIFT 46
+#define __PHYSICAL_MASK_SHIFT 52
#define __PHYSICAL_MASK ((_AC(1,UL) << __PHYSICAL_MASK_SHIFT) - 1)
#define __VIRTUAL_MASK_SHIFT 48
#define __VIRTUAL_MASK ((_AC(1,UL) << __VIRTUAL_MASK_SHIFT) - 1)
--- linux-2.6.24-rc5/include/asm-x86/pgtable_32.h 2007-12-12 11:29:30.000000000 +0100
+++ 2.6.24-rc5-x86-change_page_attr/include/asm-x86/pgtable_32.h 2007-12-04 16:01:11.000000000 +0100
@@ -228,11 +228,14 @@ static inline int pte_file(pte_t pte) {
static inline pte_t pte_mkclean(pte_t pte) { (pte).pte_low &= ~_PAGE_DIRTY; return pte; }
static inline pte_t pte_mkold(pte_t pte) { (pte).pte_low &= ~_PAGE_ACCESSED; return pte; }
static inline pte_t pte_wrprotect(pte_t pte) { (pte).pte_low &= ~_PAGE_RW; return pte; }
+static inline pte_t pte_clrhuge(pte_t pte) { (pte).pte_low &= ~_PAGE_PSE; return pte; }
static inline pte_t pte_mkdirty(pte_t pte) { (pte).pte_low |= _PAGE_DIRTY; return pte; }
static inline pte_t pte_mkyoung(pte_t pte) { (pte).pte_low |= _PAGE_ACCESSED; return pte; }
static inline pte_t pte_mkwrite(pte_t pte) { (pte).pte_low |= _PAGE_RW; return pte; }
static inline pte_t pte_mkhuge(pte_t pte) { (pte).pte_low |= _PAGE_PSE; return pte; }
+#define pte_pgprot(pte) (__pgprot(pte_val(pte) & ~PHYSICAL_PAGE_MASK))
+
#ifdef CONFIG_X86_PAE
# include <asm/pgtable-3level.h>
#else
--- linux-2.6.24-rc5/include/asm-x86/pgtable_64.h 2007-12-12 11:29:30.000000000 +0100
+++ 2.6.24-rc5-x86-change_page_attr/include/asm-x86/pgtable_64.h 2007-12-04 16:01:11.000000000 +0100
@@ -344,9 +344,9 @@ static inline int pmd_large(pmd_t pte) {
#define pfn_pmd(nr,prot) (__pmd(((nr) << PAGE_SHIFT) | pgprot_val(prot)))
#define pmd_pfn(x) ((pmd_val(x) & __PHYSICAL_MASK) >> PAGE_SHIFT)
-#define pte_to_pgoff(pte) ((pte_val(pte) & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT)
+#define pte_to_pgoff(pte) (pte_val(pte) >> PAGE_SHIFT)
#define pgoff_to_pte(off) ((pte_t) { ((off) << PAGE_SHIFT) | _PAGE_FILE })
-#define PTE_FILE_MAX_BITS __PHYSICAL_MASK_SHIFT
+#define PTE_FILE_MAX_BITS (64 - PAGE_SHIFT)
/* PTE - Level 1 access. */
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH] x86: fix ref-counting bug in change_page_attr()
2007-12-13 9:34 [PATCH] x86: fix ref-counting bug in change_page_attr() Jan Beulich
@ 2007-12-14 7:12 ` Jeremy Fitzhardinge
2007-12-14 8:38 ` Jan Beulich
2007-12-17 13:28 ` Ingo Molnar
1 sibling, 1 reply; 6+ messages in thread
From: Jeremy Fitzhardinge @ 2007-12-14 7:12 UTC (permalink / raw)
To: Jan Beulich; +Cc: tglx, mingo, hpa, Andi Kleen, linux-kernel
Jan Beulich wrote:
> When either calling change_page_attr() with the default attributes
> pages in the direct mapping have and a page's attributes already were
> set to the default or when changing the attributes from one non-default
> value to another, the reference counting broke, leading to either
> premature restoration of a large page or missing the opportunity to do
> so.
>
> At the same time, make __PHYSICAL_MASK_SHIFT on 64-bits the value it
> architecturally ought to have.
>
Could you put this in a separate patch? I have a bunch of page*.h and
pgtable*.h refactoring patches which will conflict with this.
J
> Signed-off-by: Jan Beulich <jbeulich@novell.com>
> Cc: Andi Kleen <ak@suse.de>
>
> arch/x86/mm/ioremap_64.c | 4 +-
> arch/x86/mm/pageattr_32.c | 84 +++++++++++++++++++++++++++++--------------
> arch/x86/mm/pageattr_64.c | 57 +++++++++++++++++++----------
> include/asm-x86/page_32.h | 10 +++++
> include/asm-x86/page_64.h | 2 -
> include/asm-x86/pgtable_32.h | 3 +
> include/asm-x86/pgtable_64.h | 4 +-
> 7 files changed, 114 insertions(+), 50 deletions(-)
>
> --- linux-2.6.24-rc5/arch/x86/mm/ioremap_64.c 2007-12-12 11:28:18.000000000 +0100
> +++ 2.6.24-rc5-x86-change_page_attr/arch/x86/mm/ioremap_64.c 2007-12-04 16:01:11.000000000 +0100
> @@ -48,7 +48,7 @@ ioremap_change_attr(unsigned long phys_a
> * Must use a address here and not struct page because the phys addr
> * can be a in hole between nodes and not have an memmap entry.
> */
> - err = change_page_attr_addr(vaddr,npages,__pgprot(__PAGE_KERNEL|flags));
> + err = change_page_attr_addr(vaddr,npages,MAKE_GLOBAL(__PAGE_KERNEL|flags));
> if (!err)
> global_flush_tlb();
> }
> @@ -199,7 +199,7 @@ void iounmap(volatile void __iomem *addr
>
> /* Reset the direct mapping. Can block */
> if (p->flags >> 20)
> - ioremap_change_attr(p->phys_addr, p->size, 0);
> + ioremap_change_attr(p->phys_addr, get_vm_area_size(p), 0);
>
> /* Finally remove it */
> o = remove_vm_area((void *)addr);
> --- linux-2.6.24-rc5/arch/x86/mm/pageattr_32.c 2007-12-12 11:28:18.000000000 +0100
> +++ 2.6.24-rc5-x86-change_page_attr/arch/x86/mm/pageattr_32.c 2007-12-04 16:01:11.000000000 +0100
> @@ -116,24 +116,22 @@ static void set_pmd_pte(pte_t *kpte, uns
> spin_unlock_irqrestore(&pgd_lock, flags);
> }
>
> +static pgprot_t _ref_prot[KERNEL_PGD_PTRS * PTRS_PER_PMD];
> +#define ref_prot(addr) _ref_prot[__pa(addr) >> PMD_SHIFT]
> +
> /*
> * No more special protections in this 2/4MB area - revert to a
> * large page again.
> */
> static inline void revert_page(struct page *kpte_page, unsigned long address)
> {
> - pgprot_t ref_prot;
> pte_t *linear;
>
> - ref_prot =
> - ((address & LARGE_PAGE_MASK) < (unsigned long)&_etext)
> - ? PAGE_KERNEL_LARGE_EXEC : PAGE_KERNEL_LARGE;
> -
> linear = (pte_t *)
> pmd_offset(pud_offset(pgd_offset_k(address), address), address);
> set_pmd_pte(linear, address,
> - pfn_pte((__pa(address) & LARGE_PAGE_MASK) >> PAGE_SHIFT,
> - ref_prot));
> + pte_mkhuge(pfn_pte((__pa(address) & LARGE_PAGE_MASK) >> PAGE_SHIFT,
> + ref_prot(address))));
> }
>
> static inline void save_page(struct page *kpte_page)
> @@ -142,12 +140,22 @@ static inline void save_page(struct page
> list_add(&kpte_page->lru, &df_list);
> }
>
> +static inline int pgprot_match(pgprot_t prot1, pgprot_t prot2)
> +{
> + return !((pgprot_val(prot1) ^ pgprot_val(prot2))
> +#ifdef CONFIG_X86_PAE
> + & __supported_pte_mask
> +#endif
> + & ~(_PAGE_ACCESSED|_PAGE_DIRTY));
> +}
> +
> static int
> __change_page_attr(struct page *page, pgprot_t prot)
> {
> pte_t *kpte;
> unsigned long address;
> struct page *kpte_page;
> + pgprot_t old_prot, ref_prot;
>
> BUG_ON(PageHighMem(page));
> address = (unsigned long)page_address(page);
> @@ -159,29 +167,31 @@ __change_page_attr(struct page *page, pg
> BUG_ON(PageLRU(kpte_page));
> BUG_ON(PageCompound(kpte_page));
>
> - if (pgprot_val(prot) != pgprot_val(PAGE_KERNEL)) {
> + old_prot = pte_pgprot(pte_clrhuge(*kpte));
> + ref_prot = ref_prot(address);
> + if (!pgprot_match(prot, ref_prot)) {
> if (!pte_huge(*kpte)) {
> set_pte_atomic(kpte, mk_pte(page, prot));
> } else {
> - pgprot_t ref_prot;
> - struct page *split;
> -
> - ref_prot =
> - ((address & LARGE_PAGE_MASK) < (unsigned long)&_etext)
> - ? PAGE_KERNEL_EXEC : PAGE_KERNEL;
> - split = split_large_page(address, prot, ref_prot);
> - if (!split)
> + BUG_ON(!pgprot_match(old_prot, ref_prot));
> + kpte_page = split_large_page(address, prot, ref_prot);
> + if (!kpte_page)
> return -ENOMEM;
> - set_pmd_pte(kpte,address,mk_pte(split, ref_prot));
> - kpte_page = split;
> + set_pmd_pte(kpte, address,
> + mk_pte(kpte_page, PAGE_KERNEL_EXEC));
> + }
> + if (!PageReserved(kpte_page)
> + && pgprot_match(old_prot, ref_prot))
> + page_private(kpte_page)++;
> + } else if (!pgprot_match(ref_prot, old_prot)) {
> + BUG_ON(pte_huge(*kpte));
> + set_pte_atomic(kpte, mk_pte(page, ref_prot));
> + if (!PageReserved(kpte_page)) {
> + BUG_ON(page_private(kpte_page) == 0);
> + page_private(kpte_page)--;
> }
> - page_private(kpte_page)++;
> - } else if (!pte_huge(*kpte)) {
> - set_pte_atomic(kpte, mk_pte(page, PAGE_KERNEL));
> - BUG_ON(page_private(kpte_page) == 0);
> - page_private(kpte_page)--;
> } else
> - BUG();
> + return 0;
>
> /*
> * If the pte was reserved, it means it was created at boot
> @@ -190,8 +200,17 @@ __change_page_attr(struct page *page, pg
> */
>
> save_page(kpte_page);
> - if (!PageReserved(kpte_page)) {
> - if (cpu_has_pse && (page_private(kpte_page) == 0)) {
> + if (!PageReserved(kpte_page) && cpu_has_pse) {
> + if (page_private(kpte_page) == PTRS_PER_PTE) {
> + unsigned i;
> +
> + kpte = page_address(kpte_page);
> + for (i = 0; i < PTRS_PER_PTE; ++i, ++kpte)
> + if (pgprot_match(pte_pgprot(*kpte), prot))
> + page_private(kpte_page)--;
> + ref_prot(address) = prot;
> + }
> + if (page_private(kpte_page) == 0) {
> paravirt_release_pt(page_to_pfn(kpte_page));
> revert_page(kpte_page, address);
> }
> @@ -222,8 +241,21 @@ int change_page_attr(struct page *page,
> int err = 0;
> int i;
> unsigned long flags;
> + static char first = 1;
>
> spin_lock_irqsave(&cpa_lock, flags);
> +
> + if (unlikely(first)) {
> + unsigned long addr = PAGE_OFFSET & PMD_MASK;
> +
> + /* This must match is_kernel_text(). */
> + for (; addr <= (unsigned long)__init_end; addr += PMD_SIZE)
> + ref_prot(addr) = PAGE_KERNEL_EXEC;
> + for (; addr > PAGE_OFFSET; addr += PMD_SIZE)
> + ref_prot(addr) = PAGE_KERNEL;
> + first = 0;
> + }
> +
> for (i = 0; i < numpages; i++, page++) {
> err = __change_page_attr(page, prot);
> if (err)
> --- linux-2.6.24-rc5/arch/x86/mm/pageattr_64.c 2007-12-12 11:28:18.000000000 +0100
> +++ 2.6.24-rc5-x86-change_page_attr/arch/x86/mm/pageattr_64.c 2007-12-04 16:01:11.000000000 +0100
> @@ -98,8 +98,14 @@ static inline void save_page(struct page
> list_add(&fpage->lru, &deferred_pages);
> }
>
> +/* protected by init_mm.mmap_sem */
> +static pgprot_t kref_prot[] = {
> + [0 ... (KERNEL_TEXT_SIZE - 1) >> PMD_SHIFT] = PAGE_KERNEL_EXEC
> +};
> +#define kref_prot(kaddr) kref_prot[((kaddr) - __START_KERNEL_map) >> PMD_SHIFT]
> +
> /*
> - * No more special protections in this 2/4MB area - revert to a
> + * No more special protections in this 2MB area - revert to a
> * large page again.
> */
> static void revert_page(unsigned long address, pgprot_t ref_prot)
> @@ -122,55 +128,68 @@ static void revert_page(unsigned long ad
> set_pte((pte_t *)pmd, large_pte);
> }
>
> +static inline int pgprot_match(pgprot_t prot1, pgprot_t prot2)
> +{
> + return !((pgprot_val(prot1) ^ pgprot_val(prot2))
> + & __supported_pte_mask & ~(_PAGE_ACCESSED|_PAGE_DIRTY));
> +}
> +
> static int
> __change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot,
> pgprot_t ref_prot)
> {
> pte_t *kpte;
> struct page *kpte_page;
> - pgprot_t ref_prot2;
> + pgprot_t old_prot;
>
> kpte = lookup_address(address);
> if (!kpte) return 0;
> - kpte_page = virt_to_page(((unsigned long)kpte) & PAGE_MASK);
> + kpte_page = virt_to_page(kpte);
> BUG_ON(PageLRU(kpte_page));
> BUG_ON(PageCompound(kpte_page));
> - if (pgprot_val(prot) != pgprot_val(ref_prot)) {
> + old_prot = pte_pgprot(pte_clrhuge(*kpte));
> + if (!pgprot_match(prot, ref_prot)) {
> if (!pte_huge(*kpte)) {
> set_pte(kpte, pfn_pte(pfn, prot));
> } else {
> - /*
> - * split_large_page will take the reference for this
> - * change_page_attr on the split page.
> - */
> - struct page *split;
> - ref_prot2 = pte_pgprot(pte_clrhuge(*kpte));
> - split = split_large_page(address, prot, ref_prot2);
> - if (!split)
> + BUG_ON(!pgprot_match(old_prot, ref_prot));
> + kpte_page = split_large_page(address, prot, ref_prot);
> + if (!kpte_page)
> return -ENOMEM;
> - pgprot_val(ref_prot2) &= ~_PAGE_NX;
> - set_pte(kpte, mk_pte(split, ref_prot2));
> - kpte_page = split;
> + set_pte(kpte, mk_pte(kpte_page, PAGE_KERNEL_EXEC));
> }
> - page_private(kpte_page)++;
> - } else if (!pte_huge(*kpte)) {
> + if (pgprot_match(old_prot, ref_prot))
> + page_private(kpte_page)++;
> + } else if (!pgprot_match(ref_prot, old_prot)) {
> + BUG_ON(pte_huge(*kpte));
> set_pte(kpte, pfn_pte(pfn, ref_prot));
> BUG_ON(page_private(kpte_page) == 0);
> page_private(kpte_page)--;
> } else
> - BUG();
> + return 0;
>
> /* on x86-64 the direct mapping set at boot is not using 4k pages */
> BUG_ON(PageReserved(kpte_page));
>
> save_page(kpte_page);
> + if (page_private(kpte_page) == PTRS_PER_PTE
> + && address >= __START_KERNEL_map
> + && address < __START_KERNEL_map + KERNEL_TEXT_SIZE) {
> + unsigned i;
> +
> + kpte = page_address(kpte_page);
> + for (i = 0; i < PTRS_PER_PTE; ++i, ++kpte)
> + if (pgprot_match(pte_pgprot(*kpte), prot))
> + page_private(kpte_page)--;
> + kref_prot(address) = ref_prot = prot;
> + }
> if (page_private(kpte_page) == 0)
> revert_page(address, ref_prot);
> return 0;
> }
>
> /*
> - * Change the page attributes of an page in the linear mapping.
> + * Change the page attributes of a page in the linear mapping.
> *
> * This should be used when a page is mapped with a different caching policy
> * than write-back somewhere - some CPUs do not like it when mappings with
> --- linux-2.6.24-rc5/include/asm-x86/page_32.h 2007-12-12 11:29:30.000000000 +0100
> +++ 2.6.24-rc5-x86-change_page_attr/include/asm-x86/page_32.h 2007-12-04 16:01:11.000000000 +0100
> @@ -6,6 +6,16 @@
> #define PAGE_SIZE (1UL << PAGE_SHIFT)
> #define PAGE_MASK (~(PAGE_SIZE-1))
>
> +#ifdef CONFIG_X86_PAE
> +#define __PHYSICAL_MASK_SHIFT 52
> +#define __PHYSICAL_MASK ((1ULL << __PHYSICAL_MASK_SHIFT) - 1)
> +#define PHYSICAL_PAGE_MASK (~(PAGE_SIZE - 1ULL) & __PHYSICAL_MASK)
> +#else
> +#define __PHYSICAL_MASK_SHIFT 32
> +#define __PHYSICAL_MASK (~0UL)
> +#define PHYSICAL_PAGE_MASK (PAGE_MASK & __PHYSICAL_MASK)
> +#endif
> +
> #define LARGE_PAGE_MASK (~(LARGE_PAGE_SIZE-1))
> #define LARGE_PAGE_SIZE (1UL << PMD_SHIFT)
>
> --- linux-2.6.24-rc5/include/asm-x86/page_64.h 2007-12-12 11:29:30.000000000 +0100
> +++ 2.6.24-rc5-x86-change_page_attr/include/asm-x86/page_64.h 2007-12-04 16:01:11.000000000 +0100
> @@ -98,7 +98,7 @@ extern unsigned long phys_base;
> #define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK)
>
> /* See Documentation/x86_64/mm.txt for a description of the memory map. */
> -#define __PHYSICAL_MASK_SHIFT 46
> +#define __PHYSICAL_MASK_SHIFT 52
> #define __PHYSICAL_MASK ((_AC(1,UL) << __PHYSICAL_MASK_SHIFT) - 1)
> #define __VIRTUAL_MASK_SHIFT 48
> #define __VIRTUAL_MASK ((_AC(1,UL) << __VIRTUAL_MASK_SHIFT) - 1)
> --- linux-2.6.24-rc5/include/asm-x86/pgtable_32.h 2007-12-12 11:29:30.000000000 +0100
> +++ 2.6.24-rc5-x86-change_page_attr/include/asm-x86/pgtable_32.h 2007-12-04 16:01:11.000000000 +0100
> @@ -228,11 +228,14 @@ static inline int pte_file(pte_t pte) {
> static inline pte_t pte_mkclean(pte_t pte) { (pte).pte_low &= ~_PAGE_DIRTY; return pte; }
> static inline pte_t pte_mkold(pte_t pte) { (pte).pte_low &= ~_PAGE_ACCESSED; return pte; }
> static inline pte_t pte_wrprotect(pte_t pte) { (pte).pte_low &= ~_PAGE_RW; return pte; }
> +static inline pte_t pte_clrhuge(pte_t pte) { (pte).pte_low &= ~_PAGE_PSE; return pte; }
> static inline pte_t pte_mkdirty(pte_t pte) { (pte).pte_low |= _PAGE_DIRTY; return pte; }
> static inline pte_t pte_mkyoung(pte_t pte) { (pte).pte_low |= _PAGE_ACCESSED; return pte; }
> static inline pte_t pte_mkwrite(pte_t pte) { (pte).pte_low |= _PAGE_RW; return pte; }
> static inline pte_t pte_mkhuge(pte_t pte) { (pte).pte_low |= _PAGE_PSE; return pte; }
>
> +#define pte_pgprot(pte) (__pgprot(pte_val(pte) & ~PHYSICAL_PAGE_MASK))
> +
> #ifdef CONFIG_X86_PAE
> # include <asm/pgtable-3level.h>
> #else
> --- linux-2.6.24-rc5/include/asm-x86/pgtable_64.h 2007-12-12 11:29:30.000000000 +0100
> +++ 2.6.24-rc5-x86-change_page_attr/include/asm-x86/pgtable_64.h 2007-12-04 16:01:11.000000000 +0100
> @@ -344,9 +344,9 @@ static inline int pmd_large(pmd_t pte) {
> #define pfn_pmd(nr,prot) (__pmd(((nr) << PAGE_SHIFT) | pgprot_val(prot)))
> #define pmd_pfn(x) ((pmd_val(x) & __PHYSICAL_MASK) >> PAGE_SHIFT)
>
> -#define pte_to_pgoff(pte) ((pte_val(pte) & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT)
> +#define pte_to_pgoff(pte) (pte_val(pte) >> PAGE_SHIFT)
> #define pgoff_to_pte(off) ((pte_t) { ((off) << PAGE_SHIFT) | _PAGE_FILE })
> -#define PTE_FILE_MAX_BITS __PHYSICAL_MASK_SHIFT
> +#define PTE_FILE_MAX_BITS (64 - PAGE_SHIFT)
>
> /* PTE - Level 1 access. */
>
>
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
>
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH] x86: fix ref-counting bug in change_page_attr()
2007-12-14 7:12 ` Jeremy Fitzhardinge
@ 2007-12-14 8:38 ` Jan Beulich
0 siblings, 0 replies; 6+ messages in thread
From: Jan Beulich @ 2007-12-14 8:38 UTC (permalink / raw)
To: Jeremy Fitzhardinge; +Cc: tglx, mingo, Andi Kleen, linux-kernel, hpa
>>> Jeremy Fitzhardinge <jeremy@goop.org> 14.12.07 08:12 >>>
>Jan Beulich wrote:
>> When either calling change_page_attr() with the default attributes
>> pages in the direct mapping have and a page's attributes already were
>> set to the default or when changing the attributes from one non-default
>> value to another, the reference counting broke, leading to either
>> premature restoration of a large page or missing the opportunity to do
>> so.
>>
>> At the same time, make __PHYSICAL_MASK_SHIFT on 64-bits the value it
>> architecturally ought to have.
>>
>
>Could you put this in a separate patch? I have a bunch of page*.h and
>pgtable*.h refactoring patches which will conflict with this.
It doesn't seem logical to do so: The patch needs to introduce the definitions
for 32-bits (in order to define pte_pgprot()), and not doing the adjustment
for 64-bits here means (a) becoming inconsistent and (b) the pte_pgprot()
there would be incorrect. So such a split out patch would need to be a pre-
requisite to the one here, which wouldn't help avoiding the collisions with
your unification patches.
Jan
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH] x86: fix ref-counting bug in change_page_attr()
2007-12-13 9:34 [PATCH] x86: fix ref-counting bug in change_page_attr() Jan Beulich
2007-12-14 7:12 ` Jeremy Fitzhardinge
@ 2007-12-17 13:28 ` Ingo Molnar
2007-12-17 13:41 ` Jan Beulich
1 sibling, 1 reply; 6+ messages in thread
From: Ingo Molnar @ 2007-12-17 13:28 UTC (permalink / raw)
To: Jan Beulich; +Cc: tglx, mingo, hpa, Andi Kleen, linux-kernel, Andrew Morton
* Jan Beulich <jbeulich@novell.com> wrote:
> When either calling change_page_attr() with the default attributes
> pages in the direct mapping have and a page's attributes already were
> set to the default or when changing the attributes from one
> non-default value to another, the reference counting broke, leading to
> either premature restoration of a large page or missing the
> opportunity to do so.
>
> At the same time, make __PHYSICAL_MASK_SHIFT on 64-bits the value it
> architecturally ought to have.
hm, does this patch fix any real bug seen live? (if yes then do you have
links to it, etc?) It would be a v2.6.24 fix in theory but it looks too
dangerous for that. So I've queued it up for v2.6.25, for the time
being.
Ingo
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH] x86: fix ref-counting bug in change_page_attr()
2007-12-17 13:28 ` Ingo Molnar
@ 2007-12-17 13:41 ` Jan Beulich
2007-12-17 14:34 ` Ingo Molnar
0 siblings, 1 reply; 6+ messages in thread
From: Jan Beulich @ 2007-12-17 13:41 UTC (permalink / raw)
To: Ingo Molnar; +Cc: tglx, Andrew Morton, mingo, Andi Kleen, linux-kernel, hpa
>>> Ingo Molnar <mingo@elte.hu> 17.12.07 14:28 >>>
>
>* Jan Beulich <jbeulich@novell.com> wrote:
>
>> When either calling change_page_attr() with the default attributes
>> pages in the direct mapping have and a page's attributes already were
>> set to the default or when changing the attributes from one
>> non-default value to another, the reference counting broke, leading to
>> either premature restoration of a large page or missing the
>> opportunity to do so.
>>
>> At the same time, make __PHYSICAL_MASK_SHIFT on 64-bits the value it
>> architecturally ought to have.
>
>hm, does this patch fix any real bug seen live? (if yes then do you have
>links to it, etc?) It would be a v2.6.24 fix in theory but it looks too
>dangerous for that. So i've queued it up for v2.6.25, for the time
>being.
I had run into the issue with experimental code of mine quite a while back
(i.e. I don't even recall what exactly I was doing back then). After that I
just continued to keep the fix (which I had submitted before, but which
collided with something in Andi's tree back then iirc).
Jan
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH] x86: fix ref-counting bug in change_page_attr()
2007-12-17 13:41 ` Jan Beulich
@ 2007-12-17 14:34 ` Ingo Molnar
0 siblings, 0 replies; 6+ messages in thread
From: Ingo Molnar @ 2007-12-17 14:34 UTC (permalink / raw)
To: Jan Beulich; +Cc: tglx, Andrew Morton, mingo, Andi Kleen, linux-kernel, hpa
* Jan Beulich <jbeulich@novell.com> wrote:
> > hm, does this patch fix any real bug seen live? (if yes then do you
> > have links to it, etc?) It would be a v2.6.24 fix in theory but it
> > looks too dangerous for that. So i've queued it up for v2.6.25, for
> > the time being.
>
> I had run into the issue with experimental code of mine quite a while
> back (i.e. I don't even recall what exactly I was doing back then).
> After that I just continued to keep the fix (which I had submitted
> before, but which collided with something in Andi's tree back then
> iirc).
I'd be inclined to have it in the for-2.6.25 queue then - even plain
bugfixes in the pageattr code have broken boxes in the past, so this
isn't something we want to touch so late in the cycle. (We can still
reprioritize it if a serious regression pops up that this addresses.)
Ingo
^ permalink raw reply [flat|nested] 6+ messages in thread
end of thread, other threads:[~2007-12-17 14:34 UTC | newest]
Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-12-13 9:34 [PATCH] x86: fix ref-counting bug in change_page_attr() Jan Beulich
2007-12-14 7:12 ` Jeremy Fitzhardinge
2007-12-14 8:38 ` Jan Beulich
2007-12-17 13:28 ` Ingo Molnar
2007-12-17 13:41 ` Jan Beulich
2007-12-17 14:34 ` Ingo Molnar
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).