linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* Backport of Ingo/Arjan highpte to 2.4.18 (+O1 scheduler)
@ 2002-03-21 23:10 Martin J. Bligh
  2002-03-21 23:20 ` Martin J. Bligh
  2002-03-26 17:08 ` Andrea Arcangeli
  0 siblings, 2 replies; 9+ messages in thread
From: Martin J. Bligh @ 2002-03-21 23:10 UTC (permalink / raw)
  To: linux-kernel; +Cc: gerrit

The two smaller patches they published against 2.5.5 work fine, but
the main one needed a little work - this seems to work for me, but
I've only really touch-tested it ... just thought this might save someone
some time .... there didn't seem to be anything too complex in the merge,
but that doesn't necessarily mean I got it right ;-)

M.

diff -urN linux-2.4.18-prepte/arch/i386/config.in linux-2.4.18-highpte/arch/i386/config.in
--- linux-2.4.18-prepte/arch/i386/config.in	Mon Feb 25 11:37:52 2002
+++ linux-2.4.18-highpte/arch/i386/config.in	Thu Mar 21 17:37:29 2002
@@ -171,16 +171,27 @@
 tristate '/dev/cpu/*/cpuid - CPU information support' CONFIG_X86_CPUID
 
 choice 'High Memory Support' \
-	"off    CONFIG_NOHIGHMEM \
-	 4GB    CONFIG_HIGHMEM4G \
-	 64GB   CONFIG_HIGHMEM64G" off
+	"off           CONFIG_NOHIGHMEM \
+	 4GB           CONFIG_HIGHMEM4G \
+	 4GB-highpte   CONFIG_HIGHMEM4G_HIGHPTE \
+	 64GB          CONFIG_HIGHMEM64G \
+	 64GB-highpte  CONFIG_HIGHMEM64G_HIGHPTE" off
 if [ "$CONFIG_HIGHMEM4G" = "y" ]; then
    define_bool CONFIG_HIGHMEM y
 fi
+if [ "$CONFIG_HIGHMEM4G_HIGHPTE" = "y" ]; then
+   define_bool CONFIG_HIGHMEM y
+   define_bool CONFIG_HIGHPTE y
+fi
 if [ "$CONFIG_HIGHMEM64G" = "y" ]; then
    define_bool CONFIG_HIGHMEM y
    define_bool CONFIG_X86_PAE y
 fi
+if [ "$CONFIG_HIGHMEM64G_HIGHPTE" = "y" ]; then
+   define_bool CONFIG_HIGHMEM y
+   define_bool CONFIG_HIGHPTE y
+   define_bool CONFIG_X86_PAE y
+fi
 
 bool 'Math emulation' CONFIG_MATH_EMULATION
 bool 'MTRR (Memory Type Range Register) support' CONFIG_MTRR
@@ -416,11 +427,13 @@
 
 bool 'Kernel debugging' CONFIG_DEBUG_KERNEL
 if [ "$CONFIG_DEBUG_KERNEL" != "n" ]; then
-   bool '  Debug high memory support' CONFIG_DEBUG_HIGHMEM
    bool '  Debug memory allocations' CONFIG_DEBUG_SLAB
    bool '  Memory mapped I/O debugging' CONFIG_DEBUG_IOVIRT
    bool '  Magic SysRq key' CONFIG_MAGIC_SYSRQ
    bool '  Spinlock debugging' CONFIG_DEBUG_SPINLOCK
+   if [ "$CONFIG_HIGHMEM" = "y" ]; then
+      bool '  Highmem debugging' CONFIG_DEBUG_HIGHMEM
+   fi
    bool '  Verbose BUG() reporting (adds 70K)' CONFIG_DEBUG_BUGVERBOSE
 fi
 
diff -urN linux-2.4.18-prepte/arch/i386/kernel/process.c linux-2.4.18-highpte/arch/i386/kernel/process.c
--- linux-2.4.18-prepte/arch/i386/kernel/process.c	Thu Mar 21 19:20:08 2002
+++ linux-2.4.18-highpte/arch/i386/kernel/process.c	Thu Mar 21 17:37:29 2002
@@ -132,7 +132,6 @@
 		if (!current->need_resched)
 			idle();
 		schedule();
-		check_pgt_cache();
 	}
 }
 
diff -urN linux-2.4.18-prepte/arch/i386/kernel/smpboot.c linux-2.4.18-highpte/arch/i386/kernel/smpboot.c
--- linux-2.4.18-prepte/arch/i386/kernel/smpboot.c	Thu Mar 21 19:20:08 2002
+++ linux-2.4.18-highpte/arch/i386/kernel/smpboot.c	Thu Mar 21 17:37:29 2002
@@ -144,10 +144,6 @@
 	struct cpuinfo_x86 *c = cpu_data + id;
 
 	*c = boot_cpu_data;
-	c->pte_quick = 0;
-	c->pmd_quick = 0;
-	c->pgd_quick = 0;
-	c->pgtable_cache_sz = 0;
 	identify_cpu(c);
 	/*
 	 * Mask B, Pentium, but not Pentium MMX
diff -urN linux-2.4.18-prepte/arch/i386/kernel/traps.c linux-2.4.18-highpte/arch/i386/kernel/traps.c
--- linux-2.4.18-prepte/arch/i386/kernel/traps.c	Sun Sep 30 12:26:08 2001
+++ linux-2.4.18-highpte/arch/i386/kernel/traps.c	Thu Mar 21 17:37:29 2002
@@ -734,7 +734,7 @@
 	page = (unsigned long) vmalloc(PAGE_SIZE);
 	pgd = pgd_offset(&init_mm, page);
 	pmd = pmd_offset(pgd, page);
-	pte = pte_offset(pmd, page);
+	pte = pte_offset_kernel(pmd, page);
 	__free_page(pte_page(*pte));
 	*pte = mk_pte_phys(__pa(&idt_table), PAGE_KERNEL_RO);
 	/*
diff -urN linux-2.4.18-prepte/arch/i386/kernel/vm86.c linux-2.4.18-highpte/arch/i386/kernel/vm86.c
--- linux-2.4.18-prepte/arch/i386/kernel/vm86.c	Mon Feb 25 11:37:53 2002
+++ linux-2.4.18-highpte/arch/i386/kernel/vm86.c	Thu Mar 21 17:37:29 2002
@@ -94,7 +94,7 @@
 {
 	pgd_t *pgd;
 	pmd_t *pmd;
-	pte_t *pte;
+	pte_t *pte, *mapped;
 	int i;
 
 	pgd = pgd_offset(tsk->mm, 0xA0000);
@@ -113,12 +113,15 @@
 		pmd_clear(pmd);
 		return;
 	}
-	pte = pte_offset(pmd, 0xA0000);
+	preempt_disable();
+	pte = mapped = pte_offset_map(pmd, 0xA0000);
 	for (i = 0; i < 32; i++) {
 		if (pte_present(*pte))
 			set_pte(pte, pte_wrprotect(*pte));
 		pte++;
 	}
+	pte_unmap(mapped);
+	preempt_enable();
 	flush_tlb();
 }
 
diff -urN linux-2.4.18-prepte/arch/i386/mm/fault.c linux-2.4.18-highpte/arch/i386/mm/fault.c
--- linux-2.4.18-prepte/arch/i386/mm/fault.c	Thu Mar 21 19:20:08 2002
+++ linux-2.4.18-highpte/arch/i386/mm/fault.c	Thu Mar 21 17:37:29 2002
@@ -324,12 +324,20 @@
 	asm("movl %%cr3,%0":"=r" (page));
 	page = ((unsigned long *) __va(page))[address >> 22];
 	printk(KERN_ALERT "*pde = %08lx\n", page);
+	/*
+	 * We must not directly access the pte in the highpte
+	 * case, the page table might be allocated in highmem.
+	 * And lets rather not kmap-atomic the pte, just in case
+	 * it's allocated already.
+	 */
+#ifndef CONFIG_HIGHPTE
 	if (page & 1) {
 		page &= PAGE_MASK;
 		address &= 0x003ff000;
 		page = ((unsigned long *) __va(page))[address >> PAGE_SHIFT];
 		printk(KERN_ALERT "*pte = %08lx\n", page);
 	}
+#endif
 	die("Oops", regs, error_code);
 	bust_spinlocks(0);
 	do_exit(SIGKILL);
@@ -399,7 +407,7 @@
 			goto no_context;
 		set_pmd(pmd, *pmd_k);
 
-		pte_k = pte_offset(pmd_k, address);
+		pte_k = pte_offset_kernel(pmd_k, address);
 		if (!pte_present(*pte_k))
 			goto no_context;
 		return;
diff -urN linux-2.4.18-prepte/arch/i386/mm/init.c linux-2.4.18-highpte/arch/i386/mm/init.c
--- linux-2.4.18-prepte/arch/i386/mm/init.c	Fri Dec 21 09:41:53 2001
+++ linux-2.4.18-highpte/arch/i386/mm/init.c	Thu Mar 21 17:37:29 2002
@@ -43,28 +43,6 @@
 static unsigned long totalram_pages;
 static unsigned long totalhigh_pages;
 
-int do_check_pgt_cache(int low, int high)
-{
-	int freed = 0;
-	if(pgtable_cache_size > high) {
-		do {
-			if (pgd_quicklist) {
-				free_pgd_slow(get_pgd_fast());
-				freed++;
-			}
-			if (pmd_quicklist) {
-				pmd_free_slow(pmd_alloc_one_fast(NULL, 0));
-				freed++;
-			}
-			if (pte_quicklist) {
-				pte_free_slow(pte_alloc_one_fast(NULL, 0));
-				freed++;
-			}
-		} while(pgtable_cache_size > low);
-	}
-	return freed;
-}
-
 /*
  * NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the
  * physical space so we can cache the place of the first one and move
@@ -76,7 +54,7 @@
 pgprot_t kmap_prot;
 
 #define kmap_get_fixmap_pte(vaddr)					\
-	pte_offset(pmd_offset(pgd_offset_k(vaddr), (vaddr)), (vaddr))
+	pte_offset_kernel(pmd_offset(pgd_offset_k(vaddr), (vaddr)), (vaddr))
 
 void __init kmap_init(void)
 {
@@ -116,7 +94,6 @@
 	printk("%d reserved pages\n",reserved);
 	printk("%d pages shared\n",shared);
 	printk("%d pages swap cached\n",cached);
-	printk("%ld pages in page table cache\n",pgtable_cache_size);
 	show_buffers();
 }
 
@@ -143,7 +120,7 @@
 		printk("PAE BUG #01!\n");
 		return;
 	}
-	pte = pte_offset(pmd, vaddr);
+	pte = pte_offset_kernel(pmd, vaddr);
 	if (pte_val(*pte))
 		pte_ERROR(*pte);
 	pgprot_val(prot) = pgprot_val(PAGE_KERNEL) | pgprot_val(flags);
@@ -196,7 +173,7 @@
 			if (pmd_none(*pmd)) {
 				pte = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE);
 				set_pmd(pmd, __pmd(_KERNPG_TABLE + __pa(pte)));
-				if (pte != pte_offset(pmd, 0))
+				if (pte != pte_offset_kernel(pmd, 0))
 					BUG();
 			}
 			vaddr += PMD_SIZE;
@@ -267,7 +244,7 @@
 				*pte = mk_pte_phys(__pa(vaddr), PAGE_KERNEL);
 			}
 			set_pmd(pmd, __pmd(_KERNPG_TABLE + __pa(pte_base)));
-			if (pte_base != pte_offset(pmd, 0))
+			if (pte_base != pte_offset_kernel(pmd, 0))
 				BUG();
 
 		}
@@ -289,7 +266,7 @@
 
 	pgd = swapper_pg_dir + __pgd_offset(vaddr);
 	pmd = pmd_offset(pgd, vaddr);
-	pte = pte_offset(pmd, vaddr);
+	pte = pte_offset_kernel(pmd, vaddr);
 	pkmap_page_table = pte;
 #endif
 
@@ -398,7 +375,7 @@
 
 	pgd = swapper_pg_dir + __pgd_offset(vaddr);
 	pmd = pmd_offset(pgd, vaddr);
-	pte = pte_offset(pmd, vaddr);
+	pte = pte_offset_kernel(pmd, vaddr);
 	old_pte = *pte;
 	*pte = mk_pte_phys(0, PAGE_READONLY);
 	local_flush_tlb();
diff -urN linux-2.4.18-prepte/arch/i386/mm/ioremap.c linux-2.4.18-highpte/arch/i386/mm/ioremap.c
--- linux-2.4.18-prepte/arch/i386/mm/ioremap.c	Tue Mar 20 08:13:33 2001
+++ linux-2.4.18-highpte/arch/i386/mm/ioremap.c	Thu Mar 21 17:37:29 2002
@@ -49,7 +49,7 @@
 	if (address >= end)
 		BUG();
 	do {
-		pte_t * pte = pte_alloc(&init_mm, pmd, address);
+		pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address);
 		if (!pte)
 			return -ENOMEM;
 		remap_area_pte(pte, address, end - address, address + phys_addr, flags);
diff -urN linux-2.4.18-prepte/drivers/char/drm/drm_proc.h linux-2.4.18-highpte/drivers/char/drm/drm_proc.h
--- linux-2.4.18-prepte/drivers/char/drm/drm_proc.h	Thu Nov 22 11:46:37 2001
+++ linux-2.4.18-highpte/drivers/char/drm/drm_proc.h	Thu Mar 21 17:37:24 2002
@@ -449,7 +449,8 @@
 		for (i = vma->vm_start; i < vma->vm_end; i += PAGE_SIZE) {
 			pgd = pgd_offset(vma->vm_mm, i);
 			pmd = pmd_offset(pgd, i);
-			pte = pte_offset(pmd, i);
+			preempt_disable();
+			pte = pte_offset_map(pmd, i);
 			if (pte_present(*pte)) {
 				address = __pa(pte_page(*pte))
 					+ (i & (PAGE_SIZE-1));
@@ -465,6 +466,8 @@
 			} else {
 				DRM_PROC_PRINT("      0x%08lx\n", i);
 			}
+			pte_unmap(pte);
+			preempt_enable();
 		}
 #endif
 	}
diff -urN linux-2.4.18-prepte/drivers/char/drm/drm_scatter.h linux-2.4.18-highpte/drivers/char/drm/drm_scatter.h
--- linux-2.4.18-prepte/drivers/char/drm/drm_scatter.h	Thu Nov 22 11:46:37 2001
+++ linux-2.4.18-highpte/drivers/char/drm/drm_scatter.h	Thu Mar 21 17:37:24 2002
@@ -143,9 +143,15 @@
 		if ( !pmd_present( *pmd ) )
 			goto failed;
 
-		pte = pte_offset( pmd, i );
-		if ( !pte_present( *pte ) )
+		preempt_disable();
+		pte = pte_offset_map( pmd, i );
+		if ( !pte_present( *pte ) ) {
+			pte_unmap(pte);
+			preempt_enable();
 			goto failed;
+		}
+		entry->pagelist[j] = pte_page( *pte );
+		pte_unmap(pte);
+		preempt_enable();
 
-		entry->pagelist[j] = pte_page( *pte );
 
diff -urN linux-2.4.18-prepte/drivers/char/drm/drm_vm.h linux-2.4.18-highpte/drivers/char/drm/drm_vm.h
--- linux-2.4.18-prepte/drivers/char/drm/drm_vm.h	Thu Nov 22 11:46:37 2001
+++ linux-2.4.18-highpte/drivers/char/drm/drm_vm.h	Thu Mar 21 17:37:24 2002
@@ -169,8 +169,15 @@
 	if( !pgd_present( *pgd ) ) return NOPAGE_OOM;
 	pmd = pmd_offset( pgd, i );
 	if( !pmd_present( *pmd ) ) return NOPAGE_OOM;
-	pte = pte_offset( pmd, i );
-	if( !pte_present( *pte ) ) return NOPAGE_OOM;
+	preempt_disable();
+	pte = pte_offset_map( pmd, i );
+	if( !pte_present( *pte ) ) {
+		pte_unmap(pte);
+		preempt_enable();
+		return NOPAGE_OOM;
+	}
+	page = pte_page(*pte);
+	pte_unmap(pte);
+	preempt_enable();
 
-	page = pte_page(*pte);
 	get_page(page);
diff -urN linux-2.4.18-prepte/drivers/sgi/char/graphics.c linux-2.4.18-highpte/drivers/sgi/char/graphics.c
--- linux-2.4.18-prepte/drivers/sgi/char/graphics.c	Thu Oct 11 09:43:30 2001
+++ linux-2.4.18-highpte/drivers/sgi/char/graphics.c	Thu Mar 21 17:37:24 2002
@@ -221,6 +221,7 @@
 	int board = GRAPHICS_CARD (vma->vm_dentry->d_inode->i_rdev);
 
 	unsigned long virt_add, phys_add;
+	struct page * page;
 
 #ifdef DEBUG
 	printk ("Got a page fault for board %d address=%lx guser=%lx\n", board,
@@ -247,8 +248,10 @@
 
 	pgd = pgd_offset(current->mm, address);
 	pmd = pmd_offset(pgd, address);
-	pte = pte_offset(pmd, address);
-	return pte_page(*pte);
+	pte = pte_kmap_offset(pmd, address);
+	page = pte_page(*pte);
+	pte_kunmap(pte);
+	return page;
 }
 
 /*
diff -urN linux-2.4.18-prepte/drivers/usb/stv680.c linux-2.4.18-highpte/drivers/usb/stv680.c
--- linux-2.4.18-prepte/drivers/usb/stv680.c	Mon Feb 25 11:38:07 2002
+++ linux-2.4.18-highpte/drivers/usb/stv680.c	Thu Mar 21 17:37:24 2002
@@ -133,8 +133,11 @@
 	if (!pgd_none (*pgd)) {
 		pmd = pmd_offset (pgd, adr);
 		if (!pmd_none (*pmd)) {
-			ptep = pte_offset (pmd, adr);
+			preempt_disable();
+			ptep = pte_offset_map (pmd, adr);
 			pte = *ptep;
+			pte_unmap(ptep);
+			preempt_enable();
 			if (pte_present (pte)) {
 				ret = (unsigned long) page_address (pte_page (pte));
 				ret |= (adr & (PAGE_SIZE - 1));
diff -urN linux-2.4.18-prepte/drivers/usb/vicam.c linux-2.4.18-highpte/drivers/usb/vicam.c
--- linux-2.4.18-prepte/drivers/usb/vicam.c	Mon Feb 25 11:38:07 2002
+++ linux-2.4.18-highpte/drivers/usb/vicam.c	Thu Mar 21 17:37:24 2002
@@ -115,8 +115,11 @@
 	if (!pgd_none(*pgd)) {
 		pmd = pmd_offset(pgd, adr);
 		if (!pmd_none(*pmd)) {
-			ptep = pte_offset(pmd, adr);
+			preempt_disable();
+			ptep = pte_offset_map(pmd, adr);
 			pte = *ptep;
+			pte_unmap(ptep);
+			preempt_enable();
 			if(pte_present(pte)) {
 				ret  = (unsigned long) page_address(pte_page(pte));
 				ret |= (adr & (PAGE_SIZE - 1));
diff -urN linux-2.4.18-prepte/fs/exec.c linux-2.4.18-highpte/fs/exec.c
--- linux-2.4.18-prepte/fs/exec.c	Fri Dec 21 09:41:55 2001
+++ linux-2.4.18-highpte/fs/exec.c	Thu Mar 21 17:37:24 2002
@@ -270,15 +270,18 @@
 	pmd = pmd_alloc(tsk->mm, pgd, address);
 	if (!pmd)
 		goto out;
-	pte = pte_alloc(tsk->mm, pmd, address);
+	pte = pte_alloc_map(tsk->mm, pmd, address);
 	if (!pte)
 		goto out;
-	if (!pte_none(*pte))
+	if (!pte_none(*pte)) {
+		pte_unmap(pte);
 		goto out;
+	}
 	lru_cache_add(page);
 	flush_dcache_page(page);
 	flush_page_to_ram(page);
 	set_pte(pte, pte_mkdirty(pte_mkwrite(mk_pte(page, PAGE_COPY))));
+	pte_unmap(pte);
 	tsk->mm->rss++;
 	spin_unlock(&tsk->mm->page_table_lock);
 
diff -urN linux-2.4.18-prepte/fs/proc/array.c linux-2.4.18-highpte/fs/proc/array.c
--- linux-2.4.18-prepte/fs/proc/array.c	Thu Mar 21 19:20:08 2002
+++ linux-2.4.18-highpte/fs/proc/array.c	Thu Mar 21 17:37:24 2002
@@ -392,11 +392,10 @@
 	return res;
 }
 		
-static inline void statm_pte_range(pmd_t * pmd, unsigned long address, unsigned long size,
-	int * pages, int * shared, int * dirty, int * total)
+static inline void statm_pte_range(pmd_t * pmd, unsigned long address, unsigned long size, int * pages, int * shared, int * dirty, int * total)
 {
-	pte_t * pte;
-	unsigned long end;
+	unsigned long end, pmd_end;
+	pte_t *pte;
 
 	if (pmd_none(*pmd))
 		return;
@@ -405,11 +404,12 @@
 		pmd_clear(pmd);
 		return;
 	}
-	pte = pte_offset(pmd, address);
-	address &= ~PMD_MASK;
+	preempt_disable();
+	pte = pte_offset_map(pmd, address);
 	end = address + size;
-	if (end > PMD_SIZE)
-		end = PMD_SIZE;
+	pmd_end = (address + PMD_SIZE) & PMD_MASK;
+	if (end > pmd_end)
+		end = pmd_end;
 	do {
 		pte_t page = *pte;
 		struct page *ptpage;
@@ -430,6 +430,8 @@
 		if (page_count(pte_page(page)) > 1)
 			++*shared;
 	} while (address < end);
+	pte_unmap(pte - 1);
+	preempt_enable();
 }
 
 static inline void statm_pmd_range(pgd_t * pgd, unsigned long address, unsigned long size,
diff -urN linux-2.4.18-prepte/include/asm-i386/highmem.h linux-2.4.18-highpte/include/asm-i386/highmem.h
--- linux-2.4.18-prepte/include/asm-i386/highmem.h	Thu Nov 22 11:46:19 2001
+++ linux-2.4.18-highpte/include/asm-i386/highmem.h	Thu Mar 21 17:37:24 2002
@@ -26,12 +26,6 @@
 #include <asm/kmap_types.h>
 #include <asm/pgtable.h>
 
-#ifdef CONFIG_DEBUG_HIGHMEM
-#define HIGHMEM_DEBUG 1
-#else
-#define HIGHMEM_DEBUG 0
-#endif
-
 /* declarations for highmem.c */
 extern unsigned long highstart_pfn, highend_pfn;
 
@@ -93,7 +87,7 @@
 
 	idx = type + KM_TYPE_NR*smp_processor_id();
 	vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
-#if HIGHMEM_DEBUG
+#if CONFIG_DEBUG_HIGHMEM
 	if (!pte_none(*(kmap_pte-idx)))
 		BUG();
 #endif
@@ -105,8 +99,8 @@
 
 static inline void kunmap_atomic(void *kvaddr, enum km_type type)
 {
-#if HIGHMEM_DEBUG
-	unsigned long vaddr = (unsigned long) kvaddr;
+#if CONFIG_DEBUG_HIGHMEM
+	unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
 	enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id();
 
 	if (vaddr < FIXADDR_START) // FIXME
diff -urN linux-2.4.18-prepte/include/asm-i386/kmap_types.h linux-2.4.18-highpte/include/asm-i386/kmap_types.h
--- linux-2.4.18-prepte/include/asm-i386/kmap_types.h	Mon Sep 17 13:16:30 2001
+++ linux-2.4.18-highpte/include/asm-i386/kmap_types.h	Thu Mar 21 19:16:24 2002
@@ -1,13 +1,26 @@
 #ifndef _ASM_KMAP_TYPES_H
 #define _ASM_KMAP_TYPES_H
 
+#include <linux/config.h>
+
+#if CONFIG_DEBUG_HIGHMEM
+# define D(n) __KM_FENCE_##n ,
+#else
+# define D(n)
+#endif
+
 enum km_type {
-	KM_BOUNCE_READ,
-	KM_SKB_DATA,
-	KM_SKB_DATA_SOFTIRQ,
-	KM_USER0,
-	KM_USER1,
-	KM_TYPE_NR
+D(0)  KM_BOUNCE_READ,
+D(1)  KM_SKB_DATA,
+D(2)  KM_SKB_DATA_SOFTIRQ,
+D(3)  KM_USER0,
+D(4)  KM_USER1,
+D(5)  KM_BIO_IRQ,
+D(6)  KM_PTE0,
+D(7)  KM_PTE1,
+D(8)  KM_TYPE_NR
 };
+
+#undef D
 
 #endif
diff -urN linux-2.4.18-prepte/include/asm-i386/pgalloc.h linux-2.4.18-highpte/include/asm-i386/pgalloc.h
--- linux-2.4.18-prepte/include/asm-i386/pgalloc.h	Thu Mar 21 19:20:08 2002
+++ linux-2.4.18-highpte/include/asm-i386/pgalloc.h	Thu Mar 21 19:14:29 2002
@@ -5,15 +5,17 @@
 #include <asm/processor.h>
 #include <asm/fixmap.h>
 #include <linux/threads.h>
+#include <linux/highmem.h>
 
-#define pgd_quicklist (current_cpu_data.pgd_quick)
-#define pmd_quicklist (current_cpu_data.pmd_quick)
-#define pte_quicklist (current_cpu_data.pte_quick)
-#define pgtable_cache_size (current_cpu_data.pgtable_cache_sz)
-
-#define pmd_populate(mm, pmd, pte) \
+#define pmd_populate_kernel(mm, pmd, pte) \
 		set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(pte)))
 
+static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte)
+{
+	set_pmd(pmd, __pmd(_PAGE_TABLE +
+		((unsigned long long)(pte - mem_map) <<
+			(unsigned long long) PAGE_SHIFT)));
+}
 /*
  * Allocate and free page tables.
  */
@@ -29,7 +31,7 @@
 extern void kmem_cache_free(struct kmem_cache_s *, void *);
 
 
-static inline pgd_t *get_pgd_slow(void)
+static inline pgd_t *pgd_alloc(struct mm_struct *mm)
 {
 	int i;
 	pgd_t *pgd = kmem_cache_alloc(pae_pgd_cachep, GFP_KERNEL);
@@ -56,7 +58,7 @@
 
 #else
 
-static inline pgd_t *get_pgd_slow(void)
+static inline pgd_t *pgd_alloc(struct mm_struct *mm)
 {
 	pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL);
 
@@ -71,27 +73,7 @@
 
 #endif /* CONFIG_X86_PAE */
 
-static inline pgd_t *get_pgd_fast(void)
-{
-	unsigned long *ret;
-
-	if ((ret = pgd_quicklist) != NULL) {
-		pgd_quicklist = (unsigned long *)(*ret);
-		ret[0] = 0;
-		pgtable_cache_size--;
-	} else
-		ret = (unsigned long *)get_pgd_slow();
-	return (pgd_t *)ret;
-}
-
-static inline void free_pgd_fast(pgd_t *pgd)
-{
-	*(unsigned long *)pgd = (unsigned long) pgd_quicklist;
-	pgd_quicklist = (unsigned long *) pgd;
-	pgtable_cache_size++;
-}
-
-static inline void free_pgd_slow(pgd_t *pgd)
+static inline void pgd_free(pgd_t *pgd)
 {
 #if defined(CONFIG_X86_PAE)
 	int i;
@@ -104,59 +86,64 @@
 #endif
 }
 
-static inline pte_t *pte_alloc_one(struct mm_struct *mm, unsigned long address)
+
+static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
 {
+	int count = 0;
 	pte_t *pte;
-
-	pte = (pte_t *) __get_free_page(GFP_KERNEL);
-	if (pte)
-		clear_page(pte);
+   
+   	do {
+		pte = (pte_t *) __get_free_page(GFP_KERNEL);
+		if (pte)
+			clear_page(pte);
+		else {
+			current->state = TASK_UNINTERRUPTIBLE;
+			schedule_timeout(HZ);
+		}
+	} while (!pte && (count++ < 10));
 	return pte;
 }
-
-static inline pte_t *pte_alloc_one_fast(struct mm_struct *mm,
-					unsigned long address)
+  
+static inline struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
 {
-	unsigned long *ret;
-
-	if ((ret = (unsigned long *)pte_quicklist) != NULL) {
-		pte_quicklist = (unsigned long *)(*ret);
-		ret[0] = ret[1];
-		pgtable_cache_size--;
-	}
-	return (pte_t *)ret;
+	int count = 0;
+	struct page *pte;
+   
+   	do {
+#if CONFIG_HIGHPTE
+		pte = alloc_pages(GFP_KERNEL | __GFP_HIGHMEM, 0);
+#else
+		pte = alloc_pages(GFP_KERNEL, 0);
+#endif
+		if (pte)
+			clear_highpage(pte);
+		else {
+			current->state = TASK_UNINTERRUPTIBLE;
+			schedule_timeout(HZ);
+		}
+	} while (!pte && (count++ < 10));
+	return pte;
 }
-
-static inline void pte_free_fast(pte_t *pte)
+  
+static inline void pte_free_kernel(pte_t *pte)
 {
-	*(unsigned long *)pte = (unsigned long) pte_quicklist;
-	pte_quicklist = (unsigned long *) pte;
-	pgtable_cache_size++;
+	free_page((unsigned long)pte);
 }
-
-static __inline__ void pte_free_slow(pte_t *pte)
+  
+static inline void pte_free(struct page *pte)
 {
-	free_page((unsigned long)pte);
+	__free_page(pte);
 }
-
-#define pte_free(pte)		pte_free_slow(pte)
-#define pgd_free(pgd)		free_pgd_slow(pgd)
-#define pgd_alloc(mm)		get_pgd_fast()
-
+  
 /*
  * allocating and freeing a pmd is trivial: the 1-entry pmd is
  * inside the pgd, so has no extra memory associated with it.
  * (In the PAE case we free the pmds as part of the pgd.)
  */
 
-#define pmd_alloc_one_fast(mm, addr)	({ BUG(); ((pmd_t *)1); })
 #define pmd_alloc_one(mm, addr)		({ BUG(); ((pmd_t *)2); })
-#define pmd_free_slow(x)		do { } while (0)
-#define pmd_free_fast(x)		do { } while (0)
 #define pmd_free(x)			do { } while (0)
 #define pgd_populate(mm, pmd, pte)	BUG()
-
-extern int do_check_pgt_cache(int, int);
 
 /*
  * TLB flushing:
diff -urN linux-2.4.18-prepte/include/asm-i386/pgtable.h linux-2.4.18-highpte/include/asm-i386/pgtable.h
--- linux-2.4.18-prepte/include/asm-i386/pgtable.h	Thu Nov 22 11:46:19 2001
+++ linux-2.4.18-highpte/include/asm-i386/pgtable.h	Thu Mar 21 17:37:24 2002
@@ -315,9 +315,12 @@
 
 #define page_pte(page) page_pte_prot(page, __pgprot(0))
 
-#define pmd_page(pmd) \
+#define pmd_page_kernel(pmd) \
 ((unsigned long) __va(pmd_val(pmd) & PAGE_MASK))
 
+#define pmd_page(pmd) \
+	(mem_map + (pmd_val(pmd) >> PAGE_SHIFT))
+
 /* to find an entry in a page-table-directory. */
 #define pgd_index(address) ((address >> PGDIR_SHIFT) & (PTRS_PER_PGD-1))
 
@@ -334,8 +337,14 @@
 /* Find an entry in the third-level page table.. */
 #define __pte_offset(address) \
 		((address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
-#define pte_offset(dir, address) ((pte_t *) pmd_page(*(dir)) + \
-			__pte_offset(address))
+#define pte_offset_kernel(dir, address) \
+	((pte_t *) pmd_page_kernel(*(dir)) +  __pte_offset(address))
+#define pte_offset_map(dir, address) \
+	((pte_t *)kmap_atomic(pmd_page(*(dir)),KM_PTE0) + __pte_offset(address))
+#define pte_offset_map2(dir, address) \
+	((pte_t *)kmap_atomic(pmd_page(*(dir)),KM_PTE1) + __pte_offset(address))
+#define pte_unmap(pte) kunmap_atomic(pte, KM_PTE0)
+#define pte_unmap2(pte) kunmap_atomic(pte, KM_PTE1)
 
 /*
  * The i386 doesn't have any external MMU info: the kernel page
diff -urN linux-2.4.18-prepte/include/asm-i386/processor.h linux-2.4.18-highpte/include/asm-i386/processor.h
--- linux-2.4.18-prepte/include/asm-i386/processor.h	Thu Nov 22 11:46:19 2001
+++ linux-2.4.18-highpte/include/asm-i386/processor.h	Thu Mar 21 17:37:24 2002
@@ -49,10 +49,6 @@
 	int	f00f_bug;
 	int	coma_bug;
 	unsigned long loops_per_jiffy;
-	unsigned long *pgd_quick;
-	unsigned long *pmd_quick;
-	unsigned long *pte_quick;
-	unsigned long pgtable_cache_sz;
 } __attribute__((__aligned__(SMP_CACHE_BYTES)));
 
 #define X86_VENDOR_INTEL 0
diff -urN linux-2.4.18-prepte/include/linux/highmem.h linux-2.4.18-highpte/include/linux/highmem.h
--- linux-2.4.18-prepte/include/linux/highmem.h	Mon Feb 25 11:38:13 2002
+++ linux-2.4.18-highpte/include/linux/highmem.h	Thu Mar 21 19:07:18 2002
@@ -2,7 +2,6 @@
 #define _LINUX_HIGHMEM_H
 
 #include <linux/config.h>
-#include <asm/pgalloc.h>
 
 #ifdef CONFIG_HIGHMEM
 
@@ -14,7 +13,7 @@
 unsigned int nr_free_highpages(void);
 
 extern struct buffer_head * create_bounce(int rw, struct buffer_head * bh_orig);
-
+extern void check_highmem_ptes(void);
 
 static inline char *bh_kmap(struct buffer_head *bh)
 {
@@ -52,8 +51,9 @@
 
 static inline void clear_highpage(struct page *page)
 {
-	clear_page(kmap(page));
-	kunmap(page);
+	void *kaddr = kmap_atomic(page, KM_USER0);
+	clear_page(kaddr);
+	kunmap_atomic(kaddr, KM_USER0);
 }
 
 /*
@@ -61,15 +61,16 @@
  */
 static inline void memclear_highpage_flush(struct page *page, unsigned int offset, unsigned int size)
 {
-	char *kaddr;
+	void *kaddr;
 
 	if (offset + size > PAGE_SIZE)
 		BUG();
-	kaddr = kmap(page);
-	memset(kaddr + offset, 0, size);
+
+	kaddr = kmap_atomic(page, KM_USER0);
+	memset((char *)kaddr + offset, 0, size);
 	flush_dcache_page(page);
 	flush_page_to_ram(page);
-	kunmap(page);
+	kunmap_atomic(kaddr, KM_USER0);
 }
 
 static inline void copy_user_highpage(struct page *to, struct page *from, unsigned long vaddr)
diff -urN linux-2.4.18-prepte/include/linux/mm.h linux-2.4.18-highpte/include/linux/mm.h
--- linux-2.4.18-prepte/include/linux/mm.h	Thu Mar 21 19:20:20 2002
+++ linux-2.4.18-highpte/include/linux/mm.h	Thu Mar 21 17:37:24 2002
@@ -411,7 +411,8 @@
 
 extern int vmtruncate(struct inode * inode, loff_t offset);
 extern pmd_t *FASTCALL(__pmd_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address));
-extern pte_t *FASTCALL(pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address));
+extern pte_t *FASTCALL(pte_alloc_kernel(struct mm_struct *mm, pmd_t *pmd, unsigned long address));
+extern pte_t *FASTCALL(pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, unsigned long address));
 extern int handle_mm_fault(struct mm_struct *mm,struct vm_area_struct *vma, unsigned long address, int write_access);
 extern int make_pages_present(unsigned long addr, unsigned long end);
 extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write);
@@ -427,7 +428,7 @@
 
 /*
  * On a two-level page table, this ends up being trivial. Thus the
- * inlining and the symmetry break with pte_alloc() that does all
+ * inlining and the symmetry break with pte_alloc_map() that does all
  * of this out-of-line.
  */
 static inline pmd_t *pmd_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
@@ -436,9 +437,6 @@
 		return __pmd_alloc(mm, pgd, address);
 	return pmd_offset(pgd, address);
 }
-
-extern int pgt_cache_water[2];
-extern int check_pgt_cache(void);
 
 extern void free_area_init(unsigned long * zones_size);
 extern void free_area_init_node(int nid, pg_data_t *pgdat, struct page *pmap,
diff -urN linux-2.4.18-prepte/kernel/sched.c linux-2.4.18-highpte/kernel/sched.c
--- linux-2.4.18-prepte/kernel/sched.c	Thu Mar 21 19:20:08 2002
+++ linux-2.4.18-highpte/kernel/sched.c	Thu Mar 21 17:49:37 2002
@@ -20,6 +20,7 @@
 #include <linux/interrupt.h>
 #include <asm/mmu_context.h>
 #include <linux/kernel_stat.h>
+#include <linux/highmem.h>
 
 /*
  * Priority of a process goes from 0 to 139. The 0-99
@@ -751,6 +752,10 @@
 
 	if (unlikely(in_interrupt()))
 		BUG();
+#if CONFIG_DEBUG_HIGHMEM
+	check_highmem_ptes();
+#endif
+
 	release_kernel_lock(prev, smp_processor_id());
 	prev->sleep_timestamp = jiffies;
 	spin_lock_irq(&rq->lock);
diff -urN linux-2.4.18-prepte/kernel/sysctl.c linux-2.4.18-highpte/kernel/sysctl.c
--- linux-2.4.18-prepte/kernel/sysctl.c	Fri Dec 21 09:42:04 2001
+++ linux-2.4.18-highpte/kernel/sysctl.c	Thu Mar 21 17:37:24 2002
@@ -96,8 +96,6 @@
 extern int acct_parm[];
 #endif
 
-extern int pgt_cache_water[];
-
 static int parse_table(int *, int, void *, size_t *, void *, size_t,
 		       ctl_table *, void **);
 static int proc_doutsstring(ctl_table *table, int write, struct file *filp,
@@ -267,8 +265,6 @@
 	 sizeof(sysctl_overcommit_memory), 0644, NULL, &proc_dointvec},
 	{VM_PAGERDAEMON, "kswapd",
 	 &pager_daemon, sizeof(pager_daemon_t), 0644, NULL, &proc_dointvec},
-	{VM_PGT_CACHE, "pagetable_cache", 
-	 &pgt_cache_water, 2*sizeof(int), 0644, NULL, &proc_dointvec},
 	{VM_PAGE_CLUSTER, "page-cluster", 
 	 &page_cluster, sizeof(int), 0644, NULL, &proc_dointvec},
 	{VM_MIN_READAHEAD, "min-readahead",
diff -urN linux-2.4.18-prepte/mm/filemap.c linux-2.4.18-highpte/mm/filemap.c
--- linux-2.4.18-prepte/mm/filemap.c	Mon Feb 25 11:38:13 2002
+++ linux-2.4.18-highpte/mm/filemap.c	Thu Mar 21 18:21:11 2002
@@ -1989,7 +1989,7 @@
 /* Called with mm->page_table_lock held to protect against other
  * threads/the swapper from ripping pte's out from under us.
  */
-static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
+static inline int filemap_sync_pte(pte_t *ptep, pmd_t *pmdp, struct vm_area_struct *vma,
 	unsigned long address, unsigned int flags)
 {
 	pte_t pte = *ptep;
@@ -2005,11 +2005,10 @@
 }
 
 static inline int filemap_sync_pte_range(pmd_t * pmd,
-	unsigned long address, unsigned long size, 
-	struct vm_area_struct *vma, unsigned long offset, unsigned int flags)
+	unsigned long address, unsigned long end, 
+	struct vm_area_struct *vma, unsigned int flags)
 {
-	pte_t * pte;
-	unsigned long end;
+	pte_t *pte;
 	int error;
 
 	if (pmd_none(*pmd))
@@ -2019,27 +2018,26 @@
 		pmd_clear(pmd);
 		return 0;
 	}
-	pte = pte_offset(pmd, address);
-	offset += address & PMD_MASK;
-	address &= ~PMD_MASK;
-	end = address + size;
-	if (end > PMD_SIZE)
-		end = PMD_SIZE;
+	pte = pte_offset_map(pmd, address);
+	if ((address & PMD_MASK) != (end & PMD_MASK))
+		end = (address & PMD_MASK) + PMD_SIZE;
 	error = 0;
 	do {
-		error |= filemap_sync_pte(pte, vma, address + offset, flags);
+		error |= filemap_sync_pte(pte, pmd, vma, address, flags);
 		address += PAGE_SIZE;
 		pte++;
 	} while (address && (address < end));
+
+	pte_unmap(pte - 1);
+
 	return error;
 }
 
 static inline int filemap_sync_pmd_range(pgd_t * pgd,
-	unsigned long address, unsigned long size, 
+	unsigned long address, unsigned long end, 
 	struct vm_area_struct *vma, unsigned int flags)
 {
 	pmd_t * pmd;
-	unsigned long offset, end;
 	int error;
 
 	if (pgd_none(*pgd))
@@ -2050,14 +2048,11 @@
 		return 0;
 	}
 	pmd = pmd_offset(pgd, address);
-	offset = address & PGDIR_MASK;
-	address &= ~PGDIR_MASK;
-	end = address + size;
-	if (end > PGDIR_SIZE)
-		end = PGDIR_SIZE;
+	if ((address & PGDIR_MASK) != (end & PGDIR_MASK))
+		end = (address & PGDIR_MASK) + PGDIR_SIZE;
 	error = 0;
 	do {
-		error |= filemap_sync_pte_range(pmd, address, end - address, vma, offset, flags);
+		error |= filemap_sync_pte_range(pmd, address, end, vma, flags);
 		address = (address + PMD_SIZE) & PMD_MASK;
 		pmd++;
 	} while (address && (address < end));
@@ -2077,11 +2072,11 @@
 	spin_lock(&vma->vm_mm->page_table_lock);
 
 	dir = pgd_offset(vma->vm_mm, address);
-	flush_cache_range(vma->vm_mm, end - size, end);
+	flush_cache_range(vma, address, end);
 	if (address >= end)
 		BUG();
 	do {
-		error |= filemap_sync_pmd_range(dir, address, end - address, vma, flags);
+		error |= filemap_sync_pmd_range(dir, address, end, vma, flags);
 		address = (address + PGDIR_SIZE) & PGDIR_MASK;
 		dir++;
 	} while (address && (address < end));
diff -urN linux-2.4.18-prepte/mm/highmem.c linux-2.4.18-highpte/mm/highmem.c
--- linux-2.4.18-prepte/mm/highmem.c	Thu Mar 21 19:20:08 2002
+++ linux-2.4.18-highpte/mm/highmem.c	Thu Mar 21 17:51:29 2002
@@ -21,6 +21,7 @@
 #include <linux/highmem.h>
 #include <linux/swap.h>
 #include <linux/slab.h>
+#include <asm/pgalloc.h>
 
 /*
  * Virtual_count is not a pure "count".
@@ -446,3 +447,17 @@
 	return bh;
 }
 
+#if CONFIG_DEBUG_HIGHMEM
+void check_highmem_ptes(void)
+{
+	int idx, type;
+ 
+	for (type = 0; type < KM_TYPE_NR; type++) {
+		idx = type + KM_TYPE_NR*smp_processor_id();
+ 		if (!pte_none(*(kmap_pte-idx))) {
+ 			printk("scheduling with KM_TYPE %d held!\n", type);
+ 			BUG();
+ 		}
+ 	}
+}
+#endif
diff -urN linux-2.4.18-prepte/mm/memory.c linux-2.4.18-highpte/mm/memory.c
--- linux-2.4.18-prepte/mm/memory.c	Thu Mar 21 19:20:20 2002
+++ linux-2.4.18-highpte/mm/memory.c	Thu Mar 21 17:37:24 2002
@@ -90,7 +90,7 @@
  */
 static inline void free_one_pmd(pmd_t * dir)
 {
-	pte_t * pte;
+	struct page *pte;
 
 	if (pmd_none(*dir))
 		return;
@@ -99,7 +99,7 @@
 		pmd_clear(dir);
 		return;
 	}
-	pte = pte_offset(dir, 0);
+	pte = pmd_page(*dir);
 	pmd_clear(dir);
 	pte_free(pte);
 }
@@ -125,18 +125,6 @@
 	pmd_free(pmd);
 }
 
-/* Low and high watermarks for page table cache.
-   The system should try to have pgt_water[0] <= cache elements <= pgt_water[1]
- */
-int pgt_cache_water[2] = { 25, 50 };
-
-/* Returns the number of pages freed */
-int check_pgt_cache(void)
-{
-	return do_check_pgt_cache(pgt_cache_water[0], pgt_cache_water[1]);
-}
-
-
 /*
  * This function clears all user-level page tables of a process - this
  * is needed by execve(), so that old pages aren't in the way.
@@ -152,11 +140,59 @@
 		page_dir++;
 	} while (--nr);
 	spin_unlock(&mm->page_table_lock);
+}
+
+pte_t * pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
+{
+	if (!pmd_present(*pmd)) {
+		struct page *new;
 
-	/* keep the page table cache within bounds */
-	check_pgt_cache();
+		spin_unlock(&mm->page_table_lock);
+		new = pte_alloc_one(mm, address);
+		spin_lock(&mm->page_table_lock);
+		if (!new)
+			return NULL;
+
+		/*
+		 * Because we dropped the lock, we should re-check the
+		 * entry, as somebody else could have populated it..
+		 */
+		if (pmd_present(*pmd)) {
+			pte_free(new);
+			goto out;
+		}
+		pmd_populate(mm, pmd, new);
+	}
+out:
+	if (pmd_present(*pmd))
+		return pte_offset_map(pmd, address);
+	return NULL;
 }
 
+pte_t * pte_alloc_kernel(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
+{
+	if (!pmd_present(*pmd)) {
+		pte_t *new;
+
+		spin_unlock(&mm->page_table_lock);
+		new = pte_alloc_one_kernel(mm, address);
+		spin_lock(&mm->page_table_lock);
+		if (!new)
+			return NULL;
+
+		/*
+		 * Because we dropped the lock, we should re-check the
+		 * entry, as somebody else could have populated it..
+		 */
+		if (pmd_present(*pmd)) {
+			pte_free_kernel(new);
+			goto out;
+		}
+		pmd_populate_kernel(mm, pmd, new);
+	}
+out:
+	return pte_offset_kernel(pmd, address);
+}
 #define PTE_TABLE_MASK	((PTRS_PER_PTE-1) * sizeof(pte_t))
 #define PMD_TABLE_MASK	((PTRS_PER_PMD-1) * sizeof(pmd_t))
 
@@ -169,7 +205,7 @@
  *         variable count and make things faster. -jj
  *
  * dst->page_table_lock is held on entry and exit,
- * but may be dropped within pmd_alloc() and pte_alloc().
+ * but may be dropped within pmd_alloc() and pte_alloc_map().
  */
 int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
 			struct vm_area_struct *vma)
@@ -221,12 +257,11 @@
 				goto cont_copy_pmd_range;
 			}
 
-			src_pte = pte_offset(src_pmd, address);
-			dst_pte = pte_alloc(dst, dst_pmd, address);
+			dst_pte = pte_alloc_map(dst, dst_pmd, address);
 			if (!dst_pte)
 				goto nomem;
-
 			spin_lock(&src->page_table_lock);			
+			src_pte = pte_offset_map2(src_pmd, address);
 			do {
 				pte_t pte = *src_pte;
 				struct page *ptepage;
@@ -259,11 +294,16 @@
 
 cont_copy_pte_range:		set_pte(dst_pte, pte);
 cont_copy_pte_range_noset:	address += PAGE_SIZE;
-				if (address >= end)
+				if (address >= end) {
+					pte_unmap2(src_pte);
+					pte_unmap(dst_pte);
 					goto out_unlock;
+				}
 				src_pte++;
 				dst_pte++;
 			} while ((unsigned long)src_pte & PTE_TABLE_MASK);
+			pte_unmap2(src_pte-1);
+			pte_unmap(dst_pte-1);
 			spin_unlock(&src->page_table_lock);
 		
 cont_copy_pmd_range:	src_pmd++;
@@ -292,7 +332,7 @@
 static inline int zap_pte_range(mmu_gather_t *tlb, pmd_t * pmd, unsigned long address, unsigned long size)
 {
 	unsigned long offset;
-	pte_t * ptep;
+	pte_t *ptep;
 	int freed = 0;
 
 	if (pmd_none(*pmd))
@@ -302,7 +342,7 @@
 		pmd_clear(pmd);
 		return 0;
 	}
-	ptep = pte_offset(pmd, address);
+	ptep = pte_offset_map(pmd, address);
 	offset = address & ~PMD_MASK;
 	if (offset + size > PMD_SIZE)
 		size = PMD_SIZE - offset;
@@ -322,6 +362,7 @@
 			pte_clear(ptep);
 		}
 	}
+	pte_unmap(ptep-1);
 
 	return freed;
 }
@@ -414,11 +455,16 @@
 	if (pmd_none(*pmd) || pmd_bad(*pmd))
 		goto out;
 
-	ptep = pte_offset(pmd, address);
-	if (!ptep)
+	preempt_disable();
+	ptep = pte_offset_map(pmd, address);
+	if (!ptep) {
+		preempt_enable();
 		goto out;
+	}
 
 	pte = *ptep;
+	pte_unmap(ptep);
+	preempt_enable();
 	if (pte_present(pte)) {
 		if (!write ||
 		    (pte_write(pte) && pte_dirty(pte)))
@@ -771,10 +817,11 @@
 	if (end > PGDIR_SIZE)
 		end = PGDIR_SIZE;
 	do {
-		pte_t * pte = pte_alloc(mm, pmd, address);
+		pte_t * pte = pte_alloc_map(mm, pmd, address);
 		if (!pte)
 			return -ENOMEM;
 		zeromap_pte_range(pte, address, end - address, prot);
+		pte_unmap(pte);
 		address = (address + PMD_SIZE) & PMD_MASK;
 		pmd++;
 	} while (address && (address < end));
@@ -851,10 +898,11 @@
 		end = PGDIR_SIZE;
 	phys_addr -= address;
 	do {
-		pte_t * pte = pte_alloc(mm, pmd, address);
+		pte_t * pte = pte_alloc_map(mm, pmd, address);
 		if (!pte)
 			return -ENOMEM;
 		remap_pte_range(pte, address, end - address, address + phys_addr, prot);
+		pte_unmap(pte);
 		address = (address + PMD_SIZE) & PMD_MASK;
 		pmd++;
 	} while (address && (address < end));
@@ -940,7 +988,7 @@
  * with the page_table_lock released.
  */
 static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
-	unsigned long address, pte_t *page_table, pte_t pte)
+	unsigned long address, pte_t *page_table, pmd_t *pmd, pte_t pte)
 {
 	struct page *old_page, *new_page;
 
@@ -954,10 +1002,12 @@
 		if (reuse) {
 			flush_cache_page(vma, address);
 			establish_pte(vma, address, page_table, pte_mkyoung(pte_mkdirty(pte_mkwrite(pte))));
+			pte_unmap(page_table);
 			spin_unlock(&mm->page_table_lock);
 			return 1;	/* Minor fault */
 		}
 	}
+	pte_unmap(page_table);
 
 	/*
 	 * Ok, we need to copy. Oh, well..
@@ -974,6 +1024,7 @@
 	 * Re-check the pte - we dropped the lock
 	 */
 	spin_lock(&mm->page_table_lock);
+	page_table = pte_offset_map(pmd, address);
 	if (pte_same(*page_table, pte)) {
 		if (PageReserved(old_page))
 			++mm->rss;
@@ -983,12 +1034,14 @@
 		/* Free the old page.. */
 		new_page = old_page;
 	}
+	pte_unmap(page_table);
 	spin_unlock(&mm->page_table_lock);
 	page_cache_release(new_page);
 	page_cache_release(old_page);
 	return 1;	/* Minor fault */
 
 bad_wp_page:
+	pte_unmap(page_table);
 	spin_unlock(&mm->page_table_lock);
 	printk("do_wp_page: bogus page at address %08lx (page 0x%lx)\n",address,(unsigned long)old_page);
 	return -1;
@@ -1110,13 +1163,14 @@
  */
 static int do_swap_page(struct mm_struct * mm,
 	struct vm_area_struct * vma, unsigned long address,
-	pte_t * page_table, pte_t orig_pte, int write_access)
+	pte_t *page_table, pmd_t *pmd, pte_t orig_pte, int write_access)
 {
 	struct page *page;
 	swp_entry_t entry = pte_to_swp_entry(orig_pte);
 	pte_t pte;
 	int ret = 1;
 
+	pte_unmap(page_table);
 	spin_unlock(&mm->page_table_lock);
 	page = lookup_swap_cache(entry);
 	if (!page) {
@@ -1129,7 +1183,9 @@
 			 */
 			int retval;
 			spin_lock(&mm->page_table_lock);
+			page_table = pte_offset_map(pmd, address);
 			retval = pte_same(*page_table, orig_pte) ? -1 : 1;
+			pte_unmap(page_table);
 			spin_unlock(&mm->page_table_lock);
 			return retval;
 		}
@@ -1147,7 +1203,9 @@
 	 * released the page table lock.
 	 */
 	spin_lock(&mm->page_table_lock);
+	page_table = pte_offset_map(pmd, address);
 	if (!pte_same(*page_table, orig_pte)) {
+		pte_unmap(page_table);
 		spin_unlock(&mm->page_table_lock);
 		unlock_page(page);
 		page_cache_release(page);
@@ -1172,6 +1230,7 @@
 
 	/* No need to invalidate - it was non-present before */
 	update_mmu_cache(vma, address, pte);
+	pte_unmap(page_table);
 	spin_unlock(&mm->page_table_lock);
 	return ret;
 }
@@ -1181,7 +1240,7 @@
  * spinlock held to protect against concurrent faults in
  * multithreaded programs. 
  */
-static int do_anonymous_page(struct mm_struct * mm, struct vm_area_struct * vma, pte_t *page_table, int write_access, unsigned long addr)
+static int do_anonymous_page(struct mm_struct * mm, struct vm_area_struct * vma, pte_t *page_table, pmd_t *pmd, int write_access, unsigned long addr)
 {
 	pte_t entry;
 
@@ -1193,6 +1252,7 @@
 		struct page *page;
 
 		/* Allocate our own private page. */
+		pte_unmap(page_table);
 		spin_unlock(&mm->page_table_lock);
 
 		page = alloc_page(GFP_HIGHUSER);
@@ -1201,7 +1261,10 @@
 		clear_user_highpage(page, addr);
 
 		spin_lock(&mm->page_table_lock);
+		page_table = pte_offset_map(pmd, addr);
+
 		if (!pte_none(*page_table)) {
+			pte_unmap(page_table);
 			page_cache_release(page);
 			spin_unlock(&mm->page_table_lock);
 			return 1;
@@ -1214,6 +1277,7 @@
 	}
 
 	set_pte(page_table, entry);
+	pte_unmap(page_table);
 
 	/* No need to invalidate - it was non-present before */
 	update_mmu_cache(vma, addr, entry);
@@ -1237,13 +1301,14 @@
  * spinlock held. Exit with the spinlock released.
  */
 static int do_no_page(struct mm_struct * mm, struct vm_area_struct * vma,
-	unsigned long address, int write_access, pte_t *page_table)
+	unsigned long address, int write_access, pte_t *page_table, pmd_t *pmd)
 {
 	struct page * new_page;
 	pte_t entry;
 
 	if (!vma->vm_ops || !vma->vm_ops->nopage)
-		return do_anonymous_page(mm, vma, page_table, write_access, address);
+		return do_anonymous_page(mm, vma, page_table, pmd, write_access, address);
+	pte_unmap(page_table);
 	spin_unlock(&mm->page_table_lock);
 
 	new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, 0);
@@ -1269,6 +1334,8 @@
 	}
 
 	spin_lock(&mm->page_table_lock);
+	page_table = pte_offset_map(pmd, address);
+
 	/*
 	 * This silly early PAGE_DIRTY setting removes a race
 	 * due to the bad i386 page protection. But it's valid
@@ -1288,8 +1355,10 @@
 		if (write_access)
 			entry = pte_mkwrite(pte_mkdirty(entry));
 		set_pte(page_table, entry);
+		pte_unmap(page_table);
 	} else {
 		/* One of our sibling threads was faster, back out. */
+		pte_unmap(page_table);
 		page_cache_release(new_page);
 		spin_unlock(&mm->page_table_lock);
 		return 1;
@@ -1324,7 +1393,7 @@
  */
 static inline int handle_pte_fault(struct mm_struct *mm,
 	struct vm_area_struct * vma, unsigned long address,
-	int write_access, pte_t * pte)
+	int write_access, pte_t *pte, pmd_t *pmd)
 {
 	pte_t entry;
 
@@ -1336,18 +1405,19 @@
 		 * drop the lock.
 		 */
 		if (pte_none(entry))
-			return do_no_page(mm, vma, address, write_access, pte);
-		return do_swap_page(mm, vma, address, pte, entry, write_access);
+			return do_no_page(mm, vma, address, write_access, pte, pmd);
+		return do_swap_page(mm, vma, address, pte, pmd, entry, write_access);
 	}
 
 	if (write_access) {
 		if (!pte_write(entry))
-			return do_wp_page(mm, vma, address, pte, entry);
+			return do_wp_page(mm, vma, address, pte, pmd, entry);
 
 		entry = pte_mkdirty(entry);
 	}
 	entry = pte_mkyoung(entry);
 	establish_pte(vma, address, pte, entry);
+	pte_unmap(pte);
 	spin_unlock(&mm->page_table_lock);
 	return 1;
 }
@@ -1372,9 +1442,9 @@
 	pmd = pmd_alloc(mm, pgd, address);
 
 	if (pmd) {
-		pte_t * pte = pte_alloc(mm, pmd, address);
+		pte_t * pte = pte_alloc_map(mm, pmd, address);
 		if (pte)
-			return handle_pte_fault(mm, vma, address, write_access, pte);
+			return handle_pte_fault(mm, vma, address, write_access, pte, pmd);
 	}
 	spin_unlock(&mm->page_table_lock);
 	return -1;
@@ -1393,62 +1463,23 @@
 {
 	pmd_t *new;
 
-	/* "fast" allocation can happen without dropping the lock.. */
-	new = pmd_alloc_one_fast(mm, address);
-	if (!new) {
-		spin_unlock(&mm->page_table_lock);
-		new = pmd_alloc_one(mm, address);
-		spin_lock(&mm->page_table_lock);
-		if (!new)
-			return NULL;
+	spin_unlock(&mm->page_table_lock);
+	new = pmd_alloc_one(mm, address);
+	spin_lock(&mm->page_table_lock);
+	if (!new)
+		return NULL;
 
-		/*
-		 * Because we dropped the lock, we should re-check the
-		 * entry, as somebody else could have populated it..
-		 */
-		if (!pgd_none(*pgd)) {
-			pmd_free(new);
-			goto out;
-		}
+	/*
+	 * Because we dropped the lock, we should re-check the
+	 * entry, as somebody else could have populated it..
+	 */
+	if (pgd_present(*pgd)) {
+		pmd_free(new);
+		goto out;
 	}
 	pgd_populate(mm, pgd, new);
 out:
 	return pmd_offset(pgd, address);
-}
-
-/*
- * Allocate the page table directory.
- *
- * We've already handled the fast-path in-line, and we own the
- * page table lock.
- */
-pte_t *pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
-{
-	if (pmd_none(*pmd)) {
-		pte_t *new;
-
-		/* "fast" allocation can happen without dropping the lock.. */
-		new = pte_alloc_one_fast(mm, address);
-		if (!new) {
-			spin_unlock(&mm->page_table_lock);
-			new = pte_alloc_one(mm, address);
-			spin_lock(&mm->page_table_lock);
-			if (!new)
-				return NULL;
-
-			/*
-			 * Because we dropped the lock, we should re-check the
-			 * entry, as somebody else could have populated it..
-			 */
-			if (!pmd_none(*pmd)) {
-				pte_free(new);
-				goto out;
-			}
-		}
-		pmd_populate(mm, pmd, new);
-	}
-out:
-	return pte_offset(pmd, address);
 }
 
 int make_pages_present(unsigned long addr, unsigned long end)
diff -urN linux-2.4.18-prepte/mm/mprotect.c linux-2.4.18-highpte/mm/mprotect.c
--- linux-2.4.18-prepte/mm/mprotect.c	Mon Sep 17 15:30:23 2001
+++ linux-2.4.18-highpte/mm/mprotect.c	Thu Mar 21 17:37:24 2002
@@ -11,6 +11,7 @@
 #include <asm/uaccess.h>
 #include <asm/pgalloc.h>
 #include <asm/pgtable.h>
+#include <linux/highmem.h>
 
 static inline void change_pte_range(pmd_t * pmd, unsigned long address,
 	unsigned long size, pgprot_t newprot)
@@ -25,7 +26,7 @@
 		pmd_clear(pmd);
 		return;
 	}
-	pte = pte_offset(pmd, address);
+	pte = pte_offset_map(pmd, address);
 	address &= ~PMD_MASK;
 	end = address + size;
 	if (end > PMD_SIZE)
@@ -44,6 +45,7 @@
 		address += PAGE_SIZE;
 		pte++;
 	} while (address && (address < end));
+	pte_unmap(pte - 1);
 }
 
 static inline void change_pmd_range(pgd_t * pgd, unsigned long address,
diff -urN linux-2.4.18-prepte/mm/mremap.c linux-2.4.18-highpte/mm/mremap.c
--- linux-2.4.18-prepte/mm/mremap.c	Thu Sep 20 20:31:26 2001
+++ linux-2.4.18-highpte/mm/mremap.c	Thu Mar 21 17:37:24 2002
@@ -15,7 +15,7 @@
 
 extern int vm_enough_memory(long pages);
 
-static inline pte_t *get_one_pte(struct mm_struct *mm, unsigned long addr)
+static inline pte_t *get_one_pte_map2(struct mm_struct *mm, unsigned long addr)
 {
 	pgd_t * pgd;
 	pmd_t * pmd;
@@ -39,21 +39,23 @@
 		goto end;
 	}
 
-	pte = pte_offset(pmd, addr);
-	if (pte_none(*pte))
+	pte = pte_offset_map2(pmd, addr);
+	if (pte_none(*pte)) {
+		pte_unmap2(pte);
 		pte = NULL;
+	}
 end:
 	return pte;
 }
 
-static inline pte_t *alloc_one_pte(struct mm_struct *mm, unsigned long addr)
+static inline pte_t *alloc_one_pte_map(struct mm_struct *mm, unsigned long addr)
 {
 	pmd_t * pmd;
 	pte_t * pte = NULL;
 
 	pmd = pmd_alloc(mm, pgd_offset(mm, addr), addr);
 	if (pmd)
-		pte = pte_alloc(mm, pmd, addr);
+		pte = pte_alloc_map(mm, pmd, addr);
 	return pte;
 }
 
@@ -77,12 +79,16 @@
 static int move_one_page(struct mm_struct *mm, unsigned long old_addr, unsigned long new_addr)
 {
 	int error = 0;
-	pte_t * src;
+	pte_t *src, *dst;
 
 	spin_lock(&mm->page_table_lock);
-	src = get_one_pte(mm, old_addr);
-	if (src)
-		error = copy_one_pte(mm, src, alloc_one_pte(mm, new_addr));
+	src = get_one_pte_map2(mm, old_addr);
+	if (src) {
+		dst = alloc_one_pte_map(mm, new_addr);
+		error = copy_one_pte(mm, src, dst);
+		pte_unmap2(src);
+		pte_unmap(dst);
+	}
 	spin_unlock(&mm->page_table_lock);
 	return error;
 }
diff -urN linux-2.4.18-prepte/mm/swapfile.c linux-2.4.18-highpte/mm/swapfile.c
--- linux-2.4.18-prepte/mm/swapfile.c	Mon Feb 25 11:38:14 2002
+++ linux-2.4.18-highpte/mm/swapfile.c	Thu Mar 21 17:37:24 2002
@@ -393,7 +393,7 @@
 		pmd_clear(dir);
 		return;
 	}
-	pte = pte_offset(dir, address);
+	pte = pte_offset_map(dir, address);
 	offset += address & PMD_MASK;
 	address &= ~PMD_MASK;
 	end = address + size;
@@ -404,6 +404,7 @@
 		address += PAGE_SIZE;
 		pte++;
 	} while (address && (address < end));
+	pte_unmap(pte - 1);
 }
 
 /* mmlist_lock and vma->vm_mm->page_table_lock are held */
diff -urN linux-2.4.18-prepte/mm/vmalloc.c linux-2.4.18-highpte/mm/vmalloc.c
--- linux-2.4.18-prepte/mm/vmalloc.c	Mon Feb 25 11:38:14 2002
+++ linux-2.4.18-highpte/mm/vmalloc.c	Thu Mar 21 17:37:24 2002
@@ -31,7 +31,7 @@
 		pmd_clear(pmd);
 		return;
 	}
-	pte = pte_offset(pmd, address);
+	pte = pte_offset_kernel(pmd, address);
 	address &= ~PMD_MASK;
 	end = address + size;
 	if (end > PMD_SIZE)
@@ -126,7 +126,7 @@
 	if (end > PGDIR_SIZE)
 		end = PGDIR_SIZE;
 	do {
-		pte_t * pte = pte_alloc(&init_mm, pmd, address);
+		pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address);
 		if (!pte)
 			return -ENOMEM;
 		if (alloc_area_pte(pte, address, end - address, gfp_mask, prot))
diff -urN linux-2.4.18-prepte/mm/vmscan.c linux-2.4.18-highpte/mm/vmscan.c
--- linux-2.4.18-prepte/mm/vmscan.c	Mon Feb 25 11:38:14 2002
+++ linux-2.4.18-highpte/mm/vmscan.c	Thu Mar 21 17:37:24 2002
@@ -166,7 +166,7 @@
 		return count;
 	}
 	
-	pte = pte_offset(dir, address);
+	pte = pte_offset_map(dir, address);
 	
 	pmd_end = (address + PMD_SIZE) & PMD_MASK;
 	if (end > pmd_end)
@@ -180,6 +180,7 @@
 				count -= try_to_swap_out(mm, vma, address, pte, page, classzone);
 				if (!count) {
 					address += PAGE_SIZE;
+					pte++;
 					break;
 				}
 			}
@@ -187,6 +188,7 @@
 		address += PAGE_SIZE;
 		pte++;
 	} while (address && (address < end));
+	pte_unmap(pte - 1);
 	mm->swap_address = address;
 	return count;
 }


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: Backport of Ingo/Arjan highpte to 2.4.18 (+O1 scheduler)
  2002-03-21 23:10 Backport of Ingo/Arjan highpte to 2.4.18 (+O1 scheduler) Martin J. Bligh
@ 2002-03-21 23:20 ` Martin J. Bligh
  2002-03-26 17:08 ` Andrea Arcangeli
  1 sibling, 0 replies; 9+ messages in thread
From: Martin J. Bligh @ 2002-03-21 23:20 UTC (permalink / raw)
  To: linux-kernel; +Cc: gerrit

> The smaller two patches they published against 2.5.5 work fine, but
> the main one needed a little work - this seems to work for me, but
> I've only really touch tested it ... just thought this might save someone
> some time .... there didn't seem to be anything too complex in the merge,
> but that doesn't necessarily mean I got it right ;-)

Sigh .... old version ... forgot to rediff. Working version below - sorry.

diff -urN linux-2.4.18-prepte/arch/i386/config.in linux-2.4.18-highpte/arch/i386/config.in
--- linux-2.4.18-prepte/arch/i386/config.in	Mon Feb 25 11:37:52 2002
+++ linux-2.4.18-highpte/arch/i386/config.in	Thu Mar 21 17:37:29 2002
@@ -171,16 +171,27 @@
 tristate '/dev/cpu/*/cpuid - CPU information support' CONFIG_X86_CPUID
 
 choice 'High Memory Support' \
-	"off    CONFIG_NOHIGHMEM \
-	 4GB    CONFIG_HIGHMEM4G \
-	 64GB   CONFIG_HIGHMEM64G" off
+	"off           CONFIG_NOHIGHMEM \
+	 4GB           CONFIG_HIGHMEM4G \
+	 4GB-highpte   CONFIG_HIGHMEM4G_HIGHPTE \
+	 64GB          CONFIG_HIGHMEM64G \
+	 64GB-highpte  CONFIG_HIGHMEM64G_HIGHPTE" off
 if [ "$CONFIG_HIGHMEM4G" = "y" ]; then
    define_bool CONFIG_HIGHMEM y
 fi
+if [ "$CONFIG_HIGHMEM4G_HIGHPTE" = "y" ]; then
+   define_bool CONFIG_HIGHMEM y
+   define_bool CONFIG_HIGHPTE y
+fi
 if [ "$CONFIG_HIGHMEM64G" = "y" ]; then
    define_bool CONFIG_HIGHMEM y
    define_bool CONFIG_X86_PAE y
 fi
+if [ "$CONFIG_HIGHMEM64G_HIGHPTE" = "y" ]; then
+   define_bool CONFIG_HIGHMEM y
+   define_bool CONFIG_HIGHPTE y
+   define_bool CONFIG_X86_PAE y
+fi
 
 bool 'Math emulation' CONFIG_MATH_EMULATION
 bool 'MTRR (Memory Type Range Register) support' CONFIG_MTRR
@@ -416,11 +427,13 @@
 
 bool 'Kernel debugging' CONFIG_DEBUG_KERNEL
 if [ "$CONFIG_DEBUG_KERNEL" != "n" ]; then
-   bool '  Debug high memory support' CONFIG_DEBUG_HIGHMEM
    bool '  Debug memory allocations' CONFIG_DEBUG_SLAB
    bool '  Memory mapped I/O debugging' CONFIG_DEBUG_IOVIRT
    bool '  Magic SysRq key' CONFIG_MAGIC_SYSRQ
    bool '  Spinlock debugging' CONFIG_DEBUG_SPINLOCK
+   if [ "$CONFIG_HIGHMEM" = "y" ]; then
+      bool '  Highmem debugging' CONFIG_DEBUG_HIGHMEM
+   fi
    bool '  Verbose BUG() reporting (adds 70K)' CONFIG_DEBUG_BUGVERBOSE
 fi
 
diff -urN linux-2.4.18-prepte/arch/i386/kernel/process.c linux-2.4.18-highpte/arch/i386/kernel/process.c
--- linux-2.4.18-prepte/arch/i386/kernel/process.c	Thu Mar 21 19:20:08 2002
+++ linux-2.4.18-highpte/arch/i386/kernel/process.c	Thu Mar 21 17:37:29 2002
@@ -132,7 +132,6 @@
 		if (!current->need_resched)
 			idle();
 		schedule();
-		check_pgt_cache();
 	}
 }
 
diff -urN linux-2.4.18-prepte/arch/i386/kernel/smpboot.c linux-2.4.18-highpte/arch/i386/kernel/smpboot.c
--- linux-2.4.18-prepte/arch/i386/kernel/smpboot.c	Thu Mar 21 19:20:08 2002
+++ linux-2.4.18-highpte/arch/i386/kernel/smpboot.c	Thu Mar 21 17:37:29 2002
@@ -144,10 +144,6 @@
 	struct cpuinfo_x86 *c = cpu_data + id;
 
 	*c = boot_cpu_data;
-	c->pte_quick = 0;
-	c->pmd_quick = 0;
-	c->pgd_quick = 0;
-	c->pgtable_cache_sz = 0;
 	identify_cpu(c);
 	/*
 	 * Mask B, Pentium, but not Pentium MMX
diff -urN linux-2.4.18-prepte/arch/i386/kernel/traps.c linux-2.4.18-highpte/arch/i386/kernel/traps.c
--- linux-2.4.18-prepte/arch/i386/kernel/traps.c	Sun Sep 30 12:26:08 2001
+++ linux-2.4.18-highpte/arch/i386/kernel/traps.c	Thu Mar 21 17:37:29 2002
@@ -734,7 +734,7 @@
 	page = (unsigned long) vmalloc(PAGE_SIZE);
 	pgd = pgd_offset(&init_mm, page);
 	pmd = pmd_offset(pgd, page);
-	pte = pte_offset(pmd, page);
+	pte = pte_offset_kernel(pmd, page);
 	__free_page(pte_page(*pte));
 	*pte = mk_pte_phys(__pa(&idt_table), PAGE_KERNEL_RO);
 	/*
diff -urN linux-2.4.18-prepte/arch/i386/kernel/vm86.c linux-2.4.18-highpte/arch/i386/kernel/vm86.c
--- linux-2.4.18-prepte/arch/i386/kernel/vm86.c	Mon Feb 25 11:37:53 2002
+++ linux-2.4.18-highpte/arch/i386/kernel/vm86.c	Thu Mar 21 19:43:19 2002
@@ -94,7 +94,7 @@
 {
 	pgd_t *pgd;
 	pmd_t *pmd;
-	pte_t *pte;
+	pte_t *pte, *mapped;
 	int i;
 
 	pgd = pgd_offset(tsk->mm, 0xA0000);
@@ -113,12 +113,13 @@
 		pmd_clear(pmd);
 		return;
 	}
-	pte = pte_offset(pmd, 0xA0000);
+	pte = mapped = pte_offset_map(pmd, 0xA0000);
 	for (i = 0; i < 32; i++) {
 		if (pte_present(*pte))
 			set_pte(pte, pte_wrprotect(*pte));
 		pte++;
 	}
+	pte_unmap(mapped);
 	flush_tlb();
 }
 
diff -urN linux-2.4.18-prepte/arch/i386/mm/fault.c linux-2.4.18-highpte/arch/i386/mm/fault.c
--- linux-2.4.18-prepte/arch/i386/mm/fault.c	Thu Mar 21 19:20:08 2002
+++ linux-2.4.18-highpte/arch/i386/mm/fault.c	Thu Mar 21 17:37:29 2002
@@ -324,12 +324,20 @@
 	asm("movl %%cr3,%0":"=r" (page));
 	page = ((unsigned long *) __va(page))[address >> 22];
 	printk(KERN_ALERT "*pde = %08lx\n", page);
+	/*
+	 * We must not directly access the pte in the highpte
+	 * case, the page table might be allocated in highmem.
+	 * And lets rather not kmap-atomic the pte, just in case
+	 * it's allocated already.
+	 */
+#ifndef CONFIG_HIGHPTE
 	if (page & 1) {
 		page &= PAGE_MASK;
 		address &= 0x003ff000;
 		page = ((unsigned long *) __va(page))[address >> PAGE_SHIFT];
 		printk(KERN_ALERT "*pte = %08lx\n", page);
 	}
+#endif
 	die("Oops", regs, error_code);
 	bust_spinlocks(0);
 	do_exit(SIGKILL);
@@ -399,7 +407,7 @@
 			goto no_context;
 		set_pmd(pmd, *pmd_k);
 
-		pte_k = pte_offset(pmd_k, address);
+		pte_k = pte_offset_kernel(pmd_k, address);
 		if (!pte_present(*pte_k))
 			goto no_context;
 		return;
diff -urN linux-2.4.18-prepte/arch/i386/mm/init.c linux-2.4.18-highpte/arch/i386/mm/init.c
--- linux-2.4.18-prepte/arch/i386/mm/init.c	Fri Dec 21 09:41:53 2001
+++ linux-2.4.18-highpte/arch/i386/mm/init.c	Thu Mar 21 17:37:29 2002
@@ -43,28 +43,6 @@
 static unsigned long totalram_pages;
 static unsigned long totalhigh_pages;
 
-int do_check_pgt_cache(int low, int high)
-{
-	int freed = 0;
-	if(pgtable_cache_size > high) {
-		do {
-			if (pgd_quicklist) {
-				free_pgd_slow(get_pgd_fast());
-				freed++;
-			}
-			if (pmd_quicklist) {
-				pmd_free_slow(pmd_alloc_one_fast(NULL, 0));
-				freed++;
-			}
-			if (pte_quicklist) {
-				pte_free_slow(pte_alloc_one_fast(NULL, 0));
-				freed++;
-			}
-		} while(pgtable_cache_size > low);
-	}
-	return freed;
-}
-
 /*
  * NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the
  * physical space so we can cache the place of the first one and move
@@ -76,7 +54,7 @@
 pgprot_t kmap_prot;
 
 #define kmap_get_fixmap_pte(vaddr)					\
-	pte_offset(pmd_offset(pgd_offset_k(vaddr), (vaddr)), (vaddr))
+	pte_offset_kernel(pmd_offset(pgd_offset_k(vaddr), (vaddr)), (vaddr))
 
 void __init kmap_init(void)
 {
@@ -116,7 +94,6 @@
 	printk("%d reserved pages\n",reserved);
 	printk("%d pages shared\n",shared);
 	printk("%d pages swap cached\n",cached);
-	printk("%ld pages in page table cache\n",pgtable_cache_size);
 	show_buffers();
 }
 
@@ -143,7 +120,7 @@
 		printk("PAE BUG #01!\n");
 		return;
 	}
-	pte = pte_offset(pmd, vaddr);
+	pte = pte_offset_kernel(pmd, vaddr);
 	if (pte_val(*pte))
 		pte_ERROR(*pte);
 	pgprot_val(prot) = pgprot_val(PAGE_KERNEL) | pgprot_val(flags);
@@ -196,7 +173,7 @@
 			if (pmd_none(*pmd)) {
 				pte = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE);
 				set_pmd(pmd, __pmd(_KERNPG_TABLE + __pa(pte)));
-				if (pte != pte_offset(pmd, 0))
+				if (pte != pte_offset_kernel(pmd, 0))
 					BUG();
 			}
 			vaddr += PMD_SIZE;
@@ -267,7 +244,7 @@
 				*pte = mk_pte_phys(__pa(vaddr), PAGE_KERNEL);
 			}
 			set_pmd(pmd, __pmd(_KERNPG_TABLE + __pa(pte_base)));
-			if (pte_base != pte_offset(pmd, 0))
+			if (pte_base != pte_offset_kernel(pmd, 0))
 				BUG();
 
 		}
@@ -289,7 +266,7 @@
 
 	pgd = swapper_pg_dir + __pgd_offset(vaddr);
 	pmd = pmd_offset(pgd, vaddr);
-	pte = pte_offset(pmd, vaddr);
+	pte = pte_offset_kernel(pmd, vaddr);
 	pkmap_page_table = pte;
 #endif
 
@@ -398,7 +375,7 @@
 
 	pgd = swapper_pg_dir + __pgd_offset(vaddr);
 	pmd = pmd_offset(pgd, vaddr);
-	pte = pte_offset(pmd, vaddr);
+	pte = pte_offset_kernel(pmd, vaddr);
 	old_pte = *pte;
 	*pte = mk_pte_phys(0, PAGE_READONLY);
 	local_flush_tlb();
diff -urN linux-2.4.18-prepte/arch/i386/mm/ioremap.c linux-2.4.18-highpte/arch/i386/mm/ioremap.c
--- linux-2.4.18-prepte/arch/i386/mm/ioremap.c	Tue Mar 20 08:13:33 2001
+++ linux-2.4.18-highpte/arch/i386/mm/ioremap.c	Thu Mar 21 17:37:29 2002
@@ -49,7 +49,7 @@
 	if (address >= end)
 		BUG();
 	do {
-		pte_t * pte = pte_alloc(&init_mm, pmd, address);
+		pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address);
 		if (!pte)
 			return -ENOMEM;
 		remap_area_pte(pte, address, end - address, address + phys_addr, flags);
diff -urN linux-2.4.18-prepte/drivers/char/drm/drm_proc.h linux-2.4.18-highpte/drivers/char/drm/drm_proc.h
--- linux-2.4.18-prepte/drivers/char/drm/drm_proc.h	Thu Nov 22 11:46:37 2001
+++ linux-2.4.18-highpte/drivers/char/drm/drm_proc.h	Thu Mar 21 19:36:14 2002
@@ -449,7 +449,7 @@
 		for (i = vma->vm_start; i < vma->vm_end; i += PAGE_SIZE) {
 			pgd = pgd_offset(vma->vm_mm, i);
 			pmd = pmd_offset(pgd, i);
-			pte = pte_offset(pmd, i);
+			pte = pte_offset_map(pmd, i);
 			if (pte_present(*pte)) {
 				address = __pa(pte_page(*pte))
 					+ (i & (PAGE_SIZE-1));
@@ -465,6 +465,7 @@
 			} else {
 				DRM_PROC_PRINT("      0x%08lx\n", i);
 			}
+			pte_unmap(pte);
 		}
 #endif
 	}
diff -urN linux-2.4.18-prepte/drivers/char/drm/drm_scatter.h linux-2.4.18-highpte/drivers/char/drm/drm_scatter.h
--- linux-2.4.18-prepte/drivers/char/drm/drm_scatter.h	Thu Nov 22 11:46:37 2001
+++ linux-2.4.18-highpte/drivers/char/drm/drm_scatter.h	Thu Mar 21 19:36:28 2002
@@ -143,9 +143,12 @@
 		if ( !pmd_present( *pmd ) )
 			goto failed;
 
-		pte = pte_offset( pmd, i );
-		if ( !pte_present( *pte ) )
+		pte = pte_offset_map( pmd, i );
+		if ( !pte_present( *pte ) ) {
+			pte_unmap(pte);
 			goto failed;
+		}
+		pte_unmap(pte);
 
 		entry->pagelist[j] = pte_page( *pte );
 
diff -urN linux-2.4.18-prepte/drivers/char/drm/drm_vm.h linux-2.4.18-highpte/drivers/char/drm/drm_vm.h
--- linux-2.4.18-prepte/drivers/char/drm/drm_vm.h	Thu Nov 22 11:46:37 2001
+++ linux-2.4.18-highpte/drivers/char/drm/drm_vm.h	Thu Mar 21 19:42:30 2002
@@ -169,8 +169,12 @@
 	if( !pgd_present( *pgd ) ) return NOPAGE_OOM;
 	pmd = pmd_offset( pgd, i );
 	if( !pmd_present( *pmd ) ) return NOPAGE_OOM;
-	pte = pte_offset( pmd, i );
-	if( !pte_present( *pte ) ) return NOPAGE_OOM;
+	pte = pte_offset_map( pmd, i );
+	if( !pte_present( *pte ) ) {
+		pte_unmap(pte);
+		return NOPAGE_OOM;
+	}
+	pte_unmap(pte);
 
 	page = pte_page(*pte);
 	get_page(page);
diff -urN linux-2.4.18-prepte/drivers/sgi/char/graphics.c linux-2.4.18-highpte/drivers/sgi/char/graphics.c
--- linux-2.4.18-prepte/drivers/sgi/char/graphics.c	Thu Oct 11 09:43:30 2001
+++ linux-2.4.18-highpte/drivers/sgi/char/graphics.c	Thu Mar 21 17:37:24 2002
@@ -221,6 +221,7 @@
 	int board = GRAPHICS_CARD (vma->vm_dentry->d_inode->i_rdev);
 
 	unsigned long virt_add, phys_add;
+	struct page * page;
 
 #ifdef DEBUG
 	printk ("Got a page fault for board %d address=%lx guser=%lx\n", board,
@@ -247,8 +248,10 @@
 
 	pgd = pgd_offset(current->mm, address);
 	pmd = pmd_offset(pgd, address);
-	pte = pte_offset(pmd, address);
-	return pte_page(*pte);
+	pte = pte_kmap_offset(pmd, address);
+	page = pte_page(*pte);
+	pte_kunmap(pte);
+	return page;
 }
 
 /*
diff -urN linux-2.4.18-prepte/drivers/usb/stv680.c linux-2.4.18-highpte/drivers/usb/stv680.c
--- linux-2.4.18-prepte/drivers/usb/stv680.c	Mon Feb 25 11:38:07 2002
+++ linux-2.4.18-highpte/drivers/usb/stv680.c	Thu Mar 21 19:42:47 2002
@@ -133,8 +133,9 @@
 	if (!pgd_none (*pgd)) {
 		pmd = pmd_offset (pgd, adr);
 		if (!pmd_none (*pmd)) {
-			ptep = pte_offset (pmd, adr);
+			ptep = pte_offset_map (pmd, adr);
 			pte = *ptep;
+			pte_unmap(ptep);
 			if (pte_present (pte)) {
 				ret = (unsigned long) page_address (pte_page (pte));
 				ret |= (adr & (PAGE_SIZE - 1));
diff -urN linux-2.4.18-prepte/drivers/usb/vicam.c linux-2.4.18-highpte/drivers/usb/vicam.c
--- linux-2.4.18-prepte/drivers/usb/vicam.c	Mon Feb 25 11:38:07 2002
+++ linux-2.4.18-highpte/drivers/usb/vicam.c	Thu Mar 21 19:42:59 2002
@@ -115,8 +115,9 @@
 	if (!pgd_none(*pgd)) {
 		pmd = pmd_offset(pgd, adr);
 		if (!pmd_none(*pmd)) {
-			ptep = pte_offset(pmd, adr);
+			ptep = pte_offset_map(pmd, adr);
 			pte = *ptep;
+			pte_unmap(ptep);
 			if(pte_present(pte)) {
 				ret  = (unsigned long) page_address(pte_page(pte));
 				ret |= (adr & (PAGE_SIZE - 1));
diff -urN linux-2.4.18-prepte/fs/exec.c linux-2.4.18-highpte/fs/exec.c
--- linux-2.4.18-prepte/fs/exec.c	Fri Dec 21 09:41:55 2001
+++ linux-2.4.18-highpte/fs/exec.c	Thu Mar 21 17:37:24 2002
@@ -270,15 +270,18 @@
 	pmd = pmd_alloc(tsk->mm, pgd, address);
 	if (!pmd)
 		goto out;
-	pte = pte_alloc(tsk->mm, pmd, address);
+	pte = pte_alloc_map(tsk->mm, pmd, address);
 	if (!pte)
 		goto out;
-	if (!pte_none(*pte))
+	if (!pte_none(*pte)) {
+		pte_unmap(pte);
 		goto out;
+	}
 	lru_cache_add(page);
 	flush_dcache_page(page);
 	flush_page_to_ram(page);
 	set_pte(pte, pte_mkdirty(pte_mkwrite(mk_pte(page, PAGE_COPY))));
+	pte_unmap(pte);
 	tsk->mm->rss++;
 	spin_unlock(&tsk->mm->page_table_lock);
 
diff -urN linux-2.4.18-prepte/fs/proc/array.c linux-2.4.18-highpte/fs/proc/array.c
--- linux-2.4.18-prepte/fs/proc/array.c	Thu Mar 21 19:20:08 2002
+++ linux-2.4.18-highpte/fs/proc/array.c	Thu Mar 21 19:34:57 2002
@@ -392,11 +392,10 @@
 	return res;
 }
 		
-static inline void statm_pte_range(pmd_t * pmd, unsigned long address, unsigned long size,
-	int * pages, int * shared, int * dirty, int * total)
+static inline void statm_pte_range(pmd_t * pmd, unsigned long address, unsigned long size, int * pages, int * shared, int * dirty, int * total)
 {
-	pte_t * pte;
-	unsigned long end;
+	unsigned long end, pmd_end;
+	pte_t *pte;
 
 	if (pmd_none(*pmd))
 		return;
@@ -405,11 +404,11 @@
 		pmd_clear(pmd);
 		return;
 	}
-	pte = pte_offset(pmd, address);
-	address &= ~PMD_MASK;
+	pte = pte_offset_map(pmd, address);
 	end = address + size;
-	if (end > PMD_SIZE)
-		end = PMD_SIZE;
+	pmd_end = (address + PMD_SIZE) & PMD_MASK;
+	if (end > pmd_end)
+		end = pmd_end;
 	do {
 		pte_t page = *pte;
 		struct page *ptpage;
@@ -430,6 +429,7 @@
 		if (page_count(pte_page(page)) > 1)
 			++*shared;
 	} while (address < end);
+	pte_unmap(pte - 1);
 }
 
 static inline void statm_pmd_range(pgd_t * pgd, unsigned long address, unsigned long size,
diff -urN linux-2.4.18-prepte/include/asm-i386/highmem.h linux-2.4.18-highpte/include/asm-i386/highmem.h
--- linux-2.4.18-prepte/include/asm-i386/highmem.h	Thu Nov 22 11:46:19 2001
+++ linux-2.4.18-highpte/include/asm-i386/highmem.h	Thu Mar 21 19:32:58 2002
@@ -26,12 +26,6 @@
 #include <asm/kmap_types.h>
 #include <asm/pgtable.h>
 
-#ifdef CONFIG_DEBUG_HIGHMEM
-#define HIGHMEM_DEBUG 1
-#else
-#define HIGHMEM_DEBUG 0
-#endif
-
 /* declarations for highmem.c */
 extern unsigned long highstart_pfn, highend_pfn;
 
@@ -93,7 +87,7 @@
 
 	idx = type + KM_TYPE_NR*smp_processor_id();
 	vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
-#if HIGHMEM_DEBUG
+#if CONFIG_DEBUG_HIGHMEM
 	if (!pte_none(*(kmap_pte-idx)))
 		BUG();
 #endif
@@ -105,8 +99,8 @@
 
 static inline void kunmap_atomic(void *kvaddr, enum km_type type)
 {
-#if HIGHMEM_DEBUG
-	unsigned long vaddr = (unsigned long) kvaddr;
+#if CONFIG_DEBUG_HIGHMEM
+	unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
 	enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id();
 
 	if (vaddr < FIXADDR_START) // FIXME
diff -urN linux-2.4.18-prepte/include/asm-i386/kmap_types.h linux-2.4.18-highpte/include/asm-i386/kmap_types.h
--- linux-2.4.18-prepte/include/asm-i386/kmap_types.h	Mon Sep 17 13:16:30 2001
+++ linux-2.4.18-highpte/include/asm-i386/kmap_types.h	Thu Mar 21 19:16:24 2002
@@ -1,13 +1,26 @@
 #ifndef _ASM_KMAP_TYPES_H
 #define _ASM_KMAP_TYPES_H
 
+#include <linux/config.h>
+
+#if CONFIG_DEBUG_HIGHMEM
+# define D(n) __KM_FENCE_##n ,
+#else
+# define D(n)
+#endif
+
 enum km_type {
-	KM_BOUNCE_READ,
-	KM_SKB_DATA,
-	KM_SKB_DATA_SOFTIRQ,
-	KM_USER0,
-	KM_USER1,
-	KM_TYPE_NR
+D(0)  KM_BOUNCE_READ,
+D(1)  KM_SKB_DATA,
+D(2)  KM_SKB_DATA_SOFTIRQ,
+D(3)  KM_USER0,
+D(4)  KM_USER1,
+D(5)  KM_BIO_IRQ,
+D(6)  KM_PTE0,
+D(7)  KM_PTE1,
+D(8)  KM_TYPE_NR
 };
+
+#undef D
 
 #endif
diff -urN linux-2.4.18-prepte/include/asm-i386/pgalloc.h linux-2.4.18-highpte/include/asm-i386/pgalloc.h
--- linux-2.4.18-prepte/include/asm-i386/pgalloc.h	Thu Mar 21 19:20:08 2002
+++ linux-2.4.18-highpte/include/asm-i386/pgalloc.h	Thu Mar 21 19:51:50 2002
@@ -5,15 +5,17 @@
 #include <asm/processor.h>
 #include <asm/fixmap.h>
 #include <linux/threads.h>
+#include <linux/highmem.h>
 
-#define pgd_quicklist (current_cpu_data.pgd_quick)
-#define pmd_quicklist (current_cpu_data.pmd_quick)
-#define pte_quicklist (current_cpu_data.pte_quick)
-#define pgtable_cache_size (current_cpu_data.pgtable_cache_sz)
-
-#define pmd_populate(mm, pmd, pte) \
+#define pmd_populate_kernel(mm, pmd, pte) \
 		set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(pte)))
 
+static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte)
+{
+	set_pmd(pmd, __pmd(_PAGE_TABLE +
+		((unsigned long long)(pte - mem_map) <<
+			(unsigned long long) PAGE_SHIFT)));
+}
 /*
  * Allocate and free page tables.
  */
@@ -29,7 +31,7 @@
 extern void kmem_cache_free(struct kmem_cache_s *, void *);
 
 
-static inline pgd_t *get_pgd_slow(void)
+static inline pgd_t *pgd_alloc(struct mm_struct *mm)
 {
 	int i;
 	pgd_t *pgd = kmem_cache_alloc(pae_pgd_cachep, GFP_KERNEL);
@@ -56,7 +58,7 @@
 
 #else
 
-static inline pgd_t *get_pgd_slow(void)
+static inline pgd_t *pgd_alloc(struct mm_struct *mm)
 {
 	pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL);
 
@@ -71,27 +73,7 @@
 
 #endif /* CONFIG_X86_PAE */
 
-static inline pgd_t *get_pgd_fast(void)
-{
-	unsigned long *ret;
-
-	if ((ret = pgd_quicklist) != NULL) {
-		pgd_quicklist = (unsigned long *)(*ret);
-		ret[0] = 0;
-		pgtable_cache_size--;
-	} else
-		ret = (unsigned long *)get_pgd_slow();
-	return (pgd_t *)ret;
-}
-
-static inline void free_pgd_fast(pgd_t *pgd)
-{
-	*(unsigned long *)pgd = (unsigned long) pgd_quicklist;
-	pgd_quicklist = (unsigned long *) pgd;
-	pgtable_cache_size++;
-}
-
-static inline void free_pgd_slow(pgd_t *pgd)
+static inline void pgd_free(pgd_t *pgd)
 {
 #if defined(CONFIG_X86_PAE)
 	int i;
@@ -104,59 +86,64 @@
 #endif
 }
 
-static inline pte_t *pte_alloc_one(struct mm_struct *mm, unsigned long address)
+
+static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
 {
+	int count = 0;
 	pte_t *pte;
-
-	pte = (pte_t *) __get_free_page(GFP_KERNEL);
-	if (pte)
-		clear_page(pte);
+   
+   	do {
+		pte = (pte_t *) __get_free_page(GFP_KERNEL);
+		if (pte)
+			clear_page(pte);
+		else {
+			current->state = TASK_UNINTERRUPTIBLE;
+			schedule_timeout(HZ);
+		}
+	} while (!pte && (count++ < 10));
 	return pte;
 }
-
-static inline pte_t *pte_alloc_one_fast(struct mm_struct *mm,
-					unsigned long address)
+  
+static inline struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
 {
-	unsigned long *ret;
-
-	if ((ret = (unsigned long *)pte_quicklist) != NULL) {
-		pte_quicklist = (unsigned long *)(*ret);
-		ret[0] = ret[1];
-		pgtable_cache_size--;
-	}
-	return (pte_t *)ret;
+	int count = 0;
+	struct page *pte;
+   
+   	do {
+#if CONFIG_HIGHPTE
+		pte = alloc_pages(GFP_KERNEL | __GFP_HIGHMEM, 0);
+#else
+		pte = alloc_pages(GFP_KERNEL, 0);
+#endif
+		if (pte)
+			clear_highpage(pte);
+		else {
+			current->state = TASK_UNINTERRUPTIBLE;
+			schedule_timeout(HZ);
+		}
+	} while (!pte && (count++ < 10));
+	return pte;
 }
-
-static inline void pte_free_fast(pte_t *pte)
+  
+static inline void pte_free_kernel(pte_t *pte)
 {
-	*(unsigned long *)pte = (unsigned long) pte_quicklist;
-	pte_quicklist = (unsigned long *) pte;
-	pgtable_cache_size++;
+	free_page((unsigned long)pte);
 }
-
-static __inline__ void pte_free_slow(pte_t *pte)
+  
+static inline void pte_free(struct page *pte)
 {
-	free_page((unsigned long)pte);
+	__free_page(pte);
 }
-
-#define pte_free(pte)		pte_free_slow(pte)
-#define pgd_free(pgd)		free_pgd_slow(pgd)
-#define pgd_alloc(mm)		get_pgd_fast()
-
+  
 /*
  * allocating and freeing a pmd is trivial: the 1-entry pmd is
  * inside the pgd, so has no extra memory associated with it.
  * (In the PAE case we free the pmds as part of the pgd.)
  */
 
-#define pmd_alloc_one_fast(mm, addr)	({ BUG(); ((pmd_t *)1); })
 #define pmd_alloc_one(mm, addr)		({ BUG(); ((pmd_t *)2); })
-#define pmd_free_slow(x)		do { } while (0)
-#define pmd_free_fast(x)		do { } while (0)
 #define pmd_free(x)			do { } while (0)
 #define pgd_populate(mm, pmd, pte)	BUG()
-
-extern int do_check_pgt_cache(int, int);
 
 /*
  * TLB flushing:
diff -urN linux-2.4.18-prepte/include/asm-i386/pgtable.h linux-2.4.18-highpte/include/asm-i386/pgtable.h
--- linux-2.4.18-prepte/include/asm-i386/pgtable.h	Thu Nov 22 11:46:19 2001
+++ linux-2.4.18-highpte/include/asm-i386/pgtable.h	Thu Mar 21 19:32:57 2002
@@ -315,9 +315,12 @@
 
 #define page_pte(page) page_pte_prot(page, __pgprot(0))
 
-#define pmd_page(pmd) \
+#define pmd_page_kernel(pmd) \
 ((unsigned long) __va(pmd_val(pmd) & PAGE_MASK))
 
+#define pmd_page(pmd) \
+	(mem_map + (pmd_val(pmd) >> PAGE_SHIFT))
+
 /* to find an entry in a page-table-directory. */
 #define pgd_index(address) ((address >> PGDIR_SHIFT) & (PTRS_PER_PGD-1))
 
@@ -334,8 +337,14 @@
 /* Find an entry in the third-level page table.. */
 #define __pte_offset(address) \
 		((address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
-#define pte_offset(dir, address) ((pte_t *) pmd_page(*(dir)) + \
-			__pte_offset(address))
+#define pte_offset_kernel(dir, address) \
+	((pte_t *) pmd_page_kernel(*(dir)) +  __pte_offset(address))
+#define pte_offset_map(dir, address) \
+	((pte_t *)kmap_atomic(pmd_page(*(dir)),KM_PTE0) + __pte_offset(address))
+#define pte_offset_map2(dir, address) \
+	((pte_t *)kmap_atomic(pmd_page(*(dir)),KM_PTE1) + __pte_offset(address))
+#define pte_unmap(pte) kunmap_atomic(pte, KM_PTE0)
+#define pte_unmap2(pte) kunmap_atomic(pte, KM_PTE1)
 
 /*
  * The i386 doesn't have any external MMU info: the kernel page
diff -urN linux-2.4.18-prepte/include/asm-i386/processor.h linux-2.4.18-highpte/include/asm-i386/processor.h
--- linux-2.4.18-prepte/include/asm-i386/processor.h	Thu Nov 22 11:46:19 2001
+++ linux-2.4.18-highpte/include/asm-i386/processor.h	Thu Mar 21 19:32:57 2002
@@ -49,10 +49,6 @@
 	int	f00f_bug;
 	int	coma_bug;
 	unsigned long loops_per_jiffy;
-	unsigned long *pgd_quick;
-	unsigned long *pmd_quick;
-	unsigned long *pte_quick;
-	unsigned long pgtable_cache_sz;
 } __attribute__((__aligned__(SMP_CACHE_BYTES)));
 
 #define X86_VENDOR_INTEL 0
diff -urN linux-2.4.18-prepte/include/linux/highmem.h linux-2.4.18-highpte/include/linux/highmem.h
--- linux-2.4.18-prepte/include/linux/highmem.h	Mon Feb 25 11:38:13 2002
+++ linux-2.4.18-highpte/include/linux/highmem.h	Thu Mar 21 19:32:59 2002
@@ -2,7 +2,6 @@
 #define _LINUX_HIGHMEM_H
 
 #include <linux/config.h>
-#include <asm/pgalloc.h>
 
 #ifdef CONFIG_HIGHMEM
 
@@ -14,7 +13,7 @@
 unsigned int nr_free_highpages(void);
 
 extern struct buffer_head * create_bounce(int rw, struct buffer_head * bh_orig);
-
+extern void check_highmem_ptes(void);
 
 static inline char *bh_kmap(struct buffer_head *bh)
 {
@@ -52,8 +51,9 @@
 
 static inline void clear_highpage(struct page *page)
 {
-	clear_page(kmap(page));
-	kunmap(page);
+	void *kaddr = kmap_atomic(page, KM_USER0);
+	clear_page(kaddr);
+	kunmap_atomic(kaddr, KM_USER0);
 }
 
 /*
@@ -61,15 +61,16 @@
  */
 static inline void memclear_highpage_flush(struct page *page, unsigned int offset, unsigned int size)
 {
-	char *kaddr;
+	void *kaddr;
 
 	if (offset + size > PAGE_SIZE)
 		BUG();
-	kaddr = kmap(page);
-	memset(kaddr + offset, 0, size);
+
+	kaddr = kmap_atomic(page, KM_USER0);
+	memset((char *)kaddr + offset, 0, size);
 	flush_dcache_page(page);
 	flush_page_to_ram(page);
-	kunmap(page);
+	kunmap_atomic(kaddr, KM_USER0);
 }
 
 static inline void copy_user_highpage(struct page *to, struct page *from, unsigned long vaddr)
diff -urN linux-2.4.18-prepte/include/linux/mm.h linux-2.4.18-highpte/include/linux/mm.h
--- linux-2.4.18-prepte/include/linux/mm.h	Thu Mar 21 19:20:20 2002
+++ linux-2.4.18-highpte/include/linux/mm.h	Thu Mar 21 19:32:59 2002
@@ -411,7 +411,8 @@
 
 extern int vmtruncate(struct inode * inode, loff_t offset);
 extern pmd_t *FASTCALL(__pmd_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address));
-extern pte_t *FASTCALL(pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address));
+extern pte_t *FASTCALL(pte_alloc_kernel(struct mm_struct *mm, pmd_t *pmd, unsigned long address));
+extern pte_t *FASTCALL(pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, unsigned long address));
 extern int handle_mm_fault(struct mm_struct *mm,struct vm_area_struct *vma, unsigned long address, int write_access);
 extern int make_pages_present(unsigned long addr, unsigned long end);
 extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write);
@@ -427,7 +428,7 @@
 
 /*
  * On a two-level page table, this ends up being trivial. Thus the
- * inlining and the symmetry break with pte_alloc() that does all
+ * inlining and the symmetry break with pte_alloc_map() that does all
  * of this out-of-line.
  */
 static inline pmd_t *pmd_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
@@ -436,9 +437,6 @@
 		return __pmd_alloc(mm, pgd, address);
 	return pmd_offset(pgd, address);
 }
-
-extern int pgt_cache_water[2];
-extern int check_pgt_cache(void);
 
 extern void free_area_init(unsigned long * zones_size);
 extern void free_area_init_node(int nid, pg_data_t *pgdat, struct page *pmap,
diff -urN linux-2.4.18-prepte/kernel/sched.c linux-2.4.18-highpte/kernel/sched.c
--- linux-2.4.18-prepte/kernel/sched.c	Thu Mar 21 19:20:08 2002
+++ linux-2.4.18-highpte/kernel/sched.c	Thu Mar 21 17:49:37 2002
@@ -20,6 +20,7 @@
 #include <linux/interrupt.h>
 #include <asm/mmu_context.h>
 #include <linux/kernel_stat.h>
+#include <linux/highmem.h>
 
 /*
  * Priority of a process goes from 0 to 139. The 0-99
@@ -751,6 +752,10 @@
 
 	if (unlikely(in_interrupt()))
 		BUG();
+#if CONFIG_DEBUG_HIGHMEM
+	check_highmem_ptes();
+#endif
+
 	release_kernel_lock(prev, smp_processor_id());
 	prev->sleep_timestamp = jiffies;
 	spin_lock_irq(&rq->lock);
diff -urN linux-2.4.18-prepte/kernel/sysctl.c linux-2.4.18-highpte/kernel/sysctl.c
--- linux-2.4.18-prepte/kernel/sysctl.c	Fri Dec 21 09:42:04 2001
+++ linux-2.4.18-highpte/kernel/sysctl.c	Thu Mar 21 17:37:24 2002
@@ -96,8 +96,6 @@
 extern int acct_parm[];
 #endif
 
-extern int pgt_cache_water[];
-
 static int parse_table(int *, int, void *, size_t *, void *, size_t,
 		       ctl_table *, void **);
 static int proc_doutsstring(ctl_table *table, int write, struct file *filp,
@@ -267,8 +265,6 @@
 	 sizeof(sysctl_overcommit_memory), 0644, NULL, &proc_dointvec},
 	{VM_PAGERDAEMON, "kswapd",
 	 &pager_daemon, sizeof(pager_daemon_t), 0644, NULL, &proc_dointvec},
-	{VM_PGT_CACHE, "pagetable_cache", 
-	 &pgt_cache_water, 2*sizeof(int), 0644, NULL, &proc_dointvec},
 	{VM_PAGE_CLUSTER, "page-cluster", 
 	 &page_cluster, sizeof(int), 0644, NULL, &proc_dointvec},
 	{VM_MIN_READAHEAD, "min-readahead",
diff -urN linux-2.4.18-prepte/mm/filemap.c linux-2.4.18-highpte/mm/filemap.c
--- linux-2.4.18-prepte/mm/filemap.c	Mon Feb 25 11:38:13 2002
+++ linux-2.4.18-highpte/mm/filemap.c	Thu Mar 21 18:21:11 2002
@@ -1989,7 +1989,7 @@
 /* Called with mm->page_table_lock held to protect against other
  * threads/the swapper from ripping pte's out from under us.
  */
-static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
+static inline int filemap_sync_pte(pte_t *ptep, pmd_t *pmdp, struct vm_area_struct *vma,
 	unsigned long address, unsigned int flags)
 {
 	pte_t pte = *ptep;
@@ -2005,11 +2005,10 @@
 }
 
 static inline int filemap_sync_pte_range(pmd_t * pmd,
-	unsigned long address, unsigned long size, 
-	struct vm_area_struct *vma, unsigned long offset, unsigned int flags)
+	unsigned long address, unsigned long end, 
+	struct vm_area_struct *vma, unsigned int flags)
 {
-	pte_t * pte;
-	unsigned long end;
+	pte_t *pte;
 	int error;
 
 	if (pmd_none(*pmd))
@@ -2019,27 +2018,26 @@
 		pmd_clear(pmd);
 		return 0;
 	}
-	pte = pte_offset(pmd, address);
-	offset += address & PMD_MASK;
-	address &= ~PMD_MASK;
-	end = address + size;
-	if (end > PMD_SIZE)
-		end = PMD_SIZE;
+	pte = pte_offset_map(pmd, address);
+	if ((address & PMD_MASK) != (end & PMD_MASK))
+		end = (address & PMD_MASK) + PMD_SIZE;
 	error = 0;
 	do {
-		error |= filemap_sync_pte(pte, vma, address + offset, flags);
+		error |= filemap_sync_pte(pte, pmd, vma, address, flags);
 		address += PAGE_SIZE;
 		pte++;
 	} while (address && (address < end));
+
+	pte_unmap(pte - 1);
+
 	return error;
 }
 
 static inline int filemap_sync_pmd_range(pgd_t * pgd,
-	unsigned long address, unsigned long size, 
+	unsigned long address, unsigned long end, 
 	struct vm_area_struct *vma, unsigned int flags)
 {
 	pmd_t * pmd;
-	unsigned long offset, end;
 	int error;
 
 	if (pgd_none(*pgd))
@@ -2050,14 +2048,11 @@
 		return 0;
 	}
 	pmd = pmd_offset(pgd, address);
-	offset = address & PGDIR_MASK;
-	address &= ~PGDIR_MASK;
-	end = address + size;
-	if (end > PGDIR_SIZE)
-		end = PGDIR_SIZE;
+	if ((address & PGDIR_MASK) != (end & PGDIR_MASK))
+		end = (address & PGDIR_MASK) + PGDIR_SIZE;
 	error = 0;
 	do {
-		error |= filemap_sync_pte_range(pmd, address, end - address, vma, offset, flags);
+		error |= filemap_sync_pte_range(pmd, address, end, vma, flags);
 		address = (address + PMD_SIZE) & PMD_MASK;
 		pmd++;
 	} while (address && (address < end));
@@ -2077,11 +2072,11 @@
 	spin_lock(&vma->vm_mm->page_table_lock);
 
 	dir = pgd_offset(vma->vm_mm, address);
-	flush_cache_range(vma->vm_mm, end - size, end);
+	flush_cache_range(vma, address, end);
 	if (address >= end)
 		BUG();
 	do {
-		error |= filemap_sync_pmd_range(dir, address, end - address, vma, flags);
+		error |= filemap_sync_pmd_range(dir, address, end, vma, flags);
 		address = (address + PGDIR_SIZE) & PGDIR_MASK;
 		dir++;
 	} while (address && (address < end));
diff -urN linux-2.4.18-prepte/mm/highmem.c linux-2.4.18-highpte/mm/highmem.c
--- linux-2.4.18-prepte/mm/highmem.c	Thu Mar 21 19:20:08 2002
+++ linux-2.4.18-highpte/mm/highmem.c	Thu Mar 21 17:51:29 2002
@@ -21,6 +21,7 @@
 #include <linux/highmem.h>
 #include <linux/swap.h>
 #include <linux/slab.h>
+#include <asm/pgalloc.h>
 
 /*
  * Virtual_count is not a pure "count".
@@ -446,3 +447,17 @@
 	return bh;
 }
 
+#if CONFIG_DEBUG_HIGHMEM
+void check_highmem_ptes(void)
+{
+	int idx, type;
+ 
+	for (type = 0; type < KM_TYPE_NR; type++) {
+		idx = type + KM_TYPE_NR*smp_processor_id();
+ 		if (!pte_none(*(kmap_pte-idx))) {
+ 			printk("scheduling with KM_TYPE %d held!\n", type);
+ 			BUG();
+ 		}
+ 	}
+}
+#endif
diff -urN linux-2.4.18-prepte/mm/memory.c linux-2.4.18-highpte/mm/memory.c
--- linux-2.4.18-prepte/mm/memory.c	Thu Mar 21 19:20:20 2002
+++ linux-2.4.18-highpte/mm/memory.c	Thu Mar 21 19:35:50 2002
@@ -90,7 +90,7 @@
  */
 static inline void free_one_pmd(pmd_t * dir)
 {
-	pte_t * pte;
+	struct page *pte;
 
 	if (pmd_none(*dir))
 		return;
@@ -99,7 +99,7 @@
 		pmd_clear(dir);
 		return;
 	}
-	pte = pte_offset(dir, 0);
+	pte = pmd_page(*dir);
 	pmd_clear(dir);
 	pte_free(pte);
 }
@@ -125,18 +125,6 @@
 	pmd_free(pmd);
 }
 
-/* Low and high watermarks for page table cache.
-   The system should try to have pgt_water[0] <= cache elements <= pgt_water[1]
- */
-int pgt_cache_water[2] = { 25, 50 };
-
-/* Returns the number of pages freed */
-int check_pgt_cache(void)
-{
-	return do_check_pgt_cache(pgt_cache_water[0], pgt_cache_water[1]);
-}
-
-
 /*
  * This function clears all user-level page tables of a process - this
  * is needed by execve(), so that old pages aren't in the way.
@@ -152,11 +140,59 @@
 		page_dir++;
 	} while (--nr);
 	spin_unlock(&mm->page_table_lock);
+}
+
+pte_t * pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
+{
+	if (!pmd_present(*pmd)) {
+		struct page *new;
 
-	/* keep the page table cache within bounds */
-	check_pgt_cache();
+		spin_unlock(&mm->page_table_lock);
+		new = pte_alloc_one(mm, address);
+		spin_lock(&mm->page_table_lock);
+		if (!new)
+			return NULL;
+
+		/*
+		 * Because we dropped the lock, we should re-check the
+		 * entry, as somebody else could have populated it..
+		 */
+		if (pmd_present(*pmd)) {
+			pte_free(new);
+			goto out;
+		}
+		pmd_populate(mm, pmd, new);
+	}
+out:
+	if (pmd_present(*pmd))
+		return pte_offset_map(pmd, address);
+	return NULL;
 }
 
+pte_t * pte_alloc_kernel(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
+{
+	if (!pmd_present(*pmd)) {
+		pte_t *new;
+
+		spin_unlock(&mm->page_table_lock);
+		new = pte_alloc_one_kernel(mm, address);
+		spin_lock(&mm->page_table_lock);
+		if (!new)
+			return NULL;
+
+		/*
+		 * Because we dropped the lock, we should re-check the
+		 * entry, as somebody else could have populated it..
+		 */
+		if (pmd_present(*pmd)) {
+			pte_free_kernel(new);
+			goto out;
+		}
+		pmd_populate_kernel(mm, pmd, new);
+	}
+out:
+	return pte_offset_kernel(pmd, address);
+}
 #define PTE_TABLE_MASK	((PTRS_PER_PTE-1) * sizeof(pte_t))
 #define PMD_TABLE_MASK	((PTRS_PER_PMD-1) * sizeof(pmd_t))
 
@@ -169,7 +205,7 @@
  *         variable count and make things faster. -jj
  *
  * dst->page_table_lock is held on entry and exit,
- * but may be dropped within pmd_alloc() and pte_alloc().
+ * but may be dropped within pmd_alloc() and pte_alloc_map().
  */
 int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
 			struct vm_area_struct *vma)
@@ -221,12 +257,11 @@
 				goto cont_copy_pmd_range;
 			}
 
-			src_pte = pte_offset(src_pmd, address);
-			dst_pte = pte_alloc(dst, dst_pmd, address);
+			dst_pte = pte_alloc_map(dst, dst_pmd, address);
 			if (!dst_pte)
 				goto nomem;
-
 			spin_lock(&src->page_table_lock);			
+			src_pte = pte_offset_map2(src_pmd, address);
 			do {
 				pte_t pte = *src_pte;
 				struct page *ptepage;
@@ -259,11 +294,16 @@
 
 cont_copy_pte_range:		set_pte(dst_pte, pte);
 cont_copy_pte_range_noset:	address += PAGE_SIZE;
-				if (address >= end)
+				if (address >= end) {
+					pte_unmap2(src_pte);
+					pte_unmap(dst_pte);
 					goto out_unlock;
+				}
 				src_pte++;
 				dst_pte++;
 			} while ((unsigned long)src_pte & PTE_TABLE_MASK);
+			pte_unmap2(src_pte-1);
+			pte_unmap(dst_pte-1);
 			spin_unlock(&src->page_table_lock);
 		
 cont_copy_pmd_range:	src_pmd++;
@@ -292,7 +332,7 @@
 static inline int zap_pte_range(mmu_gather_t *tlb, pmd_t * pmd, unsigned long address, unsigned long size)
 {
 	unsigned long offset;
-	pte_t * ptep;
+	pte_t *ptep;
 	int freed = 0;
 
 	if (pmd_none(*pmd))
@@ -302,7 +342,7 @@
 		pmd_clear(pmd);
 		return 0;
 	}
-	ptep = pte_offset(pmd, address);
+	ptep = pte_offset_map(pmd, address);
 	offset = address & ~PMD_MASK;
 	if (offset + size > PMD_SIZE)
 		size = PMD_SIZE - offset;
@@ -322,6 +362,7 @@
 			pte_clear(ptep);
 		}
 	}
+	pte_unmap(ptep-1);
 
 	return freed;
 }
@@ -414,11 +455,13 @@
 	if (pmd_none(*pmd) || pmd_bad(*pmd))
 		goto out;
 
-	ptep = pte_offset(pmd, address);
-	if (!ptep)
+	ptep = pte_offset_map(pmd, address);
+	if (!ptep) {
 		goto out;
+	}
 
 	pte = *ptep;
+	pte_unmap(ptep);
 	if (pte_present(pte)) {
 		if (!write ||
 		    (pte_write(pte) && pte_dirty(pte)))
@@ -771,10 +814,11 @@
 	if (end > PGDIR_SIZE)
 		end = PGDIR_SIZE;
 	do {
-		pte_t * pte = pte_alloc(mm, pmd, address);
+		pte_t * pte = pte_alloc_map(mm, pmd, address);
 		if (!pte)
 			return -ENOMEM;
 		zeromap_pte_range(pte, address, end - address, prot);
+		pte_unmap(pte);
 		address = (address + PMD_SIZE) & PMD_MASK;
 		pmd++;
 	} while (address && (address < end));
@@ -851,10 +895,11 @@
 		end = PGDIR_SIZE;
 	phys_addr -= address;
 	do {
-		pte_t * pte = pte_alloc(mm, pmd, address);
+		pte_t * pte = pte_alloc_map(mm, pmd, address);
 		if (!pte)
 			return -ENOMEM;
 		remap_pte_range(pte, address, end - address, address + phys_addr, prot);
+		pte_unmap(pte);
 		address = (address + PMD_SIZE) & PMD_MASK;
 		pmd++;
 	} while (address && (address < end));
@@ -940,7 +985,7 @@
  * with the page_table_lock released.
  */
 static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
-	unsigned long address, pte_t *page_table, pte_t pte)
+	unsigned long address, pte_t *page_table, pmd_t *pmd, pte_t pte)
 {
 	struct page *old_page, *new_page;
 
@@ -954,10 +999,12 @@
 		if (reuse) {
 			flush_cache_page(vma, address);
 			establish_pte(vma, address, page_table, pte_mkyoung(pte_mkdirty(pte_mkwrite(pte))));
+			pte_unmap(page_table);
 			spin_unlock(&mm->page_table_lock);
 			return 1;	/* Minor fault */
 		}
 	}
+	pte_unmap(page_table);
 
 	/*
 	 * Ok, we need to copy. Oh, well..
@@ -974,6 +1021,7 @@
 	 * Re-check the pte - we dropped the lock
 	 */
 	spin_lock(&mm->page_table_lock);
+	page_table = pte_offset_map(pmd, address);
 	if (pte_same(*page_table, pte)) {
 		if (PageReserved(old_page))
 			++mm->rss;
@@ -983,12 +1031,14 @@
 		/* Free the old page.. */
 		new_page = old_page;
 	}
+	pte_unmap(page_table);
 	spin_unlock(&mm->page_table_lock);
 	page_cache_release(new_page);
 	page_cache_release(old_page);
 	return 1;	/* Minor fault */
 
 bad_wp_page:
+	pte_unmap(page_table);
 	spin_unlock(&mm->page_table_lock);
 	printk("do_wp_page: bogus page at address %08lx (page 0x%lx)\n",address,(unsigned long)old_page);
 	return -1;
@@ -1110,13 +1160,14 @@
  */
 static int do_swap_page(struct mm_struct * mm,
 	struct vm_area_struct * vma, unsigned long address,
-	pte_t * page_table, pte_t orig_pte, int write_access)
+	pte_t *page_table, pmd_t *pmd, pte_t orig_pte, int write_access)
 {
 	struct page *page;
 	swp_entry_t entry = pte_to_swp_entry(orig_pte);
 	pte_t pte;
 	int ret = 1;
 
+	pte_unmap(page_table);
 	spin_unlock(&mm->page_table_lock);
 	page = lookup_swap_cache(entry);
 	if (!page) {
@@ -1129,7 +1180,9 @@
 			 */
 			int retval;
 			spin_lock(&mm->page_table_lock);
+			page_table = pte_offset_map(pmd, address);
 			retval = pte_same(*page_table, orig_pte) ? -1 : 1;
+			pte_unmap(page_table);
 			spin_unlock(&mm->page_table_lock);
 			return retval;
 		}
@@ -1147,7 +1200,9 @@
 	 * released the page table lock.
 	 */
 	spin_lock(&mm->page_table_lock);
+	page_table = pte_offset_map(pmd, address);
 	if (!pte_same(*page_table, orig_pte)) {
+		pte_unmap(page_table);
 		spin_unlock(&mm->page_table_lock);
 		unlock_page(page);
 		page_cache_release(page);
@@ -1172,6 +1227,7 @@
 
 	/* No need to invalidate - it was non-present before */
 	update_mmu_cache(vma, address, pte);
+	pte_unmap(page_table);
 	spin_unlock(&mm->page_table_lock);
 	return ret;
 }
@@ -1181,7 +1237,7 @@
  * spinlock held to protect against concurrent faults in
  * multithreaded programs. 
  */
-static int do_anonymous_page(struct mm_struct * mm, struct vm_area_struct * vma, pte_t *page_table, int write_access, unsigned long addr)
+static int do_anonymous_page(struct mm_struct * mm, struct vm_area_struct * vma, pte_t *page_table, pmd_t *pmd, int write_access, unsigned long addr)
 {
 	pte_t entry;
 
@@ -1193,6 +1249,7 @@
 		struct page *page;
 
 		/* Allocate our own private page. */
+		pte_unmap(page_table);
 		spin_unlock(&mm->page_table_lock);
 
 		page = alloc_page(GFP_HIGHUSER);
@@ -1201,7 +1258,10 @@
 		clear_user_highpage(page, addr);
 
 		spin_lock(&mm->page_table_lock);
+		page_table = pte_offset_map(pmd, addr);
+
 		if (!pte_none(*page_table)) {
+			pte_unmap(page_table);
 			page_cache_release(page);
 			spin_unlock(&mm->page_table_lock);
 			return 1;
@@ -1214,6 +1274,7 @@
 	}
 
 	set_pte(page_table, entry);
+	pte_unmap(page_table);
 
 	/* No need to invalidate - it was non-present before */
 	update_mmu_cache(vma, addr, entry);
@@ -1237,13 +1298,14 @@
  * spinlock held. Exit with the spinlock released.
  */
 static int do_no_page(struct mm_struct * mm, struct vm_area_struct * vma,
-	unsigned long address, int write_access, pte_t *page_table)
+	unsigned long address, int write_access, pte_t *page_table, pmd_t *pmd)
 {
 	struct page * new_page;
 	pte_t entry;
 
 	if (!vma->vm_ops || !vma->vm_ops->nopage)
-		return do_anonymous_page(mm, vma, page_table, write_access, address);
+		return do_anonymous_page(mm, vma, page_table, pmd, write_access, address);
+	pte_unmap(page_table);
 	spin_unlock(&mm->page_table_lock);
 
 	new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, 0);
@@ -1269,6 +1331,8 @@
 	}
 
 	spin_lock(&mm->page_table_lock);
+	page_table = pte_offset_map(pmd, address);
+
 	/*
 	 * This silly early PAGE_DIRTY setting removes a race
 	 * due to the bad i386 page protection. But it's valid
@@ -1288,8 +1352,10 @@
 		if (write_access)
 			entry = pte_mkwrite(pte_mkdirty(entry));
 		set_pte(page_table, entry);
+		pte_unmap(page_table);
 	} else {
 		/* One of our sibling threads was faster, back out. */
+		pte_unmap(page_table);
 		page_cache_release(new_page);
 		spin_unlock(&mm->page_table_lock);
 		return 1;
@@ -1324,7 +1390,7 @@
  */
 static inline int handle_pte_fault(struct mm_struct *mm,
 	struct vm_area_struct * vma, unsigned long address,
-	int write_access, pte_t * pte)
+	int write_access, pte_t *pte, pmd_t *pmd)
 {
 	pte_t entry;
 
@@ -1336,18 +1402,19 @@
 		 * drop the lock.
 		 */
 		if (pte_none(entry))
-			return do_no_page(mm, vma, address, write_access, pte);
-		return do_swap_page(mm, vma, address, pte, entry, write_access);
+			return do_no_page(mm, vma, address, write_access, pte, pmd);
+		return do_swap_page(mm, vma, address, pte, pmd, entry, write_access);
 	}
 
 	if (write_access) {
 		if (!pte_write(entry))
-			return do_wp_page(mm, vma, address, pte, entry);
+			return do_wp_page(mm, vma, address, pte, pmd, entry);
 
 		entry = pte_mkdirty(entry);
 	}
 	entry = pte_mkyoung(entry);
 	establish_pte(vma, address, pte, entry);
+	pte_unmap(pte);
 	spin_unlock(&mm->page_table_lock);
 	return 1;
 }
@@ -1372,9 +1439,9 @@
 	pmd = pmd_alloc(mm, pgd, address);
 
 	if (pmd) {
-		pte_t * pte = pte_alloc(mm, pmd, address);
+		pte_t * pte = pte_alloc_map(mm, pmd, address);
 		if (pte)
-			return handle_pte_fault(mm, vma, address, write_access, pte);
+			return handle_pte_fault(mm, vma, address, write_access, pte, pmd);
 	}
 	spin_unlock(&mm->page_table_lock);
 	return -1;
@@ -1393,64 +1460,25 @@
 {
 	pmd_t *new;
 
-	/* "fast" allocation can happen without dropping the lock.. */
-	new = pmd_alloc_one_fast(mm, address);
-	if (!new) {
-		spin_unlock(&mm->page_table_lock);
-		new = pmd_alloc_one(mm, address);
-		spin_lock(&mm->page_table_lock);
-		if (!new)
-			return NULL;
+	spin_unlock(&mm->page_table_lock);
+	new = pmd_alloc_one(mm, address);
+	spin_lock(&mm->page_table_lock);
+	if (!new)
+		return NULL;
 
-		/*
-		 * Because we dropped the lock, we should re-check the
-		 * entry, as somebody else could have populated it..
-		 */
-		if (!pgd_none(*pgd)) {
-			pmd_free(new);
-			goto out;
-		}
+	/*
+	 * Because we dropped the lock, we should re-check the
+	 * entry, as somebody else could have populated it..
+	 */
+	if (pgd_present(*pgd)) {
+		pmd_free(new);
+		goto out;
 	}
 	pgd_populate(mm, pgd, new);
 out:
 	return pmd_offset(pgd, address);
 }
 
-/*
- * Allocate the page table directory.
- *
- * We've already handled the fast-path in-line, and we own the
- * page table lock.
- */
-pte_t *pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
-{
-	if (pmd_none(*pmd)) {
-		pte_t *new;
-
-		/* "fast" allocation can happen without dropping the lock.. */
-		new = pte_alloc_one_fast(mm, address);
-		if (!new) {
-			spin_unlock(&mm->page_table_lock);
-			new = pte_alloc_one(mm, address);
-			spin_lock(&mm->page_table_lock);
-			if (!new)
-				return NULL;
-
-			/*
-			 * Because we dropped the lock, we should re-check the
-			 * entry, as somebody else could have populated it..
-			 */
-			if (!pmd_none(*pmd)) {
-				pte_free(new);
-				goto out;
-			}
-		}
-		pmd_populate(mm, pmd, new);
-	}
-out:
-	return pte_offset(pmd, address);
-}
-
 int make_pages_present(unsigned long addr, unsigned long end)
 {
 	int ret, len, write;
@@ -1481,13 +1509,11 @@
 	if (!pgd_none(*pgd)) {
 		pmd = pmd_offset(pgd, addr);
 		if (!pmd_none(*pmd)) {
-			preempt_disable();
 			ptep = pte_offset_map(pmd, addr);
 			pte = *ptep;
 			if (pte_present(pte))
 				page = pte_page(pte);
 			pte_unmap(ptep);
-			preempt_enable();
 		}
 	}
 	/*
diff -urN linux-2.4.18-prepte/mm/mprotect.c linux-2.4.18-highpte/mm/mprotect.c
--- linux-2.4.18-prepte/mm/mprotect.c	Mon Sep 17 15:30:23 2001
+++ linux-2.4.18-highpte/mm/mprotect.c	Thu Mar 21 17:37:24 2002
@@ -11,6 +11,7 @@
 #include <asm/uaccess.h>
 #include <asm/pgalloc.h>
 #include <asm/pgtable.h>
+#include <linux/highmem.h>
 
 static inline void change_pte_range(pmd_t * pmd, unsigned long address,
 	unsigned long size, pgprot_t newprot)
@@ -25,7 +26,7 @@
 		pmd_clear(pmd);
 		return;
 	}
-	pte = pte_offset(pmd, address);
+	pte = pte_offset_map(pmd, address);
 	address &= ~PMD_MASK;
 	end = address + size;
 	if (end > PMD_SIZE)
@@ -44,6 +45,7 @@
 		address += PAGE_SIZE;
 		pte++;
 	} while (address && (address < end));
+	pte_unmap(pte - 1);
 }
 
 static inline void change_pmd_range(pgd_t * pgd, unsigned long address,
diff -urN linux-2.4.18-prepte/mm/mremap.c linux-2.4.18-highpte/mm/mremap.c
--- linux-2.4.18-prepte/mm/mremap.c	Thu Sep 20 20:31:26 2001
+++ linux-2.4.18-highpte/mm/mremap.c	Thu Mar 21 17:37:24 2002
@@ -15,7 +15,7 @@
 
 extern int vm_enough_memory(long pages);
 
-static inline pte_t *get_one_pte(struct mm_struct *mm, unsigned long addr)
+static inline pte_t *get_one_pte_map2(struct mm_struct *mm, unsigned long addr)
 {
 	pgd_t * pgd;
 	pmd_t * pmd;
@@ -39,21 +39,23 @@
 		goto end;
 	}
 
-	pte = pte_offset(pmd, addr);
-	if (pte_none(*pte))
+	pte = pte_offset_map2(pmd, addr);
+	if (pte_none(*pte)) {
+		pte_unmap2(pte);
 		pte = NULL;
+	}
 end:
 	return pte;
 }
 
-static inline pte_t *alloc_one_pte(struct mm_struct *mm, unsigned long addr)
+static inline pte_t *alloc_one_pte_map(struct mm_struct *mm, unsigned long addr)
 {
 	pmd_t * pmd;
 	pte_t * pte = NULL;
 
 	pmd = pmd_alloc(mm, pgd_offset(mm, addr), addr);
 	if (pmd)
-		pte = pte_alloc(mm, pmd, addr);
+		pte = pte_alloc_map(mm, pmd, addr);
 	return pte;
 }
 
@@ -77,12 +79,16 @@
 static int move_one_page(struct mm_struct *mm, unsigned long old_addr, unsigned long new_addr)
 {
 	int error = 0;
-	pte_t * src;
+	pte_t *src, *dst;
 
 	spin_lock(&mm->page_table_lock);
-	src = get_one_pte(mm, old_addr);
-	if (src)
-		error = copy_one_pte(mm, src, alloc_one_pte(mm, new_addr));
+	src = get_one_pte_map2(mm, old_addr);
+	if (src) {
+		dst = alloc_one_pte_map(mm, new_addr);
+		error = copy_one_pte(mm, src, dst);
+		pte_unmap2(src);
+		pte_unmap(dst);
+	}
 	spin_unlock(&mm->page_table_lock);
 	return error;
 }
diff -urN linux-2.4.18-prepte/mm/swapfile.c linux-2.4.18-highpte/mm/swapfile.c
--- linux-2.4.18-prepte/mm/swapfile.c	Mon Feb 25 11:38:14 2002
+++ linux-2.4.18-highpte/mm/swapfile.c	Thu Mar 21 17:37:24 2002
@@ -393,7 +393,7 @@
 		pmd_clear(dir);
 		return;
 	}
-	pte = pte_offset(dir, address);
+	pte = pte_offset_map(dir, address);
 	offset += address & PMD_MASK;
 	address &= ~PMD_MASK;
 	end = address + size;
@@ -404,6 +404,7 @@
 		address += PAGE_SIZE;
 		pte++;
 	} while (address && (address < end));
+	pte_unmap(pte - 1);
 }
 
 /* mmlist_lock and vma->vm_mm->page_table_lock are held */
diff -urN linux-2.4.18-prepte/mm/vmalloc.c linux-2.4.18-highpte/mm/vmalloc.c
--- linux-2.4.18-prepte/mm/vmalloc.c	Mon Feb 25 11:38:14 2002
+++ linux-2.4.18-highpte/mm/vmalloc.c	Thu Mar 21 17:37:24 2002
@@ -31,7 +31,7 @@
 		pmd_clear(pmd);
 		return;
 	}
-	pte = pte_offset(pmd, address);
+	pte = pte_offset_kernel(pmd, address);
 	address &= ~PMD_MASK;
 	end = address + size;
 	if (end > PMD_SIZE)
@@ -126,7 +126,7 @@
 	if (end > PGDIR_SIZE)
 		end = PGDIR_SIZE;
 	do {
-		pte_t * pte = pte_alloc(&init_mm, pmd, address);
+		pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address);
 		if (!pte)
 			return -ENOMEM;
 		if (alloc_area_pte(pte, address, end - address, gfp_mask, prot))
diff -urN linux-2.4.18-prepte/mm/vmscan.c linux-2.4.18-highpte/mm/vmscan.c
--- linux-2.4.18-prepte/mm/vmscan.c	Mon Feb 25 11:38:14 2002
+++ linux-2.4.18-highpte/mm/vmscan.c	Thu Mar 21 17:37:24 2002
@@ -166,7 +166,7 @@
 		return count;
 	}
 	
-	pte = pte_offset(dir, address);
+	pte = pte_offset_map(dir, address);
 	
 	pmd_end = (address + PMD_SIZE) & PMD_MASK;
 	if (end > pmd_end)
@@ -180,6 +180,7 @@
 				count -= try_to_swap_out(mm, vma, address, pte, page, classzone);
 				if (!count) {
 					address += PAGE_SIZE;
+					pte++;
 					break;
 				}
 			}
@@ -187,6 +188,7 @@
 		address += PAGE_SIZE;
 		pte++;
 	} while (address && (address < end));
+	pte_unmap(pte - 1);
 	mm->swap_address = address;
 	return count;
 }


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: Backport of Ingo/Arjan highpte to 2.4.18 (+O1 scheduler)
  2002-03-21 23:10 Backport of Ingo/Arjan highpte to 2.4.18 (+O1 scheduler) Martin J. Bligh
  2002-03-21 23:20 ` Martin J. Bligh
@ 2002-03-26 17:08 ` Andrea Arcangeli
  2002-03-26 18:02   ` Martin J. Bligh
  2002-03-29 22:49   ` Backport of Ingo/Arjan highpte to 2.4.18 (+O1 scheduler) Martin J. Bligh
  1 sibling, 2 replies; 9+ messages in thread
From: Andrea Arcangeli @ 2002-03-26 17:08 UTC (permalink / raw)
  To: Martin J. Bligh; +Cc: linux-kernel, gerrit

On Thu, Mar 21, 2002 at 03:10:54PM -0800, Martin J. Bligh wrote:
> diff -urN linux-2.4.18-prepte/mm/mremap.c linux-2.4.18-highpte/mm/mremap.c
> --- linux-2.4.18-prepte/mm/mremap.c	Thu Sep 20 20:31:26 2001
> +++ linux-2.4.18-highpte/mm/mremap.c	Thu Mar 21 17:37:24 2002
> @@ -15,7 +15,7 @@
>  
>  extern int vm_enough_memory(long pages);
>  
> -static inline pte_t *get_one_pte(struct mm_struct *mm, unsigned long addr)
> +static inline pte_t *get_one_pte_map2(struct mm_struct *mm, unsigned long addr)
>  {
>  	pgd_t * pgd;
>  	pmd_t * pmd;
> @@ -39,21 +39,23 @@
>  		goto end;
>  	}
>  
> -	pte = pte_offset(pmd, addr);
> -	if (pte_none(*pte))
> +	pte = pte_offset_map2(pmd, addr);
> +	if (pte_none(*pte)) {
> +		pte_unmap2(pte);
>  		pte = NULL;
> +	}
>  end:
>  	return pte;
>  }
>  
> -static inline pte_t *alloc_one_pte(struct mm_struct *mm, unsigned long addr)
> +static inline pte_t *alloc_one_pte_map(struct mm_struct *mm, unsigned long addr)
>  {
>  	pmd_t * pmd;
>  	pte_t * pte = NULL;
>  
>  	pmd = pmd_alloc(mm, pgd_offset(mm, addr), addr);
>  	if (pmd)
> -		pte = pte_alloc(mm, pmd, addr);
> +		pte = pte_alloc_map(mm, pmd, addr);
>  	return pte;
>  }
>  
> @@ -77,12 +79,16 @@
>  static int move_one_page(struct mm_struct *mm, unsigned long old_addr, unsigned long new_addr)
>  {
>  	int error = 0;
> -	pte_t * src;
> +	pte_t *src, *dst;
>  
>  	spin_lock(&mm->page_table_lock);
> -	src = get_one_pte(mm, old_addr);
> -	if (src)
> -		error = copy_one_pte(mm, src, alloc_one_pte(mm, new_addr));
> +	src = get_one_pte_map2(mm, old_addr);
> +	if (src) {
> +		dst = alloc_one_pte_map(mm, new_addr);
> +		error = copy_one_pte(mm, src, dst);
> +		pte_unmap2(src);
> +		pte_unmap(dst);
> +	}
>  	spin_unlock(&mm->page_table_lock);
>  	return error;
>  }

while removing the persistent kmaps in pte-highmem from the fast paths,
I had a look how you solved this mremap thing without the persistent
kmaps. First, your backport is clearly broken because it will oops in
copy_one_pte if the alloc_one_pte_map fails.

2.5.7 gets such bit right (checking for dst == null within
copy_one_pte), but it's broken too and it will be unstable with
high-pte enabled. Either it's a coincidence or Ingo copied the changes
from my pte-highmem but he apparently forgot that his pte_offset_map
isn't persistent. The code works fine in pte-highmem, and it will break
on 2.5.7 due the atomic kmaps. The reason is that it's doing a
pte_alloc within a pte_offset_map_nested, so the pte_offset_map_nested
can be corrupted if pte_alloc sleeps while allocating the memory for the
pagetable.

Now, I'm not nearly backporting Ingo's code to 2.4, in particular the
drop of the quicklist without an affine allocator is a performance
optimization removal that doesn't make sense in 2.4, and as well the
whole pte_offset_kernel thing is flawed in general IMHO (in particular
in 2.5 where all drivers are using proper api to get physical pages of
vmalloc space). mremap isn't an extremely fast path like the anon page
faults/execve etc.., so in pte-highmem I'm going to leave the persistent
kmap in get_one_pte, to keep the code simple and to avoid overwork
with preallocation of memory or ugly #ifdefs around.  That's one of the
purposes of the persistent kmaps of course, to keep the code clean and
efficient (think UP here).  And in 2.5, once we'll switch all the
pte_offset_map to use the KM_USER_PAGETABLE0 entry in the user-fixmap
area, those mremap mappings will be automatically persistent and the
current 2.5 bug will go away automatically that way. Then also all the
memory.c changes that I will have to backport to make your workload
scale (senseless in particular with a 64bit point of view) in Ingo's
patch can be backed out from 2.5 (with the exclusion of the pte_unmap*
of course :).

BTW, another place where I'm not going to write ugly code in 2.4 is
/proc/, while it could be atomic in mainline, I've a scheduling point
there to provide lower latency, and I don't want to clobber it with
stuff like:

	if (current->need_resched) {
		pte_kunmap(pte);
		schedule();
		pte = pte_offset(...)
	}

just to make `ps` faster.

The persistent kmaps scales less on a 16-way NUMA-Q (not even a problem
for normal servers), so I'm just replacing the absolutely required fast
paths to make the kernel compile faster, but in all the slow paths where
I'd need to write ugly code, I'm definitely keeping the persistent
kmaps.

With the user-fixmap that you proposed some day ago, this whole thing
will be solved cleanly in 2.5. 2.4 pte-highmem will remain a hybrid.

Then there's also the naming issues of the pte_offset calls, for now I'm
keeping them called:

	pte_offset_atomic/pte_kunmap (atomic)
	pte_offset/pte_kunmap (persistent)
	pte_offset_atomic2/pte_kunmap_atomic2 (atomic 2nd entry)

the original names in pte-highmem, because of this hybrid thing and to
remember it works differently than the atomic-and-persistent user-fixmap
that we'll soon have in 2.5 with pte_offset_map.

(I could have pte_kunmap to take care of pte_offset_atomic2 too, but
doing explicit is faster and it happens only in a few places anyways, if
you prefer to have a pte_kunmap able to unmap pte_offset_atomic2 too let
me know)

I'm not considering to drop pte-highmem from 2.4 and to only support the
user-fixmaps in 2.5 because it is a showstopper bugfix for lots of
important users that definitely cannot wait 2.6. I'm also not
considering backporting the user-fixmaps because that would be a quite
invasive change messing also with the alignment of the user stack (I
know it could stay into kernel space, but right after the user stack it
will be more optimized and cleaner/simpler, so I prefer to put the few
virtual pages there).

Andrea

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: Backport of Ingo/Arjan highpte to 2.4.18 (+O1 scheduler)
  2002-03-26 17:08 ` Andrea Arcangeli
@ 2002-03-26 18:02   ` Martin J. Bligh
  2002-03-26 18:18     ` Andrea Arcangeli
  2002-03-29 22:49   ` Backport of Ingo/Arjan highpte to 2.4.18 (+O1 scheduler) Martin J. Bligh
  1 sibling, 1 reply; 9+ messages in thread
From: Martin J. Bligh @ 2002-03-26 18:02 UTC (permalink / raw)
  To: Andrea Arcangeli, Arjan van de Ven; +Cc: linux-kernel, gerrit


> First, your backport is clearly broken because it will oops in
> copy_one_pte if the alloc_one_pte_map fails.

That doesn't surprise me ... I did a quick backport without staring
at the code too much, just so I could get some testing number to
see what the difference in performance would be. Arjan is doing
a proper backport to 2.4, and he obviously knows this patch far
better than I, so hopefully he'll address this ;-)

Thanks for pointing this out.

> ....

The bulk of the rest of this will take me a while to think about ;-)

Thanks,

M.

PS. The backport of the 2.5 highpte stuff works fine for me in limited
touch-testing, but I don't have it playing with the discontigmem stuff
yet, so I can't give you any numbers at the moment ...



^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: Backport of Ingo/Arjan highpte to 2.4.18 (+O1 scheduler)
  2002-03-26 18:02   ` Martin J. Bligh
@ 2002-03-26 18:18     ` Andrea Arcangeli
  2002-03-28 22:48       ` Testing of Ingo/Arjan highpte on 16-way NUMA-Q Martin J. Bligh
  0 siblings, 1 reply; 9+ messages in thread
From: Andrea Arcangeli @ 2002-03-26 18:18 UTC (permalink / raw)
  To: Martin J. Bligh; +Cc: Arjan van de Ven, linux-kernel, gerrit

On Tue, Mar 26, 2002 at 10:02:27AM -0800, Martin J. Bligh wrote:
> 
> > First, your backport is clearly broken because it will oops in
> > copy_one_pte if the alloc_one_pte_map fails.
> 
> That doesn't suprise me ... I did a quick backport without staring
> at the code too much, just so I could get some testing number to
> see what the difference in performance would be. Arjan is doing
> a proper backport to 2.4, and he obviously knows this patch far

I need persistent kmaps for the pagetables, and also the quicklists, so
I'm not excited to integrate a 2.4 backport of a halfway solution that
isn't optimal for 2.5 either (plus at the moment it's buggy and
incidentally the right fix is to add the persistent kmaps in 2.5 too, go
figure).

Andrea

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Testing of Ingo/Arjan highpte on 16-way NUMA-Q
  2002-03-26 18:18     ` Andrea Arcangeli
@ 2002-03-28 22:48       ` Martin J. Bligh
  0 siblings, 0 replies; 9+ messages in thread
From: Martin J. Bligh @ 2002-03-28 22:48 UTC (permalink / raw)
  To: Arjan van de Ven; +Cc: linux-kernel

Adding your highpte patches to the NUMA-Q seems to make no
performance impact whatsoever (just doing kernel compile). The
profiles are so similar, it almost looks like it's not doing anything at
all ;-) I find this a little strange, as all my ZONE_NORMAL is on
node 0, so I'd expected this to have some sort of impact (either 
positive or negative ;-)). 

Is there an easy way to test whether this was working or not?

Thanks,

M.

PS. yes I did remember to turn on the 4G-highpte option ;-)




^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: Backport of Ingo/Arjan highpte to 2.4.18 (+O1 scheduler)
  2002-03-26 17:08 ` Andrea Arcangeli
  2002-03-26 18:02   ` Martin J. Bligh
@ 2002-03-29 22:49   ` Martin J. Bligh
  2002-03-30 19:44     ` Andrea Arcangeli
  1 sibling, 1 reply; 9+ messages in thread
From: Martin J. Bligh @ 2002-03-29 22:49 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: linux-kernel, gerrit

> I'm not considering to drop pte-highmem from 2.4 and to only support the
> user-fixmaps in 2.5 because it is a showstopper bugfix for lots of
> important users that definitely cannot wait 2.6. I'm also not
> considering backporting the user-fixmaps because that would be a quite
> invasive change messing also with the alignment of the user stack (I
> know it could stay into kernel space, but right after the user stack it
> will be more optimized and cleaner/simpler, so I prefer to put the few
> virtual pages there).

Can you explain the problem with the alignment of the user stack? I can't
see what the problem is here .... and we need to start thinking about how
to fix it if you've seen a problem that we haven't ....

Thanks,

M.



^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: Backport of Ingo/Arjan highpte to 2.4.18 (+O1 scheduler)
  2002-03-29 22:49   ` Backport of Ingo/Arjan highpte to 2.4.18 (+O1 scheduler) Martin J. Bligh
@ 2002-03-30 19:44     ` Andrea Arcangeli
  2002-03-31 14:51       ` Andrea Arcangeli
  0 siblings, 1 reply; 9+ messages in thread
From: Andrea Arcangeli @ 2002-03-30 19:44 UTC (permalink / raw)
  To: Martin J. Bligh; +Cc: linux-kernel, gerrit

On Fri, Mar 29, 2002 at 02:49:07PM -0800, Martin J. Bligh wrote:
> > I'm not considering to drop pte-highmem from 2.4 and to only support the
> > user-fixmaps in 2.5 because it is a showstopper bugfix for lots of
> > important users that definitely cannot wait 2.6. I'm also not
> > considering backporting the user-fixmaps because that would be a quite
> > invasive change messing also with the alignment of the user stack (I
> > know it could stay into kernel space, but right after the user stack it
> > will be more optimized and cleaner/simpler, so I prefer to put the few
> > virtual pages there).
> 
> Can you explain the problem with the aligment of the user stack? I can't
> see what the problem is here .... and we need to start thinking about how
> to fix it if you've seen a problem that we haven't ....

the user stack never started on a non power of two, so far, so some
buggy app may make assumptions on that, and bugs could trigger then. No
app should break in theory. If somebody is doing a map fixed near the
stack using a and-mask (instead of a minus) on %esp to calculate the
offset of the map-fixed, that could break if we eat a copule of virtual
pages of stack. I triggered something related when I was using a bit
larger heap-stack-gap, I know there are apps putting mappings very near
the stack (the one I've in mind isn't even open source so I don't know
how it chooses the address), but most probably correctly none of them
makes assumptions on the alignment of the top of the stack, so it
should be safe, but you know you never know :). Also given the
complexity of the patch, and also given it's definitely not necessary
in 2.4 just to get the scalability bit I recommend it for 2.5 only (my
new scalable pte-highmem using atomics in the fast paths and persistent
in the slow paths where 2.5 is also currently broken is just working
just fine on top of pre4).  I was ready to release it today (just
running for one day of stress testing), but pre5 came out so I'll have
to spend some more hours to slowly resync and rediff.

Andrea

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: Backport of Ingo/Arjan highpte to 2.4.18 (+O1 scheduler)
  2002-03-30 19:44     ` Andrea Arcangeli
@ 2002-03-31 14:51       ` Andrea Arcangeli
  0 siblings, 0 replies; 9+ messages in thread
From: Andrea Arcangeli @ 2002-03-31 14:51 UTC (permalink / raw)
  To: Martin J. Bligh; +Cc: linux-kernel, gerrit

On Sat, Mar 30, 2002 at 08:44:44PM +0100, Andrea Arcangeli wrote:
> just fine on top of pre4).  I was ready to release it today (just
> running for one day of stress testing), but pre5 came out so I'll have
> to spend some more hour to slowly resync and rediff.

Ok, after a few hours of a very pleasant resyncing work and re-testing
(it was great to see lots of good stuff integrated from Marcelo) it's
out right now.  If you could retest the NUMA-Q support on top of
2.4.19pre5aa1 that would be helpful to know if the persistent kmap is a
problem in mremap (the place where it's currently buggy in 2.5). Thanks!

Andrea

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2002-03-31 14:51 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2002-03-21 23:10 Backport of Ingo/Arjan highpte to 2.4.18 (+O1 scheduler) Martin J. Bligh
2002-03-21 23:20 ` Martin J. Bligh
2002-03-26 17:08 ` Andrea Arcangeli
2002-03-26 18:02   ` Martin J. Bligh
2002-03-26 18:18     ` Andrea Arcangeli
2002-03-28 22:48       ` Testing of Ingo/Arjan highpte on 16-way NUMA-Q Martin J. Bligh
2002-03-29 22:49   ` Backport of Ingo/Arjan highpte to 2.4.18 (+O1 scheduler) Martin J. Bligh
2002-03-30 19:44     ` Andrea Arcangeli
2002-03-31 14:51       ` Andrea Arcangeli

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).