* [PATCH] i386 uaccess to fixmap pages
@ 2003-05-09  2:03 Roland McGrath
  2003-05-09  4:31 ` Andrew Morton
  0 siblings, 1 reply; 22+ messages in thread
From: Roland McGrath @ 2003-05-09  2:03 UTC
  To: Linus Torvalds; +Cc: linux-kernel

This patch against 2.5.69 makes uaccess (i.e., access_ok and get/put_user)
to user-accessible fixmap addresses on x86 work.  This is the last missing
piece in restoring the invariant that all kinds of special kernel access
"to user process virtual addresses" do in fact permit exactly all the same
access to all the same addresses that normal loads and stores in the user
process can do.

This is not only a hypothetical concern.  glibc's dynamic linker uses
direct pointers into mapped DSO images for strings such as symbol names and
library soname strings.  When using the vsyscall DSO, these pointers are in
the fixmap area.  To minimize copying and stack use, the dynamic linker
writes debugging messages using writev, where the %s for a string like a
symbol name will be an iovec pointing directly into the mapped image.  In
2.5.69, all such writevs referring to strings in the vsyscall DSO image
fail with EFAULT, and the expected output is elided or mangled.
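
To make that concrete, here is a minimal userland reproducer - an
illustration for this mail, not glibc's actual code - that locates the
vsyscall DSO through the auxiliary vector and writes a few bytes straight
out of its image with writev:

#include <elf.h>
#include <stdio.h>
#include <sys/uio.h>
#include <unistd.h>

/* Illustrative only: find the vsyscall DSO via the aux vector and
 * writev() directly out of its image.  On 2.5.69 the writev fails
 * with EFAULT even though the page is perfectly readable from user
 * mode. */
int main(int argc, char **argv, char **envp)
{
	Elf32_auxv_t *av;
	char *dso = NULL;
	struct iovec iov;

	while (*envp != NULL)		/* auxv sits past the environment */
		++envp;
	for (av = (Elf32_auxv_t *) (envp + 1); av->a_type != AT_NULL; ++av)
		if (av->a_type == AT_SYSINFO_EHDR)
			dso = (char *) av->a_un.a_val;
	if (dso == NULL)
		return 1;

	iov.iov_base = dso;		/* "\177ELF" inside the fixmap area */
	iov.iov_len = 4;
	if (writev(STDOUT_FILENO, &iov, 1) < 0)
		perror("writev");	/* EFAULT without this patch */
	return 0;
}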

This patch modifies pervasively used code like access_ok and get_user, so
it looks scary and performance-problematical.  However, the new code is
only in the paths leading to EFAULT; the straight-through code paths are
wholly unchanged.  So in fact it should not have any performance impact.
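
(Note the shape of the new access_ok below: the || short-circuits, so when
__range_ok succeeds, __fixmap_access_ok is never evaluated and the fast
path executes exactly as it did before.)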

I have tested this a fair bit, using system calls like writev and using a
contrivance to test get_user for 1, 2, and 4 byte sizes.  The essential
code that decides what is ok is the same code in my get_user_pages change
that already went in.  This patch pulls most of that code out into a
private function shared by get_user_pages and the new __fixmap_access_ok.

I changed the get_user assembly code only for completeness.  Pragmatically
speaking, that part of the patch can probably be left out without harm.  I
can't offhand think of a real-world case in which someone will need 1/2/4
byte get_user to work with fixmap addresses.  System calls taking
strings/buffers, like write, are what really matter.


Thanks,
Roland



--- linux-2.5.69/include/asm-i386/uaccess.h.~1~	Sun May  4 16:53:02 2003
+++ linux-2.5.69/include/asm-i386/uaccess.h	Wed May  7 17:20:13 2003
@@ -62,6 +62,8 @@ int __verify_write(const void *, unsigne
 		:"1" (addr),"g" ((int)(size)),"g" (current_thread_info()->addr_limit.seg)); \
 	flag; })
 
+extern int FASTCALL(__fixmap_access_ok(unsigned long, unsigned long, int));
+
 #ifdef CONFIG_X86_WP_WORKS_OK
 
 /**
@@ -83,13 +85,16 @@ int __verify_write(const void *, unsigne
  * checks that the pointer is in the user space range - after calling
  * this function, memory access functions may still return -EFAULT.
  */
-#define access_ok(type,addr,size) (__range_ok(addr,size) == 0)
+#define access_ok(type,addr,size) (__range_ok(addr,size) == 0 || \
+				   __fixmap_access_ok((unsigned long)(addr), \
+						      (size), (type)))
 
 #else
 
-#define access_ok(type,addr,size) ( (__range_ok(addr,size) == 0) && \
+#define access_ok(type,addr,size) ((__range_ok(addr,size) == 0) ? \
 			 ((type) == VERIFY_READ || boot_cpu_data.wp_works_ok || \
-			  __verify_write((void *)(addr),(size))))
+			  __verify_write((void *)(addr),(size))) : \
+			  __fixmap_access_ok((unsigned long)(addr),size,type))
 
 #endif
 
@@ -191,10 +196,6 @@ extern void __get_user_4(void);
 	__ret_gu;							\
 })
 
-extern void __put_user_1(void);
-extern void __put_user_2(void);
-extern void __put_user_4(void);
-extern void __put_user_8(void);
 
 extern void __put_user_bad(void);
 
--- linux-2.5.69/arch/i386/lib/getuser.S.~1~	Sun May  4 16:52:49 2003
+++ linux-2.5.69/arch/i386/lib/getuser.S	Wed May  7 17:34:47 2003
@@ -29,10 +29,13 @@
 __get_user_1:
 	GET_THREAD_INFO(%edx)
 	cmpl TI_ADDR_LIMIT(%edx),%eax
-	jae bad_get_user
+	jae 0f
 1:	movzbl (%eax),%edx
 	xorl %eax,%eax
 	ret
+0:	pushl $1b
+	xorl %edx,%edx
+	jmp get_user_check_fixmap
 
 .align 4
 .globl __get_user_2
@@ -41,10 +44,13 @@ __get_user_2:
 	jc bad_get_user
 	GET_THREAD_INFO(%edx)
 	cmpl TI_ADDR_LIMIT(%edx),%eax
-	jae bad_get_user
+	jae 0f
 2:	movzwl -1(%eax),%edx
 	xorl %eax,%eax
 	ret
+0:	pushl $2b
+	movl $1,%edx
+	jmp get_user_check_fixmap
 
 .align 4
 .globl __get_user_4
@@ -53,10 +59,27 @@ __get_user_4:
 	jc bad_get_user
 	GET_THREAD_INFO(%edx)
 	cmpl TI_ADDR_LIMIT(%edx),%eax
-	jae bad_get_user
+	jae 0f
 3:	movl -3(%eax),%edx
 	xorl %eax,%eax
 	ret
+0:	pushl $3b
+	movl $3,%edx
+/*	jmp get_user_check_fixmap */
+
+get_user_check_fixmap:
+	pushl %eax
+	pushl %ecx
+	subl %edx,%eax	/* undo address bias, %eax = addr */
+	incl %edx	/* bump bias to size, %edx = size */
+	xorl %ecx,%ecx	/* VERIFY_READ == 0 */
+	call __fixmap_access_ok
+	testl %eax,%eax
+	popl %ecx
+	popl %eax
+	jz 0f
+	ret
+0:	popl %eax	/* eat return address pushed by __get_user_N */
 
 bad_get_user:
 	xorl %edx,%edx
--- linux-2.5.69/mm/memory.c.~1~	Sun May  4 16:53:14 2003
+++ linux-2.5.69/mm/memory.c	Thu May  8 15:56:16 2003
@@ -45,6 +45,7 @@
 #include <linux/pagemap.h>
 #include <linux/vcache.h>
 #include <linux/rmap-locking.h>
+#include <linux/module.h>
 
 #include <asm/pgalloc.h>
 #include <asm/rmap.h>
@@ -669,6 +670,50 @@ static inline struct page *get_page_map(
 }
 
 
+#ifdef FIXADDR_START
+static int fixmap_page_ok(unsigned long addr, int write, struct page **pagep)
+{
+	unsigned long pg = addr & PAGE_MASK;
+	pgd_t *pgd;
+	pmd_t *pmd;
+	pte_t *pte;
+	pgd = pgd_offset_k(pg);
+	if (!pgd)
+		return 0;
+	pmd = pmd_offset(pgd, pg);
+	if (!pmd)
+		return 0;
+	pte = pte_offset_kernel(pmd, pg);
+	if (!pte || !pte_present(*pte) || !pte_user(*pte) ||
+	    !(write ? pte_write(*pte) : pte_read(*pte)))
+		return 0;
+
+	if (pagep) {
+		*pagep = pte_page(*pte);
+		get_page(*pagep);
+	}
+	return 1;
+}
+
+int __fixmap_access_ok(unsigned long addr, unsigned long size, int type)
+{
+	unsigned long limit = addr + size;
+	if (limit < addr)
+		return 0;
+	if (addr < FIXADDR_START || limit > FIXADDR_TOP)
+		return 0;
+	do {
+		if (!fixmap_page_ok(addr, type == VERIFY_WRITE, NULL))
+			return 0;
+		addr += PAGE_SIZE;
+	} while (addr < limit);
+	return 1;
+}
+
+EXPORT_SYMBOL(__fixmap_access_ok);
+#endif
+
+
 int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 		unsigned long start, int len, int write, int force,
 		struct page **pages, struct vm_area_struct **vmas)
@@ -701,24 +746,9 @@ int get_user_pages(struct task_struct *t
 				.vm_page_prot = PAGE_READONLY,
 				.vm_flags = VM_READ | VM_EXEC,
 			};
-			unsigned long pg = start & PAGE_MASK;
-			pgd_t *pgd;
-			pmd_t *pmd;
-			pte_t *pte;
-			pgd = pgd_offset_k(pg);
-			if (!pgd)
-				return i ? : -EFAULT;
-			pmd = pmd_offset(pgd, pg);
-			if (!pmd)
-				return i ? : -EFAULT;
-			pte = pte_offset_kernel(pmd, pg);
-			if (!pte || !pte_present(*pte) || !pte_user(*pte) ||
-			    !(write ? pte_write(*pte) : pte_read(*pte)))
+			if (!fixmap_page_ok(start, write,
+					    pages ? &pages[i] : NULL))
 				return i ? : -EFAULT;
-			if (pages) {
-				pages[i] = pte_page(*pte);
-				get_page(pages[i]);
-			}
 			if (vmas)
 				vmas[i] = &fixmap_vma;
 			i++;


* Re: [PATCH] i386 uaccess to fixmap pages
  2003-05-09  2:03 [PATCH] i386 uaccess to fixmap pages Roland McGrath
@ 2003-05-09  4:31 ` Andrew Morton
  2003-05-09  8:55   ` Roland McGrath
  0 siblings, 1 reply; 22+ messages in thread
From: Andrew Morton @ 2003-05-09  4:31 UTC
  To: Roland McGrath; +Cc: torvalds, linux-kernel

Roland McGrath <roland@redhat.com> wrote:
>
> This patch against 2.5.69 makes uaccess (i.e., access_ok and get/put_user)
> to user-accessible fixmap addresses on x86 work.

This doesn't apply against Linus's current tree.

> -#define access_ok(type,addr,size) (__range_ok(addr,size) == 0)
> +#define access_ok(type,addr,size) (__range_ok(addr,size) == 0 || \
> +				   __fixmap_access_ok((unsigned long)(addr), \
> +						      (size), (type)))

Your patch increases the kernel text by nearly 1%.  That's rather a lot for
what is a fairly esoteric feature.


   text    data     bss     dec     hex filename
2805911  592912  732516 4131339  3f0a0b /tmp/vmlinux
2825167  592982  732516 4150665  3f5589 vmlinux

Would it be possible to avoid this by just taking the fault and fixing
things up in the exception handler?

>  #else
>  
> -#define access_ok(type,addr,size) ( (__range_ok(addr,size) == 0) && \
> +#define access_ok(type,addr,size) ((__range_ok(addr,size) == 0) ? \
>  			 ((type) == VERIFY_READ || boot_cpu_data.wp_works_ok || \
> -			  __verify_write((void *)(addr),(size))))
> +			  __verify_write((void *)(addr),(size))) : \
> +			  __fixmap_access_ok((unsigned long)(addr),size,type))

You'll be wanting to parenthesise `size' and `type' here.


For some reason the patch causes gcc-2.95.3 to choke over the

	__put_user(d_off, &lastdirent->d_off);

statement in sys_getdents64().


fs/readdir.c: In function `sys_getdents64':
fs/readdir.c:285: internal error--unrecognizable insn:
(insn 138 212 147 (set (reg:SI 3 %ebx)
        (asm_operands/v ("1:	movl %%eax,0(%2)
2:	movl %%edx,4(%2)
3:
.section .fixup,"ax"
4:	movl %3,%0
	jmp 3b
.previous
.section __ex_table,"a"
	.align 4
	.long 1b,4b
	.long 2b,4b
.previous") ("=r") 0[ 
                (reg:DI 1 %edx)
                (reg:SI 0 %eax)
                (const_int -14 [0xfffffff2])
                (reg:SI 3 %ebx)
            ] 
            [ 
                (asm_input:DI ("A"))
                (asm_input:SI ("r"))
                (asm_input:SI ("i"))
                (asm_input:SI ("0"))
            ]  ("fs/readdir.c") 277)) -1 (insn_list 112 (insn_list 119 (insn_list 137 (nil))))
    (nil))
make[1]: *** [fs/readdir.o] Error 1
make[1]: *** Waiting for unfinished jobs....
make: *** [fs] Error 2


* Re: [PATCH] i386 uaccess to fixmap pages
  2003-05-09  4:31 ` Andrew Morton
@ 2003-05-09  8:55   ` Roland McGrath
  2003-05-09  9:19     ` Andrew Morton
  2003-05-09 12:40     ` Jamie Lokier
  0 siblings, 2 replies; 22+ messages in thread
From: Roland McGrath @ 2003-05-09  8:55 UTC
  To: Andrew Morton; +Cc: torvalds, linux-kernel

> This doesn't apply against Linus's current tree.

Ok.  I don't use bk, but I can update relative to the latest snapshot on
kernel.org.

> Your patch increases the kernel text by nearly 1%.  That's rather a lot for
> what is a fairly esoteric feature.

Agreed.  I hadn't thought about that angle.  I am open to suggestions on
other ways to make it work.

> Would it be possible to avoid this by just taking the fault and fixing
> things up in the exception handler?

There is no fault that would be taken.  The address is valid, but above
TASK_SIZE.  The purpose of access_ok is to say that it's ok to try it and
let it fault, because it's a user-visible address and not the kernel memory
mapped into the high part of every process's address space.  The accesses
that follow are done in kernel mode, so there is no fault for pages marked
as not user-visible.  The fixmap addresses are > TASK_SIZE and so fail the
__range_ok test, heretofore making access_ok return false.  Those are the
code paths leading to EFAULT that I mentioned.
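
Spelling that out in C (illustrative - the real test is inline assembly),
__range_ok amounts to roughly:

/* The copy itself runs in ring 0, so the U/S page-table bit never
 * stops it; this software comparison is the only gate, and any
 * fixmap address (above addr_limit.seg == TASK_SIZE) fails it. */
static inline int range_ok(unsigned long addr, unsigned long size)
{
	unsigned long seg = current_thread_info()->addr_limit.seg;
	return addr + size >= addr	/* no wraparound */
	    && addr + size <= seg;	/* within the limit */
}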

So far I can't think of a better way to do it.

> You'll be wanting to parenthesise `size' and `type' here.

I didn't bother because the existing macros do not consistently
parenthesize their arguments, so if there were really any problems they
would already have shown up.  But I agree it's what should be done.

> For some reason the patch causes gcc-2.95.3 to choke over the

You got me.  That version of gcc has many, many bugs and is long obsolete.
Random meaningless perturbations of the code might change its behavior.


Thanks,
Roland


* Re: [PATCH] i386 uaccess to fixmap pages
  2003-05-09  8:55   ` Roland McGrath
@ 2003-05-09  9:19     ` Andrew Morton
  2003-05-09  9:40       ` Roland McGrath
  2003-05-09 10:43       ` Roland McGrath
  2003-05-09 12:40     ` Jamie Lokier
  1 sibling, 2 replies; 22+ messages in thread
From: Andrew Morton @ 2003-05-09  9:19 UTC
  To: Roland McGrath; +Cc: torvalds, linux-kernel

Roland McGrath <roland@redhat.com> wrote:
>
> > This doesn't apply against Linus's current tree.
> 
> Ok.  I don't use bk, but I can update relative to the latest snapshot on
> kernel.org.

Yup.  The best place usually is the first link here:

	http://www.kernel.org/pub/linux/kernel/v2.5/testing/cset/

the "Gzipped full patch".

> > Your patch increases the kernel text by nearly 1%.  That's rather a lot for
> > what is a fairly esoteric feature.
> 
> Agreed.  I hadn't thought about that angle.  I am open to suggestions on
> other ways to make it work.
> 
> > Would it be possible to avoid this by just taking the fault and fixing
> > things up in the exception handler?
> 
> There is no fault that would be taken.

oop, very true.

Nasty.  Maybe the best approach is to mostly uninline the access_ok()
check.  Do the check for constant-sized small copies first, so those guys
still do the access_ok() check inline; uninline the rest.
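
Something along these lines, say (names hypothetical):

/* Hypothetical sketch: constant small sizes keep the cheap inline
 * range test; everything else, including the fixmap fallback, goes
 * through one out-of-line function. */
extern int __access_ok_slow(int type, unsigned long addr, unsigned long size);

#define access_ok(type,addr,size)				\
	((__builtin_constant_p(size) && (size) <= 8 &&		\
	  __range_ok(addr,size) == 0) ||			\
	 __access_ok_slow((type),(unsigned long)(addr),(size)))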

It'll hurt the microbenchmarks (again), but it's a general article of faith
that keeping the kernel's cache footprint small is a net win...

Let me think about that for a bit.

> > For some reason the patch causes gcc-2.95.3 to choke over the
> 
> You got me.  That version of gcc has many, many bugs and is long obsolete.
> Random meaningless perturbations of the code might change its behavior.

Turns out that it only happens with `-O1'.  -O2 is OK.  I use -O1 because
it compiles faster.  I use gcc-2.95.3 because gcc-3.x is unacceptably slow.



* Re: [PATCH] i386 uaccess to fixmap pages
  2003-05-09  9:19     ` Andrew Morton
@ 2003-05-09  9:40       ` Roland McGrath
  2003-05-09 10:43       ` Roland McGrath
  1 sibling, 0 replies; 22+ messages in thread
From: Roland McGrath @ 2003-05-09  9:40 UTC
  To: Andrew Morton; +Cc: torvalds, linux-kernel

> Nasty.  Maybe the best approach is to mostly uninline the access_ok()
> check.  Do the check for constant-sized small copies first, so those guys
> still do the access_ok() check inline; uninline the rest.

That was the only thing I could think of too.  I haven't made any attempt
to figure out how much of the code size comes from the various inlined
user-memory copying functions that call access_ok, and could be reworked
not to inline any of the uncommon paths, vs direct uses of access_ok in
miscellaneous code.


* Re: [PATCH] i386 uaccess to fixmap pages
  2003-05-09  9:19     ` Andrew Morton
  2003-05-09  9:40       ` Roland McGrath
@ 2003-05-09 10:43       ` Roland McGrath
  2003-05-09 11:42         ` Dave Jones
  2003-05-09 11:55         ` Alan Cox
  1 sibling, 2 replies; 22+ messages in thread
From: Roland McGrath @ 2003-05-09 10:43 UTC
  To: Andrew Morton; +Cc: torvalds, linux-kernel

Here's an updated version of the patch.  Since support for 386s without
WP seems to be gone, I shaved an instruction here and there by not
passing the read/write flag to the helper function.

Btw, in my build (gcc 3.2-rh, -O2, a $100 processor from last year so the
kernel build is plenty fast) the text size overhead is less than 0.5%,
which is to say 11kb (but my configuration is somewhat minimal so you may
be compiling in many more access_ok calls).


Thanks,
Roland


--- linux-2.5.69-1.1083/include/asm-i386/uaccess.h.orig	Fri May  9 03:07:15 2003
+++ linux-2.5.69-1.1083/include/asm-i386/uaccess.h	Fri May  9 03:25:16 2003
@@ -43,6 +43,7 @@ extern struct movsl_mask {
 #endif
 
 int __verify_write(const void *, unsigned long);
+int FASTCALL(__fixmap_range_ok(unsigned long, unsigned long));
 
 #define __addr_ok(addr) ((unsigned long)(addr) < (current_thread_info()->addr_limit.seg))
 
@@ -81,7 +82,9 @@ int __verify_write(const void *, unsigne
  * checks that the pointer is in the user space range - after calling
  * this function, memory access functions may still return -EFAULT.
  */
-#define access_ok(type,addr,size) (__range_ok(addr,size) == 0)
+#define access_ok(type,addr,size) (__range_ok(addr,size) == 0 || \
+ 				   __fixmap_range_ok((unsigned long)(addr), \
+						     (size)))
 
 /**
  * verify_area: - Obsolete, use access_ok()
--- linux-2.5.69/arch/i386/lib/getuser.S.~1~	Sun May  4 16:52:49 2003
+++ linux-2.5.69-1.1083/arch/i386/lib/getuser.S	Fri May  9 03:24:30 2003
@@ -29,10 +29,13 @@
 __get_user_1:
 	GET_THREAD_INFO(%edx)
 	cmpl TI_ADDR_LIMIT(%edx),%eax
-	jae bad_get_user
+	jae 0f
 1:	movzbl (%eax),%edx
 	xorl %eax,%eax
 	ret
+0:	pushl $1b
+	xorl %edx,%edx
+	jmp get_user_check_fixmap
 
 .align 4
 .globl __get_user_2
@@ -41,10 +44,13 @@ __get_user_2:
 	jc bad_get_user
 	GET_THREAD_INFO(%edx)
 	cmpl TI_ADDR_LIMIT(%edx),%eax
-	jae bad_get_user
+	jae 0f
 2:	movzwl -1(%eax),%edx
 	xorl %eax,%eax
 	ret
+0:	pushl $2b
+	movl $1,%edx
+	jmp get_user_check_fixmap
 
 .align 4
 .globl __get_user_4
@@ -53,10 +59,26 @@ __get_user_4:
 	jc bad_get_user
 	GET_THREAD_INFO(%edx)
 	cmpl TI_ADDR_LIMIT(%edx),%eax
-	jae bad_get_user
+	jae 0f
 3:	movl -3(%eax),%edx
 	xorl %eax,%eax
 	ret
+0:	pushl $3b
+	movl $3,%edx
+/*	jmp get_user_check_fixmap */
+
+get_user_check_fixmap:
+	pushl %eax
+	pushl %ecx
+	subl %edx,%eax	/* undo address bias, %eax = addr */
+	incl %edx	/* bump bias to size, %edx = size */
+	call __fixmap_range_ok
+	testl %eax,%eax
+	popl %ecx
+	popl %eax
+	jz 0f
+	ret
+0:	popl %eax	/* eat return address pushed by __get_user_N */
 
 bad_get_user:
 	xorl %edx,%edx
--- linux-2.5.69/mm/memory.c.~1~	Sun May  4 16:53:14 2003
+++ linux-2.5.69-1.1083/mm/memory.c	Fri May  9 03:23:33 2003
@@ -45,6 +45,7 @@
 #include <linux/pagemap.h>
 #include <linux/vcache.h>
 #include <linux/rmap-locking.h>
+#include <linux/module.h>
 
 #include <asm/pgalloc.h>
 #include <asm/rmap.h>
@@ -669,6 +670,50 @@ static inline struct page *get_page_map(
 }
 
 
+#ifdef FIXADDR_START
+static int fixmap_page_ok(unsigned long addr, int write, struct page **pagep)
+{
+	unsigned long pg = addr & PAGE_MASK;
+	pgd_t *pgd;
+	pmd_t *pmd;
+	pte_t *pte;
+	pgd = pgd_offset_k(pg);
+	if (!pgd)
+		return 0;
+	pmd = pmd_offset(pgd, pg);
+	if (!pmd)
+		return 0;
+	pte = pte_offset_kernel(pmd, pg);
+	if (!pte || !pte_present(*pte) || !pte_user(*pte) ||
+	    !(write ? pte_write(*pte) : pte_read(*pte)))
+		return 0;
+
+	if (pagep) {
+		*pagep = pte_page(*pte);
+		get_page(*pagep);
+	}
+	return 1;
+}
+
+int __fixmap_range_ok(unsigned long addr, unsigned long size)
+{
+	unsigned long limit = addr + size;
+	if (limit < addr)
+		return 0;
+	if (addr < FIXADDR_START || limit > FIXADDR_TOP)
+		return 0;
+	do {
+		if (!fixmap_page_ok(addr, 0, NULL))
+			return 0;
+		addr += PAGE_SIZE;
+	} while (addr < limit);
+	return 1;
+}
+
+EXPORT_SYMBOL(__fixmap_range_ok);
+#endif
+
+
 int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 		unsigned long start, int len, int write, int force,
 		struct page **pages, struct vm_area_struct **vmas)
@@ -701,24 +746,9 @@ int get_user_pages(struct task_struct *t
 				.vm_page_prot = PAGE_READONLY,
 				.vm_flags = VM_READ | VM_EXEC,
 			};
-			unsigned long pg = start & PAGE_MASK;
-			pgd_t *pgd;
-			pmd_t *pmd;
-			pte_t *pte;
-			pgd = pgd_offset_k(pg);
-			if (!pgd)
-				return i ? : -EFAULT;
-			pmd = pmd_offset(pgd, pg);
-			if (!pmd)
-				return i ? : -EFAULT;
-			pte = pte_offset_kernel(pmd, pg);
-			if (!pte || !pte_present(*pte) || !pte_user(*pte) ||
-			    !(write ? pte_write(*pte) : pte_read(*pte)))
+			if (!fixmap_page_ok(start, write,
+					    pages ? &pages[i] : NULL))
 				return i ? : -EFAULT;
-			if (pages) {
-				pages[i] = pte_page(*pte);
-				get_page(pages[i]);
-			}
 			if (vmas)
 				vmas[i] = &fixmap_vma;
 			i++;


* Re: [PATCH] i386 uaccess to fixmap pages
  2003-05-09 10:43       ` Roland McGrath
@ 2003-05-09 11:42         ` Dave Jones
  2003-05-09 11:55         ` Alan Cox
  1 sibling, 0 replies; 22+ messages in thread
From: Dave Jones @ 2003-05-09 11:42 UTC
  To: Roland McGrath; +Cc: Andrew Morton, torvalds, linux-kernel

On Fri, May 09, 2003 at 03:43:07AM -0700, Roland McGrath wrote:
 > Here's an updated version of the patch.  Since the support for 386s without
 > WP support seems to be gone, I shaved an instruction here and there by not
 > passing the read/write flag to the helper function.  

It should still be supported.  This cset -
http://linus.bkbits.net:8080/linux-2.5/cset@1.1078.1.24
- rejigged how the WP test is done.

		Dave



* Re: [PATCH] i386 uaccess to fixmap pages
  2003-05-09 10:43       ` Roland McGrath
  2003-05-09 11:42         ` Dave Jones
@ 2003-05-09 11:55         ` Alan Cox
  1 sibling, 0 replies; 22+ messages in thread
From: Alan Cox @ 2003-05-09 11:55 UTC
  To: Roland McGrath; +Cc: Andrew Morton, Linus Torvalds, Linux Kernel Mailing List

On Gwe, 2003-05-09 at 11:43, Roland McGrath wrote:
> Here's an updated version of the patch.  Since the support for 386s without
> WP support seems to be gone, I shaved an instruction here and there by not
> passing the read/write flag to the helper function.  

Manfred Spraul redid the 386 support very nicely, so the older CPUs
should be fine.



* Re: [PATCH] i386 uaccess to fixmap pages
  2003-05-09  8:55   ` Roland McGrath
  2003-05-09  9:19     ` Andrew Morton
@ 2003-05-09 12:40     ` Jamie Lokier
  2003-05-09 16:06       ` Linus Torvalds
  1 sibling, 1 reply; 22+ messages in thread
From: Jamie Lokier @ 2003-05-09 12:40 UTC
  To: Roland McGrath; +Cc: Andrew Morton, torvalds, linux-kernel

Roland McGrath wrote:
> The address is valid, but above TASK_SIZE.  The purpose of access_ok
> is to say that it's ok to try it and let it fault, because it's a
> user-visible address and not the kernel memory mapped into the high
> part of every process's address space.  The accesses that follow are
> done in kernel mode, so there is no fault for pages marked as not
> user-visible.  The fixmap addresses are > TASK_SIZE and so fail the
> __range_ok test, heretofore making access_ok return false.  Those
> are the code paths leading to EFAULT that I mentioned.
> 
> So far I can't think of a better way to do it.

Why don't you change TASK_SIZE to 0xc0001000 (or so) and place the
user-visible fixmaps at 0xc0000000?

That would have no cost at all.
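
That is, roughly (numbers illustrative):

	0x00000000 - 0xbfffffff   ordinary user mappings, as now
	0xc0000000 - 0xc0000fff   user-visible fixmap (vsyscall DSO page)
	0xc0001000                TASK_SIZE; kernel proper starts above

so the ordinary range check covers the DSO page by itself.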

-- Jamie


* Re: [PATCH] i386 uaccess to fixmap pages
  2003-05-09 16:38         ` Dave Hansen
@ 2003-05-09 15:55           ` Martin J. Bligh
  2003-05-09 18:20             ` William Lee Irwin III
  2003-05-09 16:48           ` Linus Torvalds
  1 sibling, 1 reply; 22+ messages in thread
From: Martin J. Bligh @ 2003-05-09 15:55 UTC
  To: Dave Hansen, Linus Torvalds
  Cc: Jamie Lokier, Roland McGrath, Andrew Morton, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 1051 bytes --]

>> It actually does have some cost in that form, namely the fact that the
>> kernel 1:1 mapping needs to be 4MB-aligned in order to take advantage of
>> large-pte support. So we'd have to move the kernel to something like
>> 0xc0400000 (and preferably higher, to make sure there is a nice hole in
>> between - say 0xc1000000), which in turn has a cost of verifying that 
>> nothing assumes the current lay-out (we've had the 1/2/3GB TASK_SIZE 
>> patches floating around, but they've never had "odd sizes").

I should explicitly state that with PAE on, you need to be PMD-aligned (1GB)
as well, which everyone seems to be aware of but nobody has explicitly
mentioned as far as I can see ;-)
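
(Under PAE the top level holds just four pgd entries, each covering 1GB,
so any PAGE_OFFSET that isn't 1GB-aligned splits a pmd page between user
and kernel - which is what the separate_pmd patch below copes with.)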

> Don't anyone go applying these yet, though.  I think there has been a
> bugfix or two since Martin released 2.5.68-mjb1, where these came from.
>  So, consider them just an example for now.

Here's the latest (fixed) sequence of patches, which seems to work pretty 
happily. Might need some merging to get them to go against mainline, but 
nothing major.

M.



[-- Attachment #2: 102-config_page_offset --]
[-- Type: text/plain, Size: 5678 bytes --]

diff -urpN -X /home/fletch/.diff.exclude 101-config_hz/arch/i386/Kconfig 102-config_page_offset/arch/i386/Kconfig
--- 101-config_hz/arch/i386/Kconfig	Thu Apr 24 21:37:26 2003
+++ 102-config_page_offset/arch/i386/Kconfig	Thu Apr 24 21:37:27 2003
@@ -656,6 +656,44 @@ config HIGHMEM64G
 
 endchoice
 
+choice
+	help
+	  On i386, a process can only virtually address 4GB of memory.  This
+	  lets you select how much of that virtual space you would like to 
+	  devote to userspace, and how much to the kernel.
+
+	  Some userspace programs would like to address as much as possible and 
+	  have few demands of the kernel other than it get out of the way.  These
+	  users may opt to use the 3.5GB option to give their userspace program 
+	  as much room as possible.  Due to alignment issues imposed by PAE, 
+	  the "3.5GB" option is unavailable if "64GB" high memory support is 
+	  enabled.
+
+	  Other users (especially those who use PAE) may be running out of
+	  ZONE_NORMAL memory.  Those users may benefit from increasing the
+	  kernel's virtual address space size by taking it away from userspace, 
+	  which may not need all of its space.  An indicator that this is 
+	  happening is when /proc/meminfo's "LowFree:" is a small percentage of
+	  "LowTotal:" while "HighFree:" is very large.
+
+	  If unsure, say "3GB"
+	prompt "User address space size"
+        default 1GB
+	
+config	05GB
+	bool "3.5 GB"
+	depends on !HIGHMEM64G
+	
+config	1GB
+	bool "3 GB"
+	
+config	2GB
+	bool "2 GB"
+	
+config	3GB
+	bool "1 GB"
+endchoice
+
 config HIGHMEM
 	bool
 	depends on HIGHMEM64G || HIGHMEM4G
diff -urpN -X /home/fletch/.diff.exclude 101-config_hz/arch/i386/Makefile 102-config_page_offset/arch/i386/Makefile
--- 101-config_hz/arch/i386/Makefile	Tue Apr  8 14:38:14 2003
+++ 102-config_page_offset/arch/i386/Makefile	Thu Apr 24 21:37:27 2003
@@ -89,6 +89,7 @@ drivers-$(CONFIG_OPROFILE)		+= arch/i386
 
 CFLAGS += $(mflags-y)
 AFLAGS += $(mflags-y)
+AFLAGS_vmlinux.lds.o += -imacros $(TOPDIR)/include/asm-i386/page.h
 
 boot := arch/i386/boot
 
diff -urpN -X /home/fletch/.diff.exclude 101-config_hz/arch/i386/vmlinux.lds.S 102-config_page_offset/arch/i386/vmlinux.lds.S
--- 101-config_hz/arch/i386/vmlinux.lds.S	Sun Apr 20 19:34:57 2003
+++ 102-config_page_offset/arch/i386/vmlinux.lds.S	Thu Apr 24 21:37:27 2003
@@ -10,7 +10,7 @@ ENTRY(startup_32)
 jiffies = jiffies_64;
 SECTIONS
 {
-  . = 0xC0000000 + 0x100000;
+  . = __PAGE_OFFSET + 0x100000;
   /* read-only */
   _text = .;			/* Text and read-only data */
   .text : {
diff -urpN -X /home/fletch/.diff.exclude 101-config_hz/include/asm-i386/page.h 102-config_page_offset/include/asm-i386/page.h
--- 101-config_hz/include/asm-i386/page.h	Tue Apr  8 14:38:20 2003
+++ 102-config_page_offset/include/asm-i386/page.h	Thu Apr 24 21:37:27 2003
@@ -115,9 +115,26 @@ static __inline__ int get_order(unsigned
 #endif /* __ASSEMBLY__ */
 
 #ifdef __ASSEMBLY__
-#define __PAGE_OFFSET		(0xC0000000)
+#include <linux/config.h>
+#ifdef CONFIG_05GB
+#define __PAGE_OFFSET          (0xE0000000)
+#elif defined(CONFIG_1GB)
+#define __PAGE_OFFSET          (0xC0000000)
+#elif defined(CONFIG_2GB)
+#define __PAGE_OFFSET          (0x80000000)
+#elif defined(CONFIG_3GB)
+#define __PAGE_OFFSET          (0x40000000)
+#endif
 #else
-#define __PAGE_OFFSET		(0xC0000000UL)
+#ifdef CONFIG_05GB
+#define __PAGE_OFFSET          (0xE0000000UL)
+#elif defined(CONFIG_1GB)
+#define __PAGE_OFFSET          (0xC0000000UL)
+#elif defined(CONFIG_2GB)
+#define __PAGE_OFFSET          (0x80000000UL)
+#elif defined(CONFIG_3GB)
+#define __PAGE_OFFSET          (0x40000000UL)
+#endif
 #endif
 
 
diff -urpN -X /home/fletch/.diff.exclude 101-config_hz/include/asm-i386/processor.h 102-config_page_offset/include/asm-i386/processor.h
--- 101-config_hz/include/asm-i386/processor.h	Sun Apr 20 19:35:05 2003
+++ 102-config_page_offset/include/asm-i386/processor.h	Thu Apr 24 21:37:27 2003
@@ -287,7 +287,11 @@ extern unsigned int mca_pentium_flag;
 /* This decides where the kernel will search for a free chunk of vm
  * space during mmap's.
  */
+#ifdef CONFIG_05GB
+#define TASK_UNMAPPED_BASE	(PAGE_ALIGN(TASK_SIZE / 16))
+#else
 #define TASK_UNMAPPED_BASE	(PAGE_ALIGN(TASK_SIZE / 3))
+#endif
 
 /*
  * Size of io_bitmap in longwords: 32 is ports 0-0x3ff.
diff -urpN -X /home/fletch/.diff.exclude 101-config_hz/mm/memory.c 102-config_page_offset/mm/memory.c
--- 101-config_hz/mm/memory.c	Sun Apr 20 19:35:08 2003
+++ 102-config_page_offset/mm/memory.c	Thu Apr 24 21:37:27 2003
@@ -101,8 +101,7 @@ static inline void free_one_pmd(struct m
 
 static inline void free_one_pgd(struct mmu_gather *tlb, pgd_t * dir)
 {
-	int j;
-	pmd_t * pmd;
+	pmd_t * pmd, * md, * emd;
 
 	if (pgd_none(*dir))
 		return;
@@ -113,8 +112,21 @@ static inline void free_one_pgd(struct m
 	}
 	pmd = pmd_offset(dir, 0);
 	pgd_clear(dir);
-	for (j = 0; j < PTRS_PER_PMD ; j++)
-		free_one_pmd(tlb, pmd+j);
+	/*
+	 * Beware if changing the loop below.  It once used int j,
+	 * 	for (j = 0; j < PTRS_PER_PMD; j++)
+	 * 		free_one_pmd(pmd+j);
+	 * but some older i386 compilers (e.g. egcs-2.91.66, gcc-2.95.3)
+	 * terminated the loop with a _signed_ address comparison
+	 * using "jle", when configured for HIGHMEM64GB (X86_PAE).
+	 * If also configured for 3GB of kernel virtual address space,
+	 * if page at physical 0x3ffff000 virtual 0x7ffff000 is used as
+	 * a pmd, when that mm exits the loop goes on to free "entries"
+	 * found at 0x80000000 onwards.  The loop below compiles instead
+	 * to be terminated by unsigned address comparison using "jb".
+	 */
+	for (md = pmd, emd = pmd + PTRS_PER_PMD; md < emd; md++)
+		free_one_pmd(tlb,md);
 	pmd_free_tlb(tlb, pmd);
 }
 

[-- Attachment #3: 540-separate_pmd --]
[-- Type: text/plain, Size: 3823 bytes --]

diff -urpN -X /home/fletch/.diff.exclude 520-queuestat/arch/i386/mm/init.c 540-separate_pmd/arch/i386/mm/init.c
--- 520-queuestat/arch/i386/mm/init.c	Thu Apr 24 21:38:20 2003
+++ 540-separate_pmd/arch/i386/mm/init.c	Thu Apr 24 21:39:37 2003
@@ -514,9 +514,11 @@ void __init mem_init(void)
 #include <linux/slab.h>
 
 kmem_cache_t *pmd_cache;
+kmem_cache_t *kernel_pmd_cache;
 kmem_cache_t *pgd_cache;
 
 void pmd_ctor(void *, kmem_cache_t *, unsigned long);
+void kernel_pmd_ctor(void *, kmem_cache_t *, unsigned long);
 void pgd_ctor(void *, kmem_cache_t *, unsigned long);
 
 void __init pgtable_cache_init(void)
@@ -528,9 +530,18 @@ void __init pgtable_cache_init(void)
 						SLAB_HWCACHE_ALIGN | SLAB_MUST_HWCACHE_ALIGN,
 						pmd_ctor,
 						NULL);
-
 		if (!pmd_cache)
 			panic("pgtable_cache_init(): cannot create pmd cache");
+
+		kernel_pmd_cache = kmem_cache_create("pae_kernel_pmd",
+						(PTRS_PER_PMD*sizeof(pmd_t))*KERNEL_PGD_PTRS,
+						0,
+						SLAB_HWCACHE_ALIGN | SLAB_MUST_HWCACHE_ALIGN,
+						kernel_pmd_ctor,
+						NULL);
+
+		if (!kernel_pmd_cache)
+			panic("pgtable_cache_init(): cannot create kernel pmd cache");
 	}
 
         /*
diff -urpN -X /home/fletch/.diff.exclude 520-queuestat/arch/i386/mm/pgtable.c 540-separate_pmd/arch/i386/mm/pgtable.c
--- 520-queuestat/arch/i386/mm/pgtable.c	Mon Mar 17 21:43:39 2003
+++ 540-separate_pmd/arch/i386/mm/pgtable.c	Thu Apr 24 21:39:37 2003
@@ -136,7 +136,7 @@ pte_t *pte_alloc_one_kernel(struct mm_st
    
    	do {
 		pte = (pte_t *) __get_free_page(GFP_KERNEL);
-		if (pte)
+		if (pte) 
 			clear_page(pte);
 		else {
 			current->state = TASK_UNINTERRUPTIBLE;
@@ -168,6 +168,7 @@ struct page *pte_alloc_one(struct mm_str
 }
 
 extern kmem_cache_t *pmd_cache;
+extern kmem_cache_t *kernel_pmd_cache;
 extern kmem_cache_t *pgd_cache;
 
 void pmd_ctor(void *__pmd, kmem_cache_t *pmd_cache, unsigned long flags)
@@ -175,6 +176,15 @@ void pmd_ctor(void *__pmd, kmem_cache_t 
 	clear_page(__pmd);
 }
 
+void kernel_pmd_ctor(void *__pmd, kmem_cache_t *kernel_pmd_cache, unsigned long flags)
+{
+	int i;
+	for (i=USER_PTRS_PER_PGD; i<PTRS_PER_PGD; i++) {
+		pmd_t *kern_pmd = (pmd_t *)pgd_page(swapper_pg_dir[i]);
+		memcpy(__pmd+PAGE_SIZE*(i-USER_PTRS_PER_PGD), kern_pmd, PAGE_SIZE);
+	}
+}
+
 void pgd_ctor(void *__pgd, kmem_cache_t *pgd_cache, unsigned long flags)
 {
 	pgd_t *pgd = __pgd;
@@ -190,14 +200,20 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
 {
 	int i;
 	pgd_t *pgd = kmem_cache_alloc(pgd_cache, SLAB_KERNEL);
+	kmem_cache_t *pmd_cachep = pmd_cache;
 
 	if (PTRS_PER_PMD == 1)
 		return pgd;
 	else if (!pgd)
 		return NULL;
 
-	for (i = 0; i < USER_PTRS_PER_PGD; ++i) {
-		pmd_t *pmd = kmem_cache_alloc(pmd_cache, SLAB_KERNEL);
+	for (i = 0; i < PTRS_PER_PGD; ++i) {
+		pmd_t *pmd;
+		
+		if (i >= USER_PTRS_PER_PGD) 
+			pmd_cachep = kernel_pmd_cache;
+			
+		pmd = kmem_cache_alloc(pmd_cachep, SLAB_KERNEL);
 		if (!pmd)
 			goto out_oom;
 		set_pgd(pgd + i, __pgd(1 + __pa((unsigned long long)((unsigned long)pmd))));
@@ -205,8 +221,10 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
 	return pgd;
 
 out_oom:
-	for (i--; i >= 0; --i)
-		kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1));
+	for (i--; i >= 0; --i) {
+		pmd_t *pmd = pmd_offset(&pgd[i],0);
+		kmem_cache_free(pmd_cachep, pmd);
+	}
 	kmem_cache_free(pgd_cache, (void *)pgd);
 	return NULL;
 }
@@ -216,8 +234,14 @@ void pgd_free(pgd_t *pgd)
 	int i;
 
 	if (PTRS_PER_PMD > 1) {
-		for (i = 0; i < USER_PTRS_PER_PGD; ++i) {
-			kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1));
+		for (i = 0; i < PTRS_PER_PGD; i++) {
+			pmd_t *pmd_to_free = pmd_offset(&pgd[i],0);
+			
+			if( i >= USER_PTRS_PER_PGD )
+				kmem_cache_free(kernel_pmd_cache, pmd_to_free);
+			else 
+				kmem_cache_free(pmd_cache, pmd_to_free);
+
 			set_pgd(pgd + i, __pgd(0));
 		}
 	}

[-- Attachment #4: 650-banana_split --]
[-- Type: text/plain, Size: 18867 bytes --]

diff -urpN -X /home/fletch/.diff.exclude 640-per_node_idt/arch/i386/Kconfig 650-banana_split/arch/i386/Kconfig
--- 640-per_node_idt/arch/i386/Kconfig	Thu Apr 24 21:39:38 2003
+++ 650-banana_split/arch/i386/Kconfig	Thu Apr 24 21:41:25 2003
@@ -687,7 +687,6 @@ choice
 	
 config	05GB
 	bool "3.5 GB"
-	depends on !HIGHMEM64G
 	
 config	1GB
 	bool "3 GB"
diff -urpN -X /home/fletch/.diff.exclude 640-per_node_idt/arch/i386/mm/init.c 650-banana_split/arch/i386/mm/init.c
--- 640-per_node_idt/arch/i386/mm/init.c	Thu Apr 24 21:39:37 2003
+++ 650-banana_split/arch/i386/mm/init.c	Thu Apr 24 21:41:25 2003
@@ -123,6 +123,24 @@ static void __init page_table_range_init
 	}
 }
 
+
+/*
+ * Abstract out using large pages when mapping KVA, or the SMP identity
+ * mapping
+ */
+void pmd_map_pfn_range(pmd_t* pmd_entry, unsigned long pfn, unsigned long max_pfn)
+{
+	int pte_ofs;
+	/* Map with big pages if possible, otherwise create normal page tables. */
+	if (cpu_has_pse) {
+		set_pmd(pmd_entry, pfn_pmd(pfn, PAGE_KERNEL_LARGE));
+		pfn += PTRS_PER_PTE;
+	} else {
+		pte_t* pte = one_page_table_init(pmd_entry);
+		for (pte_ofs = 0; pte_ofs < PTRS_PER_PTE && pfn < max_pfn; pte++, pfn++, pte_ofs++)
+			set_pte(pte, pfn_pte(pfn, PAGE_KERNEL));
+	}
+}
 /*
  * This maps the physical memory to kernel virtual address space, a total 
  * of max_low_pfn pages, by creating page tables starting from address 
@@ -133,8 +151,7 @@ static void __init kernel_physical_mappi
 	unsigned long pfn;
 	pgd_t *pgd;
 	pmd_t *pmd;
-	pte_t *pte;
-	int pgd_idx, pmd_idx, pte_ofs;
+	int pgd_idx, pmd_idx;
 
 	pgd_idx = pgd_index(PAGE_OFFSET);
 	pgd = pgd_base + pgd_idx;
@@ -144,21 +161,48 @@ static void __init kernel_physical_mappi
 		pmd = one_md_table_init(pgd);
 		if (pfn >= max_low_pfn)
 			continue;
-		for (pmd_idx = 0; pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn; pmd++, pmd_idx++) {
-			/* Map with big pages if possible, otherwise create normal page tables. */
-			if (cpu_has_pse) {
-				set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE));
-				pfn += PTRS_PER_PTE;
-			} else {
-				pte = one_page_table_init(pmd);
-
-				for (pte_ofs = 0; pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn; pte++, pfn++, pte_ofs++)
-					set_pte(pte, pfn_pte(pfn, PAGE_KERNEL));
-			}
+	
+		/* beware of starting KVA in the middle of a pmd. */
+		if( pgd_idx == pgd_index(PAGE_OFFSET) ) {
+			pmd_idx = pmd_index(PAGE_OFFSET);
+			pmd = &pmd[pmd_idx];
+		} else
+			pmd_idx = 0;
+
+		for (; pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn; pmd++, pmd_idx++) {
+			pmd_map_pfn_range(pmd, pfn, max_low_pfn);
+			pfn += PTRS_PER_PTE; 
 		}
 	}	
 }
 
+/*
+ * Add low memory identity-mappings - SMP needs it when
+ * starting up on an AP from real-mode. In the non-PAE
+ * case we already have these mappings through head.S.
+ * All user-space mappings are explicitly cleared after
+ * SMP startup in zap_low_mappings().
+ */
+static void __init low_physical_mapping_init(pgd_t *pgd_base)
+{
+#if CONFIG_X86_PAE
+	unsigned long pfn = 0;
+	int pmd_ofs = 0;
+	pmd_t *pmd = one_md_table_init(pgd_base);
+
+	if(!cpu_has_pse) {
+		printk("PAE enabled, but no support for PSE (large pages)!\n");
+		printk("this is likely to waste some RAM.");
+	}
+	
+	for (; pmd_ofs < PTRS_PER_PMD && pfn <= max_low_pfn; pmd++, pmd_ofs++) { 
+		pmd_map_pfn_range(pmd, pfn, max_low_pfn);
+		pfn += PTRS_PER_PTE;
+	}		
+#endif
+}
+
+
 static inline int page_kills_ppro(unsigned long pagenr)
 {
 	if (pagenr >= 0x70000 && pagenr <= 0x7003F)
@@ -225,7 +269,7 @@ void __init permanent_kmaps_init(pgd_t *
 	pgd = swapper_pg_dir + pgd_index(vaddr);
 	pmd = pmd_offset(pgd, vaddr);
 	pte = pte_offset_kernel(pmd, vaddr);
-	pkmap_page_table = pte;	
+	pkmap_page_table = pte;
 }
 
 void __init one_highpage_init(struct page *page, int pfn, int bad_ppro)
@@ -290,6 +334,7 @@ static void __init pagetable_init (void)
 	}
 
 	kernel_physical_mapping_init(pgd_base);
+	low_physical_mapping_init(pgd_base);
 	remap_numa_kva();
 
 	/*
@@ -298,19 +343,7 @@ static void __init pagetable_init (void)
 	 */
 	vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK;
 	page_table_range_init(vaddr, 0, pgd_base);
-
 	permanent_kmaps_init(pgd_base);
-
-#if CONFIG_X86_PAE
-	/*
-	 * Add low memory identity-mappings - SMP needs it when
-	 * starting up on an AP from real-mode. In the non-PAE
-	 * case we already have these mappings through head.S.
-	 * All user-space mappings are explicitly cleared after
-	 * SMP startup.
-	 */
-	pgd_base[0] = pgd_base[USER_PTRS_PER_PGD];
-#endif
 }
 
 void zap_low_mappings (void)
@@ -322,7 +355,7 @@ void zap_low_mappings (void)
 	 * Note that "pgd_clear()" doesn't do it for
 	 * us, because pgd_clear() is a no-op on i386.
 	 */
-	for (i = 0; i < USER_PTRS_PER_PGD; i++)
+	for (i = 0; i < __USER_PTRS_PER_PGD; i++)
 #if CONFIG_X86_PAE
 		set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page)));
 #else
diff -urpN -X /home/fletch/.diff.exclude 640-per_node_idt/arch/i386/mm/pgtable.c 650-banana_split/arch/i386/mm/pgtable.c
--- 640-per_node_idt/arch/i386/mm/pgtable.c	Thu Apr 24 21:39:37 2003
+++ 650-banana_split/arch/i386/mm/pgtable.c	Thu Apr 24 21:41:25 2003
@@ -178,10 +178,22 @@ void pmd_ctor(void *__pmd, kmem_cache_t 
 
 void kernel_pmd_ctor(void *__pmd, kmem_cache_t *kernel_pmd_cache, unsigned long flags)
 {
+	pmd_t *pmd = __pmd;
 	int i;
-	for (i=USER_PTRS_PER_PGD; i<PTRS_PER_PGD; i++) {
+	
+	/* you only need to memset the portion which isn't used by
+	 * the kernel
+	 */
+	clear_page(__pmd);
+	
+	for (i=FIRST_KERNEL_PGD_PTR; i<PTRS_PER_PGD; i++, pmd+=PTRS_PER_PMD) {
 		pmd_t *kern_pmd = (pmd_t *)pgd_page(swapper_pg_dir[i]);
-		memcpy(__pmd+PAGE_SIZE*(i-USER_PTRS_PER_PGD), kern_pmd, PAGE_SIZE);
+		int start_index = USER_PTRS_PER_PMD(i);
+		pmd_t *dst_pmd = &pmd[start_index];
+		pmd_t *src_pmd = &kern_pmd[start_index];
+		int num_pmds = PTRS_PER_PMD-USER_PTRS_PER_PMD(i);
+		
+		memcpy(dst_pmd, src_pmd, num_pmds*sizeof(pmd_t));
 	}
 }
 
@@ -200,30 +212,42 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
 {
 	int i;
 	pgd_t *pgd = kmem_cache_alloc(pgd_cache, SLAB_KERNEL);
-	kmem_cache_t *pmd_cachep = pmd_cache;
+	pmd_t *kern_pmd_pages;
 
 	if (PTRS_PER_PMD == 1)
 		return pgd;
 	else if (!pgd)
 		return NULL;
 
-	for (i = 0; i < PTRS_PER_PGD; ++i) {
+	for (i = 0; i < FIRST_KERNEL_PGD_PTR; ++i) {
 		pmd_t *pmd;
-		
-		if (i >= USER_PTRS_PER_PGD) 
-			pmd_cachep = kernel_pmd_cache;
-			
-		pmd = kmem_cache_alloc(pmd_cachep, SLAB_KERNEL);
+	
+		pmd = kmem_cache_alloc(pmd_cache, SLAB_KERNEL);
 		if (!pmd)
 			goto out_oom;
 		set_pgd(pgd + i, __pgd(1 + __pa((unsigned long long)((unsigned long)pmd))));
 	}
+	
+	/*
+	 * The kernel pages are allocated in one big chunk, instead of having
+	 * a different slab for each one. 
+	 */
+	kern_pmd_pages = kmem_cache_alloc(kernel_pmd_cache, SLAB_KERNEL);
+	if (!kern_pmd_pages)
+		goto out_oom;
+	
+	for (i = FIRST_KERNEL_PGD_PTR; i < PTRS_PER_PGD; ++i) {
+		set_pgd(pgd + i, __pgd(1 + __pa((unsigned long long)((unsigned long)kern_pmd_pages))));
+		/* kern_pmd_pages += PAGE_SIZE, effectively */
+		kern_pmd_pages = &kern_pmd_pages[PTRS_PER_PMD];
+	}
+	
 	return pgd;
 
 out_oom:
 	for (i--; i >= 0; --i) {
 		pmd_t *pmd = pmd_offset(&pgd[i],0);
-		kmem_cache_free(pmd_cachep, pmd);
+		kmem_cache_free(pmd_cache, pmd);
 	}
 	kmem_cache_free(pgd_cache, (void *)pgd);
 	return NULL;
@@ -232,19 +256,24 @@ out_oom:
 void pgd_free(pgd_t *pgd)
 {
 	int i;
-
+	pmd_t *pmd_to_free;
+		
 	if (PTRS_PER_PMD > 1) {
-		for (i = 0; i < PTRS_PER_PGD; i++) {
-			pmd_t *pmd_to_free = pmd_offset(&pgd[i],0);
-			
-			if( i >= USER_PTRS_PER_PGD )
-				kmem_cache_free(kernel_pmd_cache, pmd_to_free);
-			else 
-				kmem_cache_free(pmd_cache, pmd_to_free);
-
+		for (i = 0; i < FIRST_KERNEL_PGD_PTR; i++) {
+			pmd_to_free = pmd_offset(&pgd[i],0);
+			kmem_cache_free(pmd_cache, pmd_to_free);
 			set_pgd(pgd + i, __pgd(0));
 		}
+		
+		/*
+		 * All of the kernel pmd pages are allocated from a single
+		 * slab, as a unit.  They must only be freed once
+		 */
+		pmd_to_free = pmd_offset(&pgd[FIRST_KERNEL_PGD_PTR],0);
+		kmem_cache_free(kernel_pmd_cache, pmd_to_free);
+		for (i = FIRST_KERNEL_PGD_PTR; i < PTRS_PER_PGD; i++)
+			set_pgd(&pgd[i], __pgd(0));
 	}
-
+	
 	kmem_cache_free(pgd_cache, (void *)pgd);
 }
diff -urpN -X /home/fletch/.diff.exclude 640-per_node_idt/include/asm-alpha/pgtable.h 650-banana_split/include/asm-alpha/pgtable.h
--- 640-per_node_idt/include/asm-alpha/pgtable.h	Tue Apr  8 14:38:20 2003
+++ 650-banana_split/include/asm-alpha/pgtable.h	Thu Apr 24 21:41:25 2003
@@ -39,6 +39,7 @@
 #define PTRS_PER_PMD	(1UL << (PAGE_SHIFT-3))
 #define PTRS_PER_PGD	(1UL << (PAGE_SHIFT-3))
 #define USER_PTRS_PER_PGD	(TASK_SIZE / PGDIR_SIZE)
+#define USER_PTRS_PER_PMD(x)	(PTRS_PER_PMD)
 #define FIRST_USER_PGD_NR	0
 
 /* Number of pointers that fit on a page:  this will go away. */
diff -urpN -X /home/fletch/.diff.exclude 640-per_node_idt/include/asm-arm/pgtable.h 650-banana_split/include/asm-arm/pgtable.h
--- 640-per_node_idt/include/asm-arm/pgtable.h	Mon Mar 17 21:43:48 2003
+++ 650-banana_split/include/asm-arm/pgtable.h	Thu Apr 24 21:41:25 2003
@@ -45,6 +45,7 @@ extern void __pgd_error(const char *file
 
 #define FIRST_USER_PGD_NR	1
 #define USER_PTRS_PER_PGD	((TASK_SIZE/PGDIR_SIZE) - FIRST_USER_PGD_NR)
+#define USER_PTRS_PER_PMD(x)	(PTRS_PER_PMD)
 
 /*
  * The table below defines the page protection levels that we insert into our
diff -urpN -X /home/fletch/.diff.exclude 640-per_node_idt/include/asm-cris/pgtable.h 650-banana_split/include/asm-cris/pgtable.h
--- 640-per_node_idt/include/asm-cris/pgtable.h	Sun Apr 20 19:35:05 2003
+++ 650-banana_split/include/asm-cris/pgtable.h	Thu Apr 24 21:41:25 2003
@@ -202,6 +202,7 @@ static inline void flush_tlb(void) 
  */
 
 #define USER_PTRS_PER_PGD       (TASK_SIZE/PGDIR_SIZE)
+#define USER_PTRS_PER_PMD(x)	(PTRS_PER_PMD)
 #define FIRST_USER_PGD_NR       0
 
 /*
diff -urpN -X /home/fletch/.diff.exclude 640-per_node_idt/include/asm-i386/pgtable.h 650-banana_split/include/asm-i386/pgtable.h
--- 640-per_node_idt/include/asm-i386/pgtable.h	Thu Apr 24 21:38:42 2003
+++ 650-banana_split/include/asm-i386/pgtable.h	Thu Apr 24 21:41:25 2003
@@ -55,7 +55,22 @@ void pgtable_cache_init(void);
 #define PGDIR_SIZE	(1UL << PGDIR_SHIFT)
 #define PGDIR_MASK	(~(PGDIR_SIZE-1))
 
-#define USER_PTRS_PER_PGD	(TASK_SIZE/PGDIR_SIZE)
+#define __USER_PTRS_PER_PGD	(TASK_SIZE/PGDIR_SIZE)
+#define FIRST_KERNEL_PGD_PTR	(__USER_PTRS_PER_PGD)
+#define PARTIAL_PGD	(TASK_SIZE > __USER_PTRS_PER_PGD*PGDIR_SIZE ? 1 : 0)
+#define PARTIAL_PMD	((TASK_SIZE % PGDIR_SIZE)/PMD_SIZE)
+#define USER_PTRS_PER_PGD	(PARTIAL_PGD + __USER_PTRS_PER_PGD)
+#ifndef __ASSEMBLY__
+static inline int USER_PTRS_PER_PMD(int pgd_index) {
+	if (pgd_index < __USER_PTRS_PER_PGD)
+		return PTRS_PER_PMD;
+	else if (PARTIAL_PMD && (pgd_index == __USER_PTRS_PER_PGD))
+		return (PTRS_PER_PMD-PARTIAL_PMD);
+	else
+		return 0;
+}
+#endif
+
 #define FIRST_USER_PGD_NR	0
 
 #define USER_PGD_PTRS (PAGE_OFFSET >> PGDIR_SHIFT)
diff -urpN -X /home/fletch/.diff.exclude 640-per_node_idt/include/asm-ia64/pgtable.h 650-banana_split/include/asm-ia64/pgtable.h
--- 640-per_node_idt/include/asm-ia64/pgtable.h	Sun Apr 20 19:35:05 2003
+++ 650-banana_split/include/asm-ia64/pgtable.h	Thu Apr 24 21:41:25 2003
@@ -90,6 +90,7 @@
 #define PGDIR_MASK		(~(PGDIR_SIZE-1))
 #define PTRS_PER_PGD		(__IA64_UL(1) << (PAGE_SHIFT-3))
 #define USER_PTRS_PER_PGD	(5*PTRS_PER_PGD/8)	/* regions 0-4 are user regions */
+#define USER_PTRS_PER_PMD(x)	(PTRS_PER_PMD)
 #define FIRST_USER_PGD_NR	0
 
 /*
diff -urpN -X /home/fletch/.diff.exclude 640-per_node_idt/include/asm-m68k/pgtable.h 650-banana_split/include/asm-m68k/pgtable.h
--- 640-per_node_idt/include/asm-m68k/pgtable.h	Sun Nov 17 20:29:53 2002
+++ 650-banana_split/include/asm-m68k/pgtable.h	Thu Apr 24 21:41:25 2003
@@ -58,6 +58,7 @@
 #define PTRS_PER_PGD	128
 #endif
 #define USER_PTRS_PER_PGD	(TASK_SIZE/PGDIR_SIZE)
+#define USER_PTRS_PER_PMD(x)	(PTRS_PER_PMD)
 #define FIRST_USER_PGD_NR	0
 
 /* Virtual address region for use by kernel_map() */
diff -urpN -X /home/fletch/.diff.exclude 640-per_node_idt/include/asm-mips/pgtable.h 650-banana_split/include/asm-mips/pgtable.h
--- 640-per_node_idt/include/asm-mips/pgtable.h	Sun Apr 20 19:35:05 2003
+++ 650-banana_split/include/asm-mips/pgtable.h	Thu Apr 24 21:41:25 2003
@@ -95,6 +95,7 @@ extern int add_temporary_entry(unsigned 
 #define PTRS_PER_PMD	1
 #define PTRS_PER_PGD	1024
 #define USER_PTRS_PER_PGD	(TASK_SIZE/PGDIR_SIZE)
+#define USER_PTRS_PER_PMD(x)	(PTRS_PER_PMD)
 #define FIRST_USER_PGD_NR	0
 
 #define VMALLOC_START     KSEG2
diff -urpN -X /home/fletch/.diff.exclude 640-per_node_idt/include/asm-mips64/pgtable.h 650-banana_split/include/asm-mips64/pgtable.h
--- 640-per_node_idt/include/asm-mips64/pgtable.h	Thu Apr 24 21:39:07 2003
+++ 650-banana_split/include/asm-mips64/pgtable.h	Thu Apr 24 21:41:25 2003
@@ -124,6 +124,7 @@ extern void (*_flush_cache_l1)(void);
 #define PTRS_PER_PMD	1024
 #define PTRS_PER_PTE	512
 #define USER_PTRS_PER_PGD	(TASK_SIZE/PGDIR_SIZE)
+#define USER_PTRS_PER_PMD(x)	(PTRS_PER_PMD)
 #define FIRST_USER_PGD_NR	0
 
 #define KPTBL_PAGE_ORDER  2
diff -urpN -X /home/fletch/.diff.exclude 640-per_node_idt/include/asm-parisc/pgtable.h 650-banana_split/include/asm-parisc/pgtable.h
--- 640-per_node_idt/include/asm-parisc/pgtable.h	Mon Mar 17 21:43:49 2003
+++ 650-banana_split/include/asm-parisc/pgtable.h	Thu Apr 24 21:41:25 2003
@@ -81,6 +81,7 @@
 #define PGDIR_MASK	(~(PGDIR_SIZE-1))
 #define PTRS_PER_PGD    (1UL << (PAGE_SHIFT - PT_NLEVELS))
 #define USER_PTRS_PER_PGD       PTRS_PER_PGD
+#define USER_PTRS_PER_PMD(x)	(PTRS_PER_PMD)
 
 /* Definitions for 2nd level */
 #define pgtable_cache_init()	do { } while (0)
diff -urpN -X /home/fletch/.diff.exclude 640-per_node_idt/include/asm-ppc/pgtable.h 650-banana_split/include/asm-ppc/pgtable.h
--- 640-per_node_idt/include/asm-ppc/pgtable.h	Tue Apr  8 14:38:20 2003
+++ 650-banana_split/include/asm-ppc/pgtable.h	Thu Apr 24 21:41:25 2003
@@ -83,6 +83,7 @@ extern unsigned long ioremap_bot, iorema
 #define PTRS_PER_PMD	1
 #define PTRS_PER_PGD	1024
 #define USER_PTRS_PER_PGD	(TASK_SIZE / PGDIR_SIZE)
+#define USER_PTRS_PER_PMD(x)	(PTRS_PER_PMD)
 #define FIRST_USER_PGD_NR	0
 
 #define USER_PGD_PTRS (PAGE_OFFSET >> PGDIR_SHIFT)
diff -urpN -X /home/fletch/.diff.exclude 640-per_node_idt/include/asm-ppc64/pgtable.h 650-banana_split/include/asm-ppc64/pgtable.h
--- 640-per_node_idt/include/asm-ppc64/pgtable.h	Wed Mar 26 22:54:37 2003
+++ 650-banana_split/include/asm-ppc64/pgtable.h	Thu Apr 24 21:41:25 2003
@@ -36,6 +36,7 @@
 #define PTRS_PER_PGD	(1 << PGD_INDEX_SIZE)
 
 #define USER_PTRS_PER_PGD	(1024)
+#define USER_PTRS_PER_PMD(x)	(PTRS_PER_PMD)
 #define FIRST_USER_PGD_NR	0
 
 #define EADDR_SIZE (PTE_INDEX_SIZE + PMD_INDEX_SIZE + \
diff -urpN -X /home/fletch/.diff.exclude 640-per_node_idt/include/asm-sh/pgtable.h 650-banana_split/include/asm-sh/pgtable.h
--- 640-per_node_idt/include/asm-sh/pgtable.h	Sun Apr 20 19:35:06 2003
+++ 650-banana_split/include/asm-sh/pgtable.h	Thu Apr 24 21:41:25 2003
@@ -101,6 +101,7 @@ extern unsigned long empty_zero_page[102
 #define PGDIR_MASK	(~(PGDIR_SIZE-1))
 
 #define USER_PTRS_PER_PGD	(TASK_SIZE/PGDIR_SIZE)
+#define USER_PTRS_PER_PMD(x)	(PTRS_PER_PMD)
 #define FIRST_USER_PGD_NR	0
 
 #define PTE_PHYS_MASK	0x1ffff000
diff -urpN -X /home/fletch/.diff.exclude 640-per_node_idt/include/asm-sparc/pgtable.h 650-banana_split/include/asm-sparc/pgtable.h
--- 640-per_node_idt/include/asm-sparc/pgtable.h	Sun Apr 20 19:35:07 2003
+++ 650-banana_split/include/asm-sparc/pgtable.h	Thu Apr 24 21:41:25 2003
@@ -109,6 +109,7 @@ BTFIXUPDEF_INT(page_kernel)
 #define PTRS_PER_PMD    	BTFIXUP_SIMM13(ptrs_per_pmd)
 #define PTRS_PER_PGD    	BTFIXUP_SIMM13(ptrs_per_pgd)
 #define USER_PTRS_PER_PGD	BTFIXUP_SIMM13(user_ptrs_per_pgd)
+#define USER_PTRS_PER_PMD(x)	(PTRS_PER_PMD)
 #define FIRST_USER_PGD_NR	0
 
 #define PAGE_NONE      __pgprot(BTFIXUP_INT(page_none))
diff -urpN -X /home/fletch/.diff.exclude 640-per_node_idt/include/asm-sparc64/pgtable.h 650-banana_split/include/asm-sparc64/pgtable.h
--- 640-per_node_idt/include/asm-sparc64/pgtable.h	Wed Mar 26 22:54:37 2003
+++ 650-banana_split/include/asm-sparc64/pgtable.h	Thu Apr 24 21:41:25 2003
@@ -93,6 +93,7 @@
 /* Kernel has a separate 44bit address space. */
 #define USER_PTRS_PER_PGD	((const int)(test_thread_flag(TIF_32BIT)) ? \
 				 (1) : (PTRS_PER_PGD))
+#define USER_PTRS_PER_PMD(x)	(PTRS_PER_PMD)
 #define FIRST_USER_PGD_NR	0
 
 #define pte_ERROR(e)	__builtin_trap()
diff -urpN -X /home/fletch/.diff.exclude 640-per_node_idt/include/asm-um/pgtable.h 650-banana_split/include/asm-um/pgtable.h
--- 640-per_node_idt/include/asm-um/pgtable.h	Mon Mar 17 21:43:49 2003
+++ 650-banana_split/include/asm-um/pgtable.h	Thu Apr 24 21:41:25 2003
@@ -40,6 +40,7 @@ extern unsigned long *empty_zero_page;
 #define PTRS_PER_PMD	1
 #define PTRS_PER_PGD	1024
 #define USER_PTRS_PER_PGD	(TASK_SIZE/PGDIR_SIZE)
+#define USER_PTRS_PER_PMD(x)	(PTRS_PER_PMD)
 #define FIRST_USER_PGD_NR       0
 
 #define pte_ERROR(e) \
diff -urpN -X /home/fletch/.diff.exclude 640-per_node_idt/include/asm-x86_64/pgtable.h 650-banana_split/include/asm-x86_64/pgtable.h
--- 640-per_node_idt/include/asm-x86_64/pgtable.h	Tue Apr  8 14:38:21 2003
+++ 650-banana_split/include/asm-x86_64/pgtable.h	Thu Apr 24 21:41:26 2003
@@ -111,6 +111,7 @@ static inline void set_pml4(pml4_t *dst,
 #define PGDIR_MASK	(~(PGDIR_SIZE-1))
 
 #define USER_PTRS_PER_PGD	(TASK_SIZE/PGDIR_SIZE)
+#define USER_PTRS_PER_PMD(x)	(PTRS_PER_PMD)
 #define FIRST_USER_PGD_NR	0
 
 #define USER_PGD_PTRS (PAGE_OFFSET >> PGDIR_SHIFT)
diff -urpN -X /home/fletch/.diff.exclude 640-per_node_idt/mm/memory.c 650-banana_split/mm/memory.c
--- 640-per_node_idt/mm/memory.c	Thu Apr 24 21:37:39 2003
+++ 650-banana_split/mm/memory.c	Thu Apr 24 21:41:26 2003
@@ -99,9 +99,10 @@ static inline void free_one_pmd(struct m
 	pte_free_tlb(tlb, page);
 }
 
-static inline void free_one_pgd(struct mmu_gather *tlb, pgd_t * dir)
+static inline void free_one_pgd(struct mmu_gather *tlb, pgd_t * pgd, unsigned long pgdi)
 {
 	pmd_t * pmd, * md, * emd;
+	pgd_t *dir = pgd + pgdi;
 
 	if (pgd_none(*dir))
 		return;
@@ -125,7 +126,7 @@ static inline void free_one_pgd(struct m
 	 * found at 0x80000000 onwards.  The loop below compiles instead
 	 * to be terminated by unsigned address comparison using "jb".
 	 */
-	for (md = pmd, emd = pmd + PTRS_PER_PMD; md < emd; md++)
+	for (md = pmd, emd = pmd + USER_PTRS_PER_PMD(pgdi); md < emd; md++)
 		free_one_pmd(tlb,md);
 	pmd_free_tlb(tlb, pmd);
 }
@@ -139,11 +140,11 @@ static inline void free_one_pgd(struct m
 void clear_page_tables(struct mmu_gather *tlb, unsigned long first, int nr)
 {
 	pgd_t * page_dir = tlb->mm->pgd;
-
-	page_dir += first;
+	int index = first;
+	
 	do {
-		free_one_pgd(tlb, page_dir);
-		page_dir++;
+		free_one_pgd(tlb, page_dir, index);
+		index++;
 	} while (--nr);
 }
 


* Re: [PATCH] i386 uaccess to fixmap pages
  2003-05-09 12:40     ` Jamie Lokier
@ 2003-05-09 16:06       ` Linus Torvalds
  2003-05-09 16:38         ` Dave Hansen
  2003-05-10  3:26         ` H. Peter Anvin
  0 siblings, 2 replies; 22+ messages in thread
From: Linus Torvalds @ 2003-05-09 16:06 UTC
  To: Jamie Lokier; +Cc: Roland McGrath, Andrew Morton, linux-kernel


On Fri, 9 May 2003, Jamie Lokier wrote:
>
> Why don't you change TASK_SIZE to 0xc0001000 (or so) and place the
> user-visible fixmaps at 0xc0000000?

I think I almost agree..

> That would have no cost at all.

It actually does have some cost in that form, namely the fact that the
kernel 1:1 mapping needs to be 4MB-aligned in order to take advantage of
large-pte support. So we'd have to move the kernel to something like
0xc0400000 (and preferably higher, to make sure there is a nice hole in
between - say 0xc1000000), which in turn has a cost of verifying that 
nothing assumes the current lay-out (we've had the 1/2/3GB TASK_SIZE 
patches floating around, but they've never had "odd sizes").

There's another cost, which is that right now we share the pgd with the 
kernel fixmaps, and this would mean that we'd have a new one. That's just 
a single page, though.

But it might "just work", and it would be interesting to see what the
patch would look like. Hint hint.

[ The current "TASK_SIZE comes up to start of kernel" is actually a bad 
  design, since it makes the boundary between user mapping and kernel 
  mapping a very abrupt one without any hole in between - opening us up
  for problems with overflows from user space addresses to kernel
  addresses on x86.

  In particular, if we use the wrong size to "access_ok()", a user program
  could fool the kernel into accessing low kernel memory. I don't know of any
  such bug, but having to be careful about it _did_ complicate
  "strncpy_from_user()" on x86. So from a security angle it would also be
  good to do this ]
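
[ To put made-up numbers on that: with TASK_SIZE == PAGE_OFFSET,
  access_ok(VERIFY_READ, 0xbffff000, 0x2000) has to be rejected purely
  on the size term - the start address is fine, but addr+size reaches
  0xc0001000, which is already kernel memory.  A guard hole above
  TASK_SIZE would leave slack for such checks to be wrong without
  being exploitable ]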

		Linus



* Re: [PATCH] i386 uaccess to fixmap pages
  2003-05-09 16:06       ` Linus Torvalds
@ 2003-05-09 16:38         ` Dave Hansen
  2003-05-09 15:55           ` Martin J. Bligh
  2003-05-09 16:48           ` Linus Torvalds
  2003-05-10  3:26         ` H. Peter Anvin
  1 sibling, 2 replies; 22+ messages in thread
From: Dave Hansen @ 2003-05-09 16:38 UTC
  To: Linus Torvalds; +Cc: Jamie Lokier, Roland McGrath, Andrew Morton, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 1951 bytes --]

Linus Torvalds wrote:
> On Fri, 9 May 2003, Jamie Lokier wrote:
> 
>>Why don't you change TASK_SIZE to 0xc0001000 (or so) and place the
>>user-visible fixmaps at 0xc0000000?
> 
> I think I almost agree..
> 
>>That would have no cost at all.
> 
> It actually does have some cost in that form, namely the fact that the
> kernel 1:1 mapping needs to be 4MB-aligned in order to take advantage of
> large-pte support. So we'd have to move the kernel to something like
> 0xc0400000 (and preferably higher, to make sure there is a nice hole in
> between - say 0xc1000000), which in turn has a cost of verifying that 
> nothing assumes the current lay-out (we've had the 1/2/3GB TASK_SIZE 
> patches floating around, but they've never had "odd sizes").

We've been playing with patches in the -mjb tree which make PAGE_OFFSET
and TASK_SIZE move to some weird values.  I have it to the point where I
could do a 3.875:0.125 user:kernel split.  It was created to let some
crazy database nuts get 3.5:0.5 with PAE on.  But, it should be mostly
applicable here.  The attached banana_split patch does this, and depends
on the separate_pmd patch.

> There's another cost, which is that right now we share the pgd with the 
> kernel fixmaps, and this would mean that we'd have a new one. That's just 
> a single page, though.

You mean we share the pmd, not the pgd, right?  The attached
separate_pmd patch unshares the kernel pmd page.  It keeps a separate
slab of kernel pmds, but suffers from the same problem which Bill Irwin
just fixed with PGDs, where the pages in the slab might need to get
updated mappings.  They will need the same treatment as the non-PAE pgds.

http://marc.theaimsgroup.com/?l=linux-kernel&m=105213208522985&w=2

Don't anyone go applying these yet, though.  I think there has been a
bugfix or two since Martin released 2.5.68-mjb1, where these came from.
So, consider them just an example for now.

-- 
Dave Hansen
haveblue@us.ibm.com

[-- Attachment #2: 540-separate_pmd --]
[-- Type: text/plain, Size: 3823 bytes --]

diff -urpN -X /home/fletch/.diff.exclude 520-queuestat/arch/i386/mm/init.c 540-separate_pmd/arch/i386/mm/init.c
--- 520-queuestat/arch/i386/mm/init.c	Sun Apr 20 22:07:17 2003
+++ 540-separate_pmd/arch/i386/mm/init.c	Sun Apr 20 22:22:15 2003
@@ -514,9 +514,11 @@ void __init mem_init(void)
 #include <linux/slab.h>
 
 kmem_cache_t *pmd_cache;
+kmem_cache_t *kernel_pmd_cache;
 kmem_cache_t *pgd_cache;
 
 void pmd_ctor(void *, kmem_cache_t *, unsigned long);
+void kernel_pmd_ctor(void *, kmem_cache_t *, unsigned long);
 void pgd_ctor(void *, kmem_cache_t *, unsigned long);
 
 void __init pgtable_cache_init(void)
@@ -528,9 +530,18 @@ void __init pgtable_cache_init(void)
 						SLAB_HWCACHE_ALIGN | SLAB_MUST_HWCACHE_ALIGN,
 						pmd_ctor,
 						NULL);
-
 		if (!pmd_cache)
 			panic("pgtable_cache_init(): cannot create pmd cache");
+
+		kernel_pmd_cache = kmem_cache_create("pae_kernel_pmd",
+						(PTRS_PER_PMD*sizeof(pmd_t))*KERNEL_PGD_PTRS,
+						0,
+						SLAB_HWCACHE_ALIGN | SLAB_MUST_HWCACHE_ALIGN,
+						kernel_pmd_ctor,
+						NULL);
+
+		if (!kernel_pmd_cache)
+			panic("pgtable_cache_init(): cannot create kernel pmd cache");
 	}
 
         /*
diff -urpN -X /home/fletch/.diff.exclude 520-queuestat/arch/i386/mm/pgtable.c 540-separate_pmd/arch/i386/mm/pgtable.c
--- 520-queuestat/arch/i386/mm/pgtable.c	Mon Mar 17 21:43:39 2003
+++ 540-separate_pmd/arch/i386/mm/pgtable.c	Sun Apr 20 22:22:15 2003
@@ -136,7 +136,7 @@ pte_t *pte_alloc_one_kernel(struct mm_st
    
    	do {
 		pte = (pte_t *) __get_free_page(GFP_KERNEL);
-		if (pte)
+		if (pte) 
 			clear_page(pte);
 		else {
 			current->state = TASK_UNINTERRUPTIBLE;
@@ -168,6 +168,7 @@ struct page *pte_alloc_one(struct mm_str
 }
 
 extern kmem_cache_t *pmd_cache;
+extern kmem_cache_t *kernel_pmd_cache;
 extern kmem_cache_t *pgd_cache;
 
 void pmd_ctor(void *__pmd, kmem_cache_t *pmd_cache, unsigned long flags)
@@ -175,6 +176,15 @@ void pmd_ctor(void *__pmd, kmem_cache_t 
 	clear_page(__pmd);
 }
 
+void kernel_pmd_ctor(void *__pmd, kmem_cache_t *kernel_pmd_cache, unsigned long flags)
+{
+	int i;
+	for (i=USER_PTRS_PER_PGD; i<PTRS_PER_PGD; i++) {
+		pmd_t *kern_pmd = (pmd_t *)pgd_page(swapper_pg_dir[i]);
+		memcpy(__pmd+PAGE_SIZE*(i-USER_PTRS_PER_PGD), kern_pmd, PAGE_SIZE);
+	}
+}
+
 void pgd_ctor(void *__pgd, kmem_cache_t *pgd_cache, unsigned long flags)
 {
 	pgd_t *pgd = __pgd;
@@ -190,14 +200,20 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
 {
 	int i;
 	pgd_t *pgd = kmem_cache_alloc(pgd_cache, SLAB_KERNEL);
+	kmem_cache_t *pmd_cachep = pmd_cache;
 
 	if (PTRS_PER_PMD == 1)
 		return pgd;
 	else if (!pgd)
 		return NULL;
 
-	for (i = 0; i < USER_PTRS_PER_PGD; ++i) {
-		pmd_t *pmd = kmem_cache_alloc(pmd_cache, SLAB_KERNEL);
+	for (i = 0; i < PTRS_PER_PGD; ++i) {
+		pmd_t *pmd;
+		
+		if (i >= USER_PTRS_PER_PGD) 
+			pmd_cachep = kernel_pmd_cache;
+			
+		pmd = kmem_cache_alloc(pmd_cachep, SLAB_KERNEL);
 		if (!pmd)
 			goto out_oom;
 		set_pgd(pgd + i, __pgd(1 + __pa((unsigned long long)((unsigned long)pmd))));
@@ -205,8 +221,10 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
 	return pgd;
 
 out_oom:
-	for (i--; i >= 0; --i)
-		kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1));
+	for (i--; i >= 0; --i) {
+		pmd_t *pmd = pmd_offset(&pgd[i],0);
+		kmem_cache_free(pmd_cachep, pmd);
+	}
 	kmem_cache_free(pgd_cache, (void *)pgd);
 	return NULL;
 }
@@ -216,8 +234,14 @@ void pgd_free(pgd_t *pgd)
 	int i;
 
 	if (PTRS_PER_PMD > 1) {
-		for (i = 0; i < USER_PTRS_PER_PGD; ++i) {
-			kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1));
+		for (i = 0; i < PTRS_PER_PGD; i++) {
+			pmd_t *pmd_to_free = pmd_offset(&pgd[i],0);
+			
+			if( i >= USER_PTRS_PER_PGD )
+				kmem_cache_free(kernel_pmd_cache, pmd_to_free);
+			else 
+				kmem_cache_free(pmd_cache, pmd_to_free);
+
 			set_pgd(pgd + i, __pgd(0));
 		}
 	}

[-- Attachment #3: 650-banana_split.1 --]
[-- Type: text/plain, Size: 18867 bytes --]

diff -urpN -X /home/fletch/.diff.exclude 640-per_node_idt/arch/i386/Kconfig 650-banana_split/arch/i386/Kconfig
--- 640-per_node_idt/arch/i386/Kconfig	Sun Apr 20 22:22:27 2003
+++ 650-banana_split/arch/i386/Kconfig	Sun Apr 20 22:35:24 2003
@@ -687,7 +687,6 @@ choice
 	
 config	05GB
 	bool "3.5 GB"
-	depends on !HIGHMEM64G
 	
 config	1GB
 	bool "3 GB"
diff -urpN -X /home/fletch/.diff.exclude 640-per_node_idt/arch/i386/mm/init.c 650-banana_split/arch/i386/mm/init.c
--- 640-per_node_idt/arch/i386/mm/init.c	Sun Apr 20 22:22:15 2003
+++ 650-banana_split/arch/i386/mm/init.c	Sun Apr 20 22:35:24 2003
@@ -123,6 +123,24 @@ static void __init page_table_range_init
 	}
 }
 
+
+/*
+ * Abstract out using large pages when mapping KVA, or the SMP identity
+ * mapping
+ */
+void pmd_map_pfn_range(pmd_t* pmd_entry, unsigned long pfn, unsigned long max_pfn)
+{
+	int pte_ofs;
+	/* Map with big pages if possible, otherwise create normal page tables. */
+	if (cpu_has_pse) {
+		set_pmd(pmd_entry, pfn_pmd(pfn, PAGE_KERNEL_LARGE));
+		pfn += PTRS_PER_PTE;
+	} else {
+		pte_t* pte = one_page_table_init(pmd_entry);
+		for (pte_ofs = 0; pte_ofs < PTRS_PER_PTE && pfn < max_pfn; pte++, pfn++, pte_ofs++)
+			set_pte(pte, pfn_pte(pfn, PAGE_KERNEL));
+	}
+}
 /*
  * This maps the physical memory to kernel virtual address space, a total 
  * of max_low_pfn pages, by creating page tables starting from address 
@@ -133,8 +151,7 @@ static void __init kernel_physical_mappi
 	unsigned long pfn;
 	pgd_t *pgd;
 	pmd_t *pmd;
-	pte_t *pte;
-	int pgd_idx, pmd_idx, pte_ofs;
+	int pgd_idx, pmd_idx;
 
 	pgd_idx = pgd_index(PAGE_OFFSET);
 	pgd = pgd_base + pgd_idx;
@@ -144,21 +161,48 @@ static void __init kernel_physical_mappi
 		pmd = one_md_table_init(pgd);
 		if (pfn >= max_low_pfn)
 			continue;
-		for (pmd_idx = 0; pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn; pmd++, pmd_idx++) {
-			/* Map with big pages if possible, otherwise create normal page tables. */
-			if (cpu_has_pse) {
-				set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE));
-				pfn += PTRS_PER_PTE;
-			} else {
-				pte = one_page_table_init(pmd);
-
-				for (pte_ofs = 0; pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn; pte++, pfn++, pte_ofs++)
-					set_pte(pte, pfn_pte(pfn, PAGE_KERNEL));
-			}
+	
+		/* beware of starting KVA in the middle of a pmd. */
+		if( pgd_idx == pgd_index(PAGE_OFFSET) ) {
+			pmd_idx = pmd_index(PAGE_OFFSET);
+			pmd = &pmd[pmd_idx];
+		} else
+			pmd_idx = 0;
+
+		for (; pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn; pmd++, pmd_idx++) {
+			pmd_map_pfn_range(pmd, pfn, max_low_pfn);
+			pfn += PTRS_PER_PTE; 
 		}
 	}	
 }
 
+/*
+ * Add low memory identity-mappings - SMP needs it when
+ * starting up on an AP from real-mode. In the non-PAE
+ * case we already have these mappings through head.S.
+ * All user-space mappings are explicitly cleared after
+ * SMP startup in zap_low_mappings().
+ */
+static void __init low_physical_mapping_init(pgd_t *pgd_base)
+{
+#if CONFIG_X86_PAE
+	unsigned long pfn = 0;
+	int pmd_ofs = 0;
+	pmd_t *pmd = one_md_table_init(pgd_base);
+
+	if(!cpu_has_pse) {
+		printk("PAE enabled, but no support for PSE (large pages)!\n");
+		printk("this is likely to waste some RAM.");
+	}
+	
+	for (; pmd_ofs < PTRS_PER_PMD && pfn <= max_low_pfn; pmd++, pmd_ofs++) { 
+		pmd_map_pfn_range(pmd, pfn, max_low_pfn);
+		pfn += PTRS_PER_PTE;
+	}		
+#endif
+}
+
+
 static inline int page_kills_ppro(unsigned long pagenr)
 {
 	if (pagenr >= 0x70000 && pagenr <= 0x7003F)
@@ -225,7 +269,7 @@ void __init permanent_kmaps_init(pgd_t *
 	pgd = swapper_pg_dir + pgd_index(vaddr);
 	pmd = pmd_offset(pgd, vaddr);
 	pte = pte_offset_kernel(pmd, vaddr);
-	pkmap_page_table = pte;	
+	pkmap_page_table = pte;
 }
 
 void __init one_highpage_init(struct page *page, int pfn, int bad_ppro)
@@ -290,6 +334,7 @@ static void __init pagetable_init (void)
 	}
 
 	kernel_physical_mapping_init(pgd_base);
+	low_physical_mapping_init(pgd_base);
 	remap_numa_kva();
 
 	/*
@@ -298,19 +343,7 @@ static void __init pagetable_init (void)
 	 */
 	vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK;
 	page_table_range_init(vaddr, 0, pgd_base);
-
 	permanent_kmaps_init(pgd_base);
-
-#if CONFIG_X86_PAE
-	/*
-	 * Add low memory identity-mappings - SMP needs it when
-	 * starting up on an AP from real-mode. In the non-PAE
-	 * case we already have these mappings through head.S.
-	 * All user-space mappings are explicitly cleared after
-	 * SMP startup.
-	 */
-	pgd_base[0] = pgd_base[USER_PTRS_PER_PGD];
-#endif
 }
 
 void zap_low_mappings (void)
@@ -322,7 +355,7 @@ void zap_low_mappings (void)
 	 * Note that "pgd_clear()" doesn't do it for
 	 * us, because pgd_clear() is a no-op on i386.
 	 */
-	for (i = 0; i < USER_PTRS_PER_PGD; i++)
+	for (i = 0; i < __USER_PTRS_PER_PGD; i++)
 #if CONFIG_X86_PAE
 		set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page)));
 #else
diff -urpN -X /home/fletch/.diff.exclude 640-per_node_idt/arch/i386/mm/pgtable.c 650-banana_split/arch/i386/mm/pgtable.c
--- 640-per_node_idt/arch/i386/mm/pgtable.c	Sun Apr 20 22:22:15 2003
+++ 650-banana_split/arch/i386/mm/pgtable.c	Sun Apr 20 22:35:24 2003
@@ -178,10 +178,22 @@ void pmd_ctor(void *__pmd, kmem_cache_t 
 
 void kernel_pmd_ctor(void *__pmd, kmem_cache_t *kernel_pmd_cache, unsigned long flags)
 {
+	pmd_t *pmd = __pmd;
 	int i;
-	for (i=USER_PTRS_PER_PGD; i<PTRS_PER_PGD; i++) {
+	
+	/* you only need to memset the portion which isn't used by
+	 * the kernel
+	 */
+	clear_page(__pmd);
+	
+	for (i=FIRST_KERNEL_PGD_PTR; i<PTRS_PER_PGD; i++, pmd+=PTRS_PER_PMD) {
 		pmd_t *kern_pmd = (pmd_t *)pgd_page(swapper_pg_dir[i]);
-		memcpy(__pmd+PAGE_SIZE*(i-USER_PTRS_PER_PGD), kern_pmd, PAGE_SIZE);
+		int start_index = USER_PTRS_PER_PMD(i);
+		pmd_t *dst_pmd = &pmd[start_index];
+		pmd_t *src_pmd = &kern_pmd[start_index];
+		int num_pmds = PTRS_PER_PMD-USER_PTRS_PER_PMD(i);
+		
+		memcpy(dst_pmd, src_pmd, num_pmds*sizeof(pmd_t));
 	}
 }
 
@@ -200,30 +212,42 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
 {
 	int i;
 	pgd_t *pgd = kmem_cache_alloc(pgd_cache, SLAB_KERNEL);
-	kmem_cache_t *pmd_cachep = pmd_cache;
+	pmd_t *kern_pmd_pages;
 
 	if (PTRS_PER_PMD == 1)
 		return pgd;
 	else if (!pgd)
 		return NULL;
 
-	for (i = 0; i < PTRS_PER_PGD; ++i) {
+	for (i = 0; i < FIRST_KERNEL_PGD_PTR; ++i) {
 		pmd_t *pmd;
-		
-		if (i >= USER_PTRS_PER_PGD) 
-			pmd_cachep = kernel_pmd_cache;
-			
-		pmd = kmem_cache_alloc(pmd_cachep, SLAB_KERNEL);
+	
+		pmd = kmem_cache_alloc(pmd_cache, SLAB_KERNEL);
 		if (!pmd)
 			goto out_oom;
 		set_pgd(pgd + i, __pgd(1 + __pa((unsigned long long)((unsigned long)pmd))));
 	}
+	
+	/*
+	 * The kernel pages are allocated in one big chunk, instead of having
+	 * a different slab for each one. 
+	 */
+	kern_pmd_pages = kmem_cache_alloc(kernel_pmd_cache, SLAB_KERNEL);
+	if (!kern_pmd_pages)
+		goto out_oom;
+	
+	for (i = FIRST_KERNEL_PGD_PTR; i < PTRS_PER_PGD; ++i) {
+		set_pgd(pgd + i, __pgd(1 + __pa((unsigned long long)((unsigned long)kern_pmd_pages))));
+		/* kern_pmd_pages += PAGE_SIZE, effectively */
+		kern_pmd_pages = &kern_pmd_pages[PTRS_PER_PMD];
+	}
+	
 	return pgd;
 
 out_oom:
 	for (i--; i >= 0; --i) {
 		pmd_t *pmd = pmd_offset(&pgd[i],0);
-		kmem_cache_free(pmd_cachep, pmd);
+		kmem_cache_free(pmd_cache, pmd);
 	}
 	kmem_cache_free(pgd_cache, (void *)pgd);
 	return NULL;
@@ -232,19 +256,24 @@ out_oom:
 void pgd_free(pgd_t *pgd)
 {
 	int i;
-
+	pmd_t *pmd_to_free;
+		
 	if (PTRS_PER_PMD > 1) {
-		for (i = 0; i < PTRS_PER_PGD; i++) {
-			pmd_t *pmd_to_free = pmd_offset(&pgd[i],0);
-			
-			if( i >= USER_PTRS_PER_PGD )
-				kmem_cache_free(kernel_pmd_cache, pmd_to_free);
-			else 
-				kmem_cache_free(pmd_cache, pmd_to_free);
-
+		for (i = 0; i < FIRST_KERNEL_PGD_PTR; i++) {
+			pmd_to_free = pmd_offset(&pgd[i],0);
+			kmem_cache_free(pmd_cache, pmd_to_free);
 			set_pgd(pgd + i, __pgd(0));
 		}
+		
+		/*
+		 * All of the kernel pmd pages are allocated from a single
+		 * slab, as a unit.  They must only be freed once
+		 */
+		pmd_to_free = pmd_offset(&pgd[FIRST_KERNEL_PGD_PTR],0);
+		kmem_cache_free(kernel_pmd_cache, pmd_to_free);
+		for (i = FIRST_KERNEL_PGD_PTR; i < PTRS_PER_PGD; i++)
+			set_pgd(&pgd[i], __pgd(0));
 	}
-
+	
 	kmem_cache_free(pgd_cache, (void *)pgd);
 }
diff -urpN -X /home/fletch/.diff.exclude 640-per_node_idt/include/asm-alpha/pgtable.h 650-banana_split/include/asm-alpha/pgtable.h
--- 640-per_node_idt/include/asm-alpha/pgtable.h	Tue Apr  8 14:38:20 2003
+++ 650-banana_split/include/asm-alpha/pgtable.h	Sun Apr 20 22:35:24 2003
@@ -39,6 +39,7 @@
 #define PTRS_PER_PMD	(1UL << (PAGE_SHIFT-3))
 #define PTRS_PER_PGD	(1UL << (PAGE_SHIFT-3))
 #define USER_PTRS_PER_PGD	(TASK_SIZE / PGDIR_SIZE)
+#define USER_PTRS_PER_PMD(x)	(PTRS_PER_PMD)
 #define FIRST_USER_PGD_NR	0
 
 /* Number of pointers that fit on a page:  this will go away. */
diff -urpN -X /home/fletch/.diff.exclude 640-per_node_idt/include/asm-arm/pgtable.h 650-banana_split/include/asm-arm/pgtable.h
--- 640-per_node_idt/include/asm-arm/pgtable.h	Mon Mar 17 21:43:48 2003
+++ 650-banana_split/include/asm-arm/pgtable.h	Sun Apr 20 22:35:24 2003
@@ -45,6 +45,7 @@ extern void __pgd_error(const char *file
 
 #define FIRST_USER_PGD_NR	1
 #define USER_PTRS_PER_PGD	((TASK_SIZE/PGDIR_SIZE) - FIRST_USER_PGD_NR)
+#define USER_PTRS_PER_PMD(x)	(PTRS_PER_PMD)
 
 /*
  * The table below defines the page protection levels that we insert into our
diff -urpN -X /home/fletch/.diff.exclude 640-per_node_idt/include/asm-cris/pgtable.h 650-banana_split/include/asm-cris/pgtable.h
--- 640-per_node_idt/include/asm-cris/pgtable.h	Sun Apr 20 19:35:05 2003
+++ 650-banana_split/include/asm-cris/pgtable.h	Sun Apr 20 22:35:24 2003
@@ -202,6 +202,7 @@ static inline void flush_tlb(void) 
  */
 
 #define USER_PTRS_PER_PGD       (TASK_SIZE/PGDIR_SIZE)
+#define USER_PTRS_PER_PMD(x)	(PTRS_PER_PMD)
 #define FIRST_USER_PGD_NR       0
 
 /*
diff -urpN -X /home/fletch/.diff.exclude 640-per_node_idt/include/asm-i386/pgtable.h 650-banana_split/include/asm-i386/pgtable.h
--- 640-per_node_idt/include/asm-i386/pgtable.h	Sun Apr 20 22:14:44 2003
+++ 650-banana_split/include/asm-i386/pgtable.h	Sun Apr 20 22:35:24 2003
@@ -55,7 +55,22 @@ void pgtable_cache_init(void);
 #define PGDIR_SIZE	(1UL << PGDIR_SHIFT)
 #define PGDIR_MASK	(~(PGDIR_SIZE-1))
 
-#define USER_PTRS_PER_PGD	(TASK_SIZE/PGDIR_SIZE)
+#define __USER_PTRS_PER_PGD	(TASK_SIZE/PGDIR_SIZE)
+#define FIRST_KERNEL_PGD_PTR	(__USER_PTRS_PER_PGD)
+#define PARTIAL_PGD	(TASK_SIZE > __USER_PTRS_PER_PGD*PGDIR_SIZE ? 1 : 0)
+#define PARTIAL_PMD	((TASK_SIZE % PGDIR_SIZE)/PMD_SIZE)
+#define USER_PTRS_PER_PGD	(PARTIAL_PGD + __USER_PTRS_PER_PGD)
+#ifndef __ASSEMBLY__
+static inline int USER_PTRS_PER_PMD(int pgd_index) {
+	if (pgd_index < __USER_PTRS_PER_PGD)
+		return PTRS_PER_PMD;
+	else if (PARTIAL_PMD && (pgd_index == __USER_PTRS_PER_PGD))
+		return (PTRS_PER_PMD-PARTIAL_PMD);
+	else
+		return 0;
+}
+#endif
+
 #define FIRST_USER_PGD_NR	0
 
 #define USER_PGD_PTRS (PAGE_OFFSET >> PGDIR_SHIFT)
diff -urpN -X /home/fletch/.diff.exclude 640-per_node_idt/include/asm-ia64/pgtable.h 650-banana_split/include/asm-ia64/pgtable.h
--- 640-per_node_idt/include/asm-ia64/pgtable.h	Sun Apr 20 19:35:05 2003
+++ 650-banana_split/include/asm-ia64/pgtable.h	Sun Apr 20 22:35:24 2003
@@ -90,6 +90,7 @@
 #define PGDIR_MASK		(~(PGDIR_SIZE-1))
 #define PTRS_PER_PGD		(__IA64_UL(1) << (PAGE_SHIFT-3))
 #define USER_PTRS_PER_PGD	(5*PTRS_PER_PGD/8)	/* regions 0-4 are user regions */
+#define USER_PTRS_PER_PMD(x)	(PTRS_PER_PMD)
 #define FIRST_USER_PGD_NR	0
 
 /*
diff -urpN -X /home/fletch/.diff.exclude 640-per_node_idt/include/asm-m68k/pgtable.h 650-banana_split/include/asm-m68k/pgtable.h
--- 640-per_node_idt/include/asm-m68k/pgtable.h	Sun Nov 17 20:29:53 2002
+++ 650-banana_split/include/asm-m68k/pgtable.h	Sun Apr 20 22:35:24 2003
@@ -58,6 +58,7 @@
 #define PTRS_PER_PGD	128
 #endif
 #define USER_PTRS_PER_PGD	(TASK_SIZE/PGDIR_SIZE)
+#define USER_PTRS_PER_PMD(x)	(PTRS_PER_PMD)
 #define FIRST_USER_PGD_NR	0
 
 /* Virtual address region for use by kernel_map() */
diff -urpN -X /home/fletch/.diff.exclude 640-per_node_idt/include/asm-mips/pgtable.h 650-banana_split/include/asm-mips/pgtable.h
--- 640-per_node_idt/include/asm-mips/pgtable.h	Sun Apr 20 19:35:05 2003
+++ 650-banana_split/include/asm-mips/pgtable.h	Sun Apr 20 22:35:24 2003
@@ -95,6 +95,7 @@ extern int add_temporary_entry(unsigned 
 #define PTRS_PER_PMD	1
 #define PTRS_PER_PGD	1024
 #define USER_PTRS_PER_PGD	(TASK_SIZE/PGDIR_SIZE)
+#define USER_PTRS_PER_PMD(x)	(PTRS_PER_PMD)
 #define FIRST_USER_PGD_NR	0
 
 #define VMALLOC_START     KSEG2
diff -urpN -X /home/fletch/.diff.exclude 640-per_node_idt/include/asm-mips64/pgtable.h 650-banana_split/include/asm-mips64/pgtable.h
--- 640-per_node_idt/include/asm-mips64/pgtable.h	Sun Apr 20 22:15:36 2003
+++ 650-banana_split/include/asm-mips64/pgtable.h	Sun Apr 20 22:35:24 2003
@@ -124,6 +124,7 @@ extern void (*_flush_cache_l1)(void);
 #define PTRS_PER_PMD	1024
 #define PTRS_PER_PTE	512
 #define USER_PTRS_PER_PGD	(TASK_SIZE/PGDIR_SIZE)
+#define USER_PTRS_PER_PMD(x)	(PTRS_PER_PMD)
 #define FIRST_USER_PGD_NR	0
 
 #define KPTBL_PAGE_ORDER  2
diff -urpN -X /home/fletch/.diff.exclude 640-per_node_idt/include/asm-parisc/pgtable.h 650-banana_split/include/asm-parisc/pgtable.h
--- 640-per_node_idt/include/asm-parisc/pgtable.h	Mon Mar 17 21:43:49 2003
+++ 650-banana_split/include/asm-parisc/pgtable.h	Sun Apr 20 22:35:24 2003
@@ -81,6 +81,7 @@
 #define PGDIR_MASK	(~(PGDIR_SIZE-1))
 #define PTRS_PER_PGD    (1UL << (PAGE_SHIFT - PT_NLEVELS))
 #define USER_PTRS_PER_PGD       PTRS_PER_PGD
+#define USER_PTRS_PER_PMD(x)	(PTRS_PER_PMD)
 
 /* Definitions for 2nd level */
 #define pgtable_cache_init()	do { } while (0)
diff -urpN -X /home/fletch/.diff.exclude 640-per_node_idt/include/asm-ppc/pgtable.h 650-banana_split/include/asm-ppc/pgtable.h
--- 640-per_node_idt/include/asm-ppc/pgtable.h	Tue Apr  8 14:38:20 2003
+++ 650-banana_split/include/asm-ppc/pgtable.h	Sun Apr 20 22:35:24 2003
@@ -83,6 +83,7 @@ extern unsigned long ioremap_bot, iorema
 #define PTRS_PER_PMD	1
 #define PTRS_PER_PGD	1024
 #define USER_PTRS_PER_PGD	(TASK_SIZE / PGDIR_SIZE)
+#define USER_PTRS_PER_PMD(x)	(PTRS_PER_PMD)
 #define FIRST_USER_PGD_NR	0
 
 #define USER_PGD_PTRS (PAGE_OFFSET >> PGDIR_SHIFT)
diff -urpN -X /home/fletch/.diff.exclude 640-per_node_idt/include/asm-ppc64/pgtable.h 650-banana_split/include/asm-ppc64/pgtable.h
--- 640-per_node_idt/include/asm-ppc64/pgtable.h	Wed Mar 26 22:54:37 2003
+++ 650-banana_split/include/asm-ppc64/pgtable.h	Sun Apr 20 22:35:24 2003
@@ -36,6 +36,7 @@
 #define PTRS_PER_PGD	(1 << PGD_INDEX_SIZE)
 
 #define USER_PTRS_PER_PGD	(1024)
+#define USER_PTRS_PER_PMD(x)	(PTRS_PER_PMD)
 #define FIRST_USER_PGD_NR	0
 
 #define EADDR_SIZE (PTE_INDEX_SIZE + PMD_INDEX_SIZE + \
diff -urpN -X /home/fletch/.diff.exclude 640-per_node_idt/include/asm-sh/pgtable.h 650-banana_split/include/asm-sh/pgtable.h
--- 640-per_node_idt/include/asm-sh/pgtable.h	Sun Apr 20 19:35:06 2003
+++ 650-banana_split/include/asm-sh/pgtable.h	Sun Apr 20 22:35:48 2003
@@ -101,6 +101,7 @@ extern unsigned long empty_zero_page[102
 #define PGDIR_MASK	(~(PGDIR_SIZE-1))
 
 #define USER_PTRS_PER_PGD	(TASK_SIZE/PGDIR_SIZE)
+#define USER_PTRS_PER_PMD(x)	(PTRS_PER_PMD)
 #define FIRST_USER_PGD_NR	0
 
 #define PTE_PHYS_MASK	0x1ffff000
diff -urpN -X /home/fletch/.diff.exclude 640-per_node_idt/include/asm-sparc/pgtable.h 650-banana_split/include/asm-sparc/pgtable.h
--- 640-per_node_idt/include/asm-sparc/pgtable.h	Sun Apr 20 19:35:07 2003
+++ 650-banana_split/include/asm-sparc/pgtable.h	Sun Apr 20 22:35:48 2003
@@ -109,6 +109,7 @@ BTFIXUPDEF_INT(page_kernel)
 #define PTRS_PER_PMD    	BTFIXUP_SIMM13(ptrs_per_pmd)
 #define PTRS_PER_PGD    	BTFIXUP_SIMM13(ptrs_per_pgd)
 #define USER_PTRS_PER_PGD	BTFIXUP_SIMM13(user_ptrs_per_pgd)
+#define USER_PTRS_PER_PMD(x)	(PTRS_PER_PMD)
 #define FIRST_USER_PGD_NR	0
 
 #define PAGE_NONE      __pgprot(BTFIXUP_INT(page_none))
diff -urpN -X /home/fletch/.diff.exclude 640-per_node_idt/include/asm-sparc64/pgtable.h 650-banana_split/include/asm-sparc64/pgtable.h
--- 640-per_node_idt/include/asm-sparc64/pgtable.h	Wed Mar 26 22:54:37 2003
+++ 650-banana_split/include/asm-sparc64/pgtable.h	Sun Apr 20 22:35:48 2003
@@ -93,6 +93,7 @@
 /* Kernel has a separate 44bit address space. */
 #define USER_PTRS_PER_PGD	((const int)(test_thread_flag(TIF_32BIT)) ? \
 				 (1) : (PTRS_PER_PGD))
+#define USER_PTRS_PER_PMD(x)	(PTRS_PER_PMD)
 #define FIRST_USER_PGD_NR	0
 
 #define pte_ERROR(e)	__builtin_trap()
diff -urpN -X /home/fletch/.diff.exclude 640-per_node_idt/include/asm-um/pgtable.h 650-banana_split/include/asm-um/pgtable.h
--- 640-per_node_idt/include/asm-um/pgtable.h	Mon Mar 17 21:43:49 2003
+++ 650-banana_split/include/asm-um/pgtable.h	Sun Apr 20 22:35:48 2003
@@ -40,6 +40,7 @@ extern unsigned long *empty_zero_page;
 #define PTRS_PER_PMD	1
 #define PTRS_PER_PGD	1024
 #define USER_PTRS_PER_PGD	(TASK_SIZE/PGDIR_SIZE)
+#define USER_PTRS_PER_PMD(x)	(PTRS_PER_PMD)
 #define FIRST_USER_PGD_NR       0
 
 #define pte_ERROR(e) \
diff -urpN -X /home/fletch/.diff.exclude 640-per_node_idt/include/asm-x86_64/pgtable.h 650-banana_split/include/asm-x86_64/pgtable.h
--- 640-per_node_idt/include/asm-x86_64/pgtable.h	Tue Apr  8 14:38:21 2003
+++ 650-banana_split/include/asm-x86_64/pgtable.h	Sun Apr 20 22:35:48 2003
@@ -111,6 +111,7 @@ static inline void set_pml4(pml4_t *dst,
 #define PGDIR_MASK	(~(PGDIR_SIZE-1))
 
 #define USER_PTRS_PER_PGD	(TASK_SIZE/PGDIR_SIZE)
+#define USER_PTRS_PER_PMD(x)	(PTRS_PER_PMD)
 #define FIRST_USER_PGD_NR	0
 
 #define USER_PGD_PTRS (PAGE_OFFSET >> PGDIR_SHIFT)
diff -urpN -X /home/fletch/.diff.exclude 640-per_node_idt/mm/memory.c 650-banana_split/mm/memory.c
--- 640-per_node_idt/mm/memory.c	Sun Apr 20 21:24:43 2003
+++ 650-banana_split/mm/memory.c	Sun Apr 20 22:35:48 2003
@@ -99,9 +99,10 @@ static inline void free_one_pmd(struct m
 	pte_free_tlb(tlb, page);
 }
 
-static inline void free_one_pgd(struct mmu_gather *tlb, pgd_t * dir)
+static inline void free_one_pgd(struct mmu_gather *tlb, pgd_t * pgd, unsigned long pgdi)
 {
 	pmd_t * pmd, * md, * emd;
+	pgd_t *dir = pgd + pgdi;
 
 	if (pgd_none(*dir))
 		return;
@@ -125,7 +126,7 @@ static inline void free_one_pgd(struct m
 	 * found at 0x80000000 onwards.  The loop below compiles instead
 	 * to be terminated by unsigned address comparison using "jb".
 	 */
-	for (md = pmd, emd = pmd + PTRS_PER_PMD; md < emd; md++)
+	for (md = pmd, emd = pmd + USER_PTRS_PER_PMD(pgdi); md < emd; md++)
 		free_one_pmd(tlb,md);
 	pmd_free_tlb(tlb, pmd);
 }
@@ -139,11 +140,11 @@ static inline void free_one_pgd(struct m
 void clear_page_tables(struct mmu_gather *tlb, unsigned long first, int nr)
 {
 	pgd_t * page_dir = tlb->mm->pgd;
-
-	page_dir += first;
+	int index = first;
+	
 	do {
-		free_one_pgd(tlb, page_dir);
-		page_dir++;
+		free_one_pgd(tlb, page_dir, index);
+		index++;
 	} while (--nr);
 }
 

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] i386 uaccess to fixmap pages
  2003-05-09 16:38         ` Dave Hansen
  2003-05-09 15:55           ` Martin J. Bligh
@ 2003-05-09 16:48           ` Linus Torvalds
  2003-05-09 17:16             ` Dave Hansen
  1 sibling, 1 reply; 22+ messages in thread
From: Linus Torvalds @ 2003-05-09 16:48 UTC (permalink / raw)
  To: Dave Hansen; +Cc: Jamie Lokier, Roland McGrath, Andrew Morton, linux-kernel


On Fri, 9 May 2003, Dave Hansen wrote:
> 
> We've been playing with patches in the -mjb tree which make PAGE_OFFSET
> and TASK_SIZE move to some weird values.  I have it to the point where I
> could do a 3.875:0.125 user:kernel split.

That's still not "weird" in the current sense.

We've always (well, for a long time) been able to handle a TASK_SIZE that 
has a 111..00000 pattern - and in fact we used to _depend_ on that kind of 
pattern, because macros like "virt_to_phys()" were simple bitwise-and 
operations. There may be some code in the kernel that still depends on 
that kind of bitwise operation with TASK_SIZE.

Your 3.875:0.125 split still fits that pattern, and thus doesn't create 
any new cases.

In contrast, a TASK_SIZE of 0xc1000000 can no longer just "mask off" the 
kernel address bits. And _that_ is what I meant with "strange value".
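
A small sketch of the difference (schematic addresses; today's
virt_to_phys() subtracts PAGE_OFFSET rather than masking):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t vaddr = 0xc2345678u;	/* some kernel virtual address */

	/* classic 3:1 split: the offset is a 11..00 bit pattern, so
	   subtracting and masking give the same answer */
	uint32_t old = 0xc0000000u;
	printf("sub %08x  mask %08x\n", vaddr - old, vaddr & ~old);

	/* an "odd" offset such as 0xc1000000 breaks the equivalence */
	uint32_t odd = 0xc1000000u;
	printf("sub %08x  mask %08x\n", vaddr - odd, vaddr & ~odd);
	return 0;
}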

		Linus


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] i386 uaccess to fixmap pages
  2003-05-09 16:48           ` Linus Torvalds
@ 2003-05-09 17:16             ` Dave Hansen
  0 siblings, 0 replies; 22+ messages in thread
From: Dave Hansen @ 2003-05-09 17:16 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Jamie Lokier, Roland McGrath, Andrew Morton, linux-kernel

Linus Torvalds wrote:
> On Fri, 9 May 2003, Dave Hansen wrote:
> 
>>We've been playing with patches in the -mjb tree which make PAGE_OFFSET
>>and TASK_SIZE move to some weird values.  I have it to the point where I
>>could do a 3.875:0.125 user:kernel split.
> 
> That's still not "weird" in the current sense.
> 
> We've always (well, for a long time) been able to handle a TASK_SIZE that 
> has a 111..00000 pattern - and in fact we used to _depend_ on that kind of 
> pattern, because macros like "virt_to_phys()" were simple bitwise-and 
> operations. There may be some code in the kernel that still depends on 
> that kind of bitwise operation with TASK_SIZE.
> 
> Your 3.875:0.125 split still fits that pattern, and thus doesn't create 
> any new cases.
> 
> In contrast, a TASK_SIZE of 0xc1000000 can no longer just "mask off" the 
> kernel address bits. And _that_ is what I meant with "strange value".

Ahhh, weird*er* :)

These patches at least address a small class of problems where it is
assumed that the kernel pagetable entries start at the beginning of a
pmd.  Anyway, 2.5.68-mjb3 boots with a 0xE1000000 PAGE_OFFSET with PAE
on a 4GB machine.  It does "just work", at least in Martin's tree.

3343MB HIGHMEM available.
368MB LOWMEM available.

elm3b82:~# cat /proc/meminfo
MemTotal:      3758468 kB
MemFree:       3728368 kB
Buffers:          1696 kB
Cached:           8184 kB
SwapCached:          0 kB
Active:           7640 kB
Inactive:         4936 kB
HighTotal:     3424228 kB
HighFree:      3412224 kB
LowTotal:       334240 kB
LowFree:        316144 kB
SwapTotal:           0 kB
SwapFree:            0 kB
Dirty:             304 kB
Writeback:           0 kB
Mapped:           5900 kB
Slab:             6292 kB
Committed_AS:     7024 kB
PageTables:        472 kB
VmallocTotal:   114680 kB
VmallocUsed:      2400 kB
VmallocChunk:   112280 kB

-- 
Dave Hansen
haveblue@us.ibm.com


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] i386 uaccess to fixmap pages
  2003-05-09 15:55           ` Martin J. Bligh
@ 2003-05-09 18:20             ` William Lee Irwin III
  0 siblings, 0 replies; 22+ messages in thread
From: William Lee Irwin III @ 2003-05-09 18:20 UTC (permalink / raw)
  To: Martin J. Bligh
  Cc: Dave Hansen, Linus Torvalds, Jamie Lokier, Roland McGrath,
	Andrew Morton, linux-kernel

At some point in the past, dhansen wrote:
>> Don't anyone go applying these yet, though.  I think there has been a
>> bugfix or two since Martin released 2.5.68-mjb1, where these came from.
>>  So, consider them just an example for now.

On Fri, May 09, 2003 at 08:55:00AM -0700, Martin J. Bligh wrote:
> Here's the latest (fixed) sequence of patches, which seems to work pretty 
> happily. Might need some merging to get them to go against mainline, but 
> nothing major.

The patch needs to do some kind of work in kernel_pmd_ctor(), introduce
a kernel_pmd_dtor(), and do things analogous to what the
re-slabification patches are doing for the non-PAE case, in order to
fix the bugs where pageattr.c doesn't get a chance to clean up cached
pgds.  It also needs to add a pmd invalidation loop to set_pmd_pte()
in pageattr.c.


-- wli

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] i386 uaccess to fixmap pages
  2003-05-09 16:06       ` Linus Torvalds
  2003-05-09 16:38         ` Dave Hansen
@ 2003-05-10  3:26         ` H. Peter Anvin
  2003-05-10 15:31           ` Jamie Lokier
  1 sibling, 1 reply; 22+ messages in thread
From: H. Peter Anvin @ 2003-05-10  3:26 UTC (permalink / raw)
  To: linux-kernel

Followup to:  <Pine.LNX.4.44.0305090856500.9705-100000@home.transmeta.com>
By author:    Linus Torvalds <torvalds@transmeta.com>
In newsgroup: linux.dev.kernel
> 
> It actually does have some cost in that form, namely the fact that the
> kernel 1:1 mapping needs to be 4MB-aligned in order to take advantage of
> large-pte support. So we'd have to move the kernel to something like
> 0xc0400000 (and preferably higher, to make sure there is a nice hole in
> between - say 0xc1000000), which in turn has a cost of verifying that 
> nothing assumes the current lay-out (we've had the 1/2/3GB TASK_SIZE 
> patches floating around, but they've never had "odd sizes").
> 
> There's another cost, which is that right now we share the pgd with the 
> kernel fixmaps, and this would mean that we'd have a new one. That's just 
> a single page, though.
> 
> But it might "just work", and it would be interesting to see what the
> patch would look like. Hint hint.
> 

Another option would be to put it in the "user" part of the address
space at 0xbffff000 (and move the %esp base value.)  That would have
the nice side benefit that stuff like UML, or whatever else wants to
map something over the vsyscall, could do so.  Downside: each process
needs a PTE for this.

	-hpa
-- 
<hpa@transmeta.com> at work, <hpa@zytor.com> in private!
"Unix gives you enough rope to shoot yourself in the foot."
Architectures needed: ia64 m68k mips64 ppc ppc64 s390 s390x sh v850 x86-64

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] i386 uaccess to fixmap pages
  2003-05-10  3:26         ` H. Peter Anvin
@ 2003-05-10 15:31           ` Jamie Lokier
  2003-05-10 17:23             ` Ulrich Drepper
  2003-05-10 19:23             ` H. Peter Anvin
  0 siblings, 2 replies; 22+ messages in thread
From: Jamie Lokier @ 2003-05-10 15:31 UTC (permalink / raw)
  To: H. Peter Anvin; +Cc: linux-kernel

H. Peter Anvin wrote:
> Another option would be to put it in the "user" part of the address
> space at 0xbffff000 (and move the %esp base value.)  That would have
> the nice side benefit that stuff like UML or whatever who wanted to
> map something over the vsyscall could do so.  Downside: each process
> needs a PTE for this.

It doesn't need a PTE.  The vsyscall code could be _copied_ to the end
of the page at 0xbffff000, with the stack immediately preceding it.

In fact that's one fewer TLB entry than the current arrangement.  The
fork-time copy should be negligible as it is a very small code
fragment.

I suspect this would cause difficulties with the latest ELF and unwind
tables, but apart from that, why _not_ place the vsyscall trampoline
at the end of the stack?

-- Jamie

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] i386 uaccess to fixmap pages
  2003-05-10 15:31           ` Jamie Lokier
@ 2003-05-10 17:23             ` Ulrich Drepper
  2003-05-10 19:23             ` H. Peter Anvin
  1 sibling, 0 replies; 22+ messages in thread
From: Ulrich Drepper @ 2003-05-10 17:23 UTC (permalink / raw)
  To: Jamie Lokier; +Cc: H. Peter Anvin, linux-kernel

-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1

Jamie Lokier wrote:

> I suspect this would cause difficulties with the latest ELF and unwind
> tables, but apart from that, why _not_ place the vsyscall trampoline
> at the end of the stack?

The unwind information is in a position-independent form.

- -- 
- --------------.                        ,-.            444 Castro Street
Ulrich Drepper \    ,-----------------'   \ Mountain View, CA 94041 USA
Red Hat         `--' drepper at redhat.com `---------------------------
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.2.1 (GNU/Linux)

iD8DBQE+vTV/2ijCOnn/RHQRAvXPAKCaq5W34wOVwh3sMBw8fqlcQ1TuvgCfQZkQ
EoyJ6njw/JKg2rTbcAScmkU=
=ao0g
-----END PGP SIGNATURE-----


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] i386 uaccess to fixmap pages
  2003-05-10 15:31           ` Jamie Lokier
  2003-05-10 17:23             ` Ulrich Drepper
@ 2003-05-10 19:23             ` H. Peter Anvin
  2003-05-10 20:52               ` Jamie Lokier
  1 sibling, 1 reply; 22+ messages in thread
From: H. Peter Anvin @ 2003-05-10 19:23 UTC (permalink / raw)
  To: Jamie Lokier; +Cc: linux-kernel

Jamie Lokier wrote:
> 
> It doesn't need a PTE.  The vsyscall code could be _copied_ to the end
> of the page at 0xbffff000, with the stack immediately preceding it.
> 

You don't really want that, though.  If you're doing gettimeofday() in 
user space, it's critically important that the kernel can modify all 
these pages at once.

	-hpa


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] i386 uaccess to fixmap pages
  2003-05-10 19:23             ` H. Peter Anvin
@ 2003-05-10 20:52               ` Jamie Lokier
  0 siblings, 0 replies; 22+ messages in thread
From: Jamie Lokier @ 2003-05-10 20:52 UTC (permalink / raw)
  To: H. Peter Anvin; +Cc: linux-kernel

H. Peter Anvin wrote:
> >It doesn't need a PTE.  The vsyscall code could be _copied_ to the end
> >of the page at 0xbffff000, with the stack immediately preceding it.
> >
> 
> You don't really want that, though.  If you're doing gettimeofday() in 
> user space it's critically important that the kernel can modify all 
> these pages at once.

I think gettimeofday() is exceptional, and other system call
accelerators actually want per-task data.  For example, sigprocmask()
can be done in user space, but that needs a per-task mask.

For gettimeofday(), the kernel can (a) update the word in the mm of
currently running tasks on timer interrupts, (b) update the word at
context switch time, and then only when switching mm.  Ok, I'll admit
that is looking a bit messed up, and there are possible SMP
synchronisation problems too.

How about this: gettimeofday() in user space simply does rdtsc, and if
the number of clocks since the last jiffy update is above a threshold,
it does a syscall.  If rdtsc is not available, it always does the
syscall.

Per-mm vsyscall data is appropriate for that, and no globally shared
page is required.  (Think of the NUMA! :)
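
A rough sketch of the scheme (the per-mm variables here are
hypothetical stand-ins for something the kernel would have to keep
updated, and the SMP consistency issues above are ignored):

#include <stdint.h>
#include <sys/time.h>

/* hypothetical per-mm data, kept up to date by the kernel */
volatile uint64_t last_jiffy_tsc;	/* TSC value at the last tick */
uint64_t tsc_per_jiffy;			/* clocks per tick: the threshold */
struct timeval last_tv;			/* wall time at the last tick */

static inline uint64_t rdtsc(void)
{
	uint32_t lo, hi;
	asm volatile("rdtsc" : "=a" (lo), "=d" (hi));
	return ((uint64_t)hi << 32) | lo;
}

int fast_gettimeofday(struct timeval *tv)
{
	uint64_t now = rdtsc();

	if (now - last_jiffy_tsc < tsc_per_jiffy) {
		*tv = last_tv;		/* tick-resolution answer */
		return 0;
	}
	return gettimeofday(tv, NULL);	/* too stale: real syscall */
}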

Granted, sharing the vsyscall and stack ptes is not great if the
vsyscall code expands considerably.  As it is now, it's a very small
bit of code which costs nothing to copy.

-- Jamie

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] i386 uaccess to fixmap pages
  2003-05-09 17:28 Chuck Ebbert
@ 2003-05-09 17:49 ` Linus Torvalds
  0 siblings, 0 replies; 22+ messages in thread
From: Linus Torvalds @ 2003-05-09 17:49 UTC (permalink / raw)
  To: Chuck Ebbert
  Cc: Dave Hansen, Jamie Lokier, linux-kernel, Andrew Morton, Roland McGrath


On Fri, 9 May 2003, Chuck Ebbert wrote:
> 
>  /arch/i386/kernel/doublefault.c:
> 
> #define ptr_ok(x) ((x) > 0xc0000000 && (x) < 0xc1000000)

That's just wrong _regardless_. Bad Linus. I was lazy when doing the 
double-fault verification.
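
A layout-independent check might look something like this (untested
sketch; PAGE_OFFSET and high_memory are the usual kernel symbols):

/* accept only pointers into the kernel direct mapping */
#define ptr_ok(x)						\
	((unsigned long)(x) >= PAGE_OFFSET &&			\
	 (unsigned long)(x) < (unsigned long)high_memory)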

		Linus


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] i386 uaccess to fixmap pages
@ 2003-05-09 17:28 Chuck Ebbert
  2003-05-09 17:49 ` Linus Torvalds
  0 siblings, 1 reply; 22+ messages in thread
From: Chuck Ebbert @ 2003-05-09 17:28 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Dave Hansen, Jamie Lokier, linux-kernel, Andrew Morton, Roland McGrath

Linus Torvalds wrote:

> So we'd have to move the kernel to something like
> 0xc0400000 (and preferably higher, to make sure there is a nice hole in
> between - say 0xc1000000), which in turn has a cost of verifying that 
> nothing assumes the current lay-out (we've had the 1/2/3GB TASK_SIZE 
> patches floating around, but they've never had "odd sizes").


 /arch/i386/kernel/doublefault.c:

#define ptr_ok(x) ((x) > 0xc0000000 && (x) < 0xc1000000)

^ permalink raw reply	[flat|nested] 22+ messages in thread

end of thread, other threads:[~2003-05-10 20:40 UTC | newest]

Thread overview: 22+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2003-05-09  2:03 [PATCH] i386 uaccess to fixmap pages Roland McGrath
2003-05-09  4:31 ` Andrew Morton
2003-05-09  8:55   ` Roland McGrath
2003-05-09  9:19     ` Andrew Morton
2003-05-09  9:40       ` Roland McGrath
2003-05-09 10:43       ` Roland McGrath
2003-05-09 11:42         ` Dave Jones
2003-05-09 11:55         ` Alan Cox
2003-05-09 12:40     ` Jamie Lokier
2003-05-09 16:06       ` Linus Torvalds
2003-05-09 16:38         ` Dave Hansen
2003-05-09 15:55           ` Martin J. Bligh
2003-05-09 18:20             ` William Lee Irwin III
2003-05-09 16:48           ` Linus Torvalds
2003-05-09 17:16             ` Dave Hansen
2003-05-10  3:26         ` H. Peter Anvin
2003-05-10 15:31           ` Jamie Lokier
2003-05-10 17:23             ` Ulrich Drepper
2003-05-10 19:23             ` H. Peter Anvin
2003-05-10 20:52               ` Jamie Lokier
2003-05-09 17:28 Chuck Ebbert
2003-05-09 17:49 ` Linus Torvalds

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).