linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 2.6.21 review I] [1/25] x86_64: Add __copy_from_user_nocache
@ 2007-02-10 11:50 Andi Kleen
  2007-02-10 11:50 ` [PATCH 2.6.21 review I] [2/25] x86_64: Make the NUMA hash function nodemap allocation Andi Kleen
                   ` (23 more replies)
  0 siblings, 24 replies; 43+ messages in thread
From: Andi Kleen @ 2007-02-10 11:50 UTC (permalink / raw)
  To: patches, linux-kernel


This does user copies in fs write() into the page cache with write combining.
This pushes the destination out of the CPU's cache, but allows higher bandwidth
in some case.

The theory is that the page cache data is usually not touched by the 
CPU again and it's better to not pollute the cache with it. Also it is a little
faster.

Signed-off-by: Andi Kleen <ak@suse.de>

---
 arch/x86_64/kernel/x8664_ksyms.c    |    1 
 arch/x86_64/lib/Makefile            |    2 
 arch/x86_64/lib/copy_user_nocache.S |  217 ++++++++++++++++++++++++++++++++++++
 include/asm-x86_64/uaccess.h        |   14 ++
 4 files changed, 233 insertions(+), 1 deletion(-)

Index: linux/arch/x86_64/lib/Makefile
===================================================================
--- linux.orig/arch/x86_64/lib/Makefile
+++ linux/arch/x86_64/lib/Makefile
@@ -9,4 +9,4 @@ obj-y := io.o iomap_copy.o
 lib-y := csum-partial.o csum-copy.o csum-wrappers.o delay.o \
 	usercopy.o getuser.o putuser.o  \
 	thunk.o clear_page.o copy_page.o bitstr.o bitops.o
-lib-y += memcpy.o memmove.o memset.o copy_user.o rwlock.o
+lib-y += memcpy.o memmove.o memset.o copy_user.o rwlock.o copy_user_nocache.o
Index: linux/arch/x86_64/lib/copy_user_nocache.S
===================================================================
--- /dev/null
+++ linux/arch/x86_64/lib/copy_user_nocache.S
@@ -0,0 +1,217 @@
+/* Copyright 2002 Andi Kleen, SuSE Labs.
+ * Subject to the GNU Public License v2.
+ *
+ * Functions to copy from and to user space.
+ */
+
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+
+#define FIX_ALIGNMENT 1
+
+#include <asm/current.h>
+#include <asm/asm-offsets.h>
+#include <asm/thread_info.h>
+#include <asm/cpufeature.h>
+
+/*
+ * copy_user_nocache - Uncached memory copy with exception handling
+ * This will force destination/source out of cache for more performance.
+ *
+ * Input:
+ * rdi destination
+ * rsi source
+ * rdx count
+ * rcx zero flag	when 1 zero on exception
+ *
+ * Output:
+ * eax uncopied bytes or 0 if successful.
+ */
+ENTRY(__copy_user_nocache)
+	CFI_STARTPROC
+	pushq %rbx
+	CFI_ADJUST_CFA_OFFSET 8
+	CFI_REL_OFFSET rbx, 0
+	pushq %rcx		/* save zero flag */
+	CFI_ADJUST_CFA_OFFSET 8
+	CFI_REL_OFFSET rcx, 0
+
+	xorl %eax,%eax		/* zero for the exception handler */
+
+#ifdef FIX_ALIGNMENT
+	/* check for bad alignment of destination */
+	movl %edi,%ecx
+	andl $7,%ecx
+	jnz  .Lbad_alignment
+.Lafter_bad_alignment:
+#endif
+
+	movq %rdx,%rcx
+
+	movl $64,%ebx
+	shrq $6,%rdx
+	decq %rdx
+	js   .Lhandle_tail
+
+	.p2align 4
+.Lloop:
+.Ls1:	movq (%rsi),%r11
+.Ls2:	movq 1*8(%rsi),%r8
+.Ls3:	movq 2*8(%rsi),%r9
+.Ls4:	movq 3*8(%rsi),%r10
+.Ld1:	movnti %r11,(%rdi)
+.Ld2:	movnti %r8,1*8(%rdi)
+.Ld3:	movnti %r9,2*8(%rdi)
+.Ld4:	movnti %r10,3*8(%rdi)
+
+.Ls5:	movq 4*8(%rsi),%r11
+.Ls6:	movq 5*8(%rsi),%r8
+.Ls7:	movq 6*8(%rsi),%r9
+.Ls8:	movq 7*8(%rsi),%r10
+.Ld5:	movnti %r11,4*8(%rdi)
+.Ld6:	movnti %r8,5*8(%rdi)
+.Ld7:	movnti %r9,6*8(%rdi)
+.Ld8:	movnti %r10,7*8(%rdi)
+
+	dec  %rdx
+
+	leaq 64(%rsi),%rsi
+	leaq 64(%rdi),%rdi
+
+	jns  .Lloop
+
+	.p2align 4
+.Lhandle_tail:
+	movl %ecx,%edx
+	andl $63,%ecx
+	shrl $3,%ecx
+	jz   .Lhandle_7
+	movl $8,%ebx
+	.p2align 4
+.Lloop_8:
+.Ls9:	movq (%rsi),%r8
+.Ld9:	movnti %r8,(%rdi)
+	decl %ecx
+	leaq 8(%rdi),%rdi
+	leaq 8(%rsi),%rsi
+	jnz .Lloop_8
+
+.Lhandle_7:
+	movl %edx,%ecx
+	andl $7,%ecx
+	jz   .Lende
+	.p2align 4
+.Lloop_1:
+.Ls10:	movb (%rsi),%bl
+.Ld10:	movb %bl,(%rdi)
+	incq %rdi
+	incq %rsi
+	decl %ecx
+	jnz .Lloop_1
+
+	CFI_REMEMBER_STATE
+.Lende:
+	popq %rcx
+	CFI_ADJUST_CFA_OFFSET -8
+	CFI_RESTORE %rcx
+	popq %rbx
+	CFI_ADJUST_CFA_OFFSET -8
+	CFI_RESTORE rbx
+	ret
+	CFI_RESTORE_STATE
+
+#ifdef FIX_ALIGNMENT
+	/* align destination */
+	.p2align 4
+.Lbad_alignment:
+	movl $8,%r9d
+	subl %ecx,%r9d
+	movl %r9d,%ecx
+	cmpq %r9,%rdx
+	jz   .Lhandle_7
+	js   .Lhandle_7
+.Lalign_1:
+.Ls11:	movb (%rsi),%bl
+.Ld11:	movb %bl,(%rdi)
+	incq %rsi
+	incq %rdi
+	decl %ecx
+	jnz .Lalign_1
+	subq %r9,%rdx
+	jmp .Lafter_bad_alignment
+#endif
+
+	/* table sorted by exception address */
+	.section __ex_table,"a"
+	.align 8
+	.quad .Ls1,.Ls1e
+	.quad .Ls2,.Ls2e
+	.quad .Ls3,.Ls3e
+	.quad .Ls4,.Ls4e
+	.quad .Ld1,.Ls1e
+	.quad .Ld2,.Ls2e
+	.quad .Ld3,.Ls3e
+	.quad .Ld4,.Ls4e
+	.quad .Ls5,.Ls5e
+	.quad .Ls6,.Ls6e
+	.quad .Ls7,.Ls7e
+	.quad .Ls8,.Ls8e
+	.quad .Ld5,.Ls5e
+	.quad .Ld6,.Ls6e
+	.quad .Ld7,.Ls7e
+	.quad .Ld8,.Ls8e
+	.quad .Ls9,.Le_quad
+	.quad .Ld9,.Le_quad
+	.quad .Ls10,.Le_byte
+	.quad .Ld10,.Le_byte
+#ifdef FIX_ALIGNMENT
+	.quad .Ls11,.Lzero_rest
+	.quad .Ld11,.Lzero_rest
+#endif
+	.quad .Le5,.Le_zero
+	.previous
+
+	/* compute 64-offset for main loop. 8 bytes accuracy with error on the
+	   pessimistic side. this is gross. it would be better to fix the
+	   interface. */
+	/* eax: zero, ebx: 64 */
+.Ls1e: 	addl $8,%eax
+.Ls2e: 	addl $8,%eax
+.Ls3e: 	addl $8,%eax
+.Ls4e: 	addl $8,%eax
+.Ls5e: 	addl $8,%eax
+.Ls6e: 	addl $8,%eax
+.Ls7e: 	addl $8,%eax
+.Ls8e: 	addl $8,%eax
+	addq %rbx,%rdi	/* +64 */
+	subq %rax,%rdi  /* correct destination with computed offset */
+
+	shlq $6,%rdx	/* loop counter * 64 (stride length) */
+	addq %rax,%rdx	/* add offset to loopcnt */
+	andl $63,%ecx	/* remaining bytes */
+	addq %rcx,%rdx	/* add them */
+	jmp .Lzero_rest
+
+	/* exception on quad word loop in tail handling */
+	/* ecx:	loopcnt/8, %edx: length, rdi: correct */
+.Le_quad:
+	shll $3,%ecx
+	andl $7,%edx
+	addl %ecx,%edx
+	/* edx: bytes to zero, rdi: dest, eax:zero */
+.Lzero_rest:
+	cmpl $0,(%rsp)	/* zero flag set? */
+	jz   .Le_zero
+	movq %rdx,%rcx
+.Le_byte:
+	xorl %eax,%eax
+.Le5:	rep
+	stosb
+	/* when there is another exception while zeroing the rest just return */
+.Le_zero:
+	movq %rdx,%rax
+	jmp .Lende
+	CFI_ENDPROC
+ENDPROC(__copy_user_nocache)
+
+
Index: linux/include/asm-x86_64/uaccess.h
===================================================================
--- linux.orig/include/asm-x86_64/uaccess.h
+++ linux/include/asm-x86_64/uaccess.h
@@ -367,4 +367,18 @@ __copy_to_user_inatomic(void __user *dst
 	return copy_user_generic((__force void *)dst, src, size);
 }
 
+#define ARCH_HAS_NOCACHE_UACCESS 1
+extern long __copy_user_nocache(void *dst, const void __user *src, unsigned size, int zerorest);
+
+static inline int __copy_from_user_nocache(void *dst, const void __user *src, unsigned size)
+{
+	might_sleep();
+	return __copy_user_nocache(dst, (__force void *)src, size, 1);
+}
+
+static inline int __copy_from_user_inatomic_nocache(void *dst, const void __user *src, unsigned size)
+{
+	return __copy_user_nocache(dst, (__force void *)src, size, 0);
+}
+
 #endif /* __X86_64_UACCESS_H */
Index: linux/arch/x86_64/kernel/x8664_ksyms.c
===================================================================
--- linux.orig/arch/x86_64/kernel/x8664_ksyms.c
+++ linux/arch/x86_64/kernel/x8664_ksyms.c
@@ -26,6 +26,7 @@ EXPORT_SYMBOL(__put_user_4);
 EXPORT_SYMBOL(__put_user_8);
 
 EXPORT_SYMBOL(copy_user_generic);
+EXPORT_SYMBOL(__copy_user_nocache);
 EXPORT_SYMBOL(copy_from_user);
 EXPORT_SYMBOL(copy_to_user);
 EXPORT_SYMBOL(__copy_from_user_inatomic);

^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCH 2.6.21 review I] [2/25] x86_64: Make the NUMA hash function nodemap allocation
  2007-02-10 11:50 [PATCH 2.6.21 review I] [1/25] x86_64: Add __copy_from_user_nocache Andi Kleen
@ 2007-02-10 11:50 ` Andi Kleen
  2007-02-10 11:50 ` [PATCH 2.6.21 review I] [3/25] i386: Convert i386 PDA code to use %fs Andi Kleen
                   ` (22 subsequent siblings)
  23 siblings, 0 replies; 43+ messages in thread
From: Andi Kleen @ 2007-02-10 11:50 UTC (permalink / raw)
  To: Amul Shah, Andi Kleen, Rohit Seth, patches, linux-kernel


From: Amul Shah <amul.shah@unisys.com>
Remove the statically allocated memory to NUMA node hash map in favor of a
dynamically allocated memory to node hash map (it is cache aligned).

This patch has the nice side effect in that it allows the hash map to grow
for systems with large amounts of memory (256GB - 1TB), but suffer from
having small PCI space tacked onto the boot node (which is somewhere
between 192MB to 512MB on the ES7000).

Signed-off-by: Amul Shah <amul.shah@unisys.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Andi Kleen <ak@suse.de>
Cc: Rohit Seth <rohitseth@google.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
---
Updated patch to fix a bug that Andi Kleen found for platforms that
don't support NUMA (or "numa=off").

---
 arch/x86_64/kernel/e820.c   |    7 ++++
 arch/x86_64/kernel/setup.c  |    5 ++
 arch/x86_64/mm/numa.c       |   74 ++++++++++++++++++++++++++++++++++++++------
 include/asm-x86_64/e820.h   |    1 
 include/asm-x86_64/mmzone.h |   13 ++++---
 5 files changed, 85 insertions(+), 15 deletions(-)

Index: linux/arch/x86_64/kernel/e820.c
===================================================================
--- linux.orig/arch/x86_64/kernel/e820.c
+++ linux/arch/x86_64/kernel/e820.c
@@ -83,6 +83,13 @@ static inline int bad_addr(unsigned long
 		return 1;
 	}
 
+#ifdef CONFIG_NUMA
+	/* NUMA memory to node map */
+	if (last >= nodemap_addr && addr < nodemap_addr + nodemap_size) {
+		*addrp = nodemap_addr + nodemap_size;
+		return 1;
+	}
+#endif
 	/* XXX ramdisk image here? */ 
 	return 0;
 } 
Index: linux/arch/x86_64/kernel/setup.c
===================================================================
--- linux.orig/arch/x86_64/kernel/setup.c
+++ linux/arch/x86_64/kernel/setup.c
@@ -444,6 +444,11 @@ void __init setup_arch(char **cmdline_p)
 	/* reserve ebda region */
 	if (ebda_addr)
 		reserve_bootmem_generic(ebda_addr, ebda_size);
+#ifdef CONFIG_NUMA
+	/* reserve nodemap region */
+	if (nodemap_addr)
+		reserve_bootmem_generic(nodemap_addr, nodemap_size);
+#endif
 
 #ifdef CONFIG_SMP
 	/*
Index: linux/arch/x86_64/mm/numa.c
===================================================================
--- linux.orig/arch/x86_64/mm/numa.c
+++ linux/arch/x86_64/mm/numa.c
@@ -36,6 +36,8 @@ unsigned char apicid_to_node[MAX_LOCAL_A
 cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly;
 
 int numa_off __initdata;
+unsigned long __initdata nodemap_addr;
+unsigned long __initdata nodemap_size;
 
 
 /*
@@ -52,34 +54,87 @@ populate_memnodemap(const struct bootnod
 	int res = -1;
 	unsigned long addr, end;
 
-	if (shift >= 64)
-		return -1;
-	memset(memnodemap, 0xff, sizeof(memnodemap));
+	memset(memnodemap, 0xff, memnodemapsize);
 	for (i = 0; i < numnodes; i++) {
 		addr = nodes[i].start;
 		end = nodes[i].end;
 		if (addr >= end)
 			continue;
-		if ((end >> shift) >= NODEMAPSIZE)
+		if ((end >> shift) >= memnodemapsize)
 			return 0;
 		do {
 			if (memnodemap[addr >> shift] != 0xff)
 				return -1;
 			memnodemap[addr >> shift] = i;
-                       addr += (1UL << shift);
+			addr += (1UL << shift);
 		} while (addr < end);
 		res = 1;
 	} 
 	return res;
 }
 
-int __init compute_hash_shift(struct bootnode *nodes, int numnodes)
+static int __init allocate_cachealigned_memnodemap(void)
+{
+	unsigned long pad, pad_addr;
+
+	memnodemap = memnode.embedded_map;
+	if (memnodemapsize <= 48) {
+		printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n",
+		       nodemap_addr, nodemap_addr + nodemap_size);
+		return 0;
+	}
+
+	pad = L1_CACHE_BYTES - 1;
+	pad_addr = 0x8000;
+	nodemap_size = pad + memnodemapsize;
+	nodemap_addr = find_e820_area(pad_addr, end_pfn<<PAGE_SHIFT,
+				      nodemap_size);
+	if (nodemap_addr == -1UL) {
+		printk(KERN_ERR
+		       "NUMA: Unable to allocate Memory to Node hash map\n");
+		nodemap_addr = nodemap_size = 0;
+		return -1;
+	}
+	pad_addr = (nodemap_addr + pad) & ~pad;
+	memnodemap = phys_to_virt(pad_addr);
+
+	printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n",
+	       nodemap_addr, nodemap_addr + nodemap_size);
+	return 0;
+}
+
+/*
+ * The LSB of all start and end addresses in the node map is the value of the
+ * maximum possible shift.
+ */
+static int __init
+extract_lsb_from_nodes (const struct bootnode *nodes, int numnodes)
 {
-	int shift = 20;
+	int i;
+	unsigned long start, end;
+	unsigned long bitfield = 0, memtop = 0;
 
-	while (populate_memnodemap(nodes, numnodes, shift + 1) >= 0)
-		shift++;
+	for (i = 0; i < numnodes; i++) {
+		start = nodes[i].start;
+		end = nodes[i].end;
+		if (start >= end)
+			continue;
+		bitfield |= start | end;
+		if (end > memtop)
+			memtop = end;
+	}
+	i = find_first_bit(&bitfield, sizeof(unsigned long)*8);
+	memnodemapsize = (memtop >> i)+1;
+	return i;
+}
+
+int __init compute_hash_shift(struct bootnode *nodes, int numnodes)
+{
+	int shift;
 
+	shift = extract_lsb_from_nodes(nodes, numnodes);
+	if (allocate_cachealigned_memnodemap())
+		return -1;
 	printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n",
 		shift);
 
@@ -290,6 +345,7 @@ void __init numa_initmem_init(unsigned l
 	       end_pfn << PAGE_SHIFT); 
 		/* setup dummy node covering all memory */ 
 	memnode_shift = 63; 
+	memnodemap = memnode.embedded_map;
 	memnodemap[0] = 0;
 	nodes_clear(node_online_map);
 	node_set_online(0);
Index: linux/include/asm-x86_64/e820.h
===================================================================
--- linux.orig/include/asm-x86_64/e820.h
+++ linux/include/asm-x86_64/e820.h
@@ -56,6 +56,7 @@ extern void finish_e820_parsing(void);
 extern struct e820map e820;
 
 extern unsigned ebda_addr, ebda_size;
+extern unsigned long nodemap_addr, nodemap_size;
 #endif/*!__ASSEMBLY__*/
 
 #endif/*__E820_HEADER*/
Index: linux/include/asm-x86_64/mmzone.h
===================================================================
--- linux.orig/include/asm-x86_64/mmzone.h
+++ linux/include/asm-x86_64/mmzone.h
@@ -11,24 +11,25 @@
 
 #include <asm/smp.h>
 
-/* Should really switch to dynamic allocation at some point */
-#define NODEMAPSIZE 0x4fff
-
 /* Simple perfect hash to map physical addresses to node numbers */
 struct memnode {
 	int shift;
-	u8 map[NODEMAPSIZE];
-} ____cacheline_aligned;
+	unsigned int mapsize;
+	u8 *map;
+	u8 embedded_map[64-16];
+} ____cacheline_aligned; /* total size = 64 bytes */
 extern struct memnode memnode;
 #define memnode_shift memnode.shift
 #define memnodemap memnode.map
+#define memnodemapsize memnode.mapsize
 
 extern struct pglist_data *node_data[];
 
 static inline __attribute__((pure)) int phys_to_nid(unsigned long addr) 
 { 
 	unsigned nid; 
-	VIRTUAL_BUG_ON((addr >> memnode_shift) >= NODEMAPSIZE);
+	VIRTUAL_BUG_ON(!memnodemap);
+	VIRTUAL_BUG_ON((addr >> memnode_shift) >= memnodemapsize);
 	nid = memnodemap[addr >> memnode_shift]; 
 	VIRTUAL_BUG_ON(nid >= MAX_NUMNODES || !node_data[nid]); 
 	return nid; 

^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCH 2.6.21 review I] [3/25] i386: Convert i386 PDA code to use %fs
  2007-02-10 11:50 [PATCH 2.6.21 review I] [1/25] x86_64: Add __copy_from_user_nocache Andi Kleen
  2007-02-10 11:50 ` [PATCH 2.6.21 review I] [2/25] x86_64: Make the NUMA hash function nodemap allocation Andi Kleen
@ 2007-02-10 11:50 ` Andi Kleen
  2007-02-10 11:50 ` [PATCH 2.6.21 review I] [4/25] x86: kernel-mode faults pollute current->thead Andi Kleen
                   ` (21 subsequent siblings)
  23 siblings, 0 replies; 43+ messages in thread
From: Andi Kleen @ 2007-02-10 11:50 UTC (permalink / raw)
  To: Jeremy Fitzhardinge, Ian Campbell, Andi Kleen, Eric Dumazet,
	patches, linux-kernel


From: Jeremy Fitzhardinge <jeremy@goop.org>

Convert the PDA code to use %fs rather than %gs as the segment for
per-processor data.  This is because some processors show a small but
measurable performance gain for reloading a NULL segment selector (as %fs
generally is in user-space) versus a non-NULL one (as %gs generally is).

On modern processors the difference is very small, perhaps undetectable. 
Some old AMD "K6 3D+" processors are noticably slower when %fs is used
rather than %gs; I have no idea why this might be, but I think they're
sufficiently rare that it doesn't matter much.

This patch also fixes the math emulator, which had not been adjusted to
match the changed struct pt_regs.

[frederik.deweerdt@gmail.com: fixit with gdb]
[mingo@elte.hu: Fix KVM too]

Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Ian Campbell <Ian.Campbell@XenSource.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Andi Kleen <andi@muc.de>
Acked-by: Zachary Amsden <zach@vmware.com>
Cc: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: Frederik Deweerdt <frederik.deweerdt@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
---

 arch/i386/kernel/asm-offsets.c   |    2 +-
 arch/i386/kernel/cpu/common.c    |   14 +++++++-------
 arch/i386/kernel/entry.S         |   32 ++++++++++++++++----------------
 arch/i386/kernel/head.S          |    6 +++---
 arch/i386/kernel/kprobes.c       |    4 ++--
 arch/i386/kernel/process.c       |   24 +++++++++++-------------
 arch/i386/kernel/ptrace.c        |   16 ++++++++--------
 arch/i386/kernel/signal.c        |   10 +++++-----
 arch/i386/kernel/traps.c         |    7 ++++---
 arch/i386/kernel/vm86.c          |   33 +++++++++++++++++----------------
 arch/i386/math-emu/get_address.c |   14 +++++---------
 drivers/kvm/vmx.c                |   12 ++++++------
 include/asm-i386/elf.h           |    4 ++--
 include/asm-i386/mmu_context.h   |    2 +-
 include/asm-i386/pda.h           |   12 ++++++------
 include/asm-i386/processor.h     |    6 +++---
 include/asm-i386/ptrace.h        |    4 ++--
 17 files changed, 99 insertions(+), 103 deletions(-)

Index: linux/arch/i386/kernel/asm-offsets.c
===================================================================
--- linux.orig/arch/i386/kernel/asm-offsets.c
+++ linux/arch/i386/kernel/asm-offsets.c
@@ -72,7 +72,7 @@ void foo(void)
 	OFFSET(PT_EAX, pt_regs, eax);
 	OFFSET(PT_DS,  pt_regs, xds);
 	OFFSET(PT_ES,  pt_regs, xes);
-	OFFSET(PT_GS,  pt_regs, xgs);
+	OFFSET(PT_FS,  pt_regs, xfs);
 	OFFSET(PT_ORIG_EAX, pt_regs, orig_eax);
 	OFFSET(PT_EIP, pt_regs, eip);
 	OFFSET(PT_CS,  pt_regs, xcs);
Index: linux/arch/i386/kernel/cpu/common.c
===================================================================
--- linux.orig/arch/i386/kernel/cpu/common.c
+++ linux/arch/i386/kernel/cpu/common.c
@@ -605,7 +605,7 @@ void __init early_cpu_init(void)
 struct pt_regs * __devinit idle_regs(struct pt_regs *regs)
 {
 	memset(regs, 0, sizeof(struct pt_regs));
-	regs->xgs = __KERNEL_PDA;
+	regs->xfs = __KERNEL_PDA;
 	return regs;
 }
 
@@ -662,12 +662,12 @@ struct i386_pda boot_pda = {
 	.pcurrent = &init_task,
 };
 
-static inline void set_kernel_gs(void)
+static inline void set_kernel_fs(void)
 {
-	/* Set %gs for this CPU's PDA.  Memory clobber is to create a
+	/* Set %fs for this CPU's PDA.  Memory clobber is to create a
 	   barrier with respect to any PDA operations, so the compiler
 	   doesn't move any before here. */
-	asm volatile ("mov %0, %%gs" : : "r" (__KERNEL_PDA) : "memory");
+	asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_PDA) : "memory");
 }
 
 /* Initialize the CPU's GDT and PDA.  The boot CPU does this for
@@ -718,7 +718,7 @@ void __cpuinit cpu_set_gdt(int cpu)
 	   the boot CPU, this will transition from the boot gdt+pda to
 	   the real ones). */
 	load_gdt(cpu_gdt_descr);
-	set_kernel_gs();
+	set_kernel_fs();
 }
 
 /* Common CPU init for both boot and secondary CPUs */
@@ -764,8 +764,8 @@ static void __cpuinit _cpu_init(int cpu,
 	__set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss);
 #endif
 
-	/* Clear %fs. */
-	asm volatile ("mov %0, %%fs" : : "r" (0));
+	/* Clear %gs. */
+	asm volatile ("mov %0, %%gs" : : "r" (0));
 
 	/* Clear all 6 debug registers: */
 	set_debugreg(0, 0);
Index: linux/arch/i386/kernel/entry.S
===================================================================
--- linux.orig/arch/i386/kernel/entry.S
+++ linux/arch/i386/kernel/entry.S
@@ -30,7 +30,7 @@
  *	18(%esp) - %eax
  *	1C(%esp) - %ds
  *	20(%esp) - %es
- *	24(%esp) - %gs
+ *	24(%esp) - %fs
  *	28(%esp) - orig_eax
  *	2C(%esp) - %eip
  *	30(%esp) - %cs
@@ -99,9 +99,9 @@ VM_MASK		= 0x00020000
 
 #define SAVE_ALL \
 	cld; \
-	pushl %gs; \
+	pushl %fs; \
 	CFI_ADJUST_CFA_OFFSET 4;\
-	/*CFI_REL_OFFSET gs, 0;*/\
+	/*CFI_REL_OFFSET fs, 0;*/\
 	pushl %es; \
 	CFI_ADJUST_CFA_OFFSET 4;\
 	/*CFI_REL_OFFSET es, 0;*/\
@@ -133,7 +133,7 @@ VM_MASK		= 0x00020000
 	movl %edx, %ds; \
 	movl %edx, %es; \
 	movl $(__KERNEL_PDA), %edx; \
-	movl %edx, %gs
+	movl %edx, %fs
 
 #define RESTORE_INT_REGS \
 	popl %ebx;	\
@@ -166,9 +166,9 @@ VM_MASK		= 0x00020000
 2:	popl %es;	\
 	CFI_ADJUST_CFA_OFFSET -4;\
 	/*CFI_RESTORE es;*/\
-3:	popl %gs;	\
+3:	popl %fs;	\
 	CFI_ADJUST_CFA_OFFSET -4;\
-	/*CFI_RESTORE gs;*/\
+	/*CFI_RESTORE fs;*/\
 .pushsection .fixup,"ax";	\
 4:	movl $0,(%esp);	\
 	jmp 1b;		\
@@ -349,11 +349,11 @@ sysenter_past_esp:
 	movl PT_OLDESP(%esp), %ecx
 	xorl %ebp,%ebp
 	TRACE_IRQS_ON
-1:	mov  PT_GS(%esp), %gs
+1:	mov  PT_FS(%esp), %fs
 	ENABLE_INTERRUPTS_SYSEXIT
 	CFI_ENDPROC
 .pushsection .fixup,"ax"
-2:	movl $0,PT_GS(%esp)
+2:	movl $0,PT_FS(%esp)
 	jmp 1b
 .section __ex_table,"a"
 	.align 4
@@ -550,7 +550,7 @@ syscall_badsys:
 
 #define FIXUP_ESPFIX_STACK \
 	/* since we are on a wrong stack, we cant make it a C code :( */ \
-	movl %gs:PDA_cpu, %ebx; \
+	movl %fs:PDA_cpu, %ebx; \
 	PER_CPU(cpu_gdt_descr, %ebx); \
 	movl GDS_address(%ebx), %ebx; \
 	GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah); \
@@ -632,7 +632,7 @@ KPROBE_ENTRY(page_fault)
 	CFI_ADJUST_CFA_OFFSET 4
 	ALIGN
 error_code:
-	/* the function address is in %gs's slot on the stack */
+	/* the function address is in %fs's slot on the stack */
 	pushl %es
 	CFI_ADJUST_CFA_OFFSET 4
 	/*CFI_REL_OFFSET es, 0*/
@@ -661,20 +661,20 @@ error_code:
 	CFI_ADJUST_CFA_OFFSET 4
 	CFI_REL_OFFSET ebx, 0
 	cld
-	pushl %gs
+	pushl %fs
 	CFI_ADJUST_CFA_OFFSET 4
-	/*CFI_REL_OFFSET gs, 0*/
+	/*CFI_REL_OFFSET fs, 0*/
 	movl $(__KERNEL_PDA), %ecx
-	movl %ecx, %gs
+	movl %ecx, %fs
 	UNWIND_ESPFIX_STACK
 	popl %ecx
 	CFI_ADJUST_CFA_OFFSET -4
 	/*CFI_REGISTER es, ecx*/
-	movl PT_GS(%esp), %edi		# get the function address
+	movl PT_FS(%esp), %edi		# get the function address
 	movl PT_ORIG_EAX(%esp), %edx	# get the error code
 	movl $-1, PT_ORIG_EAX(%esp)	# no syscall to restart
-	mov  %ecx, PT_GS(%esp)
-	/*CFI_REL_OFFSET gs, ES*/
+	mov  %ecx, PT_FS(%esp)
+	/*CFI_REL_OFFSET fs, ES*/
 	movl $(__USER_DS), %ecx
 	movl %ecx, %ds
 	movl %ecx, %es
Index: linux/arch/i386/kernel/head.S
===================================================================
--- linux.orig/arch/i386/kernel/head.S
+++ linux/arch/i386/kernel/head.S
@@ -319,12 +319,12 @@ is386:	movl $2,%ecx		# set MP
 	movl %eax,%ds
 	movl %eax,%es
 
-	xorl %eax,%eax			# Clear FS and LDT
-	movl %eax,%fs
+	xorl %eax,%eax			# Clear GS and LDT
+	movl %eax,%gs
 	lldt %ax
 
 	movl $(__KERNEL_PDA),%eax
-	mov  %eax,%gs
+	mov  %eax,%fs
 
 	cld			# gcc2 wants the direction flag cleared at all times
 	pushl $0		# fake return address for unwinder
Index: linux/arch/i386/kernel/kprobes.c
===================================================================
--- linux.orig/arch/i386/kernel/kprobes.c
+++ linux/arch/i386/kernel/kprobes.c
@@ -363,7 +363,7 @@ no_kprobe:
 			"	pushf\n"
 			/* skip cs, eip, orig_eax */
 			"	subl $12, %esp\n"
-			"	pushl %gs\n"
+			"	pushl %fs\n"
 			"	pushl %ds\n"
 			"	pushl %es\n"
 			"	pushl %eax\n"
@@ -387,7 +387,7 @@ no_kprobe:
 			"	popl %edi\n"
 			"	popl %ebp\n"
 			"	popl %eax\n"
-			/* skip eip, orig_eax, es, ds, gs */
+			/* skip eip, orig_eax, es, ds, fs */
 			"	addl $20, %esp\n"
 			"	popf\n"
 			"	ret\n");
Index: linux/arch/i386/kernel/process.c
===================================================================
--- linux.orig/arch/i386/kernel/process.c
+++ linux/arch/i386/kernel/process.c
@@ -308,8 +308,8 @@ void show_regs(struct pt_regs * regs)
 		regs->eax,regs->ebx,regs->ecx,regs->edx);
 	printk("ESI: %08lx EDI: %08lx EBP: %08lx",
 		regs->esi, regs->edi, regs->ebp);
-	printk(" DS: %04x ES: %04x GS: %04x\n",
-	       0xffff & regs->xds,0xffff & regs->xes, 0xffff & regs->xgs);
+	printk(" DS: %04x ES: %04x FS: %04x\n",
+	       0xffff & regs->xds,0xffff & regs->xes, 0xffff & regs->xfs);
 
 	cr0 = read_cr0();
 	cr2 = read_cr2();
@@ -340,7 +340,7 @@ int kernel_thread(int (*fn)(void *), voi
 
 	regs.xds = __USER_DS;
 	regs.xes = __USER_DS;
-	regs.xgs = __KERNEL_PDA;
+	regs.xfs = __KERNEL_PDA;
 	regs.orig_eax = -1;
 	regs.eip = (unsigned long) kernel_thread_helper;
 	regs.xcs = __KERNEL_CS | get_kernel_rpl();
@@ -425,7 +425,7 @@ int copy_thread(int nr, unsigned long cl
 
 	p->thread.eip = (unsigned long) ret_from_fork;
 
-	savesegment(fs,p->thread.fs);
+	savesegment(gs,p->thread.gs);
 
 	tsk = current;
 	if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
@@ -501,8 +501,8 @@ void dump_thread(struct pt_regs * regs, 
 	dump->regs.eax = regs->eax;
 	dump->regs.ds = regs->xds;
 	dump->regs.es = regs->xes;
-	savesegment(fs,dump->regs.fs);
-	dump->regs.gs = regs->xgs;
+	dump->regs.fs = regs->xfs;
+	savesegment(gs,dump->regs.gs);
 	dump->regs.orig_eax = regs->orig_eax;
 	dump->regs.eip = regs->eip;
 	dump->regs.cs = regs->xcs;
@@ -653,7 +653,7 @@ struct task_struct fastcall * __switch_t
 	load_esp0(tss, next);
 
 	/*
-	 * Save away %fs. No need to save %gs, as it was saved on the
+	 * Save away %gs. No need to save %fs, as it was saved on the
 	 * stack on entry.  No need to save %es and %ds, as those are
 	 * always kernel segments while inside the kernel.  Doing this
 	 * before setting the new TLS descriptors avoids the situation
@@ -662,7 +662,7 @@ struct task_struct fastcall * __switch_t
 	 * used %fs or %gs (it does not today), or if the kernel is
 	 * running inside of a hypervisor layer.
 	 */
-	savesegment(fs, prev->fs);
+	savesegment(gs, prev->gs);
 
 	/*
 	 * Load the per-thread Thread-Local Storage descriptor.
@@ -670,12 +670,10 @@ struct task_struct fastcall * __switch_t
 	load_TLS(next, cpu);
 
 	/*
-	 * Restore %fs if needed.
-	 *
-	 * Glibc normally makes %fs be zero.
+	 * Restore %gs if needed (which is common)
 	 */
-	if (unlikely(prev->fs | next->fs))
-		loadsegment(fs, next->fs);
+	if (prev->gs | next->gs)
+		loadsegment(gs, next->gs);
 
 	write_pda(pcurrent, next_p);
 
Index: linux/arch/i386/kernel/ptrace.c
===================================================================
--- linux.orig/arch/i386/kernel/ptrace.c
+++ linux/arch/i386/kernel/ptrace.c
@@ -89,14 +89,14 @@ static int putreg(struct task_struct *ch
 	unsigned long regno, unsigned long value)
 {
 	switch (regno >> 2) {
-		case FS:
+		case GS:
 			if (value && (value & 3) != 3)
 				return -EIO;
-			child->thread.fs = value;
+			child->thread.gs = value;
 			return 0;
 		case DS:
 		case ES:
-		case GS:
+		case FS:
 			if (value && (value & 3) != 3)
 				return -EIO;
 			value &= 0xffff;
@@ -112,7 +112,7 @@ static int putreg(struct task_struct *ch
 			value |= get_stack_long(child, EFL_OFFSET) & ~FLAG_MASK;
 			break;
 	}
-	if (regno > ES*4)
+	if (regno > FS*4)
 		regno -= 1*4;
 	put_stack_long(child, regno, value);
 	return 0;
@@ -124,18 +124,18 @@ static unsigned long getreg(struct task_
 	unsigned long retval = ~0UL;
 
 	switch (regno >> 2) {
-		case FS:
-			retval = child->thread.fs;
+		case GS:
+			retval = child->thread.gs;
 			break;
 		case DS:
 		case ES:
-		case GS:
+		case FS:
 		case SS:
 		case CS:
 			retval = 0xffff;
 			/* fall through */
 		default:
-			if (regno > ES*4)
+			if (regno > FS*4)
 				regno -= 1*4;
 			retval &= get_stack_long(child, regno);
 	}
Index: linux/arch/i386/kernel/signal.c
===================================================================
--- linux.orig/arch/i386/kernel/signal.c
+++ linux/arch/i386/kernel/signal.c
@@ -128,8 +128,8 @@ restore_sigcontext(struct pt_regs *regs,
 			 X86_EFLAGS_TF | X86_EFLAGS_SF | X86_EFLAGS_ZF | \
 			 X86_EFLAGS_AF | X86_EFLAGS_PF | X86_EFLAGS_CF)
 
-	COPY_SEG(gs);
-	GET_SEG(fs);
+	GET_SEG(gs);
+	COPY_SEG(fs);
 	COPY_SEG(es);
 	COPY_SEG(ds);
 	COPY(edi);
@@ -244,9 +244,9 @@ setup_sigcontext(struct sigcontext __use
 {
 	int tmp, err = 0;
 
-	err |= __put_user(regs->xgs, (unsigned int __user *)&sc->gs);
-	savesegment(fs, tmp);
-	err |= __put_user(tmp, (unsigned int __user *)&sc->fs);
+	err |= __put_user(regs->xfs, (unsigned int __user *)&sc->fs);
+	savesegment(gs, tmp);
+	err |= __put_user(tmp, (unsigned int __user *)&sc->gs);
 
 	err |= __put_user(regs->xes, (unsigned int __user *)&sc->es);
 	err |= __put_user(regs->xds, (unsigned int __user *)&sc->ds);
Index: linux/arch/i386/kernel/traps.c
===================================================================
--- linux.orig/arch/i386/kernel/traps.c
+++ linux/arch/i386/kernel/traps.c
@@ -291,10 +291,11 @@ void show_registers(struct pt_regs *regs
 	int i;
 	int in_kernel = 1;
 	unsigned long esp;
-	unsigned short ss;
+	unsigned short ss, gs;
 
 	esp = (unsigned long) (&regs->esp);
 	savesegment(ss, ss);
+	savesegment(gs, gs);
 	if (user_mode_vm(regs)) {
 		in_kernel = 0;
 		esp = regs->esp;
@@ -313,8 +314,8 @@ void show_registers(struct pt_regs *regs
 		regs->eax, regs->ebx, regs->ecx, regs->edx);
 	printk(KERN_EMERG "esi: %08lx   edi: %08lx   ebp: %08lx   esp: %08lx\n",
 		regs->esi, regs->edi, regs->ebp, esp);
-	printk(KERN_EMERG "ds: %04x   es: %04x   ss: %04x\n",
-		regs->xds & 0xffff, regs->xes & 0xffff, ss);
+	printk(KERN_EMERG "ds: %04x   es: %04x   fs: %04x  gs: %04x  ss: %04x\n",
+	       regs->xds & 0xffff, regs->xes & 0xffff, regs->xfs & 0xffff, gs, ss);
 	printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)",
 		TASK_COMM_LEN, current->comm, current->pid,
 		current_thread_info(), current, current->thread_info);
Index: linux/arch/i386/kernel/vm86.c
===================================================================
--- linux.orig/arch/i386/kernel/vm86.c
+++ linux/arch/i386/kernel/vm86.c
@@ -96,12 +96,12 @@ static int copy_vm86_regs_to_user(struct
 {
 	int ret = 0;
 
-	/* kernel_vm86_regs is missing xfs, so copy everything up to
-	   (but not including) xgs, and then rest after xgs. */
-	ret += copy_to_user(user, regs, offsetof(struct kernel_vm86_regs, pt.xgs));
-	ret += copy_to_user(&user->__null_gs, &regs->pt.xgs,
+	/* kernel_vm86_regs is missing xgs, so copy everything up to
+	   (but not including) orig_eax, and then rest including orig_eax. */
+	ret += copy_to_user(user, regs, offsetof(struct kernel_vm86_regs, pt.orig_eax));
+	ret += copy_to_user(&user->orig_eax, &regs->pt.orig_eax,
 			    sizeof(struct kernel_vm86_regs) -
-			    offsetof(struct kernel_vm86_regs, pt.xgs));
+			    offsetof(struct kernel_vm86_regs, pt.orig_eax));
 
 	return ret;
 }
@@ -113,12 +113,13 @@ static int copy_vm86_regs_from_user(stru
 {
 	int ret = 0;
 
-	ret += copy_from_user(regs, user, offsetof(struct kernel_vm86_regs, pt.xgs));
-	ret += copy_from_user(&regs->pt.xgs, &user->__null_gs,
+	/* copy eax-xfs inclusive */
+	ret += copy_from_user(regs, user, offsetof(struct kernel_vm86_regs, pt.orig_eax));
+	/* copy orig_eax-__gsh+extra */
+	ret += copy_from_user(&regs->pt.orig_eax, &user->orig_eax,
 			      sizeof(struct kernel_vm86_regs) -
-			      offsetof(struct kernel_vm86_regs, pt.xgs) +
+			      offsetof(struct kernel_vm86_regs, pt.orig_eax) +
 			      extra);
-
 	return ret;
 }
 
@@ -157,8 +158,8 @@ struct pt_regs * fastcall save_v86_state
 
 	ret = KVM86->regs32;
 
-	loadsegment(fs, current->thread.saved_fs);
-	ret->xgs = current->thread.saved_gs;
+	ret->xfs = current->thread.saved_fs;
+	loadsegment(gs, current->thread.saved_gs);
 
 	return ret;
 }
@@ -285,9 +286,9 @@ static void do_sys_vm86(struct kernel_vm
  */
 	info->regs.pt.xds = 0;
 	info->regs.pt.xes = 0;
-	info->regs.pt.xgs = 0;
+	info->regs.pt.xfs = 0;
 
-/* we are clearing fs later just before "jmp resume_userspace",
+/* we are clearing gs later just before "jmp resume_userspace",
  * because it is not saved/restored.
  */
 
@@ -321,8 +322,8 @@ static void do_sys_vm86(struct kernel_vm
  */
 	info->regs32->eax = 0;
 	tsk->thread.saved_esp0 = tsk->thread.esp0;
-	savesegment(fs, tsk->thread.saved_fs);
-	tsk->thread.saved_gs = info->regs32->xgs;
+	tsk->thread.saved_fs = info->regs32->xfs;
+	savesegment(gs, tsk->thread.saved_gs);
 
 	tss = &per_cpu(init_tss, get_cpu());
 	tsk->thread.esp0 = (unsigned long) &info->VM86_TSS_ESP0;
@@ -342,7 +343,7 @@ static void do_sys_vm86(struct kernel_vm
 	__asm__ __volatile__(
 		"movl %0,%%esp\n\t"
 		"movl %1,%%ebp\n\t"
-		"mov  %2, %%fs\n\t"
+		"mov  %2, %%gs\n\t"
 		"jmp resume_userspace"
 		: /* no outputs */
 		:"r" (&info->regs), "r" (task_thread_info(tsk)), "r" (0));
Index: linux/arch/i386/math-emu/get_address.c
===================================================================
--- linux.orig/arch/i386/math-emu/get_address.c
+++ linux/arch/i386/math-emu/get_address.c
@@ -56,15 +56,14 @@ static int reg_offset_vm86[] = {
 #define VM86_REG_(x) (*(unsigned short *) \
 		      (reg_offset_vm86[((unsigned)x)]+(u_char *) FPU_info))
 
-/* These are dummy, fs and gs are not saved on the stack. */
-#define ___FS ___ds
+/* This dummy, gs is not saved on the stack. */
 #define ___GS ___ds
 
 static int reg_offset_pm[] = {
 	offsetof(struct info,___cs),
 	offsetof(struct info,___ds),
 	offsetof(struct info,___es),
-	offsetof(struct info,___FS),
+	offsetof(struct info,___fs),
 	offsetof(struct info,___GS),
 	offsetof(struct info,___ss),
 	offsetof(struct info,___ds)
@@ -169,13 +168,10 @@ static long pm_address(u_char FPU_modrm,
 
   switch ( segment )
     {
-      /* fs and gs aren't used by the kernel, so they still have their
-	 user-space values. */
-    case PREFIX_FS_-1:
-      /* N.B. - movl %seg, mem is a 2 byte write regardless of prefix */
-      savesegment(fs, addr->selector);
-      break;
+      /* gs isn't used by the kernel, so it still has its
+	 user-space value. */
     case PREFIX_GS_-1:
+      /* N.B. - movl %seg, mem is a 2 byte write regardless of prefix */
       savesegment(gs, addr->selector);
       break;
     default:
Index: linux/include/asm-i386/elf.h
===================================================================
--- linux.orig/include/asm-i386/elf.h
+++ linux/include/asm-i386/elf.h
@@ -90,8 +90,8 @@ typedef struct user_fxsr_struct elf_fpxr
 	pr_reg[6] = regs->eax;				\
 	pr_reg[7] = regs->xds;				\
 	pr_reg[8] = regs->xes;				\
-	savesegment(fs,pr_reg[9]);			\
-	pr_reg[10] = regs->xgs;				\
+	pr_reg[9] = regs->xfs;				\
+	savesegment(gs,pr_reg[10]);			\
 	pr_reg[11] = regs->orig_eax;			\
 	pr_reg[12] = regs->eip;				\
 	pr_reg[13] = regs->xcs;				\
Index: linux/include/asm-i386/mmu_context.h
===================================================================
--- linux.orig/include/asm-i386/mmu_context.h
+++ linux/include/asm-i386/mmu_context.h
@@ -63,7 +63,7 @@ static inline void switch_mm(struct mm_s
 }
 
 #define deactivate_mm(tsk, mm)			\
-	asm("movl %0,%%fs": :"r" (0));
+	asm("movl %0,%%gs": :"r" (0));
 
 #define activate_mm(prev, next) \
 	switch_mm((prev),(next),NULL)
Index: linux/include/asm-i386/pda.h
===================================================================
--- linux.orig/include/asm-i386/pda.h
+++ linux/include/asm-i386/pda.h
@@ -39,19 +39,19 @@ extern struct i386_pda _proxy_pda;
 		if (0) { T__ tmp__; tmp__ = (val); }			\
 		switch (sizeof(_proxy_pda.field)) {			\
 		case 1:							\
-			asm(op "b %1,%%gs:%c2"				\
+			asm(op "b %1,%%fs:%c2"				\
 			    : "+m" (_proxy_pda.field)			\
 			    :"ri" ((T__)val),				\
 			     "i"(pda_offset(field)));			\
 			break;						\
 		case 2:							\
-			asm(op "w %1,%%gs:%c2"				\
+			asm(op "w %1,%%fs:%c2"				\
 			    : "+m" (_proxy_pda.field)			\
 			    :"ri" ((T__)val),				\
 			     "i"(pda_offset(field)));			\
 			break;						\
 		case 4:							\
-			asm(op "l %1,%%gs:%c2"				\
+			asm(op "l %1,%%fs:%c2"				\
 			    : "+m" (_proxy_pda.field)			\
 			    :"ri" ((T__)val),				\
 			     "i"(pda_offset(field)));			\
@@ -65,19 +65,19 @@ extern struct i386_pda _proxy_pda;
 		typeof(_proxy_pda.field) ret__;				\
 		switch (sizeof(_proxy_pda.field)) {			\
 		case 1:							\
-			asm(op "b %%gs:%c1,%0"				\
+			asm(op "b %%fs:%c1,%0"				\
 			    : "=r" (ret__)				\
 			    : "i" (pda_offset(field)),			\
 			      "m" (_proxy_pda.field));			\
 			break;						\
 		case 2:							\
-			asm(op "w %%gs:%c1,%0"				\
+			asm(op "w %%fs:%c1,%0"				\
 			    : "=r" (ret__)				\
 			    : "i" (pda_offset(field)),			\
 			      "m" (_proxy_pda.field));			\
 			break;						\
 		case 4:							\
-			asm(op "l %%gs:%c1,%0"				\
+			asm(op "l %%fs:%c1,%0"				\
 			    : "=r" (ret__)				\
 			    : "i" (pda_offset(field)),			\
 			      "m" (_proxy_pda.field));			\
Index: linux/include/asm-i386/processor.h
===================================================================
--- linux.orig/include/asm-i386/processor.h
+++ linux/include/asm-i386/processor.h
@@ -424,7 +424,7 @@ struct thread_struct {
 	.vm86_info = NULL,						\
 	.sysenter_cs = __KERNEL_CS,					\
 	.io_bitmap_ptr = NULL,						\
-	.gs = __KERNEL_PDA,						\
+	.fs = __KERNEL_PDA,						\
 }
 
 /*
@@ -442,8 +442,8 @@ struct thread_struct {
 }
 
 #define start_thread(regs, new_eip, new_esp) do {		\
-	__asm__("movl %0,%%fs": :"r" (0));			\
-	regs->xgs = 0;						\
+	__asm__("movl %0,%%gs": :"r" (0));			\
+	regs->xfs = 0;						\
 	set_fs(USER_DS);					\
 	regs->xds = __USER_DS;					\
 	regs->xes = __USER_DS;					\
Index: linux/include/asm-i386/ptrace.h
===================================================================
--- linux.orig/include/asm-i386/ptrace.h
+++ linux/include/asm-i386/ptrace.h
@@ -16,8 +16,8 @@ struct pt_regs {
 	long eax;
 	int  xds;
 	int  xes;
-	/* int  xfs; */
-	int  xgs;
+	int  xfs;
+	/* int  xgs; */
 	long orig_eax;
 	long eip;
 	int  xcs;
Index: linux/drivers/kvm/vmx.c
===================================================================
--- linux.orig/drivers/kvm/vmx.c
+++ linux/drivers/kvm/vmx.c
@@ -1863,12 +1863,6 @@ again:
 	asm ("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
 #endif
 
-	/*
-	 * Profile KVM exit RIPs:
-	 */
-	if (unlikely(prof_on == KVM_PROFILING))
-		profile_hit(KVM_PROFILING, (void *)vmcs_readl(GUEST_RIP));
-
 	kvm_run->exit_type = 0;
 	if (fail) {
 		kvm_run->exit_type = KVM_EXIT_TYPE_FAIL_ENTRY;
@@ -1891,6 +1885,12 @@ again:
 
 			reload_tss();
 		}
+		/*
+		 * Profile KVM exit RIPs:
+		 */
+		if (unlikely(prof_on == KVM_PROFILING))
+			profile_hit(KVM_PROFILING, (void *)vmcs_readl(GUEST_RIP));
+
 		vcpu->launched = 1;
 		kvm_run->exit_type = KVM_EXIT_TYPE_VM_EXIT;
 		r = kvm_handle_exit(kvm_run, vcpu);

^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCH 2.6.21 review I] [4/25] x86: kernel-mode faults pollute current->thead
  2007-02-10 11:50 [PATCH 2.6.21 review I] [1/25] x86_64: Add __copy_from_user_nocache Andi Kleen
  2007-02-10 11:50 ` [PATCH 2.6.21 review I] [2/25] x86_64: Make the NUMA hash function nodemap allocation Andi Kleen
  2007-02-10 11:50 ` [PATCH 2.6.21 review I] [3/25] i386: Convert i386 PDA code to use %fs Andi Kleen
@ 2007-02-10 11:50 ` Andi Kleen
  2007-02-12  9:32   ` [patches] " Jan Beulich
  2007-02-10 11:50 ` [PATCH 2.6.21 review I] [5/25] i386: revert i386-fix-the-verify_quirk_intel_irqbalance Andi Kleen
                   ` (20 subsequent siblings)
  23 siblings, 1 reply; 43+ messages in thread
From: Andi Kleen @ 2007-02-10 11:50 UTC (permalink / raw)
  To: Jeff Dike, Andi Kleen, patches, linux-kernel


From: Jeff Dike <jdike@addtoit.com>

Kernel-mode traps on x86_64 can pollute the trap information for a previous
userspace trap for which the signal has not yet been delivered to the
process.

do_trap and do_general_protection set task->thread.error_code and .trapno
for kernel traps.  If a kernel-mode trap arrives between the arrival of a
userspace trap and the delivery of the associated SISGEGV to the process,
the process will get the kernel trap information in its sigcontext.

This causes UML process segfaults, as the trapno that the UML kernel sees
is 13, rather than the 14 for normal page faults.  So, the UML kernel
passes the SIGSEGV along to its process.

I don't claim to fully understand the problem.  On the one hand, a check in
do_general_protection for a pending SIGSEGV turned up nothing.  On the
other hand, this patch fixed the UML process segfault problem.

The patch below moves the setting of error_code and trapno so that that
only happens in the case of userspace faults.  As a side-effect, this
should speed up kernel-mode fault handling a tiny bit.

I looked at i386, and there is a similar situation.  In this case, there is
duplicate code setting task->thread.error_code and trapno.  I deleted one,
leaving the copy that runs in the case of a userspace fault.

Signed-off-by: Jeff Dike <jdike@addtoit.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
---

 arch/i386/kernel/traps.c   |    8 +++-----
 arch/x86_64/kernel/traps.c |   12 ++++++------
 2 files changed, 9 insertions(+), 11 deletions(-)

Index: linux/arch/i386/kernel/traps.c
===================================================================
--- linux.orig/arch/i386/kernel/traps.c
+++ linux/arch/i386/kernel/traps.c
@@ -474,8 +474,6 @@ static void __kprobes do_trap(int trapnr
 			      siginfo_t *info)
 {
 	struct task_struct *tsk = current;
-	tsk->thread.error_code = error_code;
-	tsk->thread.trap_no = trapnr;
 
 	if (regs->eflags & VM_MASK) {
 		if (vm86)
@@ -487,6 +485,9 @@ static void __kprobes do_trap(int trapnr
 		goto kernel_trap;
 
 	trap_signal: {
+		tsk->thread.error_code = error_code;
+		tsk->thread.trap_no = trapnr;
+
 		if (info)
 			force_sig_info(signr, info, tsk);
 		else
@@ -601,9 +602,6 @@ fastcall void __kprobes do_general_prote
 	}
 	put_cpu();
 
-	current->thread.error_code = error_code;
-	current->thread.trap_no = 13;
-
 	if (regs->eflags & VM_MASK)
 		goto gp_in_vm86;
 
Index: linux/arch/x86_64/kernel/traps.c
===================================================================
--- linux.orig/arch/x86_64/kernel/traps.c
+++ linux/arch/x86_64/kernel/traps.c
@@ -581,10 +581,10 @@ static void __kprobes do_trap(int trapnr
 {
 	struct task_struct *tsk = current;
 
-	tsk->thread.error_code = error_code;
-	tsk->thread.trap_no = trapnr;
-
 	if (user_mode(regs)) {
+		tsk->thread.error_code = error_code;
+		tsk->thread.trap_no = trapnr;
+
 		if (exception_trace && unhandled_signal(tsk, signr))
 			printk(KERN_INFO
 			       "%s[%d] trap %s rip:%lx rsp:%lx error:%lx\n",
@@ -682,10 +682,10 @@ asmlinkage void __kprobes do_general_pro
 
 	conditional_sti(regs);
 
-	tsk->thread.error_code = error_code;
-	tsk->thread.trap_no = 13;
-
 	if (user_mode(regs)) {
+		tsk->thread.error_code = error_code;
+		tsk->thread.trap_no = 13;
+
 		if (exception_trace && unhandled_signal(tsk, SIGSEGV))
 			printk(KERN_INFO
 		       "%s[%d] general protection rip:%lx rsp:%lx error:%lx\n",

^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCH 2.6.21 review I] [5/25] i386: revert i386-fix-the-verify_quirk_intel_irqbalance
  2007-02-10 11:50 [PATCH 2.6.21 review I] [1/25] x86_64: Add __copy_from_user_nocache Andi Kleen
                   ` (2 preceding siblings ...)
  2007-02-10 11:50 ` [PATCH 2.6.21 review I] [4/25] x86: kernel-mode faults pollute current->thead Andi Kleen
@ 2007-02-10 11:50 ` Andi Kleen
  2007-02-10 11:50 ` [PATCH 2.6.21 review I] [6/25] x86_64: revert x86_64-mm-add-genapic_force Andi Kleen
                   ` (19 subsequent siblings)
  23 siblings, 0 replies; 43+ messages in thread
From: Andi Kleen @ 2007-02-10 11:50 UTC (permalink / raw)
  To: Andrew Morton, Suresh Siddha, Andi Kleen, Ingo Molnar, patches,
	linux-kernel


From: Andrew Morton <akpm@osdl.org>

This is unneeded with Ingo's genapic rework.

Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: Andi Kleen <ak@suse.de>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Andi Kleen <ak@suse.de>

---

 arch/i386/kernel/quirks.c |   33 ++++-----------------------------
 1 file changed, 4 insertions(+), 29 deletions(-)

Index: linux/arch/i386/kernel/quirks.c
===================================================================
--- linux.orig/arch/i386/kernel/quirks.c
+++ linux/arch/i386/kernel/quirks.c
@@ -10,38 +10,13 @@
 #if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_SMP) && defined(CONFIG_PCI)
 static void __devinit verify_quirk_intel_irqbalance(struct pci_dev *dev)
 {
-	u8 config, rev;
-	u32 word;
-
-	/* BIOS may enable hardware IRQ balancing for
-	 * E7520/E7320/E7525(revision ID 0x9 and below)
-	 * based platforms.
-	 * For those platforms, make sure that the genapic is set to 'flat'
-	 */
-	pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev);
-	if (rev > 0x9)
-		return;
-
-	/* enable access to config space*/
-	pci_read_config_byte(dev, 0xf4, &config);
-	pci_write_config_byte(dev, 0xf4, config|0x2);
-
-	/* read xTPR register */
-	raw_pci_ops->read(0, 0, 0x40, 0x4c, 2, &word);
-
-	if (!(word & (1 << 13))) {
 #ifdef CONFIG_X86_64
-		if (genapic !=  &apic_flat)
-			panic("APIC mode must be flat on this system\n");
+	if (genapic !=  &apic_flat)
+		panic("APIC mode must be flat on this system\n");
 #elif defined(CONFIG_X86_GENERICARCH)
-		if (genapic != &apic_default)
-			panic("APIC mode must be default(flat) on this system. Use apic=default\n");
+	if (genapic != &apic_default)
+		panic("APIC mode must be default(flat) on this system. Use apic=default\n");
 #endif
-	}
-
-	/* put back the original value for config space*/
-	if (!(config & 0x2))
-		pci_write_config_byte(dev, 0xf4, config);
 }
 
 void __init quirk_intel_irqbalance(void)

^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCH 2.6.21 review I] [6/25] x86_64: revert x86_64-mm-add-genapic_force
  2007-02-10 11:50 [PATCH 2.6.21 review I] [1/25] x86_64: Add __copy_from_user_nocache Andi Kleen
                   ` (3 preceding siblings ...)
  2007-02-10 11:50 ` [PATCH 2.6.21 review I] [5/25] i386: revert i386-fix-the-verify_quirk_intel_irqbalance Andi Kleen
@ 2007-02-10 11:50 ` Andi Kleen
  2007-02-10 11:50 ` [PATCH 2.6.21 review I] [7/25] x86: revert x86_64-mm-fix-the-irqbalance-quirk-for-e7320-e7520-e7525 Andi Kleen
                   ` (18 subsequent siblings)
  23 siblings, 0 replies; 43+ messages in thread
From: Andi Kleen @ 2007-02-10 11:50 UTC (permalink / raw)
  To: Andrew Morton, Suresh Siddha, Andi Kleen, Li, Shaohua,
	Ingo Molnar, patches, linux-kernel


From: Andrew Morton <akpm@osdl.org>

This is obsoleted by new Ingo genapic patches.

Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: Andi Kleen <ak@suse.de>
Cc: "Li, Shaohua" <shaohua.li@intel.com>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Andi Kleen <ak@suse.de>

---

 arch/x86_64/kernel/genapic.c |    9 +--------
 include/asm-x86_64/genapic.h |    2 +-
 2 files changed, 2 insertions(+), 9 deletions(-)

Index: linux/arch/x86_64/kernel/genapic.c
===================================================================
--- linux.orig/arch/x86_64/kernel/genapic.c
+++ linux/arch/x86_64/kernel/genapic.c
@@ -33,7 +33,7 @@ extern struct genapic apic_flat;
 extern struct genapic apic_physflat;
 
 struct genapic *genapic = &apic_flat;
-struct genapic *genapic_force;
+
 
 /*
  * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode.
@@ -46,13 +46,6 @@ void __init clustered_apic_check(void)
 	u8 cluster_cnt[NUM_APIC_CLUSTERS];
 	int max_apic = 0;
 
-	/* genapic selection can be forced because of certain quirks.
-	 */
-	if (genapic_force) {
-		genapic = genapic_force;
-		goto print;
-	}
-
 #if defined(CONFIG_ACPI)
 	/*
 	 * Some x86_64 machines use physical APIC mode regardless of how many
Index: linux/include/asm-x86_64/genapic.h
===================================================================
--- linux.orig/include/asm-x86_64/genapic.h
+++ linux/include/asm-x86_64/genapic.h
@@ -30,6 +30,6 @@ struct genapic {
 };
 
 
-extern struct genapic *genapic, *genapic_force, apic_flat;
+extern struct genapic *genapic;
 
 #endif

^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCH 2.6.21 review I] [7/25] x86: revert x86_64-mm-fix-the-irqbalance-quirk-for-e7320-e7520-e7525
  2007-02-10 11:50 [PATCH 2.6.21 review I] [1/25] x86_64: Add __copy_from_user_nocache Andi Kleen
                   ` (4 preceding siblings ...)
  2007-02-10 11:50 ` [PATCH 2.6.21 review I] [6/25] x86_64: revert x86_64-mm-add-genapic_force Andi Kleen
@ 2007-02-10 11:50 ` Andi Kleen
  2007-02-10 11:50 ` [PATCH 2.6.21 review I] [8/25] x86_64: optimize & fix APIC mode setup Andi Kleen
                   ` (17 subsequent siblings)
  23 siblings, 0 replies; 43+ messages in thread
From: Andi Kleen @ 2007-02-10 11:50 UTC (permalink / raw)
  To: Andrew Morton, Ingo Molnar, Suresh Siddha, Andi Kleen, Li,
	Shaohua, patches, linux-kernel


From: Andrew Morton <akpm@osdl.org>

Obsoleted by Ingo's genapic stuff.

Cc: Ingo Molnar <mingo@elte.hu>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: Andi Kleen <ak@suse.de>
Cc: "Li, Shaohua" <shaohua.li@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Andi Kleen <ak@suse.de>

---

 arch/i386/kernel/acpi/earlyquirk.c |   21 ----------------
 arch/i386/kernel/quirks.c          |   46 ++++++++-----------------------------
 arch/i386/kernel/smpboot.c         |    7 -----
 arch/x86_64/kernel/early-quirks.c  |   13 ----------
 arch/x86_64/kernel/smpboot.c       |    8 ------
 include/asm-i386/genapic.h         |    2 -
 include/asm-i386/irq.h             |    2 -
 include/asm-x86_64/proto.h         |    1 
 8 files changed, 12 insertions(+), 88 deletions(-)

Index: linux/arch/i386/kernel/acpi/earlyquirk.c
===================================================================
--- linux.orig/arch/i386/kernel/acpi/earlyquirk.c
+++ linux/arch/i386/kernel/acpi/earlyquirk.c
@@ -10,7 +10,6 @@
 #include <asm/pci-direct.h>
 #include <asm/acpi.h>
 #include <asm/apic.h>
-#include <asm/irq.h>
 
 #ifdef CONFIG_ACPI
 
@@ -50,24 +49,6 @@ static int __init check_bridge(int vendo
 	return 0;
 }
 
-static void check_intel(void)
-{
-	u16 vendor, device;
-
-	vendor = read_pci_config_16(0, 0, 0, PCI_VENDOR_ID);
-
-	if (vendor != PCI_VENDOR_ID_INTEL)
-		return;
-
-	device = read_pci_config_16(0, 0, 0, PCI_DEVICE_ID);
-#ifdef CONFIG_SMP
-	if (device == PCI_DEVICE_ID_INTEL_E7320_MCH ||
-	    device == PCI_DEVICE_ID_INTEL_E7520_MCH ||
-	    device == PCI_DEVICE_ID_INTEL_E7525_MCH)
-		quirk_intel_irqbalance();
-#endif
-}
-
 void __init check_acpi_pci(void)
 {
 	int num, slot, func;
@@ -79,8 +60,6 @@ void __init check_acpi_pci(void)
 	if (!early_pci_allowed())
 		return;
 
-	check_intel();
-
 	/* Poor man's PCI discovery */
 	for (num = 0; num < 32; num++) {
 		for (slot = 0; slot < 32; slot++) {
Index: linux/arch/i386/kernel/quirks.c
===================================================================
--- linux.orig/arch/i386/kernel/quirks.c
+++ linux/arch/i386/kernel/quirks.c
@@ -3,23 +3,10 @@
  */
 #include <linux/pci.h>
 #include <linux/irq.h>
-#include <asm/pci-direct.h>
-#include <asm/genapic.h>
-#include <asm/cpu.h>
 
 #if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_SMP) && defined(CONFIG_PCI)
-static void __devinit verify_quirk_intel_irqbalance(struct pci_dev *dev)
-{
-#ifdef CONFIG_X86_64
-	if (genapic !=  &apic_flat)
-		panic("APIC mode must be flat on this system\n");
-#elif defined(CONFIG_X86_GENERICARCH)
-	if (genapic != &apic_default)
-		panic("APIC mode must be default(flat) on this system. Use apic=default\n");
-#endif
-}
 
-void __init quirk_intel_irqbalance(void)
+static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
 {
 	u8 config, rev;
 	u32 word;
@@ -29,18 +16,18 @@ void __init quirk_intel_irqbalance(void)
 	 * based platforms.
 	 * Disable SW irqbalance/affinity on those platforms.
 	 */
-	rev = read_pci_config_byte(0, 0, 0, PCI_CLASS_REVISION);
+	pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev);
 	if (rev > 0x9)
 		return;
 
 	printk(KERN_INFO "Intel E7520/7320/7525 detected.");
 
-	/* enable access to config space */
-	config = read_pci_config_byte(0, 0, 0, 0xf4);
-	write_pci_config_byte(0, 0, 0, 0xf4, config|0x2);
+	/* enable access to config space*/
+	pci_read_config_byte(dev, 0xf4, &config);
+	pci_write_config_byte(dev, 0xf4, config|0x2);
 
 	/* read xTPR register */
-	word = read_pci_config_16(0, 0, 0x40, 0x4c);
+	raw_pci_ops->read(0, 0, 0x40, 0x4c, 2, &word);
 
 	if (!(word & (1 << 13))) {
 		printk(KERN_INFO "Disabling irq balancing and affinity\n");
@@ -51,24 +38,13 @@ void __init quirk_intel_irqbalance(void)
 #ifdef CONFIG_PROC_FS
 		no_irq_affinity = 1;
 #endif
-#ifdef CONFIG_HOTPLUG_CPU
-		printk(KERN_INFO "Disabling cpu hotplug control\n");
-		enable_cpu_hotplug = 0;
-#endif
-#ifdef CONFIG_X86_64
-		/* force the genapic selection to flat mode so that
-		 * interrupts can be redirected to more than one CPU.
-		 */
-		genapic_force = &apic_flat;
-#endif
 	}
 
-	/* put back the original value for config space */
+	/* put back the original value for config space*/
 	if (!(config & 0x2))
-		write_pci_config_byte(0, 0, 0, 0xf4, config);
+		pci_write_config_byte(dev, 0xf4, config);
 }
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,   PCI_DEVICE_ID_INTEL_E7320_MCH,  verify_quirk_intel_irqbalance);
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,   PCI_DEVICE_ID_INTEL_E7525_MCH,  verify_quirk_intel_irqbalance);
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,   PCI_DEVICE_ID_INTEL_E7520_MCH,  verify_quirk_intel_irqbalance);
-
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,	PCI_DEVICE_ID_INTEL_E7320_MCH,	quirk_intel_irqbalance);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,	PCI_DEVICE_ID_INTEL_E7525_MCH,	quirk_intel_irqbalance);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,	PCI_DEVICE_ID_INTEL_E7520_MCH,	quirk_intel_irqbalance);
 #endif
Index: linux/arch/i386/kernel/smpboot.c
===================================================================
--- linux.orig/arch/i386/kernel/smpboot.c
+++ linux/arch/i386/kernel/smpboot.c
@@ -58,7 +58,6 @@
 #include <asm/arch_hooks.h>
 #include <asm/nmi.h>
 #include <asm/pda.h>
-#include <asm/genapic.h>
 
 #include <mach_apic.h>
 #include <mach_wakecpu.h>
@@ -1466,12 +1465,6 @@ int __cpuinit __cpu_up(unsigned int cpu)
 	cpu_set(cpu, smp_commenced_mask);
 	while (!cpu_isset(cpu, cpu_online_map))
 		cpu_relax();
-
-#ifdef CONFIG_X86_GENERICARCH
-	if (num_online_cpus() > 8 && genapic == &apic_default)
-		panic("Default flat APIC routing can't be used with > 8 cpus\n");
-#endif
-
 	return 0;
 }
 
Index: linux/arch/x86_64/kernel/early-quirks.c
===================================================================
--- linux.orig/arch/x86_64/kernel/early-quirks.c
+++ linux/arch/x86_64/kernel/early-quirks.c
@@ -76,18 +76,6 @@ static void ati_bugs(void)
 	}
 }
 
-static void intel_bugs(void)
-{
-	u16 device = read_pci_config_16(0, 0, 0, PCI_DEVICE_ID);
-
-#ifdef CONFIG_SMP
-	if (device == PCI_DEVICE_ID_INTEL_E7320_MCH ||
-	    device == PCI_DEVICE_ID_INTEL_E7520_MCH ||
-	    device == PCI_DEVICE_ID_INTEL_E7525_MCH)
-		quirk_intel_irqbalance();
-#endif
-}
-
 struct chipset {
 	u16 vendor;
 	void (*f)(void);
@@ -97,7 +85,6 @@ static struct chipset early_qrk[] = {
 	{ PCI_VENDOR_ID_NVIDIA, nvidia_bugs },
 	{ PCI_VENDOR_ID_VIA, via_bugs },
 	{ PCI_VENDOR_ID_ATI, ati_bugs },
-	{ PCI_VENDOR_ID_INTEL, intel_bugs},
 	{}
 };
 
Index: linux/arch/x86_64/kernel/smpboot.c
===================================================================
--- linux.orig/arch/x86_64/kernel/smpboot.c
+++ linux/arch/x86_64/kernel/smpboot.c
@@ -60,7 +60,6 @@
 #include <asm/irq.h>
 #include <asm/hw_irq.h>
 #include <asm/numa.h>
-#include <asm/genapic.h>
 
 /* Number of siblings per CPU package */
 int smp_num_siblings = 1;
@@ -1170,13 +1169,6 @@ int __cpuinit __cpu_up(unsigned int cpu)
 
 	while (!cpu_isset(cpu, cpu_online_map))
 		cpu_relax();
-
-	if (num_online_cpus() > 8 && genapic == &apic_flat) {
-		printk(KERN_WARNING
-		       "flat APIC routing can't be used with > 8 cpus\n");
-		BUG();
-	}
-
 	err = 0;
 
 	return err;
Index: linux/include/asm-i386/genapic.h
===================================================================
--- linux.orig/include/asm-i386/genapic.h
+++ linux/include/asm-i386/genapic.h
@@ -122,6 +122,6 @@ struct genapic { 
 	APICFUNC(phys_pkg_id) \
 	}
 
-extern struct genapic *genapic, apic_default;
+extern struct genapic *genapic;
 
 #endif
Index: linux/include/asm-i386/irq.h
===================================================================
--- linux.orig/include/asm-i386/irq.h
+++ linux/include/asm-i386/irq.h
@@ -37,8 +37,6 @@ static __inline__ int irq_canonicalize(i
 extern int irqbalance_disable(char *str);
 #endif
 
-extern void quirk_intel_irqbalance(void);
-
 #ifdef CONFIG_HOTPLUG_CPU
 extern void fixup_irqs(cpumask_t map);
 #endif
Index: linux/include/asm-x86_64/proto.h
===================================================================
--- linux.orig/include/asm-x86_64/proto.h
+++ linux/include/asm-x86_64/proto.h
@@ -87,7 +87,6 @@ extern void syscall32_cpu_init(void);
 extern void setup_node_bootmem(int nodeid, unsigned long start, unsigned long end);
 
 extern void early_quirks(void);
-extern void quirk_intel_irqbalance(void);
 extern void check_efer(void);
 
 extern int unhandled_signal(struct task_struct *tsk, int sig);

^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCH 2.6.21 review I] [8/25] x86_64: optimize & fix APIC mode setup
  2007-02-10 11:50 [PATCH 2.6.21 review I] [1/25] x86_64: Add __copy_from_user_nocache Andi Kleen
                   ` (5 preceding siblings ...)
  2007-02-10 11:50 ` [PATCH 2.6.21 review I] [7/25] x86: revert x86_64-mm-fix-the-irqbalance-quirk-for-e7320-e7520-e7525 Andi Kleen
@ 2007-02-10 11:50 ` Andi Kleen
  2007-02-10 11:50 ` [PATCH 2.6.21 review I] [9/25] x86_64: always use physical delivery mode on > 8 CPUs Andi Kleen
                   ` (16 subsequent siblings)
  23 siblings, 0 replies; 43+ messages in thread
From: Andi Kleen @ 2007-02-10 11:50 UTC (permalink / raw)
  To: Ingo Molnar, Suresh Siddha, Andi Kleen, Li, Shaohua,
	Eric W. Biederman, patches, linux-kernel


From: Ingo Molnar <mingo@elte.hu>

Fix a couple of inconsistencies/problems I found while reviewing the x86_64
genapic code (when I was chasing mysterious eth0 timeouts that would only
trigger if CPU_HOTPLUG is enabled):

 - AMD systems defaulted to the slower flat-physical mode instead
   of the flat-logical mode. The only restriction on AMD systems
   is that they should not use clustered APIC mode.

 - removed the CPU hotplug hacks, switching the default for small
   systems back from phys-flat to logical-flat. The switching to logical
   flat mode on small systems fixed sporadic ethernet driver timeouts i
   was getting on a dual-core Athlon64 system:

    NETDEV WATCHDOG: eth0: transmit timed out
    eth0: Transmit timeout, status 0c 0005 c07f media 80.
    eth0: Tx queue start entry 32  dirty entry 28.
    eth0:  Tx descriptor 0 is 0008a04a. (queue head)
    eth0:  Tx descriptor 1 is 0008a04a.
    eth0:  Tx descriptor 2 is 0008a04a.
    eth0:  Tx descriptor 3 is 0008a04a.
    eth0: link up, 100Mbps, full-duplex, lpa 0xC5E1

 - The use of '<= 8' was a bug by itself (the valid APIC ids
   for logical flat mode go from 0 to 7, not 0 to 8). The new logic
   is to use logical flat mode on both AMD and Intel systems, and
   to only switch to physical mode when logical mode cannot be used.
   If CPU hotplug is racy wrt. APIC shutdown then CPU hotplug needs
   fixing, not the whole IRQ system be made inconsistent and slowed
   down.

 - minor cleanups: simplified some code constructs

build & booted on a couple of AMD and Intel SMP systems.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: Andi Kleen <ak@suse.de>
Cc: "Li, Shaohua" <shaohua.li@intel.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
---

 arch/x86_64/kernel/genapic.c |   39 +++++++++++++++------------------------
 1 file changed, 15 insertions(+), 24 deletions(-)

Index: linux/arch/x86_64/kernel/genapic.c
===================================================================
--- linux.orig/arch/x86_64/kernel/genapic.c
+++ linux/arch/x86_64/kernel/genapic.c
@@ -32,21 +32,20 @@ extern struct genapic apic_cluster;
 extern struct genapic apic_flat;
 extern struct genapic apic_physflat;
 
-struct genapic *genapic = &apic_flat;
-
+struct genapic __read_mostly *genapic = &apic_flat;
 
 /*
  * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode.
  */
 void __init clustered_apic_check(void)
 {
-	long i;
+	int i;
 	u8 clusters, max_cluster;
 	u8 id;
 	u8 cluster_cnt[NUM_APIC_CLUSTERS];
 	int max_apic = 0;
 
-#if defined(CONFIG_ACPI)
+#ifdef CONFIG_ACPI
 	/*
 	 * Some x86_64 machines use physical APIC mode regardless of how many
 	 * procs/clusters are present (x86_64 ES7000 is an example).
@@ -68,20 +67,17 @@ void __init clustered_apic_check(void)
 		cluster_cnt[APIC_CLUSTERID(id)]++;
 	}
 
-	/* Don't use clustered mode on AMD platforms. */
+	/*
+	 * Don't use clustered mode on AMD platforms, default
+	 * to flat logical mode.
+	 */
  	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
-		genapic = &apic_physflat;
-#ifndef CONFIG_HOTPLUG_CPU
-		/* In the CPU hotplug case we cannot use broadcast mode
-		   because that opens a race when a CPU is removed.
-		   Stay at physflat mode in this case.
-		   It is bad to do this unconditionally though. Once
-		   we have ACPI platform support for CPU hotplug
-		   we should detect hotplug capablity from ACPI tables and
-		   only do this when really needed. -AK */
-		if (max_apic <= 8)
-			genapic = &apic_flat;
-#endif
+		/*
+		 * Switch to physical flat mode if more than 8 APICs
+		 * (In the case of 8 CPUs APIC ID goes from 0 to 7):
+		 */
+		if (max_apic >= 8)
+			genapic = &apic_physflat;
  		goto print;
  	}
 
@@ -103,14 +99,9 @@ void __init clustered_apic_check(void)
 	 * (We don't use lowest priority delivery + HW APIC IRQ steering, so
 	 * can ignore the clustered logical case and go straight to physical.)
 	 */
-	if (clusters <= 1 && max_cluster <= 8 && cluster_cnt[0] == max_cluster) {
-#ifdef CONFIG_HOTPLUG_CPU
-		/* Don't use APIC shortcuts in CPU hotplug to avoid races */
-		genapic = &apic_physflat;
-#else
+	if (clusters <= 1 && max_cluster <= 8 && cluster_cnt[0] == max_cluster)
 		genapic = &apic_flat;
-#endif
-	} else
+	else
 		genapic = &apic_cluster;
 
 print:

^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCH 2.6.21 review I] [9/25] x86_64: always use physical delivery mode on > 8 CPUs
  2007-02-10 11:50 [PATCH 2.6.21 review I] [1/25] x86_64: Add __copy_from_user_nocache Andi Kleen
                   ` (6 preceding siblings ...)
  2007-02-10 11:50 ` [PATCH 2.6.21 review I] [8/25] x86_64: optimize & fix APIC mode setup Andi Kleen
@ 2007-02-10 11:50 ` Andi Kleen
  2007-02-10 11:50 ` [PATCH 2.6.21 review I] [10/25] x86_64: remove clustered APIC mode Andi Kleen
                   ` (15 subsequent siblings)
  23 siblings, 0 replies; 43+ messages in thread
From: Andi Kleen @ 2007-02-10 11:50 UTC (permalink / raw)
  To: Ingo Molnar, Suresh Siddha, Andi Kleen, Li, Shaohua,
	Eric W. Biederman, patches, linux-kernel


From: Ingo Molnar <mingo@elte.hu>

Remove clustered APIC mode.  There's little point in the use of clustered APIC
mode, broadcasting is limited to within the cluster only, and chipsets have
bugs in this area as well.  So default to physical APIC mode when the CPU
count is large, and default to logical APIC mode when the CPU count is 8 or
smaller.

(this patch only removes the use of genapic_cluster and cleans up the
resulting genapic.c file - removal of all remaining traces of clustered
mode will be done by another patch.)

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: Andi Kleen <ak@suse.de>
Cc: "Li, Shaohua" <shaohua.li@intel.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
---

 arch/x86_64/kernel/genapic.c |   71 +++++++++----------------------------------
 include/asm-x86_64/genapic.h |    4 +-
 2 files changed, 18 insertions(+), 57 deletions(-)

Index: linux/arch/x86_64/kernel/genapic.c
===================================================================
--- linux.orig/arch/x86_64/kernel/genapic.c
+++ linux/arch/x86_64/kernel/genapic.c
@@ -11,26 +11,24 @@
 #include <linux/threads.h>
 #include <linux/cpumask.h>
 #include <linux/string.h>
+#include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/ctype.h>
 #include <linux/init.h>
-#include <linux/module.h>
 
 #include <asm/smp.h>
 #include <asm/ipi.h>
 
-#if defined(CONFIG_ACPI)
+#ifdef CONFIG_ACPI
 #include <acpi/acpi_bus.h>
 #endif
 
 /* which logical CPU number maps to which CPU (physical APIC ID) */
-u8 x86_cpu_to_apicid[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID };
+u8 x86_cpu_to_apicid[NR_CPUS] __read_mostly
+					= { [0 ... NR_CPUS-1] = BAD_APICID };
 EXPORT_SYMBOL(x86_cpu_to_apicid);
-u8 x86_cpu_to_log_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
 
-extern struct genapic apic_cluster;
-extern struct genapic apic_flat;
-extern struct genapic apic_physflat;
+u8 x86_cpu_to_log_apicid[NR_CPUS]	= { [0 ... NR_CPUS-1] = BAD_APICID };
 
 struct genapic __read_mostly *genapic = &apic_flat;
 
@@ -39,76 +37,37 @@ struct genapic __read_mostly *genapic = 
  */
 void __init clustered_apic_check(void)
 {
-	int i;
-	u8 clusters, max_cluster;
+	unsigned int i, max_apic = 0;
 	u8 id;
-	u8 cluster_cnt[NUM_APIC_CLUSTERS];
-	int max_apic = 0;
 
 #ifdef CONFIG_ACPI
 	/*
-	 * Some x86_64 machines use physical APIC mode regardless of how many
-	 * procs/clusters are present (x86_64 ES7000 is an example).
+	 * Quirk: some x86_64 machines can only use physical APIC mode
+	 * regardless of how many processors are present (x86_64 ES7000
+	 * is an example).
 	 */
-	if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID)
-		if (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL) {
-			genapic = &apic_cluster;
-			goto print;
-		}
+	if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID &&
+			(acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL))
+		genapic = &apic_physflat;
 #endif
 
-	memset(cluster_cnt, 0, sizeof(cluster_cnt));
 	for (i = 0; i < NR_CPUS; i++) {
 		id = bios_cpu_apicid[i];
 		if (id == BAD_APICID)
 			continue;
 		if (id > max_apic)
 			max_apic = id;
-		cluster_cnt[APIC_CLUSTERID(id)]++;
 	}
 
-	/*
-	 * Don't use clustered mode on AMD platforms, default
-	 * to flat logical mode.
-	 */
- 	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
-		/*
-		 * Switch to physical flat mode if more than 8 APICs
-		 * (In the case of 8 CPUs APIC ID goes from 0 to 7):
-		 */
-		if (max_apic >= 8)
-			genapic = &apic_physflat;
- 		goto print;
- 	}
-
-	clusters = 0;
-	max_cluster = 0;
-
-	for (i = 0; i < NUM_APIC_CLUSTERS; i++) {
-		if (cluster_cnt[i] > 0) {
-			++clusters;
-			if (cluster_cnt[i] > max_cluster)
-				max_cluster = cluster_cnt[i];
-		}
-	}
-
-	/*
-	 * If we have clusters <= 1 and CPUs <= 8 in cluster 0, then flat mode,
-	 * else if max_cluster <= 4 and cluster_cnt[15] == 0, clustered logical
-	 * else physical mode.
-	 * (We don't use lowest priority delivery + HW APIC IRQ steering, so
-	 * can ignore the clustered logical case and go straight to physical.)
-	 */
-	if (clusters <= 1 && max_cluster <= 8 && cluster_cnt[0] == max_cluster)
+	if (max_apic < 8)
 		genapic = &apic_flat;
 	else
-		genapic = &apic_cluster;
+		genapic = &apic_physflat;
 
-print:
 	printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name);
 }
 
-/* Same for both flat and clustered. */
+/* Same for both flat and physical. */
 
 void send_IPI_self(int vector)
 {
Index: linux/include/asm-x86_64/genapic.h
===================================================================
--- linux.orig/include/asm-x86_64/genapic.h
+++ linux/include/asm-x86_64/genapic.h
@@ -29,7 +29,9 @@ struct genapic {
 	unsigned int (*phys_pkg_id)(int index_msb);
 };
 
-
 extern struct genapic *genapic;
 
+extern struct genapic apic_flat;
+extern struct genapic apic_physflat;
+
 #endif

^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCH 2.6.21 review I] [10/25] x86_64: remove clustered APIC mode
  2007-02-10 11:50 [PATCH 2.6.21 review I] [1/25] x86_64: Add __copy_from_user_nocache Andi Kleen
                   ` (7 preceding siblings ...)
  2007-02-10 11:50 ` [PATCH 2.6.21 review I] [9/25] x86_64: always use physical delivery mode on > 8 CPUs Andi Kleen
@ 2007-02-10 11:50 ` Andi Kleen
  2007-02-10 11:50 ` [PATCH 2.6.21 review I] [11/25] x86: default to physical mode on hotplug CPU kernels Andi Kleen
                   ` (14 subsequent siblings)
  23 siblings, 0 replies; 43+ messages in thread
From: Andi Kleen @ 2007-02-10 11:50 UTC (permalink / raw)
  To: Ingo Molnar, Suresh Siddha, Andi Kleen, Li, Shaohua,
	Eric W. Biederman, patches, linux-kernel


From: Ingo Molnar <mingo@elte.hu>

Remove now unused clustered APIC mode code.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: Andi Kleen <ak@suse.de>
Cc: "Li, Shaohua" <shaohua.li@intel.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
---

 arch/x86_64/kernel/Makefile          |    3 
 arch/x86_64/kernel/genapic_cluster.c |  137 -----------------------------------
 2 files changed, 1 insertion(+), 139 deletions(-)

Index: linux/arch/x86_64/kernel/Makefile
===================================================================
--- linux.orig/arch/x86_64/kernel/Makefile
+++ linux/arch/x86_64/kernel/Makefile
@@ -21,8 +21,7 @@ obj-$(CONFIG_MICROCODE)		+= microcode.o
 obj-$(CONFIG_X86_CPUID)		+= cpuid.o
 obj-$(CONFIG_SMP)		+= smp.o smpboot.o trampoline.o
 obj-y				+= apic.o  nmi.o
-obj-y				+= io_apic.o mpparse.o \
-		genapic.o genapic_cluster.o genapic_flat.o
+obj-y				+= io_apic.o mpparse.o genapic.o genapic_flat.o
 obj-$(CONFIG_KEXEC)		+= machine_kexec.o relocate_kernel.o crash.o
 obj-$(CONFIG_CRASH_DUMP)	+= crash_dump.o
 obj-$(CONFIG_PM)		+= suspend.o
Index: linux/arch/x86_64/kernel/genapic_cluster.c
===================================================================
--- linux.orig/arch/x86_64/kernel/genapic_cluster.c
+++ /dev/null
@@ -1,137 +0,0 @@
-/*
- * Copyright 2004 James Cleverdon, IBM.
- * Subject to the GNU Public License, v.2
- *
- * Clustered APIC subarch code.  Up to 255 CPUs, physical delivery.
- * (A more realistic maximum is around 230 CPUs.)
- *
- * Hacked for x86-64 by James Cleverdon from i386 architecture code by
- * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
- * James Cleverdon.
- */
-#include <linux/threads.h>
-#include <linux/cpumask.h>
-#include <linux/string.h>
-#include <linux/kernel.h>
-#include <linux/ctype.h>
-#include <linux/init.h>
-#include <asm/smp.h>
-#include <asm/ipi.h>
-
-
-/*
- * Set up the logical destination ID.
- *
- * Intel recommends to set DFR, LDR and TPR before enabling
- * an APIC.  See e.g. "AP-388 82489DX User's Manual" (Intel
- * document number 292116).  So here it goes...
- */
-static void cluster_init_apic_ldr(void)
-{
-	unsigned long val, id;
-	long i, count;
-	u8 lid;
-	u8 my_id = hard_smp_processor_id();
-	u8 my_cluster = APIC_CLUSTER(my_id);
-
-	/* Create logical APIC IDs by counting CPUs already in cluster. */
-	for (count = 0, i = NR_CPUS; --i >= 0; ) {
-		lid = x86_cpu_to_log_apicid[i];
-		if (lid != BAD_APICID && APIC_CLUSTER(lid) == my_cluster)
-			++count;
-	}
-	/*
-	 * We only have a 4 wide bitmap in cluster mode.  There's no way
-	 * to get above 60 CPUs and still give each one it's own bit.
-	 * But, we're using physical IRQ delivery, so we don't care.
-	 * Use bit 3 for the 4th through Nth CPU in each cluster.
-	 */
-	if (count >= XAPIC_DEST_CPUS_SHIFT)
-		count = 3;
-	id = my_cluster | (1UL << count);
-	x86_cpu_to_log_apicid[smp_processor_id()] = id;
-	apic_write(APIC_DFR, APIC_DFR_CLUSTER);
-	val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
-	val |= SET_APIC_LOGICAL_ID(id);
-	apic_write(APIC_LDR, val);
-}
-
-/* Start with all IRQs pointing to boot CPU.  IRQ balancing will shift them. */
-
-static cpumask_t cluster_target_cpus(void)
-{
-	return cpumask_of_cpu(0);
-}
-
-static cpumask_t cluster_vector_allocation_domain(int cpu)
-{
-	cpumask_t domain = CPU_MASK_NONE;
-	cpu_set(cpu, domain);
-	return domain;
-}
-
-static void cluster_send_IPI_mask(cpumask_t mask, int vector)
-{
-	send_IPI_mask_sequence(mask, vector);
-}
-
-static void cluster_send_IPI_allbutself(int vector)
-{
-	cpumask_t mask = cpu_online_map;
-
-	cpu_clear(smp_processor_id(), mask);
-
-	if (!cpus_empty(mask))
-		cluster_send_IPI_mask(mask, vector);
-}
-
-static void cluster_send_IPI_all(int vector)
-{
-	cluster_send_IPI_mask(cpu_online_map, vector);
-}
-
-static int cluster_apic_id_registered(void)
-{
-	return 1;
-}
-
-static unsigned int cluster_cpu_mask_to_apicid(cpumask_t cpumask)
-{
-	int cpu;
-
-	/*
-	 * We're using fixed IRQ delivery, can only return one phys APIC ID.
-	 * May as well be the first.
-	 */
-	cpu = first_cpu(cpumask);
-	if ((unsigned)cpu < NR_CPUS)
-		return x86_cpu_to_apicid[cpu];
-	else
-		return BAD_APICID;
-}
-
-/* cpuid returns the value latched in the HW at reset, not the APIC ID
- * register's value.  For any box whose BIOS changes APIC IDs, like
- * clustered APIC systems, we must use hard_smp_processor_id.
- *
- * See Intel's IA-32 SW Dev's Manual Vol2 under CPUID.
- */
-static unsigned int phys_pkg_id(int index_msb)
-{
-	return hard_smp_processor_id() >> index_msb;
-}
-
-struct genapic apic_cluster = {
-	.name = "clustered",
-	.int_delivery_mode = dest_Fixed,
-	.int_dest_mode = (APIC_DEST_PHYSICAL != 0),
-	.target_cpus = cluster_target_cpus,
-	.vector_allocation_domain = cluster_vector_allocation_domain,
-	.apic_id_registered = cluster_apic_id_registered,
-	.init_apic_ldr = cluster_init_apic_ldr,
-	.send_IPI_all = cluster_send_IPI_all,
-	.send_IPI_allbutself = cluster_send_IPI_allbutself,
-	.send_IPI_mask = cluster_send_IPI_mask,
-	.cpu_mask_to_apicid = cluster_cpu_mask_to_apicid,
-	.phys_pkg_id = phys_pkg_id,
-};

^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCH 2.6.21 review I] [11/25] x86: default to physical mode on hotplug CPU kernels
  2007-02-10 11:50 [PATCH 2.6.21 review I] [1/25] x86_64: Add __copy_from_user_nocache Andi Kleen
                   ` (8 preceding siblings ...)
  2007-02-10 11:50 ` [PATCH 2.6.21 review I] [10/25] x86_64: remove clustered APIC mode Andi Kleen
@ 2007-02-10 11:50 ` Andi Kleen
  2007-02-11 11:13   ` Eric W. Biederman
  2007-02-10 11:50 ` [PATCH 2.6.21 review I] [12/25] x86_64: x86_64-make-the-numa-hash-function-nodemap-allocation fix fix Andi Kleen
                   ` (13 subsequent siblings)
  23 siblings, 1 reply; 43+ messages in thread
From: Andi Kleen @ 2007-02-10 11:50 UTC (permalink / raw)
  To: Ingo Molnar, Suresh Siddha, Andi Kleen, Li, Shaohua,
	Eric W. Biederman, patches, linux-kernel


From: Ingo Molnar <mingo@elte.hu>

Default to physical mode on hotplug CPU kernels.  Furher simplify and clean up
the APIC initialization code.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: Andi Kleen <ak@suse.de>
Cc: "Li, Shaohua" <shaohua.li@intel.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
---

 arch/i386/kernel/acpi/boot.c              |    2 +-
 arch/i386/kernel/mpparse.c                |    2 +-
 arch/x86_64/kernel/genapic.c              |   16 +++-------------
 arch/x86_64/kernel/mpparse.c              |    2 +-
 include/asm-i386/genapic.h                |    4 ++--
 include/asm-i386/mach-bigsmp/mach_apic.h  |    2 +-
 include/asm-i386/mach-default/mach_apic.h |    2 +-
 include/asm-i386/mach-es7000/mach_apic.h  |    2 +-
 include/asm-i386/mach-generic/mach_apic.h |    2 +-
 include/asm-i386/mach-numaq/mach_apic.h   |    2 +-
 include/asm-i386/mach-summit/mach_apic.h  |    2 +-
 include/asm-i386/mach-visws/mach_apic.h   |    2 +-
 include/asm-x86_64/apic.h                 |    2 +-
 13 files changed, 16 insertions(+), 26 deletions(-)

Index: linux/arch/i386/kernel/acpi/boot.c
===================================================================
--- linux.orig/arch/i386/kernel/acpi/boot.c
+++ linux/arch/i386/kernel/acpi/boot.c
@@ -890,7 +890,7 @@ static void __init acpi_process_madt(voi
 				acpi_ioapic = 1;
 
 				smp_found_config = 1;
-				clustered_apic_check();
+				setup_apic_routing();
 			}
 		}
 		if (error == -EINVAL) {
Index: linux/arch/i386/kernel/mpparse.c
===================================================================
--- linux.orig/arch/i386/kernel/mpparse.c
+++ linux/arch/i386/kernel/mpparse.c
@@ -477,7 +477,7 @@ static int __init smp_read_mpc(struct mp
 		}
 		++mpc_record;
 	}
-	clustered_apic_check();
+	setup_apic_routing();
 	if (!num_processors)
 		printk(KERN_ERR "SMP mptable: no processors registered!\n");
 	return num_processors;
Index: linux/arch/x86_64/kernel/genapic.c
===================================================================
--- linux.orig/arch/x86_64/kernel/genapic.c
+++ linux/arch/x86_64/kernel/genapic.c
@@ -35,11 +35,8 @@ struct genapic __read_mostly *genapic = 
 /*
  * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode.
  */
-void __init clustered_apic_check(void)
+void __init setup_apic_routing(void)
 {
-	unsigned int i, max_apic = 0;
-	u8 id;
-
 #ifdef CONFIG_ACPI
 	/*
 	 * Quirk: some x86_64 machines can only use physical APIC mode
@@ -49,17 +46,10 @@ void __init clustered_apic_check(void)
 	if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID &&
 			(acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL))
 		genapic = &apic_physflat;
+	else
 #endif
 
-	for (i = 0; i < NR_CPUS; i++) {
-		id = bios_cpu_apicid[i];
-		if (id == BAD_APICID)
-			continue;
-		if (id > max_apic)
-			max_apic = id;
-	}
-
-	if (max_apic < 8)
+	if (cpus_weight(cpu_possible_map) <= 8)
 		genapic = &apic_flat;
 	else
 		genapic = &apic_physflat;
Index: linux/arch/x86_64/kernel/mpparse.c
===================================================================
--- linux.orig/arch/x86_64/kernel/mpparse.c
+++ linux/arch/x86_64/kernel/mpparse.c
@@ -300,7 +300,7 @@ static int __init smp_read_mpc(struct mp
 			}
 		}
 	}
-	clustered_apic_check();
+	setup_apic_routing();
 	if (!num_processors)
 		printk(KERN_ERR "MPTABLE: no processors registered!\n");
 	return num_processors;
Index: linux/include/asm-i386/genapic.h
===================================================================
--- linux.orig/include/asm-i386/genapic.h
+++ linux/include/asm-i386/genapic.h
@@ -36,7 +36,7 @@ struct genapic { 
 	void (*init_apic_ldr)(void);
 	physid_mask_t (*ioapic_phys_id_map)(physid_mask_t map);
 
-	void (*clustered_apic_check)(void);
+	void (*setup_apic_routing)(void);
 	int (*multi_timer_check)(int apic, int irq);
 	int (*apicid_to_node)(int logical_apicid); 
 	int (*cpu_to_logical_apicid)(int cpu);
@@ -99,7 +99,7 @@ struct genapic { 
 	APICFUNC(check_apicid_present) \
 	APICFUNC(init_apic_ldr) \
 	APICFUNC(ioapic_phys_id_map) \
-	APICFUNC(clustered_apic_check) \
+	APICFUNC(setup_apic_routing) \
 	APICFUNC(multi_timer_check) \
 	APICFUNC(apicid_to_node) \
 	APICFUNC(cpu_to_logical_apicid) \
Index: linux/include/asm-i386/mach-bigsmp/mach_apic.h
===================================================================
--- linux.orig/include/asm-i386/mach-bigsmp/mach_apic.h
+++ linux/include/asm-i386/mach-bigsmp/mach_apic.h
@@ -71,7 +71,7 @@ static inline void init_apic_ldr(void)
 	apic_write_around(APIC_LDR, val);
 }
 
-static inline void clustered_apic_check(void)
+static inline void setup_apic_routing(void)
 {
 	printk("Enabling APIC mode:  %s.  Using %d I/O APICs\n",
 		"Physflat", nr_ioapics);
Index: linux/include/asm-i386/mach-default/mach_apic.h
===================================================================
--- linux.orig/include/asm-i386/mach-default/mach_apic.h
+++ linux/include/asm-i386/mach-default/mach_apic.h
@@ -54,7 +54,7 @@ static inline physid_mask_t ioapic_phys_
 	return phys_map;
 }
 
-static inline void clustered_apic_check(void)
+static inline void setup_apic_routing(void)
 {
 	printk("Enabling APIC mode:  %s.  Using %d I/O APICs\n",
 					"Flat", nr_ioapics);
Index: linux/include/asm-i386/mach-es7000/mach_apic.h
===================================================================
--- linux.orig/include/asm-i386/mach-es7000/mach_apic.h
+++ linux/include/asm-i386/mach-es7000/mach_apic.h
@@ -81,7 +81,7 @@ static inline void enable_apic_mode(void
 }
 
 extern int apic_version [MAX_APICS];
-static inline void clustered_apic_check(void)
+static inline void setup_apic_routing(void)
 {
 	int apic = bios_cpu_apicid[smp_processor_id()];
 	printk("Enabling APIC mode:  %s.  Using %d I/O APICs, target cpus %lx\n",
Index: linux/include/asm-i386/mach-generic/mach_apic.h
===================================================================
--- linux.orig/include/asm-i386/mach-generic/mach_apic.h
+++ linux/include/asm-i386/mach-generic/mach_apic.h
@@ -13,7 +13,7 @@
 #define apic_id_registered (genapic->apic_id_registered)
 #define init_apic_ldr (genapic->init_apic_ldr)
 #define ioapic_phys_id_map (genapic->ioapic_phys_id_map)
-#define clustered_apic_check (genapic->clustered_apic_check) 
+#define setup_apic_routing (genapic->setup_apic_routing)
 #define multi_timer_check (genapic->multi_timer_check)
 #define apicid_to_node (genapic->apicid_to_node)
 #define cpu_to_logical_apicid (genapic->cpu_to_logical_apicid) 
Index: linux/include/asm-i386/mach-numaq/mach_apic.h
===================================================================
--- linux.orig/include/asm-i386/mach-numaq/mach_apic.h
+++ linux/include/asm-i386/mach-numaq/mach_apic.h
@@ -34,7 +34,7 @@ static inline void init_apic_ldr(void)
 	/* Already done in NUMA-Q firmware */
 }
 
-static inline void clustered_apic_check(void)
+static inline void setup_apic_routing(void)
 {
 	printk("Enabling APIC mode:  %s.  Using %d I/O APICs\n",
 		"NUMA-Q", nr_ioapics);
Index: linux/include/asm-i386/mach-summit/mach_apic.h
===================================================================
--- linux.orig/include/asm-i386/mach-summit/mach_apic.h
+++ linux/include/asm-i386/mach-summit/mach_apic.h
@@ -80,7 +80,7 @@ static inline int apic_id_registered(voi
 	return 1;
 }
 
-static inline void clustered_apic_check(void)
+static inline void setup_apic_routing(void)
 {
 	printk("Enabling APIC mode:  Summit.  Using %d I/O APICs\n",
 						nr_ioapics);
Index: linux/include/asm-i386/mach-visws/mach_apic.h
===================================================================
--- linux.orig/include/asm-i386/mach-visws/mach_apic.h
+++ linux/include/asm-i386/mach-visws/mach_apic.h
@@ -47,7 +47,7 @@ static inline void summit_check(char *oe
 {
 }
 
-static inline void clustered_apic_check(void)
+static inline void setup_apic_routing(void)
 {
 }
 
Index: linux/include/asm-x86_64/apic.h
===================================================================
--- linux.orig/include/asm-x86_64/apic.h
+++ linux/include/asm-x86_64/apic.h
@@ -83,7 +83,7 @@ extern void setup_secondary_APIC_clock (
 extern int APIC_init_uniprocessor (void);
 extern void disable_APIC_timer(void);
 extern void enable_APIC_timer(void);
-extern void clustered_apic_check(void);
+extern void setup_apic_routing(void);
 
 extern void setup_APIC_extened_lvt(unsigned char lvt_off, unsigned char vector,
 				   unsigned char msg_type, unsigned char mask);

^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCH 2.6.21 review I] [12/25] x86_64: x86_64-make-the-numa-hash-function-nodemap-allocation fix fix
  2007-02-10 11:50 [PATCH 2.6.21 review I] [1/25] x86_64: Add __copy_from_user_nocache Andi Kleen
                   ` (9 preceding siblings ...)
  2007-02-10 11:50 ` [PATCH 2.6.21 review I] [11/25] x86: default to physical mode on hotplug CPU kernels Andi Kleen
@ 2007-02-10 11:50 ` Andi Kleen
  2007-02-10 11:50 ` [PATCH 2.6.21 review I] [13/25] i386: Fix a typo in an IRQ handler name Andi Kleen
                   ` (12 subsequent siblings)
  23 siblings, 0 replies; 43+ messages in thread
From: Andi Kleen @ 2007-02-10 11:50 UTC (permalink / raw)
  To: Amul Shah, Andi Kleen, Rohit Seth, patches, linux-kernel


From: Amul Shah <amul.shah@unisys.com>

- Removed an extraneous debug message from allocate_cachealigned_map

- Changed extract_lsb_from_nodes to return 63 for the case where there was
  only one memory node.  The prevents the creation of the dynamic hashmap.

- Changed extract_lsb_from_nodes to use only the starting memory address of
  a node.  On an ES7000, our nodes overlap the starting and ending address,
  meaning, that we see nodes like

	00000 - 10000
	10000 - 20000

  But other systems have nodes whose start and end addresses do not overlap.
   For example:

	00000 - 0FFFF
	10000 - 1FFFF

  In this case, using the ending address will result in an LSB much lower
  than what is possible.  In this case an LSB of 1 when in reality it should
  be 16.  

Cc: Andi Kleen <ak@suse.de>
Cc: Rohit Seth <rohitseth@google.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Andi Kleen <ak@suse.de>

---

 arch/x86_64/mm/numa.c |   15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

Index: linux/arch/x86_64/mm/numa.c
===================================================================
--- linux.orig/arch/x86_64/mm/numa.c
+++ linux/arch/x86_64/mm/numa.c
@@ -78,11 +78,8 @@ static int __init allocate_cachealigned_
 	unsigned long pad, pad_addr;
 
 	memnodemap = memnode.embedded_map;
-	if (memnodemapsize <= 48) {
-		printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n",
-		       nodemap_addr, nodemap_addr + nodemap_size);
+	if (memnodemapsize <= 48)
 		return 0;
-	}
 
 	pad = L1_CACHE_BYTES - 1;
 	pad_addr = 0x8000;
@@ -110,7 +107,7 @@ static int __init allocate_cachealigned_
 static int __init
 extract_lsb_from_nodes (const struct bootnode *nodes, int numnodes)
 {
-	int i;
+	int i, nodes_used = 0;
 	unsigned long start, end;
 	unsigned long bitfield = 0, memtop = 0;
 
@@ -119,11 +116,15 @@ extract_lsb_from_nodes (const struct boo
 		end = nodes[i].end;
 		if (start >= end)
 			continue;
-		bitfield |= start | end;
+		bitfield |= start;
+		nodes_used++;
 		if (end > memtop)
 			memtop = end;
 	}
-	i = find_first_bit(&bitfield, sizeof(unsigned long)*8);
+	if (nodes_used <= 1)
+		i = 63;
+	else
+		i = find_first_bit(&bitfield, sizeof(unsigned long)*8);
 	memnodemapsize = (memtop >> i)+1;
 	return i;
 }

^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCH 2.6.21 review I] [13/25] i386: Fix a typo in an IRQ handler name
  2007-02-10 11:50 [PATCH 2.6.21 review I] [1/25] x86_64: Add __copy_from_user_nocache Andi Kleen
                   ` (10 preceding siblings ...)
  2007-02-10 11:50 ` [PATCH 2.6.21 review I] [12/25] x86_64: x86_64-make-the-numa-hash-function-nodemap-allocation fix fix Andi Kleen
@ 2007-02-10 11:50 ` Andi Kleen
  2007-02-10 11:50 ` [PATCH 2.6.21 review I] [14/25] x86: Share what's shareable Andi Kleen
                   ` (11 subsequent siblings)
  23 siblings, 0 replies; 43+ messages in thread
From: Andi Kleen @ 2007-02-10 11:50 UTC (permalink / raw)
  To: Maciej W. Rozycki, Andi Kleen, patches, linux-kernel


From: "Maciej W. Rozycki" <macro@linux-mips.org>

The "fasteoi" IRQ handler is named "fasteio" incorrectly.  This is a fix.

Signed-off-by: Maciej W. Rozycki <macro@linux-mips.org>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
---

 arch/i386/kernel/io_apic.c |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

Index: linux/arch/i386/kernel/io_apic.c
===================================================================
--- linux.orig/arch/i386/kernel/io_apic.c
+++ linux/arch/i386/kernel/io_apic.c
@@ -2310,7 +2310,7 @@ static inline void __init check_timer(vo
 
 	disable_8259A_irq(0);
 	set_irq_chip_and_handler_name(0, &lapic_chip, handle_fasteoi_irq,
-				      "fasteio");
+				      "fasteoi");
 	apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector);	/* Fixed mode */
 	enable_8259A_irq(0);
 

^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCH 2.6.21 review I] [14/25] x86: Share what's shareable.
  2007-02-10 11:50 [PATCH 2.6.21 review I] [1/25] x86_64: Add __copy_from_user_nocache Andi Kleen
                   ` (11 preceding siblings ...)
  2007-02-10 11:50 ` [PATCH 2.6.21 review I] [13/25] i386: Fix a typo in an IRQ handler name Andi Kleen
@ 2007-02-10 11:50 ` Andi Kleen
  2007-02-10 11:50 ` [PATCH 2.6.21 review I] [15/25] i386: Only call unreachable_devices() when type 1 is available Andi Kleen
                   ` (10 subsequent siblings)
  23 siblings, 0 replies; 43+ messages in thread
From: Andi Kleen @ 2007-02-10 11:50 UTC (permalink / raw)
  To: Olivier Galibert, Andi Kleen, patches, linux-kernel


From: Olivier Galibert <galibert@pobox.com>

i386 and x86-64 pci mmconfig code have a lot in common.  So share what's
shareable between the two.

Signed-off-by: Olivier Galibert <galibert@pobox.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
---

 arch/i386/pci/Makefile          |    2 
 arch/i386/pci/mmconfig-shared.c |   86 ++++++++++++++++++++++++++++++++++++++++
 arch/i386/pci/mmconfig.c        |   74 +---------------------------------
 arch/i386/pci/pci.h             |    6 ++
 arch/x86_64/pci/Makefile        |    3 -
 arch/x86_64/pci/mmconfig.c      |   76 +++++------------------------------
 6 files changed, 111 insertions(+), 136 deletions(-)

Index: linux/arch/i386/pci/Makefile
===================================================================
--- linux.orig/arch/i386/pci/Makefile
+++ linux/arch/i386/pci/Makefile
@@ -1,7 +1,7 @@
 obj-y				:= i386.o init.o
 
 obj-$(CONFIG_PCI_BIOS)		+= pcbios.o
-obj-$(CONFIG_PCI_MMCONFIG)	+= mmconfig.o direct.o
+obj-$(CONFIG_PCI_MMCONFIG)	+= mmconfig.o direct.o mmconfig-shared.o
 obj-$(CONFIG_PCI_DIRECT)	+= direct.o
 
 pci-y				:= fixup.o
Index: linux/arch/i386/pci/mmconfig-shared.c
===================================================================
--- /dev/null
+++ linux/arch/i386/pci/mmconfig-shared.c
@@ -0,0 +1,86 @@
+/*
+ * mmconfig-shared.c - Low-level direct PCI config space access via
+ *                     MMCONFIG - common code between i386 and x86-64.
+ *
+ * This code does:
+ * - ACPI decoding and validation
+ *
+ * Per-architecture code takes care of the mappings and accesses
+ * themselves.
+ */
+
+#include <linux/pci.h>
+#include <linux/init.h>
+#include <linux/acpi.h>
+#include <linux/bitmap.h>
+#include <asm/e820.h>
+
+#include "pci.h"
+
+/* aperture is up to 256MB but BIOS may reserve less */
+#define MMCONFIG_APER_MIN	(2 * 1024*1024)
+#define MMCONFIG_APER_MAX	(256 * 1024*1024)
+
+/* Verify the first 16 busses. We assume that systems with more busses
+   get MCFG right. */
+#define PCI_MMCFG_MAX_CHECK_BUS 16
+
+DECLARE_BITMAP(pci_mmcfg_fallback_slots, 32*PCI_MMCFG_MAX_CHECK_BUS);
+
+/* K8 systems have some devices (typically in the builtin northbridge)
+   that are only accessible using type1
+   Normally this can be expressed in the MCFG by not listing them
+   and assigning suitable _SEGs, but this isn't implemented in some BIOS.
+   Instead try to discover all devices on bus 0 that are unreachable using MM
+   and fallback for them. */
+static __init void unreachable_devices(void)
+{
+	int i, k;
+	/* Use the max bus number from ACPI here? */
+	for (k = 0; k < PCI_MMCFG_MAX_CHECK_BUS; k++) {
+		for (i = 0; i < 32; i++) {
+			u32 val1, val2;
+
+			pci_conf1_read(0, k, PCI_DEVFN(i,0), 0, 4, &val1);
+			if (val1 == 0xffffffff)
+				continue;
+
+			raw_pci_ops->read(0, k, PCI_DEVFN(i, 0), 0, 4, &val2);
+			if (val1 != val2) {
+				set_bit(i + 32*k, pci_mmcfg_fallback_slots);
+				printk(KERN_NOTICE "PCI: No mmconfig possible"
+				       " on device %02x:%02x\n", k, i);
+			}
+		}
+	}
+}
+
+void __init pci_mmcfg_init(int type)
+{
+	if ((pci_probe & PCI_PROBE_MMCONF) == 0)
+		return;
+
+	acpi_table_parse(ACPI_SIG_MCFG, acpi_parse_mcfg);
+
+	if ((pci_mmcfg_config_num == 0) ||
+	    (pci_mmcfg_config == NULL) ||
+	    (pci_mmcfg_config[0].address == 0))
+		return;
+
+	/* Only do this check when type 1 works. If it doesn't work
+           assume we run on a Mac and always use MCFG */
+	if (type == 1 &&
+	    !e820_all_mapped(pci_mmcfg_config[0].address,
+			     pci_mmcfg_config[0].address + MMCONFIG_APER_MIN,
+			     E820_RESERVED)) {
+		printk(KERN_ERR "PCI: BIOS Bug: MCFG area at %Lx is not E820-reserved\n",
+				pci_mmcfg_config[0].address);
+		printk(KERN_ERR "PCI: Not using MMCONFIG.\n");
+		return;
+	}
+
+	if (pci_mmcfg_arch_init()) {
+		unreachable_devices();
+		pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF;
+	}
+}
Index: linux/arch/i386/pci/mmconfig.c
===================================================================
--- linux.orig/arch/i386/pci/mmconfig.c
+++ linux/arch/i386/pci/mmconfig.c
@@ -15,21 +15,13 @@
 #include <asm/e820.h>
 #include "pci.h"
 
-/* aperture is up to 256MB but BIOS may reserve less */
-#define MMCONFIG_APER_MIN	(2 * 1024*1024)
-#define MMCONFIG_APER_MAX	(256 * 1024*1024)
-
 /* Assume systems with more busses have correct MCFG */
-#define MAX_CHECK_BUS 16
-
 #define mmcfg_virt_addr ((void __iomem *) fix_to_virt(FIX_PCIE_MCFG))
 
 /* The base address of the last MMCONFIG device accessed */
 static u32 mmcfg_last_accessed_device;
 static int mmcfg_last_accessed_cpu;
 
-static DECLARE_BITMAP(fallback_slots, MAX_CHECK_BUS*32);
-
 /*
  * Functions for accessing PCI configuration space with MMCONFIG accesses
  */
@@ -38,8 +30,8 @@ static u32 get_base_addr(unsigned int se
 	int cfg_num = -1;
 	struct acpi_mcfg_allocation *cfg;
 
-	if (seg == 0 && bus < MAX_CHECK_BUS &&
-	    test_bit(PCI_SLOT(devfn) + 32*bus, fallback_slots))
+	if (seg == 0 && bus < PCI_MMCFG_MAX_CHECK_BUS &&
+	    test_bit(PCI_SLOT(devfn) + 32*bus, pci_mmcfg_fallback_slots))
 		return 0;
 
 	while (1) {
@@ -158,67 +150,9 @@ static struct pci_raw_ops pci_mmcfg = {
 	.write =	pci_mmcfg_write,
 };
 
-/* K8 systems have some devices (typically in the builtin northbridge)
-   that are only accessible using type1
-   Normally this can be expressed in the MCFG by not listing them
-   and assigning suitable _SEGs, but this isn't implemented in some BIOS.
-   Instead try to discover all devices on bus 0 that are unreachable using MM
-   and fallback for them. */
-static __init void unreachable_devices(void)
-{
-	int i, k;
-	unsigned long flags;
-
-	for (k = 0; k < MAX_CHECK_BUS; k++) {
-		for (i = 0; i < 32; i++) {
-			u32 val1;
-			u32 addr;
-
-			pci_conf1_read(0, k, PCI_DEVFN(i, 0), 0, 4, &val1);
-			if (val1 == 0xffffffff)
-				continue;
-
-			/* Locking probably not needed, but safer */
-			spin_lock_irqsave(&pci_config_lock, flags);
-			addr = get_base_addr(0, k, PCI_DEVFN(i, 0));
-			if (addr != 0)
-				pci_exp_set_dev_base(addr, k, PCI_DEVFN(i, 0));
-			if (addr == 0 ||
-			    readl((u32 __iomem *)mmcfg_virt_addr) != val1) {
-				set_bit(i + 32*k, fallback_slots);
-				printk(KERN_NOTICE
-			"PCI: No mmconfig possible on %x:%x\n", k, i);
-			}
-			spin_unlock_irqrestore(&pci_config_lock, flags);
-		}
-	}
-}
-
-void __init pci_mmcfg_init(int type)
+int __init pci_mmcfg_arch_init(void)
 {
-	if ((pci_probe & PCI_PROBE_MMCONF) == 0)
-		return;
-
-	acpi_table_parse(ACPI_SIG_MCFG, acpi_parse_mcfg);
-	if ((pci_mmcfg_config_num == 0) ||
-	    (pci_mmcfg_config == NULL) ||
-	    (pci_mmcfg_config[0].address == 0))
-		return;
-
-	/* Only do this check when type 1 works. If it doesn't work
-	   assume we run on a Mac and always use MCFG */
-	if (type == 1 && !e820_all_mapped(pci_mmcfg_config[0].address,
-			pci_mmcfg_config[0].address + MMCONFIG_APER_MIN,
-			E820_RESERVED)) {
-		printk(KERN_ERR "PCI: BIOS Bug: MCFG area at %lx is not E820-reserved\n",
-				(unsigned long)pci_mmcfg_config[0].address);
-		printk(KERN_ERR "PCI: Not using MMCONFIG.\n");
-		return;
-	}
-
 	printk(KERN_INFO "PCI: Using MMCONFIG\n");
 	raw_pci_ops = &pci_mmcfg;
-	pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF;
-
-	unreachable_devices();
+	return 1;
 }
Index: linux/arch/i386/pci/pci.h
===================================================================
--- linux.orig/arch/i386/pci/pci.h
+++ linux/arch/i386/pci/pci.h
@@ -94,3 +94,9 @@ extern void pci_pcbios_init(void);
 extern void pci_mmcfg_init(int type);
 extern void pcibios_sort(void);
 
+/* pci-mmconfig.c */
+
+#define PCI_MMCFG_MAX_CHECK_BUS 16
+extern DECLARE_BITMAP(pci_mmcfg_fallback_slots, 32*PCI_MMCFG_MAX_CHECK_BUS);
+
+extern int pci_mmcfg_arch_init(void);
Index: linux/arch/x86_64/pci/Makefile
===================================================================
--- linux.orig/arch/x86_64/pci/Makefile
+++ linux/arch/x86_64/pci/Makefile
@@ -11,7 +11,7 @@ obj-y		+= fixup.o init.o
 obj-$(CONFIG_ACPI)	+= acpi.o
 obj-y			+= legacy.o irq.o common.o early.o
 # mmconfig has a 64bit special
-obj-$(CONFIG_PCI_MMCONFIG) += mmconfig.o direct.o
+obj-$(CONFIG_PCI_MMCONFIG) += mmconfig.o direct.o mmconfig-shared.o
 
 obj-$(CONFIG_NUMA)	+= k8-bus.o
 
@@ -24,3 +24,4 @@ fixup-y  += ../../i386/pci/fixup.o
 i386-y  += ../../i386/pci/i386.o
 init-y += ../../i386/pci/init.o
 early-y += ../../i386/pci/early.o
+mmconfig-shared-y += ../../i386/pci/mmconfig-shared.o
Index: linux/arch/x86_64/pci/mmconfig.c
===================================================================
--- linux.orig/arch/x86_64/pci/mmconfig.c
+++ linux/arch/x86_64/pci/mmconfig.c
@@ -19,9 +19,7 @@
 
 /* Verify the first 16 busses. We assume that systems with more busses
    get MCFG right. */
-#define MAX_CHECK_BUS 16
-
-static DECLARE_BITMAP(fallback_slots, 32*MAX_CHECK_BUS);
+#define PCI_MMCFG_MAX_CHECK_BUS 16
 
 /* Static virtual mapping of the MMCONFIG aperture */
 struct mmcfg_virt {
@@ -63,8 +61,8 @@ static char __iomem *get_virt(unsigned i
 static char __iomem *pci_dev_base(unsigned int seg, unsigned int bus, unsigned int devfn)
 {
 	char __iomem *addr;
-	if (seg == 0 && bus < MAX_CHECK_BUS &&
-		test_bit(32*bus + PCI_SLOT(devfn), fallback_slots))
+	if (seg == 0 && bus < PCI_MMCFG_MAX_CHECK_BUS &&
+		test_bit(32*bus + PCI_SLOT(devfn), pci_mmcfg_fallback_slots))
 		return NULL;
 	addr = get_virt(seg, bus);
 	if (!addr)
@@ -135,63 +133,16 @@ static struct pci_raw_ops pci_mmcfg = {
 	.write =	pci_mmcfg_write,
 };
 
-/* K8 systems have some devices (typically in the builtin northbridge)
-   that are only accessible using type1
-   Normally this can be expressed in the MCFG by not listing them
-   and assigning suitable _SEGs, but this isn't implemented in some BIOS.
-   Instead try to discover all devices on bus 0 that are unreachable using MM
-   and fallback for them. */
-static __init void unreachable_devices(void)
-{
-	int i, k;
-	/* Use the max bus number from ACPI here? */
-	for (k = 0; k < MAX_CHECK_BUS; k++) {
-		for (i = 0; i < 32; i++) {
-			u32 val1;
-			char __iomem *addr;
-
-			pci_conf1_read(0, k, PCI_DEVFN(i,0), 0, 4, &val1);
-			if (val1 == 0xffffffff)
-				continue;
-			addr = pci_dev_base(0, k, PCI_DEVFN(i, 0));
-			if (addr == NULL|| readl(addr) != val1) {
-				set_bit(i + 32*k, fallback_slots);
-				printk(KERN_NOTICE "PCI: No mmconfig possible"
-				       " on device %02x:%02x\n", k, i);
-			}
-		}
-	}
-}
-
-void __init pci_mmcfg_init(int type)
+int __init pci_mmcfg_arch_init(void)
 {
 	int i;
-
-	if ((pci_probe & PCI_PROBE_MMCONF) == 0)
-		return;
-
-	acpi_table_parse(ACPI_SIG_MCFG, acpi_parse_mcfg);
-	if ((pci_mmcfg_config_num == 0) ||
-	    (pci_mmcfg_config == NULL) ||
-	    (pci_mmcfg_config[0].address == 0))
-		return;
-
-	/* Only do this check when type 1 works. If it doesn't work
-           assume we run on a Mac and always use MCFG */
-	if (type == 1 && !e820_all_mapped(pci_mmcfg_config[0].address,
-			pci_mmcfg_config[0].address + MMCONFIG_APER_MIN,
-			E820_RESERVED)) {
-		printk(KERN_ERR "PCI: BIOS Bug: MCFG area at %lx is not E820-reserved\n",
-				(unsigned long)pci_mmcfg_config[0].address);
-		printk(KERN_ERR "PCI: Not using MMCONFIG.\n");
-		return;
-	}
-
-	pci_mmcfg_virt = kmalloc(sizeof(*pci_mmcfg_virt) * pci_mmcfg_config_num, GFP_KERNEL);
+	pci_mmcfg_virt = kmalloc(sizeof(*pci_mmcfg_virt) *
+				 pci_mmcfg_config_num, GFP_KERNEL);
 	if (pci_mmcfg_virt == NULL) {
 		printk(KERN_ERR "PCI: Can not allocate memory for mmconfig structures\n");
-		return;
+		return 0;
 	}
+
 	for (i = 0; i < pci_mmcfg_config_num; ++i) {
 		pci_mmcfg_virt[i].cfg = &pci_mmcfg_config[i];
 		pci_mmcfg_virt[i].virt = ioremap_nocache(pci_mmcfg_config[i].address,
@@ -200,14 +151,11 @@ void __init pci_mmcfg_init(int type)
 			printk(KERN_ERR "PCI: Cannot map mmconfig aperture for "
 					"segment %d\n",
 				pci_mmcfg_config[i].pci_segment);
-			return;
+			return 0;
 		}
-		printk(KERN_INFO "PCI: Using MMCONFIG at %lx\n",
-			(unsigned long)pci_mmcfg_config[i].address);
+		printk(KERN_INFO "PCI: Using MMCONFIG at %Lx\n",
+				pci_mmcfg_config[i].address);
 	}
-
-	unreachable_devices();
-
 	raw_pci_ops = &pci_mmcfg;
-	pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF;
+	return 1;
 }

^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCH 2.6.21 review I] [15/25] i386: Only call unreachable_devices() when type 1 is available.
  2007-02-10 11:50 [PATCH 2.6.21 review I] [1/25] x86_64: Add __copy_from_user_nocache Andi Kleen
                   ` (12 preceding siblings ...)
  2007-02-10 11:50 ` [PATCH 2.6.21 review I] [14/25] x86: Share what's shareable Andi Kleen
@ 2007-02-10 11:50 ` Andi Kleen
  2007-02-10 11:50 ` [PATCH 2.6.21 review I] [16/25] i386: Detect and support the E7520 and the 945G/GZ/P/PL Andi Kleen
                   ` (9 subsequent siblings)
  23 siblings, 0 replies; 43+ messages in thread
From: Andi Kleen @ 2007-02-10 11:50 UTC (permalink / raw)
  To: Olivier Galibert, Andi Kleen, patches, linux-kernel


From: Olivier Galibert <galibert@pobox.com>

unreachable_devices compares between the results of pci configuration accesses
through type1 and mmconfig, so it should be called only if type1 actually
works in the first place.

Signed-off-by: Olivier Galibert <galibert@pobox.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
---

 arch/i386/pci/mmconfig-shared.c |    3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

Index: linux/arch/i386/pci/mmconfig-shared.c
===================================================================
--- linux.orig/arch/i386/pci/mmconfig-shared.c
+++ linux/arch/i386/pci/mmconfig-shared.c
@@ -80,7 +80,8 @@ void __init pci_mmcfg_init(int type)
 	}
 
 	if (pci_mmcfg_arch_init()) {
-		unreachable_devices();
+		if (type == 1)
+			unreachable_devices();
 		pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF;
 	}
 }

^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCH 2.6.21 review I] [16/25] i386: Detect and support the E7520 and the 945G/GZ/P/PL
  2007-02-10 11:50 [PATCH 2.6.21 review I] [1/25] x86_64: Add __copy_from_user_nocache Andi Kleen
                   ` (13 preceding siblings ...)
  2007-02-10 11:50 ` [PATCH 2.6.21 review I] [15/25] i386: Only call unreachable_devices() when type 1 is available Andi Kleen
@ 2007-02-10 11:50 ` Andi Kleen
  2007-02-10 11:50 ` [PATCH 2.6.21 review I] [17/25] i386: Reserve resources but only when we're sure about them Andi Kleen
                   ` (8 subsequent siblings)
  23 siblings, 0 replies; 43+ messages in thread
From: Andi Kleen @ 2007-02-10 11:50 UTC (permalink / raw)
  To: Olivier Galibert, Andi Kleen, patches, linux-kernel


From: Olivier Galibert <galibert@pobox.com>

It seems that the only way to reliably support mmconfig in the presence of
funky biosen is to detect the hostbridge and read where the window is mapped
from its registers.  Do that for the E7520 and the 945G/GZ/P/PL for a start.

Signed-off-by: Olivier Galibert <galibert@pobox.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
---

 arch/i386/pci/mmconfig-shared.c |  121 +++++++++++++++++++++++++++++++++++++++-
 1 file changed, 119 insertions(+), 2 deletions(-)

Index: linux/arch/i386/pci/mmconfig-shared.c
===================================================================
--- linux.orig/arch/i386/pci/mmconfig-shared.c
+++ linux/arch/i386/pci/mmconfig-shared.c
@@ -3,6 +3,7 @@
  *                     MMCONFIG - common code between i386 and x86-64.
  *
  * This code does:
+ * - known chipset handling
  * - ACPI decoding and validation
  *
  * Per-architecture code takes care of the mappings and accesses
@@ -55,12 +56,128 @@ static __init void unreachable_devices(v
 	}
 }
 
+static __init const char *pci_mmcfg_e7520(void)
+{
+	u32 win;
+	pci_conf1_read(0, 0, PCI_DEVFN(0,0), 0xce, 2, &win);
+
+	pci_mmcfg_config_num = 1;
+	pci_mmcfg_config = kzalloc(sizeof(pci_mmcfg_config[0]), GFP_KERNEL);
+	if (!pci_mmcfg_config)
+		return NULL;
+	pci_mmcfg_config[0].address = (win & 0xf000) << 16;
+	pci_mmcfg_config[0].pci_segment = 0;
+	pci_mmcfg_config[0].start_bus_number = 0;
+	pci_mmcfg_config[0].end_bus_number = 255;
+
+	return "Intel Corporation E7520 Memory Controller Hub";
+}
+
+static __init const char *pci_mmcfg_intel_945(void)
+{
+	u32 pciexbar, mask = 0, len = 0;
+
+	pci_mmcfg_config_num = 1;
+
+	pci_conf1_read(0, 0, PCI_DEVFN(0,0), 0x48, 4, &pciexbar);
+
+	/* Enable bit */
+	if (!(pciexbar & 1))
+		pci_mmcfg_config_num = 0;
+
+	/* Size bits */
+	switch ((pciexbar >> 1) & 3) {
+	case 0:
+		mask = 0xf0000000U;
+		len  = 0x10000000U;
+		break;
+	case 1:
+		mask = 0xf8000000U;
+		len  = 0x08000000U;
+		break;
+	case 2:
+		mask = 0xfc000000U;
+		len  = 0x04000000U;
+		break;
+	default:
+		pci_mmcfg_config_num = 0;
+	}
+
+	/* Errata #2, things break when not aligned on a 256Mb boundary */
+	/* Can only happen in 64M/128M mode */
+
+	if ((pciexbar & mask) & 0x0fffffffU)
+		pci_mmcfg_config_num = 0;
+
+	if (pci_mmcfg_config_num) {
+		pci_mmcfg_config = kzalloc(sizeof(pci_mmcfg_config[0]), GFP_KERNEL);
+		if (!pci_mmcfg_config)
+			return NULL;
+		pci_mmcfg_config[0].address = pciexbar & mask;
+		pci_mmcfg_config[0].pci_segment = 0;
+		pci_mmcfg_config[0].start_bus_number = 0;
+		pci_mmcfg_config[0].end_bus_number = (len >> 20) - 1;
+	}
+
+	return "Intel Corporation 945G/GZ/P/PL Express Memory Controller Hub";
+}
+
+struct pci_mmcfg_hostbridge_probe {
+	u32 vendor;
+	u32 device;
+	const char *(*probe)(void);
+};
+
+static __initdata struct pci_mmcfg_hostbridge_probe pci_mmcfg_probes[] = {
+	{ PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, pci_mmcfg_e7520 },
+	{ PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82945G_HB, pci_mmcfg_intel_945 },
+};
+
+static int __init pci_mmcfg_check_hostbridge(void)
+{
+	u32 l;
+	u16 vendor, device;
+	int i;
+	const char *name;
+
+	pci_conf1_read(0, 0, PCI_DEVFN(0,0), 0, 4, &l);
+	vendor = l & 0xffff;
+	device = (l >> 16) & 0xffff;
+
+	pci_mmcfg_config_num = 0;
+	pci_mmcfg_config = NULL;
+	name = NULL;
+
+	for (i = 0; !name && i < ARRAY_SIZE(pci_mmcfg_probes); i++)
+		if ((pci_mmcfg_probes[i].vendor == PCI_ANY_ID ||
+		     pci_mmcfg_probes[i].vendor == vendor) &&
+		    (pci_mmcfg_probes[i].device == PCI_ANY_ID ||
+		     pci_mmcfg_probes[i].device == device))
+			name = pci_mmcfg_probes[i].probe();
+
+	if (name) {
+		if (pci_mmcfg_config_num)
+			printk(KERN_INFO "PCI: Found %s with MMCONFIG support.\n", name);
+		else
+			printk(KERN_INFO "PCI: Found %s without MMCONFIG support.\n",
+			       name);
+	}
+
+	return name != NULL;
+}
+
 void __init pci_mmcfg_init(int type)
 {
+	int known_bridge = 0;
+
 	if ((pci_probe & PCI_PROBE_MMCONF) == 0)
 		return;
 
-	acpi_table_parse(ACPI_SIG_MCFG, acpi_parse_mcfg);
+	if (type == 1 && pci_mmcfg_check_hostbridge())
+		known_bridge = 1;
+
+	if (!known_bridge)
+		acpi_table_parse(ACPI_SIG_MCFG, acpi_parse_mcfg);
 
 	if ((pci_mmcfg_config_num == 0) ||
 	    (pci_mmcfg_config == NULL) ||
@@ -69,7 +186,7 @@ void __init pci_mmcfg_init(int type)
 
 	/* Only do this check when type 1 works. If it doesn't work
            assume we run on a Mac and always use MCFG */
-	if (type == 1 &&
+	if (type == 1 && !known_bridge &&
 	    !e820_all_mapped(pci_mmcfg_config[0].address,
 			     pci_mmcfg_config[0].address + MMCONFIG_APER_MIN,
 			     E820_RESERVED)) {

^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCH 2.6.21 review I] [17/25] i386: Reserve resources but only when we're sure about them.
  2007-02-10 11:50 [PATCH 2.6.21 review I] [1/25] x86_64: Add __copy_from_user_nocache Andi Kleen
                   ` (14 preceding siblings ...)
  2007-02-10 11:50 ` [PATCH 2.6.21 review I] [16/25] i386: Detect and support the E7520 and the 945G/GZ/P/PL Andi Kleen
@ 2007-02-10 11:50 ` Andi Kleen
  2007-02-10 11:50 ` [PATCH 2.6.21 review I] [18/25] x86_64: Fix x86_64 ioremap base_address Andi Kleen
                   ` (7 subsequent siblings)
  23 siblings, 0 replies; 43+ messages in thread
From: Andi Kleen @ 2007-02-10 11:50 UTC (permalink / raw)
  To: Olivier Galibert, Andi Kleen, patches, linux-kernel


From: Olivier Galibert <galibert@pobox.com>

Put back the resource reservation as per
4c6e052adfe285ede5884e4e8c4d33af33932c13 but use it *only* when the range(s)
come from a chipset probe instead of the bios.

Signed-off-by: Olivier Galibert <galibert@pobox.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
---

 arch/i386/pci/mmconfig-shared.c |   33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

Index: linux/arch/i386/pci/mmconfig-shared.c
===================================================================
--- linux.orig/arch/i386/pci/mmconfig-shared.c
+++ linux/arch/i386/pci/mmconfig-shared.c
@@ -166,6 +166,37 @@ static int __init pci_mmcfg_check_hostbr
 	return name != NULL;
 }
 
+static __init void pci_mmcfg_insert_resources(void)
+{
+#define PCI_MMCFG_RESOURCE_NAME_LEN 19
+	int i;
+	struct resource *res;
+	char *names;
+	unsigned num_buses;
+
+	res = kcalloc(PCI_MMCFG_RESOURCE_NAME_LEN + sizeof(*res),
+			pci_mmcfg_config_num, GFP_KERNEL);
+
+	if (!res) {
+		printk(KERN_ERR "PCI: Unable to allocate MMCONFIG resources\n");
+		return;
+	}
+
+	names = (void *)&res[pci_mmcfg_config_num];
+	for (i = 0; i < pci_mmcfg_config_num; i++, res++) {
+		num_buses = pci_mmcfg_config[i].end_bus_number -
+		    pci_mmcfg_config[i].start_bus_number + 1;
+		res->name = names;
+		snprintf(names, PCI_MMCFG_RESOURCE_NAME_LEN, "PCI MMCONFIG %u",
+			pci_mmcfg_config[i].pci_segment);
+		res->start = pci_mmcfg_config[i].address;
+		res->end = res->start + (num_buses << 20) - 1;
+		res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+		insert_resource(&iomem_resource, res);
+		names += PCI_MMCFG_RESOURCE_NAME_LEN;
+	}
+}
+
 void __init pci_mmcfg_init(int type)
 {
 	int known_bridge = 0;
@@ -199,6 +230,8 @@ void __init pci_mmcfg_init(int type)
 	if (pci_mmcfg_arch_init()) {
 		if (type == 1)
 			unreachable_devices();
+		if (known_bridge)
+			pci_mmcfg_insert_resources();
 		pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF;
 	}
 }

^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCH 2.6.21 review I] [18/25] x86_64: Fix x86_64 ioremap base_address
  2007-02-10 11:50 [PATCH 2.6.21 review I] [1/25] x86_64: Add __copy_from_user_nocache Andi Kleen
                   ` (15 preceding siblings ...)
  2007-02-10 11:50 ` [PATCH 2.6.21 review I] [17/25] i386: Reserve resources but only when we're sure about them Andi Kleen
@ 2007-02-10 11:50 ` Andi Kleen
  2007-02-10 11:58   ` Arjan van de Ven
  2007-02-10 11:50 ` [PATCH 2.6.21 review I] [19/25] x86: Reject a broken MCFG tables on Asus etc Andi Kleen
                   ` (6 subsequent siblings)
  23 siblings, 1 reply; 43+ messages in thread
From: Andi Kleen @ 2007-02-10 11:50 UTC (permalink / raw)
  To: OGAWA Hirofumi, Arjan van de Ven, Andi Kleen, patches, linux-kernel


From: OGAWA Hirofumi <hogawa@miraclelinux.com>

Current mmconfig has some problems of remapped range.

a) In the case of broken MCFG tables on Asus etc., we need to remap 256M
   range, but currently only remap 1M.

b) The base address always corresponds to bus number 0, but currently we
   are assuming it corresponds to start bus number.

This patch fixes the above problems.

(akpm: Arjan suggests that if the MCFG table is broken we just shouldn't use
it, rather than try to work around things).

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
---

 arch/x86_64/pci/mmconfig.c |   46 ++++++++++++++++++++++++++++++++++-----------
 1 file changed, 35 insertions(+), 11 deletions(-)

Index: linux/arch/x86_64/pci/mmconfig.c
===================================================================
--- linux.orig/arch/x86_64/pci/mmconfig.c
+++ linux/arch/x86_64/pci/mmconfig.c
@@ -28,6 +28,39 @@ struct mmcfg_virt {
 };
 static struct mmcfg_virt *pci_mmcfg_virt;
 
+static inline int mcfg_broken(void)
+{
+	struct acpi_mcfg_allocation *cfg = &pci_mmcfg_config[0];
+
+	/* Handle more broken MCFG tables on Asus etc.
+	   They only contain a single entry for bus 0-0. Assume
+ 	   this applies to all busses. */
+	if (pci_mmcfg_config_num == 1 &&
+	    cfg->pci_segment_group_number == 0 &&
+	    (cfg->start_bus_number | cfg->end_bus_number) == 0)
+		return 1;
+	return 0;
+}
+
+static void __iomem *mcfg_ioremap(struct acpi_mcfg_allocation *cfg)
+{
+	void __iomem *addr;
+	u32 size;
+
+	if (mcfg_broken())
+		size = 256 << 20;
+	else
+		size = (cfg->end_bus_number + 1) << 20;
+
+	addr = ioremap_nocache(cfg->base_address, size);
+	if (addr) {
+		printk(KERN_INFO "PCI: Using MMCONFIG at %x - %x\n",
+		       cfg->base_address,
+		       cfg->base_address + size - 1);
+	}
+	return addr;
+}
+
 static char __iomem *get_virt(unsigned int seg, unsigned bus)
 {
 	int cfg_num = -1;
@@ -45,13 +78,7 @@ static char __iomem *get_virt(unsigned i
 			return pci_mmcfg_virt[cfg_num].virt;
 	}
 
-	/* Handle more broken MCFG tables on Asus etc.
-	   They only contain a single entry for bus 0-0. Assume
- 	   this applies to all busses. */
-	cfg = &pci_mmcfg_config[0];
-	if (pci_mmcfg_config_num == 1 &&
-		cfg->pci_segment == 0 &&
-		(cfg->start_bus_number | cfg->end_bus_number) == 0)
+	if (mcfg_broken())
 		return pci_mmcfg_virt[0].virt;
 
 	/* Fall back to type 0 */
@@ -145,16 +172,13 @@ int __init pci_mmcfg_arch_init(void)
 
 	for (i = 0; i < pci_mmcfg_config_num; ++i) {
 		pci_mmcfg_virt[i].cfg = &pci_mmcfg_config[i];
-		pci_mmcfg_virt[i].virt = ioremap_nocache(pci_mmcfg_config[i].address,
-							 MMCONFIG_APER_MAX);
+		pci_mmcfg_virt[i].virt = mcfg_ioremap(&pci_mmcfg_config[i]);
 		if (!pci_mmcfg_virt[i].virt) {
 			printk(KERN_ERR "PCI: Cannot map mmconfig aperture for "
 					"segment %d\n",
 				pci_mmcfg_config[i].pci_segment);
 			return 0;
 		}
-		printk(KERN_INFO "PCI: Using MMCONFIG at %Lx\n", 
-				pci_mmcfg_config[i].address);
 	}
 	raw_pci_ops = &pci_mmcfg;
 	return 1;

^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCH 2.6.21 review I] [19/25] x86: Reject a broken MCFG tables on Asus etc
  2007-02-10 11:50 [PATCH 2.6.21 review I] [1/25] x86_64: Add __copy_from_user_nocache Andi Kleen
                   ` (16 preceding siblings ...)
  2007-02-10 11:50 ` [PATCH 2.6.21 review I] [18/25] x86_64: Fix x86_64 ioremap base_address Andi Kleen
@ 2007-02-10 11:50 ` Andi Kleen
  2007-02-10 11:50 ` [PATCH 2.6.21 review I] [20/25] x86_64: get rid of ARCH_HAVE_XTIME_LOCK Andi Kleen
                   ` (5 subsequent siblings)
  23 siblings, 0 replies; 43+ messages in thread
From: Andi Kleen @ 2007-02-10 11:50 UTC (permalink / raw)
  To: OGAWA Hirofumi, patches, linux-kernel


From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
This rejects a broken MCFG tables on Asus etc.
Arjan and Andi suggest this.

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Andi Kleen <ak@suse.de>

---

 arch/i386/pci/mmconfig-shared.c |   24 ++++++++++++++++++-
 arch/i386/pci/mmconfig.c        |    9 -------
 arch/x86_64/pci/mmconfig.c      |   50 +++++++++++-----------------------------
 3 files changed, 37 insertions(+), 46 deletions(-)

Index: linux/arch/i386/pci/mmconfig-shared.c
===================================================================
--- linux.orig/arch/i386/pci/mmconfig-shared.c
+++ linux/arch/i386/pci/mmconfig-shared.c
@@ -197,6 +197,26 @@ static __init void pci_mmcfg_insert_reso
 	}
 }
 
+static void __init pci_mmcfg_reject_broken(void)
+{
+	typeof(pci_mmcfg_config[0]) *cfg = &pci_mmcfg_config[0];
+
+	/*
+	 * Handle more broken MCFG tables on Asus etc.
+	 * They only contain a single entry for bus 0-0.
+	 */
+	if (pci_mmcfg_config_num == 1 &&
+	    cfg->pci_segment == 0 &&
+	    (cfg->start_bus_number | cfg->end_bus_number) == 0) {
+		kfree(pci_mmcfg_config);
+		pci_mmcfg_config = NULL;
+		pci_mmcfg_config_num = 0;
+
+		printk(KERN_ERR "PCI: start and end of bus number is 0. "
+		       "Rejected as broken MCFG.");
+	}
+}
+
 void __init pci_mmcfg_init(int type)
 {
 	int known_bridge = 0;
@@ -207,8 +227,10 @@ void __init pci_mmcfg_init(int type)
 	if (type == 1 && pci_mmcfg_check_hostbridge())
 		known_bridge = 1;
 
-	if (!known_bridge)
+	if (!known_bridge) {
 		acpi_table_parse(ACPI_SIG_MCFG, acpi_parse_mcfg);
+		pci_mmcfg_reject_broken();
+	}
 
 	if ((pci_mmcfg_config_num == 0) ||
 	    (pci_mmcfg_config == NULL) ||
Index: linux/arch/i386/pci/mmconfig.c
===================================================================
--- linux.orig/arch/i386/pci/mmconfig.c
+++ linux/arch/i386/pci/mmconfig.c
@@ -47,15 +47,6 @@ static u32 get_base_addr(unsigned int se
 			return cfg->address;
 	}
 
-	/* Handle more broken MCFG tables on Asus etc.
-	   They only contain a single entry for bus 0-0. Assume
- 	   this applies to all busses. */
-	cfg = &pci_mmcfg_config[0];
-	if (pci_mmcfg_config_num == 1 &&
-		cfg->pci_segment == 0 &&
-		(cfg->start_bus_number | cfg->end_bus_number) == 0)
-		return cfg->address;
-
 	/* Fall back to type 0 */
 	return 0;
 }
Index: linux/arch/x86_64/pci/mmconfig.c
===================================================================
--- linux.orig/arch/x86_64/pci/mmconfig.c
+++ linux/arch/x86_64/pci/mmconfig.c
@@ -28,39 +28,6 @@ struct mmcfg_virt {
 };
 static struct mmcfg_virt *pci_mmcfg_virt;
 
-static inline int mcfg_broken(void)
-{
-	struct acpi_mcfg_allocation *cfg = &pci_mmcfg_config[0];
-
-	/* Handle more broken MCFG tables on Asus etc.
-	   They only contain a single entry for bus 0-0. Assume
- 	   this applies to all busses. */
-	if (pci_mmcfg_config_num == 1 &&
-	    cfg->pci_segment_group_number == 0 &&
-	    (cfg->start_bus_number | cfg->end_bus_number) == 0)
-		return 1;
-	return 0;
-}
-
-static void __iomem *mcfg_ioremap(struct acpi_mcfg_allocation *cfg)
-{
-	void __iomem *addr;
-	u32 size;
-
-	if (mcfg_broken())
-		size = 256 << 20;
-	else
-		size = (cfg->end_bus_number + 1) << 20;
-
-	addr = ioremap_nocache(cfg->base_address, size);
-	if (addr) {
-		printk(KERN_INFO "PCI: Using MMCONFIG at %x - %x\n",
-		       cfg->base_address,
-		       cfg->base_address + size - 1);
-	}
-	return addr;
-}
-
 static char __iomem *get_virt(unsigned int seg, unsigned bus)
 {
 	int cfg_num = -1;
@@ -78,9 +45,6 @@ static char __iomem *get_virt(unsigned i
 			return pci_mmcfg_virt[cfg_num].virt;
 	}
 
-	if (mcfg_broken())
-		return pci_mmcfg_virt[0].virt;
-
 	/* Fall back to type 0 */
 	return NULL;
 }
@@ -160,6 +124,20 @@ static struct pci_raw_ops pci_mmcfg = {
 	.write =	pci_mmcfg_write,
 };
 
+static void __iomem * __init mcfg_ioremap(struct acpi_mcfg_allocation *cfg)
+{
+	void __iomem *addr;
+	u32 size;
+
+	size = (cfg->end_bus_number + 1) << 20;
+	addr = ioremap_nocache(cfg->address, size);
+	if (addr) {
+		printk(KERN_INFO "PCI: Using MMCONFIG at %Lx - %Lx\n",
+		       cfg->address, cfg->address + size - 1);
+	}
+	return addr;
+}
+
 int __init pci_mmcfg_arch_init(void)
 {
 	int i;

^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCH 2.6.21 review I] [20/25] x86_64: get rid of ARCH_HAVE_XTIME_LOCK
  2007-02-10 11:50 [PATCH 2.6.21 review I] [1/25] x86_64: Add __copy_from_user_nocache Andi Kleen
                   ` (17 preceding siblings ...)
  2007-02-10 11:50 ` [PATCH 2.6.21 review I] [19/25] x86: Reject a broken MCFG tables on Asus etc Andi Kleen
@ 2007-02-10 11:50 ` Andi Kleen
  2007-02-10 11:50 ` [PATCH 2.6.21 review I] [21/25] x86_64: a memcpy that tries to reduce cache pressure Andi Kleen
                   ` (4 subsequent siblings)
  23 siblings, 0 replies; 43+ messages in thread
From: Andi Kleen @ 2007-02-10 11:50 UTC (permalink / raw)
  To: Eric Dumazet, Andi Kleen, patches, linux-kernel


From: Eric Dumazet <dada1@cosmosbay.com>

ARCH_HAVE_XTIME_LOCK is used by x86_64 arch .  This arch needs to place a
read only copy of xtime_lock into vsyscall page.  This read only copy is
named __xtime_lock, and xtime_lock is defined in
arch/x86_64/kernel/vmlinux.lds.S as an alias.  So the declaration of
xtime_lock in kernel/timer.c was guarded by ARCH_HAVE_XTIME_LOCK define,
defined to true on x86_64.

We can get same result with _attribute__((weak)) in the declaration. linker
should do the job.

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
---

 include/asm-x86_64/vsyscall.h |    5 -----
 include/linux/time.h          |    2 +-
 kernel/timer.c                |    4 +---
 3 files changed, 2 insertions(+), 9 deletions(-)

Index: linux/include/asm-x86_64/vsyscall.h
===================================================================
--- linux.orig/include/asm-x86_64/vsyscall.h
+++ linux/include/asm-x86_64/vsyscall.h
@@ -56,11 +56,6 @@ extern struct vxtime_data vxtime;
 extern int vgetcpu_mode;
 extern struct timezone sys_tz;
 extern int sysctl_vsyscall;
-extern seqlock_t xtime_lock;
-
-extern int sysctl_vsyscall;
-
-#define ARCH_HAVE_XTIME_LOCK 1
 
 #endif /* __KERNEL__ */
 
Index: linux/include/linux/time.h
===================================================================
--- linux.orig/include/linux/time.h
+++ linux/include/linux/time.h
@@ -90,7 +90,7 @@ static inline struct timespec timespec_s
 
 extern struct timespec xtime;
 extern struct timespec wall_to_monotonic;
-extern seqlock_t xtime_lock;
+extern seqlock_t xtime_lock __attribute__((weak));
 
 void timekeeping_init(void);
 
Index: linux/kernel/timer.c
===================================================================
--- linux.orig/kernel/timer.c
+++ linux/kernel/timer.c
@@ -1162,11 +1162,9 @@ static inline void calc_load(unsigned lo
  * This read-write spinlock protects us from races in SMP while
  * playing with xtime and avenrun.
  */
-#ifndef ARCH_HAVE_XTIME_LOCK
-__cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
+__attribute__((weak)) __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
 
 EXPORT_SYMBOL(xtime_lock);
-#endif
 
 /*
  * This function runs timers and the timer-tq in bottom half context.

^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCH 2.6.21 review I] [21/25] x86_64: a memcpy that tries to reduce cache pressure
  2007-02-10 11:50 [PATCH 2.6.21 review I] [1/25] x86_64: Add __copy_from_user_nocache Andi Kleen
                   ` (18 preceding siblings ...)
  2007-02-10 11:50 ` [PATCH 2.6.21 review I] [20/25] x86_64: get rid of ARCH_HAVE_XTIME_LOCK Andi Kleen
@ 2007-02-10 11:50 ` Andi Kleen
  2007-02-12  9:57   ` [patches] " Jan Beulich
  2007-02-13 11:27   ` Eric Dumazet
  2007-02-10 11:50 ` [PATCH 2.6.21 review I] [22/25] x86_64: use memcpy_uncached_read() in RDMA interrupt handler to reduce packet loss Andi Kleen
                   ` (3 subsequent siblings)
  23 siblings, 2 replies; 43+ messages in thread
From: Andi Kleen @ 2007-02-10 11:50 UTC (permalink / raw)
  To: Bryan O'Sullivan, Andi Kleen, Roland Dreier, patches, linux-kernel


From: "Bryan O'Sullivan" <bos@pathscale.com>

This copy routine is memcpy-compatible, but on some architectures will use
cache-bypassing loads to avoid bringing the source data into the cache.

One case where this is useful is when a device issues a DMA to a memory
region, and the CPU must copy the DMAed data elsewhere before doing any work
with it.  Since the source data is read-once, write-never from the CPU's
perspective, caching the data at those addresses can only evict potentially
useful data.

We provide an x86_64 implementation that uses SSE non-temporal loads, and a
generic version that falls back to plain memcpy.

Implementors for other arches should not use cache-bypassing stores to the
destination, as in most cases, the destination is accessed almost immediately
after a copy finishes.

[akpm@osdl.org: add module export]
[akpm@osdl.org: remove an ARCH_HAS_foo]
Signed-off-by: Bryan O'Sullivan <bryan.osullivan@qlogic.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Andi Kleen <ak@suse.de>
Cc: Roland Dreier <rolandd@cisco.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
---

 arch/x86_64/kernel/x8664_ksyms.c       |    2 
 arch/x86_64/lib/Makefile               |    1 
 arch/x86_64/lib/memcpy_uncached_read.S |  142 +++++++++++++++++++++++++++++++++
 include/asm-x86_64/string.h            |    2 
 include/linux/string.h                 |    3 
 5 files changed, 150 insertions(+)

Index: linux/arch/x86_64/kernel/x8664_ksyms.c
===================================================================
--- linux.orig/arch/x86_64/kernel/x8664_ksyms.c
+++ linux/arch/x86_64/kernel/x8664_ksyms.c
@@ -8,6 +8,7 @@
 #include <asm/processor.h>
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
+#include <asm/string.h>
 
 EXPORT_SYMBOL(kernel_thread);
 
@@ -54,6 +55,7 @@ extern void * __memcpy(void *,const void
 EXPORT_SYMBOL(memset);
 EXPORT_SYMBOL(memcpy);
 EXPORT_SYMBOL(__memcpy);
+EXPORT_SYMBOL(memcpy_uncached_read);
 
 EXPORT_SYMBOL(empty_zero_page);
 EXPORT_SYMBOL(init_level4_pgt);
Index: linux/arch/x86_64/lib/Makefile
===================================================================
--- linux.orig/arch/x86_64/lib/Makefile
+++ linux/arch/x86_64/lib/Makefile
@@ -10,3 +10,4 @@ lib-y := csum-partial.o csum-copy.o csum
 	usercopy.o getuser.o putuser.o  \
 	thunk.o clear_page.o copy_page.o bitstr.o bitops.o
 lib-y += memcpy.o memmove.o memset.o copy_user.o rwlock.o copy_user_nocache.o
+lib-y += memcpy_uncached_read.o
Index: linux/arch/x86_64/lib/memcpy_uncached_read.S
===================================================================
--- /dev/null
+++ linux/arch/x86_64/lib/memcpy_uncached_read.S
@@ -0,0 +1,142 @@
+/*
+ * Copyright (c) 2006 QLogic Corporation.  All Rights Reserved.
+ *
+ * This file is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+/*
+ * memcpy_uncached_read - memcpy-compatible copy routine, using streaming loads
+ * @dest: destination address
+ * @src: source address (will not be cached)
+ * @count: number of bytes to copy
+ *
+ * Use streaming loads and normal stores for a special-case copy where
+ * we know we won't be reading the source again, but will be reading the
+ * destination again soon.
+ */
+	.text
+	.p2align 4,,15
+	/* rdi  destination, rsi source, rdx count */
+	.globl	memcpy_uncached_read
+	.type	memcpy_uncached_read, @function
+memcpy_uncached_read:
+	movq	%rdi, %rax
+.L5:
+	cmpq	$15, %rdx
+	ja	.L34
+.L3:
+	cmpl	$8, %edx	/* rdx is 0..15 */
+	jbe	.L9
+.L6:
+	testb	$8, %dxl	/* rdx is 3,5,6,7,9..15 */
+	je	.L13
+	movq	(%rsi), %rcx
+	addq	$8, %rsi
+	movq	%rcx, (%rdi)
+	addq	$8, %rdi
+.L13:
+	testb	$4, %dxl
+	je	.L15
+	movl	(%rsi), %ecx
+	addq	$4, %rsi
+	movl	%ecx, (%rdi)
+	addq	$4, %rdi
+.L15:
+	testb	$2, %dxl
+	je	.L17
+	movzwl	(%rsi), %ecx
+	addq	$2, %rsi
+	movw	%cx, (%rdi)
+	addq	$2, %rdi
+.L17:
+	testb	$1, %dxl
+	je	.L33
+.L1:
+	movzbl	(%rsi), %ecx
+	movb	%cl, (%rdi)
+.L33:
+	ret
+.L34:
+	cmpq	$63, %rdx	/* rdx is > 15 */
+	ja	.L64
+	movl	$16, %ecx	/* rdx is 16..63 */
+.L25:
+	movq	8(%rsi), %r8
+	movq	(%rsi), %r9
+	addq	%rcx, %rsi
+	movq	%r8, 8(%rdi)
+	movq	%r9, (%rdi)
+	addq	%rcx, %rdi
+	subq	%rcx, %rdx
+	cmpl	%edx, %ecx	/* is rdx >= 16? */
+	jbe	.L25
+	jmp	.L3		/* rdx is 0..15 */
+	.p2align 4,,7
+.L64:
+	movl	$64, %ecx
+.L42:
+	prefetchnta	128(%rsi)
+	movq	(%rsi), %r8
+	movq	8(%rsi), %r9
+	movq	16(%rsi), %r10
+	movq	24(%rsi), %r11
+	subq	%rcx, %rdx
+	movq	%r8, (%rdi)
+	movq	32(%rsi), %r8
+	movq	%r9, 8(%rdi)
+	movq	40(%rsi), %r9
+	movq	%r10, 16(%rdi)
+	movq	48(%rsi), %r10
+	movq	%r11, 24(%rdi)
+	movq	56(%rsi), %r11
+	addq	%rcx, %rsi
+	movq	%r8, 32(%rdi)
+	movq	%r9, 40(%rdi)
+	movq	%r10, 48(%rdi)
+	movq	%r11, 56(%rdi)
+	addq	%rcx, %rdi
+	cmpq	%rdx, %rcx	/* is rdx >= 64? */
+	jbe	.L42
+	sfence
+	orl	%edx, %edx
+	je	.L33
+	jmp	.L5
+.L9:
+	jmp	*.L12(,%rdx,8)	/* rdx is 0..8 */
+	.section	.rodata
+	.align 8
+	.align 4
+.L12:
+	.quad	.L33
+	.quad	.L1
+	.quad	.L2
+	.quad	.L6
+	.quad	.L4
+	.quad	.L6
+	.quad	.L6
+	.quad	.L6
+	.quad	.L8
+	.text
+.L2:
+	movzwl	(%rsi), %ecx
+	movw	%cx, (%rdi)
+	ret
+.L4:
+	movl	(%rsi), %ecx
+	movl	%ecx, (%rdi)
+	ret
+.L8:
+	movq	(%rsi), %rcx
+	movq	%rcx, (%rdi)
+	ret
Index: linux/include/asm-x86_64/string.h
===================================================================
--- linux.orig/include/asm-x86_64/string.h
+++ linux/include/asm-x86_64/string.h
@@ -39,6 +39,8 @@ extern void *__memcpy(void *to, const vo
 		 __ret = __builtin_memcpy((dst),(src),__len);	\
 	   __ret; }) 
 
+extern void *memcpy_uncached_read(void *to, const void *from, size_t len);
+#define memcpy_uncached_read memcpy_uncached_read
 
 #define __HAVE_ARCH_MEMSET
 void *memset(void *s, int c, size_t n);
Index: linux/include/linux/string.h
===================================================================
--- linux.orig/include/linux/string.h
+++ linux/include/linux/string.h
@@ -85,6 +85,9 @@ extern void * memset(void *,int,__kernel
 #ifndef __HAVE_ARCH_MEMCPY
 extern void * memcpy(void *,const void *,__kernel_size_t);
 #endif
+#ifndef memcpy_uncached_read
+#define memcpy_uncached_read(dest, src, count) memcpy((dest), (src), (count))
+#endif
 #ifndef __HAVE_ARCH_MEMMOVE
 extern void * memmove(void *,const void *,__kernel_size_t);
 #endif

^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCH 2.6.21 review I] [22/25] x86_64: use memcpy_uncached_read() in RDMA interrupt handler to reduce packet loss
  2007-02-10 11:50 [PATCH 2.6.21 review I] [1/25] x86_64: Add __copy_from_user_nocache Andi Kleen
                   ` (19 preceding siblings ...)
  2007-02-10 11:50 ` [PATCH 2.6.21 review I] [21/25] x86_64: a memcpy that tries to reduce cache pressure Andi Kleen
@ 2007-02-10 11:50 ` Andi Kleen
  2007-02-10 11:50 ` [PATCH 2.6.21 review I] [23/25] x86_64: improved iommu documentation Andi Kleen
                   ` (2 subsequent siblings)
  23 siblings, 0 replies; 43+ messages in thread
From: Andi Kleen @ 2007-02-10 11:50 UTC (permalink / raw)
  To: Bryan O'Sullivan, Andi Kleen, Roland Dreier, patches, linux-kernel


From: "Bryan O'Sullivan" <bos@pathscale.com>

In cases where a large incoming RDMA is being received, we have to copy data
inside the interrupt handler before we can ACK each packet.  The source is
DMAed to by the hardware, which means that the CPU won't have it cached.  We
only read the source this one time; using normal load instructions pollutes
the dcache with useless data, reducing performance to the point where we can
lose a significant number of packets.

We use memcpy_uncached_read to try to not fill the dcache with useless data. 
Avoiding the cache refill penalty lets us keep up better with the sender,
resulting in many fewer dropped packets.

Signed-off-by: Bryan O'Sullivan <bryan.osullivan@qlogic.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Andi Kleen <ak@suse.de>
Cc: Roland Dreier <rolandd@cisco.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
---

 drivers/infiniband/hw/ipath/ipath_verbs.c |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

Index: linux/drivers/infiniband/hw/ipath/ipath_verbs.c
===================================================================
--- linux.orig/drivers/infiniband/hw/ipath/ipath_verbs.c
+++ linux/drivers/infiniband/hw/ipath/ipath_verbs.c
@@ -167,7 +167,7 @@ void ipath_copy_sge(struct ipath_sge_sta
 		BUG_ON(len == 0);
 		if (len > length)
 			len = length;
-		memcpy(sge->vaddr, data, len);
+		memcpy_uncached_read(sge->vaddr, data, len);
 		sge->vaddr += len;
 		sge->length -= len;
 		sge->sge_length -= len;

^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCH 2.6.21 review I] [23/25] x86_64: improved iommu documentation
  2007-02-10 11:50 [PATCH 2.6.21 review I] [1/25] x86_64: Add __copy_from_user_nocache Andi Kleen
                   ` (20 preceding siblings ...)
  2007-02-10 11:50 ` [PATCH 2.6.21 review I] [22/25] x86_64: use memcpy_uncached_read() in RDMA interrupt handler to reduce packet loss Andi Kleen
@ 2007-02-10 11:50 ` Andi Kleen
  2007-02-10 11:50 ` [PATCH 2.6.21 review I] [24/25] x86_64: do not always end the stack trace with ULONG_MAX Andi Kleen
  2007-02-10 11:50 ` [PATCH 2.6.21 review I] [25/25] i386: arch/i386/kernel/e820.c should #include <asm/setup.h Andi Kleen
  23 siblings, 0 replies; 43+ messages in thread
From: Andi Kleen @ 2007-02-10 11:50 UTC (permalink / raw)
  To: Karsten Weiss, Andi Kleen, patches, linux-kernel


From: Karsten Weiss <K.Weiss@science-computing.de>

- add SWIOTLB config help text
- mention Documentation/x86_64/boot-options.txt in
  Documentation/kernel-parameters.txt
- remove the duplication of the iommu kernel parameter documentation.
- Better explanation of some of the iommu kernel parameter options.
- "32MB<<order" instead of "32MB^order".
- Mention the default "order" value.
- list the four existing PCI-DMA mapping implementations of arch x86_64
- group the iommu= option keywords by PCI-DMA mapping implementation.
- Distinguish iommu= option keywords from number arguments.
- Explain the meaning of DAC and SAC.

Signed-off-by: Karsten Weiss <knweiss@science-computing.de>
Signed-off-by: Andi Kleen <ak@suse.de>
Acked-by: Muli Ben-Yehuda <muli@il.ibm.com>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
---

 Documentation/kernel-parameters.txt   |    3 
 Documentation/x86_64/boot-options.txt |  107 +++++++++++++++++++++++-----------
 arch/x86_64/Kconfig                   |   10 ++-
 arch/x86_64/kernel/pci-dma.c          |   28 +-------
 4 files changed, 89 insertions(+), 59 deletions(-)

Index: linux/Documentation/kernel-parameters.txt
===================================================================
--- linux.orig/Documentation/kernel-parameters.txt
+++ linux/Documentation/kernel-parameters.txt
@@ -104,6 +104,9 @@ loader, and have no meaning to the kerne
 Do not modify the syntax of boot loader parameters without extreme
 need or coordination with <Documentation/i386/boot.txt>.
 
+There are also arch-specific kernel-parameters not documented here.
+See for example <Documentation/x86_64/boot-options.txt>.
+
 Note that ALL kernel parameters listed below are CASE SENSITIVE, and that
 a trailing = on the name of any parameter states that that parameter will
 be entered as an environment variable, whereas its absence indicates that
Index: linux/Documentation/x86_64/boot-options.txt
===================================================================
--- linux.orig/Documentation/x86_64/boot-options.txt
+++ linux/Documentation/x86_64/boot-options.txt
@@ -180,40 +180,81 @@ PCI
   pci=lastbus=NUMBER	       Scan upto NUMBER busses, no matter what the mptable says.
   pci=noacpi		Don't use ACPI to set up PCI interrupt routing.
 
-IOMMU
+IOMMU (input/output memory management unit)
 
- iommu=[size][,noagp][,off][,force][,noforce][,leak][,memaper[=order]][,merge]
-         [,forcesac][,fullflush][,nomerge][,noaperture][,calgary]
-   size  set size of iommu (in bytes)
-   noagp don't initialize the AGP driver and use full aperture.
-   off   don't use the IOMMU
-   leak  turn on simple iommu leak tracing (only when CONFIG_IOMMU_LEAK is on)
-   memaper[=order] allocate an own aperture over RAM with size 32MB^order.
-   noforce don't force IOMMU usage. Default.
-   force  Force IOMMU.
-   merge  Do SG merging. Implies force (experimental)
-   nomerge Don't do SG merging.
-   forcesac For SAC mode for masks <40bits  (experimental)
-   fullflush Flush IOMMU on each allocation (default)
-   nofullflush Don't use IOMMU fullflush
-   allowed  overwrite iommu off workarounds for specific chipsets.
-   soft	 Use software bounce buffering (default for Intel machines)
-   noaperture Don't touch the aperture for AGP.
-   allowdac Allow DMA >4GB
-	    When off all DMA over >4GB is forced through an IOMMU or bounce
-	    buffering.
-   nodac    Forbid DMA >4GB
-   panic    Always panic when IOMMU overflows
-   calgary  Use the Calgary IOMMU if it is available
-
-  swiotlb=pages[,force]
-
-  pages  Prereserve that many 128K pages for the software IO bounce buffering.
-  force  Force all IO through the software TLB.
-
-  calgary=[64k,128k,256k,512k,1M,2M,4M,8M]
-  calgary=[translate_empty_slots]
-  calgary=[disable=<PCI bus number>]
+ Currently four x86-64 PCI-DMA mapping implementations exist:
+
+   1. <arch/x86_64/kernel/pci-nommu.c>: use no hardware/software IOMMU at all
+      (e.g. because you have < 3 GB memory).
+      Kernel boot message: "PCI-DMA: Disabling IOMMU"
+
+   2. <arch/x86_64/kernel/pci-gart.c>: AMD GART based hardware IOMMU.
+      Kernel boot message: "PCI-DMA: using GART IOMMU"
+
+   3. <arch/x86_64/kernel/pci-swiotlb.c> : Software IOMMU implementation. Used
+      e.g. if there is no hardware IOMMU in the system and it is need because
+      you have >3GB memory or told the kernel to us it (iommu=soft))
+      Kernel boot message: "PCI-DMA: Using software bounce buffering
+      for IO (SWIOTLB)"
+
+   4. <arch/x86_64/pci-calgary.c> : IBM Calgary hardware IOMMU. Used in IBM
+      pSeries and xSeries servers. This hardware IOMMU supports DMA address
+      mapping with memory protection, etc.
+      Kernel boot message: "PCI-DMA: Using Calgary IOMMU"
+
+ iommu=[<size>][,noagp][,off][,force][,noforce][,leak[=<nr_of_leak_pages>]
+	[,memaper[=<order>]][,merge][,forcesac][,fullflush][,nomerge]
+	[,noaperture][,calgary]
+
+  General iommu options:
+    off                Don't initialize and use any kind of IOMMU.
+    noforce            Don't force hardware IOMMU usage when it is not needed.
+                       (default).
+    force              Force the use of the hardware IOMMU even when it is
+                       not actually needed (e.g. because < 3 GB memory).
+    soft               Use software bounce buffering (SWIOTLB) (default for
+                       Intel machines). This can be used to prevent the usage
+                       of an available hardware IOMMU.
+
+  iommu options only relevant to the AMD GART hardware IOMMU:
+    <size>             Set the size of the remapping area in bytes.
+    allowed            Overwrite iommu off workarounds for specific chipsets.
+    fullflush          Flush IOMMU on each allocation (default).
+    nofullflush        Don't use IOMMU fullflush.
+    leak               Turn on simple iommu leak tracing (only when
+                       CONFIG_IOMMU_LEAK is on). Default number of leak pages
+                       is 20.
+    memaper[=<order>]  Allocate an own aperture over RAM with size 32MB<<order.
+                       (default: order=1, i.e. 64MB)
+    merge              Do scather-gather (SG) merging. Implies "force"
+                       (experimental).
+    nomerge            Don't do scather-gather (SG) merging.
+    noaperture         Ask the IOMMU not to touch the aperture for AGP.
+    forcesac           Force single-address cycle (SAC) mode for masks <40bits
+                       (experimental).
+    noagp              Don't initialize the AGP driver and use full aperture.
+    allowdac           Allow double-address cycle (DAC) mode, i.e. DMA >4GB.
+                       DAC is used with 32-bit PCI to push a 64-bit address in
+                       two cycles. When off all DMA over >4GB is forced through
+                       an IOMMU or software bounce buffering.
+    nodac              Forbid DAC mode, i.e. DMA >4GB.
+    panic              Always panic when IOMMU overflows.
+    calgary            Use the Calgary IOMMU if it is available
+
+  iommu options only relevant to the software bounce buffering (SWIOTLB) IOMMU
+  implementation:
+    swiotlb=<pages>[,force]
+    <pages>            Prereserve that many 128K pages for the software IO
+                       bounce buffering.
+    force              Force all IO through the software TLB.
+
+  Settings for the IBM Calgary hardware IOMMU currently found in IBM
+  pSeries and xSeries machines:
+
+    calgary=[64k,128k,256k,512k,1M,2M,4M,8M]
+    calgary=[translate_empty_slots]
+    calgary=[disable=<PCI bus number>]
+    panic              Always panic when IOMMU overflows
 
     64k,...,8M - Set the size of each PCI slot's translation table
     when using the Calgary IOMMU. This is the size of the translation
Index: linux/arch/x86_64/Kconfig
===================================================================
--- linux.orig/arch/x86_64/Kconfig
+++ linux/arch/x86_64/Kconfig
@@ -454,8 +454,8 @@ config IOMMU
 	  on systems with more than 3GB. This is usually needed for USB,
 	  sound, many IDE/SATA chipsets and some other devices.
 	  Provides a driver for the AMD Athlon64/Opteron/Turion/Sempron GART
-	  based IOMMU and a software bounce buffer based IOMMU used on Intel
-	  systems and as fallback.
+	  based hardware IOMMU and a software bounce buffer based IOMMU used
+	  on Intel systems and as fallback.
 	  The code is only active when needed (enough memory and limited
 	  device) unless CONFIG_IOMMU_DEBUG or iommu=force is specified
 	  too.
@@ -492,6 +492,12 @@ config CALGARY_IOMMU_ENABLED_BY_DEFAULT
 # need this always selected by IOMMU for the VIA workaround
 config SWIOTLB
 	bool
+	help
+	  Support for software bounce buffers used on x86-64 systems
+	  which don't have a hardware IOMMU (e.g. the current generation
+	  of Intel's x86-64 CPUs). Using this PCI devices which can only
+	  access 32-bits of memory can be used on systems with more than
+	  3 GB of memory. If unsure, say Y.
 
 config X86_MCE
 	bool "Machine check support" if EMBEDDED
Index: linux/arch/x86_64/kernel/pci-dma.c
===================================================================
--- linux.orig/arch/x86_64/kernel/pci-dma.c
+++ linux/arch/x86_64/kernel/pci-dma.c
@@ -223,30 +223,10 @@ int dma_set_mask(struct device *dev, u64
 }
 EXPORT_SYMBOL(dma_set_mask);
 
-/* iommu=[size][,noagp][,off][,force][,noforce][,leak][,memaper[=order]][,merge]
-         [,forcesac][,fullflush][,nomerge][,biomerge]
-   size  set size of iommu (in bytes)
-   noagp don't initialize the AGP driver and use full aperture.
-   off   don't use the IOMMU
-   leak  turn on simple iommu leak tracing (only when CONFIG_IOMMU_LEAK is on)
-   memaper[=order] allocate an own aperture over RAM with size 32MB^order.
-   noforce don't force IOMMU usage. Default.
-   force  Force IOMMU.
-   merge  Do lazy merging. This may improve performance on some block devices.
-          Implies force (experimental)
-   biomerge Do merging at the BIO layer. This is more efficient than merge,
-            but should be only done with very big IOMMUs. Implies merge,force.
-   nomerge Don't do SG merging.
-   forcesac For SAC mode for masks <40bits  (experimental)
-   fullflush Flush IOMMU on each allocation (default)
-   nofullflush Don't use IOMMU fullflush
-   allowed  overwrite iommu off workarounds for specific chipsets.
-   soft	 Use software bounce buffering (default for Intel machines)
-   noaperture Don't touch the aperture for AGP.
-   allowdac Allow DMA >4GB
-   nodac    Forbid DMA >4GB
-   panic    Force panic when IOMMU overflows
-*/
+/*
+ * See <Documentation/x86_64/boot-options.txt> for the iommu kernel parameter
+ * documentation.
+ */
 __init int iommu_setup(char *p)
 {
 	iommu_merge = 1;

^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCH 2.6.21 review I] [24/25] x86_64: do not always end the stack trace with ULONG_MAX
  2007-02-10 11:50 [PATCH 2.6.21 review I] [1/25] x86_64: Add __copy_from_user_nocache Andi Kleen
                   ` (21 preceding siblings ...)
  2007-02-10 11:50 ` [PATCH 2.6.21 review I] [23/25] x86_64: improved iommu documentation Andi Kleen
@ 2007-02-10 11:50 ` Andi Kleen
  2007-02-10 11:50 ` [PATCH 2.6.21 review I] [25/25] i386: arch/i386/kernel/e820.c should #include <asm/setup.h Andi Kleen
  23 siblings, 0 replies; 43+ messages in thread
From: Andi Kleen @ 2007-02-10 11:50 UTC (permalink / raw)
  To: Catalin Marinas, Andi Kleen, Jan Beulich, patches, linux-kernel


From: Catalin Marinas <catalin.marinas@arm.com>

It makes more sense to end the stack trace with ULONG_MAX only if
nr_entries < max_entries.  Otherwise, we lose one entry in the long stack
traces and cannot know whether the trace was complete or not.

Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Andi Kleen <ak@suse.de>
Cc: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
---

 arch/x86_64/kernel/stacktrace.c |    5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

Index: linux/arch/x86_64/kernel/stacktrace.c
===================================================================
--- linux.orig/arch/x86_64/kernel/stacktrace.c
+++ linux/arch/x86_64/kernel/stacktrace.c
@@ -32,7 +32,7 @@ static void save_stack_address(void *dat
 		trace->skip--;
 		return;
 	}
-	if (trace->nr_entries < trace->max_entries - 1)
+	if (trace->nr_entries < trace->max_entries)
 		trace->entries[trace->nr_entries++] = addr;
 }
 
@@ -49,7 +49,8 @@ static struct stacktrace_ops save_stack_
 void save_stack_trace(struct stack_trace *trace, struct task_struct *task)
 {
 	dump_trace(task, NULL, NULL, &save_stack_ops, trace);
-	trace->entries[trace->nr_entries++] = ULONG_MAX;
+	if (trace->nr_entries < trace->max_entries)
+		trace->entries[trace->nr_entries++] = ULONG_MAX;
 }
 EXPORT_SYMBOL(save_stack_trace);
 

^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCH 2.6.21 review I] [25/25] i386: arch/i386/kernel/e820.c should #include <asm/setup.h
  2007-02-10 11:50 [PATCH 2.6.21 review I] [1/25] x86_64: Add __copy_from_user_nocache Andi Kleen
                   ` (22 preceding siblings ...)
  2007-02-10 11:50 ` [PATCH 2.6.21 review I] [24/25] x86_64: do not always end the stack trace with ULONG_MAX Andi Kleen
@ 2007-02-10 11:50 ` Andi Kleen
  23 siblings, 0 replies; 43+ messages in thread
From: Andi Kleen @ 2007-02-10 11:50 UTC (permalink / raw)
  To: Adrian Bunk, patches, linux-kernel


From: Adrian Bunk <bunk@stusta.de>

Every file should #include the headers containing the prototypes for
its global functions.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Andi Kleen <ak@suse.de>

---

 arch/i386/kernel/e820.c |    1 +
 1 file changed, 1 insertion(+)

Index: linux/arch/i386/kernel/e820.c
===================================================================
--- linux.orig/arch/i386/kernel/e820.c
+++ linux/arch/i386/kernel/e820.c
@@ -14,6 +14,7 @@
 #include <asm/pgtable.h>
 #include <asm/page.h>
 #include <asm/e820.h>
+#include <asm/setup.h>
 
 #ifdef CONFIG_EFI
 int efi_enabled = 0;

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH 2.6.21 review I] [18/25] x86_64: Fix x86_64 ioremap base_address
  2007-02-10 11:50 ` [PATCH 2.6.21 review I] [18/25] x86_64: Fix x86_64 ioremap base_address Andi Kleen
@ 2007-02-10 11:58   ` Arjan van de Ven
  2007-02-10 12:07     ` Andi Kleen
  0 siblings, 1 reply; 43+ messages in thread
From: Arjan van de Ven @ 2007-02-10 11:58 UTC (permalink / raw)
  To: Andi Kleen; +Cc: OGAWA Hirofumi, patches, linux-kernel

Andi Kleen wrote:
> From: OGAWA Hirofumi <hogawa@miraclelinux.com>
> 
> Current mmconfig has some problems of remapped range.


eh wasn't there a patch that just ignored the MCFG for the broken 
system instead?

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH 2.6.21 review I] [18/25] x86_64: Fix x86_64 ioremap base_address
  2007-02-10 11:58   ` Arjan van de Ven
@ 2007-02-10 12:07     ` Andi Kleen
  0 siblings, 0 replies; 43+ messages in thread
From: Andi Kleen @ 2007-02-10 12:07 UTC (permalink / raw)
  To: Arjan van de Ven; +Cc: OGAWA Hirofumi, patches, linux-kernel

On Saturday 10 February 2007 12:58, Arjan van de Ven wrote:
> Andi Kleen wrote:
> > From: OGAWA Hirofumi <hogawa@miraclelinux.com>
> > 
> > Current mmconfig has some problems of remapped range.
> 
> 
> eh wasn't there a patch that just ignored the MCFG for the broken 
> system instead?

That's done in a followup patch

-Andi

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH 2.6.21 review I] [11/25] x86: default to physical mode on hotplug CPU kernels
  2007-02-10 11:50 ` [PATCH 2.6.21 review I] [11/25] x86: default to physical mode on hotplug CPU kernels Andi Kleen
@ 2007-02-11 11:13   ` Eric W. Biederman
  2007-02-12 22:36     ` Andi Kleen
  0 siblings, 1 reply; 43+ messages in thread
From: Eric W. Biederman @ 2007-02-11 11:13 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Ingo Molnar, Suresh Siddha, Li, Shaohua, patches, linux-kernel

Andi Kleen <ak@suse.de> writes:

> From: Ingo Molnar <mingo@elte.hu>
>
> Default to physical mode on hotplug CPU kernels.  Furher simplify and clean up
> the APIC initialization code.

Where is the code that the subject describes?

I have two problems here.

- I don't see anything handling the hotplug case, and forcing us to
  physical mode.
- Ingo's other patch asserts that hotplug should be made to handle
  logical deliver mode.

With logical deliver mode the experimental evidence is that the
destination cpu is a hint, and you can arrive at a cpu that is not
in your cpu mask.  Now I only saw that problem on hyperthreaded cpus
but we didn't have the code enabled long enough to see it in other
cases.

Maybe if I can finish getting irq migration back into process context,
and someone verifies that the cpu disable in the hotplug path actually
disables the cpu/hyperthread instead of sitting in a hlt loop.  We
won't have a problem.  But broadcast ipis and irqs that don't go where
you tell them to are things we need to be very careful with.

Eric

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [patches] [PATCH 2.6.21 review I] [4/25] x86: kernel-mode faults pollute current->thead
  2007-02-10 11:50 ` [PATCH 2.6.21 review I] [4/25] x86: kernel-mode faults pollute current->thead Andi Kleen
@ 2007-02-12  9:32   ` Jan Beulich
  2007-02-12 16:42     ` Jeff Dike
  0 siblings, 1 reply; 43+ messages in thread
From: Jan Beulich @ 2007-02-12  9:32 UTC (permalink / raw)
  To: Jeff Dike, Andi Kleen; +Cc: linux-kernel, patches

>>> Andi Kleen <ak@suse.de> 10.02.07 12:50 >>>
>
>From: Jeff Dike <jdike@addtoit.com>
>
>Kernel-mode traps on x86_64 can pollute the trap information for a previous
>userspace trap for which the signal has not yet been delivered to the
>process.
>
>do_trap and do_general_protection set task->thread.error_code and .trapno
>for kernel traps.  If a kernel-mode trap arrives between the arrival of a
>userspace trap and the delivery of the associated SISGEGV to the process,
>the process will get the kernel trap information in its sigcontext.
>
>This causes UML process segfaults, as the trapno that the UML kernel sees
>is 13, rather than the 14 for normal page faults.  So, the UML kernel
>passes the SIGSEGV along to its process.
>
>I don't claim to fully understand the problem.  On the one hand, a check in
>do_general_protection for a pending SIGSEGV turned up nothing.  On the
>other hand, this patch fixed the UML process segfault problem.
>
>The patch below moves the setting of error_code and trapno so that that
>only happens in the case of userspace faults.  As a side-effect, this
>should speed up kernel-mode fault handling a tiny bit.

This breaks consumers of notify_die() relying on the proper trap number being
passed, as the call to notify_die() from die() currently reads
current->thread.trap_no.

Also, you seem to leave other places where trap_no gets set untouched -
is this intentional (do_debug - probably correct here, kernel_math_error -
probably incorrect here)?

>I looked at i386, and there is a similar situation.  In this case, there is
>duplicate code setting task->thread.error_code and trapno.  I deleted one,
>leaving the copy that runs in the case of a userspace fault.

Likewise.

Jan

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [patches] [PATCH 2.6.21 review I] [21/25] x86_64: a memcpy that tries to reduce cache pressure
  2007-02-10 11:50 ` [PATCH 2.6.21 review I] [21/25] x86_64: a memcpy that tries to reduce cache pressure Andi Kleen
@ 2007-02-12  9:57   ` Jan Beulich
  2007-02-12 10:25     ` Andi Kleen
  2007-02-13 11:27   ` Eric Dumazet
  1 sibling, 1 reply; 43+ messages in thread
From: Jan Beulich @ 2007-02-12  9:57 UTC (permalink / raw)
  To: Bryan O'Sullivan, Andi Kleen; +Cc: Roland Dreier, linux-kernel, patches

>>> Andi Kleen <ak@suse.de> 10.02.07 12:50 >>>
>
>From: "Bryan O'Sullivan" <bos@pathscale.com>
>
>This copy routine is memcpy-compatible, but on some architectures will use
>cache-bypassing loads to avoid bringing the source data into the cache.
>
>One case where this is useful is when a device issues a DMA to a memory
>region, and the CPU must copy the DMAed data elsewhere before doing any work
>with it.  Since the source data is read-once, write-never from the CPU's
>perspective, caching the data at those addresses can only evict potentially
>useful data.
>
>We provide an x86_64 implementation that uses SSE non-temporal loads, and a
>generic version that falls back to plain memcpy.
>
>Implementors for other arches should not use cache-bypassing stores to the
>destination, as in most cases, the destination is accessed almost immediately
>after a copy finishes.

This looks a little strange to me:
- the first 128 bytes are still going through the cache
- up to 192 bytes past the copied area are being marked non-temporal, while
  there's nothing known about that area
- sfence seems questionable here, I would have thought this should be lfence,
  or perhaps even none at all

Minor remarks would be to remove the double .align before .L12 and replace
or-ing a register with itself by test.

Jan

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [patches] [PATCH 2.6.21 review I] [21/25] x86_64: a memcpy that tries to reduce cache pressure
  2007-02-12  9:57   ` [patches] " Jan Beulich
@ 2007-02-12 10:25     ` Andi Kleen
  0 siblings, 0 replies; 43+ messages in thread
From: Andi Kleen @ 2007-02-12 10:25 UTC (permalink / raw)
  To: Jan Beulich; +Cc: Bryan O'Sullivan, Roland Dreier, linux-kernel, patches


> This looks a little strange to me:
> - the first 128 bytes are still going through the cache
> - up to 192 bytes past the copied area are being marked non-temporal, while
>   there's nothing known about that area

Yes that seems quite bogus.

> - sfence seems questionable here, I would have thought this should be lfence,
>   or perhaps even none at all

Agreed -- it's not needed.

I think i also objected earlier to the jump table which is likely slower.

Will drop for now.

-Andi

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [patches] [PATCH 2.6.21 review I] [4/25] x86: kernel-mode faults pollute current->thead
  2007-02-12  9:32   ` [patches] " Jan Beulich
@ 2007-02-12 16:42     ` Jeff Dike
  2007-02-12 17:01       ` Jan Beulich
  0 siblings, 1 reply; 43+ messages in thread
From: Jeff Dike @ 2007-02-12 16:42 UTC (permalink / raw)
  To: Jan Beulich; +Cc: Andi Kleen, linux-kernel, patches

On Mon, Feb 12, 2007 at 09:32:10AM +0000, Jan Beulich wrote:
> This breaks consumers of notify_die() relying on the proper trap number being
> passed, as the call to notify_die() from die() currently reads
> current->thread.trap_no.

Rats, good point.

> Also, you seem to leave other places where trap_no gets set untouched -
> is this intentional (do_debug - probably correct here, kernel_math_error -
> probably incorrect here)?

I did check the other trap handlers.  kernel_math_error calls die,
which calls do_exit(SIGSEGV).  This doesn't seem to allow the process
the opportunity to trap the SIGSEGV and examine the fault information.

> >I looked at i386, and there is a similar situation.  In this case, there is
> >duplicate code setting task->thread.error_code and trapno.  I deleted one,
> >leaving the copy that runs in the case of a userspace fault.
> 
> Likewise.

Yup.  How does this patch look to you?  We set error_code and trap_no
for userspace faults and kernel faults which call die().  We don't set
them for kernelspace faults which are fixed up.

Index: linux-2.6/arch/i386/kernel/traps.c
===================================================================
--- linux-2.6.orig/arch/i386/kernel/traps.c
+++ linux-2.6/arch/i386/kernel/traps.c
@@ -619,6 +619,8 @@ gp_in_vm86:
 
 gp_in_kernel:
 	if (!fixup_exception(regs)) {
+		current->thread.error_code = error_code;
+		current->thread.trap_no = 13;
 		if (notify_die(DIE_GPF, "general protection fault", regs,
 				error_code, 13, SIGSEGV) == NOTIFY_STOP)
 			return;
Index: linux-2.6/arch/x86_64/kernel/traps.c
===================================================================
--- linux-2.6.orig/arch/x86_64/kernel/traps.c
+++ linux-2.6/arch/x86_64/kernel/traps.c
@@ -605,8 +605,11 @@ static void __kprobes do_trap(int trapnr
 		fixup = search_exception_tables(regs->rip);
 		if (fixup)
 			regs->rip = fixup->fixup;
-		else	
+		else {
+			tsk->thread.error_code = error_code;
+			tsk->thread.trap_no = trapnr;
 			die(str, regs, error_code);
+		}
 		return;
 	}
 }


-- 
Work email - jdike at linux dot intel dot com


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [patches] [PATCH 2.6.21 review I] [4/25] x86: kernel-mode faults pollute current->thead
  2007-02-12 16:42     ` Jeff Dike
@ 2007-02-12 17:01       ` Jan Beulich
  0 siblings, 0 replies; 43+ messages in thread
From: Jan Beulich @ 2007-02-12 17:01 UTC (permalink / raw)
  To: Jeff Dike; +Cc: Andi Kleen, linux-kernel, patches

>Yup.  How does this patch look to you?  We set error_code and trap_no
>for userspace faults and kernel faults which call die().  We don't set
>them for kernelspace faults which are fixed up.

That seems a reasonable approach.

Thanks, Jan

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH 2.6.21 review I] [11/25] x86: default to physical mode on hotplug CPU kernels
  2007-02-11 11:13   ` Eric W. Biederman
@ 2007-02-12 22:36     ` Andi Kleen
  2007-02-12 23:10       ` Eric W. Biederman
  2007-02-12 23:43       ` Siddha, Suresh B
  0 siblings, 2 replies; 43+ messages in thread
From: Andi Kleen @ 2007-02-12 22:36 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Ingo Molnar, Suresh Siddha, Li, Shaohua, patches, linux-kernel

On Sunday 11 February 2007 12:13, Eric W. Biederman wrote:
> Andi Kleen <ak@suse.de> writes:
> 
> > From: Ingo Molnar <mingo@elte.hu>
> >
> > Default to physical mode on hotplug CPU kernels.  Furher simplify and clean up
> > the APIC initialization code.
> 
> Where is the code that the subject describes?

True, that seems to be missing.

I agree that the patch seems to consist mostly of renaming doesn't make
it any easier to read.

And it's worrying that it doesn't handle the hotplug case at all.

> I have two problems here.
> 
> - I don't see anything handling the hotplug case, and forcing us to
>   physical mode.
> - Ingo's other patch asserts that hotplug should be made to handle
>   logical deliver mode.
> 
> With logical deliver mode the experimental evidence is that the
> destination cpu is a hint, 

What experimental evidence did you have? 

But I'm tempted to drop this unless the hotplug mystery can be cleared
up. There was past information that logical is unsafe for hotplug.

Ingo? Suresh? 

-Andi

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH 2.6.21 review I] [11/25] x86: default to physical mode on hotplug CPU kernels
  2007-02-12 22:36     ` Andi Kleen
@ 2007-02-12 23:10       ` Eric W. Biederman
  2007-02-12 23:51         ` Siddha, Suresh B
  2007-02-12 23:43       ` Siddha, Suresh B
  1 sibling, 1 reply; 43+ messages in thread
From: Eric W. Biederman @ 2007-02-12 23:10 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Ingo Molnar, Suresh Siddha, Li, Shaohua, patches, linux-kernel

Andi Kleen <andi@firstfloor.org> writes:


> What experimental evidence did you have? 
>
> But I'm tempted to drop this unless the hotplug mystery can be cleared
> up. There was past information that logical is unsafe for hotplug.

Basically as I commented in genapic_flat, that at least on hyperthreading
cpus the destination mask is not always honored, and so if you only
allow one hyperthread I have seen the irq show up on the other hyperthread.

Now if the cpu is actually disabled I don't think that is a problem, but
I know early versions of hotplug did actually disable the cpu.

I think the renaming in this patch makes things clearer in a useful way.


The more I look at the hotplug cpu code the more I think it should be
filed under EXPERIMENTAL (as in the code is buggy and not ready for
production use yet).

Trying to see if I can improve the irq migration mess I stumbled upon
the following.  Currently set_affinity needs to be called with
irq_desc[irq].lock held.  It needs to be called from interrupt context.
And 1 millisecond delay appears utterly bogus, although the enable
the irqs do something disable the irqs likely flushes pending irqs.
A problem that cannot occurs differently if the irqs are migrated
from interrupt context.

Looking further this buggy set_affinity usage also appears in
setup_ioapic_dest.  Although it is much less dangerous there,
as the code is largely a noop.

Is it just me or are we crazy for support software controlled irq
migration?

void fixup_irqs(cpumask_t map)
{
	unsigned int irq;
	static int warned;

	for (irq = 0; irq < NR_IRQS; irq++) {
		cpumask_t mask;
		if (irq == 2)
			continue;

		cpus_and(mask, irq_desc[irq].affinity, map);
		if (any_online_cpu(mask) == NR_CPUS) {
			printk("Breaking affinity for irq %i\n", irq);
			mask = map;
		}
		if (irq_desc[irq].chip->set_affinity)
			irq_desc[irq].chip->set_affinity(irq, mask);
		else if (irq_desc[irq].action && !(warned++))
			printk("Cannot set affinity for irq %i\n", irq);
	}

	/* That doesn't seem sufficient.  Give it 1ms. */
	local_irq_enable();
	mdelay(1);
	local_irq_disable();
}

Eric

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH 2.6.21 review I] [11/25] x86: default to physical mode on hotplug CPU kernels
  2007-02-12 22:36     ` Andi Kleen
  2007-02-12 23:10       ` Eric W. Biederman
@ 2007-02-12 23:43       ` Siddha, Suresh B
  1 sibling, 0 replies; 43+ messages in thread
From: Siddha, Suresh B @ 2007-02-12 23:43 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Eric W. Biederman, Ingo Molnar, Suresh Siddha, Li, Shaohua,
	patches, linux-kernel

On Mon, Feb 12, 2007 at 11:36:23PM +0100, Andi Kleen wrote:
> True, that seems to be missing.
> 
> I agree that the patch seems to consist mostly of renaming doesn't make
> it any easier to read.
> 
> And it's worrying that it doesn't handle the hotplug case at all.

This patch is mostly a cleanup patch and doesn't have to do anything with
hotplug. I think the change log comment about hotplug kernel is a leftover
from an old patch.

> But I'm tempted to drop this unless the hotplug mystery can be cleared
> up. There was past information that logical is unsafe for hotplug.
> 
> Ingo? Suresh? 

logical clustered mode has problems with cpu hotplug.
( http://marc.theaimsgroup.com/?l=linux-kernel&m=113261865814107&w=2 )

I think logical flat is fine. We should be fine in logical flat, as long as
we are not using apic IPI shortcuts.

thanks,
suresh

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH 2.6.21 review I] [11/25] x86: default to physical mode on hotplug CPU kernels
  2007-02-12 23:10       ` Eric W. Biederman
@ 2007-02-12 23:51         ` Siddha, Suresh B
  0 siblings, 0 replies; 43+ messages in thread
From: Siddha, Suresh B @ 2007-02-12 23:51 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Andi Kleen, Ingo Molnar, Suresh Siddha, Li, Shaohua, patches,
	linux-kernel

On Mon, Feb 12, 2007 at 04:10:44PM -0700, Eric W. Biederman wrote:
> Basically as I commented in genapic_flat, that at least on hyperthreading
> cpus the destination mask is not always honored, and so if you only
> allow one hyperthread I have seen the irq show up on the other hyperthread.

Which platform is this? I haven't heard this before.

thanks,
suresh

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH 2.6.21 review I] [21/25] x86_64: a memcpy that tries to reduce cache pressure
  2007-02-10 11:50 ` [PATCH 2.6.21 review I] [21/25] x86_64: a memcpy that tries to reduce cache pressure Andi Kleen
  2007-02-12  9:57   ` [patches] " Jan Beulich
@ 2007-02-13 11:27   ` Eric Dumazet
  1 sibling, 0 replies; 43+ messages in thread
From: Eric Dumazet @ 2007-02-13 11:27 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Bryan O'Sullivan, Roland Dreier, patches, linux-kernel

Andi Kleen a écrit :
> From: "Bryan O'Sullivan" <bos@pathscale.com>
> 
> This copy routine is memcpy-compatible, but on some architectures will use
> cache-bypassing loads to avoid bringing the source data into the cache.
> 
> One case where this is useful is when a device issues a DMA to a memory
> region, and the CPU must copy the DMAed data elsewhere before doing any work
> with it.  Since the source data is read-once, write-never from the CPU's
> perspective, caching the data at those addresses can only evict potentially
> useful data.
> 
> We provide an x86_64 implementation that uses SSE non-temporal loads, and a
> generic version that falls back to plain memcpy.

> +	movq	%r11, 56(%rdi)
> +	addq	%rcx, %rdi
> +	cmpq	%rdx, %rcx	/* is rdx >= 64? */
> +	jbe	.L42
> +	sfence
> +	orl	%edx, %edx
> +	je	.L33

I have three questions/remarks

1) Just curious why sfence is necessary here ?

2) Shouldnt we use this for large buffers, and restrict them to a size 
multiple of 64, to avoid all these conditional branches ?

3) Also, the first 128 bytes of the source buffer will be bring into cache.


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [patches] [PATCH 2.6.21 review I] [4/25] x86: kernel-mode faults pollute current->thead
  2007-02-15  8:01   ` Jan Beulich
@ 2007-02-15 16:23     ` Jeff Dike
  0 siblings, 0 replies; 43+ messages in thread
From: Jeff Dike @ 2007-02-15 16:23 UTC (permalink / raw)
  To: Jan Beulich; +Cc: Andi Kleen, linux-kernel, patches

On Thu, Feb 15, 2007 at 08:01:20AM +0000, Jan Beulich wrote:
> Ack.

Great, thanks for your help.

			Jeff

-- 
Work email - jdike at linux dot intel dot com

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [patches] [PATCH 2.6.21 review I] [4/25] x86: kernel-mode faults pollute current->thead
  2007-02-14 17:51 ` Jeff Dike
@ 2007-02-15  8:01   ` Jan Beulich
  2007-02-15 16:23     ` Jeff Dike
  0 siblings, 1 reply; 43+ messages in thread
From: Jan Beulich @ 2007-02-15  8:01 UTC (permalink / raw)
  To: Jeff Dike; +Cc: Andi Kleen, linux-kernel, patches

>>> Jeff Dike <jdike@addtoit.com> 14.02.07 18:51 >>>
>On Tue, Feb 13, 2007 at 07:52:54AM +0000, Jan Beulich wrote:
>> Actually, after a second round of thinking I believe there's still more to do
>> - your second patch missed fixing i386's do_trap() similarly to x86-64's
>> and, vice versa, x86-64's do_general_protection() similarly to i386's.
>
>Sigh, here's another go at it - the full patch instead of
>incrementally fixing the old one:

Ack.

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [patches] [PATCH 2.6.21 review I] [4/25] x86: kernel-mode faults pollute current->thead
  2007-02-13  7:52 [patches] [PATCH 2.6.21 review I] [4/25] x86: kernel-mode faults pollute current->thead Jan Beulich
  2007-02-13 10:00 ` Andi Kleen
@ 2007-02-14 17:51 ` Jeff Dike
  2007-02-15  8:01   ` Jan Beulich
  1 sibling, 1 reply; 43+ messages in thread
From: Jeff Dike @ 2007-02-14 17:51 UTC (permalink / raw)
  To: Jan Beulich; +Cc: Andi Kleen, linux-kernel, patches

On Tue, Feb 13, 2007 at 07:52:54AM +0000, Jan Beulich wrote:
> Actually, after a second round of thinking I believe there's still more to do
> - your second patch missed fixing i386's do_trap() similarly to x86-64's
> and, vice versa, x86-64's do_general_protection() similarly to i386's.

Sigh, here's another go at it - the full patch instead of
incrementally fixing the old one:

Index: linux-2.6/arch/i386/kernel/traps.c
===================================================================
--- linux-2.6.orig/arch/i386/kernel/traps.c
+++ linux-2.6/arch/i386/kernel/traps.c
@@ -473,8 +473,6 @@ static void __kprobes do_trap(int trapnr
 			      siginfo_t *info)
 {
 	struct task_struct *tsk = current;
-	tsk->thread.error_code = error_code;
-	tsk->thread.trap_no = trapnr;
 
 	if (regs->eflags & VM_MASK) {
 		if (vm86)
@@ -486,6 +484,9 @@ static void __kprobes do_trap(int trapnr
 		goto kernel_trap;
 
 	trap_signal: {
+		tsk->thread.error_code = error_code;
+		tsk->thread.trap_no = trapnr;
+
 		if (info)
 			force_sig_info(signr, info, tsk);
 		else
@@ -494,8 +495,11 @@ static void __kprobes do_trap(int trapnr
 	}
 
 	kernel_trap: {
-		if (!fixup_exception(regs))
+		if (!fixup_exception(regs)) {
+			tsk->thread.error_code = error_code;
+			tsk->thread.trap_no = trapnr;
 			die(str, regs, error_code);
+		}
 		return;
 	}
 
@@ -600,15 +604,21 @@ fastcall void __kprobes do_general_prote
 	}
 	put_cpu();
 
-	current->thread.error_code = error_code;
-	current->thread.trap_no = 13;
-
 	if (regs->eflags & VM_MASK)
 		goto gp_in_vm86;
 
 	if (!user_mode(regs))
 		goto gp_in_kernel;
 
+	/*
+	 * We want error_code and trap_no set for userspace faults and
+	 * kernelspace faults which result in die(), but not
+	 * kernelspace faults which are fixed up.  die() gives the
+	 * process no chance to handle the signal and notice the
+	 * kernel fault information, so that won't result in polluting
+	 * the information about previously queued, but not yet
+	 * delivered, fault.
+	 */
 	current->thread.error_code = error_code;
 	current->thread.trap_no = 13;
 	force_sig(SIGSEGV, current);
@@ -621,6 +631,8 @@ gp_in_vm86:
 
 gp_in_kernel:
 	if (!fixup_exception(regs)) {
+		current->thread.error_code = error_code;
+		current->thread.trap_no = 13;
 		if (notify_die(DIE_GPF, "general protection fault", regs,
 				error_code, 13, SIGSEGV) == NOTIFY_STOP)
 			return;
Index: linux-2.6/arch/x86_64/kernel/traps.c
===================================================================
--- linux-2.6.orig/arch/x86_64/kernel/traps.c
+++ linux-2.6/arch/x86_64/kernel/traps.c
@@ -581,10 +581,19 @@ static void __kprobes do_trap(int trapnr
 {
 	struct task_struct *tsk = current;
 
-	tsk->thread.error_code = error_code;
-	tsk->thread.trap_no = trapnr;
-
 	if (user_mode(regs)) {
+		/*
+		 * We want error_code and trap_no set for userspace faults
+		 * and kernelspace faults which result in die(), but
+		 * not kernelspace faults which are fixed up.  die()
+		 * gives the process no chance to handle the signal
+		 * and notice the kernel fault information, so that
+		 * won't result in polluting the information about
+		 * previously queued, but not yet delivered, fault.
+		 */
+		tsk->thread.error_code = error_code;
+		tsk->thread.trap_no = trapnr;
+
 		if (exception_trace && unhandled_signal(tsk, signr))
 			printk(KERN_INFO
 			       "%s[%d] trap %s rip:%lx rsp:%lx error:%lx\n",
@@ -605,8 +614,11 @@ static void __kprobes do_trap(int trapnr
 		fixup = search_exception_tables(regs->rip);
 		if (fixup)
 			regs->rip = fixup->fixup;
-		else	
+		else {
+			tsk->thread.error_code = error_code;
+			tsk->thread.trap_no = trapnr;
 			die(str, regs, error_code);
+		}
 		return;
 	}
 }
@@ -682,10 +694,10 @@ asmlinkage void __kprobes do_general_pro
 
 	conditional_sti(regs);
 
-	tsk->thread.error_code = error_code;
-	tsk->thread.trap_no = 13;
-
 	if (user_mode(regs)) {
+		tsk->thread.error_code = error_code;
+		tsk->thread.trap_no = 13;
+
 		if (exception_trace && unhandled_signal(tsk, SIGSEGV))
 			printk(KERN_INFO
 		       "%s[%d] general protection rip:%lx rsp:%lx error:%lx\n",
@@ -704,6 +716,10 @@ asmlinkage void __kprobes do_general_pro
 			regs->rip = fixup->fixup;
 			return;
 		}
+
+
+		tsk->thread.error_code = error_code;
+		tsk->thread.trap_no = 13;
 		if (notify_die(DIE_GPF, "general protection fault", regs,
 					error_code, 13, SIGSEGV) == NOTIFY_STOP)
 			return;

-- 
Work email - jdike at linux dot intel dot com

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [patches] [PATCH 2.6.21 review I] [4/25] x86: kernel-mode faults pollute current->thead
  2007-02-13  7:52 [patches] [PATCH 2.6.21 review I] [4/25] x86: kernel-mode faults pollute current->thead Jan Beulich
@ 2007-02-13 10:00 ` Andi Kleen
  2007-02-14 17:51 ` Jeff Dike
  1 sibling, 0 replies; 43+ messages in thread
From: Andi Kleen @ 2007-02-13 10:00 UTC (permalink / raw)
  To: Jan Beulich; +Cc: Jeff Dike, linux-kernel, patches

On Tuesday 13 February 2007 08:52, Jan Beulich wrote:
> >Yup.  How does this patch look to you?  We set error_code and trap_no
> >for userspace faults and kernel faults which call die().  We don't set
> >them for kernelspace faults which are fixed up.
> 
> Actually, after a second round of thinking I believe there's still more to do
> - your second patch missed fixing i386's do_trap() similarly to x86-64's
> and, vice versa, x86-64's do_general_protection() similarly to i386's.

I dropped the patch for now until that is all worked out

-Andi

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [patches] [PATCH 2.6.21 review I] [4/25] x86: kernel-mode faults pollute current->thead
@ 2007-02-13  7:52 Jan Beulich
  2007-02-13 10:00 ` Andi Kleen
  2007-02-14 17:51 ` Jeff Dike
  0 siblings, 2 replies; 43+ messages in thread
From: Jan Beulich @ 2007-02-13  7:52 UTC (permalink / raw)
  To: Jeff Dike; +Cc: Andi Kleen, linux-kernel, patches

>Yup.  How does this patch look to you?  We set error_code and trap_no
>for userspace faults and kernel faults which call die().  We don't set
>them for kernelspace faults which are fixed up.

Actually, after a second round of thinking I believe there's still more to do
- your second patch missed fixing i386's do_trap() similarly to x86-64's
and, vice versa, x86-64's do_general_protection() similarly to i386's.

Jan

^ permalink raw reply	[flat|nested] 43+ messages in thread

end of thread, other threads:[~2007-02-15 16:31 UTC | newest]

Thread overview: 43+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-02-10 11:50 [PATCH 2.6.21 review I] [1/25] x86_64: Add __copy_from_user_nocache Andi Kleen
2007-02-10 11:50 ` [PATCH 2.6.21 review I] [2/25] x86_64: Make the NUMA hash function nodemap allocation Andi Kleen
2007-02-10 11:50 ` [PATCH 2.6.21 review I] [3/25] i386: Convert i386 PDA code to use %fs Andi Kleen
2007-02-10 11:50 ` [PATCH 2.6.21 review I] [4/25] x86: kernel-mode faults pollute current->thead Andi Kleen
2007-02-12  9:32   ` [patches] " Jan Beulich
2007-02-12 16:42     ` Jeff Dike
2007-02-12 17:01       ` Jan Beulich
2007-02-10 11:50 ` [PATCH 2.6.21 review I] [5/25] i386: revert i386-fix-the-verify_quirk_intel_irqbalance Andi Kleen
2007-02-10 11:50 ` [PATCH 2.6.21 review I] [6/25] x86_64: revert x86_64-mm-add-genapic_force Andi Kleen
2007-02-10 11:50 ` [PATCH 2.6.21 review I] [7/25] x86: revert x86_64-mm-fix-the-irqbalance-quirk-for-e7320-e7520-e7525 Andi Kleen
2007-02-10 11:50 ` [PATCH 2.6.21 review I] [8/25] x86_64: optimize & fix APIC mode setup Andi Kleen
2007-02-10 11:50 ` [PATCH 2.6.21 review I] [9/25] x86_64: always use physical delivery mode on > 8 CPUs Andi Kleen
2007-02-10 11:50 ` [PATCH 2.6.21 review I] [10/25] x86_64: remove clustered APIC mode Andi Kleen
2007-02-10 11:50 ` [PATCH 2.6.21 review I] [11/25] x86: default to physical mode on hotplug CPU kernels Andi Kleen
2007-02-11 11:13   ` Eric W. Biederman
2007-02-12 22:36     ` Andi Kleen
2007-02-12 23:10       ` Eric W. Biederman
2007-02-12 23:51         ` Siddha, Suresh B
2007-02-12 23:43       ` Siddha, Suresh B
2007-02-10 11:50 ` [PATCH 2.6.21 review I] [12/25] x86_64: x86_64-make-the-numa-hash-function-nodemap-allocation fix fix Andi Kleen
2007-02-10 11:50 ` [PATCH 2.6.21 review I] [13/25] i386: Fix a typo in an IRQ handler name Andi Kleen
2007-02-10 11:50 ` [PATCH 2.6.21 review I] [14/25] x86: Share what's shareable Andi Kleen
2007-02-10 11:50 ` [PATCH 2.6.21 review I] [15/25] i386: Only call unreachable_devices() when type 1 is available Andi Kleen
2007-02-10 11:50 ` [PATCH 2.6.21 review I] [16/25] i386: Detect and support the E7520 and the 945G/GZ/P/PL Andi Kleen
2007-02-10 11:50 ` [PATCH 2.6.21 review I] [17/25] i386: Reserve resources but only when we're sure about them Andi Kleen
2007-02-10 11:50 ` [PATCH 2.6.21 review I] [18/25] x86_64: Fix x86_64 ioremap base_address Andi Kleen
2007-02-10 11:58   ` Arjan van de Ven
2007-02-10 12:07     ` Andi Kleen
2007-02-10 11:50 ` [PATCH 2.6.21 review I] [19/25] x86: Reject a broken MCFG tables on Asus etc Andi Kleen
2007-02-10 11:50 ` [PATCH 2.6.21 review I] [20/25] x86_64: get rid of ARCH_HAVE_XTIME_LOCK Andi Kleen
2007-02-10 11:50 ` [PATCH 2.6.21 review I] [21/25] x86_64: a memcpy that tries to reduce cache pressure Andi Kleen
2007-02-12  9:57   ` [patches] " Jan Beulich
2007-02-12 10:25     ` Andi Kleen
2007-02-13 11:27   ` Eric Dumazet
2007-02-10 11:50 ` [PATCH 2.6.21 review I] [22/25] x86_64: use memcpy_uncached_read() in RDMA interrupt handler to reduce packet loss Andi Kleen
2007-02-10 11:50 ` [PATCH 2.6.21 review I] [23/25] x86_64: improved iommu documentation Andi Kleen
2007-02-10 11:50 ` [PATCH 2.6.21 review I] [24/25] x86_64: do not always end the stack trace with ULONG_MAX Andi Kleen
2007-02-10 11:50 ` [PATCH 2.6.21 review I] [25/25] i386: arch/i386/kernel/e820.c should #include <asm/setup.h Andi Kleen
2007-02-13  7:52 [patches] [PATCH 2.6.21 review I] [4/25] x86: kernel-mode faults pollute current->thead Jan Beulich
2007-02-13 10:00 ` Andi Kleen
2007-02-14 17:51 ` Jeff Dike
2007-02-15  8:01   ` Jan Beulich
2007-02-15 16:23     ` Jeff Dike

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).