linux-kernel.vger.kernel.org archive mirror
* [PATCH] [0/58] First batch of x86 patches for .23
@ 2007-07-19  9:54 Andi Kleen
  2007-07-19  9:54 ` [PATCH] [1/58] x86: Always flush pages in change_page_attr Andi Kleen
                   ` (57 more replies)
  0 siblings, 58 replies; 119+ messages in thread
From: Andi Kleen @ 2007-07-19  9:54 UTC (permalink / raw)
  To: patches, linux-kernel


- Some more improvements for AMD family 10
- Some help for gcc 4.3
- Out of line string functions for i386 (saves >20k text)
- x86-64 vDSO
- Improved fake NUMA node code from David Rientjes
- Various machine check handling improvements from Tim H.
- Timer cleanups and fixes from Thomas Gleixner
- Various other cleanups and fixes

Please review. I plan to send them off relatively quickly
because I'm very late with this merge.

-Andi


* [PATCH] [1/58] x86: Always flush pages in change_page_attr
  2007-07-19  9:54 [PATCH] [0/58] First batch of x86 patches for .23 Andi Kleen
@ 2007-07-19  9:54 ` Andi Kleen
  2007-08-06 10:15   ` [patches] " Jan Beulich
  2007-07-19  9:54 ` [PATCH] [2/58] x86_64: Tell gcc to only align stack to 8 bytes Andi Kleen
                   ` (56 subsequent siblings)
  57 siblings, 1 reply; 119+ messages in thread
From: Andi Kleen @ 2007-07-19  9:54 UTC (permalink / raw)
  To: patches, linux-kernel


Fix a bug introduced with the CLFLUSH changes: we must always flush pages
changed in cpa(), not just when they are reverted.

With that fixed, re-enable CLFLUSH usage (it was temporarily disabled
for .22).

Add some BUG_ONs.

Contains fixes from Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
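
For context, a minimal sketch of the caller contract this relies on
(illustrative only, not part of the patch): drivers that change kernel
mapping attributes are expected to call global_flush_tlb() afterwards, and
with this fix that flush also CLFLUSHes every page changed by cpa(), not
only the reverted ones.

#include <asm/cacheflush.h>
#include <asm/pgtable.h>

static int make_page_uncached(struct page *page)
{
	int err = change_page_attr(page, 1, PAGE_KERNEL_NOCACHE);
	if (err)
		return err;
	/* flushes TLBs and, with this patch, caches of all changed pages */
	global_flush_tlb();
	return 0;
}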

Signed-off-by: Andi Kleen <ak@suse.de>

---
 arch/i386/mm/pageattr.c   |   20 +++++++++++++++++---
 arch/x86_64/mm/pageattr.c |   23 ++++++++++++++---------
 2 files changed, 31 insertions(+), 12 deletions(-)

Index: linux/arch/x86_64/mm/pageattr.c
===================================================================
--- linux.orig/arch/x86_64/mm/pageattr.c
+++ linux/arch/x86_64/mm/pageattr.c
@@ -74,14 +74,12 @@ static void flush_kernel_map(void *arg)
 	struct page *pg;
 
 	/* When clflush is available always use it because it is
-	   much cheaper than WBINVD. Disable clflush for now because
-	   the high level code is not ready yet */
-	if (1 || !cpu_has_clflush)
+	   much cheaper than WBINVD. */
+	if (!cpu_has_clflush)
 		asm volatile("wbinvd" ::: "memory");
 	else list_for_each_entry(pg, l, lru) {
 		void *adr = page_address(pg);
-		if (cpu_has_clflush)
-			cache_flush_page(adr);
+		cache_flush_page(adr);
 	}
 	__flush_tlb_all();
 }
@@ -95,7 +93,8 @@ static LIST_HEAD(deferred_pages); /* pro
 
 static inline void save_page(struct page *fpage)
 {
-	list_add(&fpage->lru, &deferred_pages);
+	if (!test_and_set_bit(PG_arch_1, &fpage->flags))
+		list_add(&fpage->lru, &deferred_pages);
 }
 
 /* 
@@ -129,9 +128,12 @@ __change_page_attr(unsigned long address
 	pte_t *kpte; 
 	struct page *kpte_page;
 	pgprot_t ref_prot2;
+
 	kpte = lookup_address(address);
 	if (!kpte) return 0;
 	kpte_page = virt_to_page(((unsigned long)kpte) & PAGE_MASK);
+	BUG_ON(PageLRU(kpte_page));
+	BUG_ON(PageCompound(kpte_page));
 	if (pgprot_val(prot) != pgprot_val(ref_prot)) { 
 		if (!pte_huge(*kpte)) {
 			set_pte(kpte, pfn_pte(pfn, prot));
@@ -159,10 +161,9 @@ __change_page_attr(unsigned long address
 	/* on x86-64 the direct mapping set at boot is not using 4k pages */
  	BUG_ON(PageReserved(kpte_page));
 
-	if (page_private(kpte_page) == 0) {
-		save_page(kpte_page);
+	save_page(kpte_page);
+	if (page_private(kpte_page) == 0)
 		revert_page(address, ref_prot);
- 	}
 	return 0;
 } 
 
@@ -234,6 +235,10 @@ void global_flush_tlb(void)
 	flush_map(&l);
 
 	list_for_each_entry_safe(pg, next, &l, lru) {
+		list_del(&pg->lru);
+		clear_bit(PG_arch_1, &pg->flags);
+		if (page_private(pg) != 0)
+			continue;
 		ClearPagePrivate(pg);
 		__free_page(pg);
 	} 
Index: linux/arch/i386/mm/pageattr.c
===================================================================
--- linux.orig/arch/i386/mm/pageattr.c
+++ linux/arch/i386/mm/pageattr.c
@@ -82,7 +82,7 @@ static void flush_kernel_map(void *arg)
 	struct page *p;
 
 	/* High level code is not ready for clflush yet */
-	if (0 && cpu_has_clflush) {
+	if (cpu_has_clflush) {
 		list_for_each_entry (p, lh, lru)
 			cache_flush_page(p);
 	} else if (boot_cpu_data.x86_model >= 4)
@@ -136,6 +136,12 @@ static inline void revert_page(struct pa
 			    ref_prot));
 }
 
+static inline void save_page(struct page *kpte_page)
+{
+	if (!test_and_set_bit(PG_arch_1, &kpte_page->flags))
+		list_add(&kpte_page->lru, &df_list);
+}
+
 static int
 __change_page_attr(struct page *page, pgprot_t prot)
 { 
@@ -150,6 +156,9 @@ __change_page_attr(struct page *page, pg
 	if (!kpte)
 		return -EINVAL;
 	kpte_page = virt_to_page(kpte);
+	BUG_ON(PageLRU(kpte_page));
+	BUG_ON(PageCompound(kpte_page));
+
 	if (pgprot_val(prot) != pgprot_val(PAGE_KERNEL)) { 
 		if (!pte_huge(*kpte)) {
 			set_pte_atomic(kpte, mk_pte(page, prot)); 
@@ -179,11 +188,11 @@ __change_page_attr(struct page *page, pg
 	 * time (not via split_large_page) and in turn we must not
 	 * replace it with a largepage.
 	 */
+
+	save_page(kpte_page);
 	if (!PageReserved(kpte_page)) {
 		if (cpu_has_pse && (page_private(kpte_page) == 0)) {
-			ClearPagePrivate(kpte_page);
 			paravirt_release_pt(page_to_pfn(kpte_page));
-			list_add(&kpte_page->lru, &df_list);
 			revert_page(kpte_page, address);
 		}
 	}
@@ -236,6 +245,11 @@ void global_flush_tlb(void)
 	spin_unlock_irq(&cpa_lock);
 	flush_map(&l);
 	list_for_each_entry_safe(pg, next, &l, lru) {
+		list_del(&pg->lru);
+		clear_bit(PG_arch_1, &pg->flags);
+		if (PageReserved(pg) || !cpu_has_pse || page_private(pg) != 0)
+			continue;
+		ClearPagePrivate(pg);
 		__free_page(pg);
 	}
 }


* [PATCH] [2/58] x86_64: Tell gcc to only align stack to 8 bytes
  2007-07-19  9:54 [PATCH] [0/58] First batch of x86 patches for .23 Andi Kleen
  2007-07-19  9:54 ` [PATCH] [1/58] x86: Always flush pages in change_page_attr Andi Kleen
@ 2007-07-19  9:54 ` Andi Kleen
  2007-07-19 11:50   ` Serge Belyshev
  2007-07-19 14:42   ` Chuck Ebbert
  2007-07-19  9:54 ` [PATCH] [3/58] x86_64: asm/ptrace.h needs linux/compiler.h Andi Kleen
                   ` (55 subsequent siblings)
  57 siblings, 2 replies; 119+ messages in thread
From: Andi Kleen @ 2007-07-19  9:54 UTC (permalink / raw)
  To: patches, linux-kernel


The kernel doesn't need 16-byte stack alignment because it doesn't use SSE2.

Signed-off-by: Andi Kleen <ak@suse.de>

---
 arch/x86_64/Makefile |    1 +
 1 file changed, 1 insertion(+)

Index: linux/arch/x86_64/Makefile
===================================================================
--- linux.orig/arch/x86_64/Makefile
+++ linux/arch/x86_64/Makefile
@@ -55,6 +55,7 @@ cflags-y += $(call cc-option,-mno-sse -m
 # this works around some issues with generating unwind tables in older gccs
 # newer gccs do it by default
 cflags-y += -maccumulate-outgoing-args
+cflags-y += -mpreferred-stack-boundary=4
 
 # do binutils support CFI?
 cflags-y += $(call as-instr,.cfi_startproc\n.cfi_endproc,-DCONFIG_AS_CFI=1,)


* [PATCH] [3/58] x86_64: asm/ptrace.h needs linux/compiler.h
  2007-07-19  9:54 [PATCH] [0/58] First batch of x86 patches for .23 Andi Kleen
  2007-07-19  9:54 ` [PATCH] [1/58] x86: Always flush pages in change_page_attr Andi Kleen
  2007-07-19  9:54 ` [PATCH] [2/58] x86_64: Tell gcc to only align stack to 8 bytes Andi Kleen
@ 2007-07-19  9:54 ` Andi Kleen
  2007-07-19  9:54 ` [PATCH] [4/58] x86_64: Don't rely on a unique IO-APIC ID Andi Kleen
                   ` (54 subsequent siblings)
  57 siblings, 0 replies; 119+ messages in thread
From: Andi Kleen @ 2007-07-19  9:54 UTC (permalink / raw)
  To: khali, patches, linux-kernel


From: Jean Delvare <khali@linux-fr.org>
On x86_64, <asm/ptrace.h> uses __user but doesn't include
<linux/compiler.h>. This could lead to build failures.
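
As a hypothetical minimal reproducer (made up for illustration, not from the
original report), a file whose first include is <asm/ptrace.h> failed to
build because the __user annotations inside the header had no definition;
with the added #include the header is self-contained:

#include <asm/ptrace.h>	/* now pulls in <linux/compiler.h> for __user */

unsigned long read_ip(struct pt_regs *regs)
{
	return regs->rip;	/* x86_64 register layout of this era */
}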

Signed-off-by: Jean Delvare <khali@linux-fr.org>
Signed-off-by: Andi Kleen <ak@suse.de>

---
 include/asm-x86_64/ptrace.h |    1 +
 1 file changed, 1 insertion(+)

Index: linux/include/asm-x86_64/ptrace.h
===================================================================
--- linux.orig/include/asm-x86_64/ptrace.h
+++ linux/include/asm-x86_64/ptrace.h
@@ -1,6 +1,7 @@
 #ifndef _X86_64_PTRACE_H
 #define _X86_64_PTRACE_H
 
+#include <linux/compiler.h>	/* For __user */
 #include <asm/ptrace-abi.h>
 
 #ifndef __ASSEMBLY__


* [PATCH] [4/58] x86_64: Don't rely on a unique IO-APIC ID
  2007-07-19  9:54 [PATCH] [0/58] First batch of x86 patches for .23 Andi Kleen
                   ` (2 preceding siblings ...)
  2007-07-19  9:54 ` [PATCH] [3/58] x86_64: asm/ptrace.h needs linux/compiler.h Andi Kleen
@ 2007-07-19  9:54 ` Andi Kleen
  2007-07-19  9:54 ` [PATCH] [5/58] x86_64: Report the pending irq if available in smp_affinity Andi Kleen
                   ` (53 subsequent siblings)
  57 siblings, 0 replies; 119+ messages in thread
From: Andi Kleen @ 2007-07-19  9:54 UTC (permalink / raw)
  To: patches, linux-kernel


Linux 64-bit only uses the IO-APIC ID as an internal cookie. In the future
there could be cases where the IO-APIC IDs are not unique, because they
share an 8-bit space with CPUs, and with enough CPUs it becomes difficult
to keep them unique. But Linux needs the IO-APIC ID internally for its
data structures, so assign unique IO-APIC IDs during table parsing.

TBD: do this for 32-bit too.

Signed-off-by: Andi Kleen <ak@suse.de>

---
 arch/x86_64/kernel/mpparse.c |   20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

Index: linux/arch/x86_64/kernel/mpparse.c
===================================================================
--- linux.orig/arch/x86_64/kernel/mpparse.c
+++ linux/arch/x86_64/kernel/mpparse.c
@@ -649,6 +649,20 @@ static int mp_find_ioapic(int gsi)
 	return -1;
 }
 
+static u8 uniq_ioapic_id(u8 id)
+{
+	int i;
+	DECLARE_BITMAP(used, 256);
+	bitmap_zero(used, 256);
+	for (i = 0; i < nr_ioapics; i++) {
+		struct mpc_config_ioapic *ia = &mp_ioapics[i];
+		__set_bit(ia->mpc_apicid, used);
+	}
+	if (!test_bit(id, used))
+		return id;
+	return find_first_zero_bit(used, 256);
+}
+
 void __init mp_register_ioapic(u8 id, u32 address, u32 gsi_base)
 {
 	int idx = 0;
@@ -656,14 +670,14 @@ void __init mp_register_ioapic(u8 id, u3
 	if (bad_ioapic(address))
 		return;
 
-	idx = nr_ioapics++;
+	idx = nr_ioapics;
 
 	mp_ioapics[idx].mpc_type = MP_IOAPIC;
 	mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE;
 	mp_ioapics[idx].mpc_apicaddr = address;
 
 	set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
-	mp_ioapics[idx].mpc_apicid = id;
+	mp_ioapics[idx].mpc_apicid = uniq_ioapic_id(id);
 	mp_ioapics[idx].mpc_apicver = 0;
 	
 	/* 
@@ -680,6 +694,8 @@ void __init mp_register_ioapic(u8 id, u3
 		mp_ioapics[idx].mpc_apicaddr,
 		mp_ioapic_routing[idx].gsi_start,
 		mp_ioapic_routing[idx].gsi_end);
+
+	nr_ioapics++;
 }
 
 void __init


* [PATCH] [5/58] x86_64: Report the pending irq if available in smp_affinity
  2007-07-19  9:54 [PATCH] [0/58] First batch of x86 patches for .23 Andi Kleen
                   ` (3 preceding siblings ...)
  2007-07-19  9:54 ` [PATCH] [4/58] x86_64: Don't rely on a unique IO-APIC ID Andi Kleen
@ 2007-07-19  9:54 ` Andi Kleen
  2007-07-19 10:23   ` Ingo Molnar
  2007-07-19  9:54 ` [PATCH] [6/58] x86_64: Use LOCAL_DISTANCE and REMOTE_DISTANCE in x86_64 ACPI code Andi Kleen
                   ` (52 subsequent siblings)
  57 siblings, 1 reply; 119+ messages in thread
From: Andi Kleen @ 2007-07-19  9:54 UTC (permalink / raw)
  To: tglx, mingo, patches, linux-kernel


Otherwise the smp_affinity file would only reflect an affinity change after
the next interrupt on x86 systems.

Cc: tglx@linutronix.de
Cc: mingo@elte.hu

Signed-off-by: Andi Kleen <ak@suse.de>

---
 kernel/irq/proc.c |    9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

Index: linux/kernel/irq/proc.c
===================================================================
--- linux.orig/kernel/irq/proc.c
+++ linux/kernel/irq/proc.c
@@ -19,7 +19,14 @@ static struct proc_dir_entry *root_irq_d
 static int irq_affinity_read_proc(char *page, char **start, off_t off,
 				  int count, int *eof, void *data)
 {
-	int len = cpumask_scnprintf(page, count, irq_desc[(long)data].affinity);
+	struct irq_desc *desc = irq_desc + (long)data;
+	cpumask_t *mask = &desc->affinity;
+	int len;
+#ifdef CONFIG_GENERIC_PENDING_IRQ
+	if (desc->status & IRQ_MOVE_PENDING)
+		mask = &desc->pending_mask;
+#endif
+	len = cpumask_scnprintf(page, count, *mask);
 
 	if (count - len < 2)
 		return -EINVAL;


* [PATCH] [6/58] x86_64: Use LOCAL_DISTANCE and  REMOTE_DISTANCE in x86_64 ACPI code
  2007-07-19  9:54 [PATCH] [0/58] First batch of x86 patches for .23 Andi Kleen
                   ` (4 preceding siblings ...)
  2007-07-19  9:54 ` [PATCH] [5/58] x86_64: Report the pending irq if available in smp_affinity Andi Kleen
@ 2007-07-19  9:54 ` Andi Kleen
  2007-07-19  9:54 ` [PATCH] [7/58] x86_64: various cleanups in NUMA scan node Andi Kleen
                   ` (51 subsequent siblings)
  57 siblings, 0 replies; 119+ messages in thread
From: Andi Kleen @ 2007-07-19  9:54 UTC (permalink / raw)
  To: rientjes, patches, linux-kernel


From: David Rientjes <rientjes@google.com>

Use LOCAL_DISTANCE and  REMOTE_DISTANCE in x86_64 ACPI code

Signed-off-by: David Rientjes <rientjes@google.com>
Signed-off-by: Andi Kleen <ak@suse.de>

---
 arch/x86_64/mm/srat.c |    6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

Index: linux/arch/x86_64/mm/srat.c
===================================================================
--- linux.orig/arch/x86_64/mm/srat.c
+++ linux/arch/x86_64/mm/srat.c
@@ -106,9 +106,9 @@ static __init int slit_valid(struct acpi
 		for (j = 0; j < d; j++)  {
 			u8 val = slit->entry[d*i + j];
 			if (i == j) {
-				if (val != 10)
+				if (val != LOCAL_DISTANCE)
 					return 0;
-			} else if (val <= 10)
+			} else if (val <= LOCAL_DISTANCE)
 				return 0;
 		}
 	}
@@ -464,7 +464,7 @@ int __node_distance(int a, int b)
 	int index;
 
 	if (!acpi_slit)
-		return a == b ? 10 : 20;
+		return a == b ? LOCAL_DISTANCE : REMOTE_DISTANCE;
 	index = acpi_slit->locality_count * node_to_pxm(a);
 	return acpi_slit->entry[index + node_to_pxm(b)];
 }


* [PATCH] [7/58] x86_64: various cleanups in NUMA scan node
  2007-07-19  9:54 [PATCH] [0/58] First batch of x86 patches for .23 Andi Kleen
                   ` (5 preceding siblings ...)
  2007-07-19  9:54 ` [PATCH] [6/58] x86_64: Use LOCAL_DISTANCE and REMOTE_DISTANCE in x86_64 ACPI code Andi Kleen
@ 2007-07-19  9:54 ` Andi Kleen
  2007-07-19 17:15   ` Yinghai Lu
  2007-07-19  9:54 ` [PATCH] [8/58] x86_64: Use string instruction memcpy/memset on AMD Fam10 Andi Kleen
                   ` (50 subsequent siblings)
  57 siblings, 1 reply; 119+ messages in thread
From: Andi Kleen @ 2007-07-19  9:54 UTC (permalink / raw)
  To: rientjes, patches, linux-kernel


From: David Rientjes <rientjes@google.com>
In acpi_scan_nodes(), we immediately return -1 if acpi_numa <= 0, meaning
we haven't detected any underlying ACPI topology or we have explicitly
disabled its use from the command-line with numa=noacpi.

acpi_table_print_srat_entry() and acpi_table_parse_srat() are only
referenced within drivers/acpi/numa.c, so we can mark them as static and
remove their prototypes from the header file.

Likewise, pxm_to_node_map[] and node_to_pxm_map[] are only used within
drivers/acpi/numa.c, so we mark them as static and remove their externs
from the header file.

The automatic 'result' variable is unused in acpi_numa_init(), so it's
removed.

Signed-off-by: David Rientjes <rientjes@google.com>
Signed-off-by: Andi Kleen <ak@suse.de>

---
 arch/x86_64/mm/srat.c |    6 +++---
 drivers/acpi/numa.c   |   20 ++++++++++----------
 include/linux/acpi.h  |    2 --
 3 files changed, 13 insertions(+), 15 deletions(-)

Index: linux/arch/x86_64/mm/srat.c
===================================================================
--- linux.orig/arch/x86_64/mm/srat.c
+++ linux/arch/x86_64/mm/srat.c
@@ -394,6 +394,9 @@ int __init acpi_scan_nodes(unsigned long
 {
 	int i;
 
+	if (acpi_numa <= 0)
+		return -1;
+
 	/* First clean up the node list */
 	for (i = 0; i < MAX_NUMNODES; i++) {
 		cutoff_node(i, start, end);
@@ -403,9 +406,6 @@ int __init acpi_scan_nodes(unsigned long
 		}
 	}
 
-	if (acpi_numa <= 0)
-		return -1;
-
 	if (!nodes_cover_memory()) {
 		bad_srat();
 		return -1;
Index: linux/drivers/acpi/numa.c
===================================================================
--- linux.orig/drivers/acpi/numa.c
+++ linux/drivers/acpi/numa.c
@@ -40,9 +40,9 @@ static nodemask_t nodes_found_map = NODE
 #define NID_INVAL	-1
 
 /* maps to convert between proximity domain and logical node ID */
-static int pxm_to_node_map[MAX_PXM_DOMAINS]
+static int __cpuinitdata pxm_to_node_map[MAX_PXM_DOMAINS]
 				= { [0 ... MAX_PXM_DOMAINS - 1] = NID_INVAL };
-static int node_to_pxm_map[MAX_NUMNODES]
+static int __cpuinitdata node_to_pxm_map[MAX_NUMNODES]
 				= { [0 ... MAX_NUMNODES - 1] = PXM_INVAL };
 
 int pxm_to_node(int pxm)
@@ -83,7 +83,8 @@ void __cpuinit acpi_unmap_pxm_to_node(in
 	node_clear(node, nodes_found_map);
 }
 
-void __init acpi_table_print_srat_entry(struct acpi_subtable_header * header)
+static void __init
+acpi_table_print_srat_entry(struct acpi_subtable_header *header)
 {
 
 	ACPI_FUNCTION_NAME("acpi_table_print_srat_entry");
@@ -200,7 +201,7 @@ static int __init acpi_parse_srat(struct
 	return 0;
 }
 
-int __init
+static int __init
 acpi_table_parse_srat(enum acpi_srat_type id,
 		      acpi_table_entry_handler handler, unsigned int max_entries)
 {
@@ -211,14 +212,13 @@ acpi_table_parse_srat(enum acpi_srat_typ
 
 int __init acpi_numa_init(void)
 {
-	int result;
-
 	/* SRAT: Static Resource Affinity Table */
 	if (!acpi_table_parse(ACPI_SIG_SRAT, acpi_parse_srat)) {
-		result = acpi_table_parse_srat(ACPI_SRAT_TYPE_CPU_AFFINITY,
-					       acpi_parse_processor_affinity,
-					       NR_CPUS);
-		result = acpi_table_parse_srat(ACPI_SRAT_TYPE_MEMORY_AFFINITY, acpi_parse_memory_affinity, NR_NODE_MEMBLKS);	// IA64 specific
+		acpi_table_parse_srat(ACPI_SRAT_TYPE_CPU_AFFINITY,
+				      acpi_parse_processor_affinity, NR_CPUS);
+		acpi_table_parse_srat(ACPI_SRAT_TYPE_MEMORY_AFFINITY,
+				      acpi_parse_memory_affinity,
+				      NR_NODE_MEMBLKS);
 	}
 
 	/* SLIT: System Locality Information Table */
Index: linux/include/linux/acpi.h
===================================================================
--- linux.orig/include/linux/acpi.h
+++ linux/include/linux/acpi.h
@@ -88,10 +88,8 @@ int acpi_table_parse (char *id, acpi_tab
 int __init acpi_table_parse_entries(char *id, unsigned long table_size,
 	int entry_id, acpi_table_entry_handler handler, unsigned int max_entries);
 int acpi_table_parse_madt (enum acpi_madt_type id, acpi_table_entry_handler handler, unsigned int max_entries);
-int acpi_table_parse_srat (enum acpi_srat_type id, acpi_table_entry_handler handler, unsigned int max_entries);
 int acpi_parse_mcfg (struct acpi_table_header *header);
 void acpi_table_print_madt_entry (struct acpi_subtable_header *madt);
-void acpi_table_print_srat_entry (struct acpi_subtable_header *srat);
 
 /* the following four functions are architecture-dependent */
 #ifdef CONFIG_HAVE_ARCH_PARSE_SRAT


* [PATCH] [8/58] x86_64: Use string instruction memcpy/memset on AMD Fam10
  2007-07-19  9:54 [PATCH] [0/58] First batch of x86 patches for .23 Andi Kleen
                   ` (6 preceding siblings ...)
  2007-07-19  9:54 ` [PATCH] [7/58] x86_64: various cleanups in NUMA scan node Andi Kleen
@ 2007-07-19  9:54 ` Andi Kleen
  2007-07-19 16:43   ` Jan Engelhardt
  2007-07-19  9:54 ` [PATCH] [9/58] x86_64: Always use builtin memcpy on gcc 4.3 Andi Kleen
                   ` (49 subsequent siblings)
  57 siblings, 1 reply; 119+ messages in thread
From: Andi Kleen @ 2007-07-19  9:54 UTC (permalink / raw)
  To: patches, linux-kernel


Signed-off-by: Andi Kleen <ak@suse.de>

---
 arch/x86_64/kernel/setup.c |    2 ++
 1 file changed, 2 insertions(+)

Index: linux/arch/x86_64/kernel/setup.c
===================================================================
--- linux.orig/arch/x86_64/kernel/setup.c
+++ linux/arch/x86_64/kernel/setup.c
@@ -575,6 +575,8 @@ static void __cpuinit init_amd(struct cp
 	level = cpuid_eax(1);
 	if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58))
 		set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability);
+	if (c->x86 == 0x10)
+		set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability);
 
 	/* Enable workaround for FXSAVE leak */
 	if (c->x86 >= 6)


* [PATCH] [9/58] x86_64: Always use builtin memcpy on gcc 4.3
  2007-07-19  9:54 [PATCH] [0/58] First batch of x86 patches for .23 Andi Kleen
                   ` (7 preceding siblings ...)
  2007-07-19  9:54 ` [PATCH] [8/58] x86_64: Use string instruction memcpy/memset on AMD Fam10 Andi Kleen
@ 2007-07-19  9:54 ` Andi Kleen
  2007-07-21 23:16   ` Oleg Verych
  2007-07-19  9:54 ` [PATCH] [10/58] i386: Move all simple string operations out of line Andi Kleen
                   ` (48 subsequent siblings)
  57 siblings, 1 reply; 119+ messages in thread
From: Andi Kleen @ 2007-07-19  9:54 UTC (permalink / raw)
  To: jh, patches, linux-kernel


Jan asked to always use the builtin memcpy on gcc 4.3 mainline because
it should generate better code than the old macro. Let's try it.
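
As a rough illustration of the expected behaviour (sketch only, not part of
the patch): even with just the plain extern declaration, gcc >= 4.3 still
recognizes memcpy() as a builtin, so small constant-size copies can be
expanded inline while variable-size copies become calls to the kernel's
out-of-line memcpy():

#include <linux/string.h>

struct hdr { char bytes[16]; };

static void copy_hdr(struct hdr *dst, const struct hdr *src)
{
	/* constant size: may be expanded to a few inline moves */
	memcpy(dst, src, sizeof(*dst));
}

static void copy_buf(void *dst, const void *src, size_t len)
{
	/* variable size: expected to end up as a call to memcpy() */
	memcpy(dst, src, len);
}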

Cc: jh@suse.cz

Signed-off-by: Andi Kleen <ak@suse.de>

---
 include/asm-x86_64/string.h |    5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

Index: linux/include/asm-x86_64/string.h
===================================================================
--- linux.orig/include/asm-x86_64/string.h
+++ linux/include/asm-x86_64/string.h
@@ -29,6 +29,9 @@ return (to);
    function. */
 
 #define __HAVE_ARCH_MEMCPY 1
+#if (__GNUC__ == 4 && __GNUC_MINOR__ >= 3) || __GNUC__ > 4
+extern void *memcpy(void *to, const void *from, size_t len);
+#else
 extern void *__memcpy(void *to, const void *from, size_t len); 
 #define memcpy(dst,src,len) \
 	({ size_t __len = (len);				\
@@ -38,7 +41,7 @@ extern void *__memcpy(void *to, const vo
 	   else							\
 		 __ret = __builtin_memcpy((dst),(src),__len);	\
 	   __ret; }) 
-
+#endif
 
 #define __HAVE_ARCH_MEMSET
 void *memset(void *s, int c, size_t n);


* [PATCH] [10/58] i386: Move all simple string operations out of line
  2007-07-19  9:54 [PATCH] [0/58] First batch of x86 patches for .23 Andi Kleen
                   ` (8 preceding siblings ...)
  2007-07-19  9:54 ` [PATCH] [9/58] x86_64: Always use builtin memcpy on gcc 4.3 Andi Kleen
@ 2007-07-19  9:54 ` Andi Kleen
  2007-07-19  9:54 ` [PATCH] [11/58] x86: Support __attribute__((__cold__)) in gcc 4.3 Andi Kleen
                   ` (47 subsequent siblings)
  57 siblings, 0 replies; 119+ messages in thread
From: Andi Kleen @ 2007-07-19  9:54 UTC (permalink / raw)
  To: patches, linux-kernel


The compiler generally generates reasonable inline code for the simple
cases; for the rest it's better for code size to have them out of line.
They can also potentially be optimized more there in the future.

In fact they probably should be in a .S file because they're all pure
assembly, but that's for another day.

Also some code style cleanup on them while I was at it (this seems
to be the last untouched really early Linux code).

This saves ~12k of text for a defconfig kernel with gcc 4.1.
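
For reference, a sketch of the mechanism this depends on (modeled on
lib/string.c, not part of the patch): the generic C fallbacks are only
compiled when the architecture doesn't define the matching __HAVE_ARCH_*
macro, so keeping the defines in <asm-i386/string.h> and moving the bodies
to arch/i386/lib/string.c leaves exactly one out-of-line copy of each
function in the kernel:

#ifndef __HAVE_ARCH_STRCPY
char *strcpy(char *dest, const char *src)
{
	char *tmp = dest;

	while ((*dest++ = *src++) != '\0')
		/* nothing */;
	return tmp;
}
EXPORT_SYMBOL(strcpy);
#endif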


Signed-off-by: Andi Kleen <ak@suse.de>

---
 arch/i386/lib/Makefile    |    2 
 arch/i386/lib/string.c    |  233 ++++++++++++++++++++++++++++++++++++++++++++
 include/asm-i386/string.h |  243 ++--------------------------------------------
 3 files changed, 247 insertions(+), 231 deletions(-)

Index: linux/include/asm-i386/string.h
===================================================================
--- linux.orig/include/asm-i386/string.h
+++ linux/include/asm-i386/string.h
@@ -2,203 +2,35 @@
 #define _I386_STRING_H_
 
 #ifdef __KERNEL__
-/*
- * On a 486 or Pentium, we are better off not using the
- * byte string operations. But on a 386 or a PPro the
- * byte string ops are faster than doing it by hand
- * (MUCH faster on a Pentium).
- */
-
-/*
- * This string-include defines all string functions as inline
- * functions. Use gcc. It also assumes ds=es=data space, this should be
- * normal. Most of the string-functions are rather heavily hand-optimized,
- * see especially strsep,strstr,str[c]spn. They should work, but are not
- * very easy to understand. Everything is done entirely within the register
- * set, making the functions fast and clean. String instructions have been
- * used through-out, making for "slightly" unclear code :-)
- *
- *		NO Copyright (C) 1991, 1992 Linus Torvalds,
- *		consider these trivial functions to be PD.
- */
 
-/* AK: in fact I bet it would be better to move this stuff all out of line.
- */
+/* Let gcc decide whether to inline or use the out of line functions */
 
 #define __HAVE_ARCH_STRCPY
-static inline char * strcpy(char * dest,const char *src)
-{
-int d0, d1, d2;
-__asm__ __volatile__(
-	"1:\tlodsb\n\t"
-	"stosb\n\t"
-	"testb %%al,%%al\n\t"
-	"jne 1b"
-	: "=&S" (d0), "=&D" (d1), "=&a" (d2)
-	:"0" (src),"1" (dest) : "memory");
-return dest;
-}
+extern char *strcpy(char *dest, const char *src);
 
 #define __HAVE_ARCH_STRNCPY
-static inline char * strncpy(char * dest,const char *src,size_t count)
-{
-int d0, d1, d2, d3;
-__asm__ __volatile__(
-	"1:\tdecl %2\n\t"
-	"js 2f\n\t"
-	"lodsb\n\t"
-	"stosb\n\t"
-	"testb %%al,%%al\n\t"
-	"jne 1b\n\t"
-	"rep\n\t"
-	"stosb\n"
-	"2:"
-	: "=&S" (d0), "=&D" (d1), "=&c" (d2), "=&a" (d3)
-	:"0" (src),"1" (dest),"2" (count) : "memory");
-return dest;
-}
+extern char *strncpy(char *dest, const char *src, size_t count);
 
 #define __HAVE_ARCH_STRCAT
-static inline char * strcat(char * dest,const char * src)
-{
-int d0, d1, d2, d3;
-__asm__ __volatile__(
-	"repne\n\t"
-	"scasb\n\t"
-	"decl %1\n"
-	"1:\tlodsb\n\t"
-	"stosb\n\t"
-	"testb %%al,%%al\n\t"
-	"jne 1b"
-	: "=&S" (d0), "=&D" (d1), "=&a" (d2), "=&c" (d3)
-	: "0" (src), "1" (dest), "2" (0), "3" (0xffffffffu):"memory");
-return dest;
-}
+extern char *strcat(char *dest, const char *src);
 
 #define __HAVE_ARCH_STRNCAT
-static inline char * strncat(char * dest,const char * src,size_t count)
-{
-int d0, d1, d2, d3;
-__asm__ __volatile__(
-	"repne\n\t"
-	"scasb\n\t"
-	"decl %1\n\t"
-	"movl %8,%3\n"
-	"1:\tdecl %3\n\t"
-	"js 2f\n\t"
-	"lodsb\n\t"
-	"stosb\n\t"
-	"testb %%al,%%al\n\t"
-	"jne 1b\n"
-	"2:\txorl %2,%2\n\t"
-	"stosb"
-	: "=&S" (d0), "=&D" (d1), "=&a" (d2), "=&c" (d3)
-	: "0" (src),"1" (dest),"2" (0),"3" (0xffffffffu), "g" (count)
-	: "memory");
-return dest;
-}
+extern char *strncat(char *dest, const char *src, size_t count);
 
 #define __HAVE_ARCH_STRCMP
-static inline int strcmp(const char * cs,const char * ct)
-{
-int d0, d1;
-register int __res;
-__asm__ __volatile__(
-	"1:\tlodsb\n\t"
-	"scasb\n\t"
-	"jne 2f\n\t"
-	"testb %%al,%%al\n\t"
-	"jne 1b\n\t"
-	"xorl %%eax,%%eax\n\t"
-	"jmp 3f\n"
-	"2:\tsbbl %%eax,%%eax\n\t"
-	"orb $1,%%al\n"
-	"3:"
-	:"=a" (__res), "=&S" (d0), "=&D" (d1)
-	:"1" (cs),"2" (ct)
-	:"memory");
-return __res;
-}
+extern int strcmp(const char *cs, const char *ct);
 
 #define __HAVE_ARCH_STRNCMP
-static inline int strncmp(const char * cs,const char * ct,size_t count)
-{
-register int __res;
-int d0, d1, d2;
-__asm__ __volatile__(
-	"1:\tdecl %3\n\t"
-	"js 2f\n\t"
-	"lodsb\n\t"
-	"scasb\n\t"
-	"jne 3f\n\t"
-	"testb %%al,%%al\n\t"
-	"jne 1b\n"
-	"2:\txorl %%eax,%%eax\n\t"
-	"jmp 4f\n"
-	"3:\tsbbl %%eax,%%eax\n\t"
-	"orb $1,%%al\n"
-	"4:"
-	:"=a" (__res), "=&S" (d0), "=&D" (d1), "=&c" (d2)
-	:"1" (cs),"2" (ct),"3" (count)
-	:"memory");
-return __res;
-}
+extern int strncmp(const char *cs, const char *ct, size_t count);
 
 #define __HAVE_ARCH_STRCHR
-static inline char * strchr(const char * s, int c)
-{
-int d0;
-register char * __res;
-__asm__ __volatile__(
-	"movb %%al,%%ah\n"
-	"1:\tlodsb\n\t"
-	"cmpb %%ah,%%al\n\t"
-	"je 2f\n\t"
-	"testb %%al,%%al\n\t"
-	"jne 1b\n\t"
-	"movl $1,%1\n"
-	"2:\tmovl %1,%0\n\t"
-	"decl %0"
-	:"=a" (__res), "=&S" (d0)
-	:"1" (s),"0" (c)
-	:"memory");
-return __res;
-}
+extern char *strchr(const char *s, int c);
 
 #define __HAVE_ARCH_STRRCHR
-static inline char * strrchr(const char * s, int c)
-{
-int d0, d1;
-register char * __res;
-__asm__ __volatile__(
-	"movb %%al,%%ah\n"
-	"1:\tlodsb\n\t"
-	"cmpb %%ah,%%al\n\t"
-	"jne 2f\n\t"
-	"leal -1(%%esi),%0\n"
-	"2:\ttestb %%al,%%al\n\t"
-	"jne 1b"
-	:"=g" (__res), "=&S" (d0), "=&a" (d1)
-	:"0" (0),"1" (s),"2" (c)
-	:"memory");
-return __res;
-}
+extern char *strrchr(const char *s, int c);
 
 #define __HAVE_ARCH_STRLEN
-static inline size_t strlen(const char * s)
-{
-int d0;
-register int __res;
-__asm__ __volatile__(
-	"repne\n\t"
-	"scasb\n\t"
-	"notl %0\n\t"
-	"decl %0"
-	:"=c" (__res), "=&D" (d0)
-	:"1" (s),"a" (0), "0" (0xffffffffu)
-	:"memory");
-return __res;
-}
+extern size_t strlen(const char *s);
 
 static __always_inline void * __memcpy(void * to, const void * from, size_t n)
 {
@@ -207,9 +39,7 @@ __asm__ __volatile__(
 	"rep ; movsl\n\t"
 	"movl %4,%%ecx\n\t"
 	"andl $3,%%ecx\n\t"
-#if 1	/* want to pay 2 byte penalty for a chance to skip microcoded rep? */
 	"jz 1f\n\t"
-#endif
 	"rep ; movsb\n\t"
 	"1:"
 	: "=&c" (d0), "=&D" (d1), "=&S" (d2)
@@ -328,23 +158,7 @@ void *memmove(void * dest,const void * s
 #define memcmp __builtin_memcmp
 
 #define __HAVE_ARCH_MEMCHR
-static inline void * memchr(const void * cs,int c,size_t count)
-{
-int d0;
-register void * __res;
-if (!count)
-	return NULL;
-__asm__ __volatile__(
-	"repne\n\t"
-	"scasb\n\t"
-	"je 1f\n\t"
-	"movl $1,%0\n"
-	"1:\tdecl %0"
-	:"=D" (__res), "=&c" (d0)
-	:"a" (c),"0" (cs),"1" (count)
-	:"memory");
-return __res;
-}
+extern void *memchr(const void * cs,int c,size_t count);
 
 static inline void * __memset_generic(void * s, char c,size_t count)
 {
@@ -386,29 +200,10 @@ return (s);	
 
 /* Added by Gertjan van Wingerde to make minix and sysv module work */
 #define __HAVE_ARCH_STRNLEN
-static inline size_t strnlen(const char * s, size_t count)
-{
-int d0;
-register int __res;
-__asm__ __volatile__(
-	"movl %2,%0\n\t"
-	"jmp 2f\n"
-	"1:\tcmpb $0,(%0)\n\t"
-	"je 3f\n\t"
-	"incl %0\n"
-	"2:\tdecl %1\n\t"
-	"cmpl $-1,%1\n\t"
-	"jne 1b\n"
-	"3:\tsubl %2,%0"
-	:"=a" (__res), "=&d" (d0)
-	:"c" (s),"1" (count)
-	:"memory");
-return __res;
-}
+extern size_t strnlen(const char * s, size_t count);
 /* end of additional stuff */
 
 #define __HAVE_ARCH_STRSTR
-
 extern char *strstr(const char *cs, const char *ct);
 
 /*
@@ -474,19 +269,7 @@ __asm__  __volatile__( \
  * find the first occurrence of byte 'c', or 1 past the area if none
  */
 #define __HAVE_ARCH_MEMSCAN
-static inline void * memscan(void * addr, int c, size_t size)
-{
-	if (!size)
-		return addr;
-	__asm__("repnz; scasb\n\t"
-		"jnz 1f\n\t"
-		"dec %%edi\n"
-		"1:"
-		: "=D" (addr), "=c" (size)
-		: "0" (addr), "1" (size), "a" (c)
-		: "memory");
-	return addr;
-}
+extern void *memscan(void * addr, int c, size_t size);
 
 #endif /* __KERNEL__ */
 
Index: linux/arch/i386/lib/Makefile
===================================================================
--- linux.orig/arch/i386/lib/Makefile
+++ linux/arch/i386/lib/Makefile
@@ -4,7 +4,7 @@
 
 
 lib-y = checksum.o delay.o usercopy.o getuser.o putuser.o memcpy.o strstr.o \
-	bitops.o semaphore.o
+	bitops.o semaphore.o string.o
 
 lib-$(CONFIG_X86_USE_3DNOW) += mmx.o
 
Index: linux/arch/i386/lib/string.c
===================================================================
--- /dev/null
+++ linux/arch/i386/lib/string.c
@@ -0,0 +1,233 @@
+/*
+ * Most of the string-functions are rather heavily hand-optimized,
+ * see especially strsep,strstr,str[c]spn. They should work, but are not
+ * very easy to understand. Everything is done entirely within the register
+ * set, making the functions fast and clean. String instructions have been
+ * used through-out, making for "slightly" unclear code :-)
+ *
+ * AK: On P4 and K7 using non string instruction implementations might be faster
+ * for large memory blocks. But most of them are unlikely to be used on any
+ * of these.
+ */
+
+#include <linux/string.h>
+#include <linux/module.h>
+
+char *strcpy(char * dest,const char *src)
+{
+	int d0, d1, d2;
+	asm( "1:\tlodsb\n\t"
+		"stosb\n\t"
+		"testb %%al,%%al\n\t"
+		"jne 1b"
+		: "=&S" (d0), "=&D" (d1), "=&a" (d2)
+		:"0" (src),"1" (dest) : "memory");
+	return dest;
+}
+EXPORT_SYMBOL(strcpy);
+
+char *strncpy(char * dest,const char *src,size_t count)
+{
+	int d0, d1, d2, d3;
+	asm( "1:\tdecl %2\n\t"
+		"js 2f\n\t"
+		"lodsb\n\t"
+		"stosb\n\t"
+		"testb %%al,%%al\n\t"
+		"jne 1b\n\t"
+		"rep\n\t"
+		"stosb\n"
+		"2:"
+		: "=&S" (d0), "=&D" (d1), "=&c" (d2), "=&a" (d3)
+		:"0" (src),"1" (dest),"2" (count) : "memory");
+	return dest;
+}
+EXPORT_SYMBOL(strncpy);
+
+char *strcat(char * dest,const char * src)
+{
+	int d0, d1, d2, d3;
+	asm( "repne\n\t"
+		"scasb\n\t"
+		"decl %1\n"
+		"1:\tlodsb\n\t"
+		"stosb\n\t"
+		"testb %%al,%%al\n\t"
+		"jne 1b"
+		: "=&S" (d0), "=&D" (d1), "=&a" (d2), "=&c" (d3)
+		: "0" (src), "1" (dest), "2" (0), "3" (0xffffffffu):"memory");
+	return dest;
+}
+EXPORT_SYMBOL(strcat);
+
+char *strncat(char * dest,const char * src,size_t count)
+{
+	int d0, d1, d2, d3;
+	asm( "repne\n\t"
+		"scasb\n\t"
+		"decl %1\n\t"
+		"movl %8,%3\n"
+		"1:\tdecl %3\n\t"
+		"js 2f\n\t"
+		"lodsb\n\t"
+		"stosb\n\t"
+		"testb %%al,%%al\n\t"
+		"jne 1b\n"
+		"2:\txorl %2,%2\n\t"
+		"stosb"
+		: "=&S" (d0), "=&D" (d1), "=&a" (d2), "=&c" (d3)
+		: "0" (src),"1" (dest),"2" (0),"3" (0xffffffffu), "g" (count)
+		: "memory");
+	return dest;
+}
+EXPORT_SYMBOL(strncat);
+
+int strcmp(const char * cs,const char * ct)
+{
+	int d0, d1;
+	int res;
+	asm( "1:\tlodsb\n\t"
+		"scasb\n\t"
+		"jne 2f\n\t"
+		"testb %%al,%%al\n\t"
+		"jne 1b\n\t"
+		"xorl %%eax,%%eax\n\t"
+		"jmp 3f\n"
+		"2:\tsbbl %%eax,%%eax\n\t"
+		"orb $1,%%al\n"
+		"3:"
+		:"=a" (res), "=&S" (d0), "=&D" (d1)
+		:"1" (cs),"2" (ct)
+		:"memory");
+	return res;
+}
+EXPORT_SYMBOL(strcmp);
+
+int strncmp(const char * cs,const char * ct,size_t count)
+{
+	int res;
+	int d0, d1, d2;
+	asm( "1:\tdecl %3\n\t"
+		"js 2f\n\t"
+		"lodsb\n\t"
+		"scasb\n\t"
+		"jne 3f\n\t"
+		"testb %%al,%%al\n\t"
+		"jne 1b\n"
+		"2:\txorl %%eax,%%eax\n\t"
+		"jmp 4f\n"
+		"3:\tsbbl %%eax,%%eax\n\t"
+		"orb $1,%%al\n"
+		"4:"
+		:"=a" (res), "=&S" (d0), "=&D" (d1), "=&c" (d2)
+		:"1" (cs),"2" (ct),"3" (count)
+		:"memory");
+	return res;
+}
+EXPORT_SYMBOL(strncmp);
+
+char *strchr(const char * s, int c)
+{
+	int d0;
+	char * res;
+	asm( "movb %%al,%%ah\n"
+		"1:\tlodsb\n\t"
+		"cmpb %%ah,%%al\n\t"
+		"je 2f\n\t"
+		"testb %%al,%%al\n\t"
+		"jne 1b\n\t"
+		"movl $1,%1\n"
+		"2:\tmovl %1,%0\n\t"
+		"decl %0"
+		:"=a" (res), "=&S" (d0)
+		:"1" (s),"0" (c)
+		:"memory");
+	return res;
+}
+EXPORT_SYMBOL(strchr);
+
+char *strrchr(const char * s, int c)
+{
+	int d0, d1;
+	char * res;
+	asm( "movb %%al,%%ah\n"
+		"1:\tlodsb\n\t"
+		"cmpb %%ah,%%al\n\t"
+		"jne 2f\n\t"
+		"leal -1(%%esi),%0\n"
+		"2:\ttestb %%al,%%al\n\t"
+		"jne 1b"
+		:"=g" (res), "=&S" (d0), "=&a" (d1)
+		:"0" (0),"1" (s),"2" (c)
+		:"memory");
+	return res;
+}
+EXPORT_SYMBOL(strrchr);
+
+size_t strlen(const char * s)
+{
+	int d0;
+	int res;
+	asm( "repne\n\t"
+		"scasb\n\t"
+		"notl %0\n\t"
+		"decl %0"
+		:"=c" (res), "=&D" (d0)
+		:"1" (s),"a" (0), "0" (0xffffffffu)
+		:"memory");
+	return res;
+}
+EXPORT_SYMBOL(strlen);
+
+void *memchr(const void *cs,int c,size_t count)
+{
+	int d0;
+	void *res;
+	if (!count)
+		return NULL;
+	asm( "repne\n\t"
+		"scasb\n\t"
+		"je 1f\n\t"
+		"movl $1,%0\n"
+		"1:\tdecl %0"
+		:"=D" (res), "=&c" (d0)
+		:"a" (c),"0" (cs),"1" (count)
+		:"memory");
+	return res;
+}
+EXPORT_SYMBOL(memchr);
+
+void *memscan(void * addr, int c, size_t size)
+{
+	if (!size)
+		return addr;
+	asm("repnz; scasb\n\t"
+	    "jnz 1f\n\t"
+	    "dec %%edi\n"
+	    "1:"
+	    : "=D" (addr), "=c" (size)
+	    : "0" (addr), "1" (size), "a" (c)
+	    : "memory");
+	return addr;
+}
+EXPORT_SYMBOL(memscan);
+
+size_t strnlen(const char *s, size_t count)
+{
+	int d0;
+	int res;
+	asm( "movl %2,%0\n\t"
+		"jmp 2f\n"
+		"1:\tcmpb $0,(%0)\n\t"
+		"je 3f\n\t"
+		"incl %0\n"
+		"2:\tdecl %1\n\t"
+		"cmpl $-1,%1\n\t"
+		"jne 1b\n"
+		"3:\tsubl %2,%0"
+		:"=a" (res), "=&d" (d0)
+		:"c" (s),"1" (count)
+		:"memory");
+	return res;
+}
+EXPORT_SYMBOL(strnlen);


* [PATCH] [11/58] x86: Support __attribute__((__cold__)) in gcc 4.3
  2007-07-19  9:54 [PATCH] [0/58] First batch of x86 patches for .23 Andi Kleen
                   ` (9 preceding siblings ...)
  2007-07-19  9:54 ` [PATCH] [10/58] i386: Move all simple string operations out of line Andi Kleen
@ 2007-07-19  9:54 ` Andi Kleen
  2007-07-19  9:54 ` [PATCH] [12/58] x86_64: Add vDSO for x86-64 with gettimeofday/clock_gettime/getcpu Andi Kleen
                   ` (46 subsequent siblings)
  57 siblings, 0 replies; 119+ messages in thread
From: Andi Kleen @ 2007-07-19  9:54 UTC (permalink / raw)
  To: jh, patches, linux-kernel


gcc 4.3 supports a new __attribute__((__cold__)) to mark functions cold. Any
path directly leading to a call of such a function will be considered
unlikely, and gcc will try to generate smaller code for the function itself.

Please use with care. The code generation advantage isn't large and in most
cases it is not worth uglifying code with this.

This patch marks some common error functions like panic(), printk() and
BUG() as cold. In the longer term this will make many unlikely()s
unnecessary, although we can keep them for now for older compilers.
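
As a usage sketch (function names are made up for the example): an
error-path helper can be marked __cold, and COLD() can mark a rarely-taken
branch where a function attribute doesn't apply:

#include <linux/compiler.h>
#include <linux/kernel.h>
#include <linux/errno.h>

static void __cold report_bad_state(int code)
{
	printk(KERN_ERR "bad state: %d\n", code);
	dump_stack();
}

static int check_state(int state)
{
	if (state < 0) {
		COLD();			/* hint: this path is unlikely */
		report_bad_state(state);
		return -EINVAL;
	}
	return 0;
}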

Also all __init and __exit functions are marked cold. For a build that
doesn't use -Os this tells the compiler to generate slightly smaller code
for them. I think it currently only uses less alignment for labels,
but that might change in the future.

One disadvantage over *likely() is that they cannot be easily instrumented 
to verify them.

Another drawback is that only the latest gcc 4.3 snapshots support this. 
Unfortunately we cannot detect this using the preprocessor. This means older 
snapshots will fail now. I don't think that's a problem because they are 
unreleased compilers that nobody should be using.

gcc also has a __hot__ attribute, but I don't see any sense in using it in
the kernel right now. Someday I hope gcc will be able to apply more
aggressive optimization to hot functions even with -Os; if that happens it
should be added.

Includes compile fix from Thomas Gleixner.

TBD wait for COLD()

Cc: jh@suse.cz
Signed-off-by: Andi Kleen <ak@suse.de>

---
 include/asm-generic/bug.h     |    1 +
 include/asm-i386/bug.h        |    4 ++++
 include/asm-x86_64/bug.h      |    8 ++++++--
 include/linux/compiler-gcc4.h |   23 +++++++++++++++++++++++
 include/linux/compiler.h      |   12 ++++++++++++
 include/linux/init.h          |    8 ++++----
 include/linux/kernel.h        |    8 ++++----
 7 files changed, 54 insertions(+), 10 deletions(-)

Index: linux/include/asm-generic/bug.h
===================================================================
--- linux.orig/include/asm-generic/bug.h
+++ linux/include/asm-generic/bug.h
@@ -22,6 +22,7 @@ struct bug_entry {
 
 #ifndef HAVE_ARCH_BUG
 #define BUG() do { \
+	COLD(); \
 	printk("BUG: failure at %s:%d/%s()!\n", __FILE__, __LINE__, __FUNCTION__); \
 	panic("BUG!"); \
 } while (0)
Index: linux/include/asm-i386/bug.h
===================================================================
--- linux.orig/include/asm-i386/bug.h
+++ linux/include/asm-i386/bug.h
@@ -10,9 +10,12 @@
 #ifdef CONFIG_BUG
 #define HAVE_ARCH_BUG
 
+#include <linux/compiler.h>
+
 #ifdef CONFIG_DEBUG_BUGVERBOSE
 #define BUG()								\
 	do {								\
+		COLD();							\
 		asm volatile("1:\tud2\n"				\
 			     ".pushsection __bug_table,\"a\"\n"		\
 			     "2:\t.long 1b, %c0\n"			\
@@ -27,6 +30,7 @@
 #else
 #define BUG()								\
 	do {								\
+		COLD();							\
 		asm volatile("ud2");					\
 		for(;;) ;						\
 	} while(0)
Index: linux/include/asm-x86_64/bug.h
===================================================================
--- linux.orig/include/asm-x86_64/bug.h
+++ linux/include/asm-x86_64/bug.h
@@ -4,9 +4,12 @@
 #ifdef CONFIG_BUG
 #define HAVE_ARCH_BUG
 
+#include <linux/compiler.h>
+
 #ifdef CONFIG_DEBUG_BUGVERBOSE
 #define BUG()								\
 	do {								\
+		COLD();							\
 		asm volatile("1:\tud2\n"				\
 			     ".pushsection __bug_table,\"a\"\n"		\
 			     "2:\t.quad 1b, %c0\n"			\
@@ -20,14 +23,15 @@
 #else
 #define BUG()								\
 	do {								\
+		COLD();							\
 		asm volatile("ud2");					\
 		for(;;) ;						\
 	} while(0)
 #endif
 
-void out_of_line_bug(void);
+void out_of_line_bug(void) __cold;
 #else
-static inline void out_of_line_bug(void) { }
+static inline void out_of_line_bug(void) __cold { }
 #endif
 
 #include <asm-generic/bug.h>
Index: linux/include/linux/compiler-gcc4.h
===================================================================
--- linux.orig/include/linux/compiler-gcc4.h
+++ linux/include/linux/compiler-gcc4.h
@@ -23,3 +23,26 @@
  * code
  */
 #define uninitialized_var(x) x = x
+
+#if !(__GNUC__ == 4 && __GNUC_MINOR__ < 3)
+/* Mark functions as cold. gcc will assume any path leading to a call
+   to them will be unlikely.  This means a lot of manual unlikely()s
+   are unnecessary now for any paths leading to the usual suspects
+   like BUG(), printk(), panic() etc. [but let's keep them for now for
+   older compilers]
+
+   Early snapshots of gcc 4.3 don't support this and we can't detect this
+   in the preprocessor, but we can live with this because they're unreleased.
+   Maketime probing would be overkill here.
+
+   gcc also has a __attribute__((__hot__)) to move hot functions into
+   a special section, but I don't see any sense in this right now in
+   the kernel context */
+#define __cold			__attribute__((__cold__))
+
+/* Use this to mark a path cold that isn't a function call
+   Use with care. The code generation advantage isn't large and it is rarely
+   worth it to uglify your code with this. */
+static inline void __cold cold_inline(void) {}
+#define COLD()			cold_inline();
+#endif
Index: linux/include/linux/compiler.h
===================================================================
--- linux.orig/include/linux/compiler.h
+++ linux/include/linux/compiler.h
@@ -174,4 +174,16 @@ extern void __chk_io_ptr(const void __io
 # define __attribute_const__	/* unimplemented */
 #endif
 
+/*
+ * Tell gcc if a function is cold. The compiler will assume any path
+ * directly leading to the call is unlikely.
+ */
+
+#ifndef __cold
+#define __cold
+#endif
+#ifndef COLD
+#define COLD() do {} while (0)
+#endif
+
 #endif /* __LINUX_COMPILER_H */
Index: linux/include/linux/init.h
===================================================================
--- linux.orig/include/linux/init.h
+++ linux/include/linux/init.h
@@ -40,10 +40,10 @@
 
 /* These are for everybody (although not all archs will actually
    discard it in modules) */
-#define __init		__attribute__ ((__section__ (".init.text")))
+#define __init		__attribute__ ((__section__ (".init.text"))) __cold
 #define __initdata	__attribute__ ((__section__ (".init.data")))
 #define __exitdata	__attribute__ ((__section__(".exit.data")))
-#define __exit_call	__attribute_used__ __attribute__ ((__section__ (".exitcall.exit")))
+#define __exit_call	__attribute_used__ __attribute__ ((__section__ (".exitcall.exit"))) __cold
 
 /* modpost check for section mismatches during the kernel build.
  * A section mismatch happens when there are references from a
@@ -59,9 +59,9 @@
 #define __initdata_refok          __attribute__ ((__section__ (".data.init.refok")))
 
 #ifdef MODULE
-#define __exit		__attribute__ ((__section__(".exit.text")))
+#define __exit		__attribute__ ((__section__(".exit.text"))) __cold
 #else
-#define __exit		__attribute_used__ __attribute__ ((__section__(".exit.text")))
+#define __exit		__attribute_used__ __attribute__ ((__section__(".exit.text"))) __cold
 #endif
 
 /* For assembly routines */
Index: linux/include/linux/kernel.h
===================================================================
--- linux.orig/include/linux/kernel.h
+++ linux/include/linux/kernel.h
@@ -106,7 +106,7 @@ extern int cond_resched(void);
 extern struct atomic_notifier_head panic_notifier_list;
 extern long (*panic_blink)(long time);
 NORET_TYPE void panic(const char * fmt, ...)
-	__attribute__ ((NORET_AND format (printf, 1, 2)));
+	__attribute__ ((NORET_AND format (printf, 1, 2))) __cold;
 extern void oops_enter(void);
 extern void oops_exit(void);
 extern int oops_may_print(void);
@@ -155,14 +155,14 @@ extern void dump_thread(struct pt_regs *
 asmlinkage int vprintk(const char *fmt, va_list args)
 	__attribute__ ((format (printf, 1, 0)));
 asmlinkage int printk(const char * fmt, ...)
-	__attribute__ ((format (printf, 1, 2)));
+	__attribute__ ((format (printf, 1, 2))) __cold;
 #else
 static inline int vprintk(const char *s, va_list args)
 	__attribute__ ((format (printf, 1, 0)));
 static inline int vprintk(const char *s, va_list args) { return 0; }
 static inline int printk(const char *s, ...)
 	__attribute__ ((format (printf, 1, 2)));
-static inline int printk(const char *s, ...) { return 0; }
+static inline int __cold printk(const char *s, ...) { return 0; }
 #endif
 
 unsigned long int_sqrt(unsigned long);
@@ -212,7 +212,7 @@ extern enum system_states {
 #define TAINT_USER			(1<<6)
 #define TAINT_DIE			(1<<7)
 
-extern void dump_stack(void);
+extern void dump_stack(void) __cold;
 
 enum {
 	DUMP_PREFIX_NONE,


* [PATCH] [12/58] x86_64: Add vDSO for x86-64 with gettimeofday/clock_gettime/getcpu
  2007-07-19  9:54 [PATCH] [0/58] First batch of x86 patches for .23 Andi Kleen
                   ` (10 preceding siblings ...)
  2007-07-19  9:54 ` [PATCH] [11/58] x86: Support __attribute__((__cold__)) in gcc 4.3 Andi Kleen
@ 2007-07-19  9:54 ` Andi Kleen
  2007-08-21 16:25   ` Daniel Walker
  2007-07-19  9:54 ` [PATCH] [13/58] x86: Separate checking of unsynchronized and unstable TSC Andi Kleen
                   ` (45 subsequent siblings)
  57 siblings, 1 reply; 119+ messages in thread
From: Andi Kleen @ 2007-07-19  9:54 UTC (permalink / raw)
  To: patches, linux-kernel


This implements a new vDSO for x86-64. The concept is similar to the
existing vDSOs on i386 and PPC. x86-64 has had static vsyscalls before,
but these are not flexible enough anymore.

A vDSO is an ELF shared library supplied by the kernel that is mapped into
user address space. The vDSO mapping is randomized for each process
for security reasons.

Doing this was needed for clock_gettime, because clock_gettime
always needs a syscall fallback and having one at a fixed
address would have made buffer overflow exploits too easy to write.

The vDSO can be disabled with vdso=0.

It currently includes a new gettimeofday implementation and an optimized
clock_gettime(). The gettimeofday implementation is slightly faster than
the one in the old vsyscall. clock_gettime is significantly faster than
the syscall for CLOCK_MONOTONIC and CLOCK_REALTIME.

The new calls are generally faster than the old vsyscall. 
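
A user-space sketch (not part of the patch): once glibc learns to resolve
these calls against the vDSO's LINUX_2.6 exports, a program like this runs
without entering the kernel for CLOCK_MONOTONIC and CLOCK_REALTIME:

#include <stdio.h>
#include <time.h>
#include <sys/time.h>

int main(void)
{
	struct timespec ts;
	struct timeval tv;

	clock_gettime(CLOCK_MONOTONIC, &ts);	/* __vdso_clock_gettime */
	gettimeofday(&tv, NULL);		/* __vdso_gettimeofday */
	printf("mono %ld.%09ld  wall %ld.%06ld\n",
	       (long)ts.tv_sec, ts.tv_nsec,
	       (long)tv.tv_sec, (long)tv.tv_usec);
	return 0;
}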

Advantages over the old x86-64 vsyscalls:
- Extensible
- Randomized
- Cleaner
- Easier to virtualize (the old static address range previously caused
overhead, e.g. for Xen, because it had to create special page tables for it)

Weak points: 
- glibc support still to be written

The VM interface is partly based on Ingo Molnar's i386 version.

Includes compile fix from Joachim Deguara

Signed-off-by: Andi Kleen <ak@suse.de>

---
 Documentation/kernel-parameters.txt |    2 
 arch/x86_64/Makefile                |    3 
 arch/x86_64/ia32/ia32_binfmt.c      |    1 
 arch/x86_64/kernel/time.c           |    1 
 arch/x86_64/kernel/vmlinux.lds.S    |    9 ++
 arch/x86_64/kernel/vsyscall.c       |   22 +----
 arch/x86_64/mm/init.c               |    9 ++
 arch/x86_64/vdso/Makefile           |   49 ++++++++++++
 arch/x86_64/vdso/vclock_gettime.c   |  120 +++++++++++++++++++++++++++++++
 arch/x86_64/vdso/vdso-note.S        |   12 +++
 arch/x86_64/vdso/vdso-start.S       |    2 
 arch/x86_64/vdso/vdso.S             |    2 
 arch/x86_64/vdso/vdso.lds.S         |   77 +++++++++++++++++++
 arch/x86_64/vdso/vextern.h          |   16 ++++
 arch/x86_64/vdso/vgetcpu.c          |   50 ++++++++++++
 arch/x86_64/vdso/vma.c              |  139 ++++++++++++++++++++++++++++++++++++
 arch/x86_64/vdso/voffset.h          |    1 
 arch/x86_64/vdso/vvar.c             |   12 +++
 include/asm-x86_64/auxvec.h         |    2 
 include/asm-x86_64/elf.h            |   13 +++
 include/asm-x86_64/mmu.h            |    1 
 include/asm-x86_64/vgtod.h          |   29 +++++++
 include/asm-x86_64/vsyscall.h       |    3 
 23 files changed, 554 insertions(+), 21 deletions(-)

Index: linux/arch/x86_64/ia32/ia32_binfmt.c
===================================================================
--- linux.orig/arch/x86_64/ia32/ia32_binfmt.c
+++ linux/arch/x86_64/ia32/ia32_binfmt.c
@@ -38,6 +38,7 @@
 
 int sysctl_vsyscall32 = 1;
 
+#undef ARCH_DLINFO
 #define ARCH_DLINFO do {  \
 	if (sysctl_vsyscall32) { \
 	NEW_AUX_ENT(AT_SYSINFO, (u32)(u64)VSYSCALL32_VSYSCALL); \
Index: linux/arch/x86_64/kernel/vmlinux.lds.S
===================================================================
--- linux.orig/arch/x86_64/kernel/vmlinux.lds.S
+++ linux/arch/x86_64/kernel/vmlinux.lds.S
@@ -91,6 +91,9 @@ SECTIONS
   .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data))
 		{ *(.vsyscall_gtod_data) }
   vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data);
+  .vsyscall_clock : AT(VLOAD(.vsyscall_clock))
+		{ *(.vsyscall_clock) }
+  vsyscall_clock = VVIRT(.vsyscall_clock);
 
 
   .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1))
@@ -187,6 +190,12 @@ SECTIONS
   .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { *(.exit.text) }
   .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { *(.exit.data) }
 
+/* vdso blob that is mapped into user space */
+  vdso_start = . ;
+  .vdso  : AT(ADDR(.vdso) - LOAD_OFFSET) { *(.vdso) }
+  . = ALIGN(4096);
+  vdso_end = .;
+
 #ifdef CONFIG_BLK_DEV_INITRD
   . = ALIGN(4096);
   __initramfs_start = .;
Index: linux/arch/x86_64/mm/init.c
===================================================================
--- linux.orig/arch/x86_64/mm/init.c
+++ linux/arch/x86_64/mm/init.c
@@ -774,3 +774,12 @@ void *alloc_bootmem_high_node(pg_data_t 
 	return __alloc_bootmem_core(pgdat->bdata, size,
 			SMP_CACHE_BYTES, (4UL*1024*1024*1024), 0);
 }
+
+const char *arch_vma_name(struct vm_area_struct *vma)
+{
+	if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso)
+		return "[vdso]";
+	if (vma == &gate_vma)
+		return "[vsyscall]";
+	return NULL;
+}
Index: linux/arch/x86_64/vdso/vdso-note.S
===================================================================
--- /dev/null
+++ linux/arch/x86_64/vdso/vdso-note.S
@@ -0,0 +1,12 @@
+/*
+ * This supplies .note.* sections to go into the PT_NOTE inside the vDSO text.
+ * Here we can supply some information useful to userland.
+ */
+
+#include <linux/uts.h>
+#include <linux/version.h>
+#include <linux/elfnote.h>
+
+ELFNOTE_START(Linux, 0, "a")
+	.long LINUX_VERSION_CODE
+ELFNOTE_END
Index: linux/arch/x86_64/vdso/vdso.lds.S
===================================================================
--- /dev/null
+++ linux/arch/x86_64/vdso/vdso.lds.S
@@ -0,0 +1,77 @@
+/*
+ * Linker script for vsyscall DSO.  The vsyscall page is an ELF shared
+ * object prelinked to its virtual address, and with only one read-only
+ * segment (that fits in one page).  This script controls its layout.
+ */
+#include <asm/asm-offsets.h>
+#include "voffset.h"
+
+#define VDSO_PRELINK 0xffffffffff700000
+
+SECTIONS
+{
+  . = VDSO_PRELINK + SIZEOF_HEADERS;
+
+  .hash           : { *(.hash) }		:text
+  .gnu.hash       : { *(.gnu.hash) }
+  .dynsym         : { *(.dynsym) }
+  .dynstr         : { *(.dynstr) }
+  .gnu.version    : { *(.gnu.version) }
+  .gnu.version_d  : { *(.gnu.version_d) }
+  .gnu.version_r  : { *(.gnu.version_r) }
+
+  /* This linker script is used both with -r and with -shared.
+     For the layouts to match, we need to skip more than enough
+     space for the dynamic symbol table et al.  If this amount
+     is insufficient, ld -shared will barf.  Just increase it here.  */
+  . = VDSO_PRELINK + VDSO_TEXT_OFFSET;
+
+  .text           : { *(.text) }		:text
+  .text.ptr       : { *(.text.ptr) }		:text
+  . = VDSO_PRELINK + 0x900;
+  .data           : { *(.data) }		:text
+  .bss            : { *(.bss) }			:text
+
+  .altinstructions : { *(.altinstructions) }			:text
+  .altinstr_replacement  : { *(.altinstr_replacement) }	:text
+
+  .note		  : { *(.note.*) }		:text :note
+  .eh_frame_hdr   : { *(.eh_frame_hdr) }	:text :eh_frame_hdr
+  .eh_frame       : { KEEP (*(.eh_frame)) }	:text
+  .dynamic        : { *(.dynamic) }		:text :dynamic
+  .useless        : {
+  	*(.got.plt) *(.got)
+	*(.gnu.linkonce.d.*)
+	*(.dynbss)
+	*(.gnu.linkonce.b.*)
+  }						:text
+}
+
+/*
+ * We must supply the ELF program headers explicitly to get just one
+ * PT_LOAD segment, and set the flags explicitly to make segments read-only.
+ */
+PHDRS
+{
+  text PT_LOAD FILEHDR PHDRS FLAGS(5); /* PF_R|PF_X */
+  dynamic PT_DYNAMIC FLAGS(4); /* PF_R */
+  note PT_NOTE FLAGS(4); /* PF_R */
+  eh_frame_hdr 0x6474e550; /* PT_GNU_EH_FRAME, but ld doesn't match the name */
+}
+
+/*
+ * This controls what symbols we export from the DSO.
+ */
+VERSION
+{
+  LINUX_2.6 {
+    global:
+	clock_gettime;
+	__vdso_clock_gettime;
+	gettimeofday;
+	__vdso_gettimeofday;
+	getcpu;
+	__vdso_getcpu;
+    local: *;
+  };
+}
Index: linux/arch/x86_64/vdso/Makefile
===================================================================
--- /dev/null
+++ linux/arch/x86_64/vdso/Makefile
@@ -0,0 +1,49 @@
+#
+# x86-64 vDSO.
+#
+
+# files to link into the vdso
+# vdso-start.o has to be first
+vobjs-y := vdso-start.o vdso-note.o vclock_gettime.o vgetcpu.o vvar.o
+
+# files to link into kernel
+obj-y := vma.o vdso.o vdso-syms.o
+
+vobjs := $(foreach F,$(vobjs-y),$(obj)/$F)
+
+$(obj)/vdso.o: $(obj)/vdso.so
+
+targets += vdso.so vdso.lds $(vobjs-y) vdso-syms.o
+
+# The DSO images are built using a special linker script.
+quiet_cmd_syscall = SYSCALL $@
+      cmd_syscall = $(CC) -m elf_x86_64 -nostdlib $(SYSCFLAGS_$(@F)) \
+		          -Wl,-T,$(filter-out FORCE,$^) -o $@
+
+export CPPFLAGS_vdso.lds += -P -C -U$(ARCH)
+
+vdso-flags = -fPIC -shared -Wl,-soname=linux-vdso.so.1 \
+		 $(call ld-option, -Wl$(comma)--hash-style=sysv) \
+		-Wl,-z,max-page-size=4096 -Wl,-z,common-page-size=4096
+SYSCFLAGS_vdso.so = $(vdso-flags)
+
+$(obj)/vdso.o: $(src)/vdso.S $(obj)/vdso.so
+
+$(obj)/vdso.so: $(src)/vdso.lds $(vobjs) FORCE
+	$(call if_changed,syscall)
+
+CF := $(PROFILING) -mcmodel=small -fPIC -g0 -O2 -fasynchronous-unwind-tables -m64
+
+$(obj)/vclock_gettime.o: CFLAGS = $(CF)
+$(obj)/vgetcpu.o: CFLAGS = $(CF)
+
+# We also create a special relocatable object that should mirror the symbol
+# table and layout of the linked DSO.  With ld -R we can then refer to
+# these symbols in the kernel code rather than hand-coded addresses.
+extra-y += vdso-syms.o
+$(obj)/built-in.o: $(obj)/vdso-syms.o
+$(obj)/built-in.o: ld_flags += -R $(obj)/vdso-syms.o
+
+SYSCFLAGS_vdso-syms.o = -r -d
+$(obj)/vdso-syms.o: $(src)/vdso.lds $(vobjs) FORCE
+	$(call if_changed,syscall)
Index: linux/arch/x86_64/vdso/vclock_gettime.c
===================================================================
--- /dev/null
+++ linux/arch/x86_64/vdso/vclock_gettime.c
@@ -0,0 +1,120 @@
+/*
+ * Copyright 2006 Andi Kleen, SUSE Labs.
+ * Subject to the GNU Public License, v.2
+ *
+ * Fast user context implementation of clock_gettime and gettimeofday.
+ *
+ * The code should have no internal unresolved relocations.
+ * Check with readelf after changing.
+ * Also alternative() doesn't work.
+ */
+
+#include <linux/kernel.h>
+#include <linux/posix-timers.h>
+#include <linux/time.h>
+#include <linux/string.h>
+#include <asm/vsyscall.h>
+#include <asm/vgtod.h>
+#include <asm/timex.h>
+#include <asm/hpet.h>
+#include <asm/unistd.h>
+#include <asm/io.h>
+#include <asm/vgtod.h>
+#include "vextern.h"
+
+#define gtod vdso_vsyscall_gtod_data
+
+static long vdso_fallback_gettime(long clock, struct timespec *ts)
+{
+	long ret;
+	asm("syscall" : "=a" (ret) :
+	    "0" (__NR_clock_gettime),"D" (clock), "S" (ts) : "memory");
+	return ret;
+}
+
+static inline long vgetns(void)
+{
+	cycles_t (*vread)(void);
+	vread = gtod->clock.vread;
+	return ((vread() - gtod->clock.cycle_last) * gtod->clock.mult) >>
+		gtod->clock.shift;
+}
+
+static noinline int do_realtime(struct timespec *ts)
+{
+	unsigned long seq, ns;
+	do {
+		seq = read_seqbegin(&gtod->lock);
+		ts->tv_sec = gtod->wall_time_sec;
+		ts->tv_nsec = gtod->wall_time_nsec;
+		ns = vgetns();
+	} while (unlikely(read_seqretry(&gtod->lock, seq)));
+	timespec_add_ns(ts, ns);
+	return 0;
+}
+
+/* Copy of the version in kernel/time.c which we cannot directly access */
+static void vset_normalized_timespec(struct timespec *ts, long sec, long nsec)
+{
+	while (nsec >= NSEC_PER_SEC) {
+		nsec -= NSEC_PER_SEC;
+		++sec;
+	}
+	while (nsec < 0) {
+		nsec += NSEC_PER_SEC;
+		--sec;
+	}
+	ts->tv_sec = sec;
+	ts->tv_nsec = nsec;
+}
+
+static noinline int do_monotonic(struct timespec *ts)
+{
+	unsigned long seq, ns, secs;
+	do {
+		seq = read_seqbegin(&gtod->lock);
+		secs = gtod->wall_time_sec;
+		ns = gtod->wall_time_nsec + vgetns();
+		secs += gtod->wall_to_monotonic.tv_sec;
+		ns += gtod->wall_to_monotonic.tv_nsec;
+	} while (unlikely(read_seqretry(&gtod->lock, seq)));
+	vset_normalized_timespec(ts, secs, ns);
+	return 0;
+}
+
+int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
+{
+	if (likely(gtod->sysctl_enabled && gtod->clock.vread))
+		switch (clock) {
+		case CLOCK_REALTIME:
+			return do_realtime(ts);
+		case CLOCK_MONOTONIC:
+			return do_monotonic(ts);
+		}
+	return vdso_fallback_gettime(clock, ts);
+}
+int clock_gettime(clockid_t, struct timespec *)
+	__attribute__((weak, alias("__vdso_clock_gettime")));
+
+int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz)
+{
+	long ret;
+	if (likely(gtod->sysctl_enabled && gtod->clock.vread)) {
+		BUILD_BUG_ON(offsetof(struct timeval, tv_usec) !=
+			     offsetof(struct timespec, tv_nsec) ||
+			     sizeof(*tv) != sizeof(struct timespec));
+		do_realtime((struct timespec *)tv);
+		tv->tv_usec /= 1000;
+		if (unlikely(tz != NULL)) {
+			/* This relies on gcc inlining the memcpy. We'll notice
+			   if it ever fails to do so. */
+			memcpy(tz, &gtod->sys_tz, sizeof(struct timezone));
+		}
+		return 0;
+	}
+	asm("syscall" : "=a" (ret) :
+	    "0" (__NR_gettimeofday), "D" (tv), "S" (tz) : "memory");
+	return ret;
+}
+int gettimeofday(struct timeval *, struct timezone *)
+	__attribute__((weak, alias("__vdso_gettimeofday")));
Index: linux/arch/x86_64/vdso/vma.c
===================================================================
--- /dev/null
+++ linux/arch/x86_64/vdso/vma.c
@@ -0,0 +1,139 @@
+/*
+ * Set up the VMAs to tell the VM about the vDSO.
+ * Copyright 2007 Andi Kleen, SUSE Labs.
+ * Subject to the GPL, v.2
+ */
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/random.h>
+#include <asm/vsyscall.h>
+#include <asm/vgtod.h>
+#include <asm/proto.h>
+#include "voffset.h"
+
+int vdso_enabled = 1;
+
+#define VEXTERN(x) extern typeof(__ ## x) *vdso_ ## x;
+#include "vextern.h"
+#undef VEXTERN
+
+extern char vdso_kernel_start[], vdso_start[], vdso_end[];
+extern unsigned short vdso_sync_cpuid;
+
+struct page **vdso_pages;
+
+static inline void *var_ref(void *vbase, char *var, char *name)
+{
+	unsigned offset = var - &vdso_kernel_start[0] + VDSO_TEXT_OFFSET;
+	void *p = vbase + offset;
+	if (*(void **)p != (void *)VMAGIC) {
+		printk("VDSO: variable %s broken\n", name);
+		vdso_enabled = 0;
+	}
+	return p;
+}
+
+static int __init init_vdso_vars(void)
+{
+	int npages = (vdso_end - vdso_start + PAGE_SIZE - 1) / PAGE_SIZE;
+	int i;
+	char *vbase;
+
+	vdso_pages = kmalloc(sizeof(struct page *) * npages, GFP_KERNEL);
+	if (!vdso_pages)
+		goto oom;
+	for (i = 0; i < npages; i++) {
+		struct page *p;
+		p = alloc_page(GFP_KERNEL);
+		if (!p)
+			goto oom;
+		vdso_pages[i] = p;
+		copy_page(page_address(p), vdso_start + i*PAGE_SIZE);
+	}
+
+	vbase = vmap(vdso_pages, npages, 0, PAGE_KERNEL);
+	if (!vbase)
+		goto oom;
+
+	if (memcmp(vbase, "\177ELF", 4)) {
+		printk("VDSO: I'm broken; not ELF\n");
+		vdso_enabled = 0;
+	}
+
+#define V(x) *(typeof(x) *) var_ref(vbase, (char *)RELOC_HIDE(&x, 0), #x)
+#define VEXTERN(x) \
+	V(vdso_ ## x) = &__ ## x;
+#include "vextern.h"
+#undef VEXTERN
+	return 0;
+
+ oom:
+	printk("Cannot allocate vdso\n");
+	vdso_enabled = 0;
+	return -ENOMEM;
+}
+__initcall(init_vdso_vars);
+
+struct linux_binprm;
+
+/* Put the vdso above the (randomized) stack with another randomized offset.
+   This way there is no hole in the middle of address space.
+   To save memory make sure it is still in the same PTE as the stack top.
+   This doesn't give that many random bits */
+static unsigned long vdso_addr(unsigned long start, unsigned len)
+{
+	unsigned long addr, end;
+	unsigned offset;
+	end = (start + PMD_SIZE - 1) & PMD_MASK;
+	if (end >= TASK_SIZE64)
+		end = TASK_SIZE64;
+	end -= len;
+	/* This loses some more bits than a modulo, but is cheaper */
+	offset = get_random_int() & (PTRS_PER_PTE - 1);
+	addr = start + (offset << PAGE_SHIFT);
+	if (addr >= end)
+		addr = end;
+	return addr;
+}
+
+/* Setup a VMA at program startup for the vsyscall page.
+   Not called for compat tasks */
+int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack)
+{
+	struct mm_struct *mm = current->mm;
+	unsigned long addr;
+	int ret;
+	unsigned len = round_up(vdso_end - vdso_start, PAGE_SIZE);
+
+	if (!vdso_enabled)
+		return 0;
+
+	down_write(&mm->mmap_sem);
+	addr = vdso_addr(mm->start_stack, len);
+	addr = get_unmapped_area(NULL, addr, len, 0, 0);
+	if (IS_ERR_VALUE(addr)) {
+		ret = addr;
+		goto up_fail;
+	}
+
+	ret = install_special_mapping(mm, addr, len,
+				      VM_READ|VM_EXEC|
+				      VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC|
+				      VM_ALWAYSDUMP,
+				      vdso_pages);
+	if (ret)
+		goto up_fail;
+
+	current->mm->context.vdso = (void *)addr;
+up_fail:
+	up_write(&mm->mmap_sem);
+	return ret;
+}
+
+static __init int vdso_setup(char *s)
+{
+	vdso_enabled = simple_strtoul(s, NULL, 0);
+	return 0;
+}
+__setup("vdso=", vdso_setup);
Index: linux/arch/x86_64/vdso/vdso.S
===================================================================
--- /dev/null
+++ linux/arch/x86_64/vdso/vdso.S
@@ -0,0 +1,2 @@
+	.section ".vdso","a"
+	.incbin "arch/x86_64/vdso/vdso.so"
Index: linux/arch/x86_64/vdso/vdso-start.S
===================================================================
--- /dev/null
+++ linux/arch/x86_64/vdso/vdso-start.S
@@ -0,0 +1,2 @@
+	.globl vdso_kernel_start
+vdso_kernel_start:
Index: linux/arch/x86_64/Makefile
===================================================================
--- linux.orig/arch/x86_64/Makefile
+++ linux/arch/x86_64/Makefile
@@ -79,7 +79,8 @@ head-y := arch/x86_64/kernel/head.o arch
 libs-y 					+= arch/x86_64/lib/
 core-y					+= arch/x86_64/kernel/ \
 					   arch/x86_64/mm/ \
-					   arch/x86_64/crypto/
+					   arch/x86_64/crypto/ \
+					   arch/x86_64/vdso/
 core-$(CONFIG_IA32_EMULATION)		+= arch/x86_64/ia32/
 drivers-$(CONFIG_PCI)			+= arch/x86_64/pci/
 drivers-$(CONFIG_OPROFILE)		+= arch/x86_64/oprofile/
Index: linux/include/asm-x86_64/mmu.h
===================================================================
--- linux.orig/include/asm-x86_64/mmu.h
+++ linux/include/asm-x86_64/mmu.h
@@ -15,6 +15,7 @@ typedef struct { 
 	rwlock_t ldtlock; 
 	int size;
 	struct semaphore sem; 
+	void *vdso;
 } mm_context_t;
 
 #endif
Index: linux/include/asm-x86_64/vsyscall.h
===================================================================
--- linux.orig/include/asm-x86_64/vsyscall.h
+++ linux/include/asm-x86_64/vsyscall.h
@@ -22,6 +22,8 @@ enum vsyscall_num {
 /* Definitions for CONFIG_GENERIC_TIME definitions */
 #define __section_vsyscall_gtod_data __attribute__ \
 	((unused, __section__ (".vsyscall_gtod_data"),aligned(16)))
+#define __section_vsyscall_clock __attribute__ \
+	((unused, __section__ (".vsyscall_clock"),aligned(16)))
 #define __vsyscall_fn __attribute__ ((unused,__section__(".vsyscall_fn")))
 
 #define VGETCPU_RDTSCP	1
@@ -36,7 +38,6 @@ extern volatile unsigned long __jiffies;
 /* kernel space (writeable) */
 extern int vgetcpu_mode;
 extern struct timezone sys_tz;
-extern struct vsyscall_gtod_data_t vsyscall_gtod_data;
 
 #endif /* __KERNEL__ */
 
Index: linux/include/asm-x86_64/auxvec.h
===================================================================
--- linux.orig/include/asm-x86_64/auxvec.h
+++ linux/include/asm-x86_64/auxvec.h
@@ -1,4 +1,6 @@
 #ifndef __ASM_X86_64_AUXVEC_H
 #define __ASM_X86_64_AUXVEC_H
 
+#define AT_SYSINFO_EHDR		33
+
 #endif
Index: linux/include/asm-x86_64/elf.h
===================================================================
--- linux.orig/include/asm-x86_64/elf.h
+++ linux/include/asm-x86_64/elf.h
@@ -162,6 +162,19 @@ extern int dump_task_fpu (struct task_st
 /* 1GB for 64bit, 8MB for 32bit */
 #define STACK_RND_MASK (test_thread_flag(TIF_IA32) ? 0x7ff : 0x3fffff)
 
+
+#define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1
+struct linux_binprm;
+extern int arch_setup_additional_pages(struct linux_binprm *bprm,
+                                       int executable_stack);
+
+extern int vdso_enabled;
+
+#define ARCH_DLINFO						\
+do if (vdso_enabled) {						\
+	NEW_AUX_ENT(AT_SYSINFO_EHDR,(unsigned long)current->mm->context.vdso);\
+} while (0)
+
 #endif
 
 #endif
Index: linux/arch/x86_64/vdso/vextern.h
===================================================================
--- /dev/null
+++ linux/arch/x86_64/vdso/vextern.h
@@ -0,0 +1,16 @@
+#ifndef VEXTERN
+#include <asm/vsyscall.h>
+#define VEXTERN(x) \
+	extern typeof(x) *vdso_ ## x __attribute__((visibility("hidden")));
+#endif
+
+#define VMAGIC 0xfeedbabeabcdefabUL
+
+/* Any kernel variables used in the vDSO must be exported in the main
+   kernel's vmlinux.lds.S/vsyscall.h/proper __section and
+   put into vextern.h and be referenced as a pointer with vdso prefix.
+   The main kernel later fills in the values.   */
+
+VEXTERN(jiffies)
+VEXTERN(vgetcpu_mode)
+VEXTERN(vsyscall_gtod_data)
Index: linux/arch/x86_64/vdso/vgetcpu.c
===================================================================
--- /dev/null
+++ linux/arch/x86_64/vdso/vgetcpu.c
@@ -0,0 +1,50 @@
+/*
+ * Copyright 2006 Andi Kleen, SUSE Labs.
+ * Subject to the GNU Public License, v.2
+ *
+ * Fast user context implementation of getcpu()
+ */
+
+#include <linux/kernel.h>
+#include <linux/getcpu.h>
+#include <linux/jiffies.h>
+#include <linux/time.h>
+#include <asm/vsyscall.h>
+#include <asm/vgtod.h>
+#include "vextern.h"
+
+long __vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
+{
+	unsigned int dummy, p;
+	unsigned long j = 0;
+
+	/* Fast cache - only recompute value once per jiffies and avoid
+	   relatively costly rdtscp/cpuid otherwise.
+	   This works because the scheduler usually keeps the process
+	   on the same CPU and this syscall doesn't guarantee its
+	   results anyways.
+	   We do this here because otherwise user space would do it on
+	   its own in a likely inferior way (no access to jiffies).
+	   If you don't like it pass NULL. */
+	if (tcache && tcache->blob[0] == (j = *vdso_jiffies)) {
+		p = tcache->blob[1];
+	} else if (*vdso_vgetcpu_mode == VGETCPU_RDTSCP) {
+		/* Load per CPU data from RDTSCP */
+		rdtscp(dummy, dummy, p);
+	} else {
+		/* Load per CPU data from GDT */
+		asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
+	}
+	if (tcache) {
+		tcache->blob[0] = j;
+		tcache->blob[1] = p;
+	}
+	if (cpu)
+		*cpu = p & 0xfff;
+	if (node)
+		*node = p >> 12;
+	return 0;
+}
+
+long getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
+	__attribute__((weak, alias("__vdso_getcpu")));
Index: linux/arch/x86_64/vdso/vvar.c
===================================================================
--- /dev/null
+++ linux/arch/x86_64/vdso/vvar.c
@@ -0,0 +1,12 @@
+/* Define pointer to external vDSO variables.
+   These are part of the vDSO. The kernel fills in the real addresses
+   at boot time. This is done because when the vdso is linked the
+   kernel isn't yet and we don't know the final addresses. */
+#include <linux/kernel.h>
+#include <linux/time.h>
+#include <asm/vsyscall.h>
+#include <asm/timex.h>
+#include <asm/vgtod.h>
+
+#define VEXTERN(x) typeof (__ ## x) *vdso_ ## x = (void *)VMAGIC;
+#include "vextern.h"
Index: linux/Documentation/kernel-parameters.txt
===================================================================
--- linux.orig/Documentation/kernel-parameters.txt
+++ linux/Documentation/kernel-parameters.txt
@@ -1880,7 +1880,7 @@ and is between 256 and 4096 characters. 
 	usbhid.mousepoll=
 			[USBHID] The interval which mice are to be polled at.
 
-	vdso=		[IA-32,SH]
+	vdso=		[IA-32,SH,x86-64]
 			vdso=2: enable compat VDSO (default with COMPAT_VDSO)
 			vdso=1: enable VDSO (default)
 			vdso=0: disable VDSO mapping
Index: linux/arch/x86_64/kernel/vsyscall.c
===================================================================
--- linux.orig/arch/x86_64/kernel/vsyscall.c
+++ linux/arch/x86_64/kernel/vsyscall.c
@@ -42,6 +42,7 @@
 #include <asm/segment.h>
 #include <asm/desc.h>
 #include <asm/topology.h>
+#include <asm/vgtod.h>
 
 #define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
 #define __syscall_clobber "r11","rcx","memory"
@@ -57,26 +58,9 @@
  * - writen by timer interrupt or systcl (/proc/sys/kernel/vsyscall64)
  * Try to keep this structure as small as possible to avoid cache line ping pongs
  */
-struct vsyscall_gtod_data_t {
-	seqlock_t	lock;
-
-	/* open coded 'struct timespec' */
-	time_t		wall_time_sec;
-	u32		wall_time_nsec;
-
-	int		sysctl_enabled;
-	struct timezone sys_tz;
-	struct { /* extract of a clocksource struct */
-		cycle_t (*vread)(void);
-		cycle_t	cycle_last;
-		cycle_t	mask;
-		u32	mult;
-		u32	shift;
-	} clock;
-};
 int __vgetcpu_mode __section_vgetcpu_mode;
 
-struct vsyscall_gtod_data_t __vsyscall_gtod_data __section_vsyscall_gtod_data =
+struct vsyscall_gtod_data __vsyscall_gtod_data __section_vsyscall_gtod_data =
 {
 	.lock = SEQLOCK_UNLOCKED,
 	.sysctl_enabled = 1,
@@ -96,6 +80,8 @@ void update_vsyscall(struct timespec *wa
 	vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
 	vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
 	vsyscall_gtod_data.sys_tz = sys_tz;
+	vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
+	vsyscall_gtod_data.wall_to_monotonic = wall_to_monotonic;
 	write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
 }
 
Index: linux/arch/x86_64/kernel/time.c
===================================================================
--- linux.orig/arch/x86_64/kernel/time.c
+++ linux/arch/x86_64/kernel/time.c
@@ -44,6 +44,7 @@
 #include <asm/hpet.h>
 #include <asm/mpspec.h>
 #include <asm/nmi.h>
+#include <asm/vgtod.h>
 
 static char *timename = NULL;
 
Index: linux/include/asm-x86_64/vgtod.h
===================================================================
--- /dev/null
+++ linux/include/asm-x86_64/vgtod.h
@@ -0,0 +1,29 @@
+#ifndef _ASM_VGTOD_H
+#define _ASM_VGTOD_H 1
+
+#include <asm/vsyscall.h>
+#include <linux/clocksource.h>
+
+struct vsyscall_gtod_data {
+	seqlock_t	lock;
+
+	/* open coded 'struct timespec' */
+	time_t		wall_time_sec;
+	u32		wall_time_nsec;
+
+	int		sysctl_enabled;
+	struct timezone sys_tz;
+	struct { /* extract of a clocksource struct */
+		cycle_t (*vread)(void);
+		cycle_t	cycle_last;
+		cycle_t	mask;
+		u32	mult;
+		u32	shift;
+	} clock;
+	struct timespec wall_to_monotonic;
+};
+extern struct vsyscall_gtod_data __vsyscall_gtod_data
+__section_vsyscall_gtod_data;
+extern struct vsyscall_gtod_data vsyscall_gtod_data;
+
+#endif
Index: linux/arch/x86_64/vdso/voffset.h
===================================================================
--- /dev/null
+++ linux/arch/x86_64/vdso/voffset.h
@@ -0,0 +1 @@
+#define VDSO_TEXT_OFFSET 0x500
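
A minimal user-space sanity check for the new vDSO (illustrative only, not
part of the patch): it looks for AT_SYSINFO_EHDR (33, as defined above) in
/proc/self/auxv and then calls clock_gettime(), which a vDSO-aware C library
can route through the exported __vdso_clock_gettime. Assumes /proc is
mounted; link with -lrt on older glibc.

/* Hedged example: confirm the kernel advertises the vDSO and read a clock.
   AT_SYSINFO_EHDR and the exported symbol names come from the patch above;
   everything else here is illustrative. */
#include <stdio.h>
#include <time.h>

int main(void)
{
	unsigned long auxv[2];
	FILE *f = fopen("/proc/self/auxv", "r");
	struct timespec ts;

	if (f) {
		while (fread(auxv, sizeof(auxv), 1, f) == 1)
			if (auxv[0] == 33)	/* AT_SYSINFO_EHDR */
				printf("vDSO mapped at 0x%lx\n", auxv[1]);
		fclose(f);
	}

	if (clock_gettime(CLOCK_MONOTONIC, &ts) == 0)
		printf("CLOCK_MONOTONIC %ld.%09ld\n",
		       (long)ts.tv_sec, ts.tv_nsec);
	return 0;
}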

^ permalink raw reply	[flat|nested] 119+ messages in thread

* [PATCH] [13/58] x86: Separate checking of unsynchronized and unstable TSC
  2007-07-19  9:54 [PATCH] [0/58] First batch of x86 patches for .23 Andi Kleen
                   ` (11 preceding siblings ...)
  2007-07-19  9:54 ` [PATCH] [12/58] x86_64: Add vDSO for x86-64 with gettimeofday/clock_gettime/getcpu Andi Kleen
@ 2007-07-19  9:54 ` Andi Kleen
  2007-07-19  9:54 ` [PATCH] [14/58] x86_64: Add on_cpu_single Andi Kleen
                   ` (44 subsequent siblings)
  57 siblings, 0 replies; 119+ messages in thread
From: Andi Kleen @ 2007-07-19  9:54 UTC (permalink / raw)
  To: patches, linux-kernel


Preparatory patch for the new sched_clock()/printk_clock().

Signed-off-by: Andi Kleen <ak@suse.de>

---
 arch/i386/kernel/tsc.c   |    9 ++++-----
 arch/x86_64/kernel/tsc.c |    5 +----
 2 files changed, 5 insertions(+), 9 deletions(-)

Index: linux/arch/i386/kernel/tsc.c
===================================================================
--- linux.orig/arch/i386/kernel/tsc.c
+++ linux/arch/i386/kernel/tsc.c
@@ -331,7 +331,7 @@ static struct dmi_system_id __initdata b
  */
 __cpuinit int unsynchronized_tsc(void)
 {
-	if (!cpu_has_tsc || tsc_unstable)
+	if (!cpu_has_tsc)
 		return 1;
 	/*
 	 * Intel systems are normally all synchronized.
@@ -340,9 +340,9 @@ __cpuinit int unsynchronized_tsc(void)
 	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) {
 		/* assume multi socket systems are not synchronized: */
 		if (num_possible_cpus() > 1)
-			tsc_unstable = 1;
+			return 1;
 	}
-	return tsc_unstable;
+	return 0;
 }
 
 /*
@@ -386,13 +386,12 @@ void __init tsc_init(void)
 	/* Check and install the TSC clocksource */
 	dmi_check_system(bad_tsc_dmi_table);
 
-	unsynchronized_tsc();
 	check_geode_tsc_reliable();
 	current_tsc_khz = tsc_khz;
 	clocksource_tsc.mult = clocksource_khz2mult(current_tsc_khz,
 							clocksource_tsc.shift);
 	/* lower the rating if we already know its unstable: */
-	if (check_tsc_unstable()) {
+	if (check_tsc_unstable() || unsynchronized_tsc()) {
 		clocksource_tsc.rating = 0;
 		clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS;
 	} else
Index: linux/arch/x86_64/kernel/tsc.c
===================================================================
--- linux.orig/arch/x86_64/kernel/tsc.c
+++ linux/arch/x86_64/kernel/tsc.c
@@ -144,9 +144,6 @@ static int tsc_unstable = 0;
  */
 __cpuinit int unsynchronized_tsc(void)
 {
-	if (tsc_unstable)
-		return 1;
-
 #ifdef CONFIG_SMP
 	if (apic_is_clustered_box())
 		return 1;
@@ -218,7 +215,7 @@ void __init init_tsc_clocksource(void)
 	if (!notsc) {
 		clocksource_tsc.mult = clocksource_khz2mult(tsc_khz,
 							clocksource_tsc.shift);
-		if (check_tsc_unstable())
+		if (unsynchronized_tsc() || check_tsc_unstable())
 			clocksource_tsc.rating = 0;
 
 		clocksource_register(&clocksource_tsc);

^ permalink raw reply	[flat|nested] 119+ messages in thread

* [PATCH] [14/58] x86_64: Add on_cpu_single
  2007-07-19  9:54 [PATCH] [0/58] First batch of x86 patches for .23 Andi Kleen
                   ` (12 preceding siblings ...)
  2007-07-19  9:54 ` [PATCH] [13/58] x86: Separate checking of unsynchronized and unstable TSC Andi Kleen
@ 2007-07-19  9:54 ` Andi Kleen
  2007-07-19 11:09   ` Satyam Sharma
  2007-07-19  9:54 ` [PATCH] [15/58] i386: Rewrite sched_clock Andi Kleen
                   ` (43 subsequent siblings)
  57 siblings, 1 reply; 119+ messages in thread
From: Andi Kleen @ 2007-07-19  9:54 UTC (permalink / raw)
  To: patches, linux-kernel


Call a function on a target CPU, but do the right thing when we're
already on that CPU. That is the main difference from
smp_call_function_single(), which errors out in that case.

Another advantage is that it is also defined for the UP case, avoiding
some ifdefs.

I also dropped the retry argument (which never did anything) and wait
(because the on-current-CPU case will always wait).

Signed-off-by: Andi Kleen <ak@suse.de>

---
 include/linux/smp.h |   22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

Index: linux/include/linux/smp.h
===================================================================
--- linux.orig/include/linux/smp.h
+++ linux/include/linux/smp.h
@@ -138,4 +138,26 @@ static inline void smp_send_reschedule(i
 
 void smp_setup_processor_id(void);
 
+#ifdef CONFIG_SMP
+/* Similar to smp_call_function_single, but DTRT when we're already
+   on the right CPU. */
+static inline void on_cpu_single(int cpu, void (*func)(void *), void *info)
+{
+	int me = get_cpu();
+	if (cpu == me) {
+		func(info);
+		put_cpu();
+	} else {
+		put_cpu();
+		/* wait is forced on because the me==cpu case above will always wait */
+		smp_call_function_single(cpu, func, info, 0, 1);
+	}
+}
+#else
+static inline void on_cpu_single(int cpu, void (*func)(void *), void *info)
+{
+	func(info);
+}
+#endif
+
 #endif /* __LINUX_SMP_H */
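
A quick usage sketch (hypothetical, not part of the patch; the helper and
its name are made up for illustration, only on_cpu_single() and
smp_processor_id() are taken as given):

/* Run a callback on a given CPU; on_cpu_single() calls it directly when
   we are already on that CPU, otherwise it goes through
   smp_call_function_single() with wait forced on. */
#include <linux/smp.h>

static void read_local_cpu(void *info)
{
	*(int *)info = smp_processor_id();
}

static int sample_cpu(int cpu)
{
	int value = -1;

	on_cpu_single(cpu, read_local_cpu, &value);
	return value;
}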

^ permalink raw reply	[flat|nested] 119+ messages in thread

* [PATCH] [15/58] i386: Rewrite sched_clock
  2007-07-19  9:54 [PATCH] [0/58] First batch of x86 patches for .23 Andi Kleen
                   ` (13 preceding siblings ...)
  2007-07-19  9:54 ` [PATCH] [14/58] x86_64: Add on_cpu_single Andi Kleen
@ 2007-07-19  9:54 ` Andi Kleen
  2007-07-19 16:51   ` Daniel Walker
  2007-07-19  9:55 ` [PATCH] [16/58] x86_64: Use new shared sched_clock in x86-64 too Andi Kleen
                   ` (42 subsequent siblings)
  57 siblings, 1 reply; 119+ messages in thread
From: Andi Kleen @ 2007-07-19  9:54 UTC (permalink / raw)
  To: patches, linux-kernel


Move it into its own file for easy sharing.
Do everything per CPU. This avoids problems with TSCs that
tick at different frequencies per CPU.
Resync properly on cpufreq changes. The CPU frequency is unstable
while it is changing, so fall back to a backing clock during this
period.
Hopefully TSC will now work on all systems except those without a
physical TSC.

And

From: Jeremy Fitzhardinge <jeremy@goop.org>
Three cleanups there:
 - change "instable" -> "unstable"
 - it's better to use get_cpu_var for getting this cpu's variables
 - change cycles_2_ns to do the full computation rather than just the
   tsc->ns scaling.  It's a simpler interface, and it makes the function

Signed-off-by: Andi Kleen <ak@suse.de>

---
 arch/i386/kernel/Makefile      |    3 
 arch/i386/kernel/sched-clock.c |  265 +++++++++++++++++++++++++++++++++++++++++
 arch/i386/kernel/tsc.c         |   74 -----------
 include/asm-i386/timer.h       |   32 ----
 include/asm-i386/tsc.h         |    1 
 5 files changed, 269 insertions(+), 106 deletions(-)

Index: linux/arch/i386/kernel/sched-clock.c
===================================================================
--- /dev/null
+++ linux/arch/i386/kernel/sched-clock.c
@@ -0,0 +1,265 @@
+/* A fast clock for the scheduler.
+ * Copyright 2007 Andi Kleen SUSE Labs
+ * Subject to the GNU Public License, version 2 only.
+ */
+#include <linux/init.h>
+#include <linux/cpu.h>
+#include <linux/cpufreq.h>
+#include <linux/kernel.h>
+#include <linux/percpu.h>
+#include <linux/ktime.h>
+#include <linux/hrtimer.h>
+#include <linux/smp.h>
+#include <linux/notifier.h>
+#include <linux/init.h>
+#include <asm/tsc.h>
+#include <asm/cpufeature.h>
+#include <asm/timer.h>
+
+/*
+ * convert from cycles(64bits) => nanoseconds (64bits)
+ *  basic equation:
+ *		ns = cycles / (freq / ns_per_sec)
+ *		ns = cycles * (ns_per_sec / freq)
+ *		ns = cycles * (10^9 / (cpu_khz * 10^3))
+ *		ns = cycles * (10^6 / cpu_khz)
+ *
+ *	Then we use scaling math (suggested by george@mvista.com) to get:
+ *		ns = cycles * (10^6 * SC / cpu_khz) / SC
+ *		ns = cycles * cyc2ns_scale / SC
+ *
+ *	And since SC is a constant power of two, we can convert the div
+ *  into a shift.
+ *
+ *  We can use khz divisor instead of mhz to keep a better percision, since
+ *  cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
+ *  (mathieu.desnoyers@polymtl.ca)
+ *
+ *			-johnstul@us.ibm.com "math is hard, lets go shopping!"
+ */
+
+#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
+
+struct sc_data {
+	unsigned cyc2ns_scale;
+	unsigned unstable;
+	unsigned long long sync_base;		/* TSC or jiffies at syncpoint*/
+	unsigned long long ns_base;		/* nanoseconds at sync point */
+	unsigned long long last_val;		/* Last returned value */
+};
+
+static DEFINE_PER_CPU(struct sc_data, sc_data) =
+	{ .unstable = 1, .sync_base = INITIAL_JIFFIES };
+
+static inline u64 __cycles_2_ns(struct sc_data *sc, u64 cyc)
+{
+	u64 ns;
+
+	cyc -= sc->sync_base;
+	ns = (cyc * sc->cyc2ns_scale) >> CYC2NS_SCALE_FACTOR;
+	ns += sc->ns_base;
+
+	return ns;
+}
+
+u64 cycles_2_ns(u64 cyc)
+{
+	struct sc_data *sc = &get_cpu_var(sc_data);
+	u64 ns = __cycles_2_ns(sc, cyc);
+	put_cpu_var(sc_data);
+	return ns;
+}
+
+/*
+ * Scheduler clock - returns current time in nanosec units.
+ * All data is local to the CPU.
+ * The values are approximately[1] monotonic local to a CPU, but not
+ * between CPUs.   There might be also an occasionally random error,
+ * but not too bad. Between CPUs the values can be non monotonic.
+ *
+ * [1] no attempt to stop CPU instruction reordering, which can hit
+ * in a 100 instruction window or so.
+ *
+ * The clock can be in two states: stable and unstable.
+ * When it is stable we use the TSC per CPU.
+ * When it is unstable we use jiffies as fallback.
+ * stable->unstable->stable transitions can happen regularly
+ * during CPU frequency changes.
+ * There is special code to avoid having the clock jump backwards
+ * when we switch from TSC to jiffies, which needs to keep some state
+ * per CPU. This state is protected against parallel state changes
+ * with interrupts off.
+ */
+unsigned long long tsc_sched_clock(void)
+{
+	unsigned long long r;
+	struct sc_data *sc = &get_cpu_var(sc_data);
+
+	if (unlikely(sc->unstable)) {
+		r = (jiffies_64 - sc->sync_base) * (1000000000 / HZ);
+		r += sc->ns_base;
+		/*
+		 * last_val is used to avoid non monotonity on a
+		 * stable->unstable transition. Make sure the time
+		 * never goes to before the last value returned by the
+		 * TSC clock.
+		 */
+		while (r <= sc->last_val) {
+			rmb();
+			r = sc->last_val + 1;
+			rmb();
+		}
+		sc->last_val = r;
+	} else {
+		rdtscll(r);
+		r = __cycles_2_ns(sc, r);
+		sc->last_val = r;
+	}
+
+	put_cpu_var(sc_data);
+
+	return r;
+}
+
+/* We need to define a real function for sched_clock, to override the
+   weak default version */
+#ifdef CONFIG_PARAVIRT
+unsigned long long sched_clock(void)
+{
+	return paravirt_sched_clock();
+}
+#else
+unsigned long long sched_clock(void)
+	__attribute__((alias("tsc_sched_clock")));
+#endif
+
+static int no_sc_for_printk;
+
+/*
+ * printk clock: when it is known the sc results are very non monotonic
+ * fall back to jiffies for printk. Other sched_clock users are supposed
+ * to handle this.
+ */
+unsigned long long printk_clock(void)
+{
+	if (unlikely(no_sc_for_printk))
+		return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);
+	return tsc_sched_clock();
+}
+
+static void resolve_freq(struct cpufreq_freqs *freq)
+{
+	if (!freq->new) {
+		freq->new = cpufreq_get(freq->cpu);
+		if (!freq->new)
+			freq->new = tsc_khz;
+	}
+}
+
+/* Resync with new CPU frequency. Must run on to be synced CPU */
+static void resync_freq(void *arg)
+{
+	struct cpufreq_freqs *freq = (void *)arg;
+	struct sc_data *sc = &__get_cpu_var(sc_data);
+
+	sc->sync_base = jiffies;
+	if (!cpu_has_tsc) {
+		sc->unstable = 1;
+		return;
+	}
+	resolve_freq(freq);
+
+	/*
+	 * Handle nesting, but when we're zero multiple calls in a row
+	 * are ok too and not a bug. This can happen during startup
+	 * when the different callbacks race with each other.
+	 */
+	if (sc->unstable > 0)
+		sc->unstable--;
+	if (sc->unstable)
+		return;
+
+	/* Minor race window here, but should not add significant errors. */
+	sc->ns_base = ktime_to_ns(ktime_get());
+	rdtscll(sc->sync_base);
+	sc->cyc2ns_scale = (1000000 << CYC2NS_SCALE_FACTOR) / freq->new;
+}
+
+static void resync_freq_on_cpu(void *arg)
+{
+	struct cpufreq_freqs f = { .new = 0 };
+
+	f.cpu = get_cpu();
+	resync_freq(&f);
+	put_cpu();
+}
+
+static int sc_freq_event(struct notifier_block *nb, unsigned long event,
+			 void *data)
+{
+	struct cpufreq_freqs *freq = data;
+	struct sc_data *sc = &per_cpu(sc_data, freq->cpu);
+
+	if (cpu_has(&cpu_data[freq->cpu], X86_FEATURE_CONSTANT_TSC))
+		return NOTIFY_DONE;
+	if (freq->old == freq->new)
+		return NOTIFY_DONE;
+
+	switch (event) {
+	case CPUFREQ_SUSPENDCHANGE:
+		/* Mark TSC unstable during suspend/resume */
+	case CPUFREQ_PRECHANGE:
+		/*
+		 * Mark TSC as unstable until cpu frequency change is
+		 * done because we don't know when exactly it will
+		 * change.  unstable in used as a counter to guard
+		 * against races between the cpu frequency notifiers
+		 * and normal resyncs
+		 */
+		sc->unstable++;
+		/* FALL THROUGH */
+	case CPUFREQ_RESUMECHANGE:
+	case CPUFREQ_POSTCHANGE:
+		/*
+		 * Frequency change or resume is done -- update everything and
+		 * mark TSC as stable again.
+		 */
+		on_cpu_single(freq->cpu, resync_freq, freq);
+		break;
+	}
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block sc_freq_notifier = {
+	.notifier_call = sc_freq_event
+};
+
+static int __cpuinit
+sc_cpu_event(struct notifier_block *self, unsigned long event, void *hcpu)
+{
+	long cpu = (long)hcpu;
+	if (event == CPU_ONLINE) {
+		struct cpufreq_freqs f = { .cpu = cpu, .new = 0 };
+
+		on_cpu_single(cpu, resync_freq, &f);
+	}
+	return NOTIFY_DONE;
+}
+
+static __init int init_sched_clock(void)
+{
+	if (unsynchronized_tsc())
+		no_sc_for_printk = 1;
+
+	/*
+	 * On a race between the various events the initialization
+	 * might be done multiple times, but code is tolerant to
+	 * this .
+	 */
+	cpufreq_register_notifier(&sc_freq_notifier,
+				CPUFREQ_TRANSITION_NOTIFIER);
+	hotcpu_notifier(sc_cpu_event, 0);
+	on_each_cpu(resync_freq_on_cpu, NULL, 0, 0);
+	return 0;
+}
+core_initcall(init_sched_clock);
Index: linux/arch/i386/kernel/tsc.c
===================================================================
--- linux.orig/arch/i386/kernel/tsc.c
+++ linux/arch/i386/kernel/tsc.c
@@ -63,74 +63,6 @@ static inline int check_tsc_unstable(voi
 	return tsc_unstable;
 }
 
-/* Accellerators for sched_clock()
- * convert from cycles(64bits) => nanoseconds (64bits)
- *  basic equation:
- *		ns = cycles / (freq / ns_per_sec)
- *		ns = cycles * (ns_per_sec / freq)
- *		ns = cycles * (10^9 / (cpu_khz * 10^3))
- *		ns = cycles * (10^6 / cpu_khz)
- *
- *	Then we use scaling math (suggested by george@mvista.com) to get:
- *		ns = cycles * (10^6 * SC / cpu_khz) / SC
- *		ns = cycles * cyc2ns_scale / SC
- *
- *	And since SC is a constant power of two, we can convert the div
- *  into a shift.
- *
- *  We can use khz divisor instead of mhz to keep a better percision, since
- *  cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
- *  (mathieu.desnoyers@polymtl.ca)
- *
- *			-johnstul@us.ibm.com "math is hard, lets go shopping!"
- */
-unsigned long cyc2ns_scale __read_mostly;
-
-#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
-
-static inline void set_cyc2ns_scale(unsigned long cpu_khz)
-{
-	cyc2ns_scale = (1000000 << CYC2NS_SCALE_FACTOR)/cpu_khz;
-}
-
-/*
- * Scheduler clock - returns current time in nanosec units.
- */
-unsigned long long native_sched_clock(void)
-{
-	unsigned long long this_offset;
-
-	/*
-	 * Fall back to jiffies if there's no TSC available:
-	 * ( But note that we still use it if the TSC is marked
-	 *   unstable. We do this because unlike Time Of Day,
-	 *   the scheduler clock tolerates small errors and it's
-	 *   very important for it to be as fast as the platform
-	 *   can achive it. )
-	 */
-	if (unlikely(!tsc_enabled && !tsc_unstable))
-		/* No locking but a rare wrong value is not a big deal: */
-		return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);
-
-	/* read the Time Stamp Counter: */
-	rdtscll(this_offset);
-
-	/* return the value in ns */
-	return cycles_2_ns(this_offset);
-}
-
-/* We need to define a real function for sched_clock, to override the
-   weak default version */
-#ifdef CONFIG_PARAVIRT
-unsigned long long sched_clock(void)
-{
-	return paravirt_sched_clock();
-}
-#else
-unsigned long long sched_clock(void)
-	__attribute__((alias("native_sched_clock")));
-#endif
-
 unsigned long native_calculate_cpu_khz(void)
 {
 	unsigned long long start, end;
@@ -238,11 +170,6 @@ time_cpufreq_notifier(struct notifier_bl
 						ref_freq, freq->new);
 			if (!(freq->flags & CPUFREQ_CONST_LOOPS)) {
 				tsc_khz = cpu_khz;
-				set_cyc2ns_scale(cpu_khz);
-				/*
-				 * TSC based sched_clock turns
-				 * to junk w/ cpufreq
-				 */
 				mark_tsc_unstable("cpufreq changes");
 			}
 		}
@@ -380,7 +307,6 @@ void __init tsc_init(void)
 				(unsigned long)cpu_khz / 1000,
 				(unsigned long)cpu_khz % 1000);
 
-	set_cyc2ns_scale(cpu_khz);
 	use_tsc_delay();
 
 	/* Check and install the TSC clocksource */
Index: linux/arch/i386/kernel/Makefile
===================================================================
--- linux.orig/arch/i386/kernel/Makefile
+++ linux/arch/i386/kernel/Makefile
@@ -7,7 +7,8 @@ extra-y := head.o init_task.o vmlinux.ld
 obj-y	:= process.o signal.o entry.o traps.o irq.o \
 		ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_i386.o \
 		pci-dma.o i386_ksyms.o i387.o bootflag.o e820.o\
-		quirks.o i8237.o topology.o alternative.o i8253.o tsc.o
+		quirks.o i8237.o topology.o alternative.o i8253.o tsc.o \
+		sched-clock.o
 
 obj-$(CONFIG_STACKTRACE)	+= stacktrace.o
 obj-y				+= cpu/
Index: linux/include/asm-i386/timer.h
===================================================================
--- linux.orig/include/asm-i386/timer.h
+++ linux/include/asm-i386/timer.h
@@ -6,7 +6,6 @@
 #define TICK_SIZE (tick_nsec / 1000)
 
 void setup_pit_timer(void);
-unsigned long long native_sched_clock(void);
 unsigned long native_calculate_cpu_khz(void);
 
 extern int timer_ack;
@@ -18,35 +17,6 @@ extern int recalibrate_cpu_khz(void);
 #define calculate_cpu_khz() native_calculate_cpu_khz()
 #endif
 
-/* Accellerators for sched_clock()
- * convert from cycles(64bits) => nanoseconds (64bits)
- *  basic equation:
- *		ns = cycles / (freq / ns_per_sec)
- *		ns = cycles * (ns_per_sec / freq)
- *		ns = cycles * (10^9 / (cpu_khz * 10^3))
- *		ns = cycles * (10^6 / cpu_khz)
- *
- *	Then we use scaling math (suggested by george@mvista.com) to get:
- *		ns = cycles * (10^6 * SC / cpu_khz) / SC
- *		ns = cycles * cyc2ns_scale / SC
- *
- *	And since SC is a constant power of two, we can convert the div
- *  into a shift.
- *
- *  We can use khz divisor instead of mhz to keep a better percision, since
- *  cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
- *  (mathieu.desnoyers@polymtl.ca)
- *
- *			-johnstul@us.ibm.com "math is hard, lets go shopping!"
- */
-extern unsigned long cyc2ns_scale __read_mostly;
-
-#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
-
-static inline unsigned long long cycles_2_ns(unsigned long long cyc)
-{
-	return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR;
-}
-
+u64 cycles_2_ns(u64 cyc);
 
 #endif
Index: linux/include/asm-i386/tsc.h
===================================================================
--- linux.orig/include/asm-i386/tsc.h
+++ linux/include/asm-i386/tsc.h
@@ -63,6 +63,7 @@ extern void tsc_init(void);
 extern void mark_tsc_unstable(char *reason);
 extern int unsynchronized_tsc(void);
 extern void init_tsc_clocksource(void);
+extern unsigned long long tsc_sched_clock(void);
 
 /*
  * Boot-time check whether the TSCs are synchronized across
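
As a sanity check of the cyc2ns scaling used in sched-clock.c (a worked
example, not part of the patch): for a 2 GHz CPU, cpu_khz = 2000000, so
cyc2ns_scale = (1000000 << 10) / 2000000 = 512, and one million cycles
convert to (1000000 * 512) >> 10 = 500000 ns, i.e. 0.5 ns per cycle as
expected.

/* Illustration only: the same scaling math, runnable in user space. */
#include <stdio.h>

int main(void)
{
	unsigned long long khz = 2000000ULL;	/* assume a 2 GHz CPU */
	unsigned long long scale = (1000000ULL << 10) / khz;
	unsigned long long cyc = 1000000ULL;

	printf("scale=%llu, %llu cycles -> %llu ns\n",
	       scale, cyc, (cyc * scale) >> 10);
	return 0;
}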

^ permalink raw reply	[flat|nested] 119+ messages in thread

* [PATCH] [16/58] x86_64: Use new shared sched_clock in x86-64 too
  2007-07-19  9:54 [PATCH] [0/58] First batch of x86 patches for .23 Andi Kleen
                   ` (14 preceding siblings ...)
  2007-07-19  9:54 ` [PATCH] [15/58] i386: Rewrite sched_clock Andi Kleen
@ 2007-07-19  9:55 ` Andi Kleen
  2007-07-19  9:55 ` [PATCH] [17/58] i386: Add L3 cache support to AMD CPUID4 emulation Andi Kleen
                   ` (41 subsequent siblings)
  57 siblings, 0 replies; 119+ messages in thread
From: Andi Kleen @ 2007-07-19  9:55 UTC (permalink / raw)
  To: patches, linux-kernel


Signed-off-by: Andi Kleen <ak@suse.de>

---
 arch/x86_64/kernel/Makefile |    3 ++-
 arch/x86_64/kernel/time.c   |    1 -
 arch/x86_64/kernel/tsc.c    |   29 +----------------------------
 include/asm-x86_64/timer.h  |    5 +++++
 include/asm-x86_64/timex.h  |    1 -
 5 files changed, 8 insertions(+), 31 deletions(-)

Index: linux/arch/x86_64/kernel/tsc.c
===================================================================
--- linux.orig/arch/x86_64/kernel/tsc.c
+++ linux/arch/x86_64/kernel/tsc.c
@@ -8,6 +8,7 @@
 #include <linux/cpufreq.h>
 
 #include <asm/timex.h>
+#include <asm/tsc.h>
 
 static int notsc __initdata = 0;
 
@@ -16,32 +17,6 @@ EXPORT_SYMBOL(cpu_khz);
 unsigned int tsc_khz;
 EXPORT_SYMBOL(tsc_khz);
 
-static unsigned int cyc2ns_scale __read_mostly;
-
-void set_cyc2ns_scale(unsigned long khz)
-{
-	cyc2ns_scale = (NSEC_PER_MSEC << NS_SCALE) / khz;
-}
-
-static unsigned long long cycles_2_ns(unsigned long long cyc)
-{
-	return (cyc * cyc2ns_scale) >> NS_SCALE;
-}
-
-unsigned long long sched_clock(void)
-{
-	unsigned long a = 0;
-
-	/* Could do CPU core sync here. Opteron can execute rdtsc speculatively,
-	 * which means it is not completely exact and may not be monotonous
-	 * between CPUs. But the errors should be too small to matter for
-	 * scheduling purposes.
-	 */
-
-	rdtscll(a);
-	return cycles_2_ns(a);
-}
-
 static int tsc_unstable;
 
 static inline int check_tsc_unstable(void)
@@ -114,8 +89,6 @@ static int time_cpufreq_notifier(struct 
 			mark_tsc_unstable("cpufreq changes");
 	}
 
-	set_cyc2ns_scale(tsc_khz_ref);
-
 	return 0;
 }
 
Index: linux/arch/x86_64/kernel/time.c
===================================================================
--- linux.orig/arch/x86_64/kernel/time.c
+++ linux/arch/x86_64/kernel/time.c
@@ -408,7 +408,6 @@ void __init time_init(void)
 	else
 		vgetcpu_mode = VGETCPU_LSL;
 
-	set_cyc2ns_scale(tsc_khz);
 	printk(KERN_INFO "time.c: Detected %d.%03d MHz processor.\n",
 		cpu_khz / 1000, cpu_khz % 1000);
 	init_tsc_clocksource();
Index: linux/include/asm-x86_64/timex.h
===================================================================
--- linux.orig/include/asm-x86_64/timex.h
+++ linux/include/asm-x86_64/timex.h
@@ -28,5 +28,4 @@ extern int read_current_timer(unsigned l
 #define US_SCALE        32 /* 2^32, arbitralrily chosen */
 
 extern void mark_tsc_unstable(char *msg);
-extern void set_cyc2ns_scale(unsigned long khz);
 #endif
Index: linux/arch/x86_64/kernel/Makefile
===================================================================
--- linux.orig/arch/x86_64/kernel/Makefile
+++ linux/arch/x86_64/kernel/Makefile
@@ -9,7 +9,7 @@ obj-y	:= process.o signal.o entry.o trap
 		x8664_ksyms.o i387.o syscall.o vsyscall.o \
 		setup64.o bootflag.o e820.o reboot.o quirks.o i8237.o \
 		pci-dma.o pci-nommu.o alternative.o hpet.o tsc.o bugs.o \
-		perfctr-watchdog.o
+		perfctr-watchdog.o sched-clock.o
 
 obj-$(CONFIG_STACKTRACE)	+= stacktrace.o
 obj-$(CONFIG_X86_MCE)		+= mce.o therm_throt.o
@@ -64,3 +64,4 @@ msr-$(subst m,y,$(CONFIG_X86_MSR))  += .
 alternative-y			+= ../../i386/kernel/alternative.o
 pcspeaker-y			+= ../../i386/kernel/pcspeaker.o
 perfctr-watchdog-y		+= ../../i386/kernel/cpu/perfctr-watchdog.o
+sched-clock-y			+= ../../i386/kernel/sched-clock.o
Index: linux/include/asm-x86_64/timer.h
===================================================================
--- /dev/null
+++ linux/include/asm-x86_64/timer.h
@@ -0,0 +1,5 @@
+#ifndef _ASM_TIMER_H
+#define _ASM_TIMER_H 1
+
+#endif
+

^ permalink raw reply	[flat|nested] 119+ messages in thread

* [PATCH] [17/58] i386: Add L3 cache support to AMD CPUID4 emulation
  2007-07-19  9:54 [PATCH] [0/58] First batch of x86 patches for .23 Andi Kleen
                   ` (15 preceding siblings ...)
  2007-07-19  9:55 ` [PATCH] [16/58] x86_64: Use new shared sched_clock in x86-64 too Andi Kleen
@ 2007-07-19  9:55 ` Andi Kleen
  2007-07-20 17:00   ` [patches] " Andreas Herrmann
  2007-07-20 17:15   ` Andreas Herrmann
  2007-07-19  9:55 ` [PATCH] [18/58] x86_64: remove extra extern declaring about dmi_ioremap Andi Kleen
                   ` (40 subsequent siblings)
  57 siblings, 2 replies; 119+ messages in thread
From: Andi Kleen @ 2007-07-19  9:55 UTC (permalink / raw)
  To: patches, linux-kernel


With that, an L3 cache is correctly reported in the cache information in /sys.

With fixes from Andreas Herrmann and Dean Gaudet.

Signed-off-by: Andi Kleen <ak@suse.de>

---
 arch/i386/kernel/cpu/intel_cacheinfo.c |   74 ++++++++++++++++++++++++---------
 arch/x86_64/kernel/setup.c             |    7 ++-
 2 files changed, 60 insertions(+), 21 deletions(-)

Index: linux/arch/i386/kernel/cpu/intel_cacheinfo.c
===================================================================
--- linux.orig/arch/i386/kernel/cpu/intel_cacheinfo.c
+++ linux/arch/i386/kernel/cpu/intel_cacheinfo.c
@@ -4,7 +4,7 @@
  *      Changes:
  *      Venkatesh Pallipadi	: Adding cache identification through cpuid(4)
  *		Ashok Raj <ashok.raj@intel.com>: Work with CPU hotplug infrastructure.
- *	Andi Kleen		: CPUID4 emulation on AMD.
+ *	Andi Kleen / Andreas Herrmann	: CPUID4 emulation on AMD.
  */
 
 #include <linux/init.h>
@@ -135,7 +135,7 @@ unsigned short			num_cache_leaves;
 
 /* AMD doesn't have CPUID4. Emulate it here to report the same
    information to the user.  This makes some assumptions about the machine:
-   No L3, L2 not shared, no SMT etc. that is currently true on AMD CPUs.
+   L2 not shared, no SMT etc. that is currently true on AMD CPUs.
 
    In theory the TLBs could be reported as fake type (they are in "dummy").
    Maybe later */
@@ -159,13 +159,26 @@ union l2_cache {
 	unsigned val;
 };
 
+union l3_cache {
+	struct {
+		unsigned line_size : 8;
+		unsigned lines_per_tag : 4;
+		unsigned assoc : 4;
+		unsigned res : 2;
+		unsigned size_encoded : 14;
+	};
+	unsigned val;
+};
+
 static const unsigned short assocs[] = {
 	[1] = 1, [2] = 2, [4] = 4, [6] = 8,
-	[8] = 16,
+	[8] = 16, [0xa] = 32, [0xb] = 48,
+	[0xc] = 64,
 	[0xf] = 0xffff // ??
-	};
-static const unsigned char levels[] = { 1, 1, 2 };
-static const unsigned char types[] = { 1, 2, 3 };
+};
+
+static const unsigned char levels[] = { 1, 1, 2, 3 };
+static const unsigned char types[] = { 1, 2, 3, 3 };
 
 static void __cpuinit amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
 		       union _cpuid4_leaf_ebx *ebx,
@@ -175,37 +188,60 @@ static void __cpuinit amd_cpuid4(int lea
 	unsigned line_size, lines_per_tag, assoc, size_in_kb;
 	union l1_cache l1i, l1d;
 	union l2_cache l2;
+	union l3_cache l3;
+	union l1_cache *l1 = &l1d;
 
 	eax->full = 0;
 	ebx->full = 0;
 	ecx->full = 0;
 
 	cpuid(0x80000005, &dummy, &dummy, &l1d.val, &l1i.val);
-	cpuid(0x80000006, &dummy, &dummy, &l2.val, &dummy);
-
-	if (leaf > 2 || !l1d.val || !l1i.val || !l2.val)
-		return;
+	cpuid(0x80000006, &dummy, &dummy, &l2.val, &l3.val);
 
-	eax->split.is_self_initializing = 1;
-	eax->split.type = types[leaf];
-	eax->split.level = levels[leaf];
-	eax->split.num_threads_sharing = 0;
-	eax->split.num_cores_on_die = current_cpu_data.x86_max_cores - 1;
-
-	if (leaf <= 1) {
-		union l1_cache *l1 = leaf == 0 ? &l1d : &l1i;
+	switch (leaf) {
+	case 1:
+		l1 = &l1i;
+	case 0:
+		if (!l1->val)
+			return;
 		assoc = l1->assoc;
 		line_size = l1->line_size;
 		lines_per_tag = l1->lines_per_tag;
 		size_in_kb = l1->size_in_kb;
-	} else {
+		break;
+	case 2:
+		if (!l2.val)
+			return;
 		assoc = l2.assoc;
 		line_size = l2.line_size;
 		lines_per_tag = l2.lines_per_tag;
 		/* cpu_data has errata corrections for K7 applied */
 		size_in_kb = current_cpu_data.x86_cache_size;
+		break;
+	case 3:
+		if (!l3.val)
+			return;
+		assoc = l3.assoc;
+		line_size = l3.line_size;
+		lines_per_tag = l3.lines_per_tag;
+		switch (l3.size_encoded) {
+		case 4:  size_in_kb = 2 * 1024; break;
+		case 8:  size_in_kb = 4 * 1024; break;
+		case 12: size_in_kb = 6 * 1024; break;
+		default: size_in_kb = 0; break;
+		}
+		break;
+	default:
+		return;
 	}
 
+	eax->split.is_self_initializing = 1;
+	eax->split.type = types[leaf];
+	eax->split.level = levels[leaf];
+	eax->split.num_threads_sharing = 0;
+	eax->split.num_cores_on_die = current_cpu_data.x86_max_cores - 1;
+
+
 	if (assoc == 0xf)
 		eax->split.is_fully_associative = 1;
 	ebx->split.coherency_line_size = line_size - 1;
Index: linux/arch/x86_64/kernel/setup.c
===================================================================
--- linux.orig/arch/x86_64/kernel/setup.c
+++ linux/arch/x86_64/kernel/setup.c
@@ -602,8 +602,11 @@ static void __cpuinit init_amd(struct cp
 	if (c->extended_cpuid_level >= 0x80000008)
 		amd_detect_cmp(c);
 
-	/* Fix cpuid4 emulation for more */
-	num_cache_leaves = 3;
+	if (c->extended_cpuid_level >= 0x80000006 &&
+		(cpuid_edx(0x80000006) & 0xf000))
+		num_cache_leaves = 4;
+	else
+		num_cache_leaves = 3;
 
 	/* RDTSC can be speculated around */
 	clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
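
For reference, the same L3 fields can be decoded from user space; this is a
sketch based purely on the bitfield layout of union l3_cache in the patch
(x86 only, meaningful on an AMD CPU that reports an L3):

/* Illustrative decode of CPUID 0x80000006 EDX, mirroring union l3_cache:
   line_size[7:0], lines_per_tag[11:8], assoc[15:12], size_encoded[31:18]. */
#include <stdio.h>

static unsigned cpuid_edx(unsigned op)
{
	unsigned eax, ebx, ecx, edx;

	asm volatile("cpuid"
		     : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
		     : "0" (op));
	return edx;
}

int main(void)
{
	unsigned v = cpuid_edx(0x80000006);

	printf("L3: line_size=%u lines_per_tag=%u assoc_code=%#x size_code=%u\n",
	       v & 0xff, (v >> 8) & 0xf, (v >> 12) & 0xf, (v >> 18) & 0x3fff);
	return 0;
}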

^ permalink raw reply	[flat|nested] 119+ messages in thread

* [PATCH] [18/58] x86_64: remove extra extern declaring about dmi_ioremap
  2007-07-19  9:54 [PATCH] [0/58] First batch of x86 patches for .23 Andi Kleen
                   ` (16 preceding siblings ...)
  2007-07-19  9:55 ` [PATCH] [17/58] i386: Add L3 cache support to AMD CPUID4 emulation Andi Kleen
@ 2007-07-19  9:55 ` Andi Kleen
  2007-07-19  9:55 ` [PATCH] [19/58] x86_64: Don't use softirq save locks in smp_call_function Andi Kleen
                   ` (39 subsequent siblings)
  57 siblings, 0 replies; 119+ messages in thread
From: Andi Kleen @ 2007-07-19  9:55 UTC (permalink / raw)
  To: yhlu.kernel, patches, linux-kernel


From: "Yinghai Lu" <yhlu.kernel@gmail.com>

Signed-off-by: Yinghai Lu <yinghai.lu@sun.com>
Signed-off-by: Andi Kleen <ak@suse.de>

---
 include/asm-x86_64/dmi.h |    5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

Index: linux/include/asm-x86_64/dmi.h
===================================================================
--- linux.orig/include/asm-x86_64/dmi.h
+++ linux/include/asm-x86_64/dmi.h
@@ -3,15 +3,12 @@
 
 #include <asm/io.h>
 
-extern void *dmi_ioremap(unsigned long addr, unsigned long size);
-extern void dmi_iounmap(void *addr, unsigned long size);
-
 #define DMI_MAX_DATA 2048
 
 extern int dmi_alloc_index;
 extern char dmi_alloc_data[DMI_MAX_DATA];
 
-/* This is so early that there is no good way to allocate dynamic memory. 
+/* This is so early that there is no good way to allocate dynamic memory.
    Allocate data in an BSS array. */
 static inline void *dmi_alloc(unsigned len)
 {

^ permalink raw reply	[flat|nested] 119+ messages in thread

* [PATCH] [19/58] x86_64: Don't use softirq save locks in smp_call_function
  2007-07-19  9:54 [PATCH] [0/58] First batch of x86 patches for .23 Andi Kleen
                   ` (17 preceding siblings ...)
  2007-07-19  9:55 ` [PATCH] [18/58] x86_64: remove extra extern declaring about dmi_ioremap Andi Kleen
@ 2007-07-19  9:55 ` Andi Kleen
  2007-07-19 12:16   ` Satyam Sharma
  2007-07-19  9:55 ` [PATCH] [20/58] x86: Always probe the NMI watchdog Andi Kleen
                   ` (38 subsequent siblings)
  57 siblings, 1 reply; 119+ messages in thread
From: Andi Kleen @ 2007-07-19  9:55 UTC (permalink / raw)
  To: patches, linux-kernel


It is not fully softirq safe anyway.

Unfortunately we can't add a WARN_ON for this because it could trigger
in the panic case.

Signed-off-by: Andi Kleen <ak@suse.de>

---
 arch/x86_64/kernel/smp.c |    4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

Index: linux/arch/x86_64/kernel/smp.c
===================================================================
--- linux.orig/arch/x86_64/kernel/smp.c
+++ linux/arch/x86_64/kernel/smp.c
@@ -386,9 +386,9 @@ int smp_call_function_single (int cpu, v
 		return 0;
 	}
 
-	spin_lock_bh(&call_lock);
+	spin_lock(&call_lock);
 	__smp_call_function_single(cpu, func, info, nonatomic, wait);
-	spin_unlock_bh(&call_lock);
+	spin_unlock(&call_lock);
 	put_cpu();
 	return 0;
 }

^ permalink raw reply	[flat|nested] 119+ messages in thread

* [PATCH] [20/58] x86: Always probe the NMI watchdog
  2007-07-19  9:54 [PATCH] [0/58] First batch of x86 patches for .23 Andi Kleen
                   ` (18 preceding siblings ...)
  2007-07-19  9:55 ` [PATCH] [19/58] x86_64: Don't use softirq save locks in smp_call_function Andi Kleen
@ 2007-07-19  9:55 ` Andi Kleen
  2007-07-19 10:24   ` Björn Steinbrink
  2007-07-19  9:55 ` [PATCH] [21/58] i386: Reserve the right performance counter for the Intel PerfMon " Andi Kleen
                   ` (37 subsequent siblings)
  57 siblings, 1 reply; 119+ messages in thread
From: Andi Kleen @ 2007-07-19  9:55 UTC (permalink / raw)
  To: B.Steinbrink, patches, linux-kernel


From: Björn Steinbrink <B.Steinbrink@gmx.de>

The performance counter allocator relies on the nmi watchdog being
probed, so we have to do that even if the watchdog is not enabled.

Signed-off-by: Björn Steinbrink <B.Steinbrink@gmx.de>
Signed-off-by: Andi Kleen <ak@suse.de>

---
 arch/i386/kernel/cpu/perfctr-watchdog.c |   11 +++++------
 arch/i386/kernel/nmi.c                  |    3 +++
 arch/x86_64/kernel/nmi.c                |    3 +++
 include/asm-i386/nmi.h                  |    1 +
 include/asm-x86_64/nmi.h                |    1 +
 5 files changed, 13 insertions(+), 6 deletions(-)

Index: linux/arch/i386/kernel/cpu/perfctr-watchdog.c
===================================================================
--- linux.orig/arch/i386/kernel/cpu/perfctr-watchdog.c
+++ linux/arch/i386/kernel/cpu/perfctr-watchdog.c
@@ -603,7 +603,7 @@ static struct wd_ops intel_arch_wd_ops =
 	.evntsel = MSR_ARCH_PERFMON_EVENTSEL0,
 };
 
-static void probe_nmi_watchdog(void)
+void probe_nmi_watchdog(void)
 {
 	switch (boot_cpu_data.x86_vendor) {
 	case X86_VENDOR_AMD:
@@ -641,17 +641,16 @@ static void probe_nmi_watchdog(void)
 
 int lapic_watchdog_init(unsigned nmi_hz)
 {
-	if (!wd_ops) {
-		probe_nmi_watchdog();
-		if (!wd_ops)
-			return -1;
+	if (!wd_ops)
+		return -1;
 
+	/* hack to make sure that we only try to reserve the perfctrs once */
+	if (smp_processor_id() == 0)
 		if (!wd_ops->reserve()) {
 			printk(KERN_ERR
 				"NMI watchdog: cannot reserve perfctrs\n");
 			return -1;
 		}
-	}
 
 	if (!(wd_ops->setup(nmi_hz))) {
 		printk(KERN_ERR "Cannot setup NMI watchdog on CPU %d\n",
Index: linux/arch/i386/kernel/nmi.c
===================================================================
--- linux.orig/arch/i386/kernel/nmi.c
+++ linux/arch/i386/kernel/nmi.c
@@ -248,6 +248,9 @@ void setup_apic_nmi_watchdog (void *unus
 	if ((smp_processor_id() != 0) && (atomic_read(&nmi_active) <= 0))
 		return;
 
+	/* always probe the watchdog, the perfctr allocator requires that */
+	probe_nmi_watchdog();
+
 	switch (nmi_watchdog) {
 	case NMI_LOCAL_APIC:
 		__get_cpu_var(wd_enabled) = 1; /* enable it before to avoid race with handler */
Index: linux/arch/x86_64/kernel/nmi.c
===================================================================
--- linux.orig/arch/x86_64/kernel/nmi.c
+++ linux/arch/x86_64/kernel/nmi.c
@@ -255,6 +255,9 @@ void setup_apic_nmi_watchdog(void *unuse
 	if ((smp_processor_id() != 0) && (atomic_read(&nmi_active) <= 0))
 		return;
 
+	/* always probe the watchdog, the perfctr allocator requires that */
+	probe_nmi_watchdog();
+
 	switch (nmi_watchdog) {
 	case NMI_LOCAL_APIC:
 		__get_cpu_var(wd_enabled) = 1;
Index: linux/include/asm-i386/nmi.h
===================================================================
--- linux.orig/include/asm-i386/nmi.h
+++ linux/include/asm-i386/nmi.h
@@ -18,6 +18,7 @@
 int do_nmi_callback(struct pt_regs *regs, int cpu);
 
 extern int nmi_watchdog_enabled;
+extern void probe_nmi_watchdog(void);
 extern int avail_to_resrv_perfctr_nmi_bit(unsigned int);
 extern int avail_to_resrv_perfctr_nmi(unsigned int);
 extern int reserve_perfctr_nmi(unsigned int);
Index: linux/include/asm-x86_64/nmi.h
===================================================================
--- linux.orig/include/asm-x86_64/nmi.h
+++ linux/include/asm-x86_64/nmi.h
@@ -45,6 +45,7 @@ extern int panic_on_timeout;
 extern int unknown_nmi_panic;
 extern int nmi_watchdog_enabled;
 
+extern void probe_nmi_watchdog(void);
 extern int check_nmi_watchdog(void);
 extern int avail_to_resrv_perfctr_nmi_bit(unsigned int);
 extern int avail_to_resrv_perfctr_nmi(unsigned int);

^ permalink raw reply	[flat|nested] 119+ messages in thread

* [PATCH] [21/58] i386: Reserve the right performance counter for the Intel PerfMon NMI watchdog
  2007-07-19  9:54 [PATCH] [0/58] First batch of x86 patches for .23 Andi Kleen
                   ` (19 preceding siblings ...)
  2007-07-19  9:55 ` [PATCH] [20/58] x86: Always probe the NMI watchdog Andi Kleen
@ 2007-07-19  9:55 ` Andi Kleen
  2007-07-19 10:21   ` Björn Steinbrink
  2007-07-19  9:55 ` [PATCH] [22/58] x86_64: hpet tsc calibration fix broken smi detection logic Andi Kleen
                   ` (36 subsequent siblings)
  57 siblings, 1 reply; 119+ messages in thread
From: Andi Kleen @ 2007-07-19  9:55 UTC (permalink / raw)
  To: B.Steinbrink, patches, linux-kernel


From: Björn Steinbrink <B.Steinbrink@gmx.de>

The Intel PerfMon NMI watchdog was using the generic reservation
function which always reserves the first performance counter. But the
watchdog actually uses the second performance counter, thus we need a
specialised function.

Signed-off-by: Björn Steinbrink <B.Steinbrink@gmx.de>
Signed-off-by: Andi Kleen <ak@suse.de>

---
 arch/i386/kernel/cpu/perfctr-watchdog.c |   22 ++++++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

Index: linux/arch/i386/kernel/cpu/perfctr-watchdog.c
===================================================================
--- linux.orig/arch/i386/kernel/cpu/perfctr-watchdog.c
+++ linux/arch/i386/kernel/cpu/perfctr-watchdog.c
@@ -593,9 +593,27 @@ static int setup_intel_arch_watchdog(uns
 	return 1;
 }
 
+static int intel_arch_reserve(void)
+{
+	if (!reserve_perfctr_nmi(MSR_ARCH_PERFMON_PERFCTR1))
+		return 0;
+
+	if (!reserve_evntsel_nmi(MSR_ARCH_PERFMON_EVENTSEL1)) {
+		release_perfctr_nmi(MSR_ARCH_PERFMON_PERFCTR1);
+		return 0;
+	}
+	return 1;
+}
+
+static void intel_arch_unreserve(void)
+{
+	release_evntsel_nmi(MSR_ARCH_PERFMON_EVENTSEL1);
+	release_perfctr_nmi(MSR_ARCH_PERFMON_PERFCTR1);
+}
+
 static struct wd_ops intel_arch_wd_ops = {
-	.reserve = single_msr_reserve,
-	.unreserve = single_msr_unreserve,
+	.reserve = intel_arch_reserve,
+	.unreserve = intel_arch_unreserve,
 	.setup = setup_intel_arch_watchdog,
 	.rearm = p6_rearm,
 	.stop = single_msr_stop_watchdog,

^ permalink raw reply	[flat|nested] 119+ messages in thread

* [PATCH] [22/58] x86_64: hpet tsc calibration fix broken smi  detection logic
  2007-07-19  9:54 [PATCH] [0/58] First batch of x86 patches for .23 Andi Kleen
                   ` (20 preceding siblings ...)
  2007-07-19  9:55 ` [PATCH] [21/58] i386: Reserve the right performance counter for the Intel PerfMon " Andi Kleen
@ 2007-07-19  9:55 ` Andi Kleen
  2007-07-19  9:55 ` [PATCH] [23/58] i386: remove pit_interrupt_hook Andi Kleen
                   ` (35 subsequent siblings)
  57 siblings, 0 replies; 119+ messages in thread
From: Andi Kleen @ 2007-07-19  9:55 UTC (permalink / raw)
  To: tglx, patches, linux-kernel


From: Thomas Gleixner <tglx@linutronix.de>

The current SMI detection logic in read_hpet_tsc() makes sure that
when an SMI happens between the read of the HPET counter and the read
of the TSC, this wrong value is used for TSC calibration.

This is not the intention of the function. The comparison must ensure
that we do _NOT_ use such a value.

Fix the check so that a calibration value is only used when the delta
of the two TSC reads is smaller than a reasonable threshold.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andi Kleen <ak@suse.de>

---
 arch/x86_64/kernel/hpet.c |    4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

Index: linux/arch/x86_64/kernel/hpet.c
===================================================================
--- linux.orig/arch/x86_64/kernel/hpet.c
+++ linux/arch/x86_64/kernel/hpet.c
@@ -190,7 +190,7 @@ int hpet_reenable(void)
  */
 
 #define TICK_COUNT 100000000
-#define TICK_MIN   5000
+#define SMI_THRESHOLD 50000
 #define MAX_TRIES  5
 
 /*
@@ -205,7 +205,7 @@ static void __init read_hpet_tsc(int *hp
 		tsc1 = get_cycles_sync();
 		hpet1 = hpet_readl(HPET_COUNTER);
 		tsc2 = get_cycles_sync();
-		if (tsc2 - tsc1 > TICK_MIN)
+		if ((tsc2 - tsc1) < SMI_THRESHOLD)
 			break;
 	}
 	*hpet = hpet1;

^ permalink raw reply	[flat|nested] 119+ messages in thread

* [PATCH] [23/58] i386: remove pit_interrupt_hook
  2007-07-19  9:54 [PATCH] [0/58] First batch of x86 patches for .23 Andi Kleen
                   ` (21 preceding siblings ...)
  2007-07-19  9:55 ` [PATCH] [22/58] x86_64: hpet tsc calibration fix broken smi detection logic Andi Kleen
@ 2007-07-19  9:55 ` Andi Kleen
  2007-07-19  9:55 ` [PATCH] [24/58] x86_64: Untangle asm/hpet.h from asm/timex.h Andi Kleen
                   ` (34 subsequent siblings)
  57 siblings, 0 replies; 119+ messages in thread
From: Andi Kleen @ 2007-07-19  9:55 UTC (permalink / raw)
  To: chrisw, patches, linux-kernel


From: Chris Wright <chrisw@sous-sol.org>

Remove pit_interrupt_hook as it adds just an extra layer.

Signed-off-by: Chris Wright <chrisw@sous-sol.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andi Kleen <ak@suse.de>

---

 include/asm-i386/i8253.h                 |   11 -----------
 include/asm-i386/mach-default/do_timer.h |    2 +-
 include/asm-i386/mach-voyager/do_timer.h |    2 +-
 3 files changed, 2 insertions(+), 13 deletions(-)

Index: linux/include/asm-i386/i8253.h
===================================================================
--- linux.orig/include/asm-i386/i8253.h
+++ linux/include/asm-i386/i8253.h
@@ -7,15 +7,4 @@ extern spinlock_t i8253_lock;
 
 extern struct clock_event_device *global_clock_event;
 
-/**
- * pit_interrupt_hook - hook into timer tick
- * @regs:	standard registers from interrupt
- *
- * Call the global clock event handler.
- **/
-static inline void pit_interrupt_hook(void)
-{
-	global_clock_event->event_handler(global_clock_event);
-}
-
 #endif	/* __ASM_I8253_H__ */
Index: linux/include/asm-i386/mach-default/do_timer.h
===================================================================
--- linux.orig/include/asm-i386/mach-default/do_timer.h
+++ linux/include/asm-i386/mach-default/do_timer.h
@@ -12,5 +12,5 @@
 
 static inline void do_timer_interrupt_hook(void)
 {
-	pit_interrupt_hook();
+	global_clock_event->event_handler(global_clock_event);
 }
Index: linux/include/asm-i386/mach-voyager/do_timer.h
===================================================================
--- linux.orig/include/asm-i386/mach-voyager/do_timer.h
+++ linux/include/asm-i386/mach-voyager/do_timer.h
@@ -12,7 +12,7 @@
  **/
 static inline void do_timer_interrupt_hook(void)
 {
-	pit_interrupt_hook();
+	global_clock_event->event_handler(global_clock_event);
 	voyager_timer_interrupt();
 }
 

^ permalink raw reply	[flat|nested] 119+ messages in thread

* [PATCH] [24/58] x86_64: Untangle asm/hpet.h from asm/timex.h
  2007-07-19  9:54 [PATCH] [0/58] First batch of x86 patches for .23 Andi Kleen
                   ` (22 preceding siblings ...)
  2007-07-19  9:55 ` [PATCH] [23/58] i386: remove pit_interrupt_hook Andi Kleen
@ 2007-07-19  9:55 ` Andi Kleen
  2007-07-19  9:55 ` [PATCH] [25/58] x86_64: use generic cmos update Andi Kleen
                   ` (33 subsequent siblings)
  57 siblings, 0 replies; 119+ messages in thread
From: Andi Kleen @ 2007-07-19  9:55 UTC (permalink / raw)
  To: chrisw, johnstul, patches, linux-kernel


From: Chris Wright <chrisw@sous-sol.org>

When making changes to x86_64 timers, I noticed that touching hpet.h triggered
an unreasonably large rebuild.  Untangling it from timex.h quiets the extra
rebuild quite a bit.

Signed-off-by: Chris Wright <chrisw@sous-sol.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: john stultz <johnstul@us.ibm.com>

---

 drivers/char/rtc.c         |    2 +-
 include/asm-x86_64/apic.h  |    2 ++
 include/asm-x86_64/hpet.h  |    1 -
 include/asm-x86_64/timex.h |    1 -
 4 files changed, 3 insertions(+), 3 deletions(-)

Index: linux/drivers/char/rtc.c
===================================================================
--- linux.orig/drivers/char/rtc.c
+++ linux/drivers/char/rtc.c
@@ -82,7 +82,7 @@
 #include <asm/uaccess.h>
 #include <asm/system.h>
 
-#if defined(__i386__)
+#ifdef CONFIG_X86
 #include <asm/hpet.h>
 #endif
 
Index: linux/include/asm-x86_64/apic.h
===================================================================
--- linux.orig/include/asm-x86_64/apic.h
+++ linux/include/asm-x86_64/apic.h
@@ -86,6 +86,8 @@ extern void setup_apic_routing(void);
 extern void setup_APIC_extened_lvt(unsigned char lvt_off, unsigned char vector,
 				   unsigned char msg_type, unsigned char mask);
 
+extern int apic_is_clustered_box(void);
+
 #define K8_APIC_EXT_LVT_BASE    0x500
 #define K8_APIC_EXT_INT_MSG_FIX 0x0
 #define K8_APIC_EXT_INT_MSG_SMI 0x2
Index: linux/include/asm-x86_64/hpet.h
===================================================================
--- linux.orig/include/asm-x86_64/hpet.h
+++ linux/include/asm-x86_64/hpet.h
@@ -55,7 +55,6 @@
 
 extern int is_hpet_enabled(void);
 extern int hpet_rtc_timer_init(void);
-extern int apic_is_clustered_box(void);
 extern int hpet_arch_init(void);
 extern int hpet_timer_stop_set_go(unsigned long tick);
 extern int hpet_reenable(void);
Index: linux/include/asm-x86_64/timex.h
===================================================================
--- linux.orig/include/asm-x86_64/timex.h
+++ linux/include/asm-x86_64/timex.h
@@ -9,7 +9,6 @@
 #include <asm/8253pit.h>
 #include <asm/msr.h>
 #include <asm/vsyscall.h>
-#include <asm/hpet.h>
 #include <asm/system.h>
 #include <asm/processor.h>
 #include <asm/tsc.h>

^ permalink raw reply	[flat|nested] 119+ messages in thread

* [PATCH] [25/58] x86_64: use generic cmos update
  2007-07-19  9:54 [PATCH] [0/58] First batch of x86 patches for .23 Andi Kleen
                   ` (23 preceding siblings ...)
  2007-07-19  9:55 ` [PATCH] [24/58] x86_64: Untangle asm/hpet.h from asm/timex.h Andi Kleen
@ 2007-07-19  9:55 ` Andi Kleen
  2007-07-19  9:55 ` [PATCH] [26/58] x86_64: Use generic xtime init Andi Kleen
                   ` (32 subsequent siblings)
  57 siblings, 0 replies; 119+ messages in thread
From: Andi Kleen @ 2007-07-19  9:55 UTC (permalink / raw)
  To: tglx, johnstul, patches, linux-kernel


From: Thomas Gleixner <tglx@linutronix.de>
Use the generic cmos update function in kernel/time/ntp.c
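
For context, a simplified sketch (not part of this patch; the exact
timer and locking details in kernel/time/ntp.c are omitted) of how the
generic side uses the arch hook that is added below:

/* illustrative only: the generic code calls update_persistent_clock()
 * periodically (roughly every 11 minutes) while NTP is synchronized */
static void sync_cmos_clock_sketch(void)
{
	struct timespec now;

	if (!ntp_synced())
		return;
	getnstimeofday(&now);
	/* aim for the jiffy closest to 500 ms before the next second */
	if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2)
		update_persistent_clock(now);
}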

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: john stultz <johnstul@us.ibm.com>
---

 arch/x86_64/Kconfig       |    4 ++++
 arch/x86_64/kernel/time.c |   25 +++++++++----------------
 2 files changed, 13 insertions(+), 16 deletions(-)

Index: linux/arch/x86_64/Kconfig
===================================================================
--- linux.orig/arch/x86_64/Kconfig
+++ linux/arch/x86_64/Kconfig
@@ -32,6 +32,10 @@ config GENERIC_TIME_VSYSCALL
 	bool
 	default y
 
+config GENERIC_CMOS_UPDATE
+	bool
+	default y
+
 config ZONE_DMA32
 	bool
 	default y
Index: linux/arch/x86_64/kernel/time.c
===================================================================
--- linux.orig/arch/x86_64/kernel/time.c
+++ linux/arch/x86_64/kernel/time.c
@@ -80,8 +80,9 @@ EXPORT_SYMBOL(profile_pc);
  * sheet for details.
  */
 
-static void set_rtc_mmss(unsigned long nowtime)
+static int set_rtc_mmss(unsigned long nowtime)
 {
+	int retval = 0;
 	int real_seconds, real_minutes, cmos_minutes;
 	unsigned char control, freq_select;
 
@@ -121,6 +122,7 @@ static void set_rtc_mmss(unsigned long n
 	if (abs(real_minutes - cmos_minutes) >= 30) {
 		printk(KERN_WARNING "time.c: can't update CMOS clock "
 		       "from %d to %d\n", cmos_minutes, real_minutes);
+		retval = -1;
 	} else {
 		BIN_TO_BCD(real_seconds);
 		BIN_TO_BCD(real_minutes);
@@ -140,12 +142,17 @@ static void set_rtc_mmss(unsigned long n
 	CMOS_WRITE(freq_select, RTC_FREQ_SELECT);
 
 	spin_unlock(&rtc_lock);
+
+	return retval;
 }
 
+int update_persistent_clock(struct timespec now)
+{
+	return set_rtc_mmss(now.tv_sec);
+}
 
 void main_timer_handler(void)
 {
-	static unsigned long rtc_update = 0;
 /*
  * Here we are in the timer irq handler. We have irqs locally disabled (so we
  * don't need spin_lock_irqsave()) but we don't know if the timer_bh is running
@@ -173,20 +180,6 @@ void main_timer_handler(void)
 	if (!using_apic_timer)
 		smp_local_timer_interrupt();
 
-/*
- * If we have an externally synchronized Linux clock, then update CMOS clock
- * accordingly every ~11 minutes. set_rtc_mmss() will be called in the jiffy
- * closest to exactly 500 ms before the next second. If the update fails, we
- * don't care, as it'll be updated on the next turn, and the problem (time way
- * off) isn't likely to go away much sooner anyway.
- */
-
-	if (ntp_synced() && xtime.tv_sec > rtc_update &&
-		abs(xtime.tv_nsec - 500000000) <= tick_nsec / 2) {
-		set_rtc_mmss(xtime.tv_sec);
-		rtc_update = xtime.tv_sec + 660;
-	}
- 
 	write_sequnlock(&xtime_lock);
 }
 

^ permalink raw reply	[flat|nested] 119+ messages in thread

* [PATCH] [26/58] x86_64: Use generic xtime init
  2007-07-19  9:54 [PATCH] [0/58] First batch of x86 patches for .23 Andi Kleen
                   ` (24 preceding siblings ...)
  2007-07-19  9:55 ` [PATCH] [25/58] x86_64: use generic cmos update Andi Kleen
@ 2007-07-19  9:55 ` Andi Kleen
  2007-07-19  9:55 ` [PATCH] [27/58] x86_64: Remove dead code and other janitor work in tsc.c Andi Kleen
                   ` (31 subsequent siblings)
  57 siblings, 0 replies; 119+ messages in thread
From: Andi Kleen @ 2007-07-19  9:55 UTC (permalink / raw)
  To: tglx, patches, linux-kernel


From: Thomas Gleixner <tglx@linutronix.de>
xtime can be initialized, including the CMOS readout, by the generic
timekeeping code. Remove the arch-specific implementation.
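
For reference, a simplified sketch (not part of this patch) of what the
generic timekeeping code does with the read_persistent_clock() hook that
this patch provides in the diff below; this is what makes the
arch-specific xtime setup removable:

/* illustrative only: roughly what kernel/time/timekeeping.c does at boot */
void __init timekeeping_init_sketch(void)
{
	unsigned long sec = read_persistent_clock();

	xtime.tv_sec = sec;
	xtime.tv_nsec = 0;
	set_normalized_timespec(&wall_to_monotonic,
				-xtime.tv_sec, -xtime.tv_nsec);
}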

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andi Kleen <ak@suse.de>

---
 arch/x86_64/kernel/time.c |   40 +---------------------------------------
 1 file changed, 1 insertion(+), 39 deletions(-)

Index: linux/arch/x86_64/kernel/time.c
===================================================================
--- linux.orig/arch/x86_64/kernel/time.c
+++ linux/arch/x86_64/kernel/time.c
@@ -193,7 +193,7 @@ static irqreturn_t timer_interrupt(int i
 	return IRQ_HANDLED;
 }
 
-static unsigned long get_cmos_time(void)
+unsigned long read_persistent_clock(void)
 {
 	unsigned int year, mon, day, hour, min, sec;
 	unsigned long flags;
@@ -367,11 +367,6 @@ void __init time_init(void)
 {
 	if (nohpet)
 		hpet_address = 0;
-	xtime.tv_sec = get_cmos_time();
-	xtime.tv_nsec = 0;
-
-	set_normalized_timespec(&wall_to_monotonic,
-	                        -xtime.tv_sec, -xtime.tv_nsec);
 
 	if (hpet_arch_init())
 		hpet_address = 0;
@@ -408,54 +403,21 @@ void __init time_init(void)
 	setup_irq(0, &irq0);
 }
 
-
-static long clock_cmos_diff;
-static unsigned long sleep_start;
-
 /*
  * sysfs support for the timer.
  */
 
 static int timer_suspend(struct sys_device *dev, pm_message_t state)
 {
-	/*
-	 * Estimate time zone so that set_time can update the clock
-	 */
-	long cmos_time =  get_cmos_time();
-
-	clock_cmos_diff = -cmos_time;
-	clock_cmos_diff += get_seconds();
-	sleep_start = cmos_time;
 	return 0;
 }
 
 static int timer_resume(struct sys_device *dev)
 {
-	unsigned long flags;
-	unsigned long sec;
-	unsigned long ctime = get_cmos_time();
-	long sleep_length = (ctime - sleep_start) * HZ;
-
-	if (sleep_length < 0) {
-		printk(KERN_WARNING "Time skew detected in timer resume!\n");
-		/* The time after the resume must not be earlier than the time
-		 * before the suspend or some nasty things will happen
-		 */
-		sleep_length = 0;
-		ctime = sleep_start;
-	}
 	if (hpet_address)
 		hpet_reenable();
 	else
 		i8254_timer_resume();
-
-	sec = ctime + clock_cmos_diff;
-	write_seqlock_irqsave(&xtime_lock,flags);
-	xtime.tv_sec = sec;
-	xtime.tv_nsec = 0;
-	jiffies += sleep_length;
-	write_sequnlock_irqrestore(&xtime_lock,flags);
-	touch_softlockup_watchdog();
 	return 0;
 }
 

^ permalink raw reply	[flat|nested] 119+ messages in thread

* [PATCH] [27/58] x86_64: Remove dead code and other janitor work in  tsc.c
  2007-07-19  9:54 [PATCH] [0/58] First batch of x86 patches for .23 Andi Kleen
                   ` (25 preceding siblings ...)
  2007-07-19  9:55 ` [PATCH] [26/58] x86_64: Use generic xtime init Andi Kleen
@ 2007-07-19  9:55 ` Andi Kleen
  2007-07-19  9:55 ` [PATCH] [28/58] x86_64: Fix APIC typo Andi Kleen
                   ` (30 subsequent siblings)
  57 siblings, 0 replies; 119+ messages in thread
From: Andi Kleen @ 2007-07-19  9:55 UTC (permalink / raw)
  To: tglx, johnstul, patches, linux-kernel


From: Thomas Gleixner <tglx@linutronix.de>
Remove unused code and variables and do some coding style / whitespace
cleanups while at it.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: john stultz <johnstul@us.ibm.com>
---

 arch/x86_64/kernel/tsc.c |   39 +++++++++++----------------------------
 1 file changed, 11 insertions(+), 28 deletions(-)

Index: linux/arch/x86_64/kernel/tsc.c
===================================================================
--- linux.orig/arch/x86_64/kernel/tsc.c
+++ linux/arch/x86_64/kernel/tsc.c
@@ -36,25 +36,9 @@ static inline int check_tsc_unstable(voi
  * first tick after the change will be slightly wrong.
  */
 
-#include <linux/workqueue.h>
-
-static unsigned int cpufreq_delayed_issched = 0;
-static unsigned int cpufreq_init = 0;
-static struct work_struct cpufreq_delayed_get_work;
-
-static void handle_cpufreq_delayed_get(struct work_struct *v)
-{
-	unsigned int cpu;
-	for_each_online_cpu(cpu) {
-		cpufreq_get(cpu);
-	}
-	cpufreq_delayed_issched = 0;
-}
-
-static unsigned int  ref_freq = 0;
-static unsigned long loops_per_jiffy_ref = 0;
-
-static unsigned long tsc_khz_ref = 0;
+static unsigned int  ref_freq;
+static unsigned long loops_per_jiffy_ref;
+static unsigned long tsc_khz_ref;
 
 static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
 				 void *data)
@@ -98,10 +82,8 @@ static struct notifier_block time_cpufre
 
 static int __init cpufreq_tsc(void)
 {
-	INIT_WORK(&cpufreq_delayed_get_work, handle_cpufreq_delayed_get);
-	if (!cpufreq_register_notifier(&time_cpufreq_notifier_block,
-				       CPUFREQ_TRANSITION_NOTIFIER))
-		cpufreq_init = 1;
+	cpufreq_register_notifier(&time_cpufreq_notifier_block,
+				  CPUFREQ_TRANSITION_NOTIFIER);
 	return 0;
 }
 
@@ -123,17 +105,18 @@ __cpuinit int unsynchronized_tsc(void)
 #endif
 	/* Most intel systems have synchronized TSCs except for
 	   multi node systems */
- 	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) {
+	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) {
 #ifdef CONFIG_ACPI
 		/* But TSC doesn't tick in C3 so don't use it there */
-		if (acpi_gbl_FADT.header.length > 0 && acpi_gbl_FADT.C3latency < 1000)
+		if (acpi_gbl_FADT.header.length > 0 &&
+		    acpi_gbl_FADT.C3latency < 1000)
 			return 1;
 #endif
- 		return 0;
+		return 0;
 	}
 
- 	/* Assume multi socket systems are not synchronized */
- 	return num_present_cpus() > 1;
+	/* Assume multi socket systems are not synchronized */
+	return num_present_cpus() > 1;
 }
 
 int __init notsc_setup(char *s)

^ permalink raw reply	[flat|nested] 119+ messages in thread

* [PATCH] [28/58] x86_64: Fix APIC typo
  2007-07-19  9:54 [PATCH] [0/58] First batch of x86 patches for .23 Andi Kleen
                   ` (26 preceding siblings ...)
  2007-07-19  9:55 ` [PATCH] [27/58] x86_64: Remove dead code and other janitor work in tsc.c Andi Kleen
@ 2007-07-19  9:55 ` Andi Kleen
  2007-07-19  9:55 ` [PATCH] [29/58] x86_64: fixup pt_regs leftovers Andi Kleen
                   ` (29 subsequent siblings)
  57 siblings, 0 replies; 119+ messages in thread
From: Andi Kleen @ 2007-07-19  9:55 UTC (permalink / raw)
  To: tglx, patches, linux-kernel


From: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andi Kleen <ak@suse.de>

---

 arch/x86_64/kernel/apic.c    |    4 ++--
 arch/x86_64/kernel/mce_amd.c |    6 +++---
 include/asm-x86_64/apic.h    |    4 ++--
 3 files changed, 7 insertions(+), 7 deletions(-)

Index: linux/arch/x86_64/kernel/apic.c
===================================================================
--- linux.orig/arch/x86_64/kernel/apic.c
+++ linux/arch/x86_64/kernel/apic.c
@@ -990,8 +990,8 @@ int setup_profiling_timer(unsigned int m
 	return -EINVAL;
 }
 
-void setup_APIC_extened_lvt(unsigned char lvt_off, unsigned char vector,
-			    unsigned char msg_type, unsigned char mask)
+void setup_APIC_extended_lvt(unsigned char lvt_off, unsigned char vector,
+			     unsigned char msg_type, unsigned char mask)
 {
 	unsigned long reg = (lvt_off << 4) + K8_APIC_EXT_LVT_BASE;
 	unsigned int  v   = (mask << 16) | (msg_type << 8) | vector;
Index: linux/arch/x86_64/kernel/mce_amd.c
===================================================================
--- linux.orig/arch/x86_64/kernel/mce_amd.c
+++ linux/arch/x86_64/kernel/mce_amd.c
@@ -157,9 +157,9 @@ void __cpuinit mce_amd_feature_init(stru
 			high |= K8_APIC_EXT_LVT_ENTRY_THRESHOLD << 20;
 			wrmsr(address, low, high);
 
-			setup_APIC_extened_lvt(K8_APIC_EXT_LVT_ENTRY_THRESHOLD,
-					       THRESHOLD_APIC_VECTOR,
-					       K8_APIC_EXT_INT_MSG_FIX, 0);
+			setup_APIC_extended_lvt(K8_APIC_EXT_LVT_ENTRY_THRESHOLD,
+						THRESHOLD_APIC_VECTOR,
+						K8_APIC_EXT_INT_MSG_FIX, 0);
 
 			threshold_defaults.address = address;
 			threshold_restart_bank(&threshold_defaults, 0, 0);
Index: linux/include/asm-x86_64/apic.h
===================================================================
--- linux.orig/include/asm-x86_64/apic.h
+++ linux/include/asm-x86_64/apic.h
@@ -83,8 +83,8 @@ extern void disable_APIC_timer(void);
 extern void enable_APIC_timer(void);
 extern void setup_apic_routing(void);
 
-extern void setup_APIC_extened_lvt(unsigned char lvt_off, unsigned char vector,
-				   unsigned char msg_type, unsigned char mask);
+extern void setup_APIC_extended_lvt(unsigned char lvt_off, unsigned char vector,
+				    unsigned char msg_type, unsigned char mask);
 
 extern int apic_is_clustered_box(void);
 

^ permalink raw reply	[flat|nested] 119+ messages in thread

* [PATCH] [29/58] x86_64: fixup pt_regs leftovers
  2007-07-19  9:54 [PATCH] [0/58] First batch of x86 patches for .23 Andi Kleen
                   ` (27 preceding siblings ...)
  2007-07-19  9:55 ` [PATCH] [28/58] x86_64: Fix APIC typo Andi Kleen
@ 2007-07-19  9:55 ` Andi Kleen
  2007-07-19  9:55 ` [PATCH] [30/58] x86: share hpet.h with i386 Andi Kleen
                   ` (28 subsequent siblings)
  57 siblings, 0 replies; 119+ messages in thread
From: Andi Kleen @ 2007-07-19  9:55 UTC (permalink / raw)
  To: tglx, patches, linux-kernel


From: Thomas Gleixner <tglx@linutronix.de>
The hpet_rtc_interrupt handler still uses pt_regs. Fix it.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andi Kleen <ak@suse.de>

---
 arch/x86_64/kernel/hpet.c |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

Index: linux/arch/x86_64/kernel/hpet.c
===================================================================
--- linux.orig/arch/x86_64/kernel/hpet.c
+++ linux/arch/x86_64/kernel/hpet.c
@@ -439,7 +439,7 @@ int hpet_rtc_dropped_irq(void)
 	return 1;
 }
 
-irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id, struct pt_regs *regs)
+irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id)
 {
 	struct rtc_time curr_time;
 	unsigned long rtc_int_flag = 0;

^ permalink raw reply	[flat|nested] 119+ messages in thread

* [PATCH] [30/58] x86: share hpet.h with i386
  2007-07-19  9:54 [PATCH] [0/58] First batch of x86 patches for .23 Andi Kleen
                   ` (28 preceding siblings ...)
  2007-07-19  9:55 ` [PATCH] [29/58] x86_64: fixup pt_regs leftovers Andi Kleen
@ 2007-07-19  9:55 ` Andi Kleen
  2007-07-19  9:55 ` [PATCH] [31/58] x86_64: apic.c coding style janitor work Andi Kleen
                   ` (27 subsequent siblings)
  57 siblings, 0 replies; 119+ messages in thread
From: Andi Kleen @ 2007-07-19  9:55 UTC (permalink / raw)
  To: tglx, patches, linux-kernel


From: Thomas Gleixner <tglx@linutronix.de>
hpet.h in asm-i386 and asm-x86_64 contain tons of duplicated stuff.
Consolidate into one shared header file.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andi Kleen <ak@suse.de>

---
 include/asm-i386/hpet.h   |  124 +++++++++++++++++-----------------------------
 include/asm-x86_64/hpet.h |   61 ----------------------
 2 files changed, 48 insertions(+), 137 deletions(-)

Index: linux/include/asm-i386/hpet.h
===================================================================
--- linux.orig/include/asm-i386/hpet.h
+++ linux/include/asm-i386/hpet.h
@@ -4,112 +4,82 @@
 
 #ifdef CONFIG_HPET_TIMER
 
-#include <linux/errno.h>
-#include <linux/module.h>
-#include <linux/sched.h>
-#include <linux/kernel.h>
-#include <linux/param.h>
-#include <linux/string.h>
-#include <linux/mm.h>
-#include <linux/interrupt.h>
-#include <linux/time.h>
-#include <linux/delay.h>
-#include <linux/init.h>
-#include <linux/smp.h>
-
-#include <asm/io.h>
-#include <asm/smp.h>
-#include <asm/irq.h>
-#include <asm/msr.h>
-#include <asm/delay.h>
-#include <asm/mpspec.h>
-#include <asm/uaccess.h>
-#include <asm/processor.h>
-
-#include <linux/timex.h>
-
 /*
  * Documentation on HPET can be found at:
  *      http://www.intel.com/ial/home/sp/pcmmspec.htm
  *      ftp://download.intel.com/ial/home/sp/mmts098.pdf
  */
 
-#define HPET_MMAP_SIZE	1024
+#define HPET_MMAP_SIZE		1024
 
-#define HPET_ID		0x000
-#define HPET_PERIOD	0x004
-#define HPET_CFG	0x010
-#define HPET_STATUS	0x020
-#define HPET_COUNTER	0x0f0
-#define HPET_T0_CFG	0x100
-#define HPET_T0_CMP	0x108
-#define HPET_T0_ROUTE	0x110
-#define HPET_T1_CFG	0x120
-#define HPET_T1_CMP	0x128
-#define HPET_T1_ROUTE	0x130
-#define HPET_T2_CFG	0x140
-#define HPET_T2_CMP	0x148
-#define HPET_T2_ROUTE	0x150
-
-#define HPET_ID_LEGSUP	0x00008000
-#define HPET_ID_NUMBER	0x00001f00
-#define HPET_ID_REV	0x000000ff
+#define HPET_ID			0x000
+#define HPET_PERIOD		0x004
+#define HPET_CFG		0x010
+#define HPET_STATUS		0x020
+#define HPET_COUNTER		0x0f0
+#define HPET_T0_CFG		0x100
+#define HPET_T0_CMP		0x108
+#define HPET_T0_ROUTE		0x110
+#define HPET_T1_CFG		0x120
+#define HPET_T1_CMP		0x128
+#define HPET_T1_ROUTE		0x130
+#define HPET_T2_CFG		0x140
+#define HPET_T2_CMP		0x148
+#define HPET_T2_ROUTE		0x150
+
+#define HPET_ID_REV		0x000000ff
+#define HPET_ID_NUMBER		0x00001f00
+#define HPET_ID_64BIT		0x00002000
+#define HPET_ID_LEGSUP		0x00008000
+#define HPET_ID_VENDOR		0xffff0000
 #define	HPET_ID_NUMBER_SHIFT	8
+#define HPET_ID_VENDOR_SHIFT	16
 
-#define HPET_CFG_ENABLE	0x001
-#define HPET_CFG_LEGACY	0x002
+#define HPET_ID_VENDOR_8086	0x8086
+
+#define HPET_CFG_ENABLE		0x001
+#define HPET_CFG_LEGACY		0x002
 #define	HPET_LEGACY_8254	2
 #define	HPET_LEGACY_RTC		8
 
-#define HPET_TN_ENABLE		0x004
-#define HPET_TN_PERIODIC	0x008
-#define HPET_TN_PERIODIC_CAP	0x010
-#define HPET_TN_SETVAL		0x040
-#define HPET_TN_32BIT		0x100
-
-/* Use our own asm for 64 bit multiply/divide */
-#define ASM_MUL64_REG(eax_out,edx_out,reg_in,eax_in) 			\
-		__asm__ __volatile__("mull %2" 				\
-				:"=a" (eax_out), "=d" (edx_out) 	\
-				:"r" (reg_in), "0" (eax_in))
-
-#define ASM_DIV64_REG(eax_out,edx_out,reg_in,eax_in,edx_in) 		\
-		__asm__ __volatile__("divl %2" 				\
-				:"=a" (eax_out), "=d" (edx_out) 	\
-				:"r" (reg_in), "0" (eax_in), "1" (edx_in))
+#define HPET_TN_LEVEL		0x0002
+#define HPET_TN_ENABLE		0x0004
+#define HPET_TN_PERIODIC	0x0008
+#define HPET_TN_PERIODIC_CAP	0x0010
+#define HPET_TN_64BIT_CAP	0x0020
+#define HPET_TN_SETVAL		0x0040
+#define HPET_TN_32BIT		0x0100
+#define HPET_TN_ROUTE		0x3e00
+#define HPET_TN_FSB		0x4000
+#define HPET_TN_FSB_CAP		0x8000
+#define HPET_TN_ROUTE_SHIFT	9
 
-#define KERNEL_TICK_USEC 	(1000000UL/HZ)	/* tick value in microsec */
 /* Max HPET Period is 10^8 femto sec as in HPET spec */
-#define HPET_MAX_PERIOD (100000000UL)
+#define HPET_MAX_PERIOD		100000000UL
 /*
  * Min HPET period is 10^5 femto sec just for safety. If it is less than this,
  * then 32 bit HPET counter wrapsaround in less than 0.5 sec.
  */
-#define HPET_MIN_PERIOD (100000UL)
-#define HPET_TICK_RATE  (HZ * 100000UL)
+#define HPET_MIN_PERIOD		100000UL
 
-extern unsigned long hpet_address;	/* hpet memory map physical address */
+/* hpet memory map physical address */
+extern unsigned long hpet_address;
 extern int is_hpet_enabled(void);
-
-#ifdef CONFIG_X86_64
-extern unsigned long hpet_tick;	/* hpet clks count per tick */
-extern int hpet_use_timer;
-extern int hpet_rtc_timer_init(void);
 extern int hpet_enable(void);
-extern int is_hpet_capable(void);
-extern int hpet_readl(unsigned long a);
-#else
-extern int hpet_enable(void);
-#endif
 
 #ifdef CONFIG_HPET_EMULATE_RTC
+
+#include <linux/interrupt.h>
+
 extern int hpet_mask_rtc_irq_bit(unsigned long bit_mask);
 extern int hpet_set_rtc_irq_bit(unsigned long bit_mask);
-extern int hpet_set_alarm_time(unsigned char hrs, unsigned char min, unsigned char sec);
+extern int hpet_set_alarm_time(unsigned char hrs, unsigned char min,
+			       unsigned char sec);
 extern int hpet_set_periodic_freq(unsigned long freq);
 extern int hpet_rtc_dropped_irq(void);
 extern int hpet_rtc_timer_init(void);
 extern irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id);
+
 #endif /* CONFIG_HPET_EMULATE_RTC */
 
 #else
Index: linux/include/asm-x86_64/hpet.h
===================================================================
--- linux.orig/include/asm-x86_64/hpet.h
+++ linux/include/asm-x86_64/hpet.h
@@ -1,59 +1,10 @@
 #ifndef _ASM_X8664_HPET_H
 #define _ASM_X8664_HPET_H 1
 
-/*
- * Documentation on HPET can be found at:
- *      http://www.intel.com/ial/home/sp/pcmmspec.htm
- *      ftp://download.intel.com/ial/home/sp/mmts098.pdf
- */
-
-#define HPET_MMAP_SIZE	1024
-
-#define HPET_ID		0x000
-#define HPET_PERIOD	0x004
-#define HPET_CFG	0x010
-#define HPET_STATUS	0x020
-#define HPET_COUNTER	0x0f0
-#define HPET_Tn_OFFSET	0x20
-#define HPET_Tn_CFG(n)	 (0x100 + (n) * HPET_Tn_OFFSET)
-#define HPET_Tn_ROUTE(n) (0x104 + (n) * HPET_Tn_OFFSET)
-#define HPET_Tn_CMP(n)	 (0x108 + (n) * HPET_Tn_OFFSET)
-#define HPET_T0_CFG	HPET_Tn_CFG(0)
-#define HPET_T0_CMP	HPET_Tn_CMP(0)
-#define HPET_T1_CFG	HPET_Tn_CFG(1)
-#define HPET_T1_CMP	HPET_Tn_CMP(1)
-
-#define HPET_ID_VENDOR	0xffff0000
-#define HPET_ID_LEGSUP	0x00008000
-#define HPET_ID_64BIT	0x00002000
-#define HPET_ID_NUMBER	0x00001f00
-#define HPET_ID_REV	0x000000ff
-#define	HPET_ID_NUMBER_SHIFT	8
-
-#define HPET_ID_VENDOR_SHIFT	16
-#define HPET_ID_VENDOR_8086	0x8086
-
-#define HPET_CFG_ENABLE	0x001
-#define HPET_CFG_LEGACY	0x002
-#define	HPET_LEGACY_8254	2
-#define	HPET_LEGACY_RTC		8
-
-#define HPET_TN_LEVEL		0x0002
-#define HPET_TN_ENABLE		0x0004
-#define HPET_TN_PERIODIC	0x0008
-#define HPET_TN_PERIODIC_CAP	0x0010
-#define HPET_TN_64BIT_CAP	0x0020
-#define HPET_TN_SETVAL		0x0040
-#define HPET_TN_32BIT		0x0100
-#define HPET_TN_ROUTE		0x3e00
-#define HPET_TN_FSB		0x4000
-#define HPET_TN_FSB_CAP		0x8000
-
-#define HPET_TN_ROUTE_SHIFT	9
+#include <asm-i386/hpet.h>
 
 #define HPET_TICK_RATE (HZ * 100000UL)
 
-extern int is_hpet_enabled(void);
 extern int hpet_rtc_timer_init(void);
 extern int hpet_arch_init(void);
 extern int hpet_timer_stop_set_go(unsigned long tick);
@@ -61,17 +12,7 @@ extern int hpet_reenable(void);
 extern unsigned int hpet_calibrate_tsc(void);
 
 extern int hpet_use_timer;
-extern unsigned long hpet_address;
 extern unsigned long hpet_period;
 extern unsigned long hpet_tick;
 
-#ifdef CONFIG_HPET_EMULATE_RTC
-extern int hpet_mask_rtc_irq_bit(unsigned long bit_mask);
-extern int hpet_set_rtc_irq_bit(unsigned long bit_mask);
-extern int hpet_set_alarm_time(unsigned char hrs, unsigned char min, unsigned char sec);
-extern int hpet_set_periodic_freq(unsigned long freq);
-extern int hpet_rtc_dropped_irq(void);
-extern int hpet_rtc_timer_init(void);
-#endif /* CONFIG_HPET_EMULATE_RTC */
-
 #endif

^ permalink raw reply	[flat|nested] 119+ messages in thread

* [PATCH] [31/58] x86_64: apic.c coding style janitor work
  2007-07-19  9:54 [PATCH] [0/58] First batch of x86 patches for .23 Andi Kleen
                   ` (29 preceding siblings ...)
  2007-07-19  9:55 ` [PATCH] [30/58] x86: share hpet.h with i386 Andi Kleen
@ 2007-07-19  9:55 ` Andi Kleen
  2007-07-19  9:55 ` [PATCH] [32/58] x86_64: time.c white space wreckage cleanup Andi Kleen
                   ` (26 subsequent siblings)
  57 siblings, 0 replies; 119+ messages in thread
From: Andi Kleen @ 2007-07-19  9:55 UTC (permalink / raw)
  To: tglx, patches, linux-kernel


From: Thomas Gleixner <tglx@linutronix.de>
Fix coding style and white space wreckage, and remove unused code.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andi Kleen <ak@suse.de>

---
 arch/x86_64/kernel/apic.c |   73 ++++++++++++++++++----------------------------
 1 file changed, 30 insertions(+), 43 deletions(-)

Index: linux/arch/x86_64/kernel/apic.c
===================================================================
--- linux.orig/arch/x86_64/kernel/apic.c
+++ linux/arch/x86_64/kernel/apic.c
@@ -92,8 +92,9 @@ unsigned int safe_apic_wait_icr_idle(voi
 void enable_NMI_through_LVT0 (void * dummy)
 {
 	unsigned int v;
-	
-	v = APIC_DM_NMI;                        /* unmask and set to NMI */
+
+	/* unmask and set to NMI */
+	v = APIC_DM_NMI;
 	apic_write(APIC_LVT0, v);
 }
 
@@ -120,7 +121,7 @@ void ack_bad_irq(unsigned int irq)
 	 * holds up an irq slot - in excessive cases (when multiple
 	 * unexpected vectors occur) that might lock up the APIC
 	 * completely.
-  	 * But don't ack when the APIC is disabled. -AK
+	 * But don't ack when the APIC is disabled. -AK
 	 */
 	if (!disable_apic)
 		ack_APIC_irq();
@@ -616,7 +617,7 @@ early_param("apic", apic_set_verbosity);
  * Detect and enable local APICs on non-SMP boards.
  * Original code written by Keir Fraser.
  * On AMD64 we trust the BIOS - if it says no APIC it is likely
- * not correctly set up (usually the APIC timer won't work etc.) 
+ * not correctly set up (usually the APIC timer won't work etc.)
  */
 
 static int __init detect_init_APIC (void)
@@ -789,13 +790,13 @@ static void setup_APIC_timer(unsigned in
 	local_irq_save(flags);
 
 	/* wait for irq slice */
- 	if (hpet_address && hpet_use_timer) {
- 		int trigger = hpet_readl(HPET_T0_CMP);
- 		while (hpet_readl(HPET_COUNTER) >= trigger)
- 			/* do nothing */ ;
- 		while (hpet_readl(HPET_COUNTER) <  trigger)
- 			/* do nothing */ ;
- 	} else {
+	if (hpet_address && hpet_use_timer) {
+		int trigger = hpet_readl(HPET_T0_CMP);
+		while (hpet_readl(HPET_COUNTER) >= trigger)
+			/* do nothing */ ;
+		while (hpet_readl(HPET_COUNTER) <  trigger)
+			/* do nothing */ ;
+	} else {
 		int c1, c2;
 		outb_p(0x00, 0x43);
 		c2 = inb_p(0x40);
@@ -881,10 +882,10 @@ static unsigned int calibration_result;
 
 void __init setup_boot_APIC_clock (void)
 {
-	if (disable_apic_timer) { 
-		printk(KERN_INFO "Disabling APIC timer\n"); 
-		return; 
-	} 
+	if (disable_apic_timer) {
+		printk(KERN_INFO "Disabling APIC timer\n");
+		return;
+	}
 
 	printk(KERN_INFO "Using local APIC timer interrupts.\n");
 	using_apic_timer = 1;
@@ -1128,20 +1129,6 @@ asmlinkage void smp_spurious_interrupt(v
 	if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f)))
 		ack_APIC_irq();
 
-#if 0
-	static unsigned long last_warning; 
-	static unsigned long skipped; 
-
-	/* see sw-dev-man vol 3, chapter 7.4.13.5 */
-	if (time_before(last_warning+30*HZ,jiffies)) { 
-		printk(KERN_INFO "spurious APIC interrupt on CPU#%d, %ld skipped.\n",
-		       smp_processor_id(), skipped);
-		last_warning = jiffies; 
-		skipped = 0;
-	} else { 
-		skipped++; 
-	} 
-#endif 
 	irq_exit();
 }
 
@@ -1173,11 +1160,11 @@ asmlinkage void smp_error_interrupt(void
 	   7: Illegal register address
 	*/
 	printk (KERN_DEBUG "APIC error on CPU%d: %02x(%02x)\n",
-	        smp_processor_id(), v , v1);
+		smp_processor_id(), v , v1);
 	irq_exit();
 }
 
-int disable_apic; 
+int disable_apic;
 
 /*
  * This initializes the IO-APIC and APIC hardware if this is
@@ -1185,11 +1172,11 @@ int disable_apic; 
  */
 int __init APIC_init_uniprocessor (void)
 {
-	if (disable_apic) { 
+	if (disable_apic) {
 		printk(KERN_INFO "Apic disabled\n");
-		return -1; 
+		return -1;
 	}
-	if (!cpu_has_apic) { 
+	if (!cpu_has_apic) {
 		disable_apic = 1;
 		printk(KERN_INFO "Apic disabled by BIOS\n");
 		return -1;
@@ -1211,8 +1198,8 @@ int __init APIC_init_uniprocessor (void)
 	return 0;
 }
 
-static __init int setup_disableapic(char *str) 
-{ 
+static __init int setup_disableapic(char *str)
+{
 	disable_apic = 1;
 	clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
 	return 0;
@@ -1220,10 +1207,10 @@ static __init int setup_disableapic(char
 early_param("disableapic", setup_disableapic);
 
 /* same as disableapic, for compatibility */
-static __init int setup_nolapic(char *str) 
-{ 
+static __init int setup_nolapic(char *str)
+{
 	return setup_disableapic(str);
-} 
+}
 early_param("nolapic", setup_nolapic);
 
 static int __init parse_lapic_timer_c2_ok(char *arg)
@@ -1233,13 +1220,13 @@ static int __init parse_lapic_timer_c2_o
 }
 early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok);
 
-static __init int setup_noapictimer(char *str) 
-{ 
+static __init int setup_noapictimer(char *str)
+{
 	if (str[0] != ' ' && str[0] != 0)
 		return 0;
 	disable_apic_timer = 1;
 	return 1;
-} 
+}
 
 static __init int setup_apicmaintimer(char *str)
 {
@@ -1264,5 +1251,5 @@ static __init int setup_apicpmtimer(char
 }
 __setup("apicpmtimer", setup_apicpmtimer);
 
-__setup("noapictimer", setup_noapictimer); 
+__setup("noapictimer", setup_noapictimer);
 

^ permalink raw reply	[flat|nested] 119+ messages in thread

* [PATCH] [32/58] x86_64: time.c white space wreckage cleanup
  2007-07-19  9:54 [PATCH] [0/58] First batch of x86 patches for .23 Andi Kleen
                   ` (30 preceding siblings ...)
  2007-07-19  9:55 ` [PATCH] [31/58] x86_64: apic.c coding style janitor work Andi Kleen
@ 2007-07-19  9:55 ` Andi Kleen
  2007-07-19  9:55 ` [PATCH] [33/58] x86_64: Avoid too many remote cpu references due to /proc/stat Andi Kleen
                   ` (25 subsequent siblings)
  57 siblings, 0 replies; 119+ messages in thread
From: Andi Kleen @ 2007-07-19  9:55 UTC (permalink / raw)
  To: tglx, patches, linux-kernel


From: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andi Kleen <ak@suse.de>

---
 arch/x86_64/kernel/time.c |   88 +++++++++++++++++++++++-----------------------
 1 file changed, 44 insertions(+), 44 deletions(-)

Index: linux/arch/x86_64/kernel/time.c
===================================================================
--- linux.orig/arch/x86_64/kernel/time.c
+++ linux/arch/x86_64/kernel/time.c
@@ -220,7 +220,7 @@ unsigned long read_persistent_clock(void
 	/*
 	 * We know that x86-64 always uses BCD format, no need to check the
 	 * config register.
- 	 */
+	 */
 
 	BCD_TO_BIN(sec);
 	BCD_TO_BIN(min);
@@ -233,11 +233,11 @@ unsigned long read_persistent_clock(void
 		BCD_TO_BIN(century);
 		year += century * 100;
 		printk(KERN_INFO "Extended CMOS year: %d\n", century * 100);
-	} else { 
+	} else {
 		/*
 		 * x86-64 systems only exists since 2002.
 		 * This will work up to Dec 31, 2100
-	 	 */
+		 */
 		year += 2000;
 	}
 
@@ -249,45 +249,45 @@ unsigned long read_persistent_clock(void
 #define TICK_COUNT 100000000
 static unsigned int __init tsc_calibrate_cpu_khz(void)
 {
-       int tsc_start, tsc_now;
-       int i, no_ctr_free;
-       unsigned long evntsel3 = 0, pmc3 = 0, pmc_now = 0;
-       unsigned long flags;
-
-       for (i = 0; i < 4; i++)
-               if (avail_to_resrv_perfctr_nmi_bit(i))
-                       break;
-       no_ctr_free = (i == 4);
-       if (no_ctr_free) {
-               i = 3;
-               rdmsrl(MSR_K7_EVNTSEL3, evntsel3);
-               wrmsrl(MSR_K7_EVNTSEL3, 0);
-               rdmsrl(MSR_K7_PERFCTR3, pmc3);
-       } else {
-               reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i);
-               reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
-       }
-       local_irq_save(flags);
-       /* start meauring cycles, incrementing from 0 */
-       wrmsrl(MSR_K7_PERFCTR0 + i, 0);
-       wrmsrl(MSR_K7_EVNTSEL0 + i, 1 << 22 | 3 << 16 | 0x76);
-       rdtscl(tsc_start);
-       do {
-               rdmsrl(MSR_K7_PERFCTR0 + i, pmc_now);
-               tsc_now = get_cycles_sync();
-       } while ((tsc_now - tsc_start) < TICK_COUNT);
-
-       local_irq_restore(flags);
-       if (no_ctr_free) {
-               wrmsrl(MSR_K7_EVNTSEL3, 0);
-               wrmsrl(MSR_K7_PERFCTR3, pmc3);
-               wrmsrl(MSR_K7_EVNTSEL3, evntsel3);
-       } else {
-               release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
-               release_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
-       }
+	int tsc_start, tsc_now;
+	int i, no_ctr_free;
+	unsigned long evntsel3 = 0, pmc3 = 0, pmc_now = 0;
+	unsigned long flags;
+
+	for (i = 0; i < 4; i++)
+		if (avail_to_resrv_perfctr_nmi_bit(i))
+			break;
+	no_ctr_free = (i == 4);
+	if (no_ctr_free) {
+		i = 3;
+		rdmsrl(MSR_K7_EVNTSEL3, evntsel3);
+		wrmsrl(MSR_K7_EVNTSEL3, 0);
+		rdmsrl(MSR_K7_PERFCTR3, pmc3);
+	} else {
+		reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i);
+		reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
+	}
+	local_irq_save(flags);
+	/* start meauring cycles, incrementing from 0 */
+	wrmsrl(MSR_K7_PERFCTR0 + i, 0);
+	wrmsrl(MSR_K7_EVNTSEL0 + i, 1 << 22 | 3 << 16 | 0x76);
+	rdtscl(tsc_start);
+	do {
+		rdmsrl(MSR_K7_PERFCTR0 + i, pmc_now);
+		tsc_now = get_cycles_sync();
+	} while ((tsc_now - tsc_start) < TICK_COUNT);
+
+	local_irq_restore(flags);
+	if (no_ctr_free) {
+		wrmsrl(MSR_K7_EVNTSEL3, 0);
+		wrmsrl(MSR_K7_PERFCTR3, pmc3);
+		wrmsrl(MSR_K7_EVNTSEL3, evntsel3);
+	} else {
+		release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
+		release_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
+	}
 
-       return pmc_now * tsc_khz / (tsc_now - tsc_start);
+	return pmc_now * tsc_khz / (tsc_now - tsc_start);
 }
 
 /*
@@ -315,7 +315,7 @@ static unsigned int __init pit_calibrate
 	end = get_cycles_sync();
 
 	spin_unlock_irqrestore(&i8253_lock, flags);
-	
+
 	return (end - start) / 50;
 }
 
@@ -360,7 +360,7 @@ static struct irqaction irq0 = {
 	.handler	= timer_interrupt,
 	.flags		= IRQF_DISABLED | IRQF_IRQPOLL,
 	.mask		= CPU_MASK_NONE,
-	.name 		= "timer"
+	.name		= "timer"
 };
 
 void __init time_init(void)
@@ -373,7 +373,7 @@ void __init time_init(void)
 
 	if (hpet_use_timer) {
 		/* set tick_nsec to use the proper rate for HPET */
-	  	tick_nsec = TICK_NSEC_HPET;
+		tick_nsec = TICK_NSEC_HPET;
 		tsc_khz = hpet_calibrate_tsc();
 		timename = "HPET";
 	} else {

^ permalink raw reply	[flat|nested] 119+ messages in thread

* [PATCH] [33/58] x86_64: Avoid too many remote cpu references due to /proc/stat
  2007-07-19  9:54 [PATCH] [0/58] First batch of x86 patches for .23 Andi Kleen
                   ` (31 preceding siblings ...)
  2007-07-19  9:55 ` [PATCH] [32/58] x86_64: time.c white space wreckage cleanup Andi Kleen
@ 2007-07-19  9:55 ` Andi Kleen
  2007-07-19 10:21   ` Christoph Hellwig
  2007-07-19  9:55 ` [PATCH] [34/58] x86_64: ia32entry adjustments Andi Kleen
                   ` (24 subsequent siblings)
  57 siblings, 1 reply; 119+ messages in thread
From: Andi Kleen @ 2007-07-19  9:55 UTC (permalink / raw)
  To: kiran, patches, linux-kernel


From: Ravikiran G Thirumalai <kiran@scalex86.org>
Too many remote cpu references due to /proc/stat.

On x86_64, with newer kernel versions, kstat_irqs is a bit of a problem.
On every call to kstat_irqs, the process brings in per-cpu data from all
online cpus.  Doing this for NR_IRQS, which is now 256 + 32 * NR_CPUS
results in (256+32*63) * 63 remote cpu references on a 64 cpu config.
/proc/stat is parsed by common commands like top, who etc., causing
lots of cacheline transfers.

This statistic seems useless. Other 'big iron' arches disable this.
Can we disable computing/reporting this statistic?  It is not human
readable on x86_64 anymore.

If not, can we optimize computing this statistic so as to avoid
too many remote references? (patch to follow)
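
(For illustration only -- a simplified sketch of why this is expensive,
not code from this patch: each per-IRQ value printed by /proc/stat is
itself a sum over per-cpu counters, so the whole loop touches remote
cachelines on the order of NR_IRQS * NR_CPUS times.)

/* simplified sketch of the summation behind every "intr" column */
static unsigned int kstat_irqs_sketch(unsigned int irq)
{
	unsigned int cpu, sum = 0;

	for_each_online_cpu(cpu)	/* one remote per-cpu access per CPU */
		sum += kstat_cpu(cpu).irqs[irq];
	return sum;
}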

Signed-off-by: Ravikiran Thirumalai <kiran@scalex86.org>
Signed-off-by: Andi Kleen <ak@suse.de>

---
 fs/proc/proc_misc.c |    3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

Index: linux/fs/proc/proc_misc.c
===================================================================
--- linux.orig/fs/proc/proc_misc.c
+++ linux/fs/proc/proc_misc.c
@@ -499,7 +499,8 @@ static int show_stat(struct seq_file *p,
 	}
 	seq_printf(p, "intr %llu", (unsigned long long)sum);
 
-#if !defined(CONFIG_PPC64) && !defined(CONFIG_ALPHA) && !defined(CONFIG_IA64)
+#if !defined(CONFIG_PPC64) && !defined(CONFIG_ALPHA) && !defined(CONFIG_IA64) \
+					&& !defined(CONFIG_X86_64)
 	for (i = 0; i < NR_IRQS; i++)
 		seq_printf(p, " %u", kstat_irqs(i));
 #endif

^ permalink raw reply	[flat|nested] 119+ messages in thread

* [PATCH] [34/58] x86_64: ia32entry adjustments
  2007-07-19  9:54 [PATCH] [0/58] First batch of x86 patches for .23 Andi Kleen
                   ` (32 preceding siblings ...)
  2007-07-19  9:55 ` [PATCH] [33/58] x86_64: Avoid too many remote cpu references due to /proc/stat Andi Kleen
@ 2007-07-19  9:55 ` Andi Kleen
  2007-07-19 14:46   ` Jeff Garzik
  2007-07-19  9:55 ` [PATCH] [35/58] i386: allow debuggers to access the vsyscall page with compat vDSO Andi Kleen
                   ` (23 subsequent siblings)
  57 siblings, 1 reply; 119+ messages in thread
From: Andi Kleen @ 2007-07-19  9:55 UTC (permalink / raw)
  To: jbeulich, patches, linux-kernel


From: "Jan Beulich" <jbeulich@novell.com>
Consolidate the three 32-bit system call entry points so that they all
treat registers in similar ways.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Andi Kleen <ak@suse.de>

 arch/x86_64/ia32/ia32entry.S |    5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

Index: linux/arch/x86_64/ia32/ia32entry.S
===================================================================
--- linux.orig/arch/x86_64/ia32/ia32entry.S
+++ linux/arch/x86_64/ia32/ia32entry.S
@@ -104,7 +104,7 @@ ENTRY(ia32_sysenter_target)
 	pushq	%rax
 	CFI_ADJUST_CFA_OFFSET 8
 	cld
-	SAVE_ARGS 0,0,0
+	SAVE_ARGS 0,0,1
  	/* no need to do an access_ok check here because rbp has been
  	   32bit zero extended */ 
 1:	movl	(%rbp),%r9d
@@ -294,7 +294,7 @@ ia32_badarg:
  */ 				
 
 ENTRY(ia32_syscall)
-	CFI_STARTPROC	simple
+	CFI_STARTPROC32	simple
 	CFI_SIGNAL_FRAME
 	CFI_DEF_CFA	rsp,SS+8-RIP
 	/*CFI_REL_OFFSET	ss,SS-RIP*/
@@ -330,6 +330,7 @@ ia32_sysret:
 
 ia32_tracesys:			 
 	SAVE_REST
+	CLEAR_RREGS
 	movq $-ENOSYS,RAX(%rsp)	/* really needed? */
 	movq %rsp,%rdi        /* &pt_regs -> arg1 */
 	call syscall_trace_enter

^ permalink raw reply	[flat|nested] 119+ messages in thread

* [PATCH] [35/58] i386: allow debuggers to access the vsyscall page with  compat vDSO
  2007-07-19  9:54 [PATCH] [0/58] First batch of x86 patches for .23 Andi Kleen
                   ` (33 preceding siblings ...)
  2007-07-19  9:55 ` [PATCH] [34/58] x86_64: ia32entry adjustments Andi Kleen
@ 2007-07-19  9:55 ` Andi Kleen
  2007-07-19  9:55 ` [PATCH] [36/58] x86_64: minor exception trace variables cleanup Andi Kleen
                   ` (22 subsequent siblings)
  57 siblings, 0 replies; 119+ messages in thread
From: Andi Kleen @ 2007-07-19  9:55 UTC (permalink / raw)
  To: jbeulich, patches, linux-kernel


From: "Jan Beulich" <jbeulich@novell.com>
Signed-off-by: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Andi Kleen <ak@suse.de>

 arch/i386/kernel/sysenter.c |    4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

Index: linux/arch/i386/kernel/sysenter.c
===================================================================
--- linux.orig/arch/i386/kernel/sysenter.c
+++ linux/arch/i386/kernel/sysenter.c
@@ -336,7 +336,9 @@ struct vm_area_struct *get_gate_vma(stru
 
 int in_gate_area(struct task_struct *task, unsigned long addr)
 {
-	return 0;
+	const struct vm_area_struct *vma = get_gate_vma(task);
+
+	return vma && addr >= vma->vm_start && addr < vma->vm_end;
 }
 
 int in_gate_area_no_task(unsigned long addr)

^ permalink raw reply	[flat|nested] 119+ messages in thread

* [PATCH] [36/58] x86_64: minor exception trace variables cleanup
  2007-07-19  9:54 [PATCH] [0/58] First batch of x86 patches for .23 Andi Kleen
                   ` (34 preceding siblings ...)
  2007-07-19  9:55 ` [PATCH] [35/58] i386: allow debuggers to access the vsyscall page with compat vDSO Andi Kleen
@ 2007-07-19  9:55 ` Andi Kleen
  2007-07-19  9:55 ` [PATCH] [37/58] x86_64: remove unused variable maxcpus Andi Kleen
                   ` (21 subsequent siblings)
  57 siblings, 0 replies; 119+ messages in thread
From: Andi Kleen @ 2007-07-19  9:55 UTC (permalink / raw)
  To: jbeulich, patches, linux-kernel


From: "Jan Beulich" <jbeulich@novell.com>
Signed-off-by: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Andi Kleen <ak@suse.de>

 arch/x86_64/mm/fault.c |    2 +-
 arch/x86_64/mm/init.c  |    2 --
 2 files changed, 1 insertion(+), 3 deletions(-)

Index: linux/arch/x86_64/mm/fault.c
===================================================================
--- linux.orig/arch/x86_64/mm/fault.c
+++ linux/arch/x86_64/mm/fault.c
@@ -301,7 +301,7 @@ static int vmalloc_fault(unsigned long a
 	return 0;
 }
 
-int page_fault_trace = 0;
+static int page_fault_trace;
 int exception_trace = 1;
 
 /*
Index: linux/arch/x86_64/mm/init.c
===================================================================
--- linux.orig/arch/x86_64/mm/init.c
+++ linux/arch/x86_64/mm/init.c
@@ -700,8 +700,6 @@ int kern_addr_valid(unsigned long addr) 
 #ifdef CONFIG_SYSCTL
 #include <linux/sysctl.h>
 
-extern int exception_trace, page_fault_trace;
-
 static ctl_table debug_table2[] = {
 	{
 		.ctl_name	= 99,

^ permalink raw reply	[flat|nested] 119+ messages in thread

* [PATCH] [37/58] x86_64: remove unused variable maxcpus
  2007-07-19  9:54 [PATCH] [0/58] First batch of x86 patches for .23 Andi Kleen
                   ` (35 preceding siblings ...)
  2007-07-19  9:55 ` [PATCH] [36/58] x86_64: minor exception trace variables cleanup Andi Kleen
@ 2007-07-19  9:55 ` Andi Kleen
  2007-07-19  9:55 ` [PATCH] [38/58] i386: smp-alt-once option is only useful with HOTPLUG_CPU Andi Kleen
                   ` (20 subsequent siblings)
  57 siblings, 0 replies; 119+ messages in thread
From: Andi Kleen @ 2007-07-19  9:55 UTC (permalink / raw)
  To: jbeulich, patches, linux-kernel


From: "Jan Beulich" <jbeulich@novell.com>
Remove the unused maxcpus variable and adjust the documentation to only
list options that are actually x86-64 specific.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Andi Kleen <ak@suse.de>

 Documentation/x86_64/boot-options.txt |    6 ------
 arch/x86_64/kernel/mpparse.c          |    1 -
 2 files changed, 7 deletions(-)

Index: linux/Documentation/x86_64/boot-options.txt
===================================================================
--- linux.orig/Documentation/x86_64/boot-options.txt
+++ linux/Documentation/x86_64/boot-options.txt
@@ -134,12 +134,6 @@ Non Executable Mappings
 
 SMP
 
-  nosmp	Only use a single CPU
-
-  maxcpus=NUMBER only use upto NUMBER CPUs
-
-  cpumask=MASK   only use cpus with bits set in mask
-
   additional_cpus=NUM Allow NUM more CPUs for hotplug
 		 (defaults are specified by the BIOS, see Documentation/x86_64/cpu-hotplug-spec)
 
Index: linux/arch/x86_64/kernel/mpparse.c
===================================================================
--- linux.orig/arch/x86_64/kernel/mpparse.c
+++ linux/arch/x86_64/kernel/mpparse.c
@@ -32,7 +32,6 @@
 
 /* Have we found an MP table */
 int smp_found_config;
-unsigned int __initdata maxcpus = NR_CPUS;
 
 /*
  * Various Linux-internal data structures created from the

^ permalink raw reply	[flat|nested] 119+ messages in thread

* [PATCH] [38/58] i386: smp-alt-once option is only useful with  HOTPLUG_CPU
  2007-07-19  9:54 [PATCH] [0/58] First batch of x86 patches for .23 Andi Kleen
                   ` (36 preceding siblings ...)
  2007-07-19  9:55 ` [PATCH] [37/58] x86_64: remove unused variable maxcpus Andi Kleen
@ 2007-07-19  9:55 ` Andi Kleen
  2007-07-19  9:55 ` [PATCH] [39/58] i386: minor nx handling adjustment Andi Kleen
                   ` (19 subsequent siblings)
  57 siblings, 0 replies; 119+ messages in thread
From: Andi Kleen @ 2007-07-19  9:55 UTC (permalink / raw)
  To: jbeulich, patches, linux-kernel


From: "Jan Beulich" <jbeulich@novell.com>
The smp-alt-once option is only useful with CONFIG_HOTPLUG_CPU, so
remove its handling in the opposite case.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Andi Kleen <ak@suse.de>

 arch/i386/kernel/alternative.c |   14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

Index: linux/arch/i386/kernel/alternative.c
===================================================================
--- linux.orig/arch/i386/kernel/alternative.c
+++ linux/arch/i386/kernel/alternative.c
@@ -5,9 +5,8 @@
 #include <asm/alternative.h>
 #include <asm/sections.h>
 
-static int noreplace_smp     = 0;
-static int smp_alt_once      = 0;
-static int debug_alternative = 0;
+#ifdef CONFIG_HOTPLUG_CPU
+static int smp_alt_once;
 
 static int __init bootonly(char *str)
 {
@@ -15,6 +14,11 @@ static int __init bootonly(char *str)
 	return 1;
 }
 __setup("smp-alt-boot", bootonly);
+#else
+#define smp_alt_once 1
+#endif
+
+static int debug_alternative;
 
 static int __init debug_alt(char *str)
 {
@@ -23,6 +27,8 @@ static int __init debug_alt(char *str)
 }
 __setup("debug-alternative", debug_alt);
 
+static int noreplace_smp;
+
 static int __init setup_noreplace_smp(char *str)
 {
 	noreplace_smp = 1;
@@ -376,8 +382,6 @@ void __init alternative_instructions(voi
 #ifdef CONFIG_HOTPLUG_CPU
 	if (num_possible_cpus() < 2)
 		smp_alt_once = 1;
-#else
-	smp_alt_once = 1;
 #endif
 
 #ifdef CONFIG_SMP

^ permalink raw reply	[flat|nested] 119+ messages in thread

* [PATCH] [39/58] i386: minor nx handling adjustment
  2007-07-19  9:54 [PATCH] [0/58] First batch of x86 patches for .23 Andi Kleen
                   ` (37 preceding siblings ...)
  2007-07-19  9:55 ` [PATCH] [38/58] i386: smp-alt-once option is only useful with HOTPLUG_CPU Andi Kleen
@ 2007-07-19  9:55 ` Andi Kleen
  2007-07-19  9:55 ` [PATCH] [40/58] i386: remapped_pgdat_init() static Andi Kleen
                   ` (18 subsequent siblings)
  57 siblings, 0 replies; 119+ messages in thread
From: Andi Kleen @ 2007-07-19  9:55 UTC (permalink / raw)
  To: jbeulich, patches, linux-kernel


From: "Jan Beulich" <jbeulich@novell.com>
Constrain __supported_pte_mask and NX handling to just the PAE kernel.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Andi Kleen <ak@suse.de>

 arch/i386/mm/init.c     |    7 ++++---
 include/asm-i386/page.h |    1 -
 2 files changed, 4 insertions(+), 4 deletions(-)

Index: linux/arch/i386/mm/init.c
===================================================================
--- linux.orig/arch/i386/mm/init.c
+++ linux/arch/i386/mm/init.c
@@ -471,6 +471,10 @@ void zap_low_mappings (void)
 	flush_tlb_all();
 }
 
+int nx_enabled = 0;
+
+#ifdef CONFIG_X86_PAE
+
 static int disable_nx __initdata = 0;
 u64 __supported_pte_mask __read_mostly = ~_PAGE_NX;
 EXPORT_SYMBOL_GPL(__supported_pte_mask);
@@ -500,9 +504,6 @@ static int __init noexec_setup(char *str
 }
 early_param("noexec", noexec_setup);
 
-int nx_enabled = 0;
-#ifdef CONFIG_X86_PAE
-
 static void __init set_nx(void)
 {
 	unsigned int v[4], l, h;
Index: linux/include/asm-i386/page.h
===================================================================
--- linux.orig/include/asm-i386/page.h
+++ linux/include/asm-i386/page.h
@@ -44,7 +44,6 @@
 extern int nx_enabled;
 
 #ifdef CONFIG_X86_PAE
-extern unsigned long long __supported_pte_mask;
 typedef struct { unsigned long pte_low, pte_high; } pte_t;
 typedef struct { unsigned long long pmd; } pmd_t;
 typedef struct { unsigned long long pgd; } pgd_t;

^ permalink raw reply	[flat|nested] 119+ messages in thread

* [PATCH] [40/58] i386: remapped_pgdat_init() static
  2007-07-19  9:54 [PATCH] [0/58] First batch of x86 patches for .23 Andi Kleen
                   ` (38 preceding siblings ...)
  2007-07-19  9:55 ` [PATCH] [39/58] i386: minor nx handling adjustment Andi Kleen
@ 2007-07-19  9:55 ` Andi Kleen
  2007-07-19  9:55 ` [PATCH] [41/58] i386: arch/i386/kernel/i8253.c should #include <asm/timer.h> Andi Kleen
                   ` (17 subsequent siblings)
  57 siblings, 0 replies; 119+ messages in thread
From: Andi Kleen @ 2007-07-19  9:55 UTC (permalink / raw)
  To: bunk, ak, patches, linux-kernel


From: Adrian Bunk <bunk@stusta.de>

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---

 arch/i386/kernel/setup.c |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

Index: linux/arch/i386/kernel/setup.c
===================================================================
--- linux.orig/arch/i386/kernel/setup.c
+++ linux/arch/i386/kernel/setup.c
@@ -466,7 +466,7 @@ void __init setup_bootmem_allocator(void
  *
  * This should all compile down to nothing when NUMA is off.
  */
-void __init remapped_pgdat_init(void)
+static void __init remapped_pgdat_init(void)
 {
 	int nid;
 

^ permalink raw reply	[flat|nested] 119+ messages in thread

* [PATCH] [41/58] i386: arch/i386/kernel/i8253.c should #include <asm/timer.h>
  2007-07-19  9:54 [PATCH] [0/58] First batch of x86 patches for .23 Andi Kleen
                   ` (39 preceding siblings ...)
  2007-07-19  9:55 ` [PATCH] [40/58] i386: remapped_pgdat_init() static Andi Kleen
@ 2007-07-19  9:55 ` Andi Kleen
  2007-07-19  9:55 ` [PATCH] [42/58] i386: timer_irq_works() static again Andi Kleen
                   ` (16 subsequent siblings)
  57 siblings, 0 replies; 119+ messages in thread
From: Andi Kleen @ 2007-07-19  9:55 UTC (permalink / raw)
  To: bunk, patches, linux-kernel


From: Adrian Bunk <bunk@stusta.de>

Every file should include the headers containing the prototypes for its
global functions.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Andi Kleen <ak@suse.de>

---

 arch/i386/kernel/i8253.c |    1 +
 1 file changed, 1 insertion(+)

Index: linux/arch/i386/kernel/i8253.c
===================================================================
--- linux.orig/arch/i386/kernel/i8253.c
+++ linux/arch/i386/kernel/i8253.c
@@ -13,6 +13,7 @@
 #include <asm/delay.h>
 #include <asm/i8253.h>
 #include <asm/io.h>
+#include <asm/timer.h>
 
 #include "io_ports.h"
 

^ permalink raw reply	[flat|nested] 119+ messages in thread

* [PATCH] [42/58] i386: timer_irq_works() static again
  2007-07-19  9:54 [PATCH] [0/58] First batch of x86 patches for .23 Andi Kleen
                   ` (40 preceding siblings ...)
  2007-07-19  9:55 ` [PATCH] [41/58] i386: arch/i386/kernel/i8253.c should #include <asm/timer.h> Andi Kleen
@ 2007-07-19  9:55 ` Andi Kleen
  2007-07-19  9:55 ` [PATCH] [43/58] x86_64: Quicklist support for x86_64 Andi Kleen
                   ` (15 subsequent siblings)
  57 siblings, 0 replies; 119+ messages in thread
From: Andi Kleen @ 2007-07-19  9:55 UTC (permalink / raw)
  To: bunk, patches, linux-kernel


From: Adrian Bunk <bunk@stusta.de>

timer_irq_works() needlessly became global.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Andi Kleen <ak@suse.de>

---

 arch/i386/kernel/io_apic.c |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

Index: linux/arch/i386/kernel/io_apic.c
===================================================================
--- linux.orig/arch/i386/kernel/io_apic.c
+++ linux/arch/i386/kernel/io_apic.c
@@ -1902,7 +1902,7 @@ __setup("no_timer_check", notimercheck);
  *	- if this function detects that timer IRQs are defunct, then we fall
  *	  back to ISA timer IRQs
  */
-int __init timer_irq_works(void)
+static int __init timer_irq_works(void)
 {
 	unsigned long t1 = jiffies;
 

^ permalink raw reply	[flat|nested] 119+ messages in thread

* [PATCH] [43/58] x86_64: Quicklist support for x86_64
  2007-07-19  9:54 [PATCH] [0/58] First batch of x86 patches for .23 Andi Kleen
                   ` (41 preceding siblings ...)
  2007-07-19  9:55 ` [PATCH] [42/58] i386: timer_irq_works() static again Andi Kleen
@ 2007-07-19  9:55 ` Andi Kleen
  2007-07-19  9:55 ` [PATCH] [44/58] x86_64: extract helper function from e820_register_active_regions Andi Kleen
                   ` (14 subsequent siblings)
  57 siblings, 0 replies; 119+ messages in thread
From: Andi Kleen @ 2007-07-19  9:55 UTC (permalink / raw)
  To: clameter, davem, ak, tony.luck, patches, linux-kernel


From: Christoph Lameter <clameter@sgi.com>

This adds caching of pgds, puds, pmds and ptes.  That way we can avoid costly
zeroing and initialization of special mappings in the pgd.

A second quicklist is useful to separate out PGD handling.  We can carry the
initialized pgds over to the next process needing them.

Also clean up the pgd_list handling to use regular list macros.  There is no
need anymore to avoid the lru field.

Move the addition/removal of pgds to/from the pgd_list into the constructor /
destructor.  That way the implementation is congruent with i386.
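
(For readers new to the quicklist API this patch builds on, here is a minimal
sketch of the call pattern.  The quicklist_* calls and the QUICK_PT index are
the ones used in the patch below; the demo_* names are invented for
illustration and the comments state the assumed semantics.)

#include <linux/quicklist.h>

/* Illustrative sketch only, not part of the patch. */
static pmd_t *demo_pmd_alloc(void)
{
	/* reuse a cached (already zeroed) page if available, else allocate one */
	return (pmd_t *)quicklist_alloc(QUICK_PT, GFP_KERNEL|__GFP_REPEAT, NULL);
}

static void demo_pmd_free(pmd_t *pmd)
{
	/* return the page to the per-CPU list instead of the page allocator */
	quicklist_free(QUICK_PT, NULL, pmd);
}

static void demo_check_pgt_cache(void)
{
	/* shrink the cache toward 25 pages, freeing at most 16 per call */
	quicklist_trim(QUICK_PT, NULL, 25, 16);
}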

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Andi Kleen <ak@suse.de>
Cc: "Luck, Tony" <tony.luck@intel.com>
Acked-by: William Lee Irwin III <wli@holomorphy.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---

 arch/x86_64/Kconfig          |    8 ++++
 arch/x86_64/kernel/process.c |    1 
 arch/x86_64/kernel/smp.c     |    2 -
 include/asm-x86_64/pgalloc.h |   73 ++++++++++++++++++++++++++++---------------
 include/asm-x86_64/pgtable.h |    1 
 5 files changed, 59 insertions(+), 26 deletions(-)

Index: linux/arch/x86_64/Kconfig
===================================================================
--- linux.orig/arch/x86_64/Kconfig
+++ linux/arch/x86_64/Kconfig
@@ -60,6 +60,14 @@ config ZONE_DMA
 	bool
 	default y
 
+config QUICKLIST
+	bool
+	default y
+
+config NR_QUICK
+	int
+	default 2
+
 config ISA
 	bool
 
Index: linux/arch/x86_64/kernel/process.c
===================================================================
--- linux.orig/arch/x86_64/kernel/process.c
+++ linux/arch/x86_64/kernel/process.c
@@ -207,6 +207,7 @@ void cpu_idle (void)
 			if (__get_cpu_var(cpu_idle_state))
 				__get_cpu_var(cpu_idle_state) = 0;
 
+			check_pgt_cache();
 			rmb();
 			idle = pm_idle;
 			if (!idle)
Index: linux/arch/x86_64/kernel/smp.c
===================================================================
--- linux.orig/arch/x86_64/kernel/smp.c
+++ linux/arch/x86_64/kernel/smp.c
@@ -241,7 +241,7 @@ void flush_tlb_mm (struct mm_struct * mm
 	}
 	if (!cpus_empty(cpu_mask))
 		flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
-
+	check_pgt_cache();
 	preempt_enable();
 }
 EXPORT_SYMBOL(flush_tlb_mm);
Index: linux/include/asm-x86_64/pgalloc.h
===================================================================
--- linux.orig/include/asm-x86_64/pgalloc.h
+++ linux/include/asm-x86_64/pgalloc.h
@@ -4,6 +4,10 @@
 #include <asm/pda.h>
 #include <linux/threads.h>
 #include <linux/mm.h>
+#include <linux/quicklist.h>
+
+#define QUICK_PGD 0	/* We preserve special mappings over free */
+#define QUICK_PT 1	/* Other page table pages that are zero on free */
 
 #define pmd_populate_kernel(mm, pmd, pte) \
 		set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(pte)))
@@ -20,23 +24,23 @@ static inline void pmd_populate(struct m
 static inline void pmd_free(pmd_t *pmd)
 {
 	BUG_ON((unsigned long)pmd & (PAGE_SIZE-1));
-	free_page((unsigned long)pmd);
+	quicklist_free(QUICK_PT, NULL, pmd);
 }
 
 static inline pmd_t *pmd_alloc_one (struct mm_struct *mm, unsigned long addr)
 {
-	return (pmd_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
+	return (pmd_t *)quicklist_alloc(QUICK_PT, GFP_KERNEL|__GFP_REPEAT, NULL);
 }
 
 static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
 {
-	return (pud_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
+	return (pud_t *)quicklist_alloc(QUICK_PT, GFP_KERNEL|__GFP_REPEAT, NULL);
 }
 
 static inline void pud_free (pud_t *pud)
 {
 	BUG_ON((unsigned long)pud & (PAGE_SIZE-1));
-	free_page((unsigned long)pud);
+	quicklist_free(QUICK_PT, NULL, pud);
 }
 
 static inline void pgd_list_add(pgd_t *pgd)
@@ -57,41 +61,57 @@ static inline void pgd_list_del(pgd_t *p
 	spin_unlock(&pgd_lock);
 }
 
-static inline pgd_t *pgd_alloc(struct mm_struct *mm)
+static inline void pgd_ctor(void *x)
 {
 	unsigned boundary;
-	pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT);
-	if (!pgd)
-		return NULL;
-	pgd_list_add(pgd);
+	pgd_t *pgd = x;
+	struct page *page = virt_to_page(pgd);
+
 	/*
 	 * Copy kernel pointers in from init.
-	 * Could keep a freelist or slab cache of those because the kernel
-	 * part never changes.
 	 */
 	boundary = pgd_index(__PAGE_OFFSET);
-	memset(pgd, 0, boundary * sizeof(pgd_t));
 	memcpy(pgd + boundary,
-	       init_level4_pgt + boundary,
-	       (PTRS_PER_PGD - boundary) * sizeof(pgd_t));
+		init_level4_pgt + boundary,
+		(PTRS_PER_PGD - boundary) * sizeof(pgd_t));
+
+	spin_lock(&pgd_lock);
+	list_add(&page->lru, &pgd_list);
+	spin_unlock(&pgd_lock);
+}
+
+static inline void pgd_dtor(void *x)
+{
+	pgd_t *pgd = x;
+	struct page *page = virt_to_page(pgd);
+
+        spin_lock(&pgd_lock);
+	list_del(&page->lru);
+	spin_unlock(&pgd_lock);
+}
+
+static inline pgd_t *pgd_alloc(struct mm_struct *mm)
+{
+	pgd_t *pgd = (pgd_t *)quicklist_alloc(QUICK_PGD,
+		GFP_KERNEL|__GFP_REPEAT, pgd_ctor);
 	return pgd;
 }
 
 static inline void pgd_free(pgd_t *pgd)
 {
 	BUG_ON((unsigned long)pgd & (PAGE_SIZE-1));
-	pgd_list_del(pgd);
-	free_page((unsigned long)pgd);
+	quicklist_free(QUICK_PGD, pgd_dtor, pgd);
 }
 
 static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
 {
-	return (pte_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
+	return (pte_t *)quicklist_alloc(QUICK_PT, GFP_KERNEL|__GFP_REPEAT, NULL);
 }
 
 static inline struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
 {
-	void *p = (void *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
+	void *p = (void *)quicklist_alloc(QUICK_PT, GFP_KERNEL|__GFP_REPEAT, NULL);
+
 	if (!p)
 		return NULL;
 	return virt_to_page(p);
@@ -103,17 +123,22 @@ static inline struct page *pte_alloc_one
 static inline void pte_free_kernel(pte_t *pte)
 {
 	BUG_ON((unsigned long)pte & (PAGE_SIZE-1));
-	free_page((unsigned long)pte); 
+	quicklist_free(QUICK_PT, NULL, pte);
 }
 
 static inline void pte_free(struct page *pte)
 {
-	__free_page(pte);
-} 
+	quicklist_free_page(QUICK_PT, NULL, pte);
+}
 
-#define __pte_free_tlb(tlb,pte) tlb_remove_page((tlb),(pte))
+#define __pte_free_tlb(tlb,pte) quicklist_free_page(QUICK_PT, NULL,(pte))
 
-#define __pmd_free_tlb(tlb,x)   tlb_remove_page((tlb),virt_to_page(x))
-#define __pud_free_tlb(tlb,x)   tlb_remove_page((tlb),virt_to_page(x))
+#define __pmd_free_tlb(tlb,x)   quicklist_free(QUICK_PT, NULL, (x))
+#define __pud_free_tlb(tlb,x)   quicklist_free(QUICK_PT, NULL, (x))
 
+static inline void check_pgt_cache(void)
+{
+	quicklist_trim(QUICK_PGD, pgd_dtor, 25, 16);
+	quicklist_trim(QUICK_PT, NULL, 25, 16);
+}
 #endif /* _X86_64_PGALLOC_H */
Index: linux/include/asm-x86_64/pgtable.h
===================================================================
--- linux.orig/include/asm-x86_64/pgtable.h
+++ linux/include/asm-x86_64/pgtable.h
@@ -409,7 +409,6 @@ extern int kern_addr_valid(unsigned long
 #define HAVE_ARCH_UNMAPPED_AREA
 
 #define pgtable_cache_init()   do { } while (0)
-#define check_pgt_cache()      do { } while (0)
 
 #define PAGE_AGP    PAGE_KERNEL_NOCACHE
 #define HAVE_PAGE_AGP 1

^ permalink raw reply	[flat|nested] 119+ messages in thread

* [PATCH] [44/58] x86_64: extract helper function from e820_register_active_regions
  2007-07-19  9:54 [PATCH] [0/58] First batch of x86 patches for .23 Andi Kleen
                   ` (42 preceding siblings ...)
  2007-07-19  9:55 ` [PATCH] [43/58] x86_64: Quicklist support for x86_64 Andi Kleen
@ 2007-07-19  9:55 ` Andi Kleen
  2007-07-19  9:55 ` [PATCH] [45/58] x86_64: fake pxm-to-node mapping for fake numa Andi Kleen
                   ` (13 subsequent siblings)
  57 siblings, 0 replies; 119+ messages in thread
From: Andi Kleen @ 2007-07-19  9:55 UTC (permalink / raw)
  To: rientjes, mel, ak, patches, linux-kernel


From: David Rientjes <rientjes@google.com>

The logic in e820_register_active_regions() for determining the true active
regions for an e820 entry given a range of PFNs is needed for
e820_hole_size() as well.

e820_hole_size() is called from the NUMA emulation code to determine the
reserved area within an address range on a per-node basis.  Its logic should
duplicate that of finding active regions in an e820 entry because these are
the only true ranges we may register anyway.
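
(A rough sketch, not part of this patch, of how e820_hole_size() could reuse
the new helper; the follow-up rework is only hinted at here, so treat the body
below as illustrative rather than the final code.)

/* Illustrative: sum the RAM the helper reports for each e820 entry in
   [start, end) and report everything else as hole. */
unsigned long __init e820_hole_size(unsigned long start, unsigned long end)
{
	unsigned long start_pfn = start >> PAGE_SHIFT;
	unsigned long end_pfn = end >> PAGE_SHIFT;
	unsigned long ei_startpfn, ei_endpfn, ram = 0;
	int i;

	for (i = 0; i < e820.nr_map; i++) {
		if (e820_find_active_region(&e820.map[i],
					    start_pfn, end_pfn,
					    &ei_startpfn, &ei_endpfn))
			ram += ei_endpfn - ei_startpfn;
	}
	return end - start - (ram << PAGE_SHIFT);
}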

[akpm@linux-foundation.org: cleanup]
Cc: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: David Rientjes <rientjes@google.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---

 arch/x86_64/kernel/e820.c |   82 ++++++++++++++++++++++++++--------------------
 1 file changed, 48 insertions(+), 34 deletions(-)

Index: linux/arch/x86_64/kernel/e820.c
===================================================================
--- linux.orig/arch/x86_64/kernel/e820.c
+++ linux/arch/x86_64/kernel/e820.c
@@ -289,47 +289,61 @@ void __init e820_mark_nosave_regions(voi
 	}
 }
 
+/*
+ * Finds an active region in the address range from start_pfn to end_pfn and
+ * returns its range in ei_startpfn and ei_endpfn for the e820 entry.
+ */
+static int __init e820_find_active_region(const struct e820entry *ei,
+					  unsigned long start_pfn,
+					  unsigned long end_pfn,
+					  unsigned long *ei_startpfn,
+					  unsigned long *ei_endpfn)
+{
+	*ei_startpfn = round_up(ei->addr, PAGE_SIZE) >> PAGE_SHIFT;
+	*ei_endpfn = round_down(ei->addr + ei->size, PAGE_SIZE) >> PAGE_SHIFT;
+
+	/* Skip map entries smaller than a page */
+	if (*ei_startpfn >= *ei_endpfn)
+		return 0;
+
+	/* Check if end_pfn_map should be updated */
+	if (ei->type != E820_RAM && *ei_endpfn > end_pfn_map)
+		end_pfn_map = *ei_endpfn;
+
+	/* Skip if map is outside the node */
+	if (ei->type != E820_RAM || *ei_endpfn <= start_pfn ||
+				    *ei_startpfn >= end_pfn)
+		return 0;
+
+	/* Check for overlaps */
+	if (*ei_startpfn < start_pfn)
+		*ei_startpfn = start_pfn;
+	if (*ei_endpfn > end_pfn)
+		*ei_endpfn = end_pfn;
+
+	/* Obey end_user_pfn to save on memmap */
+	if (*ei_startpfn >= end_user_pfn)
+		return 0;
+	if (*ei_endpfn > end_user_pfn)
+		*ei_endpfn = end_user_pfn;
+
+	return 1;
+}
+
 /* Walk the e820 map and register active regions within a node */
 void __init
 e820_register_active_regions(int nid, unsigned long start_pfn,
 							unsigned long end_pfn)
 {
+	unsigned long ei_startpfn;
+	unsigned long ei_endpfn;
 	int i;
-	unsigned long ei_startpfn, ei_endpfn;
-	for (i = 0; i < e820.nr_map; i++) {
-		struct e820entry *ei = &e820.map[i];
-		ei_startpfn = round_up(ei->addr, PAGE_SIZE) >> PAGE_SHIFT;
-		ei_endpfn = round_down(ei->addr + ei->size, PAGE_SIZE)
-								>> PAGE_SHIFT;
-
-		/* Skip map entries smaller than a page */
-		if (ei_startpfn >= ei_endpfn)
-			continue;
-
-		/* Check if end_pfn_map should be updated */
-		if (ei->type != E820_RAM && ei_endpfn > end_pfn_map)
-			end_pfn_map = ei_endpfn;
-
-		/* Skip if map is outside the node */
-		if (ei->type != E820_RAM ||
-				ei_endpfn <= start_pfn ||
-				ei_startpfn >= end_pfn)
-			continue;
-
-		/* Check for overlaps */
-		if (ei_startpfn < start_pfn)
-			ei_startpfn = start_pfn;
-		if (ei_endpfn > end_pfn)
-			ei_endpfn = end_pfn;
-
-		/* Obey end_user_pfn to save on memmap */
-		if (ei_startpfn >= end_user_pfn)
-			continue;
-		if (ei_endpfn > end_user_pfn)
-			ei_endpfn = end_user_pfn;
 
-		add_active_range(nid, ei_startpfn, ei_endpfn);
-	}
+	for (i = 0; i < e820.nr_map; i++)
+		if (e820_find_active_region(&e820.map[i],
+					    start_pfn, end_pfn,
+					    &ei_startpfn, &ei_endpfn))
+			add_active_range(nid, ei_startpfn, ei_endpfn);
 }
 
 /* 

^ permalink raw reply	[flat|nested] 119+ messages in thread

* [PATCH] [45/58] x86_64: fake pxm-to-node mapping for fake numa
  2007-07-19  9:54 [PATCH] [0/58] First batch of x86 patches for .23 Andi Kleen
                   ` (43 preceding siblings ...)
  2007-07-19  9:55 ` [PATCH] [44/58] x86_64: extract helper function from e820_register_active_regions Andi Kleen
@ 2007-07-19  9:55 ` Andi Kleen
  2007-07-19  9:55 ` [PATCH] [46/58] x86_64: fake apicid_to_node " Andi Kleen
                   ` (12 subsequent siblings)
  57 siblings, 0 replies; 119+ messages in thread
From: Andi Kleen @ 2007-07-19  9:55 UTC (permalink / raw)
  To: rientjes, ak, lenb, patches, linux-kernel


From: David Rientjes <rientjes@google.com>

For NUMA emulation, our SLIT should represent the true NUMA topology of the
system but our proximity domain to node ID mapping needs to reflect the
emulated state.

When NUMA emulation has successfully setup fake nodes on the system, a new
function, acpi_fake_nodes() is called.  This function determines the proximity
domain (_PXM) for each true node found on the system.  It then finds which
emulated nodes have been allocated on this true node as determined by its
starting address.  The node ID to PXM mapping is changed so that each fake
node ID points to the PXM of the true node that it is located on.

If the machine failed to register a SLIT, then we assume there is no special
requirement for emulated node affinity, so we use the default LOCAL_DISTANCE,
which is newly exported to this code, as the distance when the emulated nodes
appear in the same PXM.  Otherwise, we use REMOTE_DISTANCE.

PXM_INVAL and NID_INVAL are also exported to the ACPI header file so that we
can compare node_to_pxm() results in generic code (in this case, the SRAT
code).

Cc: Andi Kleen <ak@suse.de>
Cc: Len Brown <lenb@kernel.org>
Signed-off-by: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Andi Kleen <ak@suse.de>

---

 arch/x86_64/mm/numa.c     |    1 
 arch/x86_64/mm/srat.c     |   76 ++++++++++++++++++++++++++++++++++++++++++++--
 drivers/acpi/numa.c       |   11 ++++--
 include/acpi/acpi_numa.h  |    1 
 include/asm-x86_64/acpi.h |   11 ++++++
 include/linux/acpi.h      |    3 +
 6 files changed, 96 insertions(+), 7 deletions(-)

Index: linux/arch/x86_64/mm/numa.c
===================================================================
--- linux.orig/arch/x86_64/mm/numa.c
+++ linux/arch/x86_64/mm/numa.c
@@ -484,6 +484,7 @@ out:
 						nodes[i].end >> PAGE_SHIFT);
  		setup_node_bootmem(i, nodes[i].start, nodes[i].end);
 	}
+	acpi_fake_nodes(nodes, num_nodes);
  	numa_init_array();
  	return 0;
 }
Index: linux/arch/x86_64/mm/srat.c
===================================================================
--- linux.orig/arch/x86_64/mm/srat.c
+++ linux/arch/x86_64/mm/srat.c
@@ -350,7 +350,7 @@ acpi_numa_memory_affinity_init(struct ac
 
 /* Sanity check to catch more bad SRATs (they are amazingly common).
    Make sure the PXMs cover all memory. */
-static int nodes_cover_memory(void)
+static int __init nodes_cover_memory(const struct bootnode *nodes)
 {
 	int i;
 	unsigned long pxmram, e820ram;
@@ -406,7 +406,7 @@ int __init acpi_scan_nodes(unsigned long
 		}
 	}
 
-	if (!nodes_cover_memory()) {
+	if (!nodes_cover_memory(nodes)) {
 		bad_srat();
 		return -1;
 	}
@@ -440,6 +440,75 @@ int __init acpi_scan_nodes(unsigned long
 	return 0;
 }
 
+#ifdef CONFIG_NUMA_EMU
+static int __init find_node_by_addr(unsigned long addr)
+{
+	int ret = NUMA_NO_NODE;
+	int i;
+
+	for_each_node_mask(i, nodes_parsed) {
+		/*
+		 * Find the real node that this emulated node appears on.  For
+		 * the sake of simplicity, we only use a real node's starting
+		 * address to determine which emulated node it appears on.
+		 */
+		if (addr >= nodes[i].start && addr < nodes[i].end) {
+			ret = i;
+			break;
+		}
+	}
+	return ret;
+}
+
+/*
+ * In NUMA emulation, we need to setup proximity domain (_PXM) to node ID
+ * mappings that respect the real ACPI topology but reflect our emulated
+ * environment.  For each emulated node, we find which real node it appears on
+ * and create PXM to NID mappings for those fake nodes which mirror that
+ * locality.  SLIT will now represent the correct distances between emulated
+ * nodes as a result of the real topology.
+ */
+void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes)
+{
+	int i;
+	int fake_node_to_pxm_map[MAX_NUMNODES] = {
+		[0 ... MAX_NUMNODES-1] = PXM_INVAL
+	};
+
+	printk(KERN_INFO "Faking PXM affinity for fake nodes on real "
+			 "topology.\n");
+	for (i = 0; i < num_nodes; i++) {
+		int nid, pxm;
+
+		nid = find_node_by_addr(fake_nodes[i].start);
+		if (nid == NUMA_NO_NODE)
+			continue;
+		pxm = node_to_pxm(nid);
+		if (pxm == PXM_INVAL)
+			continue;
+		fake_node_to_pxm_map[i] = pxm;
+	}
+	for (i = 0; i < num_nodes; i++)
+		__acpi_map_pxm_to_node(fake_node_to_pxm_map[i], i);
+
+	nodes_clear(nodes_parsed);
+	for (i = 0; i < num_nodes; i++)
+		if (fake_nodes[i].start != fake_nodes[i].end)
+			node_set(i, nodes_parsed);
+	WARN_ON(!nodes_cover_memory(fake_nodes));
+}
+
+static int null_slit_node_compare(int a, int b)
+{
+	return node_to_pxm(a) == node_to_pxm(b);
+}
+#else
+static int null_slit_node_compare(int a, int b)
+{
+	return a == b;
+}
+#endif /* CONFIG_NUMA_EMU */
+
 void __init srat_reserve_add_area(int nodeid)
 {
 	if (found_add_area && nodes_add[nodeid].end) {
@@ -464,7 +533,8 @@ int __node_distance(int a, int b)
 	int index;
 
 	if (!acpi_slit)
-		return a == b ? LOCAL_DISTANCE : REMOTE_DISTANCE;
+		return null_slit_node_compare(a, b) ? LOCAL_DISTANCE :
+						      REMOTE_DISTANCE;
 	index = acpi_slit->locality_count * node_to_pxm(a);
 	return acpi_slit->entry[index + node_to_pxm(b)];
 }
Index: linux/drivers/acpi/numa.c
===================================================================
--- linux.orig/drivers/acpi/numa.c
+++ linux/drivers/acpi/numa.c
@@ -36,8 +36,6 @@
 ACPI_MODULE_NAME("numa");
 
 static nodemask_t nodes_found_map = NODE_MASK_NONE;
-#define PXM_INVAL	-1
-#define NID_INVAL	-1
 
 /* maps to convert between proximity domain and logical node ID */
 static int __cpuinitdata pxm_to_node_map[MAX_PXM_DOMAINS]
@@ -59,6 +57,12 @@ int node_to_pxm(int node)
 	return node_to_pxm_map[node];
 }
 
+void __acpi_map_pxm_to_node(int pxm, int node)
+{
+	pxm_to_node_map[pxm] = node;
+	node_to_pxm_map[node] = pxm;
+}
+
 int acpi_map_pxm_to_node(int pxm)
 {
 	int node = pxm_to_node_map[pxm];
@@ -67,8 +71,7 @@ int acpi_map_pxm_to_node(int pxm)
 		if (nodes_weight(nodes_found_map) >= MAX_NUMNODES)
 			return NID_INVAL;
 		node = first_unset_node(nodes_found_map);
-		pxm_to_node_map[pxm] = node;
-		node_to_pxm_map[node] = pxm;
+		__acpi_map_pxm_to_node(pxm, node);
 		node_set(node, nodes_found_map);
 	}
 
Index: linux/include/acpi/acpi_numa.h
===================================================================
--- linux.orig/include/acpi/acpi_numa.h
+++ linux/include/acpi/acpi_numa.h
@@ -13,6 +13,7 @@
 
 extern int pxm_to_node(int);
 extern int node_to_pxm(int);
+extern void __acpi_map_pxm_to_node(int, int);
 extern int acpi_map_pxm_to_node(int);
 extern void __cpuinit acpi_unmap_pxm_to_node(int);
 
Index: linux/include/asm-x86_64/acpi.h
===================================================================
--- linux.orig/include/asm-x86_64/acpi.h
+++ linux/include/asm-x86_64/acpi.h
@@ -29,6 +29,7 @@
 #ifdef __KERNEL__
 
 #include <acpi/pdc_intel.h>
+#include <asm/numa.h>
 
 #define COMPILER_DEPENDENT_INT64   long long
 #define COMPILER_DEPENDENT_UINT64  unsigned long long
@@ -141,6 +142,16 @@ extern int acpi_pci_disabled;
 extern int acpi_skip_timer_override;
 extern int acpi_use_timer_override;
 
+#ifdef CONFIG_ACPI_NUMA
+extern void __init acpi_fake_nodes(const struct bootnode *fake_nodes,
+				   int num_nodes);
+#else
+static inline void acpi_fake_nodes(const struct bootnode *fake_nodes,
+				   int num_nodes)
+{
+}
+#endif
+
 #endif /*__KERNEL__*/
 
 #endif /*_ASM_ACPI_H*/
Index: linux/include/linux/acpi.h
===================================================================
--- linux.orig/include/linux/acpi.h
+++ linux/include/linux/acpi.h
@@ -231,6 +231,9 @@ extern int acpi_paddr_to_node(u64 start_
 
 extern int pnpacpi_disabled;
 
+#define PXM_INVAL	(-1)
+#define NID_INVAL	(-1)
+
 #else	/* CONFIG_ACPI */
 
 static inline int acpi_boot_init(void)

^ permalink raw reply	[flat|nested] 119+ messages in thread

* [PATCH] [46/58] x86_64: fake apicid_to_node mapping for fake numa
  2007-07-19  9:54 [PATCH] [0/58] First batch of x86 patches for .23 Andi Kleen
                   ` (44 preceding siblings ...)
  2007-07-19  9:55 ` [PATCH] [45/58] x86_64: fake pxm-to-node mapping for fake numa Andi Kleen
@ 2007-07-19  9:55 ` Andi Kleen
  2007-07-19  9:55 ` [PATCH] [47/58] i386: insert unclaimed MMCONFIG resources Andi Kleen
                   ` (11 subsequent siblings)
  57 siblings, 0 replies; 119+ messages in thread
From: Andi Kleen @ 2007-07-19  9:55 UTC (permalink / raw)
  To: rientjes, ak, patches, linux-kernel


From: David Rientjes <rientjes@google.com>

When we are in the emulated NUMA case, we need to make sure that all existing
apicid_to_node mappings that point to real node IDs now point to the
equivalent fake node IDs.

If we simply iterate over all apicid_to_node[] members for each node, we risk
remapping an entry if it shares a node ID with a real node.  Since apicids
may not be consecutive, we're forced to create an automatic array of
apicid_to_node mappings and then copy it over once we have finished remapping
fake to real nodes.

Cc: Andi Kleen <ak@suse.de>
Signed-off-by: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Andi Kleen <ak@suse.de>

---

 arch/x86_64/mm/srat.c |   13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

Index: linux/arch/x86_64/mm/srat.c
===================================================================
--- linux.orig/arch/x86_64/mm/srat.c
+++ linux/arch/x86_64/mm/srat.c
@@ -470,10 +470,13 @@ static int __init find_node_by_addr(unsi
  */
 void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes)
 {
-	int i;
+	int i, j;
 	int fake_node_to_pxm_map[MAX_NUMNODES] = {
 		[0 ... MAX_NUMNODES-1] = PXM_INVAL
 	};
+	unsigned char fake_apicid_to_node[MAX_LOCAL_APIC] = {
+		[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
+	};
 
 	printk(KERN_INFO "Faking PXM affinity for fake nodes on real "
 			 "topology.\n");
@@ -487,9 +490,17 @@ void __init acpi_fake_nodes(const struct
 		if (pxm == PXM_INVAL)
 			continue;
 		fake_node_to_pxm_map[i] = pxm;
+		/*
+		 * For each apicid_to_node mapping that exists for this real
+		 * node, it must now point to the fake node ID.
+		 */
+		for (j = 0; j < MAX_LOCAL_APIC; j++)
+			if (apicid_to_node[j] == nid)
+				fake_apicid_to_node[j] = i;
 	}
 	for (i = 0; i < num_nodes; i++)
 		__acpi_map_pxm_to_node(fake_node_to_pxm_map[i], i);
+	memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node));
 
 	nodes_clear(nodes_parsed);
 	for (i = 0; i < num_nodes; i++)

^ permalink raw reply	[flat|nested] 119+ messages in thread

* [PATCH] [47/58] i386: insert unclaimed MMCONFIG resources
  2007-07-19  9:54 [PATCH] [0/58] First batch of x86 patches for .23 Andi Kleen
                   ` (45 preceding siblings ...)
  2007-07-19  9:55 ` [PATCH] [46/58] x86_64: fake apicid_to_node " Andi Kleen
@ 2007-07-19  9:55 ` Andi Kleen
  2007-07-19  9:55 ` [PATCH] [48/58] x86_64: O_EXCL on /dev/mcelog Andi Kleen
                   ` (10 subsequent siblings)
  57 siblings, 0 replies; 119+ messages in thread
From: Andi Kleen @ 2007-07-19  9:55 UTC (permalink / raw)
  To: adurbin, ak, rientjes, patches, linux-kernel


From: Aaron Durbin <adurbin@google.com>

Insert the unclaimed MMCONFIG resources into the resource tree without the
IORESOURCE_BUSY flag during late initialization.  This allows the MMCONFIG
regions to be visible in the iomem resource tree without interfering with
other system resources that were discovered during PCI initialization.

[akpm@linux-foundation.org: nanofixes]
Signed-off-by: Aaron Durbin <adurbin@google.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Andi Kleen <ak@suse.de>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---

 arch/i386/pci/mmconfig-shared.c |   48 +++++++++++++++++++++++++++++++++++++---
 1 file changed, 45 insertions(+), 3 deletions(-)

Index: linux/arch/i386/pci/mmconfig-shared.c
===================================================================
--- linux.orig/arch/i386/pci/mmconfig-shared.c
+++ linux/arch/i386/pci/mmconfig-shared.c
@@ -24,6 +24,9 @@
 
 DECLARE_BITMAP(pci_mmcfg_fallback_slots, 32*PCI_MMCFG_MAX_CHECK_BUS);
 
+/* Indicate if the mmcfg resources have been placed into the resource table. */
+static int __initdata pci_mmcfg_resources_inserted;
+
 /* K8 systems have some devices (typically in the builtin northbridge)
    that are only accessible using type1
    Normally this can be expressed in the MCFG by not listing them
@@ -170,7 +173,7 @@ static int __init pci_mmcfg_check_hostbr
 	return name != NULL;
 }
 
-static void __init pci_mmcfg_insert_resources(void)
+static void __init pci_mmcfg_insert_resources(unsigned long resource_flags)
 {
 #define PCI_MMCFG_RESOURCE_NAME_LEN 19
 	int i;
@@ -194,10 +197,13 @@ static void __init pci_mmcfg_insert_reso
 			 cfg->pci_segment);
 		res->start = cfg->address;
 		res->end = res->start + (num_buses << 20) - 1;
-		res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+		res->flags = IORESOURCE_MEM | resource_flags;
 		insert_resource(&iomem_resource, res);
 		names += PCI_MMCFG_RESOURCE_NAME_LEN;
 	}
+
+	/* Mark that the resources have been inserted. */
+	pci_mmcfg_resources_inserted = 1;
 }
 
 static void __init pci_mmcfg_reject_broken(int type)
@@ -267,7 +273,43 @@ void __init pci_mmcfg_init(int type)
 		if (type == 1)
 			unreachable_devices();
 		if (known_bridge)
-			pci_mmcfg_insert_resources();
+			pci_mmcfg_insert_resources(IORESOURCE_BUSY);
 		pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF;
+	} else {
+		/*
+		 * Signal not to attempt to insert mmcfg resources because
+		 * the architecture mmcfg setup could not initialize.
+		 */
+		pci_mmcfg_resources_inserted = 1;
 	}
 }
+
+static int __init pci_mmcfg_late_insert_resources(void)
+{
+	/*
+	 * If resources are already inserted or we are not using MMCONFIG,
+	 * don't insert the resources.
+	 */
+	if ((pci_mmcfg_resources_inserted == 1) ||
+	    (pci_probe & PCI_PROBE_MMCONF) == 0 ||
+	    (pci_mmcfg_config_num == 0) ||
+	    (pci_mmcfg_config == NULL) ||
+	    (pci_mmcfg_config[0].address == 0))
+		return 1;
+
+	/*
+	 * Attempt to insert the mmcfg resources but not with the busy flag
+	 * marked so it won't cause request errors when __request_region is
+	 * called.
+	 */
+	pci_mmcfg_insert_resources(0);
+
+	return 0;
+}
+
+/*
+ * Perform MMCONFIG resource insertion after PCI initialization to allow for
+ * misprogrammed MCFG tables that state larger sizes but actually conflict
+ * with other system resources.
+ */
+late_initcall(pci_mmcfg_late_insert_resources);

^ permalink raw reply	[flat|nested] 119+ messages in thread

* [PATCH] [48/58] x86_64: O_EXCL on /dev/mcelog
  2007-07-19  9:54 [PATCH] [0/58] First batch of x86 patches for .23 Andi Kleen
                   ` (46 preceding siblings ...)
  2007-07-19  9:55 ` [PATCH] [47/58] i386: insert unclaimed MMCONFIG resources Andi Kleen
@ 2007-07-19  9:55 ` Andi Kleen
  2007-07-19  9:55 ` [PATCH] [49/58] x86_64: support poll() " Andi Kleen
                   ` (9 subsequent siblings)
  57 siblings, 0 replies; 119+ messages in thread
From: Andi Kleen @ 2007-07-19  9:55 UTC (permalink / raw)
  To: thockin, ak, patches, linux-kernel


From: Tim Hockin <thockin@google.com>

Background:
 /dev/mcelog is a clear-on-read interface.  It is currently possible for
 multiple users to open and read() the device.  Users are protected from
 each other during any one read, but not across reads.

Description:
 This patch adds support for O_EXCL to /dev/mcelog.  If a user opens the
 device with O_EXCL, no other user may open the device (EBUSY).  Likewise,
 any user that tries to open the device with O_EXCL while another user has
 the device will fail (EBUSY).

Result:
 Applications can get exclusive access to /dev/mcelog.  Applications that
 do not care will be unchanged.

Alternatives:
 A simpler choice would be to only allow one open() at all, regardless of
 O_EXCL.

Testing:
 I wrote an application that opens /dev/mcelog with O_EXCL and observed
 that any other app that tried to open /dev/mcelog would fail until the
 exclusive app had closed the device.

Caveats:
 None.
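
(Illustrative only, not part of the patch: a minimal user-space program
exercising the new open() semantics described above.)

#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	/* grab /dev/mcelog exclusively; any other opener now gets EBUSY */
	int fd = open("/dev/mcelog", O_RDONLY | O_EXCL);

	if (fd < 0) {
		fprintf(stderr, "open /dev/mcelog: %s\n", strerror(errno));
		return 1;
	}
	pause();		/* hold the device open */
	close(fd);
	return 0;
}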

Signed-off-by: Tim Hockin <thockin@google.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---

 arch/x86_64/kernel/mce.c |   36 ++++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)

Index: linux/arch/x86_64/kernel/mce.c
===================================================================
--- linux.orig/arch/x86_64/kernel/mce.c
+++ linux/arch/x86_64/kernel/mce.c
@@ -465,6 +465,40 @@ void __cpuinit mcheck_init(struct cpuinf
  * Character device to read and clear the MCE log.
  */
 
+static DEFINE_SPINLOCK(mce_state_lock);
+static int open_count;	/* #times opened */
+static int open_exclu;	/* already open exclusive? */
+
+static int mce_open(struct inode *inode, struct file *file)
+{
+	spin_lock(&mce_state_lock);
+
+	if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
+		spin_unlock(&mce_state_lock);
+		return -EBUSY;
+	}
+
+	if (file->f_flags & O_EXCL)
+		open_exclu = 1;
+	open_count++;
+
+	spin_unlock(&mce_state_lock);
+
+	return 0;
+}
+
+static int mce_release(struct inode *inode, struct file *file)
+{
+	spin_lock(&mce_state_lock);
+
+	open_count--;
+	open_exclu = 0;
+
+	spin_unlock(&mce_state_lock);
+
+	return 0;
+}
+
 static void collect_tscs(void *data) 
 { 
 	unsigned long *cpu_tsc = (unsigned long *)data;
@@ -555,6 +589,8 @@ static int mce_ioctl(struct inode *i, st
 }
 
 static const struct file_operations mce_chrdev_ops = {
+	.open = mce_open,
+	.release = mce_release,
 	.read = mce_read,
 	.ioctl = mce_ioctl,
 };

^ permalink raw reply	[flat|nested] 119+ messages in thread

* [PATCH] [49/58] x86_64: support poll() on /dev/mcelog
  2007-07-19  9:54 [PATCH] [0/58] First batch of x86 patches for .23 Andi Kleen
                   ` (47 preceding siblings ...)
  2007-07-19  9:55 ` [PATCH] [48/58] x86_64: O_EXCL on /dev/mcelog Andi Kleen
@ 2007-07-19  9:55 ` Andi Kleen
  2007-07-19  9:55 ` [PATCH] [50/58] x86_64: mcelog tolerant level cleanup Andi Kleen
                   ` (8 subsequent siblings)
  57 siblings, 0 replies; 119+ messages in thread
From: Andi Kleen @ 2007-07-19  9:55 UTC (permalink / raw)
  To: thockin, ak, patches, linux-kernel


From: Tim Hockin <thockin@google.com>

Background:
 /dev/mcelog is typically polled manually.  This is less than optimal for
 situations where accurate accounting of MCEs is important.  Calling
 poll() on /dev/mcelog does not work.

Description:
 This patch adds support for poll() to /dev/mcelog.  This results in
 immediate wakeup of user apps whenever the poller finds MCEs.  Because
 the exception handler can not take any locks, it can not call the wakeup
 itself.  Instead, it uses a thread_info flag (TIF_MCE_NOTIFY) which is
 caught at the next return from interrupt or exit from idle, calling the
 mce_user_notify() routine.  This patch also disables the "fake panic"
 path of the mce_panic(), because it results in printk()s in the exception
 handler and crashy systems.

 This patch also does some small cleanup for essentially unused variables,
 and moves the user notification into the body of the poller, so it is
 only called once per poll, rather than once per CPU.

Result:
 Applications can now poll() on /dev/mcelog.  When an error is logged
 (whether through the poller or through an exception) the applications are
 woken up promptly.  This should not affect any previous behaviors.  If no
 MCEs are being logged, there is no overhead.

Alternatives:
 I considered simply supporting poll() through the poller and not using
 TIF_MCE_NOTIFY at all.  However, the time between an uncorrectable error
 happening and the user application being notified is *the* most critical
 window for us.  Many uncorrectable errors can be logged to the network if
 given a chance.

 I also considered doing the MCE poll directly from the idle notifier, but
 decided that was overkill.

Testing:
 I used an error-injecting DIMM to create lots of correctable DRAM errors
 and verified that my user app is woken up in sync with the polling interval.
 I also used the northbridge to inject uncorrectable ECC errors, and
 verified (printk() to the rescue) that the notify routine is called and the
 user app does wake up.  I built with PREEMPT on and off, and verified
 that my machine survives MCEs.
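
(Illustrative only, not part of the patch: a sketch of the user-space side,
blocking in poll() instead of waking on a timer.  A real consumer would size
its read buffer from the driver's record-length ioctl and check errors; both
are omitted here.)

#include <poll.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	char buf[32 * 1024];	/* comfortably larger than the kernel log buffer */
	struct pollfd pfd = { .events = POLLIN };

	pfd.fd = open("/dev/mcelog", O_RDONLY);
	for (;;) {
		if (poll(&pfd, 1, -1) > 0 && (pfd.revents & POLLIN))
			read(pfd.fd, buf, sizeof(buf));	/* drains and clears the log */
	}
}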

[wli@holomorphy.com: build fix]
Signed-off-by: Tim Hockin <thockin@google.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: William Irwin <bill.irwin@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---

 arch/x86_64/kernel/entry.S       |    6 +-
 arch/x86_64/kernel/mce.c         |  105 +++++++++++++++++++++++++--------------
 arch/x86_64/kernel/signal.c      |    7 ++
 include/asm-x86_64/mce.h         |    2 
 include/asm-x86_64/thread_info.h |    2 
 5 files changed, 82 insertions(+), 40 deletions(-)

Index: linux/arch/x86_64/kernel/entry.S
===================================================================
--- linux.orig/arch/x86_64/kernel/entry.S
+++ linux/arch/x86_64/kernel/entry.S
@@ -282,7 +282,7 @@ sysret_careful:
 sysret_signal:
 	TRACE_IRQS_ON
 	sti
-	testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx
+	testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx
 	jz    1f
 
 	/* Really a signal */
@@ -375,7 +375,7 @@ int_very_careful:
 	jmp int_restore_rest
 	
 int_signal:
-	testl $(_TIF_NOTIFY_RESUME|_TIF_SIGPENDING|_TIF_SINGLESTEP),%edx
+	testl $(_TIF_NOTIFY_RESUME|_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx
 	jz 1f
 	movq %rsp,%rdi		# &ptregs -> arg1
 	xorl %esi,%esi		# oldset -> arg2
@@ -599,7 +599,7 @@ retint_careful:
 	jmp retint_check
 	
 retint_signal:
-	testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx
+	testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx
 	jz    retint_swapgs
 	TRACE_IRQS_ON
 	sti
Index: linux/arch/x86_64/kernel/mce.c
===================================================================
--- linux.orig/arch/x86_64/kernel/mce.c
+++ linux/arch/x86_64/kernel/mce.c
@@ -18,6 +18,8 @@
 #include <linux/capability.h>
 #include <linux/cpu.h>
 #include <linux/percpu.h>
+#include <linux/poll.h>
+#include <linux/thread_info.h>
 #include <linux/ctype.h>
 #include <linux/kmod.h>
 #include <linux/kdebug.h>
@@ -26,6 +28,7 @@
 #include <asm/mce.h>
 #include <asm/uaccess.h>
 #include <asm/smp.h>
+#include <asm/idle.h>
 
 #define MISC_MCELOG_MINOR 227
 #define NR_BANKS 6
@@ -39,8 +42,7 @@ static int mce_dont_init;
 static int tolerant = 1;
 static int banks;
 static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL };
-static unsigned long console_logged;
-static int notify_user;
+static unsigned long notify_user;
 static int rip_msr;
 static int mce_bootlog = 1;
 static atomic_t mce_events;
@@ -48,6 +50,8 @@ static atomic_t mce_events;
 static char trigger[128];
 static char *trigger_argv[2] = { trigger, NULL };
 
+static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
+
 /*
  * Lockless MCE logging infrastructure.
  * This avoids deadlocks on printk locks without having to break locks. Also
@@ -94,8 +98,7 @@ void mce_log(struct mce *mce)
 	mcelog.entry[entry].finished = 1;
 	wmb();
 
-	if (!test_and_set_bit(0, &console_logged))
-		notify_user = 1;
+	set_bit(0, &notify_user);
 }
 
 static void print_mce(struct mce *m)
@@ -128,6 +131,10 @@ static void print_mce(struct mce *m)
 static void mce_panic(char *msg, struct mce *backup, unsigned long start)
 { 
 	int i;
+
+	if (tolerant >= 3)
+		return;
+
 	oops_begin();
 	for (i = 0; i < MCE_LOG_LEN; i++) {
 		unsigned long tsc = mcelog.entry[i].tsc;
@@ -139,10 +146,7 @@ static void mce_panic(char *msg, struct 
 	}
 	if (backup)
 		print_mce(backup);
-	if (tolerant >= 3)
-		printk("Fake panic: %s\n", msg);
-	else
-		panic(msg);
+	panic(msg);
 } 
 
 static int mce_available(struct cpuinfo_x86 *c)
@@ -167,17 +171,6 @@ static inline void mce_get_rip(struct mc
 	}
 }
 
-static void do_mce_trigger(void)
-{
-	static atomic_t mce_logged;
-	int events = atomic_read(&mce_events);
-	if (events != atomic_read(&mce_logged) && trigger[0]) {
-		/* Small race window, but should be harmless.  */
-		atomic_set(&mce_logged, events);
-		call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT);
-	}
-}
-
 /* 
  * The actual machine check handler
  */
@@ -251,12 +244,8 @@ void do_machine_check(struct pt_regs * r
 	}
 
 	/* Never do anything final in the polling timer */
-	if (!regs) {
-		/* Normal interrupt context here. Call trigger for any new
-		   events. */
-		do_mce_trigger();
+	if (!regs)
 		goto out;
-	}
 
 	/* If we didn't find an uncorrectable error, pick
 	   the last one (shouldn't happen, just being safe). */
@@ -288,6 +277,9 @@ void do_machine_check(struct pt_regs * r
 			do_exit(SIGBUS);
 	}
 
+	/* notify userspace ASAP */
+	set_thread_flag(TIF_MCE_NOTIFY);
+
  out:
 	/* Last thing done in the machine check exception to clear state. */
 	wrmsrl(MSR_IA32_MCG_STATUS, 0);
@@ -344,37 +336,67 @@ static void mcheck_timer(struct work_str
 	on_each_cpu(mcheck_check_cpu, NULL, 1, 1);
 
 	/*
-	 * It's ok to read stale data here for notify_user and
-	 * console_logged as we'll simply get the updated versions
-	 * on the next mcheck_timer execution and atomic operations
-	 * on console_logged act as synchronization for notify_user
-	 * writes.
+	 * Alert userspace if needed.  If we logged an MCE, reduce the
+	 * polling interval, otherwise increase the polling interval.
 	 */
-	if (notify_user && console_logged) {
+	if (mce_notify_user()) {
+		next_interval = max(next_interval/2, HZ/100);
+	} else {
+		next_interval = min(next_interval*2, check_interval*HZ);
+	}
+
+	schedule_delayed_work(&mcheck_work, next_interval);
+}
+
+/*
+ * This is only called from process context.  This is where we do
+ * anything we need to alert userspace about new MCEs.  This is called
+ * directly from the poller and also from entry.S and idle, thanks to
+ * TIF_MCE_NOTIFY.
+ */
+int mce_notify_user(void)
+{
+	clear_thread_flag(TIF_MCE_NOTIFY);
+	if (test_and_clear_bit(0, &notify_user)) {
 		static unsigned long last_print;
 		unsigned long now = jiffies;
 
-		/* if we logged an MCE, reduce the polling interval */
-		next_interval = max(next_interval/2, HZ/100);
-		notify_user = 0;
-		clear_bit(0, &console_logged);
+		wake_up_interruptible(&mce_wait);
+		if (trigger[0])
+			call_usermodehelper(trigger, trigger_argv, NULL,
+						UMH_NO_WAIT);
+
 		if (time_after_eq(now, last_print + (check_interval*HZ))) {
 			last_print = now;
 			printk(KERN_INFO "Machine check events logged\n");
 		}
-	} else {
-		next_interval = min(next_interval*2, check_interval*HZ);
+
+		return 1;
 	}
+	return 0;
+}
 
-	schedule_delayed_work(&mcheck_work, next_interval);
+/* see if the idle task needs to notify userspace */
+static int
+mce_idle_callback(struct notifier_block *nfb, unsigned long action, void *junk)
+{
+	/* IDLE_END should be safe - interrupts are back on */
+	if (action == IDLE_END && test_thread_flag(TIF_MCE_NOTIFY))
+		mce_notify_user();
+
+	return NOTIFY_OK;
 }
 
+static struct notifier_block mce_idle_notifier = {
+	.notifier_call = mce_idle_callback,
+};
 
 static __init int periodic_mcheck_init(void)
 { 
 	next_interval = check_interval * HZ;
 	if (next_interval)
 		schedule_delayed_work(&mcheck_work, next_interval);
+	idle_notifier_register(&mce_idle_notifier);
 	return 0;
 } 
 __initcall(periodic_mcheck_init);
@@ -566,6 +588,14 @@ static ssize_t mce_read(struct file *fil
 	return err ? -EFAULT : buf - ubuf; 
 }
 
+static unsigned int mce_poll(struct file *file, poll_table *wait)
+{
+	poll_wait(file, &mce_wait, wait);
+	if (rcu_dereference(mcelog.next))
+		return POLLIN | POLLRDNORM;
+	return 0;
+}
+
 static int mce_ioctl(struct inode *i, struct file *f,unsigned int cmd, unsigned long arg)
 {
 	int __user *p = (int __user *)arg;
@@ -592,6 +622,7 @@ static const struct file_operations mce_
 	.open = mce_open,
 	.release = mce_release,
 	.read = mce_read,
+	.poll = mce_poll,
 	.ioctl = mce_ioctl,
 };
 
Index: linux/arch/x86_64/kernel/signal.c
===================================================================
--- linux.orig/arch/x86_64/kernel/signal.c
+++ linux/arch/x86_64/kernel/signal.c
@@ -26,6 +26,7 @@
 #include <asm/i387.h>
 #include <asm/proto.h>
 #include <asm/ia32_unistd.h>
+#include <asm/mce.h>
 
 /* #define DEBUG_SIG 1 */
 
@@ -472,6 +473,12 @@ do_notify_resume(struct pt_regs *regs, v
 		clear_thread_flag(TIF_SINGLESTEP);
 	}
 
+#ifdef CONFIG_X86_MCE
+	/* notify userspace of pending MCEs */
+	if (thread_info_flags & _TIF_MCE_NOTIFY)
+		mce_notify_user();
+#endif /* CONFIG_X86_MCE */
+
 	/* deal with pending signal delivery */
 	if (thread_info_flags & (_TIF_SIGPENDING|_TIF_RESTORE_SIGMASK))
 		do_signal(regs);
Index: linux/include/asm-x86_64/mce.h
===================================================================
--- linux.orig/include/asm-x86_64/mce.h
+++ linux/include/asm-x86_64/mce.h
@@ -105,6 +105,8 @@ extern atomic_t mce_entry;
 
 extern void do_machine_check(struct pt_regs *, long);
 
+extern int mce_notify_user(void);
+
 #endif
 
 #endif
Index: linux/include/asm-x86_64/thread_info.h
===================================================================
--- linux.orig/include/asm-x86_64/thread_info.h
+++ linux/include/asm-x86_64/thread_info.h
@@ -115,6 +115,7 @@ static inline struct thread_info *stack_
 #define TIF_SYSCALL_AUDIT	7	/* syscall auditing active */
 #define TIF_SECCOMP		8	/* secure computing */
 #define TIF_RESTORE_SIGMASK	9	/* restore signal mask in do_signal */
+#define TIF_MCE_NOTIFY		10	/* notify userspace of an MCE */
 /* 16 free */
 #define TIF_IA32		17	/* 32bit process */ 
 #define TIF_FORK		18	/* ret_from_fork */
@@ -133,6 +134,7 @@ static inline struct thread_info *stack_
 #define _TIF_SYSCALL_AUDIT	(1<<TIF_SYSCALL_AUDIT)
 #define _TIF_SECCOMP		(1<<TIF_SECCOMP)
 #define _TIF_RESTORE_SIGMASK	(1<<TIF_RESTORE_SIGMASK)
+#define _TIF_MCE_NOTIFY		(1<<TIF_MCE_NOTIFY)
 #define _TIF_IA32		(1<<TIF_IA32)
 #define _TIF_FORK		(1<<TIF_FORK)
 #define _TIF_ABI_PENDING	(1<<TIF_ABI_PENDING)

^ permalink raw reply	[flat|nested] 119+ messages in thread

* [PATCH] [50/58] x86_64: mcelog tolerant level cleanup
  2007-07-19  9:54 [PATCH] [0/58] First batch of x86 patches for .23 Andi Kleen
                   ` (48 preceding siblings ...)
  2007-07-19  9:55 ` [PATCH] [49/58] x86_64: support poll() " Andi Kleen
@ 2007-07-19  9:55 ` Andi Kleen
  2007-07-19  9:55 ` [PATCH] [51/58] i386: fix machine rebooting Andi Kleen
                   ` (7 subsequent siblings)
  57 siblings, 0 replies; 119+ messages in thread
From: Andi Kleen @ 2007-07-19  9:55 UTC (permalink / raw)
  To: thockin, ak, patches, linux-kernel


From: Tim Hockin <thockin@google.com>

Background:
 The MCE handler has several paths that it can take, depending on various
 conditions of the MCE status and the value of the 'tolerant' knob.  The
 exact semantics are not well defined and the code is a bit twisty.

Description:
 This patch makes the MCE handler's behavior more clear by documenting the
 behavior for various 'tolerant' levels.  It also fixes or enhances
 several small things in the handler.  Specifically:
     * If RIPV is not set it is not safe to restart, so set the 'no way out'
       flag rather than the 'kill it' flag.
     * Don't panic() on correctable MCEs.
     * If the _OVER bit is set *and* the _UC bit is set (meaning possibly
       dropped uncorrected errors), set the 'no way out' flag.
     * Use EIPV for testing whether an app can be killed (SIGBUS) rather
       than RIPV.  According to docs, EIPV indicates that the error is
       related to the IP, while RIPV simply means the IP is valid to
       restart from.
     * Don't clear the MCi_STATUS registers until after the panic() path.
       This leaves the status bits set after the panic() so clever BIOSes
       can find them (and dumb BIOSes can do nothing).

 This patch also calls nonseekable_open() in mce_open (as suggested by akpm).

Result:
 Tolerant levels behave almost identically to how they always have, but
 now it's well defined.  There's a slightly higher chance of panic()ing
 when multiple errors happen (a good thing, IMHO).  If you take an MBE and
 panic(), the error status bits are not cleared.

Alternatives:
 None.

Testing:
 I used software to inject correctable and uncorrectable errors.  With
 tolerant = 3, the system usually survives.  With tolerant = 2, the system
 usually panic()s (PCC) but not always.  With tolerant = 1, the system
 always panic()s.  When the system panic()s, the BIOS is able to detect
 that the cause of death was an MC4.  I was not able to reproduce the
 case of a non-PCC error in userspace, with EIPV, with (tolerant < 3).
 That will be rare at best.
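
(For reference, not part of the patch: tolerant can also be flipped at run
time through the per-CPU sysfs attribute the driver already exports; the path
below assumes the usual machinecheck sysdev layout and is illustrative only.)

#include <stdio.h>

int main(void)
{
	/* raise the tolerance level for error-injection testing */
	FILE *f = fopen("/sys/devices/system/machinecheck/machinecheck0/tolerant", "w");

	if (!f)
		return 1;
	fprintf(f, "3\n");	/* 3: never panic or SIGBUS, log all errors */
	fclose(f);
	return 0;
}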

Signed-off-by: Tim Hockin <thockin@google.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---

 Documentation/x86_64/boot-options.txt |    8 +-
 Documentation/x86_64/machinecheck     |   14 ++--
 arch/x86_64/kernel/mce.c              |  101 ++++++++++++++++++++++------------
 3 files changed, 80 insertions(+), 43 deletions(-)

Index: linux/Documentation/x86_64/boot-options.txt
===================================================================
--- linux.orig/Documentation/x86_64/boot-options.txt
+++ linux/Documentation/x86_64/boot-options.txt
@@ -14,9 +14,11 @@ Machine check
    mce=nobootlog
 		Disable boot machine check logging.
    mce=tolerancelevel (number)
-		0: always panic, 1: panic if deadlock possible,
-		2: try to avoid panic, 3: never panic or exit (for testing)
-		default is 1
+		0: always panic on uncorrected errors, log corrected errors
+		1: panic or SIGBUS on uncorrected errors, log corrected errors
+		2: SIGBUS or log uncorrected errors, log corrected errors
+		3: never panic or SIGBUS, log all errors (for testing only)
+		Default is 1
 		Can be also set using sysfs which is preferable.
 
    nomce (for compatibility with i386): same as mce=off
Index: linux/Documentation/x86_64/machinecheck
===================================================================
--- linux.orig/Documentation/x86_64/machinecheck
+++ linux/Documentation/x86_64/machinecheck
@@ -49,12 +49,14 @@ tolerant
 	Since machine check exceptions can happen any time it is sometimes
 	risky for the kernel to kill a process because it defies
 	normal kernel locking rules. The tolerance level configures
-	how hard the kernel tries to recover even at some risk of deadlock.
-
-	0: always panic,
-	1: panic if deadlock possible,
-	2: try to avoid panic,
-   	3: never panic or exit (for testing only)
+	how hard the kernel tries to recover even at some risk of
+	deadlock.  Higher tolerant values trade potentially better uptime
+	with the risk of a crash or even corruption (for tolerant >= 3).
+
+	0: always panic on uncorrected errors, log corrected errors
+	1: panic or SIGBUS on uncorrected errors, log corrected errors
+	2: SIGBUS or log uncorrected errors, log corrected errors
+	3: never panic or SIGBUS, log all errors (for testing only)
 
 	Default: 1
 
Index: linux/arch/x86_64/kernel/mce.c
===================================================================
--- linux.orig/arch/x86_64/kernel/mce.c
+++ linux/arch/x86_64/kernel/mce.c
@@ -37,8 +37,13 @@ atomic_t mce_entry;
 
 static int mce_dont_init;
 
-/* 0: always panic, 1: panic if deadlock possible, 2: try to avoid panic,
-   3: never panic or exit (for testing only) */
+/*
+ * Tolerant levels:
+ *   0: always panic on uncorrected errors, log corrected errors
+ *   1: panic or SIGBUS on uncorrected errors, log corrected errors
+ *   2: SIGBUS or log uncorrected errors (if possible), log corrected errors
+ *   3: never panic or SIGBUS, log all errors (for testing only)
+ */
 static int tolerant = 1;
 static int banks;
 static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL };
@@ -132,9 +137,6 @@ static void mce_panic(char *msg, struct 
 { 
 	int i;
 
-	if (tolerant >= 3)
-		return;
-
 	oops_begin();
 	for (i = 0; i < MCE_LOG_LEN; i++) {
 		unsigned long tsc = mcelog.entry[i].tsc;
@@ -178,11 +180,19 @@ static inline void mce_get_rip(struct mc
 void do_machine_check(struct pt_regs * regs, long error_code)
 {
 	struct mce m, panicm;
-	int nowayout = (tolerant < 1); 
-	int kill_it = 0;
 	u64 mcestart = 0;
 	int i;
 	int panicm_found = 0;
+	/*
+	 * If no_way_out gets set, there is no safe way to recover from this
+	 * MCE.  If tolerant is cranked up, we'll try anyway.
+	 */
+	int no_way_out = 0;
+	/*
+	 * If kill_it gets set, there might be a way to recover from this
+	 * error.
+	 */
+	int kill_it = 0;
 
 	atomic_inc(&mce_entry);
 
@@ -194,8 +204,9 @@ void do_machine_check(struct pt_regs * r
 	memset(&m, 0, sizeof(struct mce));
 	m.cpu = smp_processor_id();
 	rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
+	/* if the restart IP is not valid, we're done for */
 	if (!(m.mcgstatus & MCG_STATUS_RIPV))
-		kill_it = 1;
+		no_way_out = 1;
 	
 	rdtscll(mcestart);
 	barrier();
@@ -214,10 +225,18 @@ void do_machine_check(struct pt_regs * r
 			continue;
 
 		if (m.status & MCI_STATUS_EN) {
-			/* In theory _OVER could be a nowayout too, but
-			   assume any overflowed errors were no fatal. */
-			nowayout |= !!(m.status & MCI_STATUS_PCC);
-			kill_it |= !!(m.status & MCI_STATUS_UC);
+			/* if PCC was set, there's no way out */
+			no_way_out |= !!(m.status & MCI_STATUS_PCC);
+			/*
+			 * If this error was uncorrectable and there was
+			 * an overflow, we're in trouble.  If no overflow,
+			 * we might get away with just killing a task.
+			 */
+			if (m.status & MCI_STATUS_UC) {
+				if (tolerant < 1 || m.status & MCI_STATUS_OVER)
+					no_way_out = 1;
+				kill_it = 1;
+			}
 		}
 
 		if (m.status & MCI_STATUS_MISCV)
@@ -228,7 +247,6 @@ void do_machine_check(struct pt_regs * r
 		mce_get_rip(&m, regs);
 		if (error_code >= 0)
 			rdtscll(m.tsc);
-		wrmsrl(MSR_IA32_MC0_STATUS + i*4, 0);
 		if (error_code != -2)
 			mce_log(&m);
 
@@ -251,37 +269,52 @@ void do_machine_check(struct pt_regs * r
 	   the last one (shouldn't happen, just being safe). */
 	if (!panicm_found)
 		panicm = m;
-	if (nowayout)
+
+	/*
+	 * If we have decided that we just CAN'T continue, and the user
+	 *  has not set tolerant to an insane level, give up and die.
+	 */
+	if (no_way_out && tolerant < 3)
 		mce_panic("Machine check", &panicm, mcestart);
-	if (kill_it) {
+
+	/*
+	 * If the error seems to be unrecoverable, something should be
+	 * done.  Try to kill as little as possible.  If we can kill just
+	 * one task, do that.  If the user has set the tolerance very
+	 * high, don't try to do anything at all.
+	 */
+	if (kill_it && tolerant < 3) {
 		int user_space = 0;
 
-		if (m.mcgstatus & MCG_STATUS_RIPV)
+		/*
+		 * If the EIPV bit is set, it means the saved IP is the
+		 * instruction which caused the MCE.
+		 */
+		if (m.mcgstatus & MCG_STATUS_EIPV)
 			user_space = panicm.rip && (panicm.cs & 3);
-		
-		/* When the machine was in user space and the CPU didn't get
-		   confused it's normally not necessary to panic, unless you 
-		   are paranoid (tolerant == 0)
-
-		   RED-PEN could be more tolerant for MCEs in idle,
-		   but most likely they occur at boot anyways, where
-		   it is best to just halt the machine. */
-		if ((!user_space && (panic_on_oops || tolerant < 2)) ||
-		    (unsigned)current->pid <= 1)
-			mce_panic("Uncorrected machine check", &panicm, mcestart);
-
-		/* do_exit takes an awful lot of locks and has as
-		   slight risk of deadlocking. If you don't want that
-		   don't set tolerant >= 2 */
-		if (tolerant < 3)
+
+		/*
+		 * If we know that the error was in user space, send a
+		 * SIGBUS.  Otherwise, panic if tolerance is low.
+		 *
+		 * do_exit() takes an awful lot of locks and has a slight
+		 * risk of deadlocking.
+		 */
+		if (user_space) {
 			do_exit(SIGBUS);
+		} else if (panic_on_oops || tolerant < 2) {
+			mce_panic("Uncorrected machine check",
+				&panicm, mcestart);
+		}
 	}
 
 	/* notify userspace ASAP */
 	set_thread_flag(TIF_MCE_NOTIFY);
 
  out:
-	/* Last thing done in the machine check exception to clear state. */
+	/* the last thing we do is clear state */
+	for (i = 0; i < banks; i++)
+		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
 	wrmsrl(MSR_IA32_MCG_STATUS, 0);
  out2:
 	atomic_dec(&mce_entry);
@@ -506,7 +539,7 @@ static int mce_open(struct inode *inode,
 
 	spin_unlock(&mce_state_lock);
 
-	return 0;
+	return nonseekable_open(inode, file);
 }
 
 static int mce_release(struct inode *inode, struct file *file)

^ permalink raw reply	[flat|nested] 119+ messages in thread

* [PATCH] [51/58] i386: fix machine rebooting
  2007-07-19  9:54 [PATCH] [0/58] First batch of x86 patches for .23 Andi Kleen
                   ` (49 preceding siblings ...)
  2007-07-19  9:55 ` [PATCH] [50/58] x86_64: mcelog tolerant level cleanup Andi Kleen
@ 2007-07-19  9:55 ` Andi Kleen
  2007-07-19  9:55 ` [PATCH] [52/58] i386: fix section mismatch warnings in mtrr Andi Kleen
                   ` (6 subsequent siblings)
  57 siblings, 0 replies; 119+ messages in thread
From: Andi Kleen @ 2007-07-19  9:55 UTC (permalink / raw)
  To: trux, lee-in-berlin, ak, patches, linux-kernel


From: Truxton Fulton <trux@truxton.com>

59f4e7d572980a521b7bdba74ab71b21f5995538 fixed machine rebooting on Truxton's
machine (when no keyboard was present).  But it broke it on Lee's machine.

The patch reinstates the old (pre-59f4e7d572980a521b7bdba74ab71b21f5995538)
code and, if that doesn't work out, tries the new,
post-59f4e7d572980a521b7bdba74ab71b21f5995538 code instead.

Cc: Lee Garrett <lee-in-berlin@web.de>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Andi Kleen <ak@suse.de>

---

 include/asm-i386/mach-default/mach_reboot.h |   25 ++++++++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)

Index: linux/include/asm-i386/mach-default/mach_reboot.h
===================================================================
--- linux.orig/include/asm-i386/mach-default/mach_reboot.h
+++ linux/include/asm-i386/mach-default/mach_reboot.h
@@ -19,14 +19,37 @@ static inline void kb_wait(void)
 static inline void mach_reboot(void)
 {
 	int i;
+
+	/* old method, works on most machines */
 	for (i = 0; i < 10; i++) {
 		kb_wait();
 		udelay(50);
+		outb(0xfe, 0x64);	/* pulse reset low */
+		udelay(50);
+	}
+
+	/* New method: sets the "System flag" which, when set, indicates
+	 * successful completion of the keyboard controller self-test (Basic
+	 * Assurance Test, BAT).  This is needed for some machines with no
+	 * keyboard plugged in.  This read-modify-write sequence sets only the
+	 * system flag
+	 */
+	for (i = 0; i < 10; i++) {
+		int cmd;
+
+		outb(0x20, 0x64);	/* read Controller Command Byte */
+		udelay(50);
+		kb_wait();
+		udelay(50);
+		cmd = inb(0x60);
+		udelay(50);
+		kb_wait();
+		udelay(50);
 		outb(0x60, 0x64);	/* write Controller Command Byte */
 		udelay(50);
 		kb_wait();
 		udelay(50);
-		outb(0x14, 0x60);	/* set "System flag" */
+		outb(cmd | 0x04, 0x60);	/* set "System flag" */
 		udelay(50);
 		kb_wait();
 		udelay(50);

^ permalink raw reply	[flat|nested] 119+ messages in thread

* [PATCH] [52/58] i386: fix section mismatch warnings in mtrr
  2007-07-19  9:54 [PATCH] [0/58] First batch of x86 patches for .23 Andi Kleen
                   ` (50 preceding siblings ...)
  2007-07-19  9:55 ` [PATCH] [51/58] i386: fix machine rebooting Andi Kleen
@ 2007-07-19  9:55 ` Andi Kleen
  2007-07-19  9:55 ` [PATCH] [53/58] x86: PM_TRACE support Andi Kleen
                   ` (5 subsequent siblings)
  57 siblings, 0 replies; 119+ messages in thread
From: Andi Kleen @ 2007-07-19  9:55 UTC (permalink / raw)
  To: sam, patches, linux-kernel


From: Sam Ravnborg <sam@ravnborg.org>

The following section mismatch warnings were reported by Andrey Borzenkov:

WARNING: arch/i386/kernel/built-in.o - Section mismatch: reference to .init.text:amd_init_mtrr from .text between 'mtrr_bp_init' (at offset 0x967a) and 'mtrr_attrib_to_str'
WARNING: arch/i386/kernel/built-in.o - Section mismatch: reference to .init.text:cyrix_init_mtrr from .text between 'mtrr_bp_init' (at offset 0x967f) and 'mtrr_attrib_to_str'
WARNING: arch/i386/kernel/built-in.o - Section mismatch: reference to .init.text:centaur_init_mtrr from .text between 'mtrr_bp_init' (at offset 0x9684) and 'mtrr_attrib_to_str'
WARNING: arch/i386/kernel/built-in.o - Section mismatch: reference to .init.text: from .text between 'get_mtrr_state' (at offset 0xa735) and 'generic_get_mtrr'
WARNING: arch/i386/kernel/built-in.o - Section mismatch: reference to .init.text: from .text between 'get_mtrr_state' (at offset 0xa749) and 'generic_get_mtrr'
WARNING: arch/i386/kernel/built-in.o - Section mismatch: reference to .init.text: from .text between 'get_mtrr_state' (at offset 0xa770) and 'generic_get_mtrr'

It was tracked down to a few functions missing the __init tag.
Compile tested only.
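
For illustration (hypothetical names, not the actual mtrr code), such a
warning appears whenever a function kept in .text references one placed in
.init.text:

static int __init foo_init(void)       /* ends up in .init.text */
{
        return 0;
}

void bar_bp_init(void)                  /* stays in .text */
{
        foo_init();     /* .text -> .init.text reference: modpost warns */
}

Tagging the caller with __init moves it into .init.text as well, so the
reference no longer points into a section that is discarded after boot.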

Signed-off-by: Sam Ravnborg <sam@ravnborg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Andi Kleen <ak@suse.de>

---

 arch/i386/kernel/cpu/mtrr/generic.c |    2 +-
 arch/i386/kernel/cpu/mtrr/main.c    |    2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

Index: linux/arch/i386/kernel/cpu/mtrr/generic.c
===================================================================
--- linux.orig/arch/i386/kernel/cpu/mtrr/generic.c
+++ linux/arch/i386/kernel/cpu/mtrr/generic.c
@@ -79,7 +79,7 @@ static void print_fixed(unsigned base, u
 }
 
 /*  Grab all of the MTRR state for this CPU into *state  */
-void get_mtrr_state(void)
+void __init get_mtrr_state(void)
 {
 	unsigned int i;
 	struct mtrr_var_range *vrs;
Index: linux/arch/i386/kernel/cpu/mtrr/main.c
===================================================================
--- linux.orig/arch/i386/kernel/cpu/mtrr/main.c
+++ linux/arch/i386/kernel/cpu/mtrr/main.c
@@ -643,7 +643,7 @@ static struct sysdev_driver mtrr_sysdev_
  * initialized (i.e. before smp_init()).
  * 
  */
-__init void mtrr_bp_init(void)
+void __init mtrr_bp_init(void)
 {
 	init_ifs();
 

^ permalink raw reply	[flat|nested] 119+ messages in thread

* [PATCH] [53/58] x86: PM_TRACE support
  2007-07-19  9:54 [PATCH] [0/58] First batch of x86 patches for .23 Andi Kleen
                   ` (51 preceding siblings ...)
  2007-07-19  9:55 ` [PATCH] [52/58] i386: fix section mismatch warnings in mtrr Andi Kleen
@ 2007-07-19  9:55 ` Andi Kleen
  2007-07-19  9:55 ` [PATCH] [54/58] x86: Make Alt-SysRq-p display the debug register contents Andi Kleen
                   ` (4 subsequent siblings)
  57 siblings, 0 replies; 119+ messages in thread
From: Andi Kleen @ 2007-07-19  9:55 UTC (permalink / raw)
  To: nigel, rdunlap, ak, rjw, pavel, patches, linux-kernel


From: Nigel Cunningham <nigel@nigel.suspend2.net>

Signed-off-by: Nigel Cunningham <nigel@nigel.suspend2.net>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Randy Dunlap <rdunlap@xenotime.net>
Cc: Andi Kleen <ak@suse.de>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Cc: Pavel Machek <pavel@ucw.cz>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---

 arch/x86_64/kernel/vmlinux.lds.S  |    7 +++++++
 drivers/base/power/trace.c        |    5 ++++-
 include/asm-i386/resume-trace.h   |   13 +++++++++++++
 include/asm-x86_64/resume-trace.h |   13 +++++++++++++
 include/linux/resume-trace.h      |   19 +++++--------------
 kernel/power/Kconfig              |    2 +-
 6 files changed, 43 insertions(+), 16 deletions(-)

Index: linux/arch/x86_64/kernel/vmlinux.lds.S
===================================================================
--- linux.orig/arch/x86_64/kernel/vmlinux.lds.S
+++ linux/arch/x86_64/kernel/vmlinux.lds.S
@@ -52,6 +52,13 @@ SECTIONS
 
   RODATA
 
+  . = ALIGN(4);
+  .tracedata : AT(ADDR(.tracedata) - LOAD_OFFSET) {
+  	__tracedata_start = .;
+	*(.tracedata)
+  	__tracedata_end = .;
+  }
+
   . = ALIGN(PAGE_SIZE);        /* Align data segment to page size boundary */
 				/* Data */
   .data : AT(ADDR(.data) - LOAD_OFFSET) {
Index: linux/drivers/base/power/trace.c
===================================================================
--- linux.orig/drivers/base/power/trace.c
+++ linux/drivers/base/power/trace.c
@@ -142,6 +142,7 @@ void set_trace_device(struct device *dev
 {
 	dev_hash_value = hash_string(DEVSEED, dev->bus_id, DEVHASH);
 }
+EXPORT_SYMBOL(set_trace_device);
 
 /*
  * We could just take the "tracedata" index into the .tracedata
@@ -162,6 +163,7 @@ void generate_resume_trace(void *traceda
 	file_hash_value = hash_string(lineno, file, FILEHASH);
 	set_magic_time(user_hash_value, file_hash_value, dev_hash_value);
 }
+EXPORT_SYMBOL(generate_resume_trace);
 
 extern char __tracedata_start, __tracedata_end;
 static int show_file_hash(unsigned int value)
@@ -170,7 +172,8 @@ static int show_file_hash(unsigned int v
 	char *tracedata;
 
 	match = 0;
-	for (tracedata = &__tracedata_start ; tracedata < &__tracedata_end ; tracedata += 6) {
+	for (tracedata = &__tracedata_start ; tracedata < &__tracedata_end ;
+			tracedata += 2 + sizeof(unsigned long)) {
 		unsigned short lineno = *(unsigned short *)tracedata;
 		const char *file = *(const char **)(tracedata + 2);
 		unsigned int hash = hash_string(lineno, file, FILEHASH);
Index: linux/include/asm-i386/resume-trace.h
===================================================================
--- /dev/null
+++ linux/include/asm-i386/resume-trace.h
@@ -0,0 +1,13 @@
+#define TRACE_RESUME(user) do {					\
+	if (pm_trace_enabled) {					\
+		void *tracedata;				\
+		asm volatile("movl $1f,%0\n"			\
+			".section .tracedata,\"a\"\n"		\
+			"1:\t.word %c1\n"			\
+			"\t.long %c2\n"				\
+			".previous"				\
+			:"=r" (tracedata)			\
+			: "i" (__LINE__), "i" (__FILE__));	\
+		generate_resume_trace(tracedata, user);		\
+	}							\
+} while (0)
Index: linux/include/asm-x86_64/resume-trace.h
===================================================================
--- /dev/null
+++ linux/include/asm-x86_64/resume-trace.h
@@ -0,0 +1,13 @@
+#define TRACE_RESUME(user) do {					\
+	if (pm_trace_enabled) {					\
+		void *tracedata;				\
+		asm volatile("movq $1f,%0\n"			\
+			".section .tracedata,\"a\"\n"		\
+			"1:\t.word %c1\n"			\
+			"\t.quad %c2\n"				\
+			".previous"				\
+			:"=r" (tracedata)			\
+			: "i" (__LINE__), "i" (__FILE__));	\
+		generate_resume_trace(tracedata, user);		\
+	}							\
+} while (0)
Index: linux/include/linux/resume-trace.h
===================================================================
--- linux.orig/include/linux/resume-trace.h
+++ linux/include/linux/resume-trace.h
@@ -2,6 +2,7 @@
 #define RESUME_TRACE_H
 
 #ifdef CONFIG_PM_TRACE
+#include <asm/resume-trace.h>
 
 extern int pm_trace_enabled;
 
@@ -9,20 +10,10 @@ struct device;
 extern void set_trace_device(struct device *);
 extern void generate_resume_trace(void *tracedata, unsigned int user);
 
-#define TRACE_DEVICE(dev) set_trace_device(dev)
-#define TRACE_RESUME(user) do {					\
-	if (pm_trace_enabled) {					\
-		void *tracedata;				\
-		asm volatile("movl $1f,%0\n"			\
-			".section .tracedata,\"a\"\n"		\
-			"1:\t.word %c1\n"			\
-			"\t.long %c2\n"				\
-			".previous"				\
-			:"=r" (tracedata)			\
-			: "i" (__LINE__), "i" (__FILE__));	\
-		generate_resume_trace(tracedata, user);		\
-	}							\
-} while (0)
+#define TRACE_DEVICE(dev) do { \
+	if (pm_trace_enabled) \
+		set_trace_device(dev); \
+	} while(0)
 
 #else
 
Index: linux/kernel/power/Kconfig
===================================================================
--- linux.orig/kernel/power/Kconfig
+++ linux/kernel/power/Kconfig
@@ -50,7 +50,7 @@ config DISABLE_CONSOLE_SUSPEND
 
 config PM_TRACE
 	bool "Suspend/resume event tracing"
-	depends on PM && PM_DEBUG && X86_32 && EXPERIMENTAL
+	depends on PM && PM_DEBUG && X86 && EXPERIMENTAL
 	default n
 	---help---
 	This enables some cheesy code to save the last PM event point in the

^ permalink raw reply	[flat|nested] 119+ messages in thread

* [PATCH] [54/58] x86: Make Alt-SysRq-p display the debug register contents
  2007-07-19  9:54 [PATCH] [0/58] First batch of x86 patches for .23 Andi Kleen
                   ` (52 preceding siblings ...)
  2007-07-19  9:55 ` [PATCH] [53/58] x86: PM_TRACE support Andi Kleen
@ 2007-07-19  9:55 ` Andi Kleen
  2007-07-19  9:55 ` [PATCH] [55/58] i386: add reference to the arguments Andi Kleen
                   ` (3 subsequent siblings)
  57 siblings, 0 replies; 119+ messages in thread
From: Andi Kleen @ 2007-07-19  9:55 UTC (permalink / raw)
  To: stern, ak, patches, linux-kernel


From: Alan Stern <stern@rowland.harvard.edu>

This patch (as921) adds code to the show_regs() routine in i386 and x86_64
to print the contents of the debug registers along with all the others.

Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
Signed-off-by: Roland McGrath <roland@redhat.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---

 arch/i386/kernel/process.c   |   12 ++++++++++++
 arch/x86_64/kernel/process.c |   10 ++++++++++
 2 files changed, 22 insertions(+)

Index: linux/arch/i386/kernel/process.c
===================================================================
--- linux.orig/arch/i386/kernel/process.c
+++ linux/arch/i386/kernel/process.c
@@ -300,6 +300,7 @@ early_param("idle", idle_setup);
 void show_regs(struct pt_regs * regs)
 {
 	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;
+	unsigned long d0, d1, d2, d3, d6, d7;
 
 	printk("\n");
 	printk("Pid: %d, comm: %20s\n", current->pid, current->comm);
@@ -324,6 +325,17 @@ void show_regs(struct pt_regs * regs)
 	cr3 = read_cr3();
 	cr4 = read_cr4_safe();
 	printk("CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", cr0, cr2, cr3, cr4);
+
+	get_debugreg(d0, 0);
+	get_debugreg(d1, 1);
+	get_debugreg(d2, 2);
+	get_debugreg(d3, 3);
+	printk("DR0: %08lx DR1: %08lx DR2: %08lx DR3: %08lx\n",
+			d0, d1, d2, d3);
+	get_debugreg(d6, 6);
+	get_debugreg(d7, 7);
+	printk("DR6: %08lx DR7: %08lx\n", d6, d7);
+
 	show_trace(NULL, regs, &regs->esp);
 }
 
Index: linux/arch/x86_64/kernel/process.c
===================================================================
--- linux.orig/arch/x86_64/kernel/process.c
+++ linux/arch/x86_64/kernel/process.c
@@ -306,6 +306,7 @@ early_param("idle", idle_setup);
 void __show_regs(struct pt_regs * regs)
 {
 	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
+	unsigned long d0, d1, d2, d3, d6, d7;
 	unsigned int fsindex,gsindex;
 	unsigned int ds,cs,es; 
 
@@ -350,6 +351,15 @@ void __show_regs(struct pt_regs * regs)
 	       fs,fsindex,gs,gsindex,shadowgs); 
 	printk("CS:  %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0); 
 	printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4);
+
+	get_debugreg(d0, 0);
+	get_debugreg(d1, 1);
+	get_debugreg(d2, 2);
+	printk("DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
+	get_debugreg(d3, 3);
+	get_debugreg(d6, 6);
+	get_debugreg(d7, 7);
+	printk("DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
 }
 
 void show_regs(struct pt_regs *regs)

^ permalink raw reply	[flat|nested] 119+ messages in thread

* [PATCH] [55/58] i386: add reference to the arguments
  2007-07-19  9:54 [PATCH] [0/58] First batch of x86 patches for .23 Andi Kleen
                   ` (53 preceding siblings ...)
  2007-07-19  9:55 ` [PATCH] [54/58] x86: Make Alt-SysRq-p display the debug register contents Andi Kleen
@ 2007-07-19  9:55 ` Andi Kleen
  2007-07-19  9:55 ` [PATCH] [56/58] x86: round_jiffies() for i386 and x86-64 non-critical/corrected MCE polling Andi Kleen
                   ` (2 subsequent siblings)
  57 siblings, 0 replies; 119+ messages in thread
From: Andi Kleen @ 2007-07-19  9:55 UTC (permalink / raw)
  To: akpm, ak, benh, patches, linux-kernel


From: Andrew Morton <akpm@linux-foundation.org>

Prevent stuff like this:

mm/vmalloc.c: In function 'unmap_kernel_range':
mm/vmalloc.c:75: warning: unused variable 'start'
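
The warning comes from the macro form simply discarding its arguments.  A
minimal sketch (illustrative caller, not the actual mm/vmalloc.c code):

#define flush_tlb_kernel_range(start, end) flush_tlb_all()

void unmap_example(unsigned long addr, unsigned long size)
{
        unsigned long start = addr;             /* warning: unused variable 'start' */
        unsigned long end = addr + size;        /* warning: unused variable 'end'   */

        flush_tlb_kernel_range(start, end);     /* expands to plain flush_tlb_all() */
}

With the static inline version below, the arguments are evaluated and
type-checked, so gcc no longer considers them unused.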

Cc: Andi Kleen <ak@suse.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Andi Kleen <ak@suse.de>

---

 include/asm-i386/tlbflush.h |    6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

Index: linux/include/asm-i386/tlbflush.h
===================================================================
--- linux.orig/include/asm-i386/tlbflush.h
+++ linux/include/asm-i386/tlbflush.h
@@ -160,7 +160,11 @@ DECLARE_PER_CPU(struct tlb_state, cpu_tl
 	native_flush_tlb_others(&mask, mm, va)
 #endif
 
-#define flush_tlb_kernel_range(start, end) flush_tlb_all()
+static inline void flush_tlb_kernel_range(unsigned long start,
+					unsigned long end)
+{
+	flush_tlb_all();
+}
 
 static inline void flush_tlb_pgtables(struct mm_struct *mm,
 				      unsigned long start, unsigned long end)

^ permalink raw reply	[flat|nested] 119+ messages in thread

* [PATCH] [56/58] x86: round_jiffies() for i386 and x86-64 non-critical/corrected MCE polling
  2007-07-19  9:54 [PATCH] [0/58] First batch of x86 patches for .23 Andi Kleen
                   ` (54 preceding siblings ...)
  2007-07-19  9:55 ` [PATCH] [55/58] i386: add reference to the arguments Andi Kleen
@ 2007-07-19  9:55 ` Andi Kleen
  2007-07-19  9:55 ` [PATCH] [57/58] x86_64: check remote IRR bit before migrating level triggered irq Andi Kleen
  2007-07-19  9:55 ` [PATCH] [58/58] x86: remove support for the Rise CPU Andi Kleen
  57 siblings, 0 replies; 119+ messages in thread
From: Andi Kleen @ 2007-07-19  9:55 UTC (permalink / raw)
  To: venkatesh.pallipadi, ak, patches, linux-kernel


From: Venki Pallipadi <venkatesh.pallipadi@intel.com>

This helps to reduce the frequency at which the CPU must be taken out of a
lower-power state.
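
A short before/after sketch of the effect (15 s is the assumed value of
MCE_RATE, used here only for illustration): round_jiffies_relative() moves
the expiry onto a whole-second boundary, so the poll can share a wakeup with
other rounded timers instead of pulling an idle CPU out of a low-power state
on its own.

unsigned long delay = 15 * HZ;                  /* assumed polling interval */

/* before: expires at an arbitrary jiffy */
schedule_delayed_work(&mce_work, delay);

/* after: expiry rounded to a whole second, shared with other rounded timers */
schedule_delayed_work(&mce_work, round_jiffies_relative(delay));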

Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Acked-by: Tim Hockin <thockin@hockin.org>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---

 arch/i386/kernel/cpu/mcheck/non-fatal.c |    4 ++--
 arch/x86_64/kernel/mce.c                |    9 ++++++---
 2 files changed, 8 insertions(+), 5 deletions(-)

Index: linux/arch/i386/kernel/cpu/mcheck/non-fatal.c
===================================================================
--- linux.orig/arch/i386/kernel/cpu/mcheck/non-fatal.c
+++ linux/arch/i386/kernel/cpu/mcheck/non-fatal.c
@@ -57,7 +57,7 @@ static DECLARE_DELAYED_WORK(mce_work, mc
 static void mce_work_fn(struct work_struct *work)
 { 
 	on_each_cpu(mce_checkregs, NULL, 1, 1);
-	schedule_delayed_work(&mce_work, MCE_RATE);
+	schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE));
 } 
 
 static int __init init_nonfatal_mce_checker(void)
@@ -82,7 +82,7 @@ static int __init init_nonfatal_mce_chec
 	/*
 	 * Check for non-fatal errors every MCE_RATE s
 	 */
-	schedule_delayed_work(&mce_work, MCE_RATE);
+	schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE));
 	printk(KERN_INFO "Machine check exception polling timer started.\n");
 	return 0;
 }
Index: linux/arch/x86_64/kernel/mce.c
===================================================================
--- linux.orig/arch/x86_64/kernel/mce.c
+++ linux/arch/x86_64/kernel/mce.c
@@ -375,7 +375,8 @@ static void mcheck_timer(struct work_str
 	if (mce_notify_user()) {
 		next_interval = max(next_interval/2, HZ/100);
 	} else {
-		next_interval = min(next_interval*2, check_interval*HZ);
+		next_interval = min(next_interval*2,
+				(int)round_jiffies_relative(check_interval*HZ));
 	}
 
 	schedule_delayed_work(&mcheck_work, next_interval);
@@ -428,7 +429,8 @@ static __init int periodic_mcheck_init(v
 { 
 	next_interval = check_interval * HZ;
 	if (next_interval)
-		schedule_delayed_work(&mcheck_work, next_interval);
+		schedule_delayed_work(&mcheck_work,
+				      round_jiffies_relative(next_interval));
 	idle_notifier_register(&mce_idle_notifier);
 	return 0;
 } 
@@ -720,7 +722,8 @@ static void mce_restart(void) 
 	on_each_cpu(mce_init, NULL, 1, 1);       
 	next_interval = check_interval * HZ;
 	if (next_interval)
-		schedule_delayed_work(&mcheck_work, next_interval);
+		schedule_delayed_work(&mcheck_work,
+				      round_jiffies_relative(next_interval));
 }
 
 static struct sysdev_class mce_sysclass = {

^ permalink raw reply	[flat|nested] 119+ messages in thread

* [PATCH] [57/58] x86_64: check remote IRR bit before migrating level triggered irq
  2007-07-19  9:54 [PATCH] [0/58] First batch of x86 patches for .23 Andi Kleen
                   ` (55 preceding siblings ...)
  2007-07-19  9:55 ` [PATCH] [56/58] x86: round_jiffies() for i386 and x86-64 non-critical/corrected MCE polling Andi Kleen
@ 2007-07-19  9:55 ` Andi Kleen
  2007-07-19  9:55 ` [PATCH] [58/58] x86: remove support for the Rise CPU Andi Kleen
  57 siblings, 0 replies; 119+ messages in thread
From: Andi Kleen @ 2007-07-19  9:55 UTC (permalink / raw)
  To: ebiederm, nanhai.zou, asit.k.mallick, keith.packard, ak, patches,
	linux-kernel


From: Eric W. Biederman <ebiederm@xmission.com>

On x86_64 kernels, level-triggered irq migration is initiated in the
context of that interrupt (after the irq handler has run), and the following
steps are used to do the migration:

1. mask IOAPIC RTE entry;     // write to IOAPIC RTE
2. EOI;                       // processor EOI write
3. reprogram IOAPIC RTE entry // write to IOAPIC RTE with new destination and
                              // and interrupt vector due to per cpu vector
                              // allocation.
4. unmask IOAPIC RTE entry;   // write to IOAPIC RTE

Because of the per-cpu vector allocation in x86_64 kernels, when the irq
migrates to a different cpu, a new vector (corresponding to the new cpu) is
allocated.

An EOI write to local APIC has a side effect of generating an EOI write for
level trigger interrupts (normally this is a broadcast to all IOAPICs). 
The EOI broadcast generated as a side effect of EOI write to processor may
be delayed while the other IOAPIC writes (step 3 and 4) can go through.

Normally, the EOI generated by local APIC for level trigger interrupt
contains vector number.  The IOAPIC will take this vector number and search
the IOAPIC RTE entries for an entry with matching vector number and clear
the remote IRR bit (indicate EOI).  However, if the vector number is
changed (as in step 3) the IOAPIC will not find the RTE entry when the EOI
is received later.  This will cause the remote IRR to get stuck causing the
interrupt hang (no more interrupt from this RTE).
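
To make the ordering concrete, here is a sketch of the failing sequence
(vector numbers are illustrative only):

  cpu handling the irq                     IO-APIC
  1. mask RTE                              Remote IRR still set
  2. local APIC EOI write (vector 0x31)    broadcast EOI still in flight
  3. rewrite RTE: vector 0x31 -> 0x41
  4. unmask RTE
     ... delayed EOI(0x31) arrives         no RTE matches vector 0x31,
                                           Remote IRR is never cleared,
                                           the irq never fires again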

The current x86_64 kernel assumes that the remote IRR bit is cleared by the
time the IOAPIC RTE is reprogrammed.  Fix this assumption by checking the
remote IRR bit and, if it is still set, delaying the irq migration to the
next interrupt arrival event (hopefully the remote IRR bit will be cleared
before the IOAPIC RTE is reprogrammed next time).

Initial analysis and patch from Nanhai.

Clean up patch from Suresh.

Rewritten to be less intrusive, and to contain a big fat comment by Eric.

[akpm@linux-foundation.org: fix comments]
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Nanhai Zou <nanhai.zou@intel.com>
Acked-by: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: Asit Mallick <asit.k.mallick@intel.com>
Cc: Keith Packard <keith.packard@intel.com>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---

 arch/x86_64/kernel/io_apic.c |   58 +++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 56 insertions(+), 2 deletions(-)

Index: linux/arch/x86_64/kernel/io_apic.c
===================================================================
--- linux.orig/arch/x86_64/kernel/io_apic.c
+++ linux/arch/x86_64/kernel/io_apic.c
@@ -152,6 +152,32 @@ static inline void io_apic_modify(unsign
 	writel(value, &io_apic->data);
 }
 
+static int io_apic_level_ack_pending(unsigned int irq)
+{
+	struct irq_pin_list *entry;
+	unsigned long flags;
+	int pending = 0;
+
+	spin_lock_irqsave(&ioapic_lock, flags);
+	entry = irq_2_pin + irq;
+	for (;;) {
+		unsigned int reg;
+		int pin;
+
+		pin = entry->pin;
+		if (pin == -1)
+			break;
+		reg = io_apic_read(entry->apic, 0x10 + pin*2);
+		/* Is the remote IRR bit set? */
+		pending |= (reg >> 14) & 1;
+		if (!entry->next)
+			break;
+		entry = irq_2_pin + entry->next;
+	}
+	spin_unlock_irqrestore(&ioapic_lock, flags);
+	return pending;
+}
+
 /*
  * Synchronize the IO-APIC and the CPU by doing
  * a dummy read from the IO-APIC
@@ -1418,9 +1444,37 @@ static void ack_apic_level(unsigned int 
 	ack_APIC_irq();
 
 	/* Now we can move and renable the irq */
-	move_masked_irq(irq);
-	if (unlikely(do_unmask_irq))
+	if (unlikely(do_unmask_irq)) {
+		/* Only migrate the irq if the ack has been received.
+		 *
+		 * On rare occasions the broadcast level triggered ack gets
+		 * delayed going to ioapics, and if we reprogram the
+		 * vector while Remote IRR is still set the irq will never
+		 * fire again.
+		 *
+		 * To prevent this scenario we read the Remote IRR bit
+		 * of the ioapic.  This has two effects.
+		 * - On any sane system the read of the ioapic will
+		 *   flush writes (and acks) going to the ioapic from
+		 *   this cpu.
+		 * - We get to see if the ACK has actually been delivered.
+		 *
+		 * Based on failed experiments of reprogramming the
+		 * ioapic entry from outside of irq context starting
+		 * with masking the ioapic entry and then polling until
+		 * Remote IRR was clear before reprogramming the
+		 * ioapic I don't trust the Remote IRR bit to be
+		 * completey accurate.
+		 *
+		 * However there appears to be no other way to plug
+		 * this race, so if the Remote IRR bit is not
+		 * accurate and is causing problems then it is a hardware bug
+		 * and you can go talk to the chipset vendor about it.
+		 */
+		if (!io_apic_level_ack_pending(irq))
+			move_masked_irq(irq);
 		unmask_IO_APIC_irq(irq);
+	}
 }
 
 static struct irq_chip ioapic_chip __read_mostly = {

^ permalink raw reply	[flat|nested] 119+ messages in thread

* [PATCH] [58/58] x86: remove support for the Rise CPU
  2007-07-19  9:54 [PATCH] [0/58] First batch of x86 patches for .23 Andi Kleen
                   ` (56 preceding siblings ...)
  2007-07-19  9:55 ` [PATCH] [57/58] x86_64: check remote IRR bit before migrating level triggered irq Andi Kleen
@ 2007-07-19  9:55 ` Andi Kleen
  2007-07-19 10:45   ` Alan Cox
  57 siblings, 1 reply; 119+ messages in thread
From: Andi Kleen @ 2007-07-19  9:55 UTC (permalink / raw)
  To: bunk, ak, patches, linux-kernel


From: Adrian Bunk <bunk@stusta.de>

The Rise CPUs were only very short-lived, and there are no reports of
anyone both owning one and running Linux on it.

Googling for the printk string "CPU: Rise iDragon" didn't find any dmesg
available online.

If it turns out that, against all expectations, there are actually users,
reverting this patch would be easy.

This patch will make the kernel images smaller by a few bytes for all
i386 users.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andi Kleen <ak@suse.de>
Acked-by: Dave Jones <davej@redhat.com>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---

 arch/i386/kernel/cpu/Makefile  |    1 
 arch/i386/kernel/cpu/common.c  |    2 -
 arch/i386/kernel/cpu/rise.c    |   52 -----------------------------------------
 include/asm-i386/processor.h   |    1 
 include/asm-x86_64/processor.h |    1 
 5 files changed, 57 deletions(-)

Index: linux/arch/i386/kernel/cpu/Makefile
===================================================================
--- linux.orig/arch/i386/kernel/cpu/Makefile
+++ linux/arch/i386/kernel/cpu/Makefile
@@ -9,7 +9,6 @@ obj-y	+=	cyrix.o
 obj-y	+=	centaur.o
 obj-y	+=	transmeta.o
 obj-y	+=	intel.o intel_cacheinfo.o addon_cpuid_features.o
-obj-y	+=	rise.o
 obj-y	+=	nexgen.o
 obj-y	+=	umc.o
 
Index: linux/arch/i386/kernel/cpu/common.c
===================================================================
--- linux.orig/arch/i386/kernel/cpu/common.c
+++ linux/arch/i386/kernel/cpu/common.c
@@ -606,7 +606,6 @@ extern int nsc_init_cpu(void);
 extern int amd_init_cpu(void);
 extern int centaur_init_cpu(void);
 extern int transmeta_init_cpu(void);
-extern int rise_init_cpu(void);
 extern int nexgen_init_cpu(void);
 extern int umc_init_cpu(void);
 
@@ -618,7 +617,6 @@ void __init early_cpu_init(void)
 	amd_init_cpu();
 	centaur_init_cpu();
 	transmeta_init_cpu();
-	rise_init_cpu();
 	nexgen_init_cpu();
 	umc_init_cpu();
 	early_cpu_detect();
Index: linux/arch/i386/kernel/cpu/rise.c
===================================================================
--- linux.orig/arch/i386/kernel/cpu/rise.c
+++ /dev/null
@@ -1,52 +0,0 @@
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/bitops.h>
-#include <asm/processor.h>
-
-#include "cpu.h"
-
-static void __cpuinit init_rise(struct cpuinfo_x86 *c)
-{
-	printk("CPU: Rise iDragon");
-	if (c->x86_model > 2)
-		printk(" II");
-	printk("\n");
-
-	/* Unhide possibly hidden capability flags
-	   The mp6 iDragon family don't have MSRs.
-	   We switch on extra features with this cpuid weirdness: */
-	__asm__ (
-		"movl $0x6363452a, %%eax\n\t"
-		"movl $0x3231206c, %%ecx\n\t"
-		"movl $0x2a32313a, %%edx\n\t"
-		"cpuid\n\t"
-		"movl $0x63634523, %%eax\n\t"
-		"movl $0x32315f6c, %%ecx\n\t"
-		"movl $0x2333313a, %%edx\n\t"
-		"cpuid\n\t" : : : "eax", "ebx", "ecx", "edx"
-	);
-	set_bit(X86_FEATURE_CX8, c->x86_capability);
-}
-
-static struct cpu_dev rise_cpu_dev __cpuinitdata = {
-	.c_vendor	= "Rise",
-	.c_ident	= { "RiseRiseRise" },
-	.c_models = {
-		{ .vendor = X86_VENDOR_RISE, .family = 5, .model_names = 
-		  { 
-			  [0] = "iDragon", 
-			  [2] = "iDragon", 
-			  [8] = "iDragon II", 
-			  [9] = "iDragon II"
-		  }
-		},
-	},
-	.c_init		= init_rise,
-};
-
-int __init rise_init_cpu(void)
-{
-	cpu_devs[X86_VENDOR_RISE] = &rise_cpu_dev;
-	return 0;
-}
-
Index: linux/include/asm-i386/processor.h
===================================================================
--- linux.orig/include/asm-i386/processor.h
+++ linux/include/asm-i386/processor.h
@@ -88,7 +88,6 @@ struct cpuinfo_x86 {
 #define X86_VENDOR_UMC 3
 #define X86_VENDOR_NEXGEN 4
 #define X86_VENDOR_CENTAUR 5
-#define X86_VENDOR_RISE 6
 #define X86_VENDOR_TRANSMETA 7
 #define X86_VENDOR_NSC 8
 #define X86_VENDOR_NUM 9
Index: linux/include/asm-x86_64/processor.h
===================================================================
--- linux.orig/include/asm-x86_64/processor.h
+++ linux/include/asm-x86_64/processor.h
@@ -83,7 +83,6 @@ struct cpuinfo_x86 {
 #define X86_VENDOR_UMC 3
 #define X86_VENDOR_NEXGEN 4
 #define X86_VENDOR_CENTAUR 5
-#define X86_VENDOR_RISE 6
 #define X86_VENDOR_TRANSMETA 7
 #define X86_VENDOR_NUM 8
 #define X86_VENDOR_UNKNOWN 0xff

^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [PATCH] [21/58] i386: Reserve the right performance counter for the Intel PerfMon NMI watchdog
  2007-07-19  9:55 ` [PATCH] [21/58] i386: Reserve the right performance counter for the Intel PerfMon " Andi Kleen
@ 2007-07-19 10:21   ` Björn Steinbrink
  2007-07-19 10:45     ` Andi Kleen
  0 siblings, 1 reply; 119+ messages in thread
From: Björn Steinbrink @ 2007-07-19 10:21 UTC (permalink / raw)
  To: Andi Kleen; +Cc: patches, linux-kernel


On 2007.07.19 11:55:06 +0200, Andi Kleen wrote:
> 
> From: Björn Steinbrink <B.Steinbrink@gmx.de>
> 
> The Intel PerfMon NMI watchdog was using the generic reservation
> function which always reserves the first performance counter. But the
> watchdog actually uses the second performance counter, thus we need a
> specialised function.

Ah, almost forgot about that patch. Actually, thanks to your fix that
basically reverted the msr->offset conversion to its 2.6.21
implementation, single_msr_reserve has sane semantics now and does just
what the name suggests (before, the wd_ops entries had to store the
"base" msrs, so it was really a first_msr_reserve).

With wd_ops->perfctr no longer needed to be the base msr, we can just
fix that value for the arch perfmon watchdog. (And maybe we should
remove the values for those implementations that don't employ the
single_msr_reserve() stuff?)

Thanks,
Björn

----

From: Björn Steinbrink <B.Steinbrink@gmx.de>

The Intel PerfMon NMI watchdog reserves the first performance counter,
but uses the second one. Make it correctly reserve the second one.

Signed-off-by: Björn Steinbrink <B.Steinbrink@gmx.de>
---
diff --git a/arch/i386/kernel/cpu/perfctr-watchdog.c b/arch/i386/kernel/cpu/perfctr-watchdog.c
index 4d26d51..30b5e48 100644
--- a/arch/i386/kernel/cpu/perfctr-watchdog.c
+++ b/arch/i386/kernel/cpu/perfctr-watchdog.c
@@ -599,8 +599,8 @@ static struct wd_ops intel_arch_wd_ops = {
 	.setup = setup_intel_arch_watchdog,
 	.rearm = p6_rearm,
 	.stop = single_msr_stop_watchdog,
-	.perfctr = MSR_ARCH_PERFMON_PERFCTR0,
-	.evntsel = MSR_ARCH_PERFMON_EVENTSEL0,
+	.perfctr = MSR_ARCH_PERFMON_PERFCTR1,
+	.evntsel = MSR_ARCH_PERFMON_EVENTSEL1,
 };
 
 static void probe_nmi_watchdog(void)

^ permalink raw reply related	[flat|nested] 119+ messages in thread

* Re: [PATCH] [33/58] x86_64: Avoid too many remote cpu references due to /proc/stat
  2007-07-19  9:55 ` [PATCH] [33/58] x86_64: Avoid too many remote cpu references due to /proc/stat Andi Kleen
@ 2007-07-19 10:21   ` Christoph Hellwig
  2007-07-19 10:41     ` Andi Kleen
  0 siblings, 1 reply; 119+ messages in thread
From: Christoph Hellwig @ 2007-07-19 10:21 UTC (permalink / raw)
  To: Andi Kleen; +Cc: kiran, patches, linux-kernel

On Thu, Jul 19, 2007 at 11:55:19AM +0200, Andi Kleen wrote:
> 
> From: Ravikiran G Thirumalai <kiran@scalex86.org>
> Too many remote cpu references due to /proc/stat.
> 
> On x86_64, with newer kernel versions, kstat_irqs is a bit of a problem.
> On every call to kstat_irqs, the process brings in per-cpu data from all
> online cpus.  Doing this for NR_IRQS, which is now 256 + 32 * NR_CPUS
> results in (256+32*63) * 63 remote cpu references on a 64 cpu config.
> /proc/stat is parsed by common commands like top, who etc, causing
> lots of cacheline transfers
> 
> This statistic seems useless. Other 'big iron' arches disable this.
> Can we disable computing/reporting this statistic?  This piece of
> statistic is not human readable on x86_64 anymore,
> 
> If not, can we optimize computing this statistic so as to avoid
> too many remote references (patch to follow)

If we disable this on x86_64 we should just kill it completely for consistency.

> -#if !defined(CONFIG_PPC64) && !defined(CONFIG_ALPHA) && !defined(CONFIG_IA64)
> +#if !defined(CONFIG_PPC64) && !defined(CONFIG_ALPHA) && !defined(CONFIG_IA64) \
> +					&& !defined(CONFIG_X86_64)
>  	for (i = 0; i < NR_IRQS; i++)
>  		seq_printf(p, " %u", kstat_irqs(i));
>  #endif

^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [PATCH] [5/58] x86_64: Report the pending irq if available in smp_affinity
  2007-07-19  9:54 ` [PATCH] [5/58] x86_64: Report the pending irq if available in smp_affinity Andi Kleen
@ 2007-07-19 10:23   ` Ingo Molnar
  0 siblings, 0 replies; 119+ messages in thread
From: Ingo Molnar @ 2007-07-19 10:23 UTC (permalink / raw)
  To: Andi Kleen; +Cc: tglx, patches, linux-kernel


* Andi Kleen <ak@suse.de> wrote:

> -	int len = cpumask_scnprintf(page, count, irq_desc[(long)data].affinity);
> +	struct irq_desc *desc = irq_desc + (long)data;
> +	cpumask_t *mask = &desc->affinity;
> +	int len;
> +#ifdef CONFIG_GENERIC_PENDING_IRQ
> +	if (desc->status & IRQ_MOVE_PENDING)

small style nit: please put a newline after the type definitions.

	Ingo

^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [PATCH] [20/58] x86: Always probe the NMI watchdog
  2007-07-19  9:55 ` [PATCH] [20/58] x86: Always probe the NMI watchdog Andi Kleen
@ 2007-07-19 10:24   ` Björn Steinbrink
  2007-07-19 10:42     ` Andi Kleen
  0 siblings, 1 reply; 119+ messages in thread
From: Björn Steinbrink @ 2007-07-19 10:24 UTC (permalink / raw)
  To: Andi Kleen; +Cc: patches, linux-kernel


On 2007.07.19 11:55:05 +0200, Andi Kleen wrote:
> 
> From: Björn Steinbrink <B.Steinbrink@gmx.de>
> 
> The performance counter allocator relies on the nmi watchdog being
> probed, so we have to do that even if the watchdog is not enabled.

Are you going to revert your fixes to the msr->bit conversions? Or is
this patch still required with them in place? I actually like your fix,
as it also fixes the semantics of single_msr_reserve() (see other mail).

So, can we just drop this one?

Thanks,
Björn

^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [PATCH] [33/58] x86_64: Avoid too many remote cpu references due to /proc/stat
  2007-07-19 10:21   ` Christoph Hellwig
@ 2007-07-19 10:41     ` Andi Kleen
  2007-07-19 10:55       ` Adrian Bunk
  0 siblings, 1 reply; 119+ messages in thread
From: Andi Kleen @ 2007-07-19 10:41 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: kiran, patches, linux-kernel

On Thursday 19 July 2007 12:21:49 Christoph Hellwig wrote:
> On Thu, Jul 19, 2007 at 11:55:19AM +0200, Andi Kleen wrote:
> > 
> > From: Ravikiran G Thirumalai <kiran@scalex86.org>
> > Too many remote cpu references due to /proc/stat.
> > 
> > On x86_64, with newer kernel versions, kstat_irqs is a bit of a problem.
> > On every call to kstat_irqs, the process brings in per-cpu data from all
> > online cpus.  Doing this for NR_IRQS, which is now 256 + 32 * NR_CPUS
> > results in (256+32*63) * 63 remote cpu references on a 64 cpu config.
> > /proc/stat is parsed by common commands like top, who etc, causing
> > lots of cacheline transfers
> > 
> > This statistic seems useless. Other 'big iron' arches disable this.
> > Can we disable computing/reporting this statistic?  This piece of
> > statistic is not human readable on x86_64 anymore,
> > 
> > If not, can we optimize computing this statistic so as to avoid
> > too many remote references (patch to follow)
> 
> If we disable this on x86_64 we should just kill it completely for consistency.

I guess it's fine on UP-only architectures.  I will change it to !CONFIG_SMP
unless someone complains.

-Andi


^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [PATCH] [20/58] x86: Always probe the NMI watchdog
  2007-07-19 10:24   ` Björn Steinbrink
@ 2007-07-19 10:42     ` Andi Kleen
  0 siblings, 0 replies; 119+ messages in thread
From: Andi Kleen @ 2007-07-19 10:42 UTC (permalink / raw)
  To: Björn Steinbrink; +Cc: patches, linux-kernel


On Thursday 19 July 2007 12:24:05 Björn Steinbrink wrote:
> On 2007.07.19 11:55:05 +0200, Andi Kleen wrote:
> > 
> > From: Björn Steinbrink <B.Steinbrink@gmx.de>
> > 
> > The performance counter allocator relies on the nmi watchdog being
> > probed, so we have to do that even if the watchdog is not enabled.
> 
> Are you going to revert your fixes to the msr->bit conversions? Or is
> this patch still required with them in place? I actually like your fix,
> as it also fixes the semantics of single_msr_reserve() (see other mail).
> 
> So, can we just drop this one?

Sorry, I already dropped it during the final review, but it was still
in the list of patches to send out for review.

-Andi

^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [PATCH] [21/58] i386: Reserve the right performance counter for the Intel PerfMon NMI watchdog
  2007-07-19 10:21   ` Björn Steinbrink
@ 2007-07-19 10:45     ` Andi Kleen
  0 siblings, 0 replies; 119+ messages in thread
From: Andi Kleen @ 2007-07-19 10:45 UTC (permalink / raw)
  To: Björn Steinbrink; +Cc: patches, linux-kernel


On Thursday 19 July 2007 12:21:45 Björn Steinbrink wrote:
> On 2007.07.19 11:55:06 +0200, Andi Kleen wrote:
> > 
> > From: Björn Steinbrink <B.Steinbrink@gmx.de>
> > 
> > The Intel PerfMon NMI watchdog was using the generic reservation
> > function which always reserves the first performance counter. But the
> > watchdog actually uses the second performance counter, thus we need a
> > specialised function.
> 
> Ah, almost forgot about that patch. Actually, thanks to your fix that
> basically reverted the msr->offset conversion to its 2.6.21
> implementation, single_msr_reserve has sane semantics now and does just
> what the name suggests (before, the wd_ops entries had to store the
> "base" msrs, so it was really a first_msr_reserve).
> 
> With wd_ops->perfctr no longer needed to be the base msr, we can just
> fix that value for the arch perfmon watchdog. (And maybe we should
> remove the values for those implementations that don't employ the
> single_msr_reserve() stuff?)

I replaced the patch with the new patch, thanks

-Andi


^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [PATCH] [58/58] x86: remove support for the Rise CPU
  2007-07-19  9:55 ` [PATCH] [58/58] x86: remove support for the Rise CPU Andi Kleen
@ 2007-07-19 10:45   ` Alan Cox
  2007-07-19 10:48     ` Adrian Bunk
  0 siblings, 1 reply; 119+ messages in thread
From: Alan Cox @ 2007-07-19 10:45 UTC (permalink / raw)
  To: Andi Kleen; +Cc: bunk, ak, patches, linux-kernel

> Googling for the printk string "CPU: Rise iDragon" didn't find any dmesg
> available online.
> 
> If it turns out that, against all expectations, there are actually users,
> reverting this patch would be easy.
> 
> This patch will make the kernel images smaller by a few bytes for all
> i386 users.

Why bother?  It's a tiny, tiny amount of code and it requires no maintenance,
so removing it achieves nothing over leaving it alone and risks (slight, I
admit) breaking someone's box.

Alan


^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [PATCH] [58/58] x86: remove support for the Rise CPU
  2007-07-19 10:45   ` Alan Cox
@ 2007-07-19 10:48     ` Adrian Bunk
  2007-07-19 11:13       ` Alan Cox
  0 siblings, 1 reply; 119+ messages in thread
From: Adrian Bunk @ 2007-07-19 10:48 UTC (permalink / raw)
  To: Alan Cox; +Cc: Andi Kleen, patches, linux-kernel

On Thu, Jul 19, 2007 at 11:45:29AM +0100, Alan Cox wrote:
> > Googling for the printk string "CPU: Rise iDragon" didn't find any dmesg
> > available online.
> > 
> > If it turns out that, against all expectations, there are actually users,
> > reverting this patch would be easy.
> > 
> > This patch will make the kernel images smaller by a few bytes for all
> > i386 users.
> 
> Why bother?  It's a tiny, tiny amount of code and it requires no maintenance,
> so removing it achieves nothing over leaving it alone and risks (slight, I
> admit) breaking someone's box.

- It's not only code, it also bloats everyone's kernel image.
- All it did was to fiddle with capabilities - if any computer with
  a Rise cpu running Linux actually exists it should still work.
- It's highly unlikely that we had any user of this code.

> Alan

cu
Adrian

-- 

       "Is there not promise of rain?" Ling Tan asked suddenly out
        of the darkness. There had been need of rain for many days.
       "Only a promise," Lao Er said.
                                       Pearl S. Buck - Dragon Seed


^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [PATCH] [33/58] x86_64: Avoid too many remote cpu references due to /proc/stat
  2007-07-19 10:41     ` Andi Kleen
@ 2007-07-19 10:55       ` Adrian Bunk
  0 siblings, 0 replies; 119+ messages in thread
From: Adrian Bunk @ 2007-07-19 10:55 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Christoph Hellwig, kiran, patches, linux-kernel

On Thu, Jul 19, 2007 at 12:41:28PM +0200, Andi Kleen wrote:
> On Thursday 19 July 2007 12:21:49 Christoph Hellwig wrote:
> > On Thu, Jul 19, 2007 at 11:55:19AM +0200, Andi Kleen wrote:
> > > 
> > > From: Ravikiran G Thirumalai <kiran@scalex86.org>
> > > Too many remote cpu references due to /proc/stat.
> > > 
> > > On x86_64, with newer kernel versions, kstat_irqs is a bit of a problem.
> > > On every call to kstat_irqs, the process brings in per-cpu data from all
> > > online cpus.  Doing this for NR_IRQS, which is now 256 + 32 * NR_CPUS
> > > results in (256+32*63) * 63 remote cpu references on a 64 cpu config.
> > > /proc/stat is parsed by common commands like top, who etc, causing
> > > lots of cacheline transfers
> > > 
> > > This statistic seems useless. Other 'big iron' arches disable this.
> > > Can we disable computing/reporting this statistic?  This piece of
> > > statistic is not human readable on x86_64 anymore,
> > > 
> > > If not, can we optimize computing this statistic so as to avoid
> > > too many remote references (patch to follow)
> > 
> > If we disable this on x86_64 we should just kill it completely for consistency.
> 
> I guess it's fine on UP only architectures.  I will change it to !CONFIG_SMP
> unless someone complains.

Making it depend on the kernel configuration will only cause 
surprises for users. And if you really need the data you can get 
it from /proc/interrupts.

> -Andi

cu
Adrian

-- 

       "Is there not promise of rain?" Ling Tan asked suddenly out
        of the darkness. There had been need of rain for many days.
       "Only a promise," Lao Er said.
                                       Pearl S. Buck - Dragon Seed


^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [PATCH] [14/58] x86_64: Add on_cpu_single
  2007-07-19  9:54 ` [PATCH] [14/58] x86_64: Add on_cpu_single Andi Kleen
@ 2007-07-19 11:09   ` Satyam Sharma
  2007-07-19 12:07     ` Andi Kleen
  0 siblings, 1 reply; 119+ messages in thread
From: Satyam Sharma @ 2007-07-19 11:09 UTC (permalink / raw)
  To: Andi Kleen; +Cc: patches, linux-kernel

Hi Andi,

On 7/19/07, Andi Kleen <ak@suse.de> wrote:
>
> Call a function on a target CPU but do the right thing when
> we're already on that CPU. That's the main difference from
> smp_call_function_single
> which does the wrong thing in this case (erroring out)

I think this is no longer the case, is it? The KVM updates already
merged into the latest mainline -git modified smp_call_function_single()'s
behaviour ...

> +#ifdef CONFIG_SMP
> +/* Similar to smp_call_function_single, but DTRT when we're already
> +   on the right CPU. */
> +static inline void on_cpu_single(int cpu, void (*func)(void *), void *info)
> +{
> +       int me = get_cpu();
> +       if (cpu == me) {
> +               func(info);
> +               put_cpu();
> +       } else {
> +               put_cpu();
> +               /* wait is forced on because the me==cpu case above will always wait */
> +               smp_call_function_single(cpu, func, info, 0, 1);

In any case, this is unsafe. smp_call_function_single() -- with the old
semantics, which is what this patch assumes, obviously -- is quite
pointless unless its _caller_ disables preemption around it. So the
put_cpu() must come after the smp_call_function_single() call; otherwise
you won't even detect an error that might happen, since you're ignoring
its return value and this wrapper returns void.

> +       }
> +}
> +#else
> +static inline void on_cpu_single(int cpu, void (*func)(void *), void *info)
> +{

WARN_ON(irqs_disabled());
local_irq_disable();

> +       func(info);

local_irq_enable();

> +}
> +#endif

... for the sake of API / behaviour consistency.


But probably you should just drop this ... with smp_call_function_single's
new semantics, I don't see this function growing any users.
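
For reference, a minimal sketch (not part of the original patch) of the
ordering suggested above, assuming the old five-argument
smp_call_function_single(): preemption stays disabled across the cross-call
and is only dropped afterwards.

static inline void on_cpu_single(int cpu, void (*func)(void *), void *info)
{
        int me = get_cpu();             /* disables preemption */

        if (cpu == me)
                func(info);
        else
                /* wait is forced on, matching the synchronous cpu == me case */
                smp_call_function_single(cpu, func, info, 0, 1);
        put_cpu();                      /* re-enable preemption last */
}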

Satyam

^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [PATCH] [58/58] x86: remove support for the Rise CPU
  2007-07-19 10:48     ` Adrian Bunk
@ 2007-07-19 11:13       ` Alan Cox
  2007-07-19 12:03         ` Andi Kleen
  0 siblings, 1 reply; 119+ messages in thread
From: Alan Cox @ 2007-07-19 11:13 UTC (permalink / raw)
  To: Adrian Bunk; +Cc: Andi Kleen, patches, linux-kernel

> - It's not only code, it also bloats everyone's kernel image.

It's a minuscule piece of code that is discarded on boot. Yes, it might
make the image 100 bytes longer, but have you priced a 160GB disk
recently?  I don't think 100 bytes of disk and 0 of memory is really worth
saving for any risk at all. It's not even worth the time to apply the
patch.

> - All it did was to fiddle with capabilities - if any computer with
>   a Rise cpu running Linux actually exists it should still work.
> - It's highly unlikely that we had any user of this code.

But you've no idea if this is true. Probably some of our other drivers
are bigger and have fewer users.

Alan

^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [PATCH] [2/58] x86_64: Tell gcc to only align stack to 8 bytes
  2007-07-19  9:54 ` [PATCH] [2/58] x86_64: Tell gcc to only align stack to 8 bytes Andi Kleen
@ 2007-07-19 11:50   ` Serge Belyshev
  2007-07-19 12:06     ` Andi Kleen
  2007-07-19 14:42   ` Chuck Ebbert
  1 sibling, 1 reply; 119+ messages in thread
From: Serge Belyshev @ 2007-07-19 11:50 UTC (permalink / raw)
  To: Andi Kleen; +Cc: patches, linux-kernel

Andi Kleen <ak@suse.de> writes:

> Don't need 16 byte alignment because kernel doesn't use SSE2
>
...
>  cflags-y += -maccumulate-outgoing-args
> +cflags-y += -mpreferred-stack-boundary=4
>  

>From gcc manpage:

       -mpreferred-stack-boundary=num
           Attempt to keep the stack boundary aligned to a 2 raised to num
           byte boundary.  If -mpreferred-stack-boundary is not specified, the
           default is 4 (16 bytes or 128 bits), except when optimizing for
           code size (-Os), in which case the default is the minimum correct
           alignment (4 bytes for x86, and 8 bytes for x86-64).

So -mpreferred-stack-boundary=4 is the default and to align stack
to 8 bytes you want -mpreferred-stack-boundary=3, not 4, IIUC.
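
Spelled out, since the option takes an exponent of two rather than a byte
count:

  2^4 = 16-byte stack alignment  (the current default)
  2^3 =  8-byte stack alignment  (what the patch intends)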

^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [PATCH] [58/58] x86: remove support for the Rise CPU
  2007-07-19 11:13       ` Alan Cox
@ 2007-07-19 12:03         ` Andi Kleen
  2007-07-19 14:56           ` Jeff Garzik
  0 siblings, 1 reply; 119+ messages in thread
From: Andi Kleen @ 2007-07-19 12:03 UTC (permalink / raw)
  To: Alan Cox; +Cc: Adrian Bunk, patches, linux-kernel

On Thursday 19 July 2007 13:13:40 Alan Cox wrote:
> > - It's not only code, it also bloats everyone's kernel image.
> 
> It's a minuscule piece of code that is discarded on boot. Yes, it might
> make the image 100 bytes longer, but have you priced a 160GB disk
> recently?  I don't think 100 bytes of disk and 0 of memory is really worth
> saving for any risk at all. It's not even worth the time to apply the
> patch.

The patch is already applied.

Besides, the CPU will likely boot even without special handling.

-Andi

^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [PATCH] [2/58] x86_64: Tell gcc to only align stack to 8 bytes
  2007-07-19 11:50   ` Serge Belyshev
@ 2007-07-19 12:06     ` Andi Kleen
  0 siblings, 0 replies; 119+ messages in thread
From: Andi Kleen @ 2007-07-19 12:06 UTC (permalink / raw)
  To: Serge Belyshev; +Cc: patches, linux-kernel

On Thursday 19 July 2007 13:50:20 Serge Belyshev wrote:

> So -mpreferred-stack-boundary=4 is the default and to align stack
> to 8 bytes you want -mpreferred-stack-boundary=3, not 4, IIUC.

Fixed thanks

-Andi



^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [PATCH] [14/58] x86_64: Add on_cpu_single
  2007-07-19 11:09   ` Satyam Sharma
@ 2007-07-19 12:07     ` Andi Kleen
  0 siblings, 0 replies; 119+ messages in thread
From: Andi Kleen @ 2007-07-19 12:07 UTC (permalink / raw)
  To: Satyam Sharma; +Cc: patches, linux-kernel


> But probably you should just drop this ... with smp_call_function_single's
> new semantics, I don't see this function growing any users.

The new sched-clock uses it, but I'll update it to use smp_call_function_single.

Thanks

-Andi

^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [PATCH] [19/58] x86_64: Don't use softirq save locks in smp_call_function
  2007-07-19  9:55 ` [PATCH] [19/58] x86_64: Don't use softirq save locks in smp_call_function Andi Kleen
@ 2007-07-19 12:16   ` Satyam Sharma
  2007-07-19 12:19     ` Andi Kleen
  0 siblings, 1 reply; 119+ messages in thread
From: Satyam Sharma @ 2007-07-19 12:16 UTC (permalink / raw)
  To: Andi Kleen; +Cc: patches, linux-kernel

On 7/19/07, Andi Kleen <ak@suse.de> wrote:
>
> It is not fully softirq safe anyways.

Ack

[ sorry, I remember having promised to send such a patch myself
some time ago, but just forgot about it ... ]

> Can't do a WARN_ON unfortunately because it could trigger in the
> panic case.

But this is not true at all. This function doesn't appear anywhere
on the panic codepath.

> +++ linux/arch/x86_64/kernel/smp.c
> @@ -386,9 +386,9 @@ int smp_call_function_single (int cpu, v
>                 return 0;
>         }

So I'd say we do need a:

WARN_ON(irqs_disabled() || in_interrupt());

or something right about here ...

> -       spin_lock_bh(&call_lock);
> +       spin_lock(&call_lock);
>         __smp_call_function_single(cpu, func, info, nonatomic, wait);
> -       spin_unlock_bh(&call_lock);
> +       spin_unlock(&call_lock);
>         put_cpu();
>         return 0;
>  }

And oh, by the way, you can safely go ahead and put that warning
in smp_call_function() *also*.

Note that panic() -> smp_send_stop() calls into the lower-level
__smp_call_function() directly.

So neither smp_call_function() nor smp_call_function_single() is on
the panic codepath -- the warnings there would be okay.

Satyam

^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [PATCH] [19/58] x86_64: Don't use softirq save locks in smp_call_function
  2007-07-19 12:16   ` Satyam Sharma
@ 2007-07-19 12:19     ` Andi Kleen
  0 siblings, 0 replies; 119+ messages in thread
From: Andi Kleen @ 2007-07-19 12:19 UTC (permalink / raw)
  To: Satyam Sharma; +Cc: patches, linux-kernel

On Thursday 19 July 2007 14:16:48 Satyam Sharma wrote:

>
> > Can't do a WARN_ON unfortunately because it could trigger in the
> > panic case.
> 
> But this is not true at all. This function doesn't come anywhere
> on the panic codepath.


You're wrong.

-Andi



^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [PATCH] [2/58] x86_64: Tell gcc to only align stack to 8 bytes
  2007-07-19  9:54 ` [PATCH] [2/58] x86_64: Tell gcc to only align stack to 8 bytes Andi Kleen
  2007-07-19 11:50   ` Serge Belyshev
@ 2007-07-19 14:42   ` Chuck Ebbert
  1 sibling, 0 replies; 119+ messages in thread
From: Chuck Ebbert @ 2007-07-19 14:42 UTC (permalink / raw)
  To: Andi Kleen; +Cc: patches, linux-kernel

On 07/19/2007 05:54 AM, Andi Kleen wrote:
> Don't need 16 byte alignment because kernel doesn't use SSE2
> 
> Signed-off-by: Andi Kleen <ak@suse.de>
> 
> ---
>  arch/x86_64/Makefile |    1 +
>  1 file changed, 1 insertion(+)
> 
> Index: linux/arch/x86_64/Makefile
> ===================================================================
> --- linux.orig/arch/x86_64/Makefile
> +++ linux/arch/x86_64/Makefile
> @@ -55,6 +55,7 @@ cflags-y += $(call cc-option,-mno-sse -m
>  # this works around some issues with generating unwind tables in older gccs
>  # newer gccs do it by default
>  cflags-y += -maccumulate-outgoing-args
> +cflags-y += -mpreferred-stack-boundary=4

Should be:

+cflags-y += -mpreferred-stack-boundary=3

>  
>  # do binutils support CFI?
>  cflags-y += $(call as-instr,.cfi_startproc\n.cfi_endproc,-DCONFIG_AS_CFI=1,)

^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [PATCH] [34/58] x86_64: ia32entry adjustments
  2007-07-19  9:55 ` [PATCH] [34/58] x86_64: ia32entry adjustments Andi Kleen
@ 2007-07-19 14:46   ` Jeff Garzik
  2007-08-06 10:43     ` Jan Beulich
  0 siblings, 1 reply; 119+ messages in thread
From: Jeff Garzik @ 2007-07-19 14:46 UTC (permalink / raw)
  To: Andi Kleen; +Cc: jbeulich, patches, linux-kernel, Andrew Morton

Andi Kleen wrote:
> From: "Jan Beulich" <jbeulich@novell.com>
> Consolidate the three 32-bit system call entry points so that they all
> treat registers in similar ways.
> 
> Signed-off-by: Jan Beulich <jbeulich@novell.com>
> Signed-off-by: Andi Kleen <ak@suse.de>
> 
>  arch/x86_64/ia32/ia32entry.S |    5 +++--
>  1 file changed, 3 insertions(+), 2 deletions(-)
> 
> Index: linux/arch/x86_64/ia32/ia32entry.S
> ===================================================================
> --- linux.orig/arch/x86_64/ia32/ia32entry.S
> +++ linux/arch/x86_64/ia32/ia32entry.S
> @@ -104,7 +104,7 @@ ENTRY(ia32_sysenter_target)
>  	pushq	%rax
>  	CFI_ADJUST_CFA_OFFSET 8
>  	cld
> -	SAVE_ARGS 0,0,0
> +	SAVE_ARGS 0,0,1
>   	/* no need to do an access_ok check here because rbp has been
>   	   32bit zero extended */ 
>  1:	movl	(%rbp),%r9d
> @@ -294,7 +294,7 @@ ia32_badarg:
>   */ 				
>  
>  ENTRY(ia32_syscall)
> -	CFI_STARTPROC	simple
> +	CFI_STARTPROC32	simple
>  	CFI_SIGNAL_FRAME
>  	CFI_DEF_CFA	rsp,SS+8-RIP
>  	/*CFI_REL_OFFSET	ss,SS-RIP*/
> @@ -330,6 +330,7 @@ ia32_sysret:
>  
>  ia32_tracesys:			 
>  	SAVE_REST
> +	CLEAR_RREGS
>  	movq $-ENOSYS,RAX(%rsp)	/* really needed? */
>  	movq %rsp,%rdi        /* &pt_regs -> arg1 */
>  	call syscall_trace_enter

More comments and/or a less vague patch description would be nice.

What registers?  What behavior is being made common?  Why?

	Jeff



^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [PATCH] [58/58] x86: remove support for the Rise CPU
  2007-07-19 12:03         ` Andi Kleen
@ 2007-07-19 14:56           ` Jeff Garzik
  0 siblings, 0 replies; 119+ messages in thread
From: Jeff Garzik @ 2007-07-19 14:56 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Alan Cox, Adrian Bunk, patches, linux-kernel, Andrew Morton

Andi Kleen wrote:
> On Thursday 19 July 2007 13:13:40 Alan Cox wrote:
>>> - It's not only code, it also bloats everyone's kernel image.
>> It's a minuscule piece of code that is discarded on boot. Yes, it might
>> make the image 100 bytes longer, but have you priced a 160GB disk
>> recently?  I don't think 100 bytes of disk and 0 of memory is really worth
>> saving for any risk at all. It's not even worth the time to apply the
>> patch.
> 
> The patch is already applied.

Er, huh?  It's not upstream.


> Besides the CPU will likely boot even without special handling.

You don't know this.  Why risk it?  Just leave the CPU magic as-is.

	Jeff



^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [PATCH] [8/58] x86_64: Use string instruction memcpy/memset on AMD Fam10
  2007-07-19  9:54 ` [PATCH] [8/58] x86_64: Use string instruction memcpy/memset on AMD Fam10 Andi Kleen
@ 2007-07-19 16:43   ` Jan Engelhardt
  2007-07-19 17:00     ` Yinghai Lu
  0 siblings, 1 reply; 119+ messages in thread
From: Jan Engelhardt @ 2007-07-19 16:43 UTC (permalink / raw)
  To: Andi Kleen; +Cc: patches, linux-kernel


On Jul 19 2007 11:54, Andi Kleen wrote:

>Subject: [PATCH] [8/58] x86_64: Use string instruction memcpy/memset on AMD
>    Fam10

What processors carry 0x10?



	Jan
-- 

^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [PATCH] [15/58] i386: Rewrite sched_clock
  2007-07-19  9:54 ` [PATCH] [15/58] i386: Rewrite sched_clock Andi Kleen
@ 2007-07-19 16:51   ` Daniel Walker
  2007-07-19 17:13     ` Andi Kleen
  2007-07-20  3:11     ` Mathieu Desnoyers
  0 siblings, 2 replies; 119+ messages in thread
From: Daniel Walker @ 2007-07-19 16:51 UTC (permalink / raw)
  To: Andi Kleen; +Cc: patches, linux-kernel

On Thu, 2007-07-19 at 11:54 +0200, Andi Kleen wrote:
> Move it into its own file for easy sharing.
> Do everything per CPU. This avoids problems with TSCs that
> tick at different frequencies per CPU.
> Resync properly on cpufreq changes. CPU frequency is instable
> around cpu frequency changes, so fall back to a backing
> clock during this period.
> Hopefully TSC will work now on all systems except when there isn't a
> physical TSC. 
> 
> And
> 
> +From: Jeremy Fitzhardinge <jeremy@goop.org>
> Three cleanups there:
>  - change "instable" -> "unstable"
>  - it's better to use get_cpu_var for getting this cpu's variables
>  - change cycles_2_ns to do the full computation rather than just the
>    tsc->ns scaling.  It's a simpler interface, and it makes the function

What about using the cycles2ns() clocksource helpers?  It would eliminate
the duplication of the shift/multiply math.

> Signed-off-by: Andi Kleen <ak@suse.de>
> 
> ---
>  arch/i386/kernel/Makefile      |    3 
>  arch/i386/kernel/sched-clock.c |  265 +++++++++++++++++++++++++++++++++++++++++
>  arch/i386/kernel/tsc.c         |   74 -----------
>  include/asm-i386/timer.h       |   32 ----
>  include/asm-i386/tsc.h         |    1 
>  5 files changed, 269 insertions(+), 106 deletions(-)
> 
> Index: linux/arch/i386/kernel/sched-clock.c
> ===================================================================
> --- /dev/null
> +++ linux/arch/i386/kernel/sched-clock.c
> @@ -0,0 +1,265 @@
> +/* A fast clock for the scheduler.
> + * Copyright 2007 Andi Kleen SUSE Labs
> + * Subject to the GNU Public License, version 2 only.
> + */
> +#include <linux/init.h>
> +#include <linux/cpu.h>
> +#include <linux/cpufreq.h>
> +#include <linux/kernel.h>
> +#include <linux/percpu.h>
> +#include <linux/ktime.h>
> +#include <linux/hrtimer.h>
> +#include <linux/smp.h>
> +#include <linux/notifier.h>
> +#include <linux/init.h>
> +#include <asm/tsc.h>
> +#include <asm/cpufeature.h>
> +#include <asm/timer.h>
> +
> +/*
> + * convert from cycles(64bits) => nanoseconds (64bits)
> + *  basic equation:
> + *		ns = cycles / (freq / ns_per_sec)
> + *		ns = cycles * (ns_per_sec / freq)
> + *		ns = cycles * (10^9 / (cpu_khz * 10^3))
> + *		ns = cycles * (10^6 / cpu_khz)
> + *
> + *	Then we use scaling math (suggested by george@mvista.com) to get:
> + *		ns = cycles * (10^6 * SC / cpu_khz) / SC
> + *		ns = cycles * cyc2ns_scale / SC
> + *
> + *	And since SC is a constant power of two, we can convert the div
> + *  into a shift.
> + *
> + *  We can use khz divisor instead of mhz to keep a better percision, since
> + *  cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
> + *  (mathieu.desnoyers@polymtl.ca)
> + *
> + *			-johnstul@us.ibm.com "math is hard, lets go shopping!"
> + */
> +
> +#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
> +
> +struct sc_data {
> +	unsigned cyc2ns_scale;
> +	unsigned unstable;
> +	unsigned long long sync_base;		/* TSC or jiffies at syncpoint*/
> +	unsigned long long ns_base;		/* nanoseconds at sync point */
> +	unsigned long long last_val;		/* Last returned value */
> +};
> +
> +static DEFINE_PER_CPU(struct sc_data, sc_data) =
> +	{ .unstable = 1, .sync_base = INITIAL_JIFFIES };
> +
> +static inline u64 __cycles_2_ns(struct sc_data *sc, u64 cyc)
> +{
> +	u64 ns;
> +
> +	cyc -= sc->sync_base;
> +	ns = (cyc * sc->cyc2ns_scale) >> CYC2NS_SCALE_FACTOR;
> +	ns += sc->ns_base;
> +
> +	return ns;
> +}
> +
> +u64 cycles_2_ns(u64 cyc)
> +{
> +	struct sc_data *sc = &get_cpu_var(sc_data);
> +	u64 ns = __cycles_2_ns(sc, cyc);
> +	put_cpu_var(sc_data);
> +	return ns;
> +}
> +
> +/*
> + * Scheduler clock - returns current time in nanosec units.
> + * All data is local to the CPU.
> + * The values are approximately[1] monotonic local to a CPU, but not
> + * between CPUs.   There might be also an occasionally random error,
> + * but not too bad. Between CPUs the values can be non monotonic.
> + *
> + * [1] no attempt to stop CPU instruction reordering, which can hit
> + * in a 100 instruction window or so.
> + *
> + * The clock can be in two states: stable and unstable.
> + * When it is stable we use the TSC per CPU.
> + * When it is unstable we use jiffies as fallback.
> + * stable->unstable->stable transitions can happen regularly
> + * during CPU frequency changes.
> + * There is special code to avoid having the clock jump backwards
> + * when we switch from TSC to jiffies, which needs to keep some state
> + * per CPU. This state is protected against parallel state changes
> + * with interrupts off.
The comment still says something about interrupts off, but that was
removed it looks like.

> + */
> +unsigned long long tsc_sched_clock(void)
> +{
> +	unsigned long long r;
> +	struct sc_data *sc = &get_cpu_var(sc_data);
> +
> +	if (unlikely(sc->unstable)) {
> +		r = (jiffies_64 - sc->sync_base) * (1000000000 / HZ);
> +		r += sc->ns_base;

Looking further down you aren't using this unstable path when the tsc is
just outright unstable (i.e. some Cyrix systems IIRC)? An improvement
over the original code would be to catch the systems that change
frequencies without cpufreq (like the ones that gave Thomas so much
trouble).

> +		/*
> +		 * last_val is used to avoid non monotonity on a
> +		 * stable->unstable transition. Make sure the time
> +		 * never goes to before the last value returned by the
> +		 * TSC clock.
> +		 */
> +		while (r <= sc->last_val) {
> +			rmb();
> +			r = sc->last_val + 1;
> +			rmb();
> +		}
> +		sc->last_val = r;
> +	} else {
> +		rdtscll(r);
> +		r = __cycles_2_ns(sc, r);
> +		sc->last_val = r;
> +	}
> +
> +	put_cpu_var(sc_data);
> +
> +	return r;
> +}
> +
> +/* We need to define a real function for sched_clock, to override the
> +   weak default version */
> +#ifdef CONFIG_PARAVIRT
> +unsigned long long sched_clock(void)
> +{
> +	return paravirt_sched_clock();
> +}
> +#else
> +unsigned long long sched_clock(void)
> +	__attribute__((alias("tsc_sched_clock")));
> +#endif
> +
> +static int no_sc_for_printk;
> +
> +/*
> + * printk clock: when it is known the sc results are very non monotonic
> + * fall back to jiffies for printk. Other sched_clock users are supposed
> + * to handle this.
> + */
> +unsigned long long printk_clock(void)
> +{
> +	if (unlikely(no_sc_for_printk))
> +		return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);
> +	return tsc_sched_clock();
> +}
> +
> +static void resolve_freq(struct cpufreq_freqs *freq)
> +{
> +	if (!freq->new) {
> +		freq->new = cpufreq_get(freq->cpu);
> +		if (!freq->new)
> +			freq->new = tsc_khz;
> +	}
> +}
> +
> +/* Resync with new CPU frequency. Must run on to be synced CPU */
> +static void resync_freq(void *arg)
> +{
> +	struct cpufreq_freqs *freq = (void *)arg;
> +	struct sc_data *sc = &__get_cpu_var(sc_data);
> +
> +	sc->sync_base = jiffies;
> +	if (!cpu_has_tsc) {
> +		sc->unstable = 1;
> +		return;
> +	}
> +	resolve_freq(freq);
> +
> +	/*
> +	 * Handle nesting, but when we're zero multiple calls in a row
> +	 * are ok too and not a bug. This can happen during startup
> +	 * when the different callbacks race with each other.
> +	 */
> +	if (sc->unstable > 0)
> +		sc->unstable--;
> +	if (sc->unstable)
> +		return;
> +
> +	/* Minor race window here, but should not add significant errors. */
> +	sc->ns_base = ktime_to_ns(ktime_get());
> +	rdtscll(sc->sync_base);
> +	sc->cyc2ns_scale = (1000000 << CYC2NS_SCALE_FACTOR) / freq->new;
> +}
> +
> +static void resync_freq_on_cpu(void *arg)
> +{
> +	struct cpufreq_freqs f = { .new = 0 };
> +
> +	f.cpu = get_cpu();
> +	resync_freq(&f);
> +	put_cpu();
> +}
> +
> +static int sc_freq_event(struct notifier_block *nb, unsigned long event,
> +			 void *data)
> +{
> +	struct cpufreq_freqs *freq = data;
> +	struct sc_data *sc = &per_cpu(sc_data, freq->cpu);
> +
> +	if (cpu_has(&cpu_data[freq->cpu], X86_FEATURE_CONSTANT_TSC))
> +		return NOTIFY_DONE;
> +	if (freq->old == freq->new)
> +		return NOTIFY_DONE;
> +
> +	switch (event) {
> +	case CPUFREQ_SUSPENDCHANGE:
> +		/* Mark TSC unstable during suspend/resume */
> +	case CPUFREQ_PRECHANGE:
> +		/*
> +		 * Mark TSC as unstable until cpu frequency change is
> +		 * done because we don't know when exactly it will
> +		 * change.  unstable in used as a counter to guard
> +		 * against races between the cpu frequency notifiers
> +		 * and normal resyncs
> +		 */
> +		sc->unstable++;
> +		/* FALL THROUGH */
> +	case CPUFREQ_RESUMECHANGE:
> +	case CPUFREQ_POSTCHANGE:
> +		/*
> +		 * Frequency change or resume is done -- update everything and
> +		 * mark TSC as stable again.
> +		 */
> +		on_cpu_single(freq->cpu, resync_freq, freq);
> +		break;
> +	}
> +	return NOTIFY_DONE;
> +}
> +
> +static struct notifier_block sc_freq_notifier = {
> +	.notifier_call = sc_freq_event
> +};
> +
> +static int __cpuinit
> +sc_cpu_event(struct notifier_block *self, unsigned long event, void *hcpu)
> +{
> +	long cpu = (long)hcpu;
> +	if (event == CPU_ONLINE) {
> +		struct cpufreq_freqs f = { .cpu = cpu, .new = 0 };
> +
> +		on_cpu_single(cpu, resync_freq, &f);
> +	}
> +	return NOTIFY_DONE;
> +}
> +
> +static __init int init_sched_clock(void)
> +{
> +	if (unsynchronized_tsc())
> +		no_sc_for_printk = 1;
> +
> +	/*
> +	 * On a race between the various events the initialization
> +	 * might be done multiple times, but code is tolerant to
> +	 * this .
> +	 */
> +	cpufreq_register_notifier(&sc_freq_notifier,
> +				CPUFREQ_TRANSITION_NOTIFIER);
> +	hotcpu_notifier(sc_cpu_event, 0);
> +	on_each_cpu(resync_freq_on_cpu, NULL, 0, 0);
> +	return 0;
> +}
> +core_initcall(init_sched_clock);
> Index: linux/arch/i386/kernel/tsc.c
> ===================================================================
> --- linux.orig/arch/i386/kernel/tsc.c
> +++ linux/arch/i386/kernel/tsc.c
> @@ -63,74 +63,6 @@ static inline int check_tsc_unstable(voi
>  	return tsc_unstable;
>  }
>  
> -/* Accellerators for sched_clock()
> - * convert from cycles(64bits) => nanoseconds (64bits)
> - *  basic equation:
> - *		ns = cycles / (freq / ns_per_sec)
> - *		ns = cycles * (ns_per_sec / freq)
> - *		ns = cycles * (10^9 / (cpu_khz * 10^3))
> - *		ns = cycles * (10^6 / cpu_khz)
> - *
> - *	Then we use scaling math (suggested by george@mvista.com) to get:
> - *		ns = cycles * (10^6 * SC / cpu_khz) / SC
> - *		ns = cycles * cyc2ns_scale / SC
> - *
> - *	And since SC is a constant power of two, we can convert the div
> - *  into a shift.
> - *
> - *  We can use khz divisor instead of mhz to keep a better percision, since
> - *  cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
> - *  (mathieu.desnoyers@polymtl.ca)
> - *
> - *			-johnstul@us.ibm.com "math is hard, lets go shopping!"
> - */
> -unsigned long cyc2ns_scale __read_mostly;
> -
> -#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
> -
> -static inline void set_cyc2ns_scale(unsigned long cpu_khz)
> -{
> -	cyc2ns_scale = (1000000 << CYC2NS_SCALE_FACTOR)/cpu_khz;
> -}
> -
> -/*
> - * Scheduler clock - returns current time in nanosec units.
> - */
> -unsigned long long native_sched_clock(void)
> -{
> -	unsigned long long this_offset;
> -
> -	/*
> -	 * Fall back to jiffies if there's no TSC available:
> -	 * ( But note that we still use it if the TSC is marked
> -	 *   unstable. We do this because unlike Time Of Day,
> -	 *   the scheduler clock tolerates small errors and it's
> -	 *   very important for it to be as fast as the platform
> -	 *   can achive it. )
> -	 */
> -	if (unlikely(!tsc_enabled && !tsc_unstable))
> -		/* No locking but a rare wrong value is not a big deal: */
> -		return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);
> -
> -	/* read the Time Stamp Counter: */
> -	rdtscll(this_offset);
> -
> -	/* return the value in ns */
> -	return cycles_2_ns(this_offset);
> -}
> -
> -/* We need to define a real function for sched_clock, to override the
> -   weak default version */
> -#ifdef CONFIG_PARAVIRT
> -unsigned long long sched_clock(void)
> -{
> -	return paravirt_sched_clock();
> -}
> -#else
> -unsigned long long sched_clock(void)
> -	__attribute__((alias("native_sched_clock")));
> -#endif
> -
>  unsigned long native_calculate_cpu_khz(void)
>  {
>  	unsigned long long start, end;
> @@ -238,11 +170,6 @@ time_cpufreq_notifier(struct notifier_bl
>  						ref_freq, freq->new);
>  			if (!(freq->flags & CPUFREQ_CONST_LOOPS)) {
>  				tsc_khz = cpu_khz;
> -				set_cyc2ns_scale(cpu_khz);
> -				/*
> -				 * TSC based sched_clock turns
> -				 * to junk w/ cpufreq
> -				 */
>  				mark_tsc_unstable("cpufreq changes");
>  			}
>  		}
> @@ -380,7 +307,6 @@ void __init tsc_init(void)
>  				(unsigned long)cpu_khz / 1000,
>  				(unsigned long)cpu_khz % 1000);
>  
> -	set_cyc2ns_scale(cpu_khz);
>  	use_tsc_delay();
>  
>  	/* Check and install the TSC clocksource */
> Index: linux/arch/i386/kernel/Makefile
> ===================================================================
> --- linux.orig/arch/i386/kernel/Makefile
> +++ linux/arch/i386/kernel/Makefile
> @@ -7,7 +7,8 @@ extra-y := head.o init_task.o vmlinux.ld
>  obj-y	:= process.o signal.o entry.o traps.o irq.o \
>  		ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_i386.o \
>  		pci-dma.o i386_ksyms.o i387.o bootflag.o e820.o\
> -		quirks.o i8237.o topology.o alternative.o i8253.o tsc.o
> +		quirks.o i8237.o topology.o alternative.o i8253.o tsc.o \
> +		sched-clock.o
>  
>  obj-$(CONFIG_STACKTRACE)	+= stacktrace.o
>  obj-y				+= cpu/
> Index: linux/include/asm-i386/timer.h
> ===================================================================
> --- linux.orig/include/asm-i386/timer.h
> +++ linux/include/asm-i386/timer.h
> @@ -6,7 +6,6 @@
>  #define TICK_SIZE (tick_nsec / 1000)
>  
>  void setup_pit_timer(void);
> -unsigned long long native_sched_clock(void);
>  unsigned long native_calculate_cpu_khz(void);
>  
>  extern int timer_ack;
> @@ -18,35 +17,6 @@ extern int recalibrate_cpu_khz(void);
>  #define calculate_cpu_khz() native_calculate_cpu_khz()
>  #endif
>  
> -/* Accellerators for sched_clock()
> - * convert from cycles(64bits) => nanoseconds (64bits)
> - *  basic equation:
> - *		ns = cycles / (freq / ns_per_sec)
> - *		ns = cycles * (ns_per_sec / freq)
> - *		ns = cycles * (10^9 / (cpu_khz * 10^3))
> - *		ns = cycles * (10^6 / cpu_khz)
> - *
> - *	Then we use scaling math (suggested by george@mvista.com) to get:
> - *		ns = cycles * (10^6 * SC / cpu_khz) / SC
> - *		ns = cycles * cyc2ns_scale / SC
> - *
> - *	And since SC is a constant power of two, we can convert the div
> - *  into a shift.
> - *
> - *  We can use khz divisor instead of mhz to keep a better percision, since
> - *  cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
> - *  (mathieu.desnoyers@polymtl.ca)
> - *
> - *			-johnstul@us.ibm.com "math is hard, lets go shopping!"
> - */
> -extern unsigned long cyc2ns_scale __read_mostly;
> -
> -#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
> -
> -static inline unsigned long long cycles_2_ns(unsigned long long cyc)
> -{
> -	return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR;
> -}
> -
> +u64 cycles_2_ns(u64 cyc);
>  
>  #endif
> Index: linux/include/asm-i386/tsc.h
> ===================================================================
> --- linux.orig/include/asm-i386/tsc.h
> +++ linux/include/asm-i386/tsc.h
> @@ -63,6 +63,7 @@ extern void tsc_init(void);
>  extern void mark_tsc_unstable(char *reason);
>  extern int unsynchronized_tsc(void);
>  extern void init_tsc_clocksource(void);
> +extern unsigned long long tsc_sched_clock(void);
>  
>  /*
>   * Boot-time check whether the TSCs are synchronized across
> -
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/


^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [PATCH] [8/58] x86_64: Use string instruction memcpy/memset on AMD Fam10
  2007-07-19 16:43   ` Jan Engelhardt
@ 2007-07-19 17:00     ` Yinghai Lu
  0 siblings, 0 replies; 119+ messages in thread
From: Yinghai Lu @ 2007-07-19 17:00 UTC (permalink / raw)
  To: Jan Engelhardt; +Cc: Andi Kleen, patches, linux-kernel

On 7/19/07, Jan Engelhardt <jengelh@computergmbh.de> wrote:
>
> On Jul 19 2007 11:54, Andi Kleen wrote:
>
> >Subject: [PATCH] [8/58] x86_64: Use string instruction memcpy/memset on AMD
> >    Fam10
>
> What processors carry 0x10?

Quad core Opteron.
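
Just for reference, the kernel only needs the family number for this; a
sketch of the kind of check involved (not the exact hunk from the patch):

	/* in init_amd(): c->x86 already holds the extended family */
	if (c->x86 == 0x10)
		set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability);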

YH

^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [PATCH] [15/58] i386: Rewrite sched_clock
  2007-07-19 16:51   ` Daniel Walker
@ 2007-07-19 17:13     ` Andi Kleen
  2007-07-19 17:15       ` Daniel Walker
  2007-07-20  3:11     ` Mathieu Desnoyers
  1 sibling, 1 reply; 119+ messages in thread
From: Andi Kleen @ 2007-07-19 17:13 UTC (permalink / raw)
  To: Daniel Walker; +Cc: patches, linux-kernel


> What about using the cycles2ns() clocksource helpers, it would eliminate
> the duplication of the shift/multiply math .

They are completely different from what clocksource provides.

-Andi

^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [PATCH] [15/58] i386: Rewrite sched_clock
  2007-07-19 17:13     ` Andi Kleen
@ 2007-07-19 17:15       ` Daniel Walker
  2007-07-19 17:22         ` Andi Kleen
  0 siblings, 1 reply; 119+ messages in thread
From: Daniel Walker @ 2007-07-19 17:15 UTC (permalink / raw)
  To: Andi Kleen; +Cc: patches, linux-kernel

On Thu, 2007-07-19 at 19:13 +0200, Andi Kleen wrote:
> > What about using the cycles2ns() clocksource helpers, it would eliminate
> > the duplication of the shift/multiply math .
> 
> They are completely different from what clocksource provides.

How so?

Daniel


^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [PATCH] [7/58] x86_64: various cleanups in NUMA scan node
  2007-07-19  9:54 ` [PATCH] [7/58] x86_64: various cleanups in NUMA scan node Andi Kleen
@ 2007-07-19 17:15   ` Yinghai Lu
  2007-07-19 17:21     ` Andi Kleen
  2007-07-19 21:01     ` David Rientjes
  0 siblings, 2 replies; 119+ messages in thread
From: Yinghai Lu @ 2007-07-19 17:15 UTC (permalink / raw)
  To: Andi Kleen; +Cc: rientjes, patches, linux-kernel

On 7/19/07, Andi Kleen <ak@suse.de> wrote:
>
> From: David Rientjes <rientjes@google.com>
> In acpi_scan_nodes(), we immediately return -1 if acpi_numa <= 0, meaning
> we haven't detected any underlying ACPI topology or we have explicitly
> disabled its use from the command-line with numa=noacpi.
>
> acpi_table_print_srat_entry() and acpi_table_parse_srat() are only
> referenced within drivers/acpi/numa.c, so we can mark them as static and
> remove their prototypes from the header file.
>
> Likewise, pxm_to_node_map[] and node_to_pxm_map[] are only used within
> drivers/acpi/numa.c, so we mark them as static and remove their externs
> from the header file.
>
> The automatic 'result' variable is unused in acpi_numa_init(), so it's
> removed.
>
> Signed-off-by: David Rientjes <rientjes@google.com>
> Signed-off-by: Andi Kleen <ak@suse.de>
>
> ---
>  arch/x86_64/mm/srat.c |    6 +++---
>  drivers/acpi/numa.c   |   20 ++++++++++----------
>  include/linux/acpi.h  |    2 --
>  3 files changed, 13 insertions(+), 15 deletions(-)
>
> Index: linux/arch/x86_64/mm/srat.c
> ===================================================================
> --- linux.orig/arch/x86_64/mm/srat.c
> +++ linux/arch/x86_64/mm/srat.c
>  /* maps to convert between proximity domain and logical node ID */
> -static int pxm_to_node_map[MAX_PXM_DOMAINS]
> +static int __cpuinitdata pxm_to_node_map[MAX_PXM_DOMAINS]
>                                 = { [0 ... MAX_PXM_DOMAINS - 1] = NID_INVAL };
> -static int node_to_pxm_map[MAX_NUMNODES]
> +static int __cpuinitdata node_to_pxm_map[MAX_NUMNODES]
>                                 = { [0 ... MAX_NUMNODES - 1] = PXM_INVAL };
>
do we need to put __initdata just before =?

YH

^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [PATCH] [7/58] x86_64: various cleanups in NUMA scan node
  2007-07-19 17:15   ` Yinghai Lu
@ 2007-07-19 17:21     ` Andi Kleen
  2007-07-19 17:38       ` Yinghai Lu
  2007-07-19 21:01     ` David Rientjes
  1 sibling, 1 reply; 119+ messages in thread
From: Andi Kleen @ 2007-07-19 17:21 UTC (permalink / raw)
  To: Yinghai Lu; +Cc: rientjes, patches, linux-kernel


> do we need to put __initdata just before =?

AFAIK gcc __attribute__ syntax allows both. It certainly seems to compile.

-Andi

^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [PATCH] [15/58] i386: Rewrite sched_clock
  2007-07-19 17:15       ` Daniel Walker
@ 2007-07-19 17:22         ` Andi Kleen
  2007-07-19 17:31           ` Daniel Walker
  0 siblings, 1 reply; 119+ messages in thread
From: Andi Kleen @ 2007-07-19 17:22 UTC (permalink / raw)
  To: Daniel Walker; +Cc: patches, linux-kernel

On Thursday 19 July 2007 19:15:38 Daniel Walker wrote:
> On Thu, 2007-07-19 at 19:13 +0200, Andi Kleen wrote:
> > > What about using the cycles2ns() clocksource helpers, it would eliminate
> > > the duplication of the shift/multiply math .
> > 
> > They are completely different from what clocksource provides.
> 
> How so?

The new sched_clock works CPU-locally and relative to the last sync point.

-Andi

^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [PATCH] [15/58] i386: Rewrite sched_clock
  2007-07-19 17:22         ` Andi Kleen
@ 2007-07-19 17:31           ` Daniel Walker
  2007-07-19 17:38             ` Andi Kleen
  0 siblings, 1 reply; 119+ messages in thread
From: Daniel Walker @ 2007-07-19 17:31 UTC (permalink / raw)
  To: Andi Kleen; +Cc: patches, linux-kernel

On Thu, 2007-07-19 at 19:22 +0200, Andi Kleen wrote:
> On Thursday 19 July 2007 19:15:38 Daniel Walker wrote:
> > On Thu, 2007-07-19 at 19:13 +0200, Andi Kleen wrote:
> > > > What about using the cycles2ns() clocksource helpers, it would eliminate
> > > > the duplication of the shift/multiply math .
> > > 
> > > They are completely different from what clocksource provides.
> > 
> > How so?
> 
> The new sched_clock works CPU-locally and relative to the last sync point.
> 

Right, I guess I'm speaking more low-level than that .. Both functions do
shift-multiply style math .. So between the two, the cycles to
nanoseconds conversion code is duplicated, and the code to calculate the
multiply value is duplicated ..

From my perspective a downside to sched_clock is that the math is
duplicated per architecture .. I think it would be a win to use the
generic functions if it's possible..

Daniel


^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [PATCH] [7/58] x86_64: various cleanups in NUMA scan node
  2007-07-19 17:21     ` Andi Kleen
@ 2007-07-19 17:38       ` Yinghai Lu
  2007-07-19 20:00         ` Andi Kleen
  0 siblings, 1 reply; 119+ messages in thread
From: Yinghai Lu @ 2007-07-19 17:38 UTC (permalink / raw)
  To: Andi Kleen; +Cc: rientjes, patches, linux-kernel

On 7/19/07, Andi Kleen <ak@suse.de> wrote:
>
> > do we need to put __initdata just before =?
>
> AFAIK gcc __attribute__ syntax allows both. It certainly seems to compile.

in include/linux/init.h

it said

"
 * For initialized data:
 * You should insert __initdata between the variable name and equal
 * sign followed by value, e.g.:
 *
 * static int init_variable __initdata = 0;
 * static char linux_logo[] __initdata = { 0x32, 0x36, ... };
"

or we need to update these lines?

YH

^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [PATCH] [15/58] i386: Rewrite sched_clock
  2007-07-19 17:31           ` Daniel Walker
@ 2007-07-19 17:38             ` Andi Kleen
  2007-07-19 17:43               ` Daniel Walker
  0 siblings, 1 reply; 119+ messages in thread
From: Andi Kleen @ 2007-07-19 17:38 UTC (permalink / raw)
  To: Daniel Walker; +Cc: patches, linux-kernel

On Thursday 19 July 2007 19:31:56 Daniel Walker wrote:

> From my perspective a downside to sched_clock is that the math is
> duplicated per architecture .. I think it would be a win to use the
> generic functions if it's possible..

They can't be used because they're not cpu local. The whole basic 
concept behind the new sched_clock is to be cpu local. 

-Andi
 

^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [PATCH] [15/58] i386: Rewrite sched_clock
  2007-07-19 17:38             ` Andi Kleen
@ 2007-07-19 17:43               ` Daniel Walker
  2007-07-19 18:00                 ` Andi Kleen
  0 siblings, 1 reply; 119+ messages in thread
From: Daniel Walker @ 2007-07-19 17:43 UTC (permalink / raw)
  To: Andi Kleen; +Cc: patches, linux-kernel

On Thu, 2007-07-19 at 19:38 +0200, Andi Kleen wrote:
> On Thursday 19 July 2007 19:31:56 Daniel Walker wrote:
> 
> > From my perspective a downside to sched_clock is that the math is
> > duplicated per architecture .. I think it would be a win to use the
> > generic functions if it's possible..
> 
> They can't be used because they're not cpu local. The whole basic 
> concept behind the new sched_clock is to be cpu local. 

You're not following me .. The CPU localness is retained in the multiply
value, which is a component of the math .. It's got nothing to do with the
conversion code itself.

You do the same operation to convert from cycles to nanoseconds
regardless of the values you use. Example,

+static inline u64 __cycles_2_ns(struct sc_data *sc, u64 cyc)
+{
+       u64 ns;
+
+       cyc -= sc->sync_base;
+       ns = (cyc * sc->cyc2ns_scale) >> CYC2NS_SCALE_FACTOR;
+       ns += sc->ns_base;
+
+       return ns;
+}

Above, the line "(cyc * sc->cyc2ns_scale) >> CYC2NS_SCALE_FACTOR;" is
part of the duplication I'm referring to, not the surrounding code.

Which looks very much like this,

static inline s64 cyc2ns(struct clocksource *cs, cycle_t cycles)
{
        u64 ret = (u64)cycles;
        ret = (ret * cs->mult) >> cs->shift;
        return ret;
}

Daniel


^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [PATCH] [15/58] i386: Rewrite sched_clock
  2007-07-19 18:00                 ` Andi Kleen
@ 2007-07-19 18:00                   ` Daniel Walker
  0 siblings, 0 replies; 119+ messages in thread
From: Daniel Walker @ 2007-07-19 18:00 UTC (permalink / raw)
  To: Andi Kleen; +Cc: patches, linux-kernel

On Thu, 2007-07-19 at 20:00 +0200, Andi Kleen wrote:
> On Thursday 19 July 2007 19:43:49 Daniel Walker wrote:
> > On Thu, 2007-07-19 at 19:38 +0200, Andi Kleen wrote:
> > > On Thursday 19 July 2007 19:31:56 Daniel Walker wrote:
> > > 
> > > > From my perspective a downside to sched_clock is that the math is
> > > > duplicated per architecture .. I think it would be a win to use the
> > > > generic functions if it's possible..
> > > 
> > > They can't be used because they're not cpu local. The whole basic 
> > > concept behind the new sched_clock is to be cpu local. 
> > 
> > You're not following me .. 
> 
> Because you don't make much sense. You're really asking me to factor
> a single multiplication and a shift (two CPU instructions!) out to share?

Yes, but I also said that was only part of the duplication. You're already
doing a rewrite; there is no reason not to add this..

Daniel


^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [PATCH] [15/58] i386: Rewrite sched_clock
  2007-07-19 17:43               ` Daniel Walker
@ 2007-07-19 18:00                 ` Andi Kleen
  2007-07-19 18:00                   ` Daniel Walker
  0 siblings, 1 reply; 119+ messages in thread
From: Andi Kleen @ 2007-07-19 18:00 UTC (permalink / raw)
  To: Daniel Walker; +Cc: patches, linux-kernel

On Thursday 19 July 2007 19:43:49 Daniel Walker wrote:
> On Thu, 2007-07-19 at 19:38 +0200, Andi Kleen wrote:
> > On Thursday 19 July 2007 19:31:56 Daniel Walker wrote:
> > 
> > > From my perspective a downside to sched_clock is that the math is
> > > duplicated per architecture .. I think it would be a win to use the
> > > generic functions if it's possible..
> > 
> > They can't be used because they're not cpu local. The whole basic 
> > concept behind the new sched_clock is to be cpu local. 
> 
> You're not following me .. 

Because you don't make much sense. You're really asking me to factor
a single multiplication and a shift (two CPU instructions!) out to share?

-Andi

^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [PATCH] [7/58] x86_64: various cleanups in NUMA scan node
  2007-07-19 17:38       ` Yinghai Lu
@ 2007-07-19 20:00         ` Andi Kleen
  0 siblings, 0 replies; 119+ messages in thread
From: Andi Kleen @ 2007-07-19 20:00 UTC (permalink / raw)
  To: Yinghai Lu; +Cc: rientjes, patches, linux-kernel


> "
>  * For initialized data:
>  * You should insert __initdata between the variable name and equal
>  * sign followed by value, e.g.:
>  *
>  * static int init_variable __initdata = 0;
>  * static char linux_logo[] __initdata = { 0x32, 0x36, ... };
> "
> 
> or we need to update these lines?

This might date back to old compilers that are now dropped (like 2.95).

But recommending it this way is probably not bad; it's just not a catastrophe
to not follow the recommendation.

-Andi

^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [PATCH] [7/58] x86_64: various cleanups in NUMA scan node
  2007-07-19 17:15   ` Yinghai Lu
  2007-07-19 17:21     ` Andi Kleen
@ 2007-07-19 21:01     ` David Rientjes
  1 sibling, 0 replies; 119+ messages in thread
From: David Rientjes @ 2007-07-19 21:01 UTC (permalink / raw)
  To: Yinghai Lu; +Cc: Andi Kleen, patches, linux-kernel

On Thu, 19 Jul 2007, Yinghai Lu wrote:

> > Index: linux/arch/x86_64/mm/srat.c
> > ===================================================================
> > --- linux.orig/arch/x86_64/mm/srat.c
> > +++ linux/arch/x86_64/mm/srat.c
> >  /* maps to convert between proximity domain and logical node ID */
> > -static int pxm_to_node_map[MAX_PXM_DOMAINS]
> > +static int __cpuinitdata pxm_to_node_map[MAX_PXM_DOMAINS]
> >                                 = { [0 ... MAX_PXM_DOMAINS - 1] = NID_INVAL
> > };
> > -static int node_to_pxm_map[MAX_NUMNODES]
> > +static int __cpuinitdata node_to_pxm_map[MAX_NUMNODES]
> >                                 = { [0 ... MAX_NUMNODES - 1] = PXM_INVAL };
> > 
> do we need to put __initdata just before =?
> 

You mangled the quoting of this patch: the deltas above are actually in 
drivers/acpi/numa.c and not arch/x86_64/mm/srat.c.

The placement of __cpuinitdata as shown above is permitted by gcc for a
section attribute.  I've been corrected by akpm before when I've written
function declarations such as "static __init int foo()" in preference to
the attribute syntax following all type qualifiers, but both forms are
proper syntax.  It's simply a matter of coding style; the semantics of the
construct are identical.
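
As a quick illustration (made-up variables, not hunks from the patch),
gcc accepts the section attribute in either position and the resulting
section placement is the same:

	/* attribute after the declarator, as init.h recommends */
	static int foo_map[4] __cpuinitdata = { 0, 1, 2, 3 };

	/* attribute between type and name, as in the patch */
	static int __cpuinitdata bar_map[4] = { 0, 1, 2, 3 };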

		David

^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [PATCH] [15/58] i386: Rewrite sched_clock
  2007-07-19 16:51   ` Daniel Walker
  2007-07-19 17:13     ` Andi Kleen
@ 2007-07-20  3:11     ` Mathieu Desnoyers
  2007-07-20  3:47       ` Mathieu Desnoyers
  2007-07-20  8:27       ` [PATCH] [15/58] i386: Rewrite sched_clock Andi Kleen
  1 sibling, 2 replies; 119+ messages in thread
From: Mathieu Desnoyers @ 2007-07-20  3:11 UTC (permalink / raw)
  To: Andi Kleen; +Cc: patches, linux-kernel, Daniel Walker

* Daniel Walker (dwalker@mvista.com) wrote:
> On Thu, 2007-07-19 at 11:54 +0200, Andi Kleen wrote:
> > Move it into an own file for easy sharing.
> > Do everything per CPU. This avoids problems with TSCs that
> > tick at different frequencies per CPU.
> > Resync properly on cpufreq changes. CPU frequency is instable
> > around cpu frequency changing, so fall back during a backing
> > clock during this period.
> > Hopefully TSC will work now on all systems except when there isn't a
> > physical TSC. 
> > 
> > And
> > 
> > +From: Jeremy Fitzhardinge <jeremy@goop.org>
> > Three cleanups there:
> >  - change "instable" -> "unstable"
> >  - it's better to use get_cpu_var for getting this cpu's variables
> >  - change cycles_2_ns to do the full computation rather than just the
> >    tsc->ns scaling.  It's a simpler interface, and it makes the function
....
> > +/*
> > + * Scheduler clock - returns current time in nanosec units.
> > + * All data is local to the CPU.
> > + * The values are approximately[1] monotonic local to a CPU, but not
> > + * between CPUs.   There might be also an occasionally random error,
> > + * but not too bad. Between CPUs the values can be non monotonic.
> > + *
> > + * [1] no attempt to stop CPU instruction reordering, which can hit
> > + * in a 100 instruction window or so.
> > + *
> > + * The clock can be in two states: stable and unstable.
> > + * When it is stable we use the TSC per CPU.
> > + * When it is unstable we use jiffies as fallback.
> > + * stable->unstable->stable transitions can happen regularly
> > + * during CPU frequency changes.
> > + * There is special code to avoid having the clock jump backwards
> > + * when we switch from TSC to jiffies, which needs to keep some state
> > + * per CPU. This state is protected against parallel state changes
> > + * with interrupts off.
> The comment still says something about interrupts off, but that was
> removed it looks like.
> 

I noticed the same thing about interrupts off when going through the
code. Andi, since you are already playing with per cpu variables, you
could leverage asm/local.h there by declaring last_val as local_t and
use either local_cmpxchg or local_add_return (depending on your needs)
to get both better performance than cli/sti _and_ be really atomic.
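
A minimal sketch of what I mean, assuming x86_64 where local_t is 64 bits
wide (the names are made up; this is not meant as the actual patch):

	#include <linux/percpu.h>
	#include <asm/local.h>

	/* per-CPU last value returned; a local_t so the clamp below can
	 * use local_cmpxchg instead of relying on interrupts being off */
	static DEFINE_PER_CPU(local_t, sc_last_val);

	/* caller already has preemption disabled via get_cpu_var() */
	static u64 clamp_monotonic(u64 r)
	{
		local_t *lv = &__get_cpu_var(sc_last_val);
		long old;

		do {
			old = local_read(lv);
			if (r <= (u64)old)
				r = (u64)old + 1;
		} while (local_cmpxchg(lv, old, (long)r) != old);

		return r;
	}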

See this thread for performance tests:
http://www.ussg.iu.edu/hypermail/linux/kernel/0707.1/0832.html

Mathieu

> > + */
> > +unsigned long long tsc_sched_clock(void)
> > +{
> > +	unsigned long long r;
> > +	struct sc_data *sc = &get_cpu_var(sc_data);
> > +
> > +	if (unlikely(sc->unstable)) {
> > +		r = (jiffies_64 - sc->sync_base) * (1000000000 / HZ);
> > +		r += sc->ns_base;
> 
> Looking further down you aren't using this unstable path when the tsc is
> just outright unstable (i.e. some Cyrix systems IIRC)? An improvement
> over the original code would be to catch the systems that change
> frequencies without cpufreq (like the ones that gave Thomas so much
> trouble).
> 
> > +		/*
> > +		 * last_val is used to avoid non monotonity on a
> > +		 * stable->unstable transition. Make sure the time
> > +		 * never goes to before the last value returned by the
> > +		 * TSC clock.
> > +		 */
> > +		while (r <= sc->last_val) {
> > +			rmb();
> > +			r = sc->last_val + 1;
> > +			rmb();
> > +		}
> > +		sc->last_val = r;
> > +	} else {
> > +		rdtscll(r);
> > +		r = __cycles_2_ns(sc, r);
> > +		sc->last_val = r;
> > +	}
> > +
> > +	put_cpu_var(sc_data);
> > +
> > +	return r;
> > +}
> > +
> > +/* We need to define a real function for sched_clock, to override the
> > +   weak default version */
> > +#ifdef CONFIG_PARAVIRT
> > +unsigned long long sched_clock(void)
> > +{
> > +	return paravirt_sched_clock();
> > +}
> > +#else
> > +unsigned long long sched_clock(void)
> > +	__attribute__((alias("tsc_sched_clock")));
> > +#endif
> > +
> > +static int no_sc_for_printk;
> > +
> > +/*
> > + * printk clock: when it is known the sc results are very non monotonic
> > + * fall back to jiffies for printk. Other sched_clock users are supposed
> > + * to handle this.
> > + */
> > +unsigned long long printk_clock(void)
> > +{
> > +	if (unlikely(no_sc_for_printk))
> > +		return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);
> > +	return tsc_sched_clock();
> > +}
> > +
> > +static void resolve_freq(struct cpufreq_freqs *freq)
> > +{
> > +	if (!freq->new) {
> > +		freq->new = cpufreq_get(freq->cpu);
> > +		if (!freq->new)
> > +			freq->new = tsc_khz;
> > +	}
> > +}
> > +
> > +/* Resync with new CPU frequency. Must run on to be synced CPU */
> > +static void resync_freq(void *arg)
> > +{
> > +	struct cpufreq_freqs *freq = (void *)arg;
> > +	struct sc_data *sc = &__get_cpu_var(sc_data);
> > +
> > +	sc->sync_base = jiffies;
> > +	if (!cpu_has_tsc) {
> > +		sc->unstable = 1;
> > +		return;
> > +	}
> > +	resolve_freq(freq);
> > +
> > +	/*
> > +	 * Handle nesting, but when we're zero multiple calls in a row
> > +	 * are ok too and not a bug. This can happen during startup
> > +	 * when the different callbacks race with each other.
> > +	 */
> > +	if (sc->unstable > 0)
> > +		sc->unstable--;
> > +	if (sc->unstable)
> > +		return;
> > +
> > +	/* Minor race window here, but should not add significant errors. */
> > +	sc->ns_base = ktime_to_ns(ktime_get());
> > +	rdtscll(sc->sync_base);
> > +	sc->cyc2ns_scale = (1000000 << CYC2NS_SCALE_FACTOR) / freq->new;
> > +}
> > +
> > +static void resync_freq_on_cpu(void *arg)
> > +{
> > +	struct cpufreq_freqs f = { .new = 0 };
> > +
> > +	f.cpu = get_cpu();
> > +	resync_freq(&f);
> > +	put_cpu();
> > +}
> > +
> > +static int sc_freq_event(struct notifier_block *nb, unsigned long event,
> > +			 void *data)
> > +{
> > +	struct cpufreq_freqs *freq = data;
> > +	struct sc_data *sc = &per_cpu(sc_data, freq->cpu);
> > +
> > +	if (cpu_has(&cpu_data[freq->cpu], X86_FEATURE_CONSTANT_TSC))
> > +		return NOTIFY_DONE;
> > +	if (freq->old == freq->new)
> > +		return NOTIFY_DONE;
> > +
> > +	switch (event) {
> > +	case CPUFREQ_SUSPENDCHANGE:
> > +		/* Mark TSC unstable during suspend/resume */
> > +	case CPUFREQ_PRECHANGE:
> > +		/*
> > +		 * Mark TSC as unstable until cpu frequency change is
> > +		 * done because we don't know when exactly it will
> > +		 * change.  unstable in used as a counter to guard
> > +		 * against races between the cpu frequency notifiers
> > +		 * and normal resyncs
> > +		 */
> > +		sc->unstable++;
> > +		/* FALL THROUGH */
> > +	case CPUFREQ_RESUMECHANGE:
> > +	case CPUFREQ_POSTCHANGE:
> > +		/*
> > +		 * Frequency change or resume is done -- update everything and
> > +		 * mark TSC as stable again.
> > +		 */
> > +		on_cpu_single(freq->cpu, resync_freq, freq);
> > +		break;
> > +	}
> > +	return NOTIFY_DONE;
> > +}
> > +
> > +static struct notifier_block sc_freq_notifier = {
> > +	.notifier_call = sc_freq_event
> > +};
> > +
> > +static int __cpuinit
> > +sc_cpu_event(struct notifier_block *self, unsigned long event, void *hcpu)
> > +{
> > +	long cpu = (long)hcpu;
> > +	if (event == CPU_ONLINE) {
> > +		struct cpufreq_freqs f = { .cpu = cpu, .new = 0 };
> > +
> > +		on_cpu_single(cpu, resync_freq, &f);
> > +	}
> > +	return NOTIFY_DONE;
> > +}
> > +
> > +static __init int init_sched_clock(void)
> > +{
> > +	if (unsynchronized_tsc())
> > +		no_sc_for_printk = 1;
> > +
> > +	/*
> > +	 * On a race between the various events the initialization
> > +	 * might be done multiple times, but code is tolerant to
> > +	 * this .
> > +	 */
> > +	cpufreq_register_notifier(&sc_freq_notifier,
> > +				CPUFREQ_TRANSITION_NOTIFIER);
> > +	hotcpu_notifier(sc_cpu_event, 0);
> > +	on_each_cpu(resync_freq_on_cpu, NULL, 0, 0);
> > +	return 0;
> > +}
> > +core_initcall(init_sched_clock);
> > Index: linux/arch/i386/kernel/tsc.c
> > ===================================================================
> > --- linux.orig/arch/i386/kernel/tsc.c
> > +++ linux/arch/i386/kernel/tsc.c
> > @@ -63,74 +63,6 @@ static inline int check_tsc_unstable(voi
> >  	return tsc_unstable;
> >  }
> >  
> > -/* Accellerators for sched_clock()
> > - * convert from cycles(64bits) => nanoseconds (64bits)
> > - *  basic equation:
> > - *		ns = cycles / (freq / ns_per_sec)
> > - *		ns = cycles * (ns_per_sec / freq)
> > - *		ns = cycles * (10^9 / (cpu_khz * 10^3))
> > - *		ns = cycles * (10^6 / cpu_khz)
> > - *
> > - *	Then we use scaling math (suggested by george@mvista.com) to get:
> > - *		ns = cycles * (10^6 * SC / cpu_khz) / SC
> > - *		ns = cycles * cyc2ns_scale / SC
> > - *
> > - *	And since SC is a constant power of two, we can convert the div
> > - *  into a shift.
> > - *
> > - *  We can use khz divisor instead of mhz to keep a better percision, since
> > - *  cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
> > - *  (mathieu.desnoyers@polymtl.ca)
> > - *
> > - *			-johnstul@us.ibm.com "math is hard, lets go shopping!"
> > - */
> > -unsigned long cyc2ns_scale __read_mostly;
> > -
> > -#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
> > -
> > -static inline void set_cyc2ns_scale(unsigned long cpu_khz)
> > -{
> > -	cyc2ns_scale = (1000000 << CYC2NS_SCALE_FACTOR)/cpu_khz;
> > -}
> > -
> > -/*
> > - * Scheduler clock - returns current time in nanosec units.
> > - */
> > -unsigned long long native_sched_clock(void)
> > -{
> > -	unsigned long long this_offset;
> > -
> > -	/*
> > -	 * Fall back to jiffies if there's no TSC available:
> > -	 * ( But note that we still use it if the TSC is marked
> > -	 *   unstable. We do this because unlike Time Of Day,
> > -	 *   the scheduler clock tolerates small errors and it's
> > -	 *   very important for it to be as fast as the platform
> > -	 *   can achive it. )
> > -	 */
> > -	if (unlikely(!tsc_enabled && !tsc_unstable))
> > -		/* No locking but a rare wrong value is not a big deal: */
> > -		return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);
> > -
> > -	/* read the Time Stamp Counter: */
> > -	rdtscll(this_offset);
> > -
> > -	/* return the value in ns */
> > -	return cycles_2_ns(this_offset);
> > -}
> > -
> > -/* We need to define a real function for sched_clock, to override the
> > -   weak default version */
> > -#ifdef CONFIG_PARAVIRT
> > -unsigned long long sched_clock(void)
> > -{
> > -	return paravirt_sched_clock();
> > -}
> > -#else
> > -unsigned long long sched_clock(void)
> > -	__attribute__((alias("native_sched_clock")));
> > -#endif
> > -
> >  unsigned long native_calculate_cpu_khz(void)
> >  {
> >  	unsigned long long start, end;
> > @@ -238,11 +170,6 @@ time_cpufreq_notifier(struct notifier_bl
> >  						ref_freq, freq->new);
> >  			if (!(freq->flags & CPUFREQ_CONST_LOOPS)) {
> >  				tsc_khz = cpu_khz;
> > -				set_cyc2ns_scale(cpu_khz);
> > -				/*
> > -				 * TSC based sched_clock turns
> > -				 * to junk w/ cpufreq
> > -				 */
> >  				mark_tsc_unstable("cpufreq changes");
> >  			}
> >  		}
> > @@ -380,7 +307,6 @@ void __init tsc_init(void)
> >  				(unsigned long)cpu_khz / 1000,
> >  				(unsigned long)cpu_khz % 1000);
> >  
> > -	set_cyc2ns_scale(cpu_khz);
> >  	use_tsc_delay();
> >  
> >  	/* Check and install the TSC clocksource */
> > Index: linux/arch/i386/kernel/Makefile
> > ===================================================================
> > --- linux.orig/arch/i386/kernel/Makefile
> > +++ linux/arch/i386/kernel/Makefile
> > @@ -7,7 +7,8 @@ extra-y := head.o init_task.o vmlinux.ld
> >  obj-y	:= process.o signal.o entry.o traps.o irq.o \
> >  		ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_i386.o \
> >  		pci-dma.o i386_ksyms.o i387.o bootflag.o e820.o\
> > -		quirks.o i8237.o topology.o alternative.o i8253.o tsc.o
> > +		quirks.o i8237.o topology.o alternative.o i8253.o tsc.o \
> > +		sched-clock.o
> >  
> >  obj-$(CONFIG_STACKTRACE)	+= stacktrace.o
> >  obj-y				+= cpu/
> > Index: linux/include/asm-i386/timer.h
> > ===================================================================
> > --- linux.orig/include/asm-i386/timer.h
> > +++ linux/include/asm-i386/timer.h
> > @@ -6,7 +6,6 @@
> >  #define TICK_SIZE (tick_nsec / 1000)
> >  
> >  void setup_pit_timer(void);
> > -unsigned long long native_sched_clock(void);
> >  unsigned long native_calculate_cpu_khz(void);
> >  
> >  extern int timer_ack;
> > @@ -18,35 +17,6 @@ extern int recalibrate_cpu_khz(void);
> >  #define calculate_cpu_khz() native_calculate_cpu_khz()
> >  #endif
> >  
> > -/* Accellerators for sched_clock()
> > - * convert from cycles(64bits) => nanoseconds (64bits)
> > - *  basic equation:
> > - *		ns = cycles / (freq / ns_per_sec)
> > - *		ns = cycles * (ns_per_sec / freq)
> > - *		ns = cycles * (10^9 / (cpu_khz * 10^3))
> > - *		ns = cycles * (10^6 / cpu_khz)
> > - *
> > - *	Then we use scaling math (suggested by george@mvista.com) to get:
> > - *		ns = cycles * (10^6 * SC / cpu_khz) / SC
> > - *		ns = cycles * cyc2ns_scale / SC
> > - *
> > - *	And since SC is a constant power of two, we can convert the div
> > - *  into a shift.
> > - *
> > - *  We can use khz divisor instead of mhz to keep a better percision, since
> > - *  cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
> > - *  (mathieu.desnoyers@polymtl.ca)
> > - *
> > - *			-johnstul@us.ibm.com "math is hard, lets go shopping!"
> > - */
> > -extern unsigned long cyc2ns_scale __read_mostly;
> > -
> > -#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
> > -
> > -static inline unsigned long long cycles_2_ns(unsigned long long cyc)
> > -{
> > -	return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR;
> > -}
> > -
> > +u64 cycles_2_ns(u64 cyc);
> >  
> >  #endif
> > Index: linux/include/asm-i386/tsc.h
> > ===================================================================
> > --- linux.orig/include/asm-i386/tsc.h
> > +++ linux/include/asm-i386/tsc.h
> > @@ -63,6 +63,7 @@ extern void tsc_init(void);
> >  extern void mark_tsc_unstable(char *reason);
> >  extern int unsynchronized_tsc(void);
> >  extern void init_tsc_clocksource(void);
> > +extern unsigned long long tsc_sched_clock(void);
> >  
> >  /*
> >   * Boot-time check whether the TSCs are synchronized across
> > -
> > To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > Please read the FAQ at  http://www.tux.org/lkml/
> 
> -
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
> 

-- 
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F  BA06 3F25 A8FE 3BAE 9A68

^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [PATCH] [15/58] i386: Rewrite sched_clock
  2007-07-20  3:11     ` Mathieu Desnoyers
@ 2007-07-20  3:47       ` Mathieu Desnoyers
  2007-07-20  4:18         ` [PATCH] [15/58] i386: Rewrite sched_clock (cmpxchg8b) Mathieu Desnoyers
  2007-07-20  8:27       ` [PATCH] [15/58] i386: Rewrite sched_clock Andi Kleen
  1 sibling, 1 reply; 119+ messages in thread
From: Mathieu Desnoyers @ 2007-07-20  3:47 UTC (permalink / raw)
  To: Andi Kleen; +Cc: patches, linux-kernel, Daniel Walker

* Mathieu Desnoyers (mathieu.desnoyers@polymtl.ca) wrote:
 ....
> > > +/*
> > > + * Scheduler clock - returns current time in nanosec units.
> > > + * All data is local to the CPU.
> > > + * The values are approximately[1] monotonic local to a CPU, but not
> > > + * between CPUs.   There might be also an occasionally random error,
> > > + * but not too bad. Between CPUs the values can be non monotonic.
> > > + *
> > > + * [1] no attempt to stop CPU instruction reordering, which can hit
> > > + * in a 100 instruction window or so.
> > > + *
> > > + * The clock can be in two states: stable and unstable.
> > > + * When it is stable we use the TSC per CPU.
> > > + * When it is unstable we use jiffies as fallback.
> > > + * stable->unstable->stable transitions can happen regularly
> > > + * during CPU frequency changes.
> > > + * There is special code to avoid having the clock jump backwards
> > > + * when we switch from TSC to jiffies, which needs to keep some state
> > > + * per CPU. This state is protected against parallel state changes
> > > + * with interrupts off.
> > The comment still says something about interrupts off, but that was
> > removed it looks like.
> > 
> 
> I noticed the same thing about interrupts off when going through the
> code. Andi, since you are already playing with per cpu variables, you
> could leverage asm/local.h there by declaring last_val as local_t and
> use either local_cmpxchg or local_add_return (depending on your needs)
> to get both better performance than cli/sti _and_ be really atomic.
> 
> See this thread for performance tests:
> http://www.ussg.iu.edu/hypermail/linux/kernel/0707.1/0832.html
> 
> Mathieu
> 

I just want to rectify a detail: local_t uses type "long", which is 32
bits on x86_32 and 64 bits on x86_64.

Using a cmpxchg8b on i386 seems to require the LOCK prefix to be taken,
so it may degrade performance too much. Therefore, you may prefer to
stay with cli/sti on i386, but using a local cmpxchg would make sense on
x86_64.

A side-note: I really dislike the new cmpxchg behavior when a too
large value is passed to it. If we pass a uint64_t * as first argument
to cmpxchg or cmpxchg_local on i386, it just fails silently. Before, a
linker error was produced, which required the kernel to be compiled with
-O2 as side-effect, but at least there wasn't any silent failure...

Mathieu

-- 
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F  BA06 3F25 A8FE 3BAE 9A68

^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [PATCH] [15/58] i386: Rewrite sched_clock (cmpxchg8b)
  2007-07-20  3:47       ` Mathieu Desnoyers
@ 2007-07-20  4:18         ` Mathieu Desnoyers
  2007-07-20  5:07           ` Nick Piggin
  0 siblings, 1 reply; 119+ messages in thread
From: Mathieu Desnoyers @ 2007-07-20  4:18 UTC (permalink / raw)
  To: Andi Kleen; +Cc: patches, linux-kernel, Daniel Walker

* Mathieu Desnoyers (compudj@krystal.dyndns.org) wrote:
 
> I just want to rectify a detail: local_t uses type "long", which is 32
> bits on x86_32 and 64 bits on x86_64.
> 
> Using a cmpxchg8b on i386 seems to require the LOCK prefix to be taken,
> so it may degrade performance too much. Therefore, you may prefer to
> stay with cli/sti on i386, but using a local cmpxchg would make sense on
> x86_64.
> 
> A side-note: I really dislike the new cmpxchg behavior when a too
> large value is passed to it. If we pass a uint64_t * as first argument
> to cmpxchg or cmpxchg_local on i386, it just fails silently. Before, a
> linker error was produced, which required the kernel to be compiled with
> -O2 as side-effect, but at least there wasn't any silent failure...
> 
> Mathieu
> 

Actually, about the i386 case, I am trying to get my head around the
locked cmpxchg8b issue.

I just implemented something that could be added to the standard
cmpxchg on i386:

union pack64 {
        u64 value;
        struct {
                u32 low, high;
        };
};

static inline u64 cmpxchg8b(u64 * ptr, u64 old, u64 new)
{
        union pack64 oldu;
        union pack64 newu;
        union pack64 prevu;

        oldu.value = old;
        newu.value = new;

        __asm__ __volatile__("cmpxchg8b (%6)\n\t"
                             : "=d"(prevu.high), "=a" (prevu.low)
                             : "0" (oldu.high), "1" (oldu.low),
                               "c" (newu.high), "b" (newu.low),
                               "m"(*__xg(ptr))
                             : "memory");
        return prevu.value;
}

I tried it with and without the LOCK prefix on my Pentium 4.

Locked cmpxchg8b : 90 cycles
Non locked cmpxchg8b: 30 cycles
sti: 166 cycles
cli: 159 cycles

So, hrm, even if we use the locked version, it is still much faster than
the sti/cli. I am thoughtful about the comment in asm-i386/system.h:

/*
 * The semantics of XCHGCMP8B are a bit strange, this is why
 * there is a loop and the loading of %%eax and %%edx has to
 * be inside. This inlines well in most cases, the cached
 * cost is around ~38 cycles. (in the future we might want
 * to do an SIMD/3DNOW!/MMX/FPU 64-bit store here, but that
 * might have an implicit FPU-save as a cost, so it's not
 * clear which path to go.)
 *
 * cmpxchg8b must be used with the lock prefix here to allow 
 * the instruction to be executed atomically, see page 3-102
 * of the instruction set reference 24319102.pdf. We need
 * the reader side to see the coherent 64bit value.
 */

I just re-read 24319102.pdf page 3-102. Here is what seems to be the
reference used:

"This instruction can be used with a LOCK prefix to allow the
instruction to be executed atomically. To simplify the interface to the
processor’s bus, the destination operand receives a write cycle without
regard to the result of the comparison. The destination operand is
written back if the comparison fails; otherwise, the source operand is
written into the destination. (The processor never produces a locked
read without also producing a locked write.)" (page 3-102)

However, we find _exactly_ the same comment about the cmpxchg
instruction:

"This instruction can be used with a LOCK prefix to allow the
instruction to be executed atomically. To simplify the interface to the
processor’s bus, the destination operand receives a write cycle without
regard to the result of the comparison. The destination operand is
written back if the comparison fails; otherwise, the source operand is
written into the destination. (The processor never produces a locked
read without also producing a locked write.)" (page 3-100)

Since we use the standard cmpxchg without the lock prefix and consider it
atomic wrt the local CPU, I wonder why we could not do the same for
cmpxchg8b? Any idea?

It could then give you an 8-byte cmpxchg that is atomic wrt the local CPU
for 30 cycles, which is really not so bad! Or is there any undocumented
funkiness about cmpxchg8b I should be aware of?

Mathieu

-- 
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F  BA06 3F25 A8FE 3BAE 9A68

^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [PATCH] [15/58] i386: Rewrite sched_clock (cmpxchg8b)
  2007-07-20  4:18         ` [PATCH] [15/58] i386: Rewrite sched_clock (cmpxchg8b) Mathieu Desnoyers
@ 2007-07-20  5:07           ` Nick Piggin
  2007-07-20  5:47             ` Mathieu Desnoyers
  0 siblings, 1 reply; 119+ messages in thread
From: Nick Piggin @ 2007-07-20  5:07 UTC (permalink / raw)
  To: Mathieu Desnoyers; +Cc: Andi Kleen, patches, linux-kernel, Daniel Walker

Mathieu Desnoyers wrote:

> I tried it with and without the LOCK prefix on my Pentium 4.
> 
> Locked cmpxchg8b : 90 cycles
> Non locked cmpxchg8b: 30 cycles
> sti: 166 cycles
> cli: 159 cycles
> 
> So, hrm, even if we use the locked version, it is still much faster than
> the sti/cli. I am thoughtful about the comment in asm-i386/system.h:

Curious: what does it look like if the memory is not in cache? I
found that cmpxchg is relatively slower than other rmw instructions
in that case.

-- 
SUSE Labs, Novell Inc.

^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [PATCH] [15/58] i386: Rewrite sched_clock (cmpxchg8b)
  2007-07-20  5:07           ` Nick Piggin
@ 2007-07-20  5:47             ` Mathieu Desnoyers
  0 siblings, 0 replies; 119+ messages in thread
From: Mathieu Desnoyers @ 2007-07-20  5:47 UTC (permalink / raw)
  To: Nick Piggin; +Cc: Andi Kleen, patches, linux-kernel, Daniel Walker

* Nick Piggin (nickpiggin@yahoo.com.au) wrote:
> Mathieu Desnoyers wrote:
> 
> >I tried it with and without the LOCK prefix on my Pentium 4.
> >
> >Locked cmpxchg8b : 90 cycles
> >Non locked cmpxchg8b: 30 cycles
> >sti: 166 cycles
> >cli: 159 cycles
> >
> >So, hrm, even if we use the locked version, it is still much faster than
> >the sti/cli. I am thoughtful about the comment in asm-i386/system.h:
> 
> Curious: what does it look like if the memory is not in cache? I
> found that cmpxchg is relatively slower than other rmw instructions
> in that case.
> 

Actually, I have just seen that cmpxchg64 and cmpxchg64_local already do
exactly this, and they are already implemented in asm-i386/system.h.

A quick test: I am doing clflush in a loop (subtracting its time from the
following loops) so that the cmpxchg has to go out to memory. This is the
result of just the cmpxchg8b:

non locked cmpxchg8b: 583.37 cycles
locked cmpxchg8b: 650.48 cycles
rmw in 3 operations: 581.43 cycles

So the locked cmpxchg is 67 cycles slower than the non-locked cmpxchg,
which fits with my 30 vs 90 cycles. rmw is a tiny bit faster than
cmpxchg8b (by 2 cycles), but nothing to write home about.
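
Roughly, the measurement loop looks like this (just a sketch, with a
made-up 8-byte test variable; get_cycles() is the rdtsc wrapper, and I
assume the cmpxchg64_local() from asm-i386/system.h mentioned above):

	#include <asm/timex.h>
	#include <asm/system.h>

	static u64 target;

	/* evict the line; assumes a CPU with clflush (SSE2) */
	static inline void flush_target(void)
	{
		asm volatile("clflush %0" : "+m" (target));
	}

	static u64 cold_cmpxchg8b_cycles(int loops)
	{
		u64 t0, flush_cost = 0, total = 0;
		int i;

		/* cost of the eviction alone */
		for (i = 0; i < loops; i++) {
			t0 = get_cycles();
			flush_target();
			flush_cost += get_cycles() - t0;
		}

		/* eviction + cmpxchg8b on the now-cold line; whether the
		 * compare succeeds does not matter for the timing */
		for (i = 0; i < loops; i++) {
			t0 = get_cycles();
			flush_target();
			cmpxchg64_local(&target, 0, 1);
			total += get_cycles() - t0;
		}

		return (total - flush_cost) / loops;
	}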

Mathieu

-- 
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F  BA06 3F25 A8FE 3BAE 9A68

^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [PATCH] [15/58] i386: Rewrite sched_clock
  2007-07-20  3:11     ` Mathieu Desnoyers
  2007-07-20  3:47       ` Mathieu Desnoyers
@ 2007-07-20  8:27       ` Andi Kleen
  2007-07-20 14:12         ` Mathieu Desnoyers
  1 sibling, 1 reply; 119+ messages in thread
From: Andi Kleen @ 2007-07-20  8:27 UTC (permalink / raw)
  To: Mathieu Desnoyers; +Cc: patches, linux-kernel, Daniel Walker


> I noticed the same thing about interrupts off when going through the
> code.

That's only on a slow path during CPU frequency changes while the TSC is unstable.
Shouldn't be that common.

-Andi

^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [PATCH] [15/58] i386: Rewrite sched_clock
  2007-07-20  8:27       ` [PATCH] [15/58] i386: Rewrite sched_clock Andi Kleen
@ 2007-07-20 14:12         ` Mathieu Desnoyers
  2007-07-20 14:39           ` Mathieu Desnoyers
  2007-07-20 15:14           ` Andi Kleen
  0 siblings, 2 replies; 119+ messages in thread
From: Mathieu Desnoyers @ 2007-07-20 14:12 UTC (permalink / raw)
  To: Andi Kleen; +Cc: patches, linux-kernel, Daniel Walker

* Andi Kleen (ak@suse.de) wrote:
> 
> > I noticed the same thing about interrupts off when going through the
> > code.
> 
> That's only on a slow path during CPU frequency changes while the TSC is unstable.
> Shouldn't be that common.
> 
> -Andi

Hrm, I don't see why you can get away without disabling interrupts in
the fast path:

+unsigned long long tsc_sched_clock(void)
+{
+       unsigned long long r;
+       struct sc_data *sc = &get_cpu_var(sc_data);
+       
+       if (unlikely(sc->unstable)) {
+               r = (jiffies_64 - sc->sync_base) * (1000000000 / HZ);
+               r += sc->ns_base;
+               /*
+                * last_val is used to avoid non monotonity on a
+                * stable->unstable transition. Make sure the time
+                * never goes to before the last value returned by the
+                * TSC clock.
+                */
+               while (r <= sc->last_val) {
+                       rmb();
+                       r = sc->last_val + 1;
+                       rmb();
+               }
+               sc->last_val = r;

Here, in the slow path, we update last_val (a 64-bit value). This must be protected.

+       } else {
+               rdtscll(r);
+               r = __cycles_2_ns(sc, r);
+               sc->last_val = r;

Here, in the fast path, we update last_val too, so it is ready to be read
when the TSC becomes unstable.

If we don't disable interrupts around its update, we could get the following
(the LSB vs MSB update order is arbitrary):

update sc->last_val 32MSB
  interrupt comes
    update sc->last_val 32MSB
    update sc->last_val 32LSB
  iret
update sc->last_val 32LSB

So if, after this, we run tsc_sched_clock() with an unstable TSC, we
read a last_val containing the interrupt's MSBs and the pre-interrupt LSBs.
It can particularly hurt if we are around a 32-bit overflow, because time
could "jump" forward by about 1.43 seconds on a 3 GHz system
(2^32 cycles / 3e9 cycles per second).

So I guess we need synchronization on the fast path, and therefore using
cmpxchg_local on x86_64 and cmpxchg64_local on i386 makes sense.
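
As a rough sketch of that idea (a hypothetical helper, not the actual patch;
sc_data, last_val and __cycles_2_ns follow the code quoted above, and
cmpxchg64_local is the primitive proposed here), the i386 fast path could
retry until the full 64-bit value went in atomically, so an interrupt can no
longer leave a half-updated last_val behind:

static u64 sc_update_last_val(struct sc_data *sc, u64 new)
{
	u64 old;

	do {
		old = sc->last_val;
		/* a newer value is already there, keep it */
		if ((s64)(new - old) <= 0)
			return old;
	} while (cmpxchg64_local(&sc->last_val, old, new) != old);

	return new;
}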

Mathieu

+       }
+ 
+       put_cpu_var(sc_data);
+       
+       return r;
+}



-- 
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F  BA06 3F25 A8FE 3BAE 9A68

^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [PATCH] [15/58] i386: Rewrite sched_clock
  2007-07-20 14:12         ` Mathieu Desnoyers
@ 2007-07-20 14:39           ` Mathieu Desnoyers
  2007-07-20 15:14           ` Andi Kleen
  1 sibling, 0 replies; 119+ messages in thread
From: Mathieu Desnoyers @ 2007-07-20 14:39 UTC (permalink / raw)
  To: Andi Kleen; +Cc: patches, linux-kernel, Daniel Walker

* Mathieu Desnoyers (mathieu.desnoyers@polymtl.ca) wrote:
> * Andi Kleen (ak@suse.de) wrote:
> > 
> > > I noticed the same thing about interrupts off when going through the
> > > code.
> > 
> > That's only on a slow path during cpu frequency changing while the TSC is instable.
> > Shouldn't be that common.
> > 
> > -Andi
> 
> Hrm, I don't see why you can get away without disabling interrupts in
> the fast path:
> 
> +unsigned long long tsc_sched_clock(void)
> +{
> +       unsigned long long r;
> +       struct sc_data *sc = &get_cpu_var(sc_data);
> +       
> +       if (unlikely(sc->unstable)) {
> +               r = (jiffies_64 - sc->sync_base) * (1000000000 / HZ);
> +               r += sc->ns_base;
> +               /*
> +                * last_val is used to avoid non monotonity on a
> +                * stable->unstable transition. Make sure the time
> +                * never goes to before the last value returned by the
> +                * TSC clock.
> +                */
> +               while (r <= sc->last_val) {
> +                       rmb();
> +                       r = sc->last_val + 1;
> +                       rmb();
> +               }
> +               sc->last_val = r;
> 
> Here, slow path, we update last_val (64 bits value). Must be protected.
> 
> +       } else {
> +               rdtscll(r);
> +               r = __cycles_2_ns(sc, r);
> +               sc->last_val = r;
> 
> Here, fast path, we update last_val too so it is ready to be read when
> the tsc will become unstable.
> 
> If we don't disable interrupts around its update, we could have: (LSB vs
> MSB update order is arbitrary)
> 
> update sc->last_val 32MSB
>   interrupt comes
>     update sc->last_val 32MSB
>     update sc->last_val 32LSB
>   iret
> update sc->last_val 32LSB
> 
> So if, after this, we run tsc_sched_clock() with an unstable TSC, we
> read a last_val containing the interrupt's MSB and the last_val LSB. It
> can particularity hurt if we are around a 32 bits overflow, because time
> could "jump" forward of about 1.43 seconds on a 3 GHz system.
> 
> So I guess we need synchronization on the fast path, and therefore using
> cmpxchg_local on x86_64 and cmpxchg64_local on i386 makes sense.
> 

The case above explained the issue for i386. For x86_64, the race goes
like this:

read tsc
  interrupt
  read tsc
  update sc->last_val
  iret
update sc->last_val

Here, last_val is not at its highest value anymore. This is why a
cmpxchg is useful on x86_64.

Mathieu


> Mathieu
> 
> +       }
> + 
> +       put_cpu_var(sc_data);
> +       
> +       return r;
> +}
> 
> 
> 
> -- 
> Mathieu Desnoyers
> Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
> OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F  BA06 3F25 A8FE 3BAE 9A68

-- 
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F  BA06 3F25 A8FE 3BAE 9A68

^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [PATCH] [15/58] i386: Rewrite sched_clock
  2007-07-20 14:12         ` Mathieu Desnoyers
  2007-07-20 14:39           ` Mathieu Desnoyers
@ 2007-07-20 15:14           ` Andi Kleen
  2007-07-20 15:22             ` Mathieu Desnoyers
  2007-07-20 16:49             ` [PATCH] 80386 and 80486 cmpxchg64 and cmpxchg64_local fallback Mathieu Desnoyers
  1 sibling, 2 replies; 119+ messages in thread
From: Andi Kleen @ 2007-07-20 15:14 UTC (permalink / raw)
  To: Mathieu Desnoyers; +Cc: patches, linux-kernel, Daniel Walker


> So if, after this, we run tsc_sched_clock() with an unstable TSC, we
> read a last_val containing the interrupt's MSB and the last_val LSB. It
> can particularity hurt if we are around a 32 bits overflow, because time
> could "jump" forward of about 1.43 seconds on a 3 GHz system.
> 
> So I guess we need synchronization on the fast path, and therefore using
> cmpxchg_local on x86_64

On x86-64 the 64-bit write is atomic against interrupts.

You're right that 32-bit has a problem, though. I'm not too happy about
cmpxchg because that wouldn't work on some CPUs.

I wonder if we can just get away with using a 32-bit value on i386.
Just for the purpose of keeping the value monotonic it should be good
enough. Will think about it.

Thanks for the review.

-Andi


^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [PATCH] [15/58] i386: Rewrite sched_clock
  2007-07-20 15:14           ` Andi Kleen
@ 2007-07-20 15:22             ` Mathieu Desnoyers
  2007-07-20 16:49             ` [PATCH] 80386 and 80486 cmpxchg64 and cmpxchg64_local fallback Mathieu Desnoyers
  1 sibling, 0 replies; 119+ messages in thread
From: Mathieu Desnoyers @ 2007-07-20 15:22 UTC (permalink / raw)
  To: Andi Kleen; +Cc: patches, linux-kernel, Daniel Walker

* Andi Kleen (ak@suse.de) wrote:
> 
> > So if, after this, we run tsc_sched_clock() with an unstable TSC, we
> > read a last_val containing the interrupt's MSB and the last_val LSB. It
> > can particularity hurt if we are around a 32 bits overflow, because time
> > could "jump" forward of about 1.43 seconds on a 3 GHz system.
> > 
> > So I guess we need synchronization on the fast path, and therefore using
> > cmpxchg_local on x86_64
> 
> On x86-64 the 64bit write is atomic against interrupts.
> 
> You're right 32bit has a problem though. I'm not too happy about 
> cmpxchg though because that wouldn't work on some CPUs.
> 

Which CPUs? 386? Do they even have a cycle counter?

Please have a look at my reply to this email for the problematic x86_64
case. That case also applies to i386 (non-atomicity of the TSC read vs.
the write to memory).

> I wonder if we can just get away with using a 32bit value on i386.
> Just for the purpose of keeping the value monotonic it should be good
> enough. Will think about it.
> 

Yes, it could work. In this case you have to be aware that the 32 LSBs
of the TSC will overflow roughly every second and, in order to be able to
detect the overflow, you have to do at least one TSC read per second
(more precisely, per 32-LSB overflow period).
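
A small sketch of that constraint (hypothetical names, assuming plain
unsigned 32-bit arithmetic): as long as two successive reads are less than
one wrap period apart (2^32 cycles, roughly 1.4 s at 3 GHz), the unsigned
subtraction still yields the right delta:

	static u32 last_low32;		/* low 32 bits seen at the previous read */
	static u64 ns;			/* accumulated nanoseconds */

	u32 now = rdtsc_low32();	/* hypothetical: low 32 bits of the TSC */
	u32 delta = now - last_low32;	/* wraps correctly if read often enough */
	ns += cycles_2_ns(delta);	/* hypothetical cycles-to-ns helper */
	last_low32 = now;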

> Thanks for the review.
> 

YW,

Mathieu

> -Andi
> 

-- 
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F  BA06 3F25 A8FE 3BAE 9A68

^ permalink raw reply	[flat|nested] 119+ messages in thread

* [PATCH] 80386 and 80486 cmpxchg64 and cmpxchg64_local fallback
  2007-07-20 15:14           ` Andi Kleen
  2007-07-20 15:22             ` Mathieu Desnoyers
@ 2007-07-20 16:49             ` Mathieu Desnoyers
  1 sibling, 0 replies; 119+ messages in thread
From: Mathieu Desnoyers @ 2007-07-20 16:49 UTC (permalink / raw)
  To: Andi Kleen; +Cc: linux-kernel

Fall back on interrupt disable in cmpxchg8b on 80386 and 80486

Currently, on 386, cmpxchg and cmpxchg_local fall back on
cmpxchg_386_u8/16/32: these disable interrupts around a non-atomic
update to mimic the cmpxchg behavior.

The comment:
/* Poor man's cmpxchg for 386. Unsuitable for SMP */

in the implementation makes it clear that this cmpxchg implementation
must not be used in an SMP context. However, cmpxchg_local can
safely use this fallback, since it only needs to be atomic with respect
to the local cpu.

This patch adds a cmpxchg_386_u64 and uses it as a fallback for cmpxchg64
and cmpxchg64_local on 80386 and 80486.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
---
 arch/i386/kernel/cpu/intel.c |   17 +++++++
 include/asm-i386/cmpxchg.h   |   99 ++++++++++++++++++++++++++++---------------
 2 files changed, 82 insertions(+), 34 deletions(-)

Index: linux-2.6-lttng/arch/i386/kernel/cpu/intel.c
===================================================================
--- linux-2.6-lttng.orig/arch/i386/kernel/cpu/intel.c	2007-07-20 12:41:24.000000000 -0400
+++ linux-2.6-lttng/arch/i386/kernel/cpu/intel.c	2007-07-20 12:43:56.000000000 -0400
@@ -329,5 +329,22 @@ unsigned long cmpxchg_386_u32(volatile v
 EXPORT_SYMBOL(cmpxchg_386_u32);
 #endif
 
+#ifndef CONFIG_X86_CMPXCHG64
+unsigned long long cmpxchg_486_u64(volatile void *ptr, u64 old, u64 new)
+{
+	u64 prev;
+	unsigned long flags;
+
+	/* Poor man's cmpxchg8b for 386 and 486. Unsuitable for SMP */
+	local_irq_save(flags);
+	prev = *(u64 *)ptr;
+	if (prev == old)
+		*(u64 *)ptr = new;
+	local_irq_restore(flags);
+	return prev;
+}
+EXPORT_SYMBOL(cmpxchg_486_u64);
+#endif
+
 // arch_initcall(intel_cpu_init);
 
Index: linux-2.6-lttng/include/asm-i386/cmpxchg.h
===================================================================
--- linux-2.6-lttng.orig/include/asm-i386/cmpxchg.h	2007-07-20 12:41:24.000000000 -0400
+++ linux-2.6-lttng/include/asm-i386/cmpxchg.h	2007-07-20 12:42:12.000000000 -0400
@@ -116,6 +116,15 @@ static inline unsigned long __xchg(unsig
 					(unsigned long)(n),sizeof(*(ptr))))
 #endif
 
+#ifdef CONFIG_X86_CMPXCHG64
+#define cmpxchg64(ptr,o,n)\
+	((__typeof__(*(ptr)))__cmpxchg64((ptr),(unsigned long long)(o),\
+					(unsigned long long)(n)))
+#define cmpxchg64_local(ptr,o,n)\
+	((__typeof__(*(ptr)))__cmpxchg64_local((ptr),(unsigned long long)(o),\
+					(unsigned long long)(n)))
+#endif
+
 static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
 				      unsigned long new, int size)
 {
@@ -203,6 +212,34 @@ static inline unsigned long __cmpxchg_lo
 	return old;
 }
 
+static inline unsigned long long __cmpxchg64(volatile void *ptr,
+			unsigned long long old, unsigned long long new)
+{
+	unsigned long long prev;
+	__asm__ __volatile__(LOCK_PREFIX "cmpxchg8b %3"
+			     : "=A"(prev)
+			     : "b"((unsigned long)new),
+			       "c"((unsigned long)(new >> 32)),
+			       "m"(*__xg(ptr)),
+			       "0"(old)
+			     : "memory");
+	return prev;
+}
+
+static inline unsigned long long __cmpxchg64_local(volatile void *ptr,
+			unsigned long long old, unsigned long long new)
+{
+	unsigned long long prev;
+	__asm__ __volatile__("cmpxchg8b %3"
+			     : "=A"(prev)
+			     : "b"((unsigned long)new),
+			       "c"((unsigned long)(new >> 32)),
+			       "m"(*__xg(ptr)),
+			       "0"(old)
+			     : "memory");
+	return prev;
+}
+
 #ifndef CONFIG_X86_CMPXCHG
 /*
  * Building a kernel capable running on 80386. It may be necessary to
@@ -252,42 +289,36 @@ static inline unsigned long cmpxchg_386(
 })
 #endif
 
-#ifdef CONFIG_X86_CMPXCHG64
-
-static inline unsigned long long __cmpxchg64(volatile void *ptr, unsigned long long old,
-				      unsigned long long new)
-{
-	unsigned long long prev;
-	__asm__ __volatile__(LOCK_PREFIX "cmpxchg8b %3"
-			     : "=A"(prev)
-			     : "b"((unsigned long)new),
-			       "c"((unsigned long)(new >> 32)),
-			       "m"(*__xg(ptr)),
-			       "0"(old)
-			     : "memory");
-	return prev;
-}
+#ifndef CONFIG_X86_CMPXCHG64
+/*
+ * Building a kernel capable running on 80386 and 80486. It may be necessary
+ * to simulate the cmpxchg8b on the 80386 and 80486 CPU.
+ */
 
-static inline unsigned long long __cmpxchg64_local(volatile void *ptr,
-			unsigned long long old, unsigned long long new)
-{
-	unsigned long long prev;
-	__asm__ __volatile__("cmpxchg8b %3"
-			     : "=A"(prev)
-			     : "b"((unsigned long)new),
-			       "c"((unsigned long)(new >> 32)),
-			       "m"(*__xg(ptr)),
-			       "0"(old)
-			     : "memory");
-	return prev;
-}
+extern unsigned long long cmpxchg_486_u64(volatile void *, u64, u64);
 
-#define cmpxchg64(ptr,o,n)\
-	((__typeof__(*(ptr)))__cmpxchg64((ptr),(unsigned long long)(o),\
-					(unsigned long long)(n)))
-#define cmpxchg64_local(ptr,o,n)\
-	((__typeof__(*(ptr)))__cmpxchg64_local((ptr),(unsigned long long)(o),\
-					(unsigned long long)(n)))
+#define cmpxchg64(ptr,o,n)						\
+({									\
+	__typeof__(*(ptr)) __ret;					\
+	if (likely(boot_cpu_data.x86 > 4))				\
+		__ret = __cmpxchg64((ptr), (unsigned long long)(o),	\
+				(unsigned long long)(n));		\
+	else								\
+		__ret = cmpxchg_486_u64((ptr), (unsigned long long)(o),	\
+				(unsigned long long)(n));		\
+	__ret;								\
+})
+#define cmpxchg64_local(ptr,o,n)					\
+({									\
+	__typeof__(*(ptr)) __ret;					\
+	if (likely(boot_cpu_data.x86 > 4))				\
+		__ret = __cmpxchg64_local((ptr), (unsigned long long)(o), \
+				(unsigned long long)(n));		\
+	else								\
+		__ret = cmpxchg_486_u64((ptr), (unsigned long long)(o),	\
+				(unsigned long long)(n));		\
+	__ret;								\
+})
 #endif
 
 #endif
-- 
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F  BA06 3F25 A8FE 3BAE 9A68

^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [patches] [PATCH] [17/58] i386: Add L3 cache support to AMD CPUID4 emulation
  2007-07-19  9:55 ` [PATCH] [17/58] i386: Add L3 cache support to AMD CPUID4 emulation Andi Kleen
@ 2007-07-20 17:00   ` Andreas Herrmann
  2007-07-20 17:15   ` Andreas Herrmann
  1 sibling, 0 replies; 119+ messages in thread
From: Andreas Herrmann @ 2007-07-20 17:00 UTC (permalink / raw)
  To: Andi Kleen; +Cc: patches, linux-kernel

On Thu, Jul 19, 2007 at 11:55:02AM +0200, Andi Kleen wrote:
> 
> With that an L3 cache is correctly reported in the cache information in /sys
> 
> With fixes from Andreas Herrmann and Dean Gaudet
> 
> Signed-off-by: Andi Kleen <ak@suse.de>
> 
> ---
>  arch/i386/kernel/cpu/intel_cacheinfo.c |   74 ++++++++++++++++++++++++---------
>  arch/x86_64/kernel/setup.c             |    7 ++-
>  2 files changed, 60 insertions(+), 21 deletions(-)

Reporting of L3 cache information should also be enabled in 32bit mode.


Regards,

Andreas
--

Enable reporting of L3 cache info in 32 bit mode for family 0x10.

Signed-off-by: Andreas Herrmann <andreas.herrmann3@amd.com>

diff --git a/arch/i386/kernel/cpu/amd.c b/arch/i386/kernel/cpu/amd.c
index 6f47eee..815a5f0 100644
--- a/arch/i386/kernel/cpu/amd.c
+++ b/arch/i386/kernel/cpu/amd.c
@@ -272,8 +272,12 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
 	}
 #endif
 
-	if (cpuid_eax(0x80000000) >= 0x80000006)
-		num_cache_leaves = 3;
+	if (cpuid_eax(0x80000000) >= 0x80000006) {
+		if ((c->x86 == 0x10) && (cpuid_edx(0x80000006) & 0xf000))
+			num_cache_leaves = 4;
+		else
+			num_cache_leaves = 3;
+	}
 
 	if (amd_apic_timer_broken())
 		set_bit(X86_FEATURE_LAPIC_TIMER_BROKEN, c->x86_capability);




^ permalink raw reply related	[flat|nested] 119+ messages in thread

* Re: [patches] [PATCH] [17/58] i386: Add L3 cache support to AMD CPUID4 emulation
  2007-07-19  9:55 ` [PATCH] [17/58] i386: Add L3 cache support to AMD CPUID4 emulation Andi Kleen
  2007-07-20 17:00   ` [patches] " Andreas Herrmann
@ 2007-07-20 17:15   ` Andreas Herrmann
  1 sibling, 0 replies; 119+ messages in thread
From: Andreas Herrmann @ 2007-07-20 17:15 UTC (permalink / raw)
  To: Andi Kleen; +Cc: patches, linux-kernel

I think Joachim's patch (sent to patches@x86-64.org on June 14) should
be added as well. I have attached his patch below.


Regards,

Andreas

-- 
Operating | AMD Saxony Limited Liability Company & Co. KG,
  System  | Wilschdorfer Landstr. 101, 01109 Dresden, Germany
 Research | Register Court Dresden: HRA 4896, General Partner authorized
  Center  | to represent: AMD Saxony LLC (Wilmington, Delaware, US)
  (OSRC)  | General Manager of AMD Saxony LLC: Dr. Hans-R. Deppe, Thomas McCoy
--

This allows the size field to be reported for all encoded values instead of a
handful, and also fills shared_cpu_map with a meaningful value.

Signed-off-by: Joachim Deguara <joachim.deguara@amd.com>
Index: kernel/arch/i386/kernel/cpu/intel_cacheinfo.c
===================================================================
--- kernel.orig/arch/i386/kernel/cpu/intel_cacheinfo.c
+++ kernel/arch/i386/kernel/cpu/intel_cacheinfo.c
@@ -224,12 +224,7 @@ static void __cpuinit amd_cpuid4(int lea
 		assoc = l3.assoc;
 		line_size = l3.line_size;
 		lines_per_tag = l3.lines_per_tag;
-		switch (l3.size_encoded) {
-		case 4:  size_in_kb = 2 * 1024; break;
-		case 8:  size_in_kb = 4 * 1024; break;
-		case 12: size_in_kb = 6 * 1024; break;
-		default: size_in_kb = 0; break;
-		}
+		size_in_kb = l3.size_encoded * 512;
 		break;
 	default:
 		return;
@@ -238,7 +233,10 @@ static void __cpuinit amd_cpuid4(int lea
 	eax->split.is_self_initializing = 1;
 	eax->split.type = types[leaf];
 	eax->split.level = levels[leaf];
-	eax->split.num_threads_sharing = 0;
+	if (leaf == 3)
+		eax->split.num_threads_sharing = current_cpu_data.x86_max_cores - 1;
+	else
+		eax->split.num_threads_sharing = 0;
 	eax->split.num_cores_on_die = current_cpu_data.x86_max_cores - 1;
 
 

_______________________________________________
patches mailing list
patches@x86-64.org
https://www.x86-64.org/mailman/listinfo/patches







^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [PATCH] [9/58] x86_64: Always use builtin memcpy on gcc 4.3
  2007-07-19  9:54 ` [PATCH] [9/58] x86_64: Always use builtin memcpy on gcc 4.3 Andi Kleen
@ 2007-07-21 23:16   ` Oleg Verych
  2007-07-21 23:27     ` Andi Kleen
  2007-07-22  0:29     ` Denis Vlasenko
  0 siblings, 2 replies; 119+ messages in thread
From: Oleg Verych @ 2007-07-21 23:16 UTC (permalink / raw)
  To: Andi Kleen; +Cc: jh, patches, linux-kernel

* From: Andi Kleen <ak@suse.de>
* Date: Thu, 19 Jul 2007 11:54:53 +0200 (CEST)
>
> Jan asked to always use the builtin memcpy on gcc 4.3 mainline because
> it should generate better code than the old macro. Let's try it.

Unfortunately such info is hard to find. The discuss@x86-64 list is
empty. So let me ask how this memcpy relates to the one recently
submitted for glibc [0]?

[0] <http://permalink.gmane.org/gmane.comp.lib.glibc.alpha/12217>

Also, you are enabling rep string operations for the family 10h. Yet the
manual says that, while they were improved, there are still various other
preferred optimization cases.

Thanks.

> Cc: jh@suse.cz
>
> Signed-off-by: Andi Kleen <ak@suse.de>
>
> ---
>  include/asm-x86_64/string.h |    5 ++++-
>  1 file changed, 4 insertions(+), 1 deletion(-)
____

^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [PATCH] [9/58] x86_64: Always use builtin memcpy on gcc 4.3
  2007-07-21 23:16   ` Oleg Verych
@ 2007-07-21 23:27     ` Andi Kleen
  2007-07-22  0:29     ` Denis Vlasenko
  1 sibling, 0 replies; 119+ messages in thread
From: Andi Kleen @ 2007-07-21 23:27 UTC (permalink / raw)
  To: Oleg Verych; +Cc: jh, patches, linux-kernel

On Sunday 22 July 2007 01:16:42 Oleg Verych wrote:
> * From: Andi Kleen <ak@suse.de>
> * Date: Thu, 19 Jul 2007 11:54:53 +0200 (CEST)
> >
> > Jan asked to always use the builtin memcpy on gcc 4.3 mainline because
> > it should generate better code than the old macro. Let's try it.
> 
> Unfortunately such info is hard to find. The discuss@x86-64 list is
> empty. So, let me ask how this memcpy relates to recently submitted
> for glibc one [0]?

It doesn't relate at all. The kernel still uses its own memcpy.

Note that a lot of the traditional memcpy optimizations (like WC copies)
are pointless in kernel space because the kernel rarely deals with contiguous
memory areas larger than a 4K page.

The only difference from the patch is that, instead of using our own
heuristic for when to use an out-of-line memcpy, we trust gcc's heuristic.
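
Roughly, the idea amounts to something like the following (an illustration
only, not the actual include/asm-x86_64/string.h change): map memcpy straight
to the gcc builtin and let the compiler decide between inline expansion and an
out-of-line call.

#define __HAVE_ARCH_MEMCPY 1
/* let gcc's own heuristic pick inline expansion vs. an out-of-line call */
#define memcpy(dst, src, len) __builtin_memcpy((dst), (src), (len))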

-Andi


^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [PATCH] [9/58] x86_64: Always use builtin memcpy on gcc 4.3
  2007-07-21 23:16   ` Oleg Verych
  2007-07-21 23:27     ` Andi Kleen
@ 2007-07-22  0:29     ` Denis Vlasenko
  1 sibling, 0 replies; 119+ messages in thread
From: Denis Vlasenko @ 2007-07-22  0:29 UTC (permalink / raw)
  To: Oleg Verych; +Cc: Andi Kleen, jh, patches, linux-kernel

On Sunday 22 July 2007 00:16, Oleg Verych wrote:
> * From: Andi Kleen <ak@suse.de>
> * Date: Thu, 19 Jul 2007 11:54:53 +0200 (CEST)
> >
> > Jan asked to always use the builtin memcpy on gcc 4.3 mainline because
> > it should generate better code than the old macro. Let's try it.
> 
> Unfortunately such info is hard to find. The discuss@x86-64 list is
> empty. So, let me ask how this memcpy relates to recently submitted
> for glibc one [0]?
> 
> [0] <http://permalink.gmane.org/gmane.comp.lib.glibc.alpha/12217>

Am I stupid, or do the files attached to that post demonstrate that the "new"
code isn't much better and is sometimes worse (aligned 4096-byte memcpy
went from 558 to 648 on Core 2)?

Beware that the text files in test-memcpy.tar.bz2 seem to have the
simple_memcpy / builtin_memcpy / memcpy columns swapped
(the -old and -new files have them in a different order).
--
vda

^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [patches] [PATCH] [1/58] x86: Always flush pages in change_page_attr
  2007-07-19  9:54 ` [PATCH] [1/58] x86: Always flush pages in change_page_attr Andi Kleen
@ 2007-08-06 10:15   ` Jan Beulich
  2007-08-06 10:36     ` Andi Kleen
  0 siblings, 1 reply; 119+ messages in thread
From: Jan Beulich @ 2007-08-06 10:15 UTC (permalink / raw)
  To: Andi Kleen, linux-kernel, patches

But that is still wrong - you're again flushing the page table page rather than
the data one. Fixing this was the purpose of the patch I had sent, plus the
broken reference counting used by the reversion logic. Jan

>>> Andi Kleen <ak@suse.de> 19.07.07 11:54 >>>

Fix a bug introduced with the CLFLUSH changes: we must always flush pages
changed in cpa(), not just when they are reverted.

Reenable CLFLUSH usage with that now (it was temporarily disabled
for .22) 

Add some BUG_ONs

Contains fixes from  Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>

Signed-off-by: Andi Kleen <ak@suse.de>

---
 arch/i386/mm/pageattr.c   |   20 +++++++++++++++++---
 arch/x86_64/mm/pageattr.c |   23 ++++++++++++++---------
 2 files changed, 31 insertions(+), 12 deletions(-)

Index: linux/arch/x86_64/mm/pageattr.c
===================================================================
--- linux.orig/arch/x86_64/mm/pageattr.c
+++ linux/arch/x86_64/mm/pageattr.c
@@ -74,14 +74,12 @@ static void flush_kernel_map(void *arg)
 	struct page *pg;
 
 	/* When clflush is available always use it because it is
-	   much cheaper than WBINVD. Disable clflush for now because
-	   the high level code is not ready yet */
-	if (1 || !cpu_has_clflush)
+	   much cheaper than WBINVD. */
+	if (!cpu_has_clflush)
 		asm volatile("wbinvd" ::: "memory");
 	else list_for_each_entry(pg, l, lru) {
 		void *adr = page_address(pg);
-		if (cpu_has_clflush)
-			cache_flush_page(adr);
+		cache_flush_page(adr);
 	}
 	__flush_tlb_all();
 }
@@ -95,7 +93,8 @@ static LIST_HEAD(deferred_pages); /* pro
 
 static inline void save_page(struct page *fpage)
 {
-	list_add(&fpage->lru, &deferred_pages);
+	if (!test_and_set_bit(PG_arch_1, &fpage->flags))
+		list_add(&fpage->lru, &deferred_pages);
 }
 
 /* 
@@ -129,9 +128,12 @@ __change_page_attr(unsigned long address
 	pte_t *kpte; 
 	struct page *kpte_page;
 	pgprot_t ref_prot2;
+
 	kpte = lookup_address(address);
 	if (!kpte) return 0;
 	kpte_page = virt_to_page(((unsigned long)kpte) & PAGE_MASK);
+	BUG_ON(PageLRU(kpte_page));
+	BUG_ON(PageCompound(kpte_page));
 	if (pgprot_val(prot) != pgprot_val(ref_prot)) { 
 		if (!pte_huge(*kpte)) {
 			set_pte(kpte, pfn_pte(pfn, prot));
@@ -159,10 +161,9 @@ __change_page_attr(unsigned long address
 	/* on x86-64 the direct mapping set at boot is not using 4k pages */
  	BUG_ON(PageReserved(kpte_page));
 
-	if (page_private(kpte_page) == 0) {
-		save_page(kpte_page);
+	save_page(kpte_page);
+	if (page_private(kpte_page) == 0)
 		revert_page(address, ref_prot);
- 	}
 	return 0;
 } 
 
@@ -234,6 +235,10 @@ void global_flush_tlb(void)
 	flush_map(&l);
 
 	list_for_each_entry_safe(pg, next, &l, lru) {
+		list_del(&pg->lru);
+		clear_bit(PG_arch_1, &pg->flags);
+		if (page_private(pg) != 0)
+			continue;
 		ClearPagePrivate(pg);
 		__free_page(pg);
 	} 
Index: linux/arch/i386/mm/pageattr.c
===================================================================
--- linux.orig/arch/i386/mm/pageattr.c
+++ linux/arch/i386/mm/pageattr.c
@@ -82,7 +82,7 @@ static void flush_kernel_map(void *arg)
 	struct page *p;
 
 	/* High level code is not ready for clflush yet */
-	if (0 && cpu_has_clflush) {
+	if (cpu_has_clflush) {
 		list_for_each_entry (p, lh, lru)
 			cache_flush_page(p);
 	} else if (boot_cpu_data.x86_model >= 4)
@@ -136,6 +136,12 @@ static inline void revert_page(struct pa
 			    ref_prot));
 }
 
+static inline void save_page(struct page *kpte_page)
+{
+	if (!test_and_set_bit(PG_arch_1, &kpte_page->flags))
+		list_add(&kpte_page->lru, &df_list);
+}
+
 static int
 __change_page_attr(struct page *page, pgprot_t prot)
 { 
@@ -150,6 +156,9 @@ __change_page_attr(struct page *page, pg
 	if (!kpte)
 		return -EINVAL;
 	kpte_page = virt_to_page(kpte);
+	BUG_ON(PageLRU(kpte_page));
+	BUG_ON(PageCompound(kpte_page));
+
 	if (pgprot_val(prot) != pgprot_val(PAGE_KERNEL)) { 
 		if (!pte_huge(*kpte)) {
 			set_pte_atomic(kpte, mk_pte(page, prot)); 
@@ -179,11 +188,11 @@ __change_page_attr(struct page *page, pg
 	 * time (not via split_large_page) and in turn we must not
 	 * replace it with a largepage.
 	 */
+
+	save_page(kpte_page);
 	if (!PageReserved(kpte_page)) {
 		if (cpu_has_pse && (page_private(kpte_page) == 0)) {
-			ClearPagePrivate(kpte_page);
 			paravirt_release_pt(page_to_pfn(kpte_page));
-			list_add(&kpte_page->lru, &df_list);
 			revert_page(kpte_page, address);
 		}
 	}
@@ -236,6 +245,11 @@ void global_flush_tlb(void)
 	spin_unlock_irq(&cpa_lock);
 	flush_map(&l);
 	list_for_each_entry_safe(pg, next, &l, lru) {
+		list_del(&pg->lru);
+		clear_bit(PG_arch_1, &pg->flags);
+		if (PageReserved(pg) || !cpu_has_pse || page_private(pg) != 0)
+			continue;
+		ClearPagePrivate(pg);
 		__free_page(pg);
 	}
 }

_______________________________________________
patches mailing list
patches@x86-64.org 
https://www.x86-64.org/mailman/listinfo/patches 


^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [patches] [PATCH] [1/58] x86: Always flush pages in change_page_attr
  2007-08-06 10:15   ` [patches] " Jan Beulich
@ 2007-08-06 10:36     ` Andi Kleen
  2007-08-06 10:49       ` Jan Beulich
  0 siblings, 1 reply; 119+ messages in thread
From: Andi Kleen @ 2007-08-06 10:36 UTC (permalink / raw)
  To: Jan Beulich; +Cc: linux-kernel, patches

On Monday 06 August 2007 12:15:01 Jan Beulich wrote:
> But that is still wrong - you're again flushing the page table page rather than
> the data one. Fixing this was the purpose of the patch I had sent, plus the
> broken reference counting used by the reversion logic. Jan

True. The problem is that we can't necessarily use the LRU list_head of the data
pages though; e.g. when the page is mapped to user space.

I guess we might need to go back to wbinvd again.

What was the remaining problem with the reference counting?

-Andi


^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [PATCH] [34/58] x86_64: ia32entry adjustments
  2007-07-19 14:46   ` Jeff Garzik
@ 2007-08-06 10:43     ` Jan Beulich
  0 siblings, 0 replies; 119+ messages in thread
From: Jan Beulich @ 2007-08-06 10:43 UTC (permalink / raw)
  To: Jeff Garzik; +Cc: Andrew Morton, Andi Kleen, linux-kernel, patches

>>> Jeff Garzik <jeff@garzik.org> 19.07.07 16:46 >>>
>Andi Kleen wrote:
>> From: "Jan Beulich" <jbeulich@novell.com>
>> Consolidate the three 32-bit system call entry points so that they all
>> treat registers in similar ways.
>> 
>> Signed-off-by: Jan Beulich <jbeulich@novell.com>
>> Signed-off-by: Andi Kleen <ak@suse.de>
>> 
>>  arch/x86_64/ia32/ia32entry.S |    5 +++--
>>  1 file changed, 3 insertions(+), 2 deletions(-)
>> 
>> Index: linux/arch/x86_64/ia32/ia32entry.S
>> ===================================================================
>> --- linux.orig/arch/x86_64/ia32/ia32entry.S
>> +++ linux/arch/x86_64/ia32/ia32entry.S
>> @@ -104,7 +104,7 @@ ENTRY(ia32_sysenter_target)
>>  	pushq	%rax
>>  	CFI_ADJUST_CFA_OFFSET 8
>>  	cld
>> -	SAVE_ARGS 0,0,0
>> +	SAVE_ARGS 0,0,1
>>   	/* no need to do an access_ok check here because rbp has been
>>   	   32bit zero extended */ 
>>  1:	movl	(%rbp),%r9d
>> @@ -294,7 +294,7 @@ ia32_badarg:
>>   */ 				
>>  
>>  ENTRY(ia32_syscall)
>> -	CFI_STARTPROC	simple
>> +	CFI_STARTPROC32	simple
>>  	CFI_SIGNAL_FRAME
>>  	CFI_DEF_CFA	rsp,SS+8-RIP
>>  	/*CFI_REL_OFFSET	ss,SS-RIP*/
>> @@ -330,6 +330,7 @@ ia32_sysret:
>>  
>>  ia32_tracesys:			 
>>  	SAVE_REST
>> +	CLEAR_RREGS
>>  	movq $-ENOSYS,RAX(%rsp)	/* really needed? */
>>  	movq %rsp,%rdi        /* &pt_regs -> arg1 */
>>  	call syscall_trace_enter
>
>More comments and/or a less vague patch description would be nice.
>
>What registers?  What behavior is being made common?  Why?

I think the description says this quite well - which registers are being saved/
cleared is being made consistent (not common).

Jan


^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [patches] [PATCH] [1/58] x86: Always flush pages in change_page_attr
  2007-08-06 10:36     ` Andi Kleen
@ 2007-08-06 10:49       ` Jan Beulich
  0 siblings, 0 replies; 119+ messages in thread
From: Jan Beulich @ 2007-08-06 10:49 UTC (permalink / raw)
  To: Andi Kleen; +Cc: linux-kernel, patches

>>> Andi Kleen <ak@suse.de> 06.08.07 12:36 >>>
>On Monday 06 August 2007 12:15:01 Jan Beulich wrote:
>> But that is still wrong - you're again flushing the page table page rather than
>> the data one. Fixing this was the purpose of the patch I had sent, plus the
>> broken reference counting used by the reversion logic. Jan
>
>True. The problem is that we can't necessarily use the LRU list_head of the data
>pages though; e.g. when the page is mapped to user space.
>
>I guess we might need to go back to wbinvd again.

That was what my patch did, plus an attempt to avoid the wbinvd if all accumulated
pages that have pending modifications are contiguous.

>What was the remaining problem of the reference counting? 

The counter gets adjusted regardless of the attribute currently in effect. E.g. if you
change a page to PAGE_KERNEL that already happens to be PAGE_KERNEL, the
counter still gets decremented, which in turn may result in reverting the containing
2M/4M page prematurely. Likewise, if a non-PAGE_KERNEL page gets changed to another
non-PAGE_KERNEL attribute, the counter gets incremented, likely preventing the
large page from ever being reverted.
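
A minimal sketch of the counting rule being described (hypothetical snippet,
not the actual pageattr.c code; old_prot/new_prot stand for the pte's previous
and requested protections): the per-pagetable-page counter should only move
when a pte actually transitions between the default protection and a
non-default one, not on every __change_page_attr() call:

	int was_default = (pgprot_val(old_prot) == pgprot_val(PAGE_KERNEL));
	int is_default  = (pgprot_val(new_prot) == pgprot_val(PAGE_KERNEL));

	if (was_default && !is_default)
		/* one more non-default pte inside this 2M/4M mapping */
		set_page_private(kpte_page, page_private(kpte_page) + 1);
	else if (!was_default && is_default)
		/* one fewer; when it reaches zero the large page can be reverted */
		set_page_private(kpte_page, page_private(kpte_page) - 1);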

Jan


^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [PATCH] [12/58] x86_64: Add vDSO for x86-64 with gettimeofday/clock_gettime/getcpu
  2007-07-19  9:54 ` [PATCH] [12/58] x86_64: Add vDSO for x86-64 with gettimeofday/clock_gettime/getcpu Andi Kleen
@ 2007-08-21 16:25   ` Daniel Walker
  2007-08-21 18:45     ` Andi Kleen
  0 siblings, 1 reply; 119+ messages in thread
From: Daniel Walker @ 2007-08-21 16:25 UTC (permalink / raw)
  To: Andi Kleen; +Cc: patches, linux-kernel, akpm

On Thu, 2007-07-19 at 11:54 +0200, Andi Kleen wrote:
> +
> +static noinline int do_realtime(struct timespec *ts)
> +{
> +       unsigned long seq, ns;
> +       do {
> +               seq = read_seqbegin(&gtod->lock);
> +               ts->tv_sec = gtod->wall_time_sec;
> +               ts->tv_nsec = gtod->wall_time_nsec;
> +               ns = vgetns();
> +       } while (unlikely(read_seqretry(&gtod->lock, seq)));
> +       timespec_add_ns(ts, ns);
> +       return 0;
> +} 

Some thoughts:

In the -mm kernel there is some debugging that gets injected into the
likely/unlikely macros. If they get called from userspace it causes a
hang. We might want to add a new set of macros to specifically
denote that they can be called from userspace, and not just for
likely/unlikely but for all such macros, so we don't get mixed usage.

Daniel


^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [PATCH] [12/58] x86_64: Add vDSO for x86-64 with gettimeofday/clock_gettime/getcpu
  2007-08-21 18:45     ` Andi Kleen
@ 2007-08-21 18:40       ` Andrew Morton
  0 siblings, 0 replies; 119+ messages in thread
From: Andrew Morton @ 2007-08-21 18:40 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Daniel Walker, patches, linux-kernel

On Tue, 21 Aug 2007 20:45:43 +0200 Andi Kleen <ak@suse.de> wrote:

> > In the -mm kernel there is some debugging that gets injected into the
> > likely/unlikely macros .. If they get called from userspace it causes a
> 
> They should likely define a __likely()/__unlikely() then that doesn't
> do this.
> 
> > hang .. We might want to add some new set of macros to specifically
> > denote that they are called from userspace, not just likely/unlikely but
> > all the macros so we don't get mixed usage ..
> 
> and add a hunk to change the vDSO code. Note that i386 is not the 
> only architecture that has such code.
> 

Yes, the simplest fix would be to remove all the troublesome likely/unlikely
calls within that debug patch. I'll take a look at that and see if it fixes
the compile.

^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [PATCH] [12/58] x86_64: Add vDSO for x86-64 with gettimeofday/clock_gettime/getcpu
  2007-08-21 16:25   ` Daniel Walker
@ 2007-08-21 18:45     ` Andi Kleen
  2007-08-21 18:40       ` Andrew Morton
  0 siblings, 1 reply; 119+ messages in thread
From: Andi Kleen @ 2007-08-21 18:45 UTC (permalink / raw)
  To: Daniel Walker; +Cc: Andi Kleen, patches, linux-kernel, akpm

> In the -mm kernel there is some debugging that gets injected into the
> likely/unlikely macros .. If they get called from userspace it causes a

Then they should define a __likely()/__unlikely() that doesn't
do this.
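
Something along these lines, presumably (a sketch only; __likely/__unlikely
are not an existing kernel API at this point, just the plain branch hints
without the -mm instrumentation):

#define __likely(x)	__builtin_expect(!!(x), 1)
#define __unlikely(x)	__builtin_expect(!!(x), 0)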

> hang .. We might want to add some new set of macros to specifically
> denote that they are called from userspace, not just likely/unlikely but
> all the macros so we don't get mixed usage ..

and add a hunk to change the vDSO code. Note that i386 is not the 
only architecture that has such code.

-Andi

^ permalink raw reply	[flat|nested] 119+ messages in thread

end of thread, other threads:[~2007-08-21 18:41 UTC | newest]

Thread overview: 119+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-07-19  9:54 [PATCH] [0/58] First batch of x86 patches for .23 Andi Kleen
2007-07-19  9:54 ` [PATCH] [1/58] x86: Always flush pages in change_page_attr Andi Kleen
2007-08-06 10:15   ` [patches] " Jan Beulich
2007-08-06 10:36     ` Andi Kleen
2007-08-06 10:49       ` Jan Beulich
2007-07-19  9:54 ` [PATCH] [2/58] x86_64: Tell gcc to only align stack to 8 bytes Andi Kleen
2007-07-19 11:50   ` Serge Belyshev
2007-07-19 12:06     ` Andi Kleen
2007-07-19 14:42   ` Chuck Ebbert
2007-07-19  9:54 ` [PATCH] [3/58] x86_64: asm/ptrace.h needs linux/compiler.h Andi Kleen
2007-07-19  9:54 ` [PATCH] [4/58] x86_64: Don't rely on a unique IO-APIC ID Andi Kleen
2007-07-19  9:54 ` [PATCH] [5/58] x86_64: Report the pending irq if available in smp_affinity Andi Kleen
2007-07-19 10:23   ` Ingo Molnar
2007-07-19  9:54 ` [PATCH] [6/58] x86_64: Use LOCAL_DISTANCE and REMOTE_DISTANCE in x86_64 ACPI code Andi Kleen
2007-07-19  9:54 ` [PATCH] [7/58] x86_64: various cleanups in NUMA scan node Andi Kleen
2007-07-19 17:15   ` Yinghai Lu
2007-07-19 17:21     ` Andi Kleen
2007-07-19 17:38       ` Yinghai Lu
2007-07-19 20:00         ` Andi Kleen
2007-07-19 21:01     ` David Rientjes
2007-07-19  9:54 ` [PATCH] [8/58] x86_64: Use string instruction memcpy/memset on AMD Fam10 Andi Kleen
2007-07-19 16:43   ` Jan Engelhardt
2007-07-19 17:00     ` Yinghai Lu
2007-07-19  9:54 ` [PATCH] [9/58] x86_64: Always use builtin memcpy on gcc 4.3 Andi Kleen
2007-07-21 23:16   ` Oleg Verych
2007-07-21 23:27     ` Andi Kleen
2007-07-22  0:29     ` Denis Vlasenko
2007-07-19  9:54 ` [PATCH] [10/58] i386: Move all simple string operations out of line Andi Kleen
2007-07-19  9:54 ` [PATCH] [11/58] x86: Support __attribute__((__cold__)) in gcc 4.3 Andi Kleen
2007-07-19  9:54 ` [PATCH] [12/58] x86_64: Add vDSO for x86-64 with gettimeofday/clock_gettime/getcpu Andi Kleen
2007-08-21 16:25   ` Daniel Walker
2007-08-21 18:45     ` Andi Kleen
2007-08-21 18:40       ` Andrew Morton
2007-07-19  9:54 ` [PATCH] [13/58] x86: Separate checking of unsynchronized and unstable TSC Andi Kleen
2007-07-19  9:54 ` [PATCH] [14/58] x86_64: Add on_cpu_single Andi Kleen
2007-07-19 11:09   ` Satyam Sharma
2007-07-19 12:07     ` Andi Kleen
2007-07-19  9:54 ` [PATCH] [15/58] i386: Rewrite sched_clock Andi Kleen
2007-07-19 16:51   ` Daniel Walker
2007-07-19 17:13     ` Andi Kleen
2007-07-19 17:15       ` Daniel Walker
2007-07-19 17:22         ` Andi Kleen
2007-07-19 17:31           ` Daniel Walker
2007-07-19 17:38             ` Andi Kleen
2007-07-19 17:43               ` Daniel Walker
2007-07-19 18:00                 ` Andi Kleen
2007-07-19 18:00                   ` Daniel Walker
2007-07-20  3:11     ` Mathieu Desnoyers
2007-07-20  3:47       ` Mathieu Desnoyers
2007-07-20  4:18         ` [PATCH] [15/58] i386: Rewrite sched_clock (cmpxchg8b) Mathieu Desnoyers
2007-07-20  5:07           ` Nick Piggin
2007-07-20  5:47             ` Mathieu Desnoyers
2007-07-20  8:27       ` [PATCH] [15/58] i386: Rewrite sched_clock Andi Kleen
2007-07-20 14:12         ` Mathieu Desnoyers
2007-07-20 14:39           ` Mathieu Desnoyers
2007-07-20 15:14           ` Andi Kleen
2007-07-20 15:22             ` Mathieu Desnoyers
2007-07-20 16:49             ` [PATCH] 80386 and 80486 cmpxchg64 and cmpxchg64_local fallback Mathieu Desnoyers
2007-07-19  9:55 ` [PATCH] [16/58] x86_64: Use new shared sched_clock in x86-64 too Andi Kleen
2007-07-19  9:55 ` [PATCH] [17/58] i386: Add L3 cache support to AMD CPUID4 emulation Andi Kleen
2007-07-20 17:00   ` [patches] " Andreas Herrmann
2007-07-20 17:15   ` Andreas Herrmann
2007-07-19  9:55 ` [PATCH] [18/58] x86_64: remove extra extern declaring about dmi_ioremap Andi Kleen
2007-07-19  9:55 ` [PATCH] [19/58] x86_64: Don't use softirq save locks in smp_call_function Andi Kleen
2007-07-19 12:16   ` Satyam Sharma
2007-07-19 12:19     ` Andi Kleen
2007-07-19  9:55 ` [PATCH] [20/58] x86: Always probe the NMI watchdog Andi Kleen
2007-07-19 10:24   ` Björn Steinbrink
2007-07-19 10:42     ` Andi Kleen
2007-07-19  9:55 ` [PATCH] [21/58] i386: Reserve the right performance counter for the Intel PerfMon " Andi Kleen
2007-07-19 10:21   ` Björn Steinbrink
2007-07-19 10:45     ` Andi Kleen
2007-07-19  9:55 ` [PATCH] [22/58] x86_64: hpet tsc calibration fix broken smi detection logic Andi Kleen
2007-07-19  9:55 ` [PATCH] [23/58] i386: remove pit_interrupt_hook Andi Kleen
2007-07-19  9:55 ` [PATCH] [24/58] x86_64: Untangle asm/hpet.h from asm/timex.h Andi Kleen
2007-07-19  9:55 ` [PATCH] [25/58] x86_64: use generic cmos update Andi Kleen
2007-07-19  9:55 ` [PATCH] [26/58] x86_64: Use generic xtime init Andi Kleen
2007-07-19  9:55 ` [PATCH] [27/58] x86_64: Remove dead code and other janitor work in tsc.c Andi Kleen
2007-07-19  9:55 ` [PATCH] [28/58] x86_64: Fix APIC typo Andi Kleen
2007-07-19  9:55 ` [PATCH] [29/58] x86_64: fiuxp pt_reqs leftovers Andi Kleen
2007-07-19  9:55 ` [PATCH] [30/58] x86: share hpet.h with i386 Andi Kleen
2007-07-19  9:55 ` [PATCH] [31/58] x86_64: apic.c coding style janitor work Andi Kleen
2007-07-19  9:55 ` [PATCH] [32/58] x86_64: time.c white space wreckage cleanup Andi Kleen
2007-07-19  9:55 ` [PATCH] [33/58] x86_64: Avoid too many remote cpu references due to /proc/stat Andi Kleen
2007-07-19 10:21   ` Christoph Hellwig
2007-07-19 10:41     ` Andi Kleen
2007-07-19 10:55       ` Adrian Bunk
2007-07-19  9:55 ` [PATCH] [34/58] x86_64: ia32entry adjustments Andi Kleen
2007-07-19 14:46   ` Jeff Garzik
2007-08-06 10:43     ` Jan Beulich
2007-07-19  9:55 ` [PATCH] [35/58] i386: allow debuggers to access the vsyscall page with compat vDSO Andi Kleen
2007-07-19  9:55 ` [PATCH] [36/58] x86_64: minor exception trace variables cleanup Andi Kleen
2007-07-19  9:55 ` [PATCH] [37/58] x86_64: remove unused variable maxcpus Andi Kleen
2007-07-19  9:55 ` [PATCH] [38/58] i386: smp-alt-once option is only useful with HOTPLUG_CPU Andi Kleen
2007-07-19  9:55 ` [PATCH] [39/58] i386: minor nx handling adjustment Andi Kleen
2007-07-19  9:55 ` [PATCH] [40/58] i386: remapped_pgdat_init() static Andi Kleen
2007-07-19  9:55 ` [PATCH] [41/58] i386: arch/i386/kernel/i8253.c should #include <asm/timer.h> Andi Kleen
2007-07-19  9:55 ` [PATCH] [42/58] i386: timer_irq_works() static again Andi Kleen
2007-07-19  9:55 ` [PATCH] [43/58] x86_64: Quicklist support for x86_64 Andi Kleen
2007-07-19  9:55 ` [PATCH] [44/58] x86_64: extract helper function from e820_register_active_regions Andi Kleen
2007-07-19  9:55 ` [PATCH] [45/58] x86_64: fake pxm-to-node mapping for fake numa Andi Kleen
2007-07-19  9:55 ` [PATCH] [46/58] x86_64: fake apicid_to_node " Andi Kleen
2007-07-19  9:55 ` [PATCH] [47/58] i386: insert unclaimed MMCONFIG resources Andi Kleen
2007-07-19  9:55 ` [PATCH] [48/58] x86_64: O_EXCL on /dev/mcelog Andi Kleen
2007-07-19  9:55 ` [PATCH] [49/58] x86_64: support poll() " Andi Kleen
2007-07-19  9:55 ` [PATCH] [50/58] x86_64: mcelog tolerant level cleanup Andi Kleen
2007-07-19  9:55 ` [PATCH] [51/58] i386: fix machine rebooting Andi Kleen
2007-07-19  9:55 ` [PATCH] [52/58] i386: fix section mismatch warnings in mtrr Andi Kleen
2007-07-19  9:55 ` [PATCH] [53/58] x86: PM_TRACE support Andi Kleen
2007-07-19  9:55 ` [PATCH] [54/58] x86: Make Alt-SysRq-p display the debug register contents Andi Kleen
2007-07-19  9:55 ` [PATCH] [55/58] i386: add reference to the arguments Andi Kleen
2007-07-19  9:55 ` [PATCH] [56/58] x86: round_jiffies() for i386 and x86-64 non-critical/corrected MCE polling Andi Kleen
2007-07-19  9:55 ` [PATCH] [57/58] x86_64: check remote IRR bit before migrating level triggered irq Andi Kleen
2007-07-19  9:55 ` [PATCH] [58/58] x86: remove support for the Rise CPU Andi Kleen
2007-07-19 10:45   ` Alan Cox
2007-07-19 10:48     ` Adrian Bunk
2007-07-19 11:13       ` Alan Cox
2007-07-19 12:03         ` Andi Kleen
2007-07-19 14:56           ` Jeff Garzik

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).