All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/2] Define percpu smp cacheline align interface
       [not found] <33E1C72C74DBE747B7B59C1740F7443701A2F0AB@orsmsx417.amr.corp.intel.com>
@ 2007-05-05  0:12 ` Fenghua Yu
  2007-05-07 22:58   ` Fenghua Yu
  2007-05-05  0:12 ` [PATCH 0/2] Add percpu smp cacheline align section Fenghua Yu
  2007-05-05  0:12 ` [PATCH 2/2] Call percpu smp cacheline algin interface Fenghua Yu
  2 siblings, 1 reply; 20+ messages in thread
From: Fenghua Yu @ 2007-05-05  0:12 UTC (permalink / raw)
  To: akpm, suresh.b.siddha, linux-kernel; +Cc: Fenghua Yu


Define percpu smp cacheline align interface

Signed-off-by: Fenghua Yu <fenghua.yu@intel.com>
Acked-by: Suresh Siddha <suresh.b.siddha@intel.com>

diff -Nurp linux-2.6.21-rc7.0/arch/alpha/kernel/vmlinux.lds.S linux-2.6.21-rc7.1/arch/alpha/kernel/vmlinux.lds.S
--- linux-2.6.21-rc7.0/arch/alpha/kernel/vmlinux.lds.S	2007-05-01 07:32:58.000000000 -0700
+++ linux-2.6.21-rc7.1/arch/alpha/kernel/vmlinux.lds.S	2007-05-02 17:00:29.000000000 -0700
@@ -69,10 +69,7 @@ SECTIONS
   . = ALIGN(8);
   SECURITY_INIT
 
-  . = ALIGN(8192);
-  __per_cpu_start = .;
-  .data.percpu : { *(.data.percpu) }
-  __per_cpu_end = .;
+  PERCPU(8192)
 
   . = ALIGN(2*8192);
   __init_end = .;
diff -Nurp linux-2.6.21-rc7.0/arch/arm/kernel/vmlinux.lds.S linux-2.6.21-rc7.1/arch/arm/kernel/vmlinux.lds.S
--- linux-2.6.21-rc7.0/arch/arm/kernel/vmlinux.lds.S	2007-05-01 07:32:58.000000000 -0700
+++ linux-2.6.21-rc7.1/arch/arm/kernel/vmlinux.lds.S	2007-05-02 17:00:29.000000000 -0700
@@ -59,10 +59,7 @@ SECTIONS
 			usr/built-in.o(.init.ramfs)
 		__initramfs_end = .;
 #endif
-		. = ALIGN(4096);
-		__per_cpu_start = .;
-			*(.data.percpu)
-		__per_cpu_end = .;
+		PERCPU(4096)
 #ifndef CONFIG_XIP_KERNEL
 		__init_begin = _stext;
 		*(.init.data)
diff -Nurp linux-2.6.21-rc7.0/arch/cris/arch-v32/vmlinux.lds.S linux-2.6.21-rc7.1/arch/cris/arch-v32/vmlinux.lds.S
--- linux-2.6.21-rc7.0/arch/cris/arch-v32/vmlinux.lds.S	2007-05-01 07:32:59.000000000 -0700
+++ linux-2.6.21-rc7.1/arch/cris/arch-v32/vmlinux.lds.S	2007-05-02 17:00:29.000000000 -0700
@@ -92,10 +92,7 @@ SECTIONS
 	}
 	SECURITY_INIT
 
-	. =  ALIGN (8192);
-	__per_cpu_start = .;
-	.data.percpu  : { *(.data.percpu) }
-	__per_cpu_end = .;
+	PERCPU(8192)
 
 #ifdef CONFIG_BLK_DEV_INITRD
 	.init.ramfs : {
diff -Nurp linux-2.6.21-rc7.0/arch/frv/kernel/vmlinux.lds.S linux-2.6.21-rc7.1/arch/frv/kernel/vmlinux.lds.S
--- linux-2.6.21-rc7.0/arch/frv/kernel/vmlinux.lds.S	2007-05-01 07:32:59.000000000 -0700
+++ linux-2.6.21-rc7.1/arch/frv/kernel/vmlinux.lds.S	2007-05-02 17:00:29.000000000 -0700
@@ -57,10 +57,7 @@ SECTIONS
   __alt_instructions_end = .;
  .altinstr_replacement : { *(.altinstr_replacement) }
 
-  . = ALIGN(4096);
-  __per_cpu_start = .;
-  .data.percpu  : { *(.data.percpu) }
-  __per_cpu_end = .;
+  PERCPU(4096)
 
 #ifdef CONFIG_BLK_DEV_INITRD
   . = ALIGN(4096);
diff -Nurp linux-2.6.21-rc7.0/arch/i386/kernel/vmlinux.lds.S linux-2.6.21-rc7.1/arch/i386/kernel/vmlinux.lds.S
--- linux-2.6.21-rc7.0/arch/i386/kernel/vmlinux.lds.S	2007-05-01 07:32:59.000000000 -0700
+++ linux-2.6.21-rc7.1/arch/i386/kernel/vmlinux.lds.S	2007-05-02 17:00:29.000000000 -0700
@@ -185,12 +185,7 @@ SECTIONS
 	__initramfs_end = .;
   }
 #endif
-  . = ALIGN(4096);
-  .data.percpu  : AT(ADDR(.data.percpu) - LOAD_OFFSET) {
-	__per_cpu_start = .;
-	*(.data.percpu)
-	__per_cpu_end = .;
-  }
+  PERCPU(4096)
   . = ALIGN(4096);
   /* freed after init ends here */
 	
diff -Nurp linux-2.6.21-rc7.0/arch/ia64/kernel/vmlinux.lds.S linux-2.6.21-rc7.1/arch/ia64/kernel/vmlinux.lds.S
--- linux-2.6.21-rc7.0/arch/ia64/kernel/vmlinux.lds.S	2007-05-01 07:32:59.000000000 -0700
+++ linux-2.6.21-rc7.1/arch/ia64/kernel/vmlinux.lds.S	2007-05-02 17:00:29.000000000 -0700
@@ -206,6 +206,7 @@ SECTIONS
 	{
 		__per_cpu_start = .;
 		*(.data.percpu)
+		*(.data.percpu.cacheline_aligned_in_smp)
 		__per_cpu_end = .;
 	}
   . = __phys_per_cpu_start + PERCPU_PAGE_SIZE;	/* ensure percpu data fits
diff -Nurp linux-2.6.21-rc7.0/arch/m32r/kernel/vmlinux.lds.S linux-2.6.21-rc7.1/arch/m32r/kernel/vmlinux.lds.S
--- linux-2.6.21-rc7.0/arch/m32r/kernel/vmlinux.lds.S	2007-05-01 07:32:59.000000000 -0700
+++ linux-2.6.21-rc7.1/arch/m32r/kernel/vmlinux.lds.S	2007-05-02 17:00:29.000000000 -0700
@@ -111,10 +111,7 @@ SECTIONS
   __initramfs_end = .;
 #endif
 
-  . = ALIGN(4096);
-  __per_cpu_start = .;
-  .data.percpu  : { *(.data.percpu) }
-  __per_cpu_end = .;
+  PERCPU(4096)
   . = ALIGN(4096);
   __init_end = .;
   /* freed after init ends here */
diff -Nurp linux-2.6.21-rc7.0/arch/mips/kernel/vmlinux.lds.S linux-2.6.21-rc7.1/arch/mips/kernel/vmlinux.lds.S
--- linux-2.6.21-rc7.0/arch/mips/kernel/vmlinux.lds.S	2007-05-01 07:32:59.000000000 -0700
+++ linux-2.6.21-rc7.1/arch/mips/kernel/vmlinux.lds.S	2007-05-02 17:00:29.000000000 -0700
@@ -121,10 +121,7 @@ SECTIONS
   .init.ramfs : { *(.init.ramfs) }
   __initramfs_end = .;
 #endif
-  . = ALIGN(_PAGE_SIZE);
-  __per_cpu_start = .;
-  .data.percpu  : { *(.data.percpu) }
-  __per_cpu_end = .;
+  PERCPU(_PAGE_SIZE)
   . = ALIGN(_PAGE_SIZE);
   __init_end = .;
   /* freed after init ends here */
diff -Nurp linux-2.6.21-rc7.0/arch/parisc/kernel/vmlinux.lds.S linux-2.6.21-rc7.1/arch/parisc/kernel/vmlinux.lds.S
--- linux-2.6.21-rc7.0/arch/parisc/kernel/vmlinux.lds.S	2007-05-01 07:32:59.000000000 -0700
+++ linux-2.6.21-rc7.1/arch/parisc/kernel/vmlinux.lds.S	2007-05-02 17:00:29.000000000 -0700
@@ -182,10 +182,9 @@ SECTIONS
   .init.ramfs : { *(.init.ramfs) }
   __initramfs_end = .;
 #endif
-  . = ALIGN(ASM_PAGE_SIZE);
-  __per_cpu_start = .;
-  .data.percpu  : { *(.data.percpu) }
-  __per_cpu_end = .;
+
+  PERCPU(ASM_PAGE_SIZE)
+
   . = ALIGN(ASM_PAGE_SIZE);
   __init_end = .;
   /* freed after init ends here */
diff -Nurp linux-2.6.21-rc7.0/arch/powerpc/kernel/vmlinux.lds.S linux-2.6.21-rc7.1/arch/powerpc/kernel/vmlinux.lds.S
--- linux-2.6.21-rc7.0/arch/powerpc/kernel/vmlinux.lds.S	2007-05-01 07:32:59.000000000 -0700
+++ linux-2.6.21-rc7.1/arch/powerpc/kernel/vmlinux.lds.S	2007-05-02 17:00:29.000000000 -0700
@@ -139,12 +139,7 @@ SECTIONS
 		__initramfs_end = .;
 	}
 #endif
-	. = ALIGN(PAGE_SIZE);
-	.data.percpu : {
-		__per_cpu_start = .;
-		*(.data.percpu)
-		__per_cpu_end = .;
-	}
+	PERCPU(PAGE_SIZE)
 
 	. = ALIGN(8);
 	.machine.desc : {
diff -Nurp linux-2.6.21-rc7.0/arch/ppc/kernel/vmlinux.lds.S linux-2.6.21-rc7.1/arch/ppc/kernel/vmlinux.lds.S
--- linux-2.6.21-rc7.0/arch/ppc/kernel/vmlinux.lds.S	2007-05-01 07:32:59.000000000 -0700
+++ linux-2.6.21-rc7.1/arch/ppc/kernel/vmlinux.lds.S	2007-05-02 17:00:29.000000000 -0700
@@ -131,10 +131,7 @@ SECTIONS
   __ftr_fixup : { *(__ftr_fixup) }
   __stop___ftr_fixup = .;
 
-  . = ALIGN(4096);
-  __per_cpu_start = .;
-  .data.percpu  : { *(.data.percpu) }
-  __per_cpu_end = .;
+  PERCPU(4096)
 
 #ifdef CONFIG_BLK_DEV_INITRD
   . = ALIGN(4096);
diff -Nurp linux-2.6.21-rc7.0/arch/s390/kernel/vmlinux.lds.S linux-2.6.21-rc7.1/arch/s390/kernel/vmlinux.lds.S
--- linux-2.6.21-rc7.0/arch/s390/kernel/vmlinux.lds.S	2007-05-01 07:32:59.000000000 -0700
+++ linux-2.6.21-rc7.1/arch/s390/kernel/vmlinux.lds.S	2007-05-02 17:00:29.000000000 -0700
@@ -108,10 +108,7 @@ SECTIONS
   . = ALIGN(2);
   __initramfs_end = .;
 #endif
-  . = ALIGN(4096);
-  __per_cpu_start = .;
-  .data.percpu  : { *(.data.percpu) }
-  __per_cpu_end = .;
+  PERCPU(4096)
   . = ALIGN(4096);
   __init_end = .;
   /* freed after init ends here */
diff -Nurp linux-2.6.21-rc7.0/arch/sh/kernel/vmlinux.lds.S linux-2.6.21-rc7.1/arch/sh/kernel/vmlinux.lds.S
--- linux-2.6.21-rc7.0/arch/sh/kernel/vmlinux.lds.S	2007-05-01 07:32:59.000000000 -0700
+++ linux-2.6.21-rc7.1/arch/sh/kernel/vmlinux.lds.S	2007-05-02 17:00:29.000000000 -0700
@@ -61,10 +61,7 @@ SECTIONS
   . = ALIGN(PAGE_SIZE);
   __nosave_end = .;
 
-  . = ALIGN(PAGE_SIZE);
-  __per_cpu_start = .;
-  .data.percpu : { *(.data.percpu) }
-  __per_cpu_end = .;
+  PERCPU(PAGE_SIZE)
   .data.cacheline_aligned : { *(.data.cacheline_aligned) }
 
   _edata = .;			/* End of data section */
diff -Nurp linux-2.6.21-rc7.0/arch/sh64/kernel/vmlinux.lds.S linux-2.6.21-rc7.1/arch/sh64/kernel/vmlinux.lds.S
--- linux-2.6.21-rc7.0/arch/sh64/kernel/vmlinux.lds.S	2007-05-01 07:32:59.000000000 -0700
+++ linux-2.6.21-rc7.1/arch/sh64/kernel/vmlinux.lds.S	2007-05-02 17:00:29.000000000 -0700
@@ -86,10 +86,7 @@ SECTIONS
   . = ALIGN(PAGE_SIZE);
   .data.page_aligned : C_PHYS(.data.page_aligned) { *(.data.page_aligned) }
 
-  . = ALIGN(PAGE_SIZE);
-  __per_cpu_start = .;
-  .data.percpu : C_PHYS(.data.percpu) { *(.data.percpu) }
-  __per_cpu_end = . ;
+  PERCPU(PAGE_SIZE)
   .data.cacheline_aligned : C_PHYS(.data.cacheline_aligned) { *(.data.cacheline_aligned) }
 
   _edata = .;			/* End of data section */
diff -Nurp linux-2.6.21-rc7.0/arch/sparc/kernel/vmlinux.lds.S linux-2.6.21-rc7.1/arch/sparc/kernel/vmlinux.lds.S
--- linux-2.6.21-rc7.0/arch/sparc/kernel/vmlinux.lds.S	2007-05-01 07:32:59.000000000 -0700
+++ linux-2.6.21-rc7.1/arch/sparc/kernel/vmlinux.lds.S	2007-05-02 17:00:29.000000000 -0700
@@ -66,10 +66,7 @@ SECTIONS
   __initramfs_end = .;
 #endif
 
-  . = ALIGN(4096);
-  __per_cpu_start = .;
-  .data.percpu  : { *(.data.percpu) }
-  __per_cpu_end = .;
+  PERCPU(4096)
   . = ALIGN(4096);
   __init_end = .;
   . = ALIGN(32);
diff -Nurp linux-2.6.21-rc7.0/arch/sparc64/kernel/vmlinux.lds.S linux-2.6.21-rc7.1/arch/sparc64/kernel/vmlinux.lds.S
--- linux-2.6.21-rc7.0/arch/sparc64/kernel/vmlinux.lds.S	2007-05-01 07:32:59.000000000 -0700
+++ linux-2.6.21-rc7.1/arch/sparc64/kernel/vmlinux.lds.S	2007-05-02 17:00:29.000000000 -0700
@@ -90,10 +90,7 @@ SECTIONS
   __initramfs_end = .;
 #endif
 
-  . = ALIGN(8192);
-  __per_cpu_start = .;
-  .data.percpu  : { *(.data.percpu) }
-  __per_cpu_end = .;
+  PERCPU(8192)
   . = ALIGN(8192);
   __init_end = .;
   __bss_start = .;
diff -Nurp linux-2.6.21-rc7.0/arch/x86_64/kernel/vmlinux.lds.S linux-2.6.21-rc7.1/arch/x86_64/kernel/vmlinux.lds.S
--- linux-2.6.21-rc7.0/arch/x86_64/kernel/vmlinux.lds.S	2007-05-01 07:32:59.000000000 -0700
+++ linux-2.6.21-rc7.1/arch/x86_64/kernel/vmlinux.lds.S	2007-05-02 17:00:29.000000000 -0700
@@ -209,10 +209,8 @@ SECTIONS
   __initramfs_end = .;
 #endif
 
-  . = ALIGN(4096);
-  __per_cpu_start = .;
-  .data.percpu  : AT(ADDR(.data.percpu) - LOAD_OFFSET) { *(.data.percpu) }
-  __per_cpu_end = .;
+  PERCPU(4096)
+
   . = ALIGN(4096);
 
   __init_end = .;
diff -Nurp linux-2.6.21-rc7.0/arch/xtensa/kernel/vmlinux.lds.S linux-2.6.21-rc7.1/arch/xtensa/kernel/vmlinux.lds.S
--- linux-2.6.21-rc7.0/arch/xtensa/kernel/vmlinux.lds.S	2007-05-01 07:32:59.000000000 -0700
+++ linux-2.6.21-rc7.1/arch/xtensa/kernel/vmlinux.lds.S	2007-05-02 17:00:29.000000000 -0700
@@ -198,10 +198,7 @@ SECTIONS
   __ftr_fixup : { *(__ftr_fixup) }
   __stop___ftr_fixup = .;
 
-  . = ALIGN(4096);
-  __per_cpu_start = .;
-  .data.percpu  : { *(.data.percpu) }
-  __per_cpu_end = .;
+  PERCPU(4096)
 
 #ifdef CONFIG_BLK_DEV_INITRD
   . = ALIGN(4096);
diff -Nurp linux-2.6.21-rc7.0/include/asm-generic/percpu.h linux-2.6.21-rc7.1/include/asm-generic/percpu.h
--- linux-2.6.21-rc7.0/include/asm-generic/percpu.h	2007-05-01 07:33:03.000000000 -0700
+++ linux-2.6.21-rc7.1/include/asm-generic/percpu.h	2007-05-04 11:15:26.000000000 -0700
@@ -14,6 +14,11 @@ extern unsigned long __per_cpu_offset[NR
 #define DEFINE_PER_CPU(type, name) \
     __attribute__((__section__(".data.percpu"))) __typeof__(type) per_cpu__##name
 
+#define DEFINE_PER_CPU_CACHELINE_ALIGNED_IN_SMP(type, name)		  \
+    __attribute__((__section__(".data.percpu.cacheline_aligned_in_smp"))) \
+    __typeof__(type) per_cpu__##name					  \
+    ____cacheline_aligned_in_smp
+
 /* var is in discarded region: offset to particular copy we want */
 #define per_cpu(var, cpu) (*({				\
 	extern int simple_identifier_##var(void);	\
@@ -34,6 +39,9 @@ do {								\
 #define DEFINE_PER_CPU(type, name) \
     __typeof__(type) per_cpu__##name
 
+#define DEFINE_PER_CPU_CACHELINE_ALIGNED_IN_SMP(type, name)	\
+    DEFINE_PER_CPU(type, name)
+
 #define per_cpu(var, cpu)			(*((void)(cpu), &per_cpu__##var))
 #define __get_cpu_var(var)			per_cpu__##var
 #define __raw_get_cpu_var(var)			per_cpu__##var
diff -Nurp linux-2.6.21-rc7.0/include/asm-generic/vmlinux.lds.h linux-2.6.21-rc7.1/include/asm-generic/vmlinux.lds.h
--- linux-2.6.21-rc7.0/include/asm-generic/vmlinux.lds.h	2007-05-01 07:33:03.000000000 -0700
+++ linux-2.6.21-rc7.1/include/asm-generic/vmlinux.lds.h	2007-05-02 17:00:29.000000000 -0700
@@ -265,3 +265,13 @@
   	*(.initcall6s.init)						\
   	*(.initcall7.init)						\
   	*(.initcall7s.init)
+
+#define PERCPU(align)							\
+  . = ALIGN(align);							\
+  __per_cpu_start = .;							\
+  .data.percpu  : AT(ADDR(.data.percpu) - LOAD_OFFSET) {		\
+	*(.data.percpu)							\
+	*(.data.percpu.cacheline_aligned_in_smp)			\
+  }									\
+  __per_cpu_end = .;
+
diff -Nurp linux-2.6.21-rc7.0/include/asm-i386/percpu.h linux-2.6.21-rc7.1/include/asm-i386/percpu.h
--- linux-2.6.21-rc7.0/include/asm-i386/percpu.h	2007-05-01 07:33:03.000000000 -0700
+++ linux-2.6.21-rc7.1/include/asm-i386/percpu.h	2007-05-02 17:00:29.000000000 -0700
@@ -54,6 +54,11 @@ extern unsigned long __per_cpu_offset[];
 #define DEFINE_PER_CPU(type, name) \
     __attribute__((__section__(".data.percpu"))) __typeof__(type) per_cpu__##name
 
+#define DEFINE_PER_CPU_CACHELINE_ALIGNED_IN_SMP(type, name)		  \
+    __attribute__((__section__(".data.percpu.cacheline_aligned_in_smp"))) \
+    __typeof__(type) per_cpu__##name					  \
+    ____cacheline_aligned_in_smp 
+
 /* We can use this directly for local CPU (faster). */
 DECLARE_PER_CPU(unsigned long, this_cpu_off);
 
diff -Nurp linux-2.6.21-rc7.0/include/asm-ia64/percpu.h linux-2.6.21-rc7.1/include/asm-ia64/percpu.h
--- linux-2.6.21-rc7.0/include/asm-ia64/percpu.h	2007-04-15 16:50:57.000000000 -0700
+++ linux-2.6.21-rc7.1/include/asm-ia64/percpu.h	2007-05-04 11:47:45.000000000 -0700
@@ -29,6 +29,16 @@
 	__attribute__((__section__(".data.percpu")))		\
 	__SMALL_ADDR_AREA __typeof__(type) per_cpu__##name
 
+#ifdef CONFIG_SMP
+#define DEFINE_PER_CPU_CACHELINE_ALIGNED_IN_SMP(type, name)		      \
+	__attribute__((__section__(".data.percpu.cacheline_aligned_in_smp"))) \
+	__SMALL_ADDR_AREA __typeof__(type) per_cpu__##name		      \
+	____cacheline_aligned_in_smp	
+#else
+#define DEFINE_PER_CPU_CACHELINE_ALIGNED_IN_SMP(type, name)	\
+	DEFINE_PER_CPU(type, name)
+#endif
+
 /*
  * Pretty much a literal copy of asm-generic/percpu.h, except that percpu_modcopy() is an
  * external routine, to avoid include-hell.
diff -Nurp linux-2.6.21-rc7.0/include/asm-powerpc/percpu.h linux-2.6.21-rc7.1/include/asm-powerpc/percpu.h
--- linux-2.6.21-rc7.0/include/asm-powerpc/percpu.h	2007-04-15 16:50:57.000000000 -0700
+++ linux-2.6.21-rc7.1/include/asm-powerpc/percpu.h	2007-05-04 11:19:37.000000000 -0700
@@ -20,6 +20,11 @@
 #define DEFINE_PER_CPU(type, name) \
     __attribute__((__section__(".data.percpu"))) __typeof__(type) per_cpu__##name
 
+#define DEFINE_PER_CPU_CACHELINE_ALIGNED_IN_SMP(type, name)		  \
+    __attribute__((__section__(".data.percpu.cacheline_aligned_in_smp"))) \
+    __typeof__(type) per_cpu__##name					  \
+    ____cacheline_aligned_in_smp 
+
 /* var is in discarded region: offset to particular copy we want */
 #define per_cpu(var, cpu) (*RELOC_HIDE(&per_cpu__##var, __per_cpu_offset(cpu)))
 #define __get_cpu_var(var) (*RELOC_HIDE(&per_cpu__##var, __my_cpu_offset()))
@@ -40,6 +45,8 @@ extern void setup_per_cpu_areas(void);
 
 #define DEFINE_PER_CPU(type, name) \
     __typeof__(type) per_cpu__##name
+#define DEFINE_PER_CPU_CACHELINE_ALIGNED_IN_SMP(type, name)	\
+    DEFINE_PER_CPU(type, name)
 
 #define per_cpu(var, cpu)			(*((void)(cpu), &per_cpu__##var))
 #define __get_cpu_var(var)			per_cpu__##var
diff -Nurp linux-2.6.21-rc7.0/include/asm-s390/percpu.h linux-2.6.21-rc7.1/include/asm-s390/percpu.h
--- linux-2.6.21-rc7.0/include/asm-s390/percpu.h	2007-04-15 16:50:57.000000000 -0700
+++ linux-2.6.21-rc7.1/include/asm-s390/percpu.h	2007-05-04 11:20:19.000000000 -0700
@@ -41,6 +41,11 @@ extern unsigned long __per_cpu_offset[NR
     __attribute__((__section__(".data.percpu"))) \
     __typeof__(type) per_cpu__##name
 
+#define DEFINE_PER_CPU_CACHELINE_ALIGNED_IN_SMP(type, name)		  \
+    __attribute__((__section__(".data.percpu.cacheline_aligned_in_smp"))) \
+    __typeof__(type) per_cpu__##name					  \
+    ____cacheline_aligned_in_smp
+
 #define __get_cpu_var(var) __reloc_hide(var,S390_lowcore.percpu_offset)
 #define __raw_get_cpu_var(var) __reloc_hide(var,S390_lowcore.percpu_offset)
 #define per_cpu(var,cpu) __reloc_hide(var,__per_cpu_offset[cpu])
@@ -59,6 +64,8 @@ do {								\
 
 #define DEFINE_PER_CPU(type, name) \
     __typeof__(type) per_cpu__##name
+#define DEFINE_PER_CPU_CACHELINE_ALIGNED_IN_SMP(type, name)	\
+    DEFINE_PER_CPU(type, name)
 
 #define __get_cpu_var(var) __reloc_hide(var,0)
 #define __raw_get_cpu_var(var) __reloc_hide(var,0)
diff -Nurp linux-2.6.21-rc7.0/include/asm-sparc64/percpu.h linux-2.6.21-rc7.1/include/asm-sparc64/percpu.h
--- linux-2.6.21-rc7.0/include/asm-sparc64/percpu.h	2007-05-01 07:33:06.000000000 -0700
+++ linux-2.6.21-rc7.1/include/asm-sparc64/percpu.h	2007-05-04 11:20:51.000000000 -0700
@@ -17,6 +17,11 @@ extern unsigned long __per_cpu_shift;
 #define DEFINE_PER_CPU(type, name) \
     __attribute__((__section__(".data.percpu"))) __typeof__(type) per_cpu__##name
 
+#define DEFINE_PER_CPU_CACHELINE_ALIGNED_IN_SMP(type, name)		  \
+    __attribute__((__section__(".data.percpu.cacheline_aligned_in_smp"))) \
+    __typeof__(type) per_cpu__##name					  \
+    ____cacheline_aligned_in_smp 
+
 register unsigned long __local_per_cpu_offset asm("g5");
 
 /* var is in discarded region: offset to particular copy we want */
@@ -36,6 +41,8 @@ do {								\
 
 #define DEFINE_PER_CPU(type, name) \
     __typeof__(type) per_cpu__##name
+#define DEFINE_PER_CPU_CACHELINE_ALIGNED_IN_SMP(type, name)	\
+    DEFINE_PER_CPU(type, name)
 
 #define per_cpu(var, cpu)			(*((void)cpu, &per_cpu__##var))
 #define __get_cpu_var(var)			per_cpu__##var
diff -Nurp linux-2.6.21-rc7.0/include/asm-x86_64/percpu.h linux-2.6.21-rc7.1/include/asm-x86_64/percpu.h
--- linux-2.6.21-rc7.0/include/asm-x86_64/percpu.h	2007-05-01 07:33:06.000000000 -0700
+++ linux-2.6.21-rc7.1/include/asm-x86_64/percpu.h	2007-05-04 11:21:25.000000000 -0700
@@ -20,6 +20,11 @@
 #define DEFINE_PER_CPU(type, name) \
     __attribute__((__section__(".data.percpu"))) __typeof__(type) per_cpu__##name
 
+#define DEFINE_PER_CPU_CACHELINE_ALIGNED_IN_SMP(type, name)		  \
+    __attribute__((__section__(".data.percpu.cacheline_aligned_in_smp"))) \
+    __typeof__(type) per_cpu__##name					  \
+    ____cacheline_internodealigned_in_smp 
+
 /* var is in discarded region: offset to particular copy we want */
 #define per_cpu(var, cpu) (*({				\
 	extern int simple_identifier_##var(void);	\
@@ -46,6 +51,8 @@ extern void setup_per_cpu_areas(void);
 
 #define DEFINE_PER_CPU(type, name) \
     __typeof__(type) per_cpu__##name
+#define DEFINE_PER_CPU_CACHELINE_ALIGNED_IN_SMP(type, name)	\
+    DEFINE_PER_CPU(type, name)
 
 #define per_cpu(var, cpu)			(*((void)(cpu), &per_cpu__##var))
 #define __get_cpu_var(var)			per_cpu__##var

^ permalink raw reply	[flat|nested] 20+ messages in thread

* [PATCH 0/2] Add percpu smp cacheline align section
       [not found] <33E1C72C74DBE747B7B59C1740F7443701A2F0AB@orsmsx417.amr.corp.intel.com>
  2007-05-05  0:12 ` [PATCH 1/2] Define percpu smp cacheline align interface Fenghua Yu
@ 2007-05-05  0:12 ` Fenghua Yu
  2007-05-05 16:52   ` Christoph Lameter
  2007-05-05  0:12 ` [PATCH 2/2] Call percpu smp cacheline algin interface Fenghua Yu
  2 siblings, 1 reply; 20+ messages in thread
From: Fenghua Yu @ 2007-05-05  0:12 UTC (permalink / raw)
  To: akpm, suresh.b.siddha, linux-kernel; +Cc: Fenghua Yu



This is a follow-up to Suresh's runqueue align in smp patch at:
http://www.uwsg.iu.edu/hypermail/linux/kernel/0704.1/0340.html

The patches place all of the smp cacheline aligned percpu data into .data.percpu.cacheline_aligned_in_smp. Other percpu data stays in the .data.percpu section. The patches can reduce cache line accesses in SMP and reduce the space wasted by alignment gaps. The patches also define a PERCPU macro for vmlinux.lds.S as a code cleanup.

PATCH 1/2: Define percpu smp cacheline align interface
PATCH 2/2: Call percpu smp cacheline align interface

Thanks.

-Fenghua 

^ permalink raw reply	[flat|nested] 20+ messages in thread

* [PATCH 2/2] Call percpu smp cacheline algin interface
       [not found] <33E1C72C74DBE747B7B59C1740F7443701A2F0AB@orsmsx417.amr.corp.intel.com>
  2007-05-05  0:12 ` [PATCH 1/2] Define percpu smp cacheline align interface Fenghua Yu
  2007-05-05  0:12 ` [PATCH 0/2] Add percpu smp cacheline align section Fenghua Yu
@ 2007-05-05  0:12 ` Fenghua Yu
  2007-05-07 22:59   ` Fenghua Yu
  2 siblings, 1 reply; 20+ messages in thread
From: Fenghua Yu @ 2007-05-05  0:12 UTC (permalink / raw)
  To: akpm, suresh.b.siddha, linux-kernel; +Cc: Fenghua Yu


Call percpu smp cacheline align interface.

Signed-off-by: Fenghua Yu <fenghua.yu@intel.com>
Acked-by: Suresh Siddha <suresh.b.siddha@intel.com>

diff -Nurp linux-2.6.21-rc7.0/arch/i386/kernel/init_task.c linux-2.6.21-rc7.1/arch/i386/kernel/init_task.c
--- linux-2.6.21-rc7.0/arch/i386/kernel/init_task.c	2007-04-15 16:50:57.000000000 -0700
+++ linux-2.6.21-rc7.1/arch/i386/kernel/init_task.c	2007-05-02 17:00:34.000000000 -0700
@@ -42,5 +42,5 @@ EXPORT_SYMBOL(init_task);
  * per-CPU TSS segments. Threads are completely 'soft' on Linux,
  * no more per-task TSS's.
  */ 
-DEFINE_PER_CPU(struct tss_struct, init_tss) ____cacheline_internodealigned_in_smp = INIT_TSS;
+DEFINE_PER_CPU_CACHELINE_ALIGNED_IN_SMP(struct tss_struct, init_tss) = INIT_TSS;
 
diff -Nurp linux-2.6.21-rc7.0/arch/i386/kernel/irq.c linux-2.6.21-rc7.1/arch/i386/kernel/irq.c
--- linux-2.6.21-rc7.0/arch/i386/kernel/irq.c	2007-05-01 07:32:59.000000000 -0700
+++ linux-2.6.21-rc7.1/arch/i386/kernel/irq.c	2007-05-02 17:00:34.000000000 -0700
@@ -21,7 +21,7 @@
 #include <asm/apic.h>
 #include <asm/uaccess.h>
 
-DEFINE_PER_CPU(irq_cpustat_t, irq_stat) ____cacheline_internodealigned_in_smp;
+DEFINE_PER_CPU_CACHELINE_ALIGNED_IN_SMP(irq_cpustat_t, irq_stat);
 EXPORT_PER_CPU_SYMBOL(irq_stat);
 
 DEFINE_PER_CPU(struct pt_regs *, irq_regs);
diff -Nurp linux-2.6.21-rc7.0/arch/ia64/kernel/smp.c linux-2.6.21-rc7.1/arch/ia64/kernel/smp.c
--- linux-2.6.21-rc7.0/arch/ia64/kernel/smp.c	2007-04-15 16:50:57.000000000 -0700
+++ linux-2.6.21-rc7.1/arch/ia64/kernel/smp.c	2007-05-02 17:00:34.000000000 -0700
@@ -70,7 +70,7 @@ static volatile struct call_data_struct 
 #define IPI_KDUMP_CPU_STOP	3
 
 /* This needs to be cacheline aligned because it is written to by *other* CPUs.  */
-static DEFINE_PER_CPU(u64, ipi_operation) ____cacheline_aligned;
+static DEFINE_PER_CPU_CACHELINE_ALIGNED_IN_SMP(u64, ipi_operation);
 
 extern void cpu_halt (void);
 
diff -Nurp linux-2.6.21-rc7.0/arch/x86_64/kernel/init_task.c linux-2.6.21-rc7.1/arch/x86_64/kernel/init_task.c
--- linux-2.6.21-rc7.0/arch/x86_64/kernel/init_task.c	2007-04-15 16:50:57.000000000 -0700
+++ linux-2.6.21-rc7.1/arch/x86_64/kernel/init_task.c	2007-05-02 17:00:34.000000000 -0700
@@ -44,7 +44,7 @@ EXPORT_SYMBOL(init_task);
  * section. Since TSS's are completely CPU-local, we want them
  * on exact cacheline boundaries, to eliminate cacheline ping-pong.
  */ 
-DEFINE_PER_CPU(struct tss_struct, init_tss) ____cacheline_internodealigned_in_smp = INIT_TSS;
+DEFINE_PER_CPU_CACHELINE_ALIGNED_IN_SMP(struct tss_struct, init_tss) = INIT_TSS;
 
 /* Copies of the original ist values from the tss are only accessed during
  * debugging, no special alignment required.
diff -Nurp linux-2.6.21-rc7.0/kernel/sched.c linux-2.6.21-rc7.1/kernel/sched.c
--- linux-2.6.21-rc7.0/kernel/sched.c	2007-05-01 07:33:07.000000000 -0700
+++ linux-2.6.21-rc7.1/kernel/sched.c	2007-05-02 17:00:34.000000000 -0700
@@ -263,7 +263,7 @@ struct rq {
 	struct lock_class_key rq_lock_key;
 };
 
-static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp;
+static DEFINE_PER_CPU_CACHELINE_ALIGNED_IN_SMP(struct rq, runqueues);
 static DEFINE_MUTEX(sched_hotcpu_mutex);
 
 static inline int cpu_of(struct rq *rq)

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 0/2] Add percpu smp cacheline align section
  2007-05-05  0:12 ` [PATCH 0/2] Add percpu smp cacheline align section Fenghua Yu
@ 2007-05-05 16:52   ` Christoph Lameter
  2007-05-07 17:11     ` Siddha, Suresh B
  0 siblings, 1 reply; 20+ messages in thread
From: Christoph Lameter @ 2007-05-05 16:52 UTC (permalink / raw)
  To: Fenghua Yu; +Cc: akpm, suresh.b.siddha, linux-kernel

On Fri, 4 May 2007, Fenghua Yu wrote:

> This is follow-up for Suresh's runqueue align in smp patch at:
> http://www.uwsg.iu.edu/hypermail/linux/kernel/0704.1/0340.html
> 
> The patches place all of smp cacheline aligned percpu data into 
> .data.percpu.cacheline_aligned_in_smp. Other percpu data is still in 
> data.percpu section. The patches can reduce cache line access in SMP and 
> reduce alignment gap waste. The patches also define PERCPU macro for 
> vmlinux.lds.S for code clean up.

Ummm... The per cpu area is for exclusive use of a particular processor. 
If there is contention in the per cpu area then a data object needs to be 
removed from the per cpu area because the object is *not* accessed only 
from a certain cpu.


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 0/2] Add percpu smp cacheline align section
  2007-05-05 16:52   ` Christoph Lameter
@ 2007-05-07 17:11     ` Siddha, Suresh B
  2007-05-07 17:30       ` Christoph Lameter
  0 siblings, 1 reply; 20+ messages in thread
From: Siddha, Suresh B @ 2007-05-07 17:11 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: Yu, Fenghua, akpm, Siddha, Suresh B, linux-kernel

On Sat, May 05, 2007 at 09:52:27AM -0700, Christoph Lameter wrote:
> On Fri, 4 May 2007, Fenghua Yu wrote:
>
> > This is follow-up for Suresh's runqueue align in smp patch at:
> > [1]http://www.uwsg.iu.edu/hypermail/linux/kernel/0704.1/0340.html
> >
> > The patches place all of smp cacheline aligned percpu data into
> > .data.percpu.cacheline_aligned_in_smp. Other percpu data is still in
> > data.percpu section. The patches can reduce cache line access in SMP and
> > reduce alignment gap waste. The patches also define PERCPU macro for
> > vmlinux.lds.S for code clean up.
>
> Ummm... The per cpu area is for exclusive use of a particular processor.
> If there is contention in the per cpu area then a data object needs to be
> removed from the per cpu area because the object is *not* accessed only
> from a certain cpu.

Christoph, this data (that is being accessed by other cpus) also needs
to be defined for each cpu, and as such it is getting appended (and
clearly separated in a different section) to the data which is accessed
only by the local cpu.

Not sure what your concern is.

thanks,
suresh

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 0/2] Add percpu smp cacheline align section
  2007-05-07 17:11     ` Siddha, Suresh B
@ 2007-05-07 17:30       ` Christoph Lameter
  2007-05-07 17:46         ` Siddha, Suresh B
  0 siblings, 1 reply; 20+ messages in thread
From: Christoph Lameter @ 2007-05-07 17:30 UTC (permalink / raw)
  To: Siddha, Suresh B; +Cc: Yu, Fenghua, akpm, linux-kernel

On Mon, 7 May 2007, Siddha, Suresh B wrote:

> > Ummm... The per cpu area is for exclusive use of a particular processor.
> > If there is contention in the per cpu area then a data object needs to be
> > removed from the per cpu area because the object is *not* accessed only
> > from a certain cpu.
> 
> Christoph, This data(that is being accessed by other cpus) also needs
> to be defined for each cpu and as such it is getting appended (and
> clearly seperated in a different section) to the data which is accessed
> only by the local cpu.
> 
> Not sure what your concern is.

Call this area "cpu shared" or so?


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 0/2] Add percpu smp cacheline align section
  2007-05-07 17:30       ` Christoph Lameter
@ 2007-05-07 17:46         ` Siddha, Suresh B
  2007-05-07 18:13           ` Yu, Fenghua
                             ` (2 more replies)
  0 siblings, 3 replies; 20+ messages in thread
From: Siddha, Suresh B @ 2007-05-07 17:46 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: Siddha, Suresh B, Yu, Fenghua, akpm, linux-kernel

On Mon, May 07, 2007 at 10:30:16AM -0700, Christoph Lameter wrote:
> Call this area "cpu shared" or so?

Sure. cacheline_aligned_in_smp was sort of indicating this. But this
name is short and will also clear up the confusion.

But this still needs to tap into the per cpu infrastructure, so that the
new area can be allocated for each cpu during boot time.

thanks,
suresh

^ permalink raw reply	[flat|nested] 20+ messages in thread

* RE: [PATCH 0/2] Add percpu smp cacheline align section
  2007-05-07 17:46         ` Siddha, Suresh B
@ 2007-05-07 18:13           ` Yu, Fenghua
  2007-05-15  0:12           ` [PATCH 2/2] Use the new percpu interface for shared data -- version 2 Fenghua Yu
       [not found]           ` <20070515001255.GA27978@linux-os.sc.intel.com>
  2 siblings, 0 replies; 20+ messages in thread
From: Yu, Fenghua @ 2007-05-07 18:13 UTC (permalink / raw)
  To: Siddha, Suresh B, Christoph Lameter; +Cc: akpm, linux-kernel

On Mon, May 07, 2007 at 10:30:16AM -0700, Christoph Lameter wrote:
> Call this area "cpu shared" or so?

There are some fields in the percpu section which are accessed by other
cpus but are not cache line aligned, e.g. cpu_idle_state and flush_state in
x86-64. We don't want to place those fields into the same subsection as
the cacheline aligned fields.

So perhaps, I can change the section name
.data.percpu.cacheline_aligned_in_smp to
.data.percpu.shared_cacheline_aligned. And change macro
DEFINE_PER_CPU_CACHELINE_ALIGNED_IN_SMP to
DEFINE_PER_CPU_SHARED_CACHELINE_ALIGNED.

.data.percpu.shared_cacheline_aligned contains only cpu shared percpu
fields which are cacheline aligned. In the future (or in a separate patch
series), we can define another percpu section .data.percpu.shared and
its macro DEFINE_PER_CPU_SHARED_CACHELINE_ALIGNED, which would hold all of
the cpu shared percpu fields. The new .data.percpu.shared and
DEFINE_PER_CPU_SHARED_CACHELINE_ALIGNED are for better coding and
understanding.

Thanks.

-Fenghua

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 1/2] Define percpu smp cacheline align interface
  2007-05-05  0:12 ` [PATCH 1/2] Define percpu smp cacheline align interface Fenghua Yu
@ 2007-05-07 22:58   ` Fenghua Yu
  2007-05-15 23:42     ` Andrew Morton
  0 siblings, 1 reply; 20+ messages in thread
From: Fenghua Yu @ 2007-05-07 22:58 UTC (permalink / raw)
  To: akpm, suresh.b.siddha, linux-kernel, clameter; +Cc: fenghua.yu

On Fri, May 04, 2007 at 05:12:18PM -0700, Fenghua Yu wrote:
> 
> Define percpu smp cacheline align interface

This is an updated patch. The per cpu cacheline aligned data section name and macro name are changed to make the code easier to understand.

The patches place all of the smp cacheline aligned percpu data into .data.percpu.shared_cacheline_aligned. Other percpu data stays in the .data.percpu section. The patches can reduce cache line accesses in SMP and reduce the space wasted by alignment gaps. The patches also define a PERCPU macro for vmlinux.lds.S to make the code more concise.

PATCH 1/2: Define percpu smp cacheline align interface
PATCH 2/2: Call percpu smp cacheline align interface

Signed-off-by: Fenghua Yu <fenghua.yu@intel.com>
Acked-by: Suresh Siddha <suresh.b.siddha@intel.com>

---

 arch/alpha/kernel/vmlinux.lds.S   |    5 +----
 arch/arm/kernel/vmlinux.lds.S     |    5 +----
 arch/cris/arch-v32/vmlinux.lds.S  |    5 +----
 arch/frv/kernel/vmlinux.lds.S     |    5 +----
 arch/i386/kernel/vmlinux.lds.S    |    7 +------
 arch/ia64/kernel/vmlinux.lds.S    |    1 +
 arch/m32r/kernel/vmlinux.lds.S    |    5 +----
 arch/mips/kernel/vmlinux.lds.S    |    5 +----
 arch/parisc/kernel/vmlinux.lds.S  |    7 +++----
 arch/powerpc/kernel/vmlinux.lds.S |    7 +------
 arch/ppc/kernel/vmlinux.lds.S     |    5 +----
 arch/s390/kernel/vmlinux.lds.S    |    5 +----
 arch/sh/kernel/vmlinux.lds.S      |    5 +----
 arch/sh64/kernel/vmlinux.lds.S    |    5 +----
 arch/sparc/kernel/vmlinux.lds.S   |    5 +----
 arch/sparc64/kernel/vmlinux.lds.S |    5 +----
 arch/x86_64/kernel/vmlinux.lds.S  |    6 ++----
 arch/xtensa/kernel/vmlinux.lds.S  |    5 +----
 include/asm-generic/percpu.h      |    8 ++++++++
 include/asm-generic/vmlinux.lds.h |   10 ++++++++++
 include/asm-i386/percpu.h         |    5 +++++
 include/asm-ia64/percpu.h         |   10 ++++++++++
 include/asm-powerpc/percpu.h      |    7 +++++++
 include/asm-s390/percpu.h         |    7 +++++++
 include/asm-sparc64/percpu.h      |    7 +++++++
 include/asm-x86_64/percpu.h       |    7 +++++++
 26 files changed, 82 insertions(+), 72 deletions(-)

diff -Nurp linux-2.6.21-rc7.0/arch/alpha/kernel/vmlinux.lds.S linux-2.6.21-rc7.1/arch/alpha/kernel/vmlinux.lds.S
--- linux-2.6.21-rc7.0/arch/alpha/kernel/vmlinux.lds.S	2007-05-01 07:32:58.000000000 -0700
+++ linux-2.6.21-rc7.1/arch/alpha/kernel/vmlinux.lds.S	2007-05-02 17:00:29.000000000 -0700
@@ -69,10 +69,7 @@ SECTIONS
   . = ALIGN(8);
   SECURITY_INIT
 
-  . = ALIGN(8192);
-  __per_cpu_start = .;
-  .data.percpu : { *(.data.percpu) }
-  __per_cpu_end = .;
+  PERCPU(8192)
 
   . = ALIGN(2*8192);
   __init_end = .;
diff -Nurp linux-2.6.21-rc7.0/arch/arm/kernel/vmlinux.lds.S linux-2.6.21-rc7.1/arch/arm/kernel/vmlinux.lds.S
--- linux-2.6.21-rc7.0/arch/arm/kernel/vmlinux.lds.S	2007-05-01 07:32:58.000000000 -0700
+++ linux-2.6.21-rc7.1/arch/arm/kernel/vmlinux.lds.S	2007-05-02 17:00:29.000000000 -0700
@@ -59,10 +59,7 @@ SECTIONS
 			usr/built-in.o(.init.ramfs)
 		__initramfs_end = .;
 #endif
-		. = ALIGN(4096);
-		__per_cpu_start = .;
-			*(.data.percpu)
-		__per_cpu_end = .;
+		PERCPU(4096)
 #ifndef CONFIG_XIP_KERNEL
 		__init_begin = _stext;
 		*(.init.data)
diff -Nurp linux-2.6.21-rc7.0/arch/cris/arch-v32/vmlinux.lds.S linux-2.6.21-rc7.1/arch/cris/arch-v32/vmlinux.lds.S
--- linux-2.6.21-rc7.0/arch/cris/arch-v32/vmlinux.lds.S	2007-05-01 07:32:59.000000000 -0700
+++ linux-2.6.21-rc7.1/arch/cris/arch-v32/vmlinux.lds.S	2007-05-02 17:00:29.000000000 -0700
@@ -92,10 +92,7 @@ SECTIONS
 	}
 	SECURITY_INIT
 
-	. =  ALIGN (8192);
-	__per_cpu_start = .;
-	.data.percpu  : { *(.data.percpu) }
-	__per_cpu_end = .;
+	PERCPU(8192)
 
 #ifdef CONFIG_BLK_DEV_INITRD
 	.init.ramfs : {
diff -Nurp linux-2.6.21-rc7.0/arch/frv/kernel/vmlinux.lds.S linux-2.6.21-rc7.1/arch/frv/kernel/vmlinux.lds.S
--- linux-2.6.21-rc7.0/arch/frv/kernel/vmlinux.lds.S	2007-05-01 07:32:59.000000000 -0700
+++ linux-2.6.21-rc7.1/arch/frv/kernel/vmlinux.lds.S	2007-05-02 17:00:29.000000000 -0700
@@ -57,10 +57,7 @@ SECTIONS
   __alt_instructions_end = .;
  .altinstr_replacement : { *(.altinstr_replacement) }
 
-  . = ALIGN(4096);
-  __per_cpu_start = .;
-  .data.percpu  : { *(.data.percpu) }
-  __per_cpu_end = .;
+  PERCPU(4096)
 
 #ifdef CONFIG_BLK_DEV_INITRD
   . = ALIGN(4096);
diff -Nurp linux-2.6.21-rc7.0/arch/i386/kernel/vmlinux.lds.S linux-2.6.21-rc7.1/arch/i386/kernel/vmlinux.lds.S
--- linux-2.6.21-rc7.0/arch/i386/kernel/vmlinux.lds.S	2007-05-01 07:32:59.000000000 -0700
+++ linux-2.6.21-rc7.1/arch/i386/kernel/vmlinux.lds.S	2007-05-02 17:00:29.000000000 -0700
@@ -185,12 +185,7 @@ SECTIONS
 	__initramfs_end = .;
   }
 #endif
-  . = ALIGN(4096);
-  .data.percpu  : AT(ADDR(.data.percpu) - LOAD_OFFSET) {
-	__per_cpu_start = .;
-	*(.data.percpu)
-	__per_cpu_end = .;
-  }
+  PERCPU(4096)
   . = ALIGN(4096);
   /* freed after init ends here */
 	
diff -Nurp linux-2.6.21-rc7.0/arch/ia64/kernel/vmlinux.lds.S linux-2.6.21-rc7.1/arch/ia64/kernel/vmlinux.lds.S
--- linux-2.6.21-rc7.0/arch/ia64/kernel/vmlinux.lds.S	2007-05-01 07:32:59.000000000 -0700
+++ linux-2.6.21-rc7.1/arch/ia64/kernel/vmlinux.lds.S	2007-05-02 17:00:29.000000000 -0700
@@ -206,6 +206,7 @@ SECTIONS
 	{
 		__per_cpu_start = .;
 		*(.data.percpu)
+		*(.data.percpu.shared_cacheline_aligned)
 		__per_cpu_end = .;
 	}
   . = __phys_per_cpu_start + PERCPU_PAGE_SIZE;	/* ensure percpu data fits
diff -Nurp linux-2.6.21-rc7.0/arch/m32r/kernel/vmlinux.lds.S linux-2.6.21-rc7.1/arch/m32r/kernel/vmlinux.lds.S
--- linux-2.6.21-rc7.0/arch/m32r/kernel/vmlinux.lds.S	2007-05-01 07:32:59.000000000 -0700
+++ linux-2.6.21-rc7.1/arch/m32r/kernel/vmlinux.lds.S	2007-05-02 17:00:29.000000000 -0700
@@ -111,10 +111,7 @@ SECTIONS
   __initramfs_end = .;
 #endif
 
-  . = ALIGN(4096);
-  __per_cpu_start = .;
-  .data.percpu  : { *(.data.percpu) }
-  __per_cpu_end = .;
+  PERCPU(4096)
   . = ALIGN(4096);
   __init_end = .;
   /* freed after init ends here */
diff -Nurp linux-2.6.21-rc7.0/arch/mips/kernel/vmlinux.lds.S linux-2.6.21-rc7.1/arch/mips/kernel/vmlinux.lds.S
--- linux-2.6.21-rc7.0/arch/mips/kernel/vmlinux.lds.S	2007-05-01 07:32:59.000000000 -0700
+++ linux-2.6.21-rc7.1/arch/mips/kernel/vmlinux.lds.S	2007-05-02 17:00:29.000000000 -0700
@@ -121,10 +121,7 @@ SECTIONS
   .init.ramfs : { *(.init.ramfs) }
   __initramfs_end = .;
 #endif
-  . = ALIGN(_PAGE_SIZE);
-  __per_cpu_start = .;
-  .data.percpu  : { *(.data.percpu) }
-  __per_cpu_end = .;
+  PERCPU(_PAGE_SIZE)
   . = ALIGN(_PAGE_SIZE);
   __init_end = .;
   /* freed after init ends here */
diff -Nurp linux-2.6.21-rc7.0/arch/parisc/kernel/vmlinux.lds.S linux-2.6.21-rc7.1/arch/parisc/kernel/vmlinux.lds.S
--- linux-2.6.21-rc7.0/arch/parisc/kernel/vmlinux.lds.S	2007-05-01 07:32:59.000000000 -0700
+++ linux-2.6.21-rc7.1/arch/parisc/kernel/vmlinux.lds.S	2007-05-02 17:00:29.000000000 -0700
@@ -182,10 +182,9 @@ SECTIONS
   .init.ramfs : { *(.init.ramfs) }
   __initramfs_end = .;
 #endif
-  . = ALIGN(ASM_PAGE_SIZE);
-  __per_cpu_start = .;
-  .data.percpu  : { *(.data.percpu) }
-  __per_cpu_end = .;
+
+  PERCPU(ASM_PAGE_SIZE)
+
   . = ALIGN(ASM_PAGE_SIZE);
   __init_end = .;
   /* freed after init ends here */
diff -Nurp linux-2.6.21-rc7.0/arch/powerpc/kernel/vmlinux.lds.S linux-2.6.21-rc7.1/arch/powerpc/kernel/vmlinux.lds.S
--- linux-2.6.21-rc7.0/arch/powerpc/kernel/vmlinux.lds.S	2007-05-01 07:32:59.000000000 -0700
+++ linux-2.6.21-rc7.1/arch/powerpc/kernel/vmlinux.lds.S	2007-05-02 17:00:29.000000000 -0700
@@ -139,12 +139,7 @@ SECTIONS
 		__initramfs_end = .;
 	}
 #endif
-	. = ALIGN(PAGE_SIZE);
-	.data.percpu : {
-		__per_cpu_start = .;
-		*(.data.percpu)
-		__per_cpu_end = .;
-	}
+	PERCPU(PAGE_SIZE)
 
 	. = ALIGN(8);
 	.machine.desc : {
diff -Nurp linux-2.6.21-rc7.0/arch/ppc/kernel/vmlinux.lds.S linux-2.6.21-rc7.1/arch/ppc/kernel/vmlinux.lds.S
--- linux-2.6.21-rc7.0/arch/ppc/kernel/vmlinux.lds.S	2007-05-01 07:32:59.000000000 -0700
+++ linux-2.6.21-rc7.1/arch/ppc/kernel/vmlinux.lds.S	2007-05-02 17:00:29.000000000 -0700
@@ -131,10 +131,7 @@ SECTIONS
   __ftr_fixup : { *(__ftr_fixup) }
   __stop___ftr_fixup = .;
 
-  . = ALIGN(4096);
-  __per_cpu_start = .;
-  .data.percpu  : { *(.data.percpu) }
-  __per_cpu_end = .;
+  PERCPU(4096)
 
 #ifdef CONFIG_BLK_DEV_INITRD
   . = ALIGN(4096);
diff -Nurp linux-2.6.21-rc7.0/arch/s390/kernel/vmlinux.lds.S linux-2.6.21-rc7.1/arch/s390/kernel/vmlinux.lds.S
--- linux-2.6.21-rc7.0/arch/s390/kernel/vmlinux.lds.S	2007-05-01 07:32:59.000000000 -0700
+++ linux-2.6.21-rc7.1/arch/s390/kernel/vmlinux.lds.S	2007-05-02 17:00:29.000000000 -0700
@@ -108,10 +108,7 @@ SECTIONS
   . = ALIGN(2);
   __initramfs_end = .;
 #endif
-  . = ALIGN(4096);
-  __per_cpu_start = .;
-  .data.percpu  : { *(.data.percpu) }
-  __per_cpu_end = .;
+  PERCPU(4096)
   . = ALIGN(4096);
   __init_end = .;
   /* freed after init ends here */
diff -Nurp linux-2.6.21-rc7.0/arch/sh/kernel/vmlinux.lds.S linux-2.6.21-rc7.1/arch/sh/kernel/vmlinux.lds.S
--- linux-2.6.21-rc7.0/arch/sh/kernel/vmlinux.lds.S	2007-05-01 07:32:59.000000000 -0700
+++ linux-2.6.21-rc7.1/arch/sh/kernel/vmlinux.lds.S	2007-05-02 17:00:29.000000000 -0700
@@ -61,10 +61,7 @@ SECTIONS
   . = ALIGN(PAGE_SIZE);
   __nosave_end = .;
 
-  . = ALIGN(PAGE_SIZE);
-  __per_cpu_start = .;
-  .data.percpu : { *(.data.percpu) }
-  __per_cpu_end = .;
+  PERCPU(PAGE_SIZE)
   .data.cacheline_aligned : { *(.data.cacheline_aligned) }
 
   _edata = .;			/* End of data section */
diff -Nurp linux-2.6.21-rc7.0/arch/sh64/kernel/vmlinux.lds.S linux-2.6.21-rc7.1/arch/sh64/kernel/vmlinux.lds.S
--- linux-2.6.21-rc7.0/arch/sh64/kernel/vmlinux.lds.S	2007-05-01 07:32:59.000000000 -0700
+++ linux-2.6.21-rc7.1/arch/sh64/kernel/vmlinux.lds.S	2007-05-02 17:00:29.000000000 -0700
@@ -86,10 +86,7 @@ SECTIONS
   . = ALIGN(PAGE_SIZE);
   .data.page_aligned : C_PHYS(.data.page_aligned) { *(.data.page_aligned) }
 
-  . = ALIGN(PAGE_SIZE);
-  __per_cpu_start = .;
-  .data.percpu : C_PHYS(.data.percpu) { *(.data.percpu) }
-  __per_cpu_end = . ;
+  PERCPU(PAGE_SIZE)
   .data.cacheline_aligned : C_PHYS(.data.cacheline_aligned) { *(.data.cacheline_aligned) }
 
   _edata = .;			/* End of data section */
diff -Nurp linux-2.6.21-rc7.0/arch/sparc/kernel/vmlinux.lds.S linux-2.6.21-rc7.1/arch/sparc/kernel/vmlinux.lds.S
--- linux-2.6.21-rc7.0/arch/sparc/kernel/vmlinux.lds.S	2007-05-01 07:32:59.000000000 -0700
+++ linux-2.6.21-rc7.1/arch/sparc/kernel/vmlinux.lds.S	2007-05-02 17:00:29.000000000 -0700
@@ -66,10 +66,7 @@ SECTIONS
   __initramfs_end = .;
 #endif
 
-  . = ALIGN(4096);
-  __per_cpu_start = .;
-  .data.percpu  : { *(.data.percpu) }
-  __per_cpu_end = .;
+  PERCPU(4096)
   . = ALIGN(4096);
   __init_end = .;
   . = ALIGN(32);
diff -Nurp linux-2.6.21-rc7.0/arch/sparc64/kernel/vmlinux.lds.S linux-2.6.21-rc7.1/arch/sparc64/kernel/vmlinux.lds.S
--- linux-2.6.21-rc7.0/arch/sparc64/kernel/vmlinux.lds.S	2007-05-01 07:32:59.000000000 -0700
+++ linux-2.6.21-rc7.1/arch/sparc64/kernel/vmlinux.lds.S	2007-05-02 17:00:29.000000000 -0700
@@ -90,10 +90,7 @@ SECTIONS
   __initramfs_end = .;
 #endif
 
-  . = ALIGN(8192);
-  __per_cpu_start = .;
-  .data.percpu  : { *(.data.percpu) }
-  __per_cpu_end = .;
+  PERCPU(8192)
   . = ALIGN(8192);
   __init_end = .;
   __bss_start = .;
diff -Nurp linux-2.6.21-rc7.0/arch/x86_64/kernel/vmlinux.lds.S linux-2.6.21-rc7.1/arch/x86_64/kernel/vmlinux.lds.S
--- linux-2.6.21-rc7.0/arch/x86_64/kernel/vmlinux.lds.S	2007-05-01 07:32:59.000000000 -0700
+++ linux-2.6.21-rc7.1/arch/x86_64/kernel/vmlinux.lds.S	2007-05-02 17:00:29.000000000 -0700
@@ -209,10 +209,8 @@ SECTIONS
   __initramfs_end = .;
 #endif
 
-  . = ALIGN(4096);
-  __per_cpu_start = .;
-  .data.percpu  : AT(ADDR(.data.percpu) - LOAD_OFFSET) { *(.data.percpu) }
-  __per_cpu_end = .;
+  PERCPU(4096)
+
   . = ALIGN(4096);
 
   __init_end = .;
diff -Nurp linux-2.6.21-rc7.0/arch/xtensa/kernel/vmlinux.lds.S linux-2.6.21-rc7.1/arch/xtensa/kernel/vmlinux.lds.S
--- linux-2.6.21-rc7.0/arch/xtensa/kernel/vmlinux.lds.S	2007-05-01 07:32:59.000000000 -0700
+++ linux-2.6.21-rc7.1/arch/xtensa/kernel/vmlinux.lds.S	2007-05-02 17:00:29.000000000 -0700
@@ -198,10 +198,7 @@ SECTIONS
   __ftr_fixup : { *(__ftr_fixup) }
   __stop___ftr_fixup = .;
 
-  . = ALIGN(4096);
-  __per_cpu_start = .;
-  .data.percpu  : { *(.data.percpu) }
-  __per_cpu_end = .;
+  PERCPU(4096)
 
 #ifdef CONFIG_BLK_DEV_INITRD
   . = ALIGN(4096);
diff -Nurp linux-2.6.21-rc7.0/include/asm-generic/percpu.h linux-2.6.21-rc7.1/include/asm-generic/percpu.h
--- linux-2.6.21-rc7.0/include/asm-generic/percpu.h	2007-05-01 07:33:03.000000000 -0700
+++ linux-2.6.21-rc7.1/include/asm-generic/percpu.h	2007-05-04 11:15:26.000000000 -0700
@@ -14,6 +14,11 @@ extern unsigned long __per_cpu_offset[NR
 #define DEFINE_PER_CPU(type, name) \
     __attribute__((__section__(".data.percpu"))) __typeof__(type) per_cpu__##name
 
+#define DEFINE_PER_CPU_SHARED_CACHELINE_ALIGNED(type, name)		  \
+    __attribute__((__section__(".data.percpu.shared_cacheline_aligned"))) \
+    __typeof__(type) per_cpu__##name					  \
+    ____cacheline_aligned_in_smp
+
 /* var is in discarded region: offset to particular copy we want */
 #define per_cpu(var, cpu) (*({				\
 	extern int simple_identifier_##var(void);	\
@@ -34,6 +39,9 @@ do {								\
 #define DEFINE_PER_CPU(type, name) \
     __typeof__(type) per_cpu__##name
 
+#define DEFINE_PER_CPU_SHARED_CACHELINE_ALIGNED(type, name)	\
+    DEFINE_PER_CPU(type, name)
+
 #define per_cpu(var, cpu)			(*((void)(cpu), &per_cpu__##var))
 #define __get_cpu_var(var)			per_cpu__##var
 #define __raw_get_cpu_var(var)			per_cpu__##var
diff -Nurp linux-2.6.21-rc7.0/include/asm-generic/vmlinux.lds.h linux-2.6.21-rc7.1/include/asm-generic/vmlinux.lds.h
--- linux-2.6.21-rc7.0/include/asm-generic/vmlinux.lds.h	2007-05-01 07:33:03.000000000 -0700
+++ linux-2.6.21-rc7.1/include/asm-generic/vmlinux.lds.h	2007-05-02 17:00:29.000000000 -0700
@@ -265,3 +265,13 @@
   	*(.initcall6s.init)						\
   	*(.initcall7.init)						\
   	*(.initcall7s.init)
+
+#define PERCPU(align)							\
+  . = ALIGN(align);							\
+  __per_cpu_start = .;							\
+  .data.percpu  : AT(ADDR(.data.percpu) - LOAD_OFFSET) {		\
+	*(.data.percpu)							\
+	*(.data.percpu.shared_cacheline_aligned)			\
+  }									\
+  __per_cpu_end = .;
+
diff -Nurp linux-2.6.21-rc7.0/include/asm-i386/percpu.h linux-2.6.21-rc7.1/include/asm-i386/percpu.h
--- linux-2.6.21-rc7.0/include/asm-i386/percpu.h	2007-05-01 07:33:03.000000000 -0700
+++ linux-2.6.21-rc7.1/include/asm-i386/percpu.h	2007-05-02 17:00:29.000000000 -0700
@@ -54,6 +54,11 @@ extern unsigned long __per_cpu_offset[];
 #define DEFINE_PER_CPU(type, name) \
     __attribute__((__section__(".data.percpu"))) __typeof__(type) per_cpu__##name
 
+#define DEFINE_PER_CPU_SHARED_CACHELINE_ALIGNED(type, name)		  \
+    __attribute__((__section__(".data.percpu.shared_cacheline_aligned"))) \
+    __typeof__(type) per_cpu__##name					  \
+    ____cacheline_aligned_in_smp 
+
 /* We can use this directly for local CPU (faster). */
 DECLARE_PER_CPU(unsigned long, this_cpu_off);
 
diff -Nurp linux-2.6.21-rc7.0/include/asm-ia64/percpu.h linux-2.6.21-rc7.1/include/asm-ia64/percpu.h
--- linux-2.6.21-rc7.0/include/asm-ia64/percpu.h	2007-04-15 16:50:57.000000000 -0700
+++ linux-2.6.21-rc7.1/include/asm-ia64/percpu.h	2007-05-04 11:47:45.000000000 -0700
@@ -29,6 +29,16 @@
 	__attribute__((__section__(".data.percpu")))		\
 	__SMALL_ADDR_AREA __typeof__(type) per_cpu__##name
 
+#ifdef CONFIG_SMP
+#define DEFINE_PER_CPU_SHARED_CACHELINE_ALIGNED(type, name)		      \
+	__attribute__((__section__(".data.percpu.shared_cacheline_aligned"))) \
+	__SMALL_ADDR_AREA __typeof__(type) per_cpu__##name		      \
+	____cacheline_aligned_in_smp	
+#else
+#define DEFINE_PER_CPU_SHARED_CACHELINE_ALIGNED(type, name)	\
+	DEFINE_PER_CPU(type, name)
+#endif
+
 /*
  * Pretty much a literal copy of asm-generic/percpu.h, except that percpu_modcopy() is an
  * external routine, to avoid include-hell.
diff -Nurp linux-2.6.21-rc7.0/include/asm-powerpc/percpu.h linux-2.6.21-rc7.1/include/asm-powerpc/percpu.h
--- linux-2.6.21-rc7.0/include/asm-powerpc/percpu.h	2007-04-15 16:50:57.000000000 -0700
+++ linux-2.6.21-rc7.1/include/asm-powerpc/percpu.h	2007-05-04 11:19:37.000000000 -0700
@@ -20,6 +20,11 @@
 #define DEFINE_PER_CPU(type, name) \
     __attribute__((__section__(".data.percpu"))) __typeof__(type) per_cpu__##name
 
+#define DEFINE_PER_CPU_SHARED_CACHELINE_ALIGNED(type, name)		  \
+    __attribute__((__section__(".data.percpu.shared_cacheline_aligned"))) \
+    __typeof__(type) per_cpu__##name					  \
+    ____cacheline_aligned_in_smp 
+
 /* var is in discarded region: offset to particular copy we want */
 #define per_cpu(var, cpu) (*RELOC_HIDE(&per_cpu__##var, __per_cpu_offset(cpu)))
 #define __get_cpu_var(var) (*RELOC_HIDE(&per_cpu__##var, __my_cpu_offset()))
@@ -40,6 +45,8 @@ extern void setup_per_cpu_areas(void);
 
 #define DEFINE_PER_CPU(type, name) \
     __typeof__(type) per_cpu__##name
+#define DEFINE_PER_CPU_SHARED_CACHELINE_ALIGNED(type, name)	\
+    DEFINE_PER_CPU(type, name)
 
 #define per_cpu(var, cpu)			(*((void)(cpu), &per_cpu__##var))
 #define __get_cpu_var(var)			per_cpu__##var
diff -Nurp linux-2.6.21-rc7.0/include/asm-s390/percpu.h linux-2.6.21-rc7.1/include/asm-s390/percpu.h
--- linux-2.6.21-rc7.0/include/asm-s390/percpu.h	2007-04-15 16:50:57.000000000 -0700
+++ linux-2.6.21-rc7.1/include/asm-s390/percpu.h	2007-05-04 11:20:19.000000000 -0700
@@ -41,6 +41,11 @@ extern unsigned long __per_cpu_offset[NR
     __attribute__((__section__(".data.percpu"))) \
     __typeof__(type) per_cpu__##name
 
+#define DEFINE_PER_CPU_SHARED_CACHELINE_ALIGNED(type, name)		  \
+    __attribute__((__section__(".data.percpu.shared_cacheline_aligned"))) \
+    __typeof__(type) per_cpu__##name					  \
+    ____cacheline_aligned_in_smp
+
 #define __get_cpu_var(var) __reloc_hide(var,S390_lowcore.percpu_offset)
 #define __raw_get_cpu_var(var) __reloc_hide(var,S390_lowcore.percpu_offset)
 #define per_cpu(var,cpu) __reloc_hide(var,__per_cpu_offset[cpu])
@@ -59,6 +64,8 @@ do {								\
 
 #define DEFINE_PER_CPU(type, name) \
     __typeof__(type) per_cpu__##name
+#define DEFINE_PER_CPU_SHARED_CACHELINE_ALIGNED(type, name)	\
+    DEFINE_PER_CPU(type, name)
 
 #define __get_cpu_var(var) __reloc_hide(var,0)
 #define __raw_get_cpu_var(var) __reloc_hide(var,0)
diff -Nurp linux-2.6.21-rc7.0/include/asm-sparc64/percpu.h linux-2.6.21-rc7.1/include/asm-sparc64/percpu.h
--- linux-2.6.21-rc7.0/include/asm-sparc64/percpu.h	2007-05-01 07:33:06.000000000 -0700
+++ linux-2.6.21-rc7.1/include/asm-sparc64/percpu.h	2007-05-04 11:20:51.000000000 -0700
@@ -17,6 +17,11 @@ extern unsigned long __per_cpu_shift;
 #define DEFINE_PER_CPU(type, name) \
     __attribute__((__section__(".data.percpu"))) __typeof__(type) per_cpu__##name
 
+#define DEFINE_PER_CPU_SHARED_CACHELINE_ALIGNED(type, name)		  \
+    __attribute__((__section__(".data.percpu.shared_cacheline_aligned"))) \
+    __typeof__(type) per_cpu__##name					  \
+    ____cacheline_aligned_in_smp 
+
 register unsigned long __local_per_cpu_offset asm("g5");
 
 /* var is in discarded region: offset to particular copy we want */
@@ -36,6 +41,8 @@ do {								\
 
 #define DEFINE_PER_CPU(type, name) \
     __typeof__(type) per_cpu__##name
+#define DEFINE_PER_CPU_SHARED_CACHELINE_ALIGNED(type, name)	\
+    DEFINE_PER_CPU(type, name)
 
 #define per_cpu(var, cpu)			(*((void)cpu, &per_cpu__##var))
 #define __get_cpu_var(var)			per_cpu__##var
diff -Nurp linux-2.6.21-rc7.0/include/asm-x86_64/percpu.h linux-2.6.21-rc7.1/include/asm-x86_64/percpu.h
--- linux-2.6.21-rc7.0/include/asm-x86_64/percpu.h	2007-05-01 07:33:06.000000000 -0700
+++ linux-2.6.21-rc7.1/include/asm-x86_64/percpu.h	2007-05-04 11:21:25.000000000 -0700
@@ -20,6 +20,11 @@
 #define DEFINE_PER_CPU(type, name) \
     __attribute__((__section__(".data.percpu"))) __typeof__(type) per_cpu__##name
 
+#define DEFINE_PER_CPU_SHARED_CACHELINE_ALIGNED(type, name)		  \
+    __attribute__((__section__(".data.percpu.shared_cacheline_aligned"))) \
+    __typeof__(type) per_cpu__##name					  \
+    ____cacheline_internodealigned_in_smp 
+
 /* var is in discarded region: offset to particular copy we want */
 #define per_cpu(var, cpu) (*({				\
 	extern int simple_identifier_##var(void);	\
@@ -46,6 +51,8 @@ extern void setup_per_cpu_areas(void);
 
 #define DEFINE_PER_CPU(type, name) \
     __typeof__(type) per_cpu__##name
+#define DEFINE_PER_CPU_SHARED_CACHELINE_ALIGNED(type, name)	\
+    DEFINE_PER_CPU(type, name)
 
 #define per_cpu(var, cpu)			(*((void)(cpu), &per_cpu__##var))
 #define __get_cpu_var(var)			per_cpu__##var

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 2/2] Call percpu smp cacheline algin interface
  2007-05-05  0:12 ` [PATCH 2/2] Call percpu smp cacheline algin interface Fenghua Yu
@ 2007-05-07 22:59   ` Fenghua Yu
  2007-05-09 20:33     ` Andrew Morton
  0 siblings, 1 reply; 20+ messages in thread
From: Fenghua Yu @ 2007-05-07 22:59 UTC (permalink / raw)
  To: akpm, suresh.b.siddha, linux-kernel, clameter; +Cc: fenghua.yu

On Fri, May 04, 2007 at 05:12:31PM -0700, Fenghua Yu wrote:
> 
> Call percpu smp cacheline align interface.
> 

This is an updated patch. It uses the new macro name DEFINE_PER_CPU_SHARED_CACHELINE_ALIGNED to make the code easier to understand.

Signed-off-by: Fenghua Yu <fenghua.yu@intel.com>
Acked-by: Suresh Siddha <suresh.b.siddha@intel.com>

---

 arch/i386/kernel/init_task.c   |    2 +-
 arch/i386/kernel/irq.c         |    2 +-
 arch/ia64/kernel/smp.c         |    2 +-
 arch/x86_64/kernel/init_task.c |    2 +-
 kernel/sched.c                 |    2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)
 
diff -Nurp linux-2.6.21-rc7.0/arch/i386/kernel/init_task.c linux-2.6.21-rc7.1/arch/i386/kernel/init_task.c
--- linux-2.6.21-rc7.0/arch/i386/kernel/init_task.c	2007-04-15 16:50:57.000000000 -0700
+++ linux-2.6.21-rc7.1/arch/i386/kernel/init_task.c	2007-05-02 17:00:34.000000000 -0700
@@ -42,5 +42,5 @@ EXPORT_SYMBOL(init_task);
  * per-CPU TSS segments. Threads are completely 'soft' on Linux,
  * no more per-task TSS's.
  */ 
-DEFINE_PER_CPU(struct tss_struct, init_tss) ____cacheline_internodealigned_in_smp = INIT_TSS;
+DEFINE_PER_CPU_SHARED_CACHELINE_ALIGNED(struct tss_struct, init_tss) = INIT_TSS;
 
diff -Nurp linux-2.6.21-rc7.0/arch/i386/kernel/irq.c linux-2.6.21-rc7.1/arch/i386/kernel/irq.c
--- linux-2.6.21-rc7.0/arch/i386/kernel/irq.c	2007-05-01 07:32:59.000000000 -0700
+++ linux-2.6.21-rc7.1/arch/i386/kernel/irq.c	2007-05-02 17:00:34.000000000 -0700
@@ -21,7 +21,7 @@
 #include <asm/apic.h>
 #include <asm/uaccess.h>
 
-DEFINE_PER_CPU(irq_cpustat_t, irq_stat) ____cacheline_internodealigned_in_smp;
+DEFINE_PER_CPU_SHARED_CACHELINE_ALIGNED(irq_cpustat_t, irq_stat);
 EXPORT_PER_CPU_SYMBOL(irq_stat);
 
 DEFINE_PER_CPU(struct pt_regs *, irq_regs);
diff -Nurp linux-2.6.21-rc7.0/arch/ia64/kernel/smp.c linux-2.6.21-rc7.1/arch/ia64/kernel/smp.c
--- linux-2.6.21-rc7.0/arch/ia64/kernel/smp.c	2007-04-15 16:50:57.000000000 -0700
+++ linux-2.6.21-rc7.1/arch/ia64/kernel/smp.c	2007-05-02 17:00:34.000000000 -0700
@@ -70,7 +70,7 @@ static volatile struct call_data_struct 
 #define IPI_KDUMP_CPU_STOP	3
 
 /* This needs to be cacheline aligned because it is written to by *other* CPUs.  */
-static DEFINE_PER_CPU(u64, ipi_operation) ____cacheline_aligned;
+static DEFINE_PER_CPU_SHARED_CACHELINE_ALIGNED(u64, ipi_operation);
 
 extern void cpu_halt (void);
 
diff -Nurp linux-2.6.21-rc7.0/arch/x86_64/kernel/init_task.c linux-2.6.21-rc7.1/arch/x86_64/kernel/init_task.c
--- linux-2.6.21-rc7.0/arch/x86_64/kernel/init_task.c	2007-04-15 16:50:57.000000000 -0700
+++ linux-2.6.21-rc7.1/arch/x86_64/kernel/init_task.c	2007-05-02 17:00:34.000000000 -0700
@@ -44,7 +44,7 @@ EXPORT_SYMBOL(init_task);
  * section. Since TSS's are completely CPU-local, we want them
  * on exact cacheline boundaries, to eliminate cacheline ping-pong.
  */ 
-DEFINE_PER_CPU(struct tss_struct, init_tss) ____cacheline_internodealigned_in_smp = INIT_TSS;
+DEFINE_PER_CPU_SHARED_CACHELINE_ALIGNED(struct tss_struct, init_tss) = INIT_TSS;
 
 /* Copies of the original ist values from the tss are only accessed during
  * debugging, no special alignment required.
diff -Nurp linux-2.6.21-rc7.0/kernel/sched.c linux-2.6.21-rc7.1/kernel/sched.c
--- linux-2.6.21-rc7.0/kernel/sched.c	2007-05-01 07:33:07.000000000 -0700
+++ linux-2.6.21-rc7.1/kernel/sched.c	2007-05-02 17:00:34.000000000 -0700
@@ -263,7 +263,7 @@ struct rq {
 	struct lock_class_key rq_lock_key;
 };
 
-static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp;
+static DEFINE_PER_CPU_SHARED_CACHELINE_ALIGNED(struct rq, runqueues);
 static DEFINE_MUTEX(sched_hotcpu_mutex);
 
 static inline int cpu_of(struct rq *rq)

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 2/2] Call percpu smp cacheline algin interface
  2007-05-07 22:59   ` Fenghua Yu
@ 2007-05-09 20:33     ` Andrew Morton
  2007-05-09 22:16       ` Yu, Fenghua
  0 siblings, 1 reply; 20+ messages in thread
From: Andrew Morton @ 2007-05-09 20:33 UTC (permalink / raw)
  To: Fenghua Yu; +Cc: suresh.b.siddha, linux-kernel, clameter

On Mon, 7 May 2007 15:59:44 -0700
Fenghua Yu <fenghua.yu@intel.com> wrote:

> On Fri, May 04, 2007 at 05:12:31PM -0700, Fenghua Yu wrote:
> > 
> > Call percpu smp cacheline align interface.
> > 
> 
> This is updated patch. Use new macro name DEFINE_PER_CPU_SHARED_CACHELINE_ALIGNED for better code understanding.
> 
> Signed-off-by: Fenghua Yu <fenghua.yu@intel.com>
> Acked-by: Suresh Siddha <suresh.b.siddha@intel.com>

hm, DEFINE_PER_CPU_SHARED_CACHELINE_ALIGNED is a bit of a mouthful.

I wonder if we can improve things here so that we use the runtime-detected
cacheline size rather than the compile-time size.  I guess not, given that
the offsets into the percpu area are calculated at build-time.

Did you work out how much space this change will actually save?  It
should be available by suitable crunching on the nm and objdump output.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* RE: [PATCH 2/2] Call percpu smp cacheline algin interface
  2007-05-09 20:33     ` Andrew Morton
@ 2007-05-09 22:16       ` Yu, Fenghua
  2007-05-09 22:53         ` Andrew Morton
  0 siblings, 1 reply; 20+ messages in thread
From: Yu, Fenghua @ 2007-05-09 22:16 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Siddha, Suresh B, linux-kernel, clameter


>hm, DEFINE_PER_CPU_SHARED_CACHELINE_ALIGNED is a bit of a mouthful.

>I wonder if we can improve things here so that we use the
runtime-detected
>cacheline size rather than the compile-time size.  I guess not, given
that
>the offsets into the percpu area are calculated at build-time.

>Did you work out how much space this change will actually save?  It
>should be available by suitable crunching on the nm and objdump output.

Depending on how the linker arranges the data fields, the patches can
either save or waste per_cpu space. Below is the data I collected.


Case 1: On linux-2.6.21-rc7-mm2 with defconfig build.
Case 2: On linux-2.6.21-rc7-mm2 plus the patches in this thread with
defconfig build.
Case 3: On linux-2.6.21-rc7-mm2 with defconfig with VSMP=y build.
Case 4: On linux-2.6.21-rc7-mm2 plus the patches in this thread with
defconfig with VSMP=y build.

Please note that on x86/x86-64, per_cpu__init_tss is placed first in the
per_cpu section in Cases 1 and 3, so there is no padding waste for
per_cpu__init_tss in those cases.

On X86:
Case 1: Size of per_cpu section is: 0x7768
Case 2: Size of per_cpu section is: 0x790c
The patches waste 0x1a4 bytes.

per_cpu__init_tss, per_cpu__irq_stat, and per_cpu__runqueues are moved
to shared_cacheline_aligned section.

On X86-64:
Case 1: Size of per_cpu section is: 0x72d0
Case 2: Size of per_cpu section is: 0x6540
The patches save 0xd90 bytes.

Case 3: Size of per_cpu section is: 0x72d0
Case 4: Size of per_cpu section is: 0x8340
The patches waste 0x1070 bytes. 

Should we avoid using the shared_cacheline_aligned section in the VSMP
case? The wasted cache may eventually offset the potential gain from
alignment.

We probably need to set up a cache line size threshold: if the L1 cache
line size is bigger than some value CACHELINE_ALIGN_THRESHOLD, don't do
cacheline alignment.

per_cpu__init_tss and per_cpu__runqueues are moved to
shared_cacheline_aligned section.

On ia64:
Case 1: Size of per_cpu section is: 0x8370
Case 2: Size of per_cpu section is: 0x7fc0
The patches save 0x3b0 bytes.

per_cpu__ipi_operation and per_cpu__runqueues are moved to the
shared_cacheline_aligned section.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 2/2] Call percpu smp cacheline algin interface
  2007-05-09 22:16       ` Yu, Fenghua
@ 2007-05-09 22:53         ` Andrew Morton
  2007-05-09 22:56           ` Christoph Lameter
  0 siblings, 1 reply; 20+ messages in thread
From: Andrew Morton @ 2007-05-09 22:53 UTC (permalink / raw)
  To: Yu, Fenghua; +Cc: Siddha, Suresh B, linux-kernel, clameter

On Wed, 9 May 2007 15:16:11 -0700
"Yu, Fenghua" <fenghua.yu@intel.com> wrote:

> 
> >hm, DEFINE_PER_CPU_SHARED_CACHELINE_ALIGNED is a bit of a mouthful.
> 
> >I wonder if we can improve things here so that we use the
> runtime-detected
> >cacheline size rather than the compile-time size.  I guess not, given
> that
> >the offsets into the percpu area are calculated at build-time.
> 
> >Did you work out how much space this change will actually save?  It
> >should be available by suitable crunching on the nm and objdump output.
> 
> Depending on how data fields are arranged by linker, the patches could
> save or waste per_cpu size. Below is data I got.
> 
> 
> Case 1: On linux-2.6.21-rc7-mm2 with defconfig build.
> Case 2: On linux-2.6.21-rc7-mm2 plus the patches in this thread with
> defconfig build.
> Case 3: On linux-2.6.21-rc7-mm2 with defconfig with VSMP=y build.
> Case 4: On linux-2.6.21-rc7-mm2 plus the patches in this thread with
> defconfig with VSMP=y build.
> 
> Please note that on x86/x86-64, per_cpu_init_tss is placed in the first
> place in per_cpu section in Case 1 and 3. And thus there is no padding
> waste for per_cpu_init_tss in Case 1 and 3.
> 
> On X86:
> Case 1: Size of per_cpu section is: 0x7768
> Case 2: Size of per_cpu section is: 0x790c
> The patches waste 0x1a4 bytes.
> 
> per_cpu__init_tss, per_cpu__irq_stat, and per_cpu__runqueues are moved
> to shared_cacheline_aligned section.
> 
> On X86-64:
> Case 1: Size of per_cpu section is: 0x72d0
> Case 2: Size of per_cpu section is: 0x6540
> The patches save 0xd90 bytes.
> 
> Case 3: Size of per_cpu section is: 0x72d0
> Case 4: Size of per_cpu section is: 0x8340
> The patches waste 0x1070 bytes. 
> 
> Shall we not use shared_cacheline_aligned section for VSMP case? The
> waste of cache eventually may offset the potential gain of alignment.
> 
> Probably need to set up a cache line size threshold: if L1 cache line
> size is bigger than a number CACHELINE_ALIGN_SHRESHOLD, don't do
> cacheline alignment.
> 
> per_cpu__init_tss and per_cpu__runqueues are moved to
> shared_cacheline_aligned section.
> 
> On ia64:
> Case 1: Size of per_cpu section is: 0x8370
> Case 2: Size of per_cpu section is: 0x7fc0
> The patches save 0x3b0 bytes.
> 
> per_cpu_ipi_operation and per_cpu_runqueues are moved to
> shared_cacheline_aligned section

erm, it's not obvious from all this that the patches are worth proceeding
with, are they?

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 2/2] Call percpu smp cacheline algin interface
  2007-05-09 22:53         ` Andrew Morton
@ 2007-05-09 22:56           ` Christoph Lameter
  2007-05-09 23:06             ` Siddha, Suresh B
  2007-05-09 23:10             ` Yu, Fenghua
  0 siblings, 2 replies; 20+ messages in thread
From: Christoph Lameter @ 2007-05-09 22:56 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Yu, Fenghua, Siddha, Suresh B, linux-kernel

On Wed, 9 May 2007, Andrew Morton wrote:

> erm, it's not obviosu from all this that the patches are worth proceeding
> with, are they?

What was it? 0.5% performance improvement on a synthetic benchmark? 
Process wakeup I believe?



^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 2/2] Call percpu smp cacheline algin interface
  2007-05-09 22:56           ` Christoph Lameter
@ 2007-05-09 23:06             ` Siddha, Suresh B
  2007-05-09 23:10             ` Yu, Fenghua
  1 sibling, 0 replies; 20+ messages in thread
From: Siddha, Suresh B @ 2007-05-09 23:06 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Andrew Morton, Yu, Fenghua, Siddha, Suresh B, linux-kernel

On Wed, May 09, 2007 at 03:56:57PM -0700, Christoph Lameter wrote:
>  On Wed, 9 May 2007, Andrew Morton wrote:
> > erm, it's not obviosu from all this that the patches are worth proceeding
> > with, are they?

Andrew, There are two advantages with this patch.

1. Space savings. For me, I really don't worry about this, as VSMP only
uses 4K cacheline alignment and the regular kernels don't use VSMP.

2. Data accessed by different cpus gets its own cachelines.
Both the beginning and end of these data elements will be cacheline
aligned,
i.e., other data which gets referenced heavily by the local cpu will not bring
the remote lines to the local cpu (no unnecessary cacheline bouncing). This
is important going forward.

> 
> What was it? 0.5% performance improvement on a synthetic benchmark?
> Process wakeup I believe?

Christoph, don't get confused. The patch which gives 0.5% is already
in Linus' git tree. This adds a clean infrastructure to avoid the two
above-mentioned issues.

thanks,
suresh

^ permalink raw reply	[flat|nested] 20+ messages in thread

* RE: [PATCH 2/2] Call percpu smp cacheline algin interface
  2007-05-09 22:56           ` Christoph Lameter
  2007-05-09 23:06             ` Siddha, Suresh B
@ 2007-05-09 23:10             ` Yu, Fenghua
  2007-05-09 23:36               ` Andrew Morton
  1 sibling, 1 reply; 20+ messages in thread
From: Yu, Fenghua @ 2007-05-09 23:10 UTC (permalink / raw)
  To: Christoph Lameter, Andrew Morton; +Cc: Siddha, Suresh B, linux-kernel



On Wed, 9 May 2007, Andrew Morton wrote:

>> erm, it's not obvious from all this that the patches are worth
>> proceeding with, are they?

>What was it? 0.5% performance improvement on a synthetic benchmark? 
>Process wakeup I believe?

The initial patch and discussion is from:
http://www.uwsg.iu.edu/hypermail/linux/kernel/0704.1/0340.html

Yes, the runqueue patch has a 0.5% perf improvement on database
workload(which is a good improvement for this workload).

The theory behind the patches is:

1. Minimize number of cache lines that are touched during a remote
access. On Numa system, remote access is more expensive than local.
2. Do not share cache line between remote accessed data and local
accessed data. Local data update may cause remote access cache miss and
wait for longer time.

Although the patches themselves don't save or waste per_cpu size, the
above two reasons are good to have them in.

Thanks.

-Fenghua

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 2/2] Call percpu smp cacheline algin interface
  2007-05-09 23:10             ` Yu, Fenghua
@ 2007-05-09 23:36               ` Andrew Morton
  0 siblings, 0 replies; 20+ messages in thread
From: Andrew Morton @ 2007-05-09 23:36 UTC (permalink / raw)
  To: Yu, Fenghua; +Cc: Christoph Lameter, Siddha, Suresh B, linux-kernel

On Wed, 9 May 2007 16:10:05 -0700
"Yu, Fenghua" <fenghua.yu@intel.com> wrote:

> 
> 
> On Wed, 9 May 2007, Andrew Morton wrote:
> 
> >> erm, it's not obvious from all this that the patches are worth
> >> proceeding with, are they?
> 
> >What was it? 0.5% performance improvement on a synthetic benchmark? 
> >Process wakeup I believe?
> 
> The initial patch and discussion is from:
> http://www.uwsg.iu.edu/hypermail/linux/kernel/0704.1/0340.html
> 
> Yes, the runqueue patch has a 0.5% perf improvement on database
> workload(which is a good improvement for this workload).
> 
> The theory behind the patches is:
> 
> 1. Minimize number of cache lines that are touched during a remote
> access. On Numa system, remote access is more expensive than local.
> 2. Do not share cache line between remote accessed data and local
> accessed data. Local data update may cause remote access cache miss and
> wait for longer time.
> 
> Although the patches themselves don't save or waste per_cpu size, the
> above two reasons are good to have them in.
> 

Guys, this is all a lesson in the value of changelogs, and in how not to
write them.

Can you please prepare a new changelog for these patches?  Something which
encapsulates all the above in as brief a form as possible and which
includes some numbers describing the space and/or speed improvements?

Thanks.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* [PATCH 2/2] Use the new percpu interface for shared data -- version 2
  2007-05-07 17:46         ` Siddha, Suresh B
  2007-05-07 18:13           ` Yu, Fenghua
@ 2007-05-15  0:12           ` Fenghua Yu
       [not found]           ` <20070515001255.GA27978@linux-os.sc.intel.com>
  2 siblings, 0 replies; 20+ messages in thread
From: Fenghua Yu @ 2007-05-15  0:12 UTC (permalink / raw)
  To: akpm, Siddha, Suresh B, Christoph Lameter, kiran, linux-kernel; +Cc: fenghua.yu


Currently most of the per cpu data, which is accessed by different cpus, has a
____cacheline_aligned_in_smp attribute. Move all this data to the new per cpu
shared data section: .data.percpu.shared_aligned.

This will separate the percpu data which is referenced frequently by other cpus
from the local only percpu data.

Signed-off-by: Fenghua Yu <fenghua.yu@intel.com>
Acked-by: Suresh Siddha <suresh.b.siddha@intel.com>

---

 arch/i386/kernel/init_task.c   |    2 +-
 arch/i386/kernel/irq.c         |    2 +-
 arch/ia64/kernel/smp.c         |    2 +-
 arch/x86_64/kernel/init_task.c |    2 +-
 kernel/sched.c                 |    2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

diff -Nurp linux-2.6.21-rc7.0/arch/i386/kernel/init_task.c linux-2.6.21-rc7.1/arch/i386/kernel/init_task.c
--- linux-2.6.21-rc7.0/arch/i386/kernel/init_task.c	2007-04-15 16:50:57.000000000 -0700
+++ linux-2.6.21-rc7.1/arch/i386/kernel/init_task.c	2007-05-14 12:44:43.000000000 -0700
@@ -42,5 +42,5 @@ EXPORT_SYMBOL(init_task);
  * per-CPU TSS segments. Threads are completely 'soft' on Linux,
  * no more per-task TSS's.
  */ 
-DEFINE_PER_CPU(struct tss_struct, init_tss) ____cacheline_internodealigned_in_smp = INIT_TSS;
+DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS;
 
diff -Nurp linux-2.6.21-rc7.0/arch/i386/kernel/irq.c linux-2.6.21-rc7.1/arch/i386/kernel/irq.c
--- linux-2.6.21-rc7.0/arch/i386/kernel/irq.c	2007-05-01 07:32:59.000000000 -0700
+++ linux-2.6.21-rc7.1/arch/i386/kernel/irq.c	2007-05-14 12:44:43.000000000 -0700
@@ -21,7 +21,7 @@
 #include <asm/apic.h>
 #include <asm/uaccess.h>
 
-DEFINE_PER_CPU(irq_cpustat_t, irq_stat) ____cacheline_internodealigned_in_smp;
+DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
 EXPORT_PER_CPU_SYMBOL(irq_stat);
 
 DEFINE_PER_CPU(struct pt_regs *, irq_regs);
diff -Nurp linux-2.6.21-rc7.0/arch/ia64/kernel/smp.c linux-2.6.21-rc7.1/arch/ia64/kernel/smp.c
--- linux-2.6.21-rc7.0/arch/ia64/kernel/smp.c	2007-04-15 16:50:57.000000000 -0700
+++ linux-2.6.21-rc7.1/arch/ia64/kernel/smp.c	2007-05-14 12:44:43.000000000 -0700
@@ -70,7 +70,7 @@ static volatile struct call_data_struct 
 #define IPI_KDUMP_CPU_STOP	3
 
 /* This needs to be cacheline aligned because it is written to by *other* CPUs.  */
-static DEFINE_PER_CPU(u64, ipi_operation) ____cacheline_aligned;
+static DEFINE_PER_CPU_SHARED_ALIGNED(u64, ipi_operation);
 
 extern void cpu_halt (void);
 
diff -Nurp linux-2.6.21-rc7.0/arch/x86_64/kernel/init_task.c linux-2.6.21-rc7.1/arch/x86_64/kernel/init_task.c
--- linux-2.6.21-rc7.0/arch/x86_64/kernel/init_task.c	2007-04-15 16:50:57.000000000 -0700
+++ linux-2.6.21-rc7.1/arch/x86_64/kernel/init_task.c	2007-05-14 12:44:43.000000000 -0700
@@ -44,7 +44,7 @@ EXPORT_SYMBOL(init_task);
  * section. Since TSS's are completely CPU-local, we want them
  * on exact cacheline boundaries, to eliminate cacheline ping-pong.
  */ 
-DEFINE_PER_CPU(struct tss_struct, init_tss) ____cacheline_internodealigned_in_smp = INIT_TSS;
+DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS;
 
 /* Copies of the original ist values from the tss are only accessed during
  * debugging, no special alignment required.
diff -Nurp linux-2.6.21-rc7.0/kernel/sched.c linux-2.6.21-rc7.1/kernel/sched.c
--- linux-2.6.21-rc7.0/kernel/sched.c	2007-05-01 07:33:07.000000000 -0700
+++ linux-2.6.21-rc7.1/kernel/sched.c	2007-05-14 12:44:43.000000000 -0700
@@ -263,7 +263,7 @@ struct rq {
 	struct lock_class_key rq_lock_key;
 };
 
-static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp;
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
 static DEFINE_MUTEX(sched_hotcpu_mutex);
 
 static inline int cpu_of(struct rq *rq)

^ permalink raw reply	[flat|nested] 20+ messages in thread

* [PATCH 1/2] Define new percpu interface for shared data -- version 2
       [not found]           ` <20070515001255.GA27978@linux-os.sc.intel.com>
@ 2007-05-15  0:22             ` Fenghua Yu
  0 siblings, 0 replies; 20+ messages in thread
From: Fenghua Yu @ 2007-05-15  0:22 UTC (permalink / raw)
  To: akpm, Siddha, Suresh B, Christoph Lameter, kiran, linux-kernel; +Cc: fenghua.yu

Changes from previous version:
1. Shorter macro and section names.
2. Detailed change log.
 
per cpu data section contains two types of data. One set which is exclusively
accessed by the local cpu and the other set which is per cpu, but also shared
by remote cpus. In the current kernel, these two sets are not clearly
separated out. This can potentially cause the same data cacheline shared
between the two sets of data, which will result in unnecessary bouncing of the
cacheline between cpus.
 
One way to fix the problem is to cacheline align the remotely accessed per cpu
data, both at the beginning and at the end. Because of the padding at both ends,
this will likely cause some memory wastage and also the interface to achieve
this is not clean.
 
This patch:

Moves the remotely accessed per cpu data (which is currently marked
as ____cacheline_aligned_in_smp) into a different section, where all the data
elements are cacheline aligned. And as such, this differentiates the local
only data and remotely accessed data cleanly.

Signed-off-by: Fenghua Yu <fenghua.yu@intel.com>
Acked-by: Suresh Siddha <suresh.b.siddha@intel.com>

---

 arch/alpha/kernel/vmlinux.lds.S   |    5 +----
 arch/arm/kernel/vmlinux.lds.S     |    5 +----
 arch/cris/arch-v32/vmlinux.lds.S  |    5 +----
 arch/frv/kernel/vmlinux.lds.S     |    5 +----
 arch/i386/kernel/vmlinux.lds.S    |    7 +------
 arch/ia64/kernel/vmlinux.lds.S    |    1 +
 arch/m32r/kernel/vmlinux.lds.S    |    5 +----
 arch/mips/kernel/vmlinux.lds.S    |    5 +----
 arch/parisc/kernel/vmlinux.lds.S  |    7 +++----
 arch/powerpc/kernel/vmlinux.lds.S |    7 +------
 arch/ppc/kernel/vmlinux.lds.S     |    5 +----
 arch/s390/kernel/vmlinux.lds.S    |    5 +----
 arch/sh/kernel/vmlinux.lds.S      |    5 +----
 arch/sh64/kernel/vmlinux.lds.S    |    5 +----
 arch/sparc/kernel/vmlinux.lds.S   |    5 +----
 arch/sparc64/kernel/vmlinux.lds.S |    5 +----
 arch/x86_64/kernel/vmlinux.lds.S  |    6 ++----
 arch/xtensa/kernel/vmlinux.lds.S  |    5 +----
 include/asm-generic/percpu.h      |    8 ++++++++
 include/asm-generic/vmlinux.lds.h |   10 ++++++++++
 include/asm-i386/percpu.h         |    5 +++++
 include/asm-ia64/percpu.h         |   10 ++++++++++
 include/asm-powerpc/percpu.h      |    7 +++++++
 include/asm-s390/percpu.h         |    7 +++++++
 include/asm-sparc64/percpu.h      |    7 +++++++
 include/asm-x86_64/percpu.h       |    7 +++++++
 26 files changed, 82 insertions(+), 72 deletions(-)


diff -Nurp linux-2.6.21-rc7.0/arch/alpha/kernel/vmlinux.lds.S linux-2.6.21-rc7.1/arch/alpha/kernel/vmlinux.lds.S
--- linux-2.6.21-rc7.0/arch/alpha/kernel/vmlinux.lds.S	2007-05-01 07:32:58.000000000 -0700
+++ linux-2.6.21-rc7.1/arch/alpha/kernel/vmlinux.lds.S	2007-05-14 12:44:40.000000000 -0700
@@ -69,10 +69,7 @@ SECTIONS
   . = ALIGN(8);
   SECURITY_INIT
 
-  . = ALIGN(8192);
-  __per_cpu_start = .;
-  .data.percpu : { *(.data.percpu) }
-  __per_cpu_end = .;
+  PERCPU(8192)
 
   . = ALIGN(2*8192);
   __init_end = .;
diff -Nurp linux-2.6.21-rc7.0/arch/arm/kernel/vmlinux.lds.S linux-2.6.21-rc7.1/arch/arm/kernel/vmlinux.lds.S
--- linux-2.6.21-rc7.0/arch/arm/kernel/vmlinux.lds.S	2007-05-01 07:32:58.000000000 -0700
+++ linux-2.6.21-rc7.1/arch/arm/kernel/vmlinux.lds.S	2007-05-14 12:44:40.000000000 -0700
@@ -59,10 +59,7 @@ SECTIONS
 			usr/built-in.o(.init.ramfs)
 		__initramfs_end = .;
 #endif
-		. = ALIGN(4096);
-		__per_cpu_start = .;
-			*(.data.percpu)
-		__per_cpu_end = .;
+		PERCPU(4096)
 #ifndef CONFIG_XIP_KERNEL
 		__init_begin = _stext;
 		*(.init.data)
diff -Nurp linux-2.6.21-rc7.0/arch/cris/arch-v32/vmlinux.lds.S linux-2.6.21-rc7.1/arch/cris/arch-v32/vmlinux.lds.S
--- linux-2.6.21-rc7.0/arch/cris/arch-v32/vmlinux.lds.S	2007-05-01 07:32:59.000000000 -0700
+++ linux-2.6.21-rc7.1/arch/cris/arch-v32/vmlinux.lds.S	2007-05-14 12:44:40.000000000 -0700
@@ -92,10 +92,7 @@ SECTIONS
 	}
 	SECURITY_INIT
 
-	. =  ALIGN (8192);
-	__per_cpu_start = .;
-	.data.percpu  : { *(.data.percpu) }
-	__per_cpu_end = .;
+	PERCPU(8192)
 
 #ifdef CONFIG_BLK_DEV_INITRD
 	.init.ramfs : {
diff -Nurp linux-2.6.21-rc7.0/arch/frv/kernel/vmlinux.lds.S linux-2.6.21-rc7.1/arch/frv/kernel/vmlinux.lds.S
--- linux-2.6.21-rc7.0/arch/frv/kernel/vmlinux.lds.S	2007-05-01 07:32:59.000000000 -0700
+++ linux-2.6.21-rc7.1/arch/frv/kernel/vmlinux.lds.S	2007-05-14 12:44:40.000000000 -0700
@@ -57,10 +57,7 @@ SECTIONS
   __alt_instructions_end = .;
  .altinstr_replacement : { *(.altinstr_replacement) }
 
-  . = ALIGN(4096);
-  __per_cpu_start = .;
-  .data.percpu  : { *(.data.percpu) }
-  __per_cpu_end = .;
+  PERCPU(4096)
 
 #ifdef CONFIG_BLK_DEV_INITRD
   . = ALIGN(4096);
diff -Nurp linux-2.6.21-rc7.0/arch/i386/kernel/vmlinux.lds.S linux-2.6.21-rc7.1/arch/i386/kernel/vmlinux.lds.S
--- linux-2.6.21-rc7.0/arch/i386/kernel/vmlinux.lds.S	2007-05-01 07:32:59.000000000 -0700
+++ linux-2.6.21-rc7.1/arch/i386/kernel/vmlinux.lds.S	2007-05-14 12:44:40.000000000 -0700
@@ -185,12 +185,7 @@ SECTIONS
 	__initramfs_end = .;
   }
 #endif
-  . = ALIGN(4096);
-  .data.percpu  : AT(ADDR(.data.percpu) - LOAD_OFFSET) {
-	__per_cpu_start = .;
-	*(.data.percpu)
-	__per_cpu_end = .;
-  }
+  PERCPU(4096)
   . = ALIGN(4096);
   /* freed after init ends here */
 	
diff -Nurp linux-2.6.21-rc7.0/arch/ia64/kernel/vmlinux.lds.S linux-2.6.21-rc7.1/arch/ia64/kernel/vmlinux.lds.S
--- linux-2.6.21-rc7.0/arch/ia64/kernel/vmlinux.lds.S	2007-05-01 07:32:59.000000000 -0700
+++ linux-2.6.21-rc7.1/arch/ia64/kernel/vmlinux.lds.S	2007-05-14 12:44:40.000000000 -0700
@@ -206,6 +206,7 @@ SECTIONS
 	{
 		__per_cpu_start = .;
 		*(.data.percpu)
+		*(.data.percpu.shared_aligned)
 		__per_cpu_end = .;
 	}
   . = __phys_per_cpu_start + PERCPU_PAGE_SIZE;	/* ensure percpu data fits
diff -Nurp linux-2.6.21-rc7.0/arch/m32r/kernel/vmlinux.lds.S linux-2.6.21-rc7.1/arch/m32r/kernel/vmlinux.lds.S
--- linux-2.6.21-rc7.0/arch/m32r/kernel/vmlinux.lds.S	2007-05-01 07:32:59.000000000 -0700
+++ linux-2.6.21-rc7.1/arch/m32r/kernel/vmlinux.lds.S	2007-05-14 12:44:40.000000000 -0700
@@ -111,10 +111,7 @@ SECTIONS
   __initramfs_end = .;
 #endif
 
-  . = ALIGN(4096);
-  __per_cpu_start = .;
-  .data.percpu  : { *(.data.percpu) }
-  __per_cpu_end = .;
+  PERCPU(4096)
   . = ALIGN(4096);
   __init_end = .;
   /* freed after init ends here */
diff -Nurp linux-2.6.21-rc7.0/arch/mips/kernel/vmlinux.lds.S linux-2.6.21-rc7.1/arch/mips/kernel/vmlinux.lds.S
--- linux-2.6.21-rc7.0/arch/mips/kernel/vmlinux.lds.S	2007-05-01 07:32:59.000000000 -0700
+++ linux-2.6.21-rc7.1/arch/mips/kernel/vmlinux.lds.S	2007-05-14 12:44:40.000000000 -0700
@@ -121,10 +121,7 @@ SECTIONS
   .init.ramfs : { *(.init.ramfs) }
   __initramfs_end = .;
 #endif
-  . = ALIGN(_PAGE_SIZE);
-  __per_cpu_start = .;
-  .data.percpu  : { *(.data.percpu) }
-  __per_cpu_end = .;
+  PERCPU(_PAGE_SIZE)
   . = ALIGN(_PAGE_SIZE);
   __init_end = .;
   /* freed after init ends here */
diff -Nurp linux-2.6.21-rc7.0/arch/parisc/kernel/vmlinux.lds.S linux-2.6.21-rc7.1/arch/parisc/kernel/vmlinux.lds.S
--- linux-2.6.21-rc7.0/arch/parisc/kernel/vmlinux.lds.S	2007-05-01 07:32:59.000000000 -0700
+++ linux-2.6.21-rc7.1/arch/parisc/kernel/vmlinux.lds.S	2007-05-14 12:44:40.000000000 -0700
@@ -182,10 +182,9 @@ SECTIONS
   .init.ramfs : { *(.init.ramfs) }
   __initramfs_end = .;
 #endif
-  . = ALIGN(ASM_PAGE_SIZE);
-  __per_cpu_start = .;
-  .data.percpu  : { *(.data.percpu) }
-  __per_cpu_end = .;
+
+  PERCPU(ASM_PAGE_SIZE)
+
   . = ALIGN(ASM_PAGE_SIZE);
   __init_end = .;
   /* freed after init ends here */
diff -Nurp linux-2.6.21-rc7.0/arch/powerpc/kernel/vmlinux.lds.S linux-2.6.21-rc7.1/arch/powerpc/kernel/vmlinux.lds.S
--- linux-2.6.21-rc7.0/arch/powerpc/kernel/vmlinux.lds.S	2007-05-01 07:32:59.000000000 -0700
+++ linux-2.6.21-rc7.1/arch/powerpc/kernel/vmlinux.lds.S	2007-05-14 12:44:40.000000000 -0700
@@ -139,12 +139,7 @@ SECTIONS
 		__initramfs_end = .;
 	}
 #endif
-	. = ALIGN(PAGE_SIZE);
-	.data.percpu : {
-		__per_cpu_start = .;
-		*(.data.percpu)
-		__per_cpu_end = .;
-	}
+	PERCPU(PAGE_SIZE)
 
 	. = ALIGN(8);
 	.machine.desc : {
diff -Nurp linux-2.6.21-rc7.0/arch/ppc/kernel/vmlinux.lds.S linux-2.6.21-rc7.1/arch/ppc/kernel/vmlinux.lds.S
--- linux-2.6.21-rc7.0/arch/ppc/kernel/vmlinux.lds.S	2007-05-01 07:32:59.000000000 -0700
+++ linux-2.6.21-rc7.1/arch/ppc/kernel/vmlinux.lds.S	2007-05-14 12:44:40.000000000 -0700
@@ -131,10 +131,7 @@ SECTIONS
   __ftr_fixup : { *(__ftr_fixup) }
   __stop___ftr_fixup = .;
 
-  . = ALIGN(4096);
-  __per_cpu_start = .;
-  .data.percpu  : { *(.data.percpu) }
-  __per_cpu_end = .;
+  PERCPU(4096)
 
 #ifdef CONFIG_BLK_DEV_INITRD
   . = ALIGN(4096);
diff -Nurp linux-2.6.21-rc7.0/arch/s390/kernel/vmlinux.lds.S linux-2.6.21-rc7.1/arch/s390/kernel/vmlinux.lds.S
--- linux-2.6.21-rc7.0/arch/s390/kernel/vmlinux.lds.S	2007-05-01 07:32:59.000000000 -0700
+++ linux-2.6.21-rc7.1/arch/s390/kernel/vmlinux.lds.S	2007-05-14 12:44:40.000000000 -0700
@@ -108,10 +108,7 @@ SECTIONS
   . = ALIGN(2);
   __initramfs_end = .;
 #endif
-  . = ALIGN(4096);
-  __per_cpu_start = .;
-  .data.percpu  : { *(.data.percpu) }
-  __per_cpu_end = .;
+  PERCPU(4096)
   . = ALIGN(4096);
   __init_end = .;
   /* freed after init ends here */
diff -Nurp linux-2.6.21-rc7.0/arch/sh/kernel/vmlinux.lds.S linux-2.6.21-rc7.1/arch/sh/kernel/vmlinux.lds.S
--- linux-2.6.21-rc7.0/arch/sh/kernel/vmlinux.lds.S	2007-05-01 07:32:59.000000000 -0700
+++ linux-2.6.21-rc7.1/arch/sh/kernel/vmlinux.lds.S	2007-05-14 12:44:40.000000000 -0700
@@ -61,10 +61,7 @@ SECTIONS
   . = ALIGN(PAGE_SIZE);
   __nosave_end = .;
 
-  . = ALIGN(PAGE_SIZE);
-  __per_cpu_start = .;
-  .data.percpu : { *(.data.percpu) }
-  __per_cpu_end = .;
+  PERCPU(PAGE_SIZE)
   .data.cacheline_aligned : { *(.data.cacheline_aligned) }
 
   _edata = .;			/* End of data section */
diff -Nurp linux-2.6.21-rc7.0/arch/sh64/kernel/vmlinux.lds.S linux-2.6.21-rc7.1/arch/sh64/kernel/vmlinux.lds.S
--- linux-2.6.21-rc7.0/arch/sh64/kernel/vmlinux.lds.S	2007-05-01 07:32:59.000000000 -0700
+++ linux-2.6.21-rc7.1/arch/sh64/kernel/vmlinux.lds.S	2007-05-14 12:44:40.000000000 -0700
@@ -86,10 +86,7 @@ SECTIONS
   . = ALIGN(PAGE_SIZE);
   .data.page_aligned : C_PHYS(.data.page_aligned) { *(.data.page_aligned) }
 
-  . = ALIGN(PAGE_SIZE);
-  __per_cpu_start = .;
-  .data.percpu : C_PHYS(.data.percpu) { *(.data.percpu) }
-  __per_cpu_end = . ;
+  PERCPU(PAGE_SIZE)
   .data.cacheline_aligned : C_PHYS(.data.cacheline_aligned) { *(.data.cacheline_aligned) }
 
   _edata = .;			/* End of data section */
diff -Nurp linux-2.6.21-rc7.0/arch/sparc/kernel/vmlinux.lds.S linux-2.6.21-rc7.1/arch/sparc/kernel/vmlinux.lds.S
--- linux-2.6.21-rc7.0/arch/sparc/kernel/vmlinux.lds.S	2007-05-01 07:32:59.000000000 -0700
+++ linux-2.6.21-rc7.1/arch/sparc/kernel/vmlinux.lds.S	2007-05-14 12:44:40.000000000 -0700
@@ -66,10 +66,7 @@ SECTIONS
   __initramfs_end = .;
 #endif
 
-  . = ALIGN(4096);
-  __per_cpu_start = .;
-  .data.percpu  : { *(.data.percpu) }
-  __per_cpu_end = .;
+  PERCPU(4096)
   . = ALIGN(4096);
   __init_end = .;
   . = ALIGN(32);
diff -Nurp linux-2.6.21-rc7.0/arch/sparc64/kernel/vmlinux.lds.S linux-2.6.21-rc7.1/arch/sparc64/kernel/vmlinux.lds.S
--- linux-2.6.21-rc7.0/arch/sparc64/kernel/vmlinux.lds.S	2007-05-01 07:32:59.000000000 -0700
+++ linux-2.6.21-rc7.1/arch/sparc64/kernel/vmlinux.lds.S	2007-05-14 12:44:40.000000000 -0700
@@ -90,10 +90,7 @@ SECTIONS
   __initramfs_end = .;
 #endif
 
-  . = ALIGN(8192);
-  __per_cpu_start = .;
-  .data.percpu  : { *(.data.percpu) }
-  __per_cpu_end = .;
+  PERCPU(8192)
   . = ALIGN(8192);
   __init_end = .;
   __bss_start = .;
diff -Nurp linux-2.6.21-rc7.0/arch/x86_64/kernel/vmlinux.lds.S linux-2.6.21-rc7.1/arch/x86_64/kernel/vmlinux.lds.S
--- linux-2.6.21-rc7.0/arch/x86_64/kernel/vmlinux.lds.S	2007-05-01 07:32:59.000000000 -0700
+++ linux-2.6.21-rc7.1/arch/x86_64/kernel/vmlinux.lds.S	2007-05-14 12:44:40.000000000 -0700
@@ -209,10 +209,8 @@ SECTIONS
   __initramfs_end = .;
 #endif
 
-  . = ALIGN(4096);
-  __per_cpu_start = .;
-  .data.percpu  : AT(ADDR(.data.percpu) - LOAD_OFFSET) { *(.data.percpu) }
-  __per_cpu_end = .;
+  PERCPU(4096)
+
   . = ALIGN(4096);
 
   __init_end = .;
diff -Nurp linux-2.6.21-rc7.0/arch/xtensa/kernel/vmlinux.lds.S linux-2.6.21-rc7.1/arch/xtensa/kernel/vmlinux.lds.S
--- linux-2.6.21-rc7.0/arch/xtensa/kernel/vmlinux.lds.S	2007-05-01 07:32:59.000000000 -0700
+++ linux-2.6.21-rc7.1/arch/xtensa/kernel/vmlinux.lds.S	2007-05-14 12:44:40.000000000 -0700
@@ -198,10 +198,7 @@ SECTIONS
   __ftr_fixup : { *(__ftr_fixup) }
   __stop___ftr_fixup = .;
 
-  . = ALIGN(4096);
-  __per_cpu_start = .;
-  .data.percpu  : { *(.data.percpu) }
-  __per_cpu_end = .;
+  PERCPU(4096)
 
 #ifdef CONFIG_BLK_DEV_INITRD
   . = ALIGN(4096);
diff -Nurp linux-2.6.21-rc7.0/include/asm-generic/percpu.h linux-2.6.21-rc7.1/include/asm-generic/percpu.h
--- linux-2.6.21-rc7.0/include/asm-generic/percpu.h	2007-05-01 07:33:03.000000000 -0700
+++ linux-2.6.21-rc7.1/include/asm-generic/percpu.h	2007-05-14 14:16:15.000000000 -0700
@@ -14,6 +14,11 @@ extern unsigned long __per_cpu_offset[NR
 #define DEFINE_PER_CPU(type, name) \
     __attribute__((__section__(".data.percpu"))) __typeof__(type) per_cpu__##name
 
+#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name)		\
+    __attribute__((__section__(".data.percpu.shared_aligned"))) \
+    __typeof__(type) per_cpu__##name				\
+    ____cacheline_aligned_in_smp
+
 /* var is in discarded region: offset to particular copy we want */
 #define per_cpu(var, cpu) (*({				\
 	extern int simple_identifier_##var(void);	\
@@ -34,6 +39,9 @@ do {								\
 #define DEFINE_PER_CPU(type, name) \
     __typeof__(type) per_cpu__##name
 
+#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name)	\
+    DEFINE_PER_CPU(type, name)
+
 #define per_cpu(var, cpu)			(*((void)(cpu), &per_cpu__##var))
 #define __get_cpu_var(var)			per_cpu__##var
 #define __raw_get_cpu_var(var)			per_cpu__##var
diff -Nurp linux-2.6.21-rc7.0/include/asm-generic/vmlinux.lds.h linux-2.6.21-rc7.1/include/asm-generic/vmlinux.lds.h
--- linux-2.6.21-rc7.0/include/asm-generic/vmlinux.lds.h	2007-05-01 07:33:03.000000000 -0700
+++ linux-2.6.21-rc7.1/include/asm-generic/vmlinux.lds.h	2007-05-14 14:16:38.000000000 -0700
@@ -265,3 +265,13 @@
   	*(.initcall6s.init)						\
   	*(.initcall7.init)						\
   	*(.initcall7s.init)
+
+#define PERCPU(align)							\
+  . = ALIGN(align);							\
+  __per_cpu_start = .;							\
+  .data.percpu  : AT(ADDR(.data.percpu) - LOAD_OFFSET) {		\
+	*(.data.percpu)							\
+	*(.data.percpu.shared_aligned)					\
+  }									\
+  __per_cpu_end = .;
+
diff -Nurp linux-2.6.21-rc7.0/include/asm-i386/percpu.h linux-2.6.21-rc7.1/include/asm-i386/percpu.h
--- linux-2.6.21-rc7.0/include/asm-i386/percpu.h	2007-05-01 07:33:03.000000000 -0700
+++ linux-2.6.21-rc7.1/include/asm-i386/percpu.h	2007-05-14 14:17:16.000000000 -0700
@@ -54,6 +54,11 @@ extern unsigned long __per_cpu_offset[];
 #define DEFINE_PER_CPU(type, name) \
     __attribute__((__section__(".data.percpu"))) __typeof__(type) per_cpu__##name
 
+#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name)		\
+    __attribute__((__section__(".data.percpu.shared_aligned"))) \
+    __typeof__(type) per_cpu__##name				\
+    ____cacheline_aligned_in_smp 
+
 /* We can use this directly for local CPU (faster). */
 DECLARE_PER_CPU(unsigned long, this_cpu_off);
 
diff -Nurp linux-2.6.21-rc7.0/include/asm-ia64/percpu.h linux-2.6.21-rc7.1/include/asm-ia64/percpu.h
--- linux-2.6.21-rc7.0/include/asm-ia64/percpu.h	2007-04-15 16:50:57.000000000 -0700
+++ linux-2.6.21-rc7.1/include/asm-ia64/percpu.h	2007-05-14 14:17:38.000000000 -0700
@@ -29,6 +29,16 @@
 	__attribute__((__section__(".data.percpu")))		\
 	__SMALL_ADDR_AREA __typeof__(type) per_cpu__##name
 
+#ifdef CONFIG_SMP
+#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name)			\
+	__attribute__((__section__(".data.percpu.shared_aligned")))	\
+	__SMALL_ADDR_AREA __typeof__(type) per_cpu__##name		\
+	____cacheline_aligned_in_smp	
+#else
+#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name)	\
+	DEFINE_PER_CPU(type, name)
+#endif
+
 /*
  * Pretty much a literal copy of asm-generic/percpu.h, except that percpu_modcopy() is an
  * external routine, to avoid include-hell.
diff -Nurp linux-2.6.21-rc7.0/include/asm-powerpc/percpu.h linux-2.6.21-rc7.1/include/asm-powerpc/percpu.h
--- linux-2.6.21-rc7.0/include/asm-powerpc/percpu.h	2007-04-15 16:50:57.000000000 -0700
+++ linux-2.6.21-rc7.1/include/asm-powerpc/percpu.h	2007-05-14 14:17:58.000000000 -0700
@@ -20,6 +20,11 @@
 #define DEFINE_PER_CPU(type, name) \
     __attribute__((__section__(".data.percpu"))) __typeof__(type) per_cpu__##name
 
+#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name)		\
+    __attribute__((__section__(".data.percpu.shared_aligned"))) \
+    __typeof__(type) per_cpu__##name				\
+    ____cacheline_aligned_in_smp 
+
 /* var is in discarded region: offset to particular copy we want */
 #define per_cpu(var, cpu) (*RELOC_HIDE(&per_cpu__##var, __per_cpu_offset(cpu)))
 #define __get_cpu_var(var) (*RELOC_HIDE(&per_cpu__##var, __my_cpu_offset()))
@@ -40,6 +45,8 @@ extern void setup_per_cpu_areas(void);
 
 #define DEFINE_PER_CPU(type, name) \
     __typeof__(type) per_cpu__##name
+#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name)	\
+    DEFINE_PER_CPU(type, name)
 
 #define per_cpu(var, cpu)			(*((void)(cpu), &per_cpu__##var))
 #define __get_cpu_var(var)			per_cpu__##var
diff -Nurp linux-2.6.21-rc7.0/include/asm-s390/percpu.h linux-2.6.21-rc7.1/include/asm-s390/percpu.h
--- linux-2.6.21-rc7.0/include/asm-s390/percpu.h	2007-04-15 16:50:57.000000000 -0700
+++ linux-2.6.21-rc7.1/include/asm-s390/percpu.h	2007-05-14 14:18:12.000000000 -0700
@@ -41,6 +41,11 @@ extern unsigned long __per_cpu_offset[NR
     __attribute__((__section__(".data.percpu"))) \
     __typeof__(type) per_cpu__##name
 
+#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name)		\
+    __attribute__((__section__(".data.percpu.shared_aligned"))) \
+    __typeof__(type) per_cpu__##name				\
+    ____cacheline_aligned_in_smp
+
 #define __get_cpu_var(var) __reloc_hide(var,S390_lowcore.percpu_offset)
 #define __raw_get_cpu_var(var) __reloc_hide(var,S390_lowcore.percpu_offset)
 #define per_cpu(var,cpu) __reloc_hide(var,__per_cpu_offset[cpu])
@@ -59,6 +64,8 @@ do {								\
 
 #define DEFINE_PER_CPU(type, name) \
     __typeof__(type) per_cpu__##name
+#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name)	\
+    DEFINE_PER_CPU(type, name)
 
 #define __get_cpu_var(var) __reloc_hide(var,0)
 #define __raw_get_cpu_var(var) __reloc_hide(var,0)
diff -Nurp linux-2.6.21-rc7.0/include/asm-sparc64/percpu.h linux-2.6.21-rc7.1/include/asm-sparc64/percpu.h
--- linux-2.6.21-rc7.0/include/asm-sparc64/percpu.h	2007-05-01 07:33:06.000000000 -0700
+++ linux-2.6.21-rc7.1/include/asm-sparc64/percpu.h	2007-05-14 14:18:27.000000000 -0700
@@ -17,6 +17,11 @@ extern unsigned long __per_cpu_shift;
 #define DEFINE_PER_CPU(type, name) \
     __attribute__((__section__(".data.percpu"))) __typeof__(type) per_cpu__##name
 
+#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name)		\
+    __attribute__((__section__(".data.percpu.shared_aligned"))) \
+    __typeof__(type) per_cpu__##name				\
+    ____cacheline_aligned_in_smp 
+
 register unsigned long __local_per_cpu_offset asm("g5");
 
 /* var is in discarded region: offset to particular copy we want */
@@ -36,6 +41,8 @@ do {								\
 
 #define DEFINE_PER_CPU(type, name) \
     __typeof__(type) per_cpu__##name
+#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name)	\
+    DEFINE_PER_CPU(type, name)
 
 #define per_cpu(var, cpu)			(*((void)cpu, &per_cpu__##var))
 #define __get_cpu_var(var)			per_cpu__##var
diff -Nurp linux-2.6.21-rc7.0/include/asm-x86_64/percpu.h linux-2.6.21-rc7.1/include/asm-x86_64/percpu.h
--- linux-2.6.21-rc7.0/include/asm-x86_64/percpu.h	2007-05-01 07:33:06.000000000 -0700
+++ linux-2.6.21-rc7.1/include/asm-x86_64/percpu.h	2007-05-14 14:19:40.000000000 -0700
@@ -20,6 +20,11 @@
 #define DEFINE_PER_CPU(type, name) \
     __attribute__((__section__(".data.percpu"))) __typeof__(type) per_cpu__##name
 
+#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name)		\
+    __attribute__((__section__(".data.percpu.shared_aligned"))) \
+    __typeof__(type) per_cpu__##name				\
+    ____cacheline_internodealigned_in_smp 
+
 /* var is in discarded region: offset to particular copy we want */
 #define per_cpu(var, cpu) (*({				\
 	extern int simple_identifier_##var(void);	\
@@ -46,6 +51,8 @@ extern void setup_per_cpu_areas(void);
 
 #define DEFINE_PER_CPU(type, name) \
     __typeof__(type) per_cpu__##name
+#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name)	\
+    DEFINE_PER_CPU(type, name)
 
 #define per_cpu(var, cpu)			(*((void)(cpu), &per_cpu__##var))
 #define __get_cpu_var(var)			per_cpu__##var

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 1/2] Define percpu smp cacheline align interface
  2007-05-07 22:58   ` Fenghua Yu
@ 2007-05-15 23:42     ` Andrew Morton
  0 siblings, 0 replies; 20+ messages in thread
From: Andrew Morton @ 2007-05-15 23:42 UTC (permalink / raw)
  To: Fenghua Yu; +Cc: suresh.b.siddha, linux-kernel, clameter

On Mon, 7 May 2007 15:58:41 -0700
Fenghua Yu <fenghua.yu@intel.com> wrote:

> On Fri, May 04, 2007 at 05:12:18PM -0700, Fenghua Yu wrote:
> > 
> > Define percpu smp cacheline align interface
> 
> This is updated patch. Per cpu cacheline aligned data section name and macro name are changed to have better code understanding.
> 
> The patches place all of smp cacheline aligned percpu data into .data.percpu.shared_cacheline_aligned. Other percpu data is still in data.percpu section. The patches can reduce cache line access in SMP and reduce alignment gap waste. The patches also define PERCPU macro for vmlinux.lds.S for concise code.
> 
> PATCH 1/2: Define percpu smp cacheline align interface
> PATCH 2/2: Call percpu smp cacheline algin interface

For some reason arm exploded.

/opt/crosstool/gcc-3.4.5-glibc-2.3.6/arm-unknown-linux-gnu/bin/arm-unknown-linux-gnu-ld:arch/arm/kernel/vmlinux.lds:327: parse error

  __con_initcall_start = .;
   *(.con_initcall.init)
  __con_initcall_end = .;
  __security_initcall_start = .;
   *(.security_initcall.init)
  __security_initcall_end = .;

  . = ALIGN(32);
  __initramfs_start = .;
   usr/built-in.o(.init.ramfs)
  __initramfs_end = .;

  . = ALIGN(4096); __per_cpu_start = .; .data.percpu : AT(ADDR(.data.percpu) - 0) { *(.data.percpu) *(.data.percpu.shared_cacheline_aligned) } __per_cpu_end = .;






 }

Seems I don't have the latest version any more so I'll drop it.

^ permalink raw reply	[flat|nested] 20+ messages in thread

end of thread, other threads:[~2007-05-15 23:45 UTC | newest]

Thread overview: 20+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <33E1C72C74DBE747B7B59C1740F7443701A2F0AB@orsmsx417.amr.corp.intel.com>
2007-05-05  0:12 ` [PATCH 1/2] Define percpu smp cacheline align interface Fenghua Yu
2007-05-07 22:58   ` Fenghua Yu
2007-05-15 23:42     ` Andrew Morton
2007-05-05  0:12 ` [PATCH 0/2] Add percpu smp cacheline align section Fenghua Yu
2007-05-05 16:52   ` Christoph Lameter
2007-05-07 17:11     ` Siddha, Suresh B
2007-05-07 17:30       ` Christoph Lameter
2007-05-07 17:46         ` Siddha, Suresh B
2007-05-07 18:13           ` Yu, Fenghua
2007-05-15  0:12           ` [PATCH 2/2] Use the new percpu interface for shared data -- version 2 Fenghua Yu
     [not found]           ` <20070515001255.GA27978@linux-os.sc.intel.com>
2007-05-15  0:22             ` [PATCH 1/2] Define " Fenghua Yu
2007-05-05  0:12 ` [PATCH 2/2] Call percpu smp cacheline algin interface Fenghua Yu
2007-05-07 22:59   ` Fenghua Yu
2007-05-09 20:33     ` Andrew Morton
2007-05-09 22:16       ` Yu, Fenghua
2007-05-09 22:53         ` Andrew Morton
2007-05-09 22:56           ` Christoph Lameter
2007-05-09 23:06             ` Siddha, Suresh B
2007-05-09 23:10             ` Yu, Fenghua
2007-05-09 23:36               ` Andrew Morton

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.