* [RFC/PATCH] Make powerpc64 use __thread for per-cpu variables
From: Paul Mackerras @ 2006-05-10  4:03 UTC
  To: linux-kernel; +Cc: linux-arch, linuxppc-dev

With this patch, 64-bit powerpc uses __thread for per-cpu variables.

The motivation for doing this is that getting the address of a per-cpu
variable currently requires two loads (one to get our per-cpu offset
and one to get the address of the variable in the .data.percpu
section) plus an add.  With __thread we can get the address of our
copy of a per-cpu variable with just an add (r13 plus a constant).
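
(A sketch, for illustration only and not part of the patch, of what the
two access sequences reduce to; "foo" is a hypothetical per-cpu
variable, and the inline asm follows the style of the per_cpu() macro
in the patch below.)

	/* Old scheme: load our per-cpu offset from the paca, load the
	 * address of foo's master copy in .data.percpu (a TOC load),
	 * then add. */
	char *addr = (char *)&per_cpu__foo + get_paca()->data_offset;

	/* New scheme: a single add off r13, fixed up at link time by
	 * an @tprel relocation. */
	register char *__tp asm("r13");
	char *addr2;
	asm("addi %0,%1,per_cpu__foo@tprel" : "=b" (addr2) : "b" (__tp));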

This means that r13 now has to hold the per-cpu base address + 0x7000
(the 0x7000 is to allow us to address 60k of per-cpu data with a
16-bit signed offset, and is dictated by the toolchain).  In
particular, that means that r13 can't hold the pointer to the
paca.  Instead we can get the paca pointer from the SPRG3 register.
We use r13 for the paca pointer in the early exception entry code,
and load the thread pointer into r13 before calling C code.

With this there is an incentive to move things that are currently
stored in the paca into per-cpu variables, and eventually to get rid
of the paca altogether.  I'll address that in future patches.
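
(As a hypothetical sketch of such a move, using the macros from this
patch; the field name is made up:)

	/* was: a field in struct paca_struct, written via get_paca() */
	DEFINE_PER_CPU(u64, saved_something);

	static inline void save_something(u64 val)
	{
		/* one store off r13:
		 * std rN,per_cpu__saved_something@tprel(13) */
		__get_cpu_var(saved_something) = val;
	}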

Signed-off-by: Paul Mackerras <paulus@samba.org>
---
diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile
index ed5b26a..95a7480 100644
--- a/arch/powerpc/Makefile
+++ b/arch/powerpc/Makefile
@@ -58,12 +58,13 @@ override LD	+= -m elf$(SZ)ppc
 override CC	+= -m$(SZ)
 endif
 
-LDFLAGS_vmlinux	:= -Bstatic
+LDFLAGS_vmlinux	:= -Bstatic --no-tls-optimize
 
 # The -Iarch/$(ARCH)/include is temporary while we are merging
 CPPFLAGS-$(CONFIG_PPC32) := -Iarch/$(ARCH) -Iarch/$(ARCH)/include
 AFLAGS-$(CONFIG_PPC32)	:= -Iarch/$(ARCH)
-CFLAGS-$(CONFIG_PPC64)	:= -mminimal-toc -mtraceback=none  -mcall-aixdesc
+CFLAGS-$(CONFIG_PPC64)	:= -mminimal-toc -mtraceback=none -mcall-aixdesc \
+			   -ftls-model=local-exec -mtls-size=16
 CFLAGS-$(CONFIG_PPC32)	:= -Iarch/$(ARCH) -ffixed-r2 -mmultiple
 CPPFLAGS	+= $(CPPFLAGS-y)
 AFLAGS		+= $(AFLAGS-y)
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 8f85c5e..1cd54a6 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -112,6 +112,7 @@ #ifdef CONFIG_PPC64
 	DEFINE(PACAPROCSTART, offsetof(struct paca_struct, cpu_start));
 	DEFINE(PACAKSAVE, offsetof(struct paca_struct, kstack));
 	DEFINE(PACACURRENT, offsetof(struct paca_struct, __current));
+	DEFINE(PACATHREADPTR, offsetof(struct paca_struct, thread_ptr));
 	DEFINE(PACASAVEDMSR, offsetof(struct paca_struct, saved_msr));
 	DEFINE(PACASTABREAL, offsetof(struct paca_struct, stab_real));
 	DEFINE(PACASTABVIRT, offsetof(struct paca_struct, stab_addr));
diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index 19ad5c6..455443e 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -92,14 +92,15 @@ system_call_common:
 	ld	r11,exception_marker@toc(r2)
 	std	r11,-16(r9)		/* "regshere" marker */
 #ifdef CONFIG_PPC_ISERIES
+	lbz	r10,PACAPROCENABLED(r13)
+	std	r10,SOFTE(r1)
 	/* Hack for handling interrupts when soft-enabling on iSeries */
 	cmpdi	cr1,r0,0x5555		/* syscall 0x5555 */
 	andi.	r10,r12,MSR_PR		/* from kernel */
 	crand	4*cr0+eq,4*cr1+eq,4*cr0+eq
 	beq	hardware_interrupt_entry
-	lbz	r10,PACAPROCENABLED(r13)
-	std	r10,SOFTE(r1)
 #endif
+	ld	r13,PACATHREADPTR(r13)
 	mfmsr	r11
 	ori	r11,r11,MSR_EE
 	mtmsrd	r11,1
@@ -170,6 +171,7 @@ syscall_error_cont:
 	andi.	r6,r8,MSR_PR
 	ld	r4,_LINK(r1)
 	beq-	1f
+	mfspr	r13,SPRN_SPRG3
 	ACCOUNT_CPU_USER_EXIT(r11, r12)
 	ld	r13,GPR13(r1)	/* only restore r13 if returning to usermode */
 1:	ld	r2,GPR2(r1)
@@ -361,7 +363,8 @@ #ifdef CONFIG_SMP
 #endif /* CONFIG_SMP */
 
 	addi	r6,r4,-THREAD	/* Convert THREAD to 'current' */
-	std	r6,PACACURRENT(r13)	/* Set new 'current' */
+	mfspr	r10,SPRN_SPRG3
+	std	r6,PACACURRENT(r10)	/* Set new 'current' */
 
 	ld	r8,KSP(r4)	/* new stack pointer */
 BEGIN_FTR_SECTION
@@ -390,7 +393,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_SLB)
 	addi	r7,r7,THREAD_SIZE-SWITCH_FRAME_SIZE
 
 	mr	r1,r8		/* start using new stack pointer */
-	std	r7,PACAKSAVE(r13)
+	std	r7,PACAKSAVE(r10)
 
 	ld	r6,_CCR(r1)
 	mtcrf	0xFF,r6
@@ -457,22 +460,23 @@ restore:
 #ifdef CONFIG_PPC_ISERIES
 	ld	r5,SOFTE(r1)
 	cmpdi	0,r5,0
+	mfspr	r11,SPRN_SPRG3
 	beq	4f
 	/* Check for pending interrupts (iSeries) */
-	ld	r3,PACALPPACAPTR(r13)
+	ld	r3,PACALPPACAPTR(r11)
 	ld	r3,LPPACAANYINT(r3)
 	cmpdi	r3,0
 	beq+	4f			/* skip do_IRQ if no interrupts */
 
 	li	r3,0
-	stb	r3,PACAPROCENABLED(r13)	/* ensure we are soft-disabled */
+	stb	r3,PACAPROCENABLED(r11)	/* ensure we are soft-disabled */
 	ori	r10,r10,MSR_EE
 	mtmsrd	r10			/* hard-enable again */
 	addi	r3,r1,STACK_FRAME_OVERHEAD
 	bl	.do_IRQ
 	b	.ret_from_except_lite		/* loop back and handle more */
 
-4:	stb	r5,PACAPROCENABLED(r13)
+4:	stb	r5,PACAPROCENABLED(r11)
 #endif
 
 	ld	r3,_MSR(r1)
@@ -486,6 +490,7 @@ #endif
 	 * userspace
 	 */
 	beq	1f
+	mfspr	r13,SPRN_SPRG3
 	ACCOUNT_CPU_USER_EXIT(r3, r4)
 	REST_GPR(13, r1)
 1:
@@ -541,8 +546,9 @@ #endif
 	/* here we are preempting the current task */
 1:
 #ifdef CONFIG_PPC_ISERIES
+	mfspr	r11,SPRN_SPRG3
 	li	r0,1
-	stb	r0,PACAPROCENABLED(r13)
+	stb	r0,PACAPROCENABLED(r11)
 #endif
 	ori	r10,r10,MSR_EE
 	mtmsrd	r10,1		/* reenable interrupts */
@@ -641,8 +647,9 @@ _GLOBAL(enter_rtas)
 	 * so they are saved in the PACA which allows us to restore
 	 * our original state after RTAS returns.
          */
-	std	r1,PACAR1(r13)
-        std	r6,PACASAVEDMSR(r13)
+	mfspr	r5,SPRN_SPRG3
+	std	r1,PACAR1(r5)
+	std	r6,PACASAVEDMSR(r5)
 
 	/* Setup our real return addr */	
 	LOAD_REG_ADDR(r4,.rtas_return_loc)
@@ -698,6 +705,7 @@ _STATIC(rtas_restore_regs)
 	REST_10GPRS(22, r1)		/* ditto */
 
 	mfspr	r13,SPRN_SPRG3
+	ld	r13,PACATHREADPTR(r13)
 
 	ld	r4,_CCR(r1)
 	mtcr	r4
diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S
index b7d1404..80d95b4 100644
--- a/arch/powerpc/kernel/head_64.S
+++ b/arch/powerpc/kernel/head_64.S
@@ -298,6 +298,7 @@ #define EXCEPTION_PROLOG_COMMON(n, area)
 	std	r10,_CTR(r1);						   \
 	mfspr	r11,SPRN_XER;		/* save XER in stackframe	*/ \
 	std	r11,_XER(r1);						   \
+	SAVE_INT_ENABLE(r10);		/* save soft irq disable state	*/ \
 	li	r9,(n)+1;						   \
 	std	r9,_TRAP(r1);		/* set trap number		*/ \
 	li	r10,0;							   \
@@ -338,27 +339,27 @@ label##_iSeries:							\
 	b	label##_common;						\
 
 #ifdef DO_SOFT_DISABLE
+#define SAVE_INT_ENABLE(rn)			\
+	lbz	rn,PACAPROCENABLED(r13);	\
+	std	rn,SOFTE(r1)
+
 #define DISABLE_INTS				\
-	lbz	r10,PACAPROCENABLED(r13);	\
 	li	r11,0;				\
-	std	r10,SOFTE(r1);			\
 	mfmsr	r10;				\
 	stb	r11,PACAPROCENABLED(r13);	\
 	ori	r10,r10,MSR_EE;			\
 	mtmsrd	r10,1
 
 #define ENABLE_INTS				\
-	lbz	r10,PACAPROCENABLED(r13);	\
 	mfmsr	r11;				\
-	std	r10,SOFTE(r1);			\
 	ori	r11,r11,MSR_EE;			\
 	mtmsrd	r11,1
 
 #else	/* hard enable/disable interrupts */
+#define SAVE_INT_ENABLE(rn)
 #define DISABLE_INTS
 
 #define ENABLE_INTS				\
-	ld	r12,_MSR(r1);			\
 	mfmsr	r11;				\
 	rlwimi	r11,r12,0,MSR_EE;		\
 	mtmsrd	r11,1
@@ -371,6 +372,7 @@ #define STD_EXCEPTION_COMMON(trap, label
 label##_common:						\
 	EXCEPTION_PROLOG_COMMON(trap, PACA_EXGEN);	\
 	DISABLE_INTS;					\
+	ld	r13,PACATHREADPTR(r13);		\
 	bl	.save_nvgprs;				\
 	addi	r3,r1,STACK_FRAME_OVERHEAD;		\
 	bl	hdlr;					\
@@ -387,6 +389,7 @@ label##_common:						\
 	EXCEPTION_PROLOG_COMMON(trap, PACA_EXGEN);	\
 	FINISH_NAP;					\
 	DISABLE_INTS;					\
+	ld	r13,PACATHREADPTR(r13);		\
 	bl	.save_nvgprs;				\
 	addi	r3,r1,STACK_FRAME_OVERHEAD;		\
 	bl	hdlr;					\
@@ -399,6 +402,7 @@ label##_common:						\
 	EXCEPTION_PROLOG_COMMON(trap, PACA_EXGEN);	\
 	FINISH_NAP;					\
 	DISABLE_INTS;					\
+	ld	r13,PACATHREADPTR(r13);		\
 	bl	.ppc64_runlatch_on;			\
 	addi	r3,r1,STACK_FRAME_OVERHEAD;		\
 	bl	hdlr;					\
@@ -810,6 +814,7 @@ machine_check_common:
 	EXCEPTION_PROLOG_COMMON(0x200, PACA_EXMC)
 	FINISH_NAP
 	DISABLE_INTS
+	ld	r13,PACATHREADPTR(r13)
 	bl	.save_nvgprs
 	addi	r3,r1,STACK_FRAME_OVERHEAD
 	bl	.machine_check_exception
@@ -864,6 +869,7 @@ bad_stack:
 	li	r12,0
 	std	r12,0(r11)
 	ld	r2,PACATOC(r13)
+	ld	r13,PACATHREADPTR(r13)
 1:	addi	r3,r1,STACK_FRAME_OVERHEAD
 	bl	.kernel_bad_stack
 	b	1b
@@ -886,6 +892,7 @@ fast_exception_return:
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING
 	andi.	r3,r12,MSR_PR
 	beq	2f
+	mfspr	r13,SPRN_SPRG3
 	ACCOUNT_CPU_USER_EXIT(r3, r4)
 2:
 #endif
@@ -913,6 +920,8 @@ #endif
 	b	.	/* prevent speculative execution */
 
 unrecov_fer:
+	mfspr	r13,SPRN_SPRG3
+	ld	r13,PACATHREADPTR(r13)
 	bl	.save_nvgprs
 1:	addi	r3,r1,STACK_FRAME_OVERHEAD
 	bl	.unrecoverable_exception
@@ -933,16 +942,20 @@ data_access_common:
 	EXCEPTION_PROLOG_COMMON(0x300, PACA_EXGEN)
 	ld	r3,PACA_EXGEN+EX_DAR(r13)
 	lwz	r4,PACA_EXGEN+EX_DSISR(r13)
+	DISABLE_INTS
 	li	r5,0x300
+	ld	r13,PACATHREADPTR(r13)
 	b	.do_hash_page	 	/* Try to handle as hpte fault */
 
 	.align	7
 	.globl instruction_access_common
 instruction_access_common:
 	EXCEPTION_PROLOG_COMMON(0x400, PACA_EXGEN)
+	DISABLE_INTS
 	ld	r3,_NIP(r1)
 	andis.	r4,r12,0x5820
 	li	r5,0x400
+	ld	r13,PACATHREADPTR(r13)
 	b	.do_hash_page		/* Try to handle as hpte fault */
 
 /*
@@ -958,7 +971,7 @@ slb_miss_user_common:
 	stw	r9,PACA_EXGEN+EX_CCR(r13)
 	std	r10,PACA_EXGEN+EX_LR(r13)
 	std	r11,PACA_EXGEN+EX_SRR0(r13)
-	bl	.slb_allocate_user
+	bl	..slb_allocate_user
 
 	ld	r10,PACA_EXGEN+EX_LR(r13)
 	ld	r3,PACA_EXGEN+EX_R3(r13)
@@ -996,11 +1009,14 @@ slb_miss_fault:
 	li	r5,0
 	std	r4,_DAR(r1)
 	std	r5,_DSISR(r1)
+	ld	r13,PACATHREADPTR(r13)
+	ENABLE_INTS
 	b	.handle_page_fault
 
 unrecov_user_slb:
 	EXCEPTION_PROLOG_COMMON(0x4200, PACA_EXGEN)
 	DISABLE_INTS
+	ld	r13,PACATHREADPTR(r13)
 	bl	.save_nvgprs
 1:	addi	r3,r1,STACK_FRAME_OVERHEAD
 	bl	.unrecoverable_exception
@@ -1023,7 +1039,7 @@ _GLOBAL(slb_miss_realmode)
 	stw	r9,PACA_EXSLB+EX_CCR(r13)	/* save CR in exc. frame */
 	std	r10,PACA_EXSLB+EX_LR(r13)	/* save LR */
 
-	bl	.slb_allocate_realmode
+	bl	..slb_allocate_realmode
 
 	/* All done -- return from exception. */
 
@@ -1061,6 +1077,7 @@ #endif /* CONFIG_PPC_ISERIES */
 unrecov_slb:
 	EXCEPTION_PROLOG_COMMON(0x4100, PACA_EXSLB)
 	DISABLE_INTS
+	ld	r13,PACATHREADPTR(r13)
 	bl	.save_nvgprs
 1:	addi	r3,r1,STACK_FRAME_OVERHEAD
 	bl	.unrecoverable_exception
@@ -1074,6 +1091,7 @@ hardware_interrupt_common:
 	FINISH_NAP
 hardware_interrupt_entry:
 	DISABLE_INTS
+	ld	r13,PACATHREADPTR(r13)
 	bl	.ppc64_runlatch_on
 	addi	r3,r1,STACK_FRAME_OVERHEAD
 	bl	.do_IRQ
@@ -1100,9 +1118,10 @@ alignment_common:
 	lwz	r4,PACA_EXGEN+EX_DSISR(r13)
 	std	r3,_DAR(r1)
 	std	r4,_DSISR(r1)
+	ld	r13,PACATHREADPTR(r13)
+	ENABLE_INTS
 	bl	.save_nvgprs
 	addi	r3,r1,STACK_FRAME_OVERHEAD
-	ENABLE_INTS
 	bl	.alignment_exception
 	b	.ret_from_except
 
@@ -1110,9 +1129,10 @@ alignment_common:
 	.globl program_check_common
 program_check_common:
 	EXCEPTION_PROLOG_COMMON(0x700, PACA_EXGEN)
+	ld	r13,PACATHREADPTR(r13)
+	ENABLE_INTS
 	bl	.save_nvgprs
 	addi	r3,r1,STACK_FRAME_OVERHEAD
-	ENABLE_INTS
 	bl	.program_check_exception
 	b	.ret_from_except
 
@@ -1121,9 +1141,10 @@ program_check_common:
 fp_unavailable_common:
 	EXCEPTION_PROLOG_COMMON(0x800, PACA_EXGEN)
 	bne	.load_up_fpu		/* if from user, just load it up */
+	ld	r13,PACATHREADPTR(r13)
+	ENABLE_INTS
 	bl	.save_nvgprs
 	addi	r3,r1,STACK_FRAME_OVERHEAD
-	ENABLE_INTS
 	bl	.kernel_fp_unavailable_exception
 	BUG_OPCODE
 
@@ -1136,9 +1157,10 @@ BEGIN_FTR_SECTION
 	bne	.load_up_altivec	/* if from user, just load it up */
 END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
 #endif
+	ld	r13,PACATHREADPTR(r13)
+	ENABLE_INTS
 	bl	.save_nvgprs
 	addi	r3,r1,STACK_FRAME_OVERHEAD
-	ENABLE_INTS
 	bl	.altivec_unavailable_exception
 	b	.ret_from_except
 
@@ -1242,13 +1264,6 @@ END_FTR_SECTION_IFCLR(CPU_FTR_SLB)
 	rlwimi	r4,r0,32-13,30,30	/* becomes _PAGE_USER access bit */
 	ori	r4,r4,1			/* add _PAGE_PRESENT */
 	rlwimi	r4,r5,22+2,31-2,31-2	/* Set _PAGE_EXEC if trap is 0x400 */
-
-	/*
-	 * On iSeries, we soft-disable interrupts here, then
-	 * hard-enable interrupts so that the hash_page code can spin on
-	 * the hash_table_lock without problems on a shared processor.
-	 */
-	DISABLE_INTS
 
 	/*
 	 * r3 contains the faulting address
@@ -1258,6 +1273,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_SLB)
 	 * at return r3 = 0 for success
 	 */
 	bl	.hash_page		/* build HPTE if possible */
+11:					/* re-enter here from do_ste_alloc */
 	cmpdi	r3,0			/* see if hash_page succeeded */
 
 #ifdef DO_SOFT_DISABLE
@@ -1280,18 +1296,18 @@ #ifdef DO_SOFT_DISABLE
 	 */
 	ld	r3,SOFTE(r1)
 	bl	.local_irq_restore
-	b	11f
 #else
 	beq	fast_exception_return   /* Return from exception on success */
 	ble-	12f			/* Failure return from hash_page */
 
-	/* fall through */
+	ld	r12,_MSR(r1)		/* Reenable interrupts if they */
+	ENABLE_INTS			/* were enabled when trap occurred */
 #endif
+	/* fall through */
 
 /* Here we have a page fault that hash_page can't handle. */
 _GLOBAL(handle_page_fault)
-	ENABLE_INTS
-11:	ld	r4,_DAR(r1)
+	ld	r4,_DAR(r1)
 	ld	r5,_DSISR(r1)
 	addi	r3,r1,STACK_FRAME_OVERHEAD
 	bl	.do_page_fault
@@ -1316,9 +1332,7 @@ _GLOBAL(handle_page_fault)
 	/* here we have a segment miss */
 _GLOBAL(do_ste_alloc)
 	bl	.ste_allocate		/* try to insert stab entry */
-	cmpdi	r3,0
-	beq+	fast_exception_return
-	b	.handle_page_fault
+	b	11b
 
 /*
  * r13 points to the PACA, r9 contains the saved CR,
@@ -1796,6 +1810,9 @@ _GLOBAL(__secondary_start)
 	/* Clear backchain so we get nice backtraces */
 	li	r7,0
 	mtlr	r7
+
+	/* load per-cpu data area pointer */
+	ld	r13,PACATHREADPTR(r13)
 
 	/* enable MMU and jump to start_secondary */
 	LOAD_REG_ADDR(r3, .start_secondary_prolog)
@@ -1808,9 +1825,11 @@ #endif
 	rfid
 	b	.	/* prevent speculative execution */
 
-/* 
+/*
  * Running with relocation on at this point.  All we want to do is
  * zero the stack back-chain pointer before going into C code.
+ * We can't do this in __secondary_start because the stack isn't
+ * necessarily in the RMA, so it might not be accessible in real mode.
  */
 _GLOBAL(start_secondary_prolog)
 	li	r3,0
diff --git a/arch/powerpc/kernel/misc_64.S b/arch/powerpc/kernel/misc_64.S
index 2778cce..f1899b0 100644
--- a/arch/powerpc/kernel/misc_64.S
+++ b/arch/powerpc/kernel/misc_64.S
@@ -27,6 +27,7 @@ #include <asm/ppc_asm.h>
 #include <asm/asm-offsets.h>
 #include <asm/cputable.h>
 #include <asm/thread_info.h>
+#include <asm/reg.h>
 
 	.text
 
@@ -820,6 +821,7 @@ #ifdef CONFIG_KEXEC
  * join other cpus in kexec_wait(phys_id)
  */
 _GLOBAL(kexec_smp_wait)
+	mfspr	r13,SPRN_SPRG3
 	lhz	r3,PACAHWCPUID(r13)
 	li	r4,-1
 	sth	r4,PACAHWCPUID(r13)	/* let others know we left */
@@ -885,6 +887,7 @@ _GLOBAL(kexec_sequence)
 	mr	r28,r6			/* control, unused */
 	mr	r27,r7			/* clear_all() fn desc */
 	mr	r26,r8			/* spare */
+	mfspr	r13,SPRN_SPRG3
 	lhz	r25,PACAHWCPUID(r13)	/* get our phys cpu from paca */
 
 	/* disable interrupts, we are overwriting kernel data next */
diff --git a/arch/powerpc/kernel/module_64.c b/arch/powerpc/kernel/module_64.c
index ba34001..8140cbe 100644
--- a/arch/powerpc/kernel/module_64.c
+++ b/arch/powerpc/kernel/module_64.c
@@ -357,9 +357,7 @@ int apply_relocate_add(Elf64_Shdr *sechd
 				       me->name, value);
 				return -ENOEXEC;
 			}
-			*((uint16_t *) location)
-				= (*((uint16_t *) location) & ~0xffff)
-				| (value & 0xffff);
+			*(u16 *)location = value;
 			break;
 
 		case R_PPC64_TOC16_DS:
@@ -398,6 +396,32 @@ int apply_relocate_add(Elf64_Shdr *sechd
 			*(uint32_t *)location 
 				= (*(uint32_t *)location & ~0x03fffffc)
 				| (value & 0x03fffffc);
+			break;
+
+		case R_PPC64_TPREL16:
+			if (value > 0xffff) {
+				printk(KERN_ERR "%s: TPREL16 relocation "
+				       "too large (%lu)\n", me->name, value);
+				return -ENOEXEC;
+			}
+			*(u16 *)location = value - 0x8000;
+			break;
+
+		case R_PPC64_TPREL16_LO:
+			*(u16 *)location = PPC_LO(value - 0x8000);
+			break;
+
+		case R_PPC64_TPREL16_LO_DS:
+			*(u16 *)location = ((*(u16 *)location) & ~0xfffc)
+				| ((value - 0x8000) & 0xfffc);
+			break;
+
+		case R_PPC64_TPREL16_HA:
+			*(u16 *)location = PPC_HA(value - 0x8000);
+			break;
+
+		case R_PPC64_TPREL64:
+			*(u64 *)location = value - 0x8000;
 			break;
 
 		default:
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index 4467c49..7fe7c7d 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -605,6 +605,7 @@ void __init setup_per_cpu_areas(void)
 {
 	int i;
 	unsigned long size;
+	unsigned long initsize;
 	char *ptr;
 
 	/* Copy section for each CPU (we discard the original) */
@@ -613,14 +614,19 @@ #ifdef CONFIG_MODULES
 	if (size < PERCPU_ENOUGH_ROOM)
 		size = PERCPU_ENOUGH_ROOM;
 #endif
+	initsize = __end_tdata - __start_tdata;
 
 	for_each_possible_cpu(i) {
 		ptr = alloc_bootmem_node(NODE_DATA(cpu_to_node(i)), size);
 		if (!ptr)
 			panic("Cannot allocate cpu data for CPU %d\n", i);
 
-		paca[i].data_offset = ptr - __per_cpu_start;
-		memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
+		paca[i].thread_ptr = (unsigned long)ptr + 0x7000;
+		memcpy(ptr, __start_tdata, initsize);
+		if (initsize < size)
+			memset(ptr + initsize, 0, size - initsize);
 	}
+	/* Set our percpu area pointer register */
+	asm volatile("mr 13,%0" : : "r" (paca[boot_cpuid].thread_ptr));
 }
 #endif
diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S
index fe79c25..c83ff6a 100644
--- a/arch/powerpc/kernel/vmlinux.lds.S
+++ b/arch/powerpc/kernel/vmlinux.lds.S
@@ -141,11 +141,12 @@ #ifdef CONFIG_PPC32
 #else
 	. = ALIGN(128);
 #endif
-	.data.percpu : {
-		__per_cpu_start = .;
-		*(.data.percpu)
-		__per_cpu_end = .;
-	}
+	__start_tdata = .;
+	.tdata : { *(.tdata .tdata.* .gnu.linkonce.td.*) }
+	__end_tdata = .;
+	.tbss  : { *(.tbss .tbss.* .gnu.linkonce.tb.*) *(.tcommon) }
+	__per_cpu_start = 0x1000;
+	__per_cpu_end = 0x1000 + ALIGN(SIZEOF(.tdata), 128) + SIZEOF(.tbss);
 
 	. = ALIGN(8);
 	.machine.desc : {
diff --git a/arch/powerpc/mm/slb_low.S b/arch/powerpc/mm/slb_low.S
index abfaabf..92f11cd 100644
--- a/arch/powerpc/mm/slb_low.S
+++ b/arch/powerpc/mm/slb_low.S
@@ -23,14 +23,30 @@ #include <asm/page.h>
 #include <asm/mmu.h>
 #include <asm/pgtable.h>
 
-/* void slb_allocate_realmode(unsigned long ea);
+/*
+ * void slb_allocate_realmode(unsigned long ea);
  *
+ * This version is callable from C; the version with two dots at the
+ * start of the name assumes r13 points to the PACA and thus isn't.
+ */
+_GLOBAL(slb_allocate_realmode)
+	mflr	r0
+	std	r0,16(r1)
+	mr	r8,r13
+	mfspr	r13,SPRN_SPRG3
+	bl	..slb_allocate_realmode
+	mr	r13,r8
+	mtlr	r0
+	blr
+
+/*
  * Create an SLB entry for the given EA (user or kernel).
  * 	r3 = faulting address, r13 = PACA
  *	r9, r10, r11 are clobbered by this function
  * No other registers are examined or changed.
  */
-_GLOBAL(slb_allocate_realmode)
+	.globl	..slb_allocate_realmode
+..slb_allocate_realmode:
 	/* r3 = faulting address */
 
 	srdi	r9,r3,60		/* get region */
@@ -121,7 +137,8 @@ #ifdef __DISABLED__
  * It is called with translation enabled in order to be able to walk the
  * page tables. This is not currently used.
  */
-_GLOBAL(slb_allocate_user)
+	.globl	..slb_allocate_user
+..slb_allocate_user:
 	/* r3 = faulting address */
 	srdi	r10,r3,28		/* get esid */
 
diff --git a/arch/powerpc/platforms/iseries/misc.S b/arch/powerpc/platforms/iseries/misc.S
index 7641fc7..d8a3ab5 100644
--- a/arch/powerpc/platforms/iseries/misc.S
+++ b/arch/powerpc/platforms/iseries/misc.S
@@ -21,30 +21,33 @@ #include <asm/ppc_asm.h>
 
 /* unsigned long local_save_flags(void) */
 _GLOBAL(local_get_flags)
-	lbz	r3,PACAPROCENABLED(r13)
+	mfspr	r3,SPRG3
+	lbz	r3,PACAPROCENABLED(r3)
 	blr
 
 /* unsigned long local_irq_disable(void) */
 _GLOBAL(local_irq_disable)
-	lbz	r3,PACAPROCENABLED(r13)
+	mfspr	r5,SPRG3
+	lbz	r3,PACAPROCENABLED(r5)
 	li	r4,0
-	stb	r4,PACAPROCENABLED(r13)
+	stb	r4,PACAPROCENABLED(r5)
 	blr			/* Done */
 
 /* void local_irq_restore(unsigned long flags) */
 _GLOBAL(local_irq_restore)
-	lbz	r5,PACAPROCENABLED(r13)
+	mfspr	r6,SPRG3
+	lbz	r5,PACAPROCENABLED(r6)
 	 /* Check if things are setup the way we want _already_. */
 	cmpw	0,r3,r5
 	beqlr
 	/* are we enabling interrupts? */
 	cmpdi	0,r3,0
-	stb	r3,PACAPROCENABLED(r13)
+	stb	r3,PACAPROCENABLED(r6)
 	beqlr
 	/* Check pending interrupts */
 	/*   A decrementer, IPI or PMC interrupt may have occurred
 	 *   while we were in the hypervisor (which enables) */
-	ld	r4,PACALPPACAPTR(r13)
+	ld	r4,PACALPPACAPTR(r6)
 	ld	r4,LPPACAANYINT(r4)
 	cmpdi	r4,0
 	beqlr
diff --git a/include/asm-powerpc/paca.h b/include/asm-powerpc/paca.h
index 706325f..afbfb5c 100644
--- a/include/asm-powerpc/paca.h
+++ b/include/asm-powerpc/paca.h
@@ -21,8 +21,14 @@ #include	<asm/types.h>
 #include	<asm/lppaca.h>
 #include	<asm/mmu.h>
 
-register struct paca_struct *local_paca asm("r13");
-#define get_paca()	local_paca
+static inline struct paca_struct *get_paca(void)
+{
+	struct paca_struct *p;
+
+	asm volatile("mfsprg3 %0" : "=r" (p));
+	return p;
+}
+
 #define get_lppaca()	(get_paca()->lppaca_ptr)
 
 struct task_struct;
@@ -66,7 +72,7 @@ #endif /* CONFIG_PPC_ISERIES */
 	u64 stab_real;			/* Absolute address of segment table */
 	u64 stab_addr;			/* Virtual address of segment table */
 	void *emergency_sp;		/* pointer to emergency stack */
-	u64 data_offset;		/* per cpu data offset */
+	u64 thread_ptr;			/* per cpu data pointer + 0x7000 */
 	s16 hw_cpu_id;			/* Physical processor number */
 	u8 cpu_start;			/* At startup, processor spins until */
 					/* this becomes non-zero. */
diff --git a/include/asm-powerpc/percpu.h b/include/asm-powerpc/percpu.h
index 5d603ff..dcd9aa0 100644
--- a/include/asm-powerpc/percpu.h
+++ b/include/asm-powerpc/percpu.h
@@ -2,40 +2,76 @@ #ifndef _ASM_POWERPC_PERCPU_H_
 #define _ASM_POWERPC_PERCPU_H_
 #ifdef __powerpc64__
 #include <linux/compiler.h>
-
-/*
- * Same as asm-generic/percpu.h, except that we store the per cpu offset
- * in the paca. Based on the x86-64 implementation.
- */
-
-#ifdef CONFIG_SMP
-
 #include <asm/paca.h>
 
-#define __per_cpu_offset(cpu) (paca[cpu].data_offset)
-#define __my_cpu_offset() get_paca()->data_offset
+#ifdef CONFIG_SMP
 
 /* Separate out the type, so (int[3], foo) works. */
 #define DEFINE_PER_CPU(type, name) \
-    __attribute__((__section__(".data.percpu"))) __typeof__(type) per_cpu__##name
+	__thread __typeof__(type) per_cpu__##name __attribute__((__used__))
+
+#define __get_cpu_var(var)	per_cpu__##var
+#define __raw_get_cpu_var(var)	per_cpu__##var
 
-/* var is in discarded region: offset to particular copy we want */
-#define per_cpu(var, cpu) (*RELOC_HIDE(&per_cpu__##var, __per_cpu_offset(cpu)))
-#define __get_cpu_var(var) (*RELOC_HIDE(&per_cpu__##var, __my_cpu_offset()))
-#define __raw_get_cpu_var(var) (*RELOC_HIDE(&per_cpu__##var, __my_cpu_offset()))
+#define per_cpu(var, cpu)					\
+	(*(__typeof__(&per_cpu__##var))({			\
+		void *__ptr;					\
+		asm("addi %0,%1,per_cpu__"#var"@tprel"		\
+		    : "=b" (__ptr)				\
+		    : "b" (paca[(cpu)].thread_ptr));		\
+		__ptr;						\
+	}))
 
 /* A macro to avoid #include hell... */
-#define percpu_modcopy(pcpudst, src, size, zero_size)		\
-do {								\
-	unsigned int __i;					\
-	BUG_ON(zero_size != 0);					\
-	for_each_possible_cpu(__i)				\
-		memcpy((pcpudst)+__per_cpu_offset(__i),		\
-		       (src), (size));				\
+#define percpu_modcopy(pcpudst, src, size, total_size)			    \
+do {									    \
+	unsigned int __i;						    \
+	extern char __per_cpu_start[];					    \
+	unsigned long offset = (unsigned long)(pcpudst) - 0x8000;	    \
+	for_each_possible_cpu(__i) {					    \
+		memcpy((void *)(offset + paca[__i].thread_ptr),		    \
+		       (src), (size));					    \
+		if ((size) < (total_size))				    \
+			memset((void *)(offset + (size) + paca[__i].thread_ptr), \
+			       0, (total_size) - (size));		    \
+	}								    \
 } while (0)
 
 extern void setup_per_cpu_areas(void);
+
+#define DECLARE_PER_CPU(type, name) \
+	extern __thread __typeof__(type) per_cpu__##name
+
+#ifndef __GENKSYMS__
+#define __EXPORT_PER_CPU_SYMBOL(sym, sec)				\
+	extern __thread typeof(sym) sym;				\
+	__CRC_SYMBOL(sym, sec)						\
+	static const char __kstrtab_##sym[]				\
+	__attribute__((used, section("__ksymtab_strings"))) = #sym;	\
+	asm(".section	__ksymtab"sec",\"aw\",@progbits\n"		\
+	    "	.align 3\n"						\
+	    "	.type	__ksymtab_"#sym", @object\n"			\
+	    "	.size	__ksymtab_"#sym", 16\n"				\
+	    "__ksymtab_"#sym":\n"					\
+	    "	.quad	0x8000+"#sym"@tprel\n"				\
+	    "	.quad	__kstrtab_"#sym)
+
+#define EXPORT_PER_CPU_SYMBOL(var) \
+	__EXPORT_PER_CPU_SYMBOL(per_cpu__##var, "")
+#define EXPORT_PER_CPU_SYMBOL_GPL(var) \
+	__EXPORT_PER_CPU_SYMBOL(per_cpu__##var, "_gpl")
+
+#else
+/* for genksyms's sake... */
+#define __thread
+#define EXPORT_PER_CPU_SYMBOL(var) EXPORT_SYMBOL(per_cpu__##var)
+#define EXPORT_PER_CPU_SYMBOL_GPL(var) EXPORT_SYMBOL_GPL(per_cpu__##var)
+#endif
 
+/* Actual kernel address of .tdata section contents */
+extern char __start_tdata[];
+extern char __end_tdata[];
+
 #else /* ! SMP */
 
 #define DEFINE_PER_CPU(type, name) \
@@ -45,12 +81,12 @@ #define per_cpu(var, cpu)			(*((void)(cp
 #define __get_cpu_var(var)			per_cpu__##var
 #define __raw_get_cpu_var(var)			per_cpu__##var
 
-#endif	/* SMP */
-
 #define DECLARE_PER_CPU(type, name) extern __typeof__(type) per_cpu__##name
 
 #define EXPORT_PER_CPU_SYMBOL(var) EXPORT_SYMBOL(per_cpu__##var)
 #define EXPORT_PER_CPU_SYMBOL_GPL(var) EXPORT_SYMBOL_GPL(per_cpu__##var)
+
+#endif	/* SMP */
 
 #else
 #include <asm-generic/percpu.h>
diff --git a/kernel/printk.c b/kernel/printk.c

* Re: [RFC/PATCH] Make powerpc64 use __thread for per-cpu variables
From: Olof Johansson @ 2006-05-10  5:16 UTC
  To: Paul Mackerras; +Cc: linux-kernel, linux-arch, linuxppc-dev

On Wed, May 10, 2006 at 02:03:59PM +1000, Paul Mackerras wrote:
> With this patch, 64-bit powerpc uses __thread for per-cpu variables.

Nice! I like the way you hid the slb functions so they can't ever be
called by mistake from C code. :-)

This patch grows a ppc64_defconfig vmlinux a bit (with the other two
percpu patches applied):

olof@quad:~/work/linux/powerpc $ ls -l vmlinux.pre vmlinux
-rwxr-xr-x 1 olof olof 10290928 2006-05-09 23:48 vmlinux.pre
-rwxr-xr-x 1 olof olof 10307499 2006-05-09 23:50 vmlinux
olof@quad:~/work/linux/powerpc $ size vmlinux.pre vmlinux
   text    data     bss     dec     hex filename
5554034 2404256  480472 8438762  80c3ea vmlinux.pre
5578866 2384944  498848 8462658  812142 vmlinux

Looks like a lot of the text growth is from the added mfsprg3 instructions:

$ objdump -d vmlinux.pre | egrep mfsprg.\*,3\$ | wc -l
26
$ objdump -d vmlinux | egrep mfsprg.\*,3\$ | wc -l
5134

... so, as the PACA gets deprecated, the bloat will go away again.

> The motivation for doing this is that getting the address of a per-cpu
> variable currently requires two loads (one to get our per-cpu offset
> and one to get the address of the variable in the .data.percpu
> section) plus an add.  With __thread we can get the address of our
> copy of a per-cpu variable with just an add (r13 plus a constant).

It would be interesting to see benchmarks of how much it improves
things. I guess it doesn't really get interesting until after the paca
gets removed though, due to the added mfsprg's.


-Olof

* Re: [RFC/PATCH] Make powerpc64 use __thread for per-cpu variables
From: Alan Modra @ 2006-05-10  5:35 UTC
  To: Olof Johansson; +Cc: Paul Mackerras, linux-arch, linuxppc-dev, linux-kernel

On Wed, May 10, 2006 at 12:16:50AM -0500, Olof Johansson wrote:
> ... so, as the PACA gets deprecated, the bloat will go away again.

We can also lose one instruction per tls access, if I can manage to
teach gcc a trick or two.

-- 
Alan Modra
IBM OzLabs - Linux Technology Centre

* Re: [RFC/PATCH] Make powerpc64 use __thread for per-cpu variables
From: David S. Miller @ 2006-05-10  6:22 UTC
  To: olof; +Cc: paulus, linux-kernel, linux-arch, linuxppc-dev

From: Olof Johansson <olof@lixom.net>
Date: Wed, 10 May 2006 00:16:50 -0500

> It would be interesting to see benchmarks of how much it improves
> things. I guess it doesn't really get interesting until after the paca
> gets removed though, due to the added mfsprg's.

When I moved the per-cpu base into a fixed register on sparc64,
it definitely showed up on the micro-benchmarks because this
shrunk the .text a lot in that case.

* Re: [RFC/PATCH] Make powerpc64 use __thread for per-cpu variables
From: Paul Mackerras @ 2006-05-10  6:29 UTC
  To: Olof Johansson; +Cc: linux-kernel, linux-arch, linuxppc-dev

Olof Johansson writes:

> Looks like a lot of the text growth is from the added mfsprg3 instructions:

Yes, probably mostly from current.

> ... so, as the PACA gets deprecated, the bloat will go away again.

Yes.  I was hoping to get rid of the paca entirely, but that would
mean it would have to be in the RMA (so that the early exception entry
code can use it) which means that it won't be node-local any more.
I have a patch which allocates the per-cpu areas in the RMA but now
I'm rethinking it, since Ben H (at least) thinks the per-cpu area
really needs to be node-local.

Moving current to a per-cpu variable means that we need to allocate at
least the boot cpu's per-cpu area earlier than we do now, since it
seems that printk references current.  That makes it hard to make sure
the boot cpu's per-cpu area is node-local, unless we do something
tricky like reallocating it once the bootmem allocator is available.

> It would be interesting to see benchmarks of how much it improves
> things. I guess it doesn't really get interesting until after the paca
> gets removed though, due to the added mfsprg's.

I have moved current, smp_processor_id and a couple of other things to
per-cpu variables, and that results in the kernel text being about 8k
smaller than without any of these __thread patches.  Performance seems
to be very slightly better but it's hard to be sure that the change is
statistically significant, from the measurements I've done so far.

Paul.

* Re: [RFC/PATCH] Make powerpc64 use __thread for per-cpu variables
From: David S. Miller @ 2006-05-10  6:39 UTC
  To: paulus; +Cc: olof, linux-kernel, linux-arch, linuxppc-dev

From: Paul Mackerras <paulus@samba.org>
Date: Wed, 10 May 2006 16:29:59 +1000

> I have moved current, smp_processor_id and a couple of other things to
> per-cpu variables, and that results in the kernel text being about 8k
> smaller than without any of these __thread patches.  Performance seems
> to be very slightly better but it's hard to be sure that the change is
> statistically significant, from the measurements I've done so far.

That first cache line of current_thread_info() should be so hot that
it's probably just fine to use current_thread_info()->task since
you're just doing a mask on a fixed register (r1) to implement that.

* Re: [RFC/PATCH] Make powerpc64 use __thread for per-cpu variables
From: Benjamin Herrenschmidt @ 2006-05-10  7:21 UTC
  To: David S. Miller; +Cc: paulus, olof, linux-arch, linux-kernel, linuxppc-dev

On Tue, 2006-05-09 at 23:39 -0700, David S. Miller wrote:
> From: Paul Mackerras <paulus@samba.org>
> Date: Wed, 10 May 2006 16:29:59 +1000
> 
> > I have moved current, smp_processor_id and a couple of other things to
> > per-cpu variables, and that results in the kernel text being about 8k
> > smaller than without any of these __thread patches.  Performance seems
> > to be very slightly better but it's hard to be sure that the change is
> > statistically significant, from the measurements I've done so far.
> 
> That first cache line of current_thread_info() should be so hot that
> it's probably just fine to use current_thread_info()->task since
> you're just doing a mask on a fixed register (r1) to implement that.

Iirc, he tried that, though it did bloat the kernel size a bit due to
the number of occurrences of current-> in there. We are now thinking
about either dedicating a register to current (that would avoid the
problem of printk() using it in start_kernel before we get the per-cpu
areas set up) in addition to __thread (heh, we have lots of registers on
ppc :) or maybe putting current back in the paca...

It's a bit sad that we can't get rid of the PACA because it has to be in
the RMA (for those who don't know what it is, the RMA is an area of
memory that is accessible in real mode on LPAR machines; that is, the
hypervisor guarantees a bunch of physically contiguous memory that is
made accessible to the partition for use in real mode). We could have
put the per-cpu info in the RMA but I'm a bit freaked out by the idea
of having it not be node-local...

Ben.



* Re: [RFC/PATCH] Make powerpc64 use __thread for per-cpu variables
From: Paul Mackerras @ 2006-05-10  7:41 UTC
  To: David S. Miller; +Cc: olof, linux-kernel, linux-arch, linuxppc-dev

David S. Miller writes:

> That first cache line of current_thread_info() should be so hot that
> it's probably just fine to use current_thread_info()->task since
> you're just doing a mask on a fixed register (r1) to implement that.

I tried that, but I found that adding 1 instruction to the sequence
for getting current adds about 8k to the kernel text.  Currently we do
it in one instruction; that would be two - the mask and the load.  It
probably doesn't make a measurable difference to performance, but it
doesn't look good.  The number of instructions we lose by using
__thread is much less than the 8k we gain from using
current_thread_info()->task for current.  So I'd prefer to use a
per-cpu variable for current, since we can get to that in 1
instruction.
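
(A sketch of what that one-instruction access could look like under the
local-exec model from the patch; "current_task" is a hypothetical
per-cpu variable, and the displacement is resolved by a @tprel
relocation:)

	static inline struct task_struct *get_current(void)
	{
		struct task_struct *t;
		asm("ld %0,per_cpu__current_task@tprel(13)" : "=r" (t));
		return t;
	}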

Paul.

* Re: [RFC/PATCH] Make powerpc64 use __thread for per-cpu variables
From: David Howells @ 2006-05-10 10:14 UTC
  To: Benjamin Herrenschmidt
  Cc: David S. Miller, olof, linux-arch, paulus, linux-kernel, linuxppc-dev

Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote:

> > That first cache line of current_thread_info() should be so hot that
> > it's probably just fine to use current_thread_info()->task since
> > you're just doing a mask on a fixed register (r1) to implement that.
> 
> Iirc, he tried that, though it did bloat the kernel size a bit due to
> the number of occurrences of current-> in there. We are now thinking
> about either dedicating a register to current (that would avoid the
> problem of printk() using it in start_kernel before we get the per-cpu
> areas set up) in addition to __thread (heh, we have lots of registers on
> ppc :) or maybe putting current back in the paca...

I dedicated registers to current and the current threadinfo in the FRV arch.
As I recall, doing that improved both performance and code size quite a bit.
It also means that I get sensible BUG/panic reports when the stack pointer
overruns the stack space.

David

* Re: [RFC/PATCH] Make powerpc64 use __thread for per-cpu variables
From: Richard Henderson @ 2006-05-10 15:47 UTC
  To: Paul Mackerras; +Cc: linux-kernel, linux-arch, linuxppc-dev

On Wed, May 10, 2006 at 02:03:59PM +1000, Paul Mackerras wrote:
> With this patch, 64-bit powerpc uses __thread for per-cpu variables.

How do you plan to address the compiler optimizing

	__thread int foo;
	{
	  use(foo);
	  schedule();
	  use(foo);
	}

into

	{
	  int *tmp = &foo;	// tls arithmetic here
	  use(*tmp);
	  schedule();
	  use(*tmp);
	}

Across the schedule, we may have changed cpus, making the cached
address invalid.


r~

* Re: [RFC/PATCH] Make powerpc64 use __thread for per-cpu variables
From: Steven Rostedt @ 2006-05-10 18:04 UTC
  To: Richard Henderson
  Cc: Paul Mackerras, t, linux-kernel, linux-arch, linuxppc-dev


On Wed, 10 May 2006, Richard Henderson wrote:

> On Wed, May 10, 2006 at 02:03:59PM +1000, Paul Mackerras wrote:
> > With this patch, 64-bit powerpc uses __thread for per-cpu variables.
>
> How do you plan to address the compiler optimizing
>
> 	__thread int foo;
> 	{
> 	  use(foo);
> 	  schedule();
> 	  use(foo);
> 	}
>
> into
>
> 	{
> 	  int *tmp = &foo;	// tls arithmetic here
> 	  use(*tmp);
> 	  schedule();
> 	  use(*tmp);
> 	}
>
> Across the schedule, we may have changed cpus, making the cached
> address invalid.
>

If you mean use(foo) is the same as per_cpu(foo), I can't see the compiler
optimizing this:

+#define per_cpu(var, cpu)                                      \
+       (*(__typeof__(&per_cpu__##var))({                       \
+               void *__ptr;                                    \
+               asm("addi %0,%1,per_cpu__"#var"@tprel"          \
+                   : "=b" (__ptr)                              \
+                   : "b" (paca[(cpu)].thread_ptr));            \
+               __ptr;                                          \
+       }))


Anyway, per_cpu variables are usually used with preemption turned off,
where there is no need to schedule.
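
(A minimal usage sketch under that assumption; "my_counter" is a
hypothetical per-cpu variable:)

	DEFINE_PER_CPU(int, my_counter);

	preempt_disable();		/* pin to this cpu; compiler barrier */
	__get_cpu_var(my_counter)++;	/* safe: no migration until... */
	preempt_enable();		/* ...here, where we may reschedule */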

-- Steve


* Re: [RFC/PATCH] Make powerpc64 use __thread for per-cpu variables
From: David S. Miller @ 2006-05-10 19:40 UTC
  To: rth; +Cc: paulus, linux-kernel, linux-arch, linuxppc-dev

From: Richard Henderson <rth@twiddle.net>
Date: Wed, 10 May 2006 08:47:13 -0700

> How do you plan to address the compiler optimizing
 ...
> Across the schedule, we may have changed cpus, making the cached
> address invalid.

Per-cpu variables need to be accessed only with preemption
disabled.  And the preemption enable/disable operations
provide a compiler memory barrier.

#define preempt_disable() \
do { \
	inc_preempt_count(); \
	barrier(); \
} while (0)

 ...

#define preempt_enable() \
do { \
	preempt_enable_no_resched(); \
	barrier(); \
	preempt_check_resched(); \
} while (0)

The scheduler itself needs to take care to not cause the situation
you mention either.

Therefore this is an issue we had already, not some new thing
introduced by using __thread for per-cpu variables.

* Re: [RFC/PATCH] Make powerpc64 use __thread for per-cpu variables
From: Paul Mackerras @ 2006-05-10 21:05 UTC
  To: David S. Miller; +Cc: rth, linux-kernel, linux-arch, linuxppc-dev

David S. Miller writes:

> From: Richard Henderson <rth@twiddle.net>
> Date: Wed, 10 May 2006 08:47:13 -0700
> 
> > How do you plan to address the compiler optimizing
>  ...
> > Across the schedule, we may have changed cpus, making the cached
> > address invalid.
> 
> Per-cpu variables need to be accessed only with preemption
> disabled.  And the preemption enable/disable operations
> provide a compiler memory barrier.

No, Richard has a point, it's not the value that is the concern, it's
the address, which gcc could assume is still valid after a barrier.
Drat.

Paul.

* Re: [RFC/PATCH] Make powerpc64 use __thread for per-cpu variables
From: David S. Miller @ 2006-05-10 22:25 UTC
  To: paulus; +Cc: rth, linux-kernel, linux-arch, linuxppc-dev

From: Paul Mackerras <paulus@samba.org>
Date: Thu, 11 May 2006 07:05:24 +1000

> No, Richard has a point, it's not the value that is the concern, it's
> the address, which gcc could assume is still valid after a barrier.
> Drat.

Oh right, and that's currently part of why we obfuscate the
address computation with the RELOC_HIDE() business.

Once we expose what's really going on with something like
__thread, gcc can now be "smart" about it.

* Re: [RFC/PATCH] Make powerpc64 use __thread for per-cpu variables
From: Paul Mackerras @ 2006-05-10 23:05 UTC
  To: Richard Henderson; +Cc: t, linux-kernel, linux-arch, linuxppc-dev

Richard Henderson writes:

> How do you plan to address the compiler optimizing
> 
> 	__thread int foo;
> 	{
> 	  use(foo);
> 	  schedule();
> 	  use(foo);
> 	}
> 
> into
> 
> 	{
> 	  int *tmp = &foo;	// tls arithmetic here
> 	  use(*tmp);
> 	  schedule();
> 	  use(*tmp);
> 	}

Hmmm...  Would it be sufficient to use a RELOC_HIDE in __get_cpu_var,
like this?

#define __get_cpu_var(x)	(*(RELOC_HIDE(&per_cpu__##x, 0)))
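
(For reference, RELOC_HIDE in include/linux/compiler-gcc.h of this
vintage is roughly the following; the empty asm hides the pointer's
symbolic origin from the optimizer:)

	#define RELOC_HIDE(ptr, off)					\
	  ({ unsigned long __ptr;					\
	     __asm__ ("" : "=r"(__ptr) : "0"(ptr));			\
	     (typeof(ptr)) (__ptr + (off)); })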

Paul.

* Re: [RFC/PATCH] Make powerpc64 use __thread for per-cpu variables
From: Segher Boessenkool @ 2006-05-10 23:17 UTC
  To: Paul Mackerras
  Cc: David S. Miller, linux-arch, linuxppc-dev, linux-kernel, rth

>>> How do you plan to address the compiler optimizing
>>  ...
>>> Across the schedule, we may have changed cpus, making the cached
>>> address invalid.
>>
>> Per-cpu variables need to be accessed only with preemption
>> disabled.  And the preemption enable/disable operations
>> provide a compiler memory barrier.
>
> No, Richard has a point, it's not the value that is the concern, it's
> the address, which gcc could assume is still valid after a barrier.
> Drat.

Would an asm clobber of GPR13 in the schedule routines (or a wrapper
for them, or whatever) work?


Segher


* Re: [RFC/PATCH] Make powerpc64 use __thread for per-cpu variables
From: Paul Mackerras @ 2006-05-10 23:44 UTC
  To: Richard Henderson, t, linux-kernel, linux-arch, linuxppc-dev, amodra

I wrote:

> Hmmm...  Would it be sufficient to use a RELOC_HIDE in __get_cpu_var,
> like this?
> 
> #define __get_cpu_var(x)	(*(RELOC_HIDE(&per_cpu__##x, 0)))

But that won't work because the compiler can still cache &per_cpu__x.
I guess I have to do this:

#define __get_cpu_var(var)					\
	(*(__typeof__(&per_cpu__##var))({			\
		void *__ptr;					\
		asm("addi %0,13,per_cpu__"#var"@tprel"		\
		    : "=r" (__ptr));				\
		__ptr;						\
	}))

That means we lose the possible optimization of combining the addi
into a following load or store.  Bah.  However, I guess it's still
better than what we do at the moment.

Paul.

* Re: [RFC/PATCH] Make powerpc64 use __thread for per-cpu variables
From: David S. Miller @ 2006-05-11  0:11 UTC
  To: paulus; +Cc: rth, t, linux-kernel, linux-arch, linuxppc-dev, amodra

From: Paul Mackerras <paulus@samba.org>
Date: Thu, 11 May 2006 09:44:32 +1000

> That means we lose the possible optimization of combining the addi
> into a following load or store.  Bah.  However, I guess it's still
> better than what we do at the moment.

If you have to hide the operation so deeply like this, maybe you can
do something similar to sparc64, by explicitly doing the per-cpu fixed
register and offsets, and still get the single instruction relocs that
powerpc can do for up to 64K by doing something like:

	&per_cpu_blah - &per_cpu_base

to calculate the offset.
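
(A rough sketch of that scheme, assuming a __per_cpu_base symbol at the
start of the per-cpu section and r13 reserved as the base register;
whether gcc will fold the symbol difference into a single 16-bit reloc
is exactly the question Paul raises below:)

	extern char __per_cpu_base[];		/* assumed section-start symbol */
	register char *__pcpu asm("r13");	/* reserved base register */

	#define __get_cpu_var(var)					\
		(*(__typeof__(&per_cpu__##var))				\
		 (__pcpu + ((char *)&per_cpu__##var - __per_cpu_base)))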

* Re: [RFC/PATCH] Make powerpc64 use __thread for per-cpu variables
From: Richard Henderson @ 2006-05-11  0:22 UTC
  To: Segher Boessenkool
  Cc: Paul Mackerras, David S. Miller, linux-arch, linuxppc-dev, linux-kernel

On Thu, May 11, 2006 at 01:17:50AM +0200, Segher Boessenkool wrote:
> Would an asm clobber of GPR13 in the schedule routines (or a wrapper
> for them, or whatever) work?

No.  The address is cse'd symbolically long before the r13
reference is exposed.


r~

* Re: [RFC/PATCH] Make powerpc64 use __thread for per-cpu variables
From: Alan Modra @ 2006-05-11  1:04 UTC
  To: Paul Mackerras
  Cc: David S. Miller, linux-arch, linuxppc-dev, linux-kernel, rth

On Thu, May 11, 2006 at 07:05:24AM +1000, Paul Mackerras wrote:
> No, Richard has a point, it's not the value that is the concern, it's
> the address, which gcc could assume is still valid after a barrier.
> Drat.

That may never happen, at least with a compiler that knows how to
optimise away the addi.  You're using -mtls-size=16 so all your accesses
should look like

	lwz rn,per_cpu_var@tprel(13)

gcc shouldn't think there is any reason to cache the address.

-- 
Alan Modra
IBM OzLabs - Linux Technology Centre

* Re: [RFC/PATCH] Make powerpc64 use __thread for per-cpu variables
From: Paul Mackerras @ 2006-05-11  1:21 UTC
  To: Alan Modra; +Cc: David S. Miller, linux-arch, linuxppc-dev, linux-kernel, rth

Alan Modra writes:

> gcc shouldn't think there is any reason to cache the address.

Can I rely on that being true in the future?

Paul.

* Re: [RFC/PATCH] Make powerpc64 use __thread for per-cpu variables
From: Alan Modra @ 2006-05-11  2:01 UTC
  To: Paul Mackerras
  Cc: David S. Miller, linux-arch, linuxppc-dev, linux-kernel, rth

On Thu, May 11, 2006 at 11:21:15AM +1000, Paul Mackerras wrote:
> Alan Modra writes:
> 
> > gcc shouldn't think there is any reason to cache the address.
> 
> Can I rely on that being true in the future?

It isn't true in the *present*, except with a compiler on my home
machine.  :-) 

__thread int i1;
void
f3 (void)
{
  int x = i1;
  __asm__ __volatile__ ("#dragons be here.  %0" : "+r" (x));
  i1 = x;
}

current mainline with -O2 -S -mtls-size=16

f3:
        addi 9,2,i1@tprel
        lwz 0,0(9)
#APP
        #dragons be here.  0
#NO_APP
        stw 0,0(9)
        blr

Same thing with my modified compiler.

f3:
        lwz 0,i1@tprel(2)
#APP
        #dragons be here.  0
#NO_APP
        stw 0,i1@tprel(2)
        blr

-- 
Alan Modra
IBM OzLabs - Linux Technology Centre

* Re: [RFC/PATCH] Make powerpc64 use __thread for per-cpu variables
From: Segher Boessenkool @ 2006-05-11 23:41 UTC
  To: Richard Henderson
  Cc: Paul Mackerras, David S. Miller, linux-arch, linuxppc-dev, linux-kernel

>> Would an asm clobber of GPR13 in the schedule routines (or a wrapper
>> for them, or whatever) work?
>
> No.  The address is cse'd symbolically long before the r13
> reference is exposed.

Current GCC won't ever do that over a (non-local, non-inlinable)
function call though.  _Current_ GCC.


Segher


* Re: [RFC/PATCH] Make powerpc64 use __thread for per-cpu variables
From: Segher Boessenkool @ 2006-05-11 23:42 UTC
  To: Paul Mackerras
  Cc: Alan Modra, David S. Miller, linux-arch, linuxppc-dev, linux-kernel, rth

>> gcc shouldn't think there is any reason to cache the address.
>
> Can I rely on that being true in the future?

As long as the compiler stays smart enough, and doesn't do
stupid things :-)

(i.e., no.  Sigh).


Segher


* Re: [RFC/PATCH] Make powerpc64 use __thread for per-cpu variables
From: Paul Mackerras @ 2006-05-18 23:50 UTC
  To: David S. Miller; +Cc: rth, linux-kernel, linux-arch, linuxppc-dev, amodra

David S. Miller writes:

> If you have to hide the operation so deeply like this, maybe you can
> do something similar to sparc64, by explicitly doing the per-cpu fixed
> register and offsets, and still get the single instruction relocs that
> powerpc can do for up to 64K by doing something like:
> 
> 	&per_cpu_blah - &per_cpu_base
> 
> to calculate the offset.

I don't know how to tell gcc that (&per_cpu_blah - &per_cpu_base) is a
quantity that the linker can compute and that will fit into a 16-bit
offset.  If I use an inline asm, then I have to generate the address
and let gcc dereference it, because __get_cpu_var is used both as an
lvalue and an rvalue.  That means two instructions where one would
suffice.  So there doesn't seem to be a way to get the optimal code,
unless the gcc folks are willing to add a -fkernel or something for
us. :)

Paul.
