All of lore.kernel.org
 help / color / mirror / Atom feed
* syscall exit path optimization
@ 2005-01-26 21:02 Chen, Kenneth W
  2005-01-26 21:30 ` David Mosberger
                   ` (3 more replies)
  0 siblings, 4 replies; 5+ messages in thread
From: Chen, Kenneth W @ 2005-01-26 21:02 UTC (permalink / raw)
  To: linux-ia64

Follow up on the previous discussion, this patch optimizes how we handle
r8/r10 in the syscall return path.  If there is no pending work to be
done, we will skip storing/loading r8/r10, cutting out 4 memory
references in the fast path.  This results in a net saving of 4 cycles.

Signed-off-by: Ken Chen <kenneth.w.chen@intel.com>
Signed-off-by: Rohit Seth <rohit.seth@intel.com>


--- linux-ia64-release/arch/ia64/kernel/entry.S.orig	2005-01-26 11:41:24.000000000 -0800
+++ linux-ia64-release/arch/ia64/kernel/entry.S	2005-01-26 12:31:52.000000000 -0800
@@ -558,7 +558,8 @@ GLOBAL_ENTRY(ia64_trace_syscall)
 .mem.offset 0,0; st8.spill [r2]=r8		// store return value in slot for r8
 .mem.offset 8,0; st8.spill [r3]=r10		// clear error indication in slot for r10
 	br.call.sptk.many rp=syscall_trace_leave // give parent a chance to catch return value
-.ret3:	br.cond.sptk ia64_leave_syscall
+	cmp.eq p9,p8=r0,r0
+	br.cond.sptk ia64_leave_syscall

 strace_error:
 	ld8 r3=[r2]				// load pt_regs.r8
@@ -619,12 +620,10 @@ END(ia64_ret_from_clone)
 	// fall through
 GLOBAL_ENTRY(ia64_ret_from_syscall)
 	PT_REGS_UNWIND_INFO(0)
+	cmp.eq p8,p9=r0,r0	// p8: ret val in live reg, p9: ret val in pt_regs
 	cmp.ge p6,p7=r8,r0			// syscall executed successfully?
 	adds r2=PT(R8)+16,sp			// r2 = &pt_regs.r8
-	adds r3=PT(R10)+16,sp			// r3 = &pt_regs.r10
-	;;
-(p6)	st8 [r2]=r8				// store return value in slot for r8
-(p6)	st8 [r3]=r0				// clear error indication in slot for r10
+	mov r10=r0				// clear error indication in r10
 (p7)	br.cond.spnt handle_syscall_error	// handle potential syscall failure
 END(ia64_ret_from_syscall)
 	// fall through
@@ -715,10 +714,10 @@ ENTRY(ia64_leave_syscall)
 	;;
 	mov r16=ar.bsp				// M2  get existing backing store pointer
 (p6)	cmp4.ne.unc p6,p0=r15, r0		// any special work pending?
-(p6)	br.cond.spnt .work_pending
+(p6)	br.cond.spnt .work_pending_syscall
 	;;
 	// start restoring the state saved on the kernel stack (struct pt_regs):
-	ld8 r8=[r2],16
+(p9)	ld8 r8=[r2],16
 	ld8 r9=[r3],16
 	mov f6=f0		// clear f6
 	;;
@@ -726,9 +725,10 @@ ENTRY(ia64_leave_syscall)
 	rsm psr.i | psr.ic	// M2 initiate turning off of interrupt and interruption collection
 	mov f9=f0		// clear f9

-	ld8 r10=[r2],16
+	.pred.rel.mutex p8,p9
+(p9)	ld8 r10=[r2],16
 	ld8 r11=[r3],16
-	mov f7=f0		// clear f7
+(p8)	add r2=32,r2
 	;;
 	ld8 r29=[r2],16		// load cr.ipsr
 	ld8 r28=[r3],16			// load cr.iip
@@ -760,7 +760,7 @@ ENTRY(ia64_leave_syscall)
 	;;
 	srlz.d			// M0  ensure interruption collection is off
 	ld8.fill r13=[r3],16
-	nop.i 0
+	mov f7=f0		// clear f7
 	;;
 	ld8.fill r12=[r2]	// restore r12 (sp)
 	ld8.fill r15=[r3]	// restore r15
@@ -770,8 +770,8 @@ ENTRY(ia64_leave_syscall)
 (pUStk) st1 [r14]=r17
 	mov b6=r18		// I0  restore b6
 	;;
-	shr.u r18=r19,16	// I0|1 get byte size of existing "dirty" partition
 	mov r14=r0		// clear r14
+	shr.u r18=r19,16	// I0|1 get byte size of existing "dirty" partition
 (pKStk) br.cond.dpnt.many skip_rbs_switch

 	mov.m ar.ccv=r0		// clear ar.ccv
@@ -1083,6 +1083,10 @@ skip_rbs_switch:
 	 * On exit:
 	 *	p6 = TRUE if work-pending-check needs to be redone
 	 */
+.work_pending_syscall:
+(p8)	st8 [r2]=r8,16
+	;;
+(p8)	st8 [r2]=r10,16
 .work_pending:
 	tbit.nz p6,p0=r31,TIF_SIGDELAYED		// signal delayed from  MCA/INIT/NMI/PMI context?
 (p6)	br.cond.sptk.few .sigdelayed
@@ -1104,12 +1108,14 @@ skip_rbs_switch:
 	;;
 (pKStk)	st4 [r20]=r0		// preempt_count() <- 0
 #endif
+	cmp.eq p9,p8=r0,r0
 (pLvSys)br.cond.sptk.many .work_processed_syscall	// re-check
 	br.cond.sptk.many .work_processed_kernel	// re-check

 .notify:
 (pUStk)	br.call.spnt.many rp=notify_resume_user
 .ret10:	cmp.ne p6,p0=r0,r0				// p6 <- 0
+	cmp.eq p9,p8=r0,r0
 (pLvSys)br.cond.sptk.many .work_processed_syscall	// don't re-check
 	br.cond.sptk.many .work_processed_kernel	// don't re-check

@@ -1121,6 +1127,7 @@ skip_rbs_switch:
 .sigdelayed:
 	br.call.sptk.many rp=do_sigdelayed
 	cmp.eq p6,p0=r0,r0				// p6 <- 1, always re-check
+	cmp.eq p9,p8=r0,r0
 (pLvSys)br.cond.sptk.many .work_processed_syscall	// re-check
 	br.cond.sptk.many .work_processed_kernel	// re-check

@@ -1135,17 +1142,11 @@ ENTRY(handle_syscall_error)
 	 */
 	PT_REGS_UNWIND_INFO(0)
 	ld8 r3=[r2]		// load pt_regs.r8
-	sub r9=0,r8		// negate return value to get errno
 	;;
-	mov r10=-1		// return -1 in pt_regs.r10 to indicate error
 	cmp.eq p6,p7=r3,r0	// is pt_regs.r8=0?
-	adds r3=16,r2		// r3=&pt_regs.r10
-	;;
-(p6)	mov r9=r8
-(p6)	mov r10=0
 	;;
-	st8 [r2]=r9		// store errno in pt_regs.r8
-	st8 [r3]=r10		// store error indication in pt_regs.r10
+(p7)	mov r10=-1
+(p7)	sub r8=0,r8		// negate return value to get errno
 	br.cond.sptk ia64_leave_syscall
 END(handle_syscall_error)




^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: syscall exit path optimization
  2005-01-26 21:02 syscall exit path optimization Chen, Kenneth W
@ 2005-01-26 21:30 ` David Mosberger
  2005-01-26 21:54 ` Chen, Kenneth W
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 5+ messages in thread
From: David Mosberger @ 2005-01-26 21:30 UTC (permalink / raw)
  To: linux-ia64

I like the patch (it's quite clever!), except it would be nice if we
could avoid the p8/p9 dependencies.  Couldn't you restore r8/r10 after
.work_pending is done if pLvSys is TRUE?  That way, .work_processed
would simply preserve (save _and_ restore) r8/r10.  My comments are
based on reading the diff only, though, so perhaps I'm missing
something.

	--david

^ permalink raw reply	[flat|nested] 5+ messages in thread

* RE: syscall exit path optimization
  2005-01-26 21:02 syscall exit path optimization Chen, Kenneth W
  2005-01-26 21:30 ` David Mosberger
@ 2005-01-26 21:54 ` Chen, Kenneth W
  2005-01-27 23:44 ` Chen, Kenneth W
  2005-01-27 23:50 ` David Mosberger
  3 siblings, 0 replies; 5+ messages in thread
From: Chen, Kenneth W @ 2005-01-26 21:54 UTC (permalink / raw)
  To: linux-ia64

David Mosberger wrote on Wednesday, January 26, 2005 1:31 PM
> I like the patch (it's quite clever!), except it would be nice if we
> could avoid the p8/p9 dependencies.  Couldn't you restore r8/r10 after
> .work_pending is done if pLvSys is TRUE?  That way, .work_processed
> would simply preserve (save _and_ restore) r8/r10.

Yeah, I think that's doable.

- Ken



^ permalink raw reply	[flat|nested] 5+ messages in thread

* RE: syscall exit path optimization
  2005-01-26 21:02 syscall exit path optimization Chen, Kenneth W
  2005-01-26 21:30 ` David Mosberger
  2005-01-26 21:54 ` Chen, Kenneth W
@ 2005-01-27 23:44 ` Chen, Kenneth W
  2005-01-27 23:50 ` David Mosberger
  3 siblings, 0 replies; 5+ messages in thread
From: Chen, Kenneth W @ 2005-01-27 23:44 UTC (permalink / raw)
  To: linux-ia64

David Mosberger wrote on Wednesday, January 26, 2005 1:31 PM
> Couldn't you restore r8/r10 after .work_pending is done if
> pLvSys is TRUE?  That way, .work_processed would simply preserve
> (save _and_ restore) r8/r10.

Thank you for reviewing and for the suggestion.  Here is the updated
patch, with a net saving of 6 cycles compared to 4 with the earlier version.

Signed-off-by: Ken Chen <kenneth.w.chen@intel.com>
Signed-off-by: Rohit Seth <rohit.seth@intel.com>


--- linux-ia64-release/arch/ia64/kernel/entry.S.orig	2005-01-26 11:41:24.000000000 -0800
+++ linux-ia64-release/arch/ia64/kernel/entry.S	2005-01-27 15:13:25.000000000 -0800
@@ -558,7 +558,7 @@ GLOBAL_ENTRY(ia64_trace_syscall)
 .mem.offset 0,0; st8.spill [r2]=r8		// store return value in slot for r8
 .mem.offset 8,0; st8.spill [r3]=r10		// clear error indication in slot for r10
 	br.call.sptk.many rp=syscall_trace_leave // give parent a chance to catch return value
-.ret3:	br.cond.sptk ia64_leave_syscall
+.ret3:	br.cond.sptk .work_pending_syscall_end

 strace_error:
 	ld8 r3=[r2]				// load pt_regs.r8
@@ -621,10 +621,7 @@ GLOBAL_ENTRY(ia64_ret_from_syscall)
 	PT_REGS_UNWIND_INFO(0)
 	cmp.ge p6,p7=r8,r0			// syscall executed successfully?
 	adds r2=PT(R8)+16,sp			// r2 = &pt_regs.r8
-	adds r3=PT(R10)+16,sp			// r3 = &pt_regs.r10
-	;;
-(p6)	st8 [r2]=r8				// store return value in slot for r8
-(p6)	st8 [r3]=r0				// clear error indication in slot for r10
+	mov r10=r0				// clear error indication in r10
 (p7)	br.cond.spnt handle_syscall_error	// handle potential syscall failure
 END(ia64_ret_from_syscall)
 	// fall through
@@ -709,27 +706,23 @@ ENTRY(ia64_leave_syscall)
 	ld8 r19=[r2],PT(B6)-PT(LOADRS)		// load ar.rsc value for "loadrs"
 	mov b7=r0		// clear b7
 	;;
-	ld8 r23=[r3],PT(R9)-PT(AR_BSPSTORE)	// load ar.bspstore (may be garbage)
-	ld8 r18=[r2],PT(R8)-PT(B6)		// load b6
+	ld8 r23=[r3],PT(R11)-PT(AR_BSPSTORE)	// load ar.bspstore (may be garbage)
+	ld8 r18=[r2],PT(R9)-PT(B6)		// load b6
 (p6)	and r15=TIF_WORK_MASK,r31		// any work other than TIF_SYSCALL_TRACE?
 	;;
 	mov r16=ar.bsp				// M2  get existing backing store pointer
 (p6)	cmp4.ne.unc p6,p0=r15, r0		// any special work pending?
-(p6)	br.cond.spnt .work_pending
+(p6)	br.cond.spnt .work_pending_syscall
 	;;
 	// start restoring the state saved on the kernel stack (struct pt_regs):
-	ld8 r8=[r2],16
-	ld8 r9=[r3],16
+	ld8 r9=[r2],PT(CR_IPSR)-PT(R9)
+	ld8 r11=[r3],PT(CR_IIP)-PT(R11)
 	mov f6=f0		// clear f6
 	;;
 	invala			// M0|1 invalidate ALAT
 	rsm psr.i | psr.ic	// M2 initiate turning off of interrupt and interruption collection
 	mov f9=f0		// clear f9

-	ld8 r10=[r2],16
-	ld8 r11=[r3],16
-	mov f7=f0		// clear f7
-	;;
 	ld8 r29=[r2],16		// load cr.ipsr
 	ld8 r28=[r3],16			// load cr.iip
 	mov f8=f0		// clear f8
@@ -760,7 +753,7 @@ ENTRY(ia64_leave_syscall)
 	;;
 	srlz.d			// M0  ensure interruption collection is off
 	ld8.fill r13=[r3],16
-	nop.i 0
+	mov f7=f0		// clear f7
 	;;
 	ld8.fill r12=[r2]	// restore r12 (sp)
 	ld8.fill r15=[r3]	// restore r15
@@ -770,8 +763,8 @@ ENTRY(ia64_leave_syscall)
 (pUStk) st1 [r14]=r17
 	mov b6=r18		// I0  restore b6
 	;;
-	shr.u r18=r19,16	// I0|1 get byte size of existing "dirty" partition
 	mov r14=r0		// clear r14
+	shr.u r18=r19,16	// I0|1 get byte size of existing "dirty" partition
 (pKStk) br.cond.dpnt.many skip_rbs_switch

 	mov.m ar.ccv=r0		// clear ar.ccv
@@ -1083,6 +1076,12 @@ skip_rbs_switch:
 	 * On exit:
 	 *	p6 = TRUE if work-pending-check needs to be redone
 	 */
+.work_pending_syscall:
+	add r2=-8,r2
+	add r3=-8,r3
+	;;
+	st8 [r2]=r8
+	st8 [r3]=r10
 .work_pending:
 	tbit.nz p6,p0=r31,TIF_SIGDELAYED		// signal delayed from  MCA/INIT/NMI/PMI context?
 (p6)	br.cond.sptk.few .sigdelayed
@@ -1104,13 +1103,13 @@ skip_rbs_switch:
 	;;
 (pKStk)	st4 [r20]=r0		// preempt_count() <- 0
 #endif
-(pLvSys)br.cond.sptk.many .work_processed_syscall	// re-check
+(pLvSys)br.cond.sptk.few  .work_pending_syscall_end
 	br.cond.sptk.many .work_processed_kernel	// re-check

 .notify:
 (pUStk)	br.call.spnt.many rp=notify_resume_user
 .ret10:	cmp.ne p6,p0=r0,r0				// p6 <- 0
-(pLvSys)br.cond.sptk.many .work_processed_syscall	// don't re-check
+(pLvSys)br.cond.sptk.few  .work_pending_syscall_end
 	br.cond.sptk.many .work_processed_kernel	// don't re-check

 // There is a delayed signal that was detected in MCA/INIT/NMI/PMI context where
@@ -1121,9 +1120,17 @@ skip_rbs_switch:
 .sigdelayed:
 	br.call.sptk.many rp=do_sigdelayed
 	cmp.eq p6,p0=r0,r0				// p6 <- 1, always re-check
-(pLvSys)br.cond.sptk.many .work_processed_syscall	// re-check
+(pLvSys)br.cond.sptk.few  .work_pending_syscall_end
 	br.cond.sptk.many .work_processed_kernel	// re-check

+.work_pending_syscall_end:
+	adds r2=PT(R8)+16,r12
+	adds r3=PT(R10)+16,r12
+	;;
+	ld8 r8=[r2]
+	ld8 r10=[r3]
+	br.cond.sptk.many .work_processed_syscall	// re-check
+
 END(ia64_leave_kernel)

 ENTRY(handle_syscall_error)
@@ -1135,17 +1142,11 @@ ENTRY(handle_syscall_error)
 	 */
 	PT_REGS_UNWIND_INFO(0)
 	ld8 r3=[r2]		// load pt_regs.r8
-	sub r9=0,r8		// negate return value to get errno
 	;;
-	mov r10=-1		// return -1 in pt_regs.r10 to indicate error
 	cmp.eq p6,p7=r3,r0	// is pt_regs.r8=0?
-	adds r3=16,r2		// r3=&pt_regs.r10
-	;;
-(p6)	mov r9=r8
-(p6)	mov r10=0
 	;;
-	st8 [r2]=r9		// store errno in pt_regs.r8
-	st8 [r3]=r10		// store error indication in pt_regs.r10
+(p7)	mov r10=-1
+(p7)	sub r8=0,r8		// negate return value to get errno
 	br.cond.sptk ia64_leave_syscall
 END(handle_syscall_error)





^ permalink raw reply	[flat|nested] 5+ messages in thread

* RE: syscall exit path optimization
  2005-01-26 21:02 syscall exit path optimization Chen, Kenneth W
                   ` (2 preceding siblings ...)
  2005-01-27 23:44 ` Chen, Kenneth W
@ 2005-01-27 23:50 ` David Mosberger
  3 siblings, 0 replies; 5+ messages in thread
From: David Mosberger @ 2005-01-27 23:50 UTC (permalink / raw)
  To: linux-ia64

>>>>> On Thu, 27 Jan 2005 15:44:42 -0800, "Chen, Kenneth W" <kenneth.w.chen@intel.com> said:

  Ken> David Mosberger wrote on Wednesday, January 26, 2005 1:31 PM
  >> Couldn't you restore r8/r10 after .work_pending is done if
  >> pLvSys is TRUE?  That way, .work_processed would simply preserve
  >> (save _and_ restore) r8/r10.

  Ken> Thank you for reviewing and the suggestion.  Here is the
  Ken> updated patch, net saving for 6 cycles compares to 4 with
  Ken> earlier version.

Tidy _and_ faster---not bad at all for a net change of one extra line!

Good work.

	--david

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2005-01-27 23:50 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2005-01-26 21:02 syscall exit path optimization Chen, Kenneth W
2005-01-26 21:30 ` David Mosberger
2005-01-26 21:54 ` Chen, Kenneth W
2005-01-27 23:44 ` Chen, Kenneth W
2005-01-27 23:50 ` David Mosberger

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.