linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* Re: RFC: unlazy fpu for frequent fpu users
@ 2006-07-01 16:22 Chuck Ebbert
  0 siblings, 0 replies; 6+ messages in thread
From: Chuck Ebbert @ 2006-07-01 16:22 UTC (permalink / raw)
  To: Arjan van de Ven; +Cc: linux-kernel

In-Reply-To: <1151756010.3195.31.camel@laptopd505.fenrus.org>

On Sat, 01 Jul 2006 14:13:30 +0200, Arjan van de Ven wrote:

> > You can do better that that.  FXSR doesn't destroy the FPU contents; if
> > you track the context carefully you can completely avoid the restore.
> > This requires keeping a per-cpu variable that holds a pointer to the
> 
> to be honest, while I like the idea, it does scare me from a security
> point of view, both in terms of leaks and in terms of injecting bad
> stuff. 

I thought about it and couldn't find any obvious problems.  If there
are leaks then they're already possible because we always leave the
stale context in the FPU when we switch (except on K7/K8.)  And if a
task touches the FPU the stale context gets overwritten first.

> Can you send me your patch so that I can integrate it (and I'll port
> your if() to the prefetch)... 

OK.  I ported your patch to i386 on top of the perfmon2 patchset, but
by hand editing I got one that will apply with offsets to 2.6.17-mm4.
Not even compile tested against that version...

Signed-off-by: Chuck Ebbert <76306.1226@compuserve.com>

 arch/i386/kernel/process.c |   12 ++++++++++++
 arch/i386/kernel/traps.c   |    3 ++-
 include/asm-i386/i387.h    |    5 ++++-
 include/linux/sched.h      |    9 +++++++++
 4 files changed, 27 insertions(+), 2 deletions(-)

--- 2.6.17.1-32-pfmon.orig/arch/i386/kernel/process.c
+++ 2.6.17.1-32-pfmon/arch/i386/kernel/process.c
@@ -635,6 +635,11 @@ struct task_struct fastcall * __switch_t
 
 	__unlazy_fpu(prev_p);
 
+
+	if (next_p->fpu_counter > 5)
+		/* prefetch the fxsave area into the cache */
+		prefetch(&next->i387.fxsave);
+
 	/*
 	 * Reload esp0.
 	 */
@@ -693,6 +698,13 @@ struct task_struct fastcall * __switch_t
 
 	disable_tsc(prev_p, next_p);
 
+	/* If the task has used fpu the last 5 timeslices, just do a full
+	 * restore of the math state immediately to avoid the trap; the
+	 * chances of needing FPU soon are obviously high now
+	 */
+	if (next_p->fpu_counter > 5)
+		math_state_restore();
+
 	return prev_p;
 }
 
--- 2.6.17.1-32-pfmon.orig/arch/i386/kernel/traps.c
+++ 2.6.17.1-32-pfmon/arch/i386/kernel/traps.c
@@ -1046,7 +1046,7 @@ fastcall unsigned char * fixup_x86_bogus
  * Must be called with kernel preemption disabled (in this case,
  * local interrupts are disabled at the call-site in entry.S).
  */
-asmlinkage void math_state_restore(struct pt_regs regs)
+asmlinkage void math_state_restore(void)
 {
 	struct thread_info *thread = current_thread_info();
 	struct task_struct *tsk = thread->task;
@@ -1056,6 +1056,7 @@ asmlinkage void math_state_restore(struc
 		init_fpu(tsk);
 	restore_fpu(tsk);
 	thread->status |= TS_USEDFPU;	/* So we fnsave on switch_to() */
+	tsk->fpu_counter++;
 }
 
 #ifndef CONFIG_MATH_EMULATION
--- 2.6.17.1-32-pfmon.orig/include/asm-i386/i387.h
+++ 2.6.17.1-32-pfmon/include/asm-i386/i387.h
@@ -76,7 +76,9 @@ static inline void __save_init_fpu( stru
 
 #define __unlazy_fpu( tsk ) do { \
 	if (task_thread_info(tsk)->status & TS_USEDFPU) \
-		save_init_fpu( tsk ); \
+		save_init_fpu( tsk ); 			\
+	else						\
+		tsk->fpu_counter = 0;			\
 } while (0)
 
 #define __clear_fpu( tsk )					\
@@ -118,6 +120,7 @@ static inline void save_init_fpu( struct
 extern unsigned short get_fpu_cwd( struct task_struct *tsk );
 extern unsigned short get_fpu_swd( struct task_struct *tsk );
 extern unsigned short get_fpu_mxcsr( struct task_struct *tsk );
+extern asmlinkage void math_state_restore(void);
 
 /*
  * Signal frame handlers...
--- 2.6.17.1-32-pfmon.orig/include/linux/sched.h
+++ 2.6.17.1-32-pfmon/include/linux/sched.h
@@ -893,6 +893,15 @@ struct task_struct {
 	spinlock_t delays_lock;
 	struct task_delay_info *delays;
 #endif
+	/*
+	 * fpu_counter contains the number of consecutive context switches
+	 * that the FPU is used. If this is over a threshold, the lazy fpu
+	 * saving becomes unlazy to save the trap. This is an unsigned char
+	 * so that after 256 times the counter wraps and the behavior turns
+	 * lazy again; this to deal with bursty apps that only use FPU for
+	 * a short time
+	 */
+	unsigned char fpu_counter;
 };
 
 static inline pid_t process_group(struct task_struct *tsk)
-- 
Chuck
 "You can't read a newspaper if you can't read."  --George W. Bush

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: RFC: unlazy fpu for frequent fpu users
  2006-06-30 22:31 ` Roland Dreier
@ 2006-07-01 13:01   ` Arjan van de Ven
  0 siblings, 0 replies; 6+ messages in thread
From: Arjan van de Ven @ 2006-07-01 13:01 UTC (permalink / raw)
  To: Roland Dreier; +Cc: linux-kernel

On Fri, 2006-06-30 at 15:31 -0700, Roland Dreier wrote:
>  > +	/* prefetch the fxsave area into the cache */
>  > +	prefetch(&next->i387.fxsave);
> 
> This chunk is not obviously related to the rest of the patch, and
> perhaps needs additional justification.

ok fair enough; will improve the comment


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: RFC: unlazy fpu for frequent fpu users
  2006-07-01 11:51 Chuck Ebbert
@ 2006-07-01 12:13 ` Arjan van de Ven
  0 siblings, 0 replies; 6+ messages in thread
From: Arjan van de Ven @ 2006-07-01 12:13 UTC (permalink / raw)
  To: Chuck Ebbert; +Cc: linux-kernel


> You can do better that that.  FXSR doesn't destroy the FPU contents; if
> you track the context carefully you can completely avoid the restore.
> This requires keeping a per-cpu variable that holds a pointer to the


to be honest, while I like the idea, it does scare me from a security
point of view, both in terms of leaks and in terms of injecting bad
stuff. 

> > --- linux-2.6.17-sleazyfpu.orig/arch/x86_64/kernel/process.c
> > +++ linux-2.6.17-sleazyfpu/arch/x86_64/kernel/process.c
> > @@ -515,6 +515,9 @@ __switch_to(struct task_struct *prev_p, 
> >       int cpu = smp_processor_id();  
> >       struct tss_struct *tss = &per_cpu(init_tss, cpu);
> >  
> > +     /* prefetch the fxsave area into the cache */
> > +     prefetch(&next->i387.fxsave);
> > +
> >       /*
> >        * Reload esp0, LDT and the page table pointer:
> >        */
> 
> This prefetch is probably a bad idea.  I ported your patch to i386 and it was
> actually slower until I changed it:
> 
> +       if (next_p->fpu_counter > 5)
> +               /* prefetch the fxsave area into the cache */
> +               prefetch(&next->i387.fxsave);
> +
> 
> Now it's ~.4% faster.  The test was an FP program doing a simple benchmark
> while a non-FP program ran in a tight loop.

nice! I sort of am not a big fan of if .. prefetch() but if it shows
gain... then you convinced me. 0.4% is roughly the same order I saw.
It's not gigantic but it's almost free to do so it may be worth it
anyway... 
Can you send me your patch so that I can integrate it (and I'll port
your if() to the prefetch)... 

Greetings,
    Arjan van de Ven


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: RFC: unlazy fpu for frequent fpu users
@ 2006-07-01 11:51 Chuck Ebbert
  2006-07-01 12:13 ` Arjan van de Ven
  0 siblings, 1 reply; 6+ messages in thread
From: Chuck Ebbert @ 2006-07-01 11:51 UTC (permalink / raw)
  To: Arjan van de Ven; +Cc: linux-kernel

In-Reply-To: <1151705536.11434.69.camel@laptopd505.fenrus.org>

On Sat, 01 Jul 2006 00:12:16 +0200, Arjan van de Ven wrote:

> right now the kernel on x86-64 has a 100% lazy fpu behavior: after
> *every* context switch a trap is taken for the first FPU use to restore
> the FPU context lazily. This is of course great for applications that
> have very sporadic or no FPU use (since then you avoid doing the
> expensive save/restore all the time). However for very frequent FPU
> users... you take an extra trap every context switch.
> 
> The patch below adds a simple heuristic to this code: After 5
> consecutive context switches of FPU use, the lazy behavior is disabled
> and the context gets restored every context switch. If the app indeed
> uses the FPU, the trap is avoided. (the chance of the 6th time slice
> using FPU after the previous 5 having done so are quite high obviously).
> 

You can do better that that.  FXSR doesn't destroy the FPU contents; if
you track the context carefully you can completely avoid the restore.
This requires keeping a per-cpu variable that holds a pointer to the
thread that last used the FPU and a per-thread variable containing the
CPU number on which the thread last used FP math. Unfortunately this
won't work in x86_64 because of the 'fxsave information leak' workaround.

> --- linux-2.6.17-sleazyfpu.orig/arch/x86_64/kernel/process.c
> +++ linux-2.6.17-sleazyfpu/arch/x86_64/kernel/process.c
> @@ -515,6 +515,9 @@ __switch_to(struct task_struct *prev_p, 
>       int cpu = smp_processor_id();  
>       struct tss_struct *tss = &per_cpu(init_tss, cpu);
>  
> +     /* prefetch the fxsave area into the cache */
> +     prefetch(&next->i387.fxsave);
> +
>       /*
>        * Reload esp0, LDT and the page table pointer:
>        */

This prefetch is probably a bad idea.  I ported your patch to i386 and it was
actually slower until I changed it:

+       if (next_p->fpu_counter > 5)
+               /* prefetch the fxsave area into the cache */
+               prefetch(&next->i387.fxsave);
+

Now it's ~.4% faster.  The test was an FP program doing a simple benchmark
while a non-FP program ran in a tight loop.

-- 
Chuck
 "You can't read a newspaper if you can't read."  --George W. Bush

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: RFC: unlazy fpu for frequent fpu users
  2006-06-30 22:12 Arjan van de Ven
@ 2006-06-30 22:31 ` Roland Dreier
  2006-07-01 13:01   ` Arjan van de Ven
  0 siblings, 1 reply; 6+ messages in thread
From: Roland Dreier @ 2006-06-30 22:31 UTC (permalink / raw)
  To: Arjan van de Ven; +Cc: linux-kernel

 > +	/* prefetch the fxsave area into the cache */
 > +	prefetch(&next->i387.fxsave);

This chunk is not obviously related to the rest of the patch, and
perhaps needs additional justification.

And the comment getting pretty close to

	/* set i to 2 */
	i = 2;

territory ;)

 - R.

^ permalink raw reply	[flat|nested] 6+ messages in thread

* RFC: unlazy fpu for frequent fpu users
@ 2006-06-30 22:12 Arjan van de Ven
  2006-06-30 22:31 ` Roland Dreier
  0 siblings, 1 reply; 6+ messages in thread
From: Arjan van de Ven @ 2006-06-30 22:12 UTC (permalink / raw)
  To: linux-kernel

Hi,

right now the kernel on x86-64 has a 100% lazy fpu behavior: after
*every* context switch a trap is taken for the first FPU use to restore
the FPU context lazily. This is of course great for applications that
have very sporadic or no FPU use (since then you avoid doing the
expensive save/restore all the time). However for very frequent FPU
users... you take an extra trap every context switch.

The patch below adds a simple heuristic to this code: After 5
consecutive context switches of FPU use, the lazy behavior is disabled
and the context gets restored every context switch. If the app indeed
uses the FPU, the trap is avoided. (the chance of the 6th time slice
using FPU after the previous 5 having done so are quite high obviously).

After 256 switches, this is reset and lazy behavior is returned (until
there are 5 consecutive ones again). The reason for this is to give apps
that do longer bursts of FPU use still the lazy behavior back after some
time.

I've done some very simple, unscientific, benchmarks and these showed a
very small performance increase; I'll try to do some better benchmarks
soon.


This is not a request for merge but a request for comments.. do people
think this is a useful idea? Also.. I'd love to get benchmarks on
not-my-machine with this...


---
 arch/x86_64/kernel/process.c |    9 +++++++++
 arch/x86_64/kernel/traps.c   |    1 +
 include/asm-x86_64/i387.h    |    5 ++++-
 include/linux/sched.h        |    9 +++++++++
 4 files changed, 23 insertions(+), 1 deletion(-)

Index: linux-2.6.17-sleazyfpu/arch/x86_64/kernel/process.c
===================================================================
--- linux-2.6.17-sleazyfpu.orig/arch/x86_64/kernel/process.c
+++ linux-2.6.17-sleazyfpu/arch/x86_64/kernel/process.c
@@ -515,6 +515,9 @@ __switch_to(struct task_struct *prev_p, 
 	int cpu = smp_processor_id();  
 	struct tss_struct *tss = &per_cpu(init_tss, cpu);
 
+	/* prefetch the fxsave area into the cache */
+	prefetch(&next->i387.fxsave);
+
 	/*
 	 * Reload esp0, LDT and the page table pointer:
 	 */
@@ -618,6 +621,12 @@ __switch_to(struct task_struct *prev_p, 
 		}
 	}
 
+	/* If the task has used fpu the last 5 timeslices, just do a full
+	 * restore of the math state immediately to avoid the trap; the
+	 * chances of needing FPU soon are obviously high now
+	 */
+	if (next_p->fpu_counter>5)
+		math_state_restore();
 	return prev_p;
 }
 
Index: linux-2.6.17-sleazyfpu/arch/x86_64/kernel/traps.c
===================================================================
--- linux-2.6.17-sleazyfpu.orig/arch/x86_64/kernel/traps.c
+++ linux-2.6.17-sleazyfpu/arch/x86_64/kernel/traps.c
@@ -1061,6 +1061,7 @@ asmlinkage void math_state_restore(void)
 		init_fpu(me);
 	restore_fpu_checking(&me->thread.i387.fxsave);
 	task_thread_info(me)->status |= TS_USEDFPU;
+	me->fpu_counter++;
 }
 
 void __init trap_init(void)
Index: linux-2.6.17-sleazyfpu/include/asm-x86_64/i387.h
===================================================================
--- linux-2.6.17-sleazyfpu.orig/include/asm-x86_64/i387.h
+++ linux-2.6.17-sleazyfpu/include/asm-x86_64/i387.h
@@ -24,6 +24,7 @@ extern unsigned int mxcsr_feature_mask;
 extern void mxcsr_feature_mask_init(void);
 extern void init_fpu(struct task_struct *child);
 extern int save_i387(struct _fpstate __user *buf);
+extern asmlinkage void math_state_restore(void);
 
 /*
  * FPU lazy state save handling...
@@ -31,7 +32,9 @@ extern int save_i387(struct _fpstate __u
 
 #define unlazy_fpu(tsk) do { \
 	if (task_thread_info(tsk)->status & TS_USEDFPU) \
-		save_init_fpu(tsk); \
+		save_init_fpu(tsk); 			\
+	else						\
+		tsk->fpu_counter = 0;			\
 } while (0)
 
 /* Ignore delayed exceptions from user space */
Index: linux-2.6.17-sleazyfpu/include/linux/sched.h
===================================================================
--- linux-2.6.17-sleazyfpu.orig/include/linux/sched.h
+++ linux-2.6.17-sleazyfpu/include/linux/sched.h
@@ -1023,6 +1023,15 @@ struct task_struct {
 	spinlock_t delays_lock;
 	struct task_delay_info *delays;
 #endif
+	/*
+	 * fpu_counter contains the number of consecutive context switches
+	 * that the FPU is used. If this is over a threshold, the lazy fpu
+	 * saving becomes unlazy to save the trap. This is an unsigned char
+	 * so that after 256 times the counter wraps and the behavior turns
+	 * lazy again; this to deal with bursty apps that only use FPU for
+	 * a short time
+	 */
+	unsigned char fpu_counter;
 };
 
 static inline pid_t process_group(struct task_struct *tsk)



^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2006-07-01 16:25 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2006-07-01 16:22 RFC: unlazy fpu for frequent fpu users Chuck Ebbert
  -- strict thread matches above, loose matches on Subject: below --
2006-07-01 11:51 Chuck Ebbert
2006-07-01 12:13 ` Arjan van de Ven
2006-06-30 22:12 Arjan van de Ven
2006-06-30 22:31 ` Roland Dreier
2006-07-01 13:01   ` Arjan van de Ven

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).