linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [RFC PATCH v2 1/2] x86/fpu: detect AVX task
@ 2018-11-07 17:16 Aubrey Li
  2018-11-07 17:16 ` [RFC PATCH v2 2/2] proc: add /proc/<pid>/thread_state Aubrey Li
                   ` (2 more replies)
  0 siblings, 3 replies; 11+ messages in thread
From: Aubrey Li @ 2018-11-07 17:16 UTC (permalink / raw)
  To: tglx, mingo, peterz, hpa
  Cc: ak, tim.c.chen, arjan, aubrey.li, linux-kernel, Aubrey Li

XSAVES and its variants use init optimization to reduce the amount of
data that they save to memory during context switch. Init optimization
uses the state component bitmap to denote if a component is in its init
configuration. We use this information to detect if a task contains AVX
instructions.

Signed-off-by: Aubrey Li <aubrey.li@linux.intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Arjan van de Ven <arjan@linux.intel.com>
---
 arch/x86/include/asm/fpu/internal.h | 97 +++++++++++++++++++++++++++----------
 arch/x86/include/asm/fpu/types.h    | 17 +++++++
 2 files changed, 88 insertions(+), 26 deletions(-)

diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h
index a38bf5a..b0519f4 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -74,6 +74,12 @@ static __always_inline __pure bool use_fxsr(void)
 	return static_cpu_has(X86_FEATURE_FXSR);
 }
 
+static __always_inline __pure bool use_xgetbv1(void)
+{
+	return static_cpu_has(X86_FEATURE_OSXSAVE) &&
+		static_cpu_has(X86_FEATURE_XGETBV1);
+}
+
 /*
  * fpstate handling functions:
  */
@@ -103,6 +109,34 @@ static inline void fpstate_init_fxstate(struct fxregs_state *fx)
 }
 extern void fpstate_sanitize_xstate(struct fpu *fpu);
 
+/*
+ * MXCSR and XCR definitions:
+ */
+
+extern unsigned int mxcsr_feature_mask;
+
+#define	XCR_XFEATURE_ENABLED_MASK	0x00000000
+#define	XINUSE_STATE_BITMAP_INDEX	0x00000001
+
+static inline u64 xgetbv(u32 index)
+{
+	u32 eax, edx;
+
+	asm volatile(".byte 0x0f,0x01,0xd0" /* xgetbv */
+		     : "=a" (eax), "=d" (edx)
+		     : "c" (index));
+	return eax + ((u64)edx << 32);
+}
+
+static inline void xsetbv(u32 index, u64 value)
+{
+	u32 eax = value;
+	u32 edx = value >> 32;
+
+	asm volatile(".byte 0x0f,0x01,0xd1" /* xsetbv */
+		     : : "a" (eax), "d" (edx), "c" (index));
+}
+
 #define user_insn(insn, output, input...)				\
 ({									\
 	int err;							\
@@ -275,6 +309,42 @@ static inline void copy_fxregs_to_kernel(struct fpu *fpu)
 		     : "D" (st), "m" (*st), "a" (lmask), "d" (hmask)	\
 		     : "memory")
 
+#define	AVX_STATE_DECAY_COUNT	3
+
+/*
+ * This function is called during context switch to update AVX component state
+ */
+static inline void update_avx_state(struct avx_state *avx)
+{
+	/*
+	 * Check if XGETBV with ECX = 1 supported. XGETBV with ECX = 1
+	 * returns the logical-AND of XCR0 and XINUSE. XINUSE is a bitmap
+	 * by which the processor tracks the status of various components.
+	 */
+	if (!use_xgetbv1()) {
+		avx->state = 0;
+		return;
+	}
+	/*
+	 * XINUSE is dynamic to track component state because VZEROUPPER
+	 * happens on every function end and reset the bitmap to the
+	 * initial configuration.
+	 *
+	 * State decay is introduced to solve the race condition between
+	 * context switch and a function end. State is aggressively set
+	 * once it's detected but need to be cleared by decay 3 context
+	 * switches
+	 */
+	if (xgetbv(XINUSE_STATE_BITMAP_INDEX) & XFEATURE_MASK_Hi16_ZMM) {
+		avx->state = 1;
+		avx->decay_count = AVX_STATE_DECAY_COUNT;
+	} else {
+		if (avx->decay_count)
+			avx->decay_count--;
+		else
+			avx->state = 0;
+	}
+}
 /*
  * This function is called only during boot time when x86 caps are not set
  * up and alternative can not be used yet.
@@ -411,6 +481,7 @@ static inline int copy_fpregs_to_fpstate(struct fpu *fpu)
 {
 	if (likely(use_xsave())) {
 		copy_xregs_to_kernel(&fpu->state.xsave);
+		update_avx_state(&fpu->avx);
 		return 1;
 	}
 
@@ -577,31 +648,5 @@ static inline void user_fpu_begin(void)
 	preempt_enable();
 }
 
-/*
- * MXCSR and XCR definitions:
- */
-
-extern unsigned int mxcsr_feature_mask;
-
-#define XCR_XFEATURE_ENABLED_MASK	0x00000000
-
-static inline u64 xgetbv(u32 index)
-{
-	u32 eax, edx;
-
-	asm volatile(".byte 0x0f,0x01,0xd0" /* xgetbv */
-		     : "=a" (eax), "=d" (edx)
-		     : "c" (index));
-	return eax + ((u64)edx << 32);
-}
-
-static inline void xsetbv(u32 index, u64 value)
-{
-	u32 eax = value;
-	u32 edx = value >> 32;
-
-	asm volatile(".byte 0x0f,0x01,0xd1" /* xsetbv */
-		     : : "a" (eax), "d" (edx), "c" (index));
-}
 
 #endif /* _ASM_X86_FPU_INTERNAL_H */
diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h
index 202c539..39d5bc2 100644
--- a/arch/x86/include/asm/fpu/types.h
+++ b/arch/x86/include/asm/fpu/types.h
@@ -274,6 +274,15 @@ union fpregs_state {
 };
 
 /*
+ * This is per task AVX state data structure that indicates
+ * whether the task uses AVX instructions.
+ */
+struct avx_state {
+	unsigned int			state;
+	unsigned int			decay_count;
+};
+
+/*
  * Highest level per task FPU state data structure that
  * contains the FPU register state plus various FPU
  * state fields:
@@ -303,6 +312,14 @@ struct fpu {
 	unsigned char			initialized;
 
 	/*
+	 * @avx_state:
+	 *
+	 * This data structure indicates whether this context
+	 * contains AVX states
+	 */
+	struct avx_state		avx;
+
+	/*
 	 * @state:
 	 *
 	 * In-memory copy of all FPU registers that we save/restore
-- 
2.7.4


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [RFC PATCH v2 2/2] proc: add /proc/<pid>/thread_state
  2018-11-07 17:16 [RFC PATCH v2 1/2] x86/fpu: detect AVX task Aubrey Li
@ 2018-11-07 17:16 ` Aubrey Li
  2018-11-09 11:21 ` [RFC PATCH v2 1/2] x86/fpu: detect AVX task Thomas Gleixner
  2018-11-12  2:32 ` Dave Hansen
  2 siblings, 0 replies; 11+ messages in thread
From: Aubrey Li @ 2018-11-07 17:16 UTC (permalink / raw)
  To: tglx, mingo, peterz, hpa
  Cc: ak, tim.c.chen, arjan, aubrey.li, linux-kernel, Aubrey Li

Expose the per-task cpu specific thread state value, it's helpful
for userland to classify and schedule the tasks by different policies

Signed-off-by: Aubrey Li <aubrey.li@linux.intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Arjan van de Ven <arjan@linux.intel.com>
---
 arch/x86/kernel/fpu/xstate.c | 13 +++++++++++++
 fs/proc/base.c               | 13 +++++++++++++
 2 files changed, 26 insertions(+)

diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 87a57b7..5a4a1d5 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -7,6 +7,7 @@
 #include <linux/cpu.h>
 #include <linux/mman.h>
 #include <linux/pkeys.h>
+#include <linux/seq_file.h>
 
 #include <asm/fpu/api.h>
 #include <asm/fpu/internal.h>
@@ -1245,3 +1246,15 @@ int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf)
 
 	return 0;
 }
+
+/*
+ * report AVX state
+ */
+void arch_thread_state(struct seq_file *m, struct task_struct *task)
+{
+	if (task->thread.fpu.avx.state)
+		seq_putc(m, '1');
+	else
+		seq_putc(m, '0');
+	seq_putc(m, '\n');
+}
diff --git a/fs/proc/base.c b/fs/proc/base.c
index aaffc0c..be24327 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2893,6 +2893,17 @@ static int proc_pid_patch_state(struct seq_file *m, struct pid_namespace *ns,
 }
 #endif /* CONFIG_LIVEPATCH */
 
+void __weak arch_thread_state(struct seq_file *m, struct task_struct *task)
+{
+}
+
+static int proc_pid_thread_state(struct seq_file *m, struct pid_namespace *ns,
+				 struct pid *pid, struct task_struct *task)
+{
+	arch_thread_state(m, task);
+	return 0;
+}
+
 /*
  * Thread groups
  */
@@ -2994,6 +3005,7 @@ static const struct pid_entry tgid_base_stuff[] = {
 #ifdef CONFIG_LIVEPATCH
 	ONE("patch_state",  S_IRUSR, proc_pid_patch_state),
 #endif
+	ONE("thread_state",  S_IRUSR, proc_pid_thread_state),
 };
 
 static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx)
@@ -3372,6 +3384,7 @@ static const struct pid_entry tid_base_stuff[] = {
 #ifdef CONFIG_LIVEPATCH
 	ONE("patch_state",  S_IRUSR, proc_pid_patch_state),
 #endif
+	ONE("thread_state",  S_IRUSR, proc_pid_thread_state),
 };
 
 static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx)
-- 
2.7.4


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* Re: [RFC PATCH v2 1/2] x86/fpu: detect AVX task
  2018-11-07 17:16 [RFC PATCH v2 1/2] x86/fpu: detect AVX task Aubrey Li
  2018-11-07 17:16 ` [RFC PATCH v2 2/2] proc: add /proc/<pid>/thread_state Aubrey Li
@ 2018-11-09 11:21 ` Thomas Gleixner
  2018-11-12  1:40   ` Li, Aubrey
  2018-11-12  2:32 ` Dave Hansen
  2 siblings, 1 reply; 11+ messages in thread
From: Thomas Gleixner @ 2018-11-09 11:21 UTC (permalink / raw)
  To: Aubrey Li
  Cc: mingo, peterz, hpa, ak, tim.c.chen, arjan, linux-kernel, Aubrey Li

Aubrey,

On Thu, 8 Nov 2018, Aubrey Li wrote:

> Subject: .... x86/fpu: detect AVX task

What is an AVX task? I know what you mean, but for the casual reader this
is not very informative. So something like:

  x86/fpu: Track AVX usage of tasks

would be more informative and precise. The mechanism you add is tracking
the state and not just detecting in.

> XSAVES and its variants use init optimization to reduce the amount of
> data that they save to memory during context switch. Init optimization
> uses the state component bitmap to denote if a component is in its init
> configuration. We use this information to detect if a task contains AVX
> instructions.

Please avoid 'We use..'. Changelogs should be factual and precise and not
be written as first-person narrative.

Aside of that, you very well explained how XSAVES optimization works, but
that's only a small part of the picture. The changelog should also contain
a justification why this is necessary in the first place along with more
missing bits and pieces. And please use paragraphs to structure the
changelog instead of having one big lump.

Something like this:

  User space tools which do automated task placement need information about
  AVX usage of tasks, because of <reasons>....

  The extended control register (XCR) allows access to the XINUSE
  state-component bitmap, which allows software to discover the state of
  the init optimization used by XSAVEOPT and XSAVES. Set bits in the bitmap
  denote the usage of AVX, SIMD, FPU and other components. The XSAVE
  variants store only the state of used components to speed up the
  operation.

  The XGETBV instruction, if supported, allows software to read the
  state-component bitmap and use this information to detect the usage of
  the tracked components.

  Add accessor functions and per task state tracking to context switch.

  The tracking turns on the usage flag immediately, but requires 3
  consecutive context switches with no usage to clear it. This decay is
  required because of <reasons>....

You surely had all this information in your head when writing the code and
the changelog, so you could have spared me to dig all this out of the SDM.

> +/*
> + * This function is called during context switch to update AVX component state
> + */
> +static inline void update_avx_state(struct avx_state *avx)
> +{
> +	/*
> +	 * Check if XGETBV with ECX = 1 supported. XGETBV with ECX = 1
> +	 * returns the logical-AND of XCR0 and XINUSE. XINUSE is a bitmap
> +	 * by which the processor tracks the status of various components.
> +	 */
> +	if (!use_xgetbv1()) {
> +		avx->state = 0;

avx->state should be initialized to 0 when the task starts, so why
does it need to be cleared when XGETBV is not supported?

Also this is the only usage site of use_xgetbv1(). So please open code it
here and avoid the extra inline which does not provide any extra value in
this case.

> +		return;
> +	}
> +	/*
> +	 * XINUSE is dynamic to track component state because VZEROUPPER
> +	 * happens on every function end and reset the bitmap to the
> +	 * initial configuration.
> +	 *
> +	 * State decay is introduced to solve the race condition between
> +	 * context switch and a function end. State is aggressively set
> +	 * once it's detected but need to be cleared by decay 3 context
> +	 * switches

I have no idea what _the_ race condition between context switch and a
function end is about. Probably most readers wont have.

> +	 */
> +	if (xgetbv(XINUSE_STATE_BITMAP_INDEX) & XFEATURE_MASK_Hi16_ZMM) {

In the changelog and also in the code you say AVX (avx->state), but this
actually looks only for Hi16_ZMM state, which are the upper 16 AVX512
registers.

So again, this wants to be documented in the changelog along with an
explanation WHY this check is restricted to Hi16_ZMM state. And please
rename the variable accordingly. This is confusing at best.

> +		avx->state = 1;
> +		avx->decay_count = AVX_STATE_DECAY_COUNT;
> +	} else {
> +		if (avx->decay_count)
> +			avx->decay_count--;
> +		else
> +			avx->state = 0;
> +	}

Is there a reason why you need two variables for this?

	if (xgetbv(XINUSE_STATE_BITMAP_INDEX) & XFEATURE_MASK_Hi16_ZMM)
		tsk->hi16zmm_usage = HI16ZMM_DECAY_COUNT;
	else if (tsk->hi16zmm_usage)
		tsk->hi16zmm_usage--;

and the function which exposes it later to user space can just check
whether tsk->hi16zmm_usage is 0 or not.

This is context switch and we really want to avoid every pointless
instruction we can.

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [RFC PATCH v2 1/2] x86/fpu: detect AVX task
  2018-11-09 11:21 ` [RFC PATCH v2 1/2] x86/fpu: detect AVX task Thomas Gleixner
@ 2018-11-12  1:40   ` Li, Aubrey
  2018-11-13 10:25     ` David Laight
  0 siblings, 1 reply; 11+ messages in thread
From: Li, Aubrey @ 2018-11-12  1:40 UTC (permalink / raw)
  To: Thomas Gleixner, Aubrey Li
  Cc: mingo, peterz, hpa, ak, tim.c.chen, arjan, linux-kernel

On 2018/11/9 19:21, Thomas Gleixner wrote:
> Aubrey,
> 
> On Thu, 8 Nov 2018, Aubrey Li wrote:
> 
>> Subject: .... x86/fpu: detect AVX task
> 
> What is an AVX task? I know what you mean, but for the casual reader this
> is not very informative. So something like:
> 
>   x86/fpu: Track AVX usage of tasks
> 
> would be more informative and precise. The mechanism you add is tracking
> the state and not just detecting in.
> 
>> XSAVES and its variants use init optimization to reduce the amount of
>> data that they save to memory during context switch. Init optimization
>> uses the state component bitmap to denote if a component is in its init
>> configuration. We use this information to detect if a task contains AVX
>> instructions.
> 
> Please avoid 'We use..'. Changelogs should be factual and precise and not
> be written as first-person narrative.
> 
> Aside of that, you very well explained how XSAVES optimization works, but
> that's only a small part of the picture. The changelog should also contain
> a justification why this is necessary in the first place along with more
> missing bits and pieces. And please use paragraphs to structure the
> changelog instead of having one big lump.
> 
> Something like this:
> 
>   User space tools which do automated task placement need information about
>   AVX usage of tasks, because of <reasons>....
> 
>   The extended control register (XCR) allows access to the XINUSE
>   state-component bitmap, which allows software to discover the state of
>   the init optimization used by XSAVEOPT and XSAVES. Set bits in the bitmap
>   denote the usage of AVX, SIMD, FPU and other components. The XSAVE
>   variants store only the state of used components to speed up the
>   operation.
> 
>   The XGETBV instruction, if supported, allows software to read the
>   state-component bitmap and use this information to detect the usage of
>   the tracked components.
> 
>   Add accessor functions and per task state tracking to context switch.
> 
>   The tracking turns on the usage flag immediately, but requires 3
>   consecutive context switches with no usage to clear it. This decay is
>   required because of <reasons>....
> 
> You surely had all this information in your head when writing the code and
> the changelog, so you could have spared me to dig all this out of the SDM.

Thanks Thomas, will try to refine in the next version.

> 
>> +/*
>> + * This function is called during context switch to update AVX component state
>> + */
>> +static inline void update_avx_state(struct avx_state *avx)
>> +{
>> +	/*
>> +	 * Check if XGETBV with ECX = 1 supported. XGETBV with ECX = 1
>> +	 * returns the logical-AND of XCR0 and XINUSE. XINUSE is a bitmap
>> +	 * by which the processor tracks the status of various components.
>> +	 */
>> +	if (!use_xgetbv1()) {
>> +		avx->state = 0;
> 
> avx->state should be initialized to 0 when the task starts, so why
> does it need to be cleared when XGETBV is not supported?

will fix in the next version.

> 
> Also this is the only usage site of use_xgetbv1(). So please open code it
> here and avoid the extra inline which does not provide any extra value in
> this case.

will fix in the next version

> 
>> +		return;
>> +	}
>> +	/*
>> +	 * XINUSE is dynamic to track component state because VZEROUPPER
>> +	 * happens on every function end and reset the bitmap to the
>> +	 * initial configuration.
>> +	 *
>> +	 * State decay is introduced to solve the race condition between
>> +	 * context switch and a function end. State is aggressively set
>> +	 * once it's detected but need to be cleared by decay 3 context
>> +	 * switches
> 
> I have no idea what _the_ race condition between context switch and a
> function end is about. Probably most readers wont have.
> 

VZEROUPPER instruction resets the init state. If context switch happens
to occur exactly after VZEROUPPER instruction, XINUSE bitmap is empty(all
zeros), which indicates the task is not using AVX. That's why the state
decay count is used here.


>> +	 */
>> +	if (xgetbv(XINUSE_STATE_BITMAP_INDEX) & XFEATURE_MASK_Hi16_ZMM) {
> 
> In the changelog and also in the code you say AVX (avx->state), but this
> actually looks only for Hi16_ZMM state, which are the upper 16 AVX512
> registers.
> 
> So again, this wants to be documented in the changelog along with an
> explanation WHY this check is restricted to Hi16_ZMM state. And please
> rename the variable accordingly. This is confusing at best.

okay, I'll try to address this in the next version.


> 
>> +		avx->state = 1;
>> +		avx->decay_count = AVX_STATE_DECAY_COUNT;
>> +	} else {
>> +		if (avx->decay_count)
>> +			avx->decay_count--;
>> +		else
>> +			avx->state = 0;
>> +	}
> 
> Is there a reason why you need two variables for this?
> 
> 	if (xgetbv(XINUSE_STATE_BITMAP_INDEX) & XFEATURE_MASK_Hi16_ZMM)
> 		tsk->hi16zmm_usage = HI16ZMM_DECAY_COUNT;
> 	else if (tsk->hi16zmm_usage)
> 		tsk->hi16zmm_usage--;
> 
> and the function which exposes it later to user space can just check
> whether tsk->hi16zmm_usage is 0 or not.
> 
> This is context switch and we really want to avoid every pointless
> instruction we can.

make sense, will try to refine in the next version.

Thanks,
-Aubrey

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [RFC PATCH v2 1/2] x86/fpu: detect AVX task
  2018-11-07 17:16 [RFC PATCH v2 1/2] x86/fpu: detect AVX task Aubrey Li
  2018-11-07 17:16 ` [RFC PATCH v2 2/2] proc: add /proc/<pid>/thread_state Aubrey Li
  2018-11-09 11:21 ` [RFC PATCH v2 1/2] x86/fpu: detect AVX task Thomas Gleixner
@ 2018-11-12  2:32 ` Dave Hansen
  2018-11-12  5:38   ` Li, Aubrey
  2 siblings, 1 reply; 11+ messages in thread
From: Dave Hansen @ 2018-11-12  2:32 UTC (permalink / raw)
  To: Aubrey Li, tglx, mingo, peterz, hpa
  Cc: ak, tim.c.chen, arjan, linux-kernel, Aubrey Li

On 11/7/18 9:16 AM, Aubrey Li wrote:
> XSAVES and its variants use init optimization to reduce the amount of
> data that they save to memory during context switch. Init optimization
> uses the state component bitmap to denote if a component is in its init
> configuration. We use this information to detect if a task contains AVX
> instructions.

I'm a little uncomfortable with changelogs like this.  Someone might
read this and think that this patch precisely detects AVX instructions.
 It would be great is we could make this more precise to say that this
patch detects if there is valid state in the AVX registers, *not* if the
task contains or uses AVX instructions.

>  arch/x86/include/asm/fpu/internal.h | 97 +++++++++++++++++++++++++++----------
>  arch/x86/include/asm/fpu/types.h    | 17 +++++++
>  2 files changed, 88 insertions(+), 26 deletions(-)
> 
> diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h
> index a38bf5a..b0519f4 100644
> --- a/arch/x86/include/asm/fpu/internal.h
> +++ b/arch/x86/include/asm/fpu/internal.h
> @@ -74,6 +74,12 @@ static __always_inline __pure bool use_fxsr(void)
>  	return static_cpu_has(X86_FEATURE_FXSR);
>  }
>  
> +static __always_inline __pure bool use_xgetbv1(void)
> +{
> +	return static_cpu_has(X86_FEATURE_OSXSAVE) &&
> +		static_cpu_has(X86_FEATURE_XGETBV1);
> +}
> +
>  /*
>   * fpstate handling functions:
>   */
> @@ -103,6 +109,34 @@ static inline void fpstate_init_fxstate(struct fxregs_state *fx)
>  }
>  extern void fpstate_sanitize_xstate(struct fpu *fpu);
>  
> +/*
> + * MXCSR and XCR definitions:
> + */
> +
> +extern unsigned int mxcsr_feature_mask;
> +
> +#define	XCR_XFEATURE_ENABLED_MASK	0x00000000
> +#define	XINUSE_STATE_BITMAP_INDEX	0x00000001
> +
> +static inline u64 xgetbv(u32 index)
> +{
> +	u32 eax, edx;
> +
> +	asm volatile(".byte 0x0f,0x01,0xd0" /* xgetbv */
> +		     : "=a" (eax), "=d" (edx)
> +		     : "c" (index));
> +	return eax + ((u64)edx << 32);
> +}
> +
> +static inline void xsetbv(u32 index, u64 value)
> +{
> +	u32 eax = value;
> +	u32 edx = value >> 32;
> +
> +	asm volatile(".byte 0x0f,0x01,0xd1" /* xsetbv */
> +		     : : "a" (eax), "d" (edx), "c" (index));
> +}
> +
>  #define user_insn(insn, output, input...)				\
>  ({									\
>  	int err;							\
> @@ -275,6 +309,42 @@ static inline void copy_fxregs_to_kernel(struct fpu *fpu)
>  		     : "D" (st), "m" (*st), "a" (lmask), "d" (hmask)	\
>  		     : "memory")
>  
> +#define	AVX_STATE_DECAY_COUNT	3

How was this number chosen?  What does this mean?

It appears that this is saying that after 3 non-AVX-512-using context
switches, the task is not considered to be using AVX512 any more.  That
seems a bit goofy because the context switch rate is highly dependent on
HZ, and on how often the task yields.

Do we want this, or do we want something more time-based?

> +/*
> + * This function is called during context switch to update AVX component state
> + */
> +static inline void update_avx_state(struct avx_state *avx)
> +{
> +	/*
> +	 * Check if XGETBV with ECX = 1 supported. XGETBV with ECX = 1
> +	 * returns the logical-AND of XCR0 and XINUSE. XINUSE is a bitmap
> +	 * by which the processor tracks the status of various components.
> +	 */
> +	if (!use_xgetbv1()) {
> +		avx->state = 0;
> +		return;
> +	}

This is a place where we have conflated the implementation in the CPU
and the logical operation that we are performing.

In this case, it appears that we want to know whether AVX state
detection is available, but we're doing that with a function that's
apparently asking if the kernel is using XGETBV1.

I'd really like if this looked like this:

	if (!have_avx_state_detect()) {
		avx->state = 0;
		return;
	}

Then, in have_avx_state_detect(), explain what XGETBV1 does.  BTW, I
don't think we *totally* need to duplicate the SDM definitions in kernel
code for each instruction.  It's fine to just say that it set 1 for
features not in the init state.

> +	/*
> +	 * XINUSE is dynamic to track component state because VZEROUPPER
> +	 * happens on every function end and reset the bitmap to the
> +	 * initial configuration.

This is confusing to me because VZEROUPPER is not apparently involved
here.  What is this trying to say?

> +	 * State decay is introduced to solve the race condition between
> +	 * context switch and a function end. State is aggressively set
> +	 * once it's detected but need to be cleared by decay 3 context
> +	 * switches
> +	 */

I'd probably say:

	AVX512-using processes frequently set AVX512 back to the init 	
	state themselves.  Thus, this detection mechanism can miss.
	The decay ensures that false-negatives do not immediately make
	a task be considered as not using AVX512.

> +	if (xgetbv(XINUSE_STATE_BITMAP_INDEX) & XFEATURE_MASK_Hi16_ZMM) {

This is *just* an AVX512 state, right?  That isn't reflected in any
comments or naming.  Also, why just this state, and not any of the other
AVX512-related states?

This is also precisely the kind of thing that would be nice to wrap up
in a tiny helper.

	if (avx512_in_use())

is much more self-documenting, for instance.

> +		avx->state = 1;

I'm not a huge fan of this naming.  Could this be:

		avx->had_avx_512_state = true;

> +		avx->decay_count = AVX_STATE_DECAY_COUNT;
> +	} else {
> +		if (avx->decay_count)
> +			avx->decay_count--;
> +		else
> +			avx->state = 0;
> +	}
> +}

This all needs a bunch more commenting.  The state transitions are not
horribly clear.

>  /*
>   * This function is called only during boot time when x86 caps are not set
>   * up and alternative can not be used yet.
> @@ -411,6 +481,7 @@ static inline int copy_fpregs_to_fpstate(struct fpu *fpu)
>  {
>  	if (likely(use_xsave())) {
>  		copy_xregs_to_kernel(&	);
> +		update_avx_state(&fpu->avx);
>  		return 1;
>  	}

I'm not sure why update_avx_state() goes to the trouble of calling
XGETBV1.  I believe the exact same state is captured in the 'xfeatures'
field in the XSAVE buffer after copy_xregs_to_kernel().  Why bother
calling the instruction when you can get the data from memory?

>  /*
> + * This is per task AVX state data structure that indicates
> + * whether the task uses AVX instructions.
> + */

Here's another spot that doesn't precisely capture how his detection
mechanism works.

> +struct avx_state {
> +	unsigned int			state;

Isn't state a 0/1 thing?

> +	unsigned int			decay_count;
> +};

Doesn't this max out at 3?

Seems like we're storing about three bits of data in 64 bits of space.
Not a huge deal, but I think we can do better?

Also, do we really even need 'state'?

When would its value be different than

	fpu->state.xsave.xfeatures & XFEATURE_MASK_Hi16_ZMM

> +/*
>   * Highest level per task FPU state data structure that
>   * contains the FPU register state plus various FPU
>   * state fields:
> @@ -303,6 +312,14 @@ struct fpu {
>  	unsigned char			initialized;
>  
>  	/*
> +	 * @avx_state:
> +	 *
> +	 * This data structure indicates whether this context
> +	 * contains AVX states
> +	 */

Yeah, that's precisely what fpu->state.xsave.xfeatures does. :)

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [RFC PATCH v2 1/2] x86/fpu: detect AVX task
  2018-11-12  2:32 ` Dave Hansen
@ 2018-11-12  5:38   ` Li, Aubrey
  2018-11-12 15:46     ` Dave Hansen
  0 siblings, 1 reply; 11+ messages in thread
From: Li, Aubrey @ 2018-11-12  5:38 UTC (permalink / raw)
  To: Dave Hansen, Aubrey Li, tglx, mingo, peterz, hpa
  Cc: ak, tim.c.chen, arjan, linux-kernel

Hi Dave,

Thanks for your comments!

On 2018/11/12 10:32, Dave Hansen wrote:
> On 11/7/18 9:16 AM, Aubrey Li wrote:
>> XSAVES and its variants use init optimization to reduce the amount of
>> data that they save to memory during context switch. Init optimization
>> uses the state component bitmap to denote if a component is in its init
>> configuration. We use this information to detect if a task contains AVX
>> instructions.
> 
> I'm a little uncomfortable with changelogs like this.  Someone might
> read this and think that this patch precisely detects AVX instructions.
>  It would be great is we could make this more precise to say that this
> patch detects if there is valid state in the AVX registers, *not* if the
> task contains or uses AVX instructions.

Our intention is to figure out if the task contains AVX instructions, this
kind of task causes frequency drop and need attention when dispatched.

If there is a valid state in the AVX registers, we can say the tasks contains
AVX instructions, can't we?

>>  
>> +#define	AVX_STATE_DECAY_COUNT	3
> 
> How was this number chosen?  What does this mean?
> 
> It appears that this is saying that after 3 non-AVX-512-using context
> switches, the task is not considered to be using AVX512 any more.  That
> seems a bit goofy because the context switch rate is highly dependent on
> HZ, and on how often the task yields.
> 
> Do we want this, or do we want something more time-based?
>
This counter is introduced here to solve the race of context switch and
VZEROUPPER. 3 context switches mean the same thread is on-off CPU 3 times.
Due to scheduling latency, 3 jiffies could only happen AVX task on-off just
1 time. So IMHO the context switches number is better here.

> 
>> +/*
>> + * This function is called during context switch to update AVX component state
>> + */
>> +static inline void update_avx_state(struct avx_state *avx)
>> +{
>> +	/*
>> +	 * Check if XGETBV with ECX = 1 supported. XGETBV with ECX = 1
>> +	 * returns the logical-AND of XCR0 and XINUSE. XINUSE is a bitmap
>> +	 * by which the processor tracks the status of various components.
>> +	 */
>> +	if (!use_xgetbv1()) {
>> +		avx->state = 0;
>> +		return;
>> +	}
> 
> This is a place where we have conflated the implementation in the CPU
> and the logical operation that we are performing.
> 
> In this case, it appears that we want to know whether AVX state
> detection is available, but we're doing that with a function that's
> apparently asking if the kernel is using XGETBV1.
> 
> I'd really like if this looked like this:
> 
> 	if (!have_avx_state_detect()) {
> 		avx->state = 0;
> 		return;
> 	}
> 
> Then, in have_avx_state_detect(), explain what XGETBV1 does.  BTW, I
> don't think we *totally* need to duplicate the SDM definitions in kernel
> code for each instruction.  It's fine to just say that it set 1 for
> features not in the init state.
> 

Thomas suggested open this inline since this is only usage of xgetbv1. So I'll
use static_cpu_has() here directly. Does it sound good?

>> +	/*
>> +	 * XINUSE is dynamic to track component state because VZEROUPPER
>> +	 * happens on every function end and reset the bitmap to the
>> +	 * initial configuration.
> 
> This is confusing to me because VZEROUPPER is not apparently involved
> here.  What is this trying to say?
> 
VZERROUPPER instruction in the task resets the bitmap.


>> +	 * State decay is introduced to solve the race condition between
>> +	 * context switch and a function end. State is aggressively set
>> +	 * once it's detected but need to be cleared by decay 3 context
>> +	 * switches
>> +	 */
> 
> I'd probably say:
> 
> 	AVX512-using processes frequently set AVX512 back to the init 	
> 	state themselves.  Thus, this detection mechanism can miss.
> 	The decay ensures that false-negatives do not immediately make
> 	a task be considered as not using AVX512.

Thanks, will refine it in the next version.
And yes, AVX512-using processoes set AVX512 back to the init state themselves
by VZEROUPPER instructions.

> 
>> +	if (xgetbv(XINUSE_STATE_BITMAP_INDEX) & XFEATURE_MASK_Hi16_ZMM) {
> 
> This is *just* an AVX512 state, right?  That isn't reflected in any
> comments or naming.  Also, why just this state, and not any of the other
> AVX512-related states?

Only this state causes significant frequency drop, other states not.
> 
> This is also precisely the kind of thing that would be nice to wrap up
> in a tiny helper.
> 
> 	if (avx512_in_use())
> 
> is much more self-documenting, for instance.

Thanks, will refine it in the next version.

> 
>> +		avx->state = 1;
> 
> I'm not a huge fan of this naming.  Could this be:
> 
> 		avx->had_avx_512_state = true;
> 
>> +		avx->decay_count = AVX_STATE_DECAY_COUNT;
>> +	} else {
>> +		if (avx->decay_count)
>> +			avx->decay_count--;
>> +		else
>> +			avx->state = 0;
>> +	}
>> +}
> 
> This all needs a bunch more commenting.  The state transitions are not
> horribly clear.

Thanks, will refine it in next version.
> 
>>  /*
>>   * This function is called only during boot time when x86 caps are not set
>>   * up and alternative can not be used yet.
>> @@ -411,6 +481,7 @@ static inline int copy_fpregs_to_fpstate(struct fpu *fpu)
>>  {
>>  	if (likely(use_xsave())) {
>>  		copy_xregs_to_kernel(&	);
>> +		update_avx_state(&fpu->avx);
>>  		return 1;
>>  	}
> 
> I'm not sure why update_avx_state() goes to the trouble of calling
> XGETBV1.  I believe the exact same state is captured in the 'xfeatures'
> field in the XSAVE buffer after copy_xregs_to_kernel().  Why bother
> calling the instruction when you can get the data from memory?

Thanks to point this out, I'll check this.
> 
>>  /*
>> + * This is per task AVX state data structure that indicates
>> + * whether the task uses AVX instructions.
>> + */
> 
> Here's another spot that doesn't precisely capture how his detection
> mechanism works.
> 
>> +struct avx_state {
>> +	unsigned int			state;
> 
> Isn't state a 0/1 thing?
> 
>> +	unsigned int			decay_count;
>> +};
> 
> Doesn't this max out at 3?
> 
> Seems like we're storing about three bits of data in 64 bits of space.
> Not a huge deal, but I think we can do better?
> 
> Also, do we really even need 'state'?
> 
> When would its value be different than
> 
> 	fpu->state.xsave.xfeatures & XFEATURE_MASK_Hi16_ZMM

okay, will refine in the next version

> 
>> +/*
>>   * Highest level per task FPU state data structure that
>>   * contains the FPU register state plus various FPU
>>   * state fields:
>> @@ -303,6 +312,14 @@ struct fpu {
>>  	unsigned char			initialized;
>>  
>>  	/*
>> +	 * @avx_state:
>> +	 *
>> +	 * This data structure indicates whether this context
>> +	 * contains AVX states
>> +	 */
> 
> Yeah, that's precisely what fpu->state.xsave.xfeatures does. :)
> I see, will refine in the next version

Thanks,
-Aubrey

^ permalink raw reply	[flat|nested] 11+ messages in thread

* [RFC PATCH v2 1/2] x86/fpu: detect AVX task
  2018-11-12  5:38   ` Li, Aubrey
@ 2018-11-12 15:46     ` Dave Hansen
  2018-11-13  6:42       ` Li, Aubrey
  0 siblings, 1 reply; 11+ messages in thread
From: Dave Hansen @ 2018-11-12 15:46 UTC (permalink / raw)
  To: Li, Aubrey, Aubrey Li, tglx, mingo, peterz, hpa
  Cc: ak, tim.c.chen, arjan, linux-kernel

On 11/11/18 9:38 PM, Li, Aubrey wrote:
> If there is a valid state in the AVX registers, we can say the tasks contains
> AVX instructions, can't we?

XRSTOR, for instance, can take XSAVE state out of the init state, but it
is not necessarily an AVX instruction.  In fact, we had a kernel bug
along the way that was accidentally doing this on systems without AVX.

>>> +#define	AVX_STATE_DECAY_COUNT	3
>>
>> How was this number chosen?  What does this mean?
>>
>> It appears that this is saying that after 3 non-AVX-512-using context
>> switches, the task is not considered to be using AVX512 any more.  That
>> seems a bit goofy because the context switch rate is highly dependent on
>> HZ, and on how often the task yields.
>>
>> Do we want this, or do we want something more time-based?
>>
> This counter is introduced here to solve the race of context switch and
> VZEROUPPER. 3 context switches mean the same thread is on-off CPU 3 times.
> Due to scheduling latency, 3 jiffies could only happen AVX task on-off just
> 1 time. So IMHO the context switches number is better here.

Imagine we have a HZ=1000 system where AVX_STATE_DECAY_COUNT=3.  That
means that a task can be marked as a non-AVX-512-user after not using it
for ~3 ms.  But, with HZ=250, that's ~12ms.

Also, don't forget that we have context switches from the timer
interrupt, but also from normal old operations that sleep.

Let's say our AVX-512 app was doing:

	while (foo) {
		do_avx_512();
		read(pipe, buf, len);
		read(pipe, buf, len);
		read(pipe, buf, len);
	}

And all three pipe reads context-switched the task.  That loop could
finish in way under 3HZ, but still end up in do_avx_512() each time with
fpu...avx->state=0.

BTW, I don't have a great solution for this.  I was just pointing out
one of the pitfalls from using context switch counts so strictly.

>> I'd really like if this looked like this:
>>
>> 	if (!have_avx_state_detect()) {
>> 		avx->state = 0;
>> 		return;
>> 	}
>>
>> Then, in have_avx_state_detect(), explain what XGETBV1 does.  BTW, I
>> don't think we *totally* need to duplicate the SDM definitions in kernel
>> code for each instruction.  It's fine to just say that it set 1 for
>> features not in the init state.
>>
> 
> Thomas suggested open this inline since this is only usage of xgetbv1. So I'll
> use static_cpu_has() here directly. Does it sound good?

Yeah, that's OK.  But, I'd still limit the comments to what it is doing
instead of defining the instruction.

>>> +	/*
>>> +	 * XINUSE is dynamic to track component state because VZEROUPPER
>>> +	 * happens on every function end and reset the bitmap to the
>>> +	 * initial configuration.
>>
>> This is confusing to me because VZEROUPPER is not apparently involved
>> here.  What is this trying to say?
>>
> VZERROUPPER instruction in the task resets the bitmap.

I think this is too much detail for a kernel comment.  Let's just say
that the task itself can get back to the 'init state'.

>>> +	 * State decay is introduced to solve the race condition between
>>> +	 * context switch and a function end. State is aggressively set
>>> +	 * once it's detected but need to be cleared by decay 3 context
>>> +	 * switches
>>> +	 */
>>
>> I'd probably say:
>>
>> 	AVX512-using processes frequently set AVX512 back to the init 	
>> 	state themselves.  Thus, this detection mechanism can miss.
>> 	The decay ensures that false-negatives do not immediately make
>> 	a task be considered as not using AVX512.
> 
> Thanks, will refine it in the next version.
> And yes, AVX512-using processoes set AVX512 back to the init state themselves
> by VZEROUPPER instructions.

... or XRSTOR, or an in-kernel XRSTOR from a signal entry.

The point is, I'd probably not actually mention VZEROUPPER.  It's
certainly the common way to init the state, but let's not the *only* way
we have to be concerned with.

>>> +	if (xgetbv(XINUSE_STATE_BITMAP_INDEX) & XFEATURE_MASK_Hi16_ZMM) {
>>
>> This is *just* an AVX512 state, right?  That isn't reflected in any
>> comments or naming.  Also, why just this state, and not any of the other
>> AVX512-related states?
> 
> Only this state causes significant frequency drop, other states not.

OK.  This is an extremely crucial bit of information to capture,
including the background.

>>> +/*
>>>   * Highest level per task FPU state data structure that
>>>   * contains the FPU register state plus various FPU
>>>   * state fields:
>>> @@ -303,6 +312,14 @@ struct fpu {
>>>  	unsigned char			initialized;
>>>  
>>>  	/*
>>> +	 * @avx_state:
>>> +	 *
>>> +	 * This data structure indicates whether this context
>>> +	 * contains AVX states
>>> +	 */
>>
>> Yeah, that's precisely what fpu->state.xsave.xfeatures does. :)
>> I see, will refine in the next version

One other thought about the new 'avx_state':

fxregs_state (which is a part of the XSAVE state) has some padding and
'sw_reserved' areas.  You *might* be able to steal some space there.
Not that this is a huge space eater, but why waste the space if we don't
have to?



^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [RFC PATCH v2 1/2] x86/fpu: detect AVX task
  2018-11-12 15:46     ` Dave Hansen
@ 2018-11-13  6:42       ` Li, Aubrey
  0 siblings, 0 replies; 11+ messages in thread
From: Li, Aubrey @ 2018-11-13  6:42 UTC (permalink / raw)
  To: Dave Hansen, Aubrey Li, tglx, mingo, peterz, hpa
  Cc: ak, tim.c.chen, arjan, linux-kernel

On 2018/11/12 23:46, Dave Hansen wrote:
> On 11/11/18 9:38 PM, Li, Aubrey wrote:
>
>>> Do we want this, or do we want something more time-based?
>>>
>> This counter is introduced here to solve the race of context switch and
>> VZEROUPPER. 3 context switches mean the same thread is on-off CPU 3 times.
>> Due to scheduling latency, 3 jiffies could only happen AVX task on-off just
>> 1 time. So IMHO the context switches number is better here.
> 
> Imagine we have a HZ=1000 system where AVX_STATE_DECAY_COUNT=3.  That
> means that a task can be marked as a non-AVX-512-user after not using it
> for ~3 ms.  But, with HZ=250, that's ~12ms.

From the other side, if we set a 4ms decay, when HZ=1000, context switch count
is 4, that means, we have 4 times of chance to maintain the AVX state, that is,
we are able to filter 4 times init state reset out. But if HZ = 250, the context
switch is 1, we only have 1 time of chance to filter init state reset out.
> 
> Also, don't forget that we have context switches from the timer
> interrupt, but also from normal old operations that sleep.
> 
> Let's say our AVX-512 app was doing:
> 
> 	while (foo) {
> 		do_avx_512();
> 		read(pipe, buf, len);
> 		read(pipe, buf, len);
> 		read(pipe, buf, len);
> 	}
> 
> And all three pipe reads context-switched the task.  That loop could
> finish in way under 3HZ, but still end up in do_avx_512() each time with
> fpu...avx->state=0.

Yeah, we are trying to address a prediction according to the historical pattern,
so you always can make a pattern to beat the prediction pattern. But in practice,
I measured tensorflow with AVX512 enabled, linpack with AVX512, and a micro 
benchmark, the current 3 context switches decay works well enough.

> 
> BTW, I don't have a great solution for this.  I was just pointing out
> one of the pitfalls from using context switch counts so strictly.

I really don't think time-based is better than the count in this case. 

>>>> +/*
>>>>   * Highest level per task FPU state data structure that
>>>>   * contains the FPU register state plus various FPU
>>>>   * state fields:
>>>> @@ -303,6 +312,14 @@ struct fpu {
>>>>  	unsigned char			initialized;
>>>>  
>>>>  	/*
>>>> +	 * @avx_state:
>>>> +	 *
>>>> +	 * This data structure indicates whether this context
>>>> +	 * contains AVX states
>>>> +	 */
>>>
>>> Yeah, that's precisely what fpu->state.xsave.xfeatures does. :)
>>> I see, will refine in the next version
> 
> One other thought about the new 'avx_state':
> 
> fxregs_state (which is a part of the XSAVE state) has some padding and
> 'sw_reserved' areas.  You *might* be able to steal some space there.
> Not that this is a huge space eater, but why waste the space if we don't
> have to?
> 

IMHO, I prefer not adding any extra thing into a data structure associated
with a hardware table. Let me try to work out a new version to see if it can
satisfy you.

Thanks,
-Aubrey


^ permalink raw reply	[flat|nested] 11+ messages in thread

* RE: [RFC PATCH v2 1/2] x86/fpu: detect AVX task
  2018-11-12  1:40   ` Li, Aubrey
@ 2018-11-13 10:25     ` David Laight
  2018-11-13 13:06       ` Li, Aubrey
  0 siblings, 1 reply; 11+ messages in thread
From: David Laight @ 2018-11-13 10:25 UTC (permalink / raw)
  To: 'Li, Aubrey', Thomas Gleixner, Aubrey Li
  Cc: mingo, peterz, hpa, ak, tim.c.chen, arjan, linux-kernel

From: Li, Aubrey
> Sent: 12 November 2018 01:41
...
> VZEROUPPER instruction resets the init state. If context switch happens
> to occur exactly after VZEROUPPER instruction, XINUSE bitmap is empty(all
> zeros), which indicates the task is not using AVX. That's why the state
> decay count is used here.

Isn't there an obvious optimisation to execute VZEROALL during system call
entry?
If that is done does any of this actually work?

	David
 

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [RFC PATCH v2 1/2] x86/fpu: detect AVX task
  2018-11-13 10:25     ` David Laight
@ 2018-11-13 13:06       ` Li, Aubrey
  2018-11-13 14:56         ` David Laight
  0 siblings, 1 reply; 11+ messages in thread
From: Li, Aubrey @ 2018-11-13 13:06 UTC (permalink / raw)
  To: David Laight, Thomas Gleixner, Aubrey Li
  Cc: mingo, peterz, hpa, ak, tim.c.chen, arjan, linux-kernel

On 2018/11/13 18:25, David Laight wrote:
> From: Li, Aubrey
>> Sent: 12 November 2018 01:41
> ...
>> VZEROUPPER instruction resets the init state. If context switch happens
>> to occur exactly after VZEROUPPER instruction, XINUSE bitmap is empty(all
>> zeros), which indicates the task is not using AVX. That's why the state
>> decay count is used here.
> 
> Isn't there an obvious optimisation to execute VZEROALL during system call
> entry?

I'm not aware of this in the kernel, maybe you are talking about some 
optimization in glibc?

> If that is done does any of this actually work?

The bitmap is checked during context switch, not system call entry.

Also, the flag is turned on immediately once it's detected, but requires 3
*consecutive* context switches with no usage to clear. So it could filter
most jitter patterns out.

I measured tensorflow(with AVX512), linpack(with AVX512) and a micro
benchmark, this works properly.

If you have a AVX512 workload, I'd like to know if this works for it.

Thanks,
-Aubrey
> 
> 	David
>  
> 
> -
> Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
> Registration No: 1397386 (Wales)
> 


^ permalink raw reply	[flat|nested] 11+ messages in thread

* RE: [RFC PATCH v2 1/2] x86/fpu: detect AVX task
  2018-11-13 13:06       ` Li, Aubrey
@ 2018-11-13 14:56         ` David Laight
  0 siblings, 0 replies; 11+ messages in thread
From: David Laight @ 2018-11-13 14:56 UTC (permalink / raw)
  To: 'Li, Aubrey', Thomas Gleixner, Aubrey Li
  Cc: mingo, peterz, hpa, ak, tim.c.chen, arjan, linux-kernel

From: Li, Aubrey
> Sent: 13 November 2018 13:07
...
> > Isn't there an obvious optimisation to execute VZEROALL during system call
> > entry?
> 
> I'm not aware of this in the kernel, maybe you are talking about some
> optimization in glibc?

I've not seen it anywhere either.

IIRC all the xmm and ymm registers are 'caller saved'.
Since system calls all look like function calls the compiler
has to assume that the registers can be modified.
This means that it is safe for the system call code to change them.
It mustn't leak kernel values out to userspace, but it can zero them.

So adding VZEROALL on system call entry would save the FP save code having
to save and restore any of the xmm/ymm registers.
IIRC the hardware just saves a flag indicating that they are all zero.

All the registers do need saving if the kernel is entered by an
interrupt or fault.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)

^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2018-11-13 14:56 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-11-07 17:16 [RFC PATCH v2 1/2] x86/fpu: detect AVX task Aubrey Li
2018-11-07 17:16 ` [RFC PATCH v2 2/2] proc: add /proc/<pid>/thread_state Aubrey Li
2018-11-09 11:21 ` [RFC PATCH v2 1/2] x86/fpu: detect AVX task Thomas Gleixner
2018-11-12  1:40   ` Li, Aubrey
2018-11-13 10:25     ` David Laight
2018-11-13 13:06       ` Li, Aubrey
2018-11-13 14:56         ` David Laight
2018-11-12  2:32 ` Dave Hansen
2018-11-12  5:38   ` Li, Aubrey
2018-11-12 15:46     ` Dave Hansen
2018-11-13  6:42       ` Li, Aubrey

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).