All of lore.kernel.org
 help / color / mirror / Atom feed
* [patch 0/8] x86/entry: Consolidation - Part II
@ 2020-02-25 22:08 Thomas Gleixner
  2020-02-25 22:08 ` [patch 1/8] x86/entry/64: Trace irqflags unconditionally on when returning to user space Thomas Gleixner
                   ` (7 more replies)
  0 siblings, 8 replies; 56+ messages in thread
From: Thomas Gleixner @ 2020-02-25 22:08 UTC (permalink / raw)
  To: LKML
  Cc: x86, Steven Rostedt, Brian Gerst, Juergen Gross, Paolo Bonzini,
	Arnd Bergmann

Hi!

This is the second batch of a 73-patch series which consolidates the x86
entry code. The larger explanation is in the part I cover letter:

 https://lore.kernel.org/r/20200225213636.689276920@linutronix.de

It applies on top of part I, which can be found via the above link.

This part cleans up the entry code and lifts the irq tracing and entry/exit
work into C, which is a preliminary to make especially the exit work a
generic infrastructure.

It has some rough edges as some of the ASM code is shared with
exceptions/traps/interrupts, which will be addressed in later parts of the
series.

This applies on top of part one which is available here:

   git://git.kernel.org/pub/scm/linux/kernel/git/tglx/devel.git entry-v1-part1

To get both part 1 and part 2 pull from here:

   git://git.kernel.org/pub/scm/linux/kernel/git/tglx/devel.git entry-v1-part2

Thanks,

	tglx

8<---------------
 common.c          |   82 ++++++++++++++++++++++++++++++++++++++++--------------
 entry_32.S        |   24 ++-------------
 entry_64.S        |    6 ---
 entry_64_compat.S |   32 +++------------------
 4 files changed, 71 insertions(+), 73 deletions(-)





^ permalink raw reply	[flat|nested] 56+ messages in thread

* [patch 1/8] x86/entry/64: Trace irqflags unconditionally on when returning to user space
  2020-02-25 22:08 [patch 0/8] x86/entry: Consolidation - Part II Thomas Gleixner
@ 2020-02-25 22:08 ` Thomas Gleixner
  2020-02-27 19:49   ` Borislav Petkov
                     ` (2 more replies)
  2020-02-25 22:08 ` [patch 2/8] x86/entry/common: Consolidate syscall entry code Thomas Gleixner
                   ` (6 subsequent siblings)
  7 siblings, 3 replies; 56+ messages in thread
From: Thomas Gleixner @ 2020-02-25 22:08 UTC (permalink / raw)
  To: LKML
  Cc: x86, Steven Rostedt, Brian Gerst, Juergen Gross, Paolo Bonzini,
	Arnd Bergmann

User space can no longer disable interrupts, so trace return to user space
unconditionally as IRQS_ON.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/entry/entry_64.S |    4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -174,7 +174,7 @@ SYM_INNER_LABEL(entry_SYSCALL_64_after_h
 	movq	%rsp, %rsi
 	call	do_syscall_64		/* returns with IRQs disabled */
 
-	TRACE_IRQS_IRETQ		/* we're about to change IF */
+	TRACE_IRQS_ON			/* return enables interrupts */
 
 	/*
 	 * Try to use SYSRET instead of IRET if we're returning to
@@ -619,7 +619,7 @@ SYM_CODE_START_LOCAL(common_interrupt)
 .Lretint_user:
 	mov	%rsp,%rdi
 	call	prepare_exit_to_usermode
-	TRACE_IRQS_IRETQ
+	TRACE_IRQS_ON
 
 SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL)
 #ifdef CONFIG_DEBUG_ENTRY


^ permalink raw reply	[flat|nested] 56+ messages in thread

* [patch 2/8] x86/entry/common: Consolidate syscall entry code
  2020-02-25 22:08 [patch 0/8] x86/entry: Consolidation - Part II Thomas Gleixner
  2020-02-25 22:08 ` [patch 1/8] x86/entry/64: Trace irqflags unconditionally on when returning to user space Thomas Gleixner
@ 2020-02-25 22:08 ` Thomas Gleixner
  2020-02-27 19:57   ` Borislav Petkov
                     ` (2 more replies)
  2020-02-25 22:08 ` [patch 3/8] x86/entry/common: Mark syscall entry points notrace/nokprobe Thomas Gleixner
                   ` (5 subsequent siblings)
  7 siblings, 3 replies; 56+ messages in thread
From: Thomas Gleixner @ 2020-02-25 22:08 UTC (permalink / raw)
  To: LKML
  Cc: x86, Steven Rostedt, Brian Gerst, Juergen Gross, Paolo Bonzini,
	Arnd Bergmann

All syscall entry points call enter_from_user_mode() and
local_irq_enable(). Move that into an inline helper so the interrupt
tracing can be moved into that helper later instead of sprinkling
it all over the place.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/entry/common.c |   48 +++++++++++++++++++++++++++++++++---------------
 1 file changed, 33 insertions(+), 15 deletions(-)

--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -49,6 +49,18 @@
 static inline void enter_from_user_mode(void) {}
 #endif
 
+/*
+ * All syscall entry variants call with interrupts disabled.
+ *
+ * Invoke context tracking if enabled and enable interrupts for further
+ * processing.
+ */
+static __always_inline void syscall_entry_fixups(void)
+{
+	enter_from_user_mode();
+	local_irq_enable();
+}
+
 static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch)
 {
 #ifdef CONFIG_X86_64
@@ -279,13 +291,11 @@ static void syscall_slow_exit_work(struc
 }
 
 #ifdef CONFIG_X86_64
-__visible void do_syscall_64(unsigned long nr, struct pt_regs *regs)
+static __always_inline
+void do_syscall_64_irqs_on(unsigned long nr, struct pt_regs *regs)
 {
-	struct thread_info *ti;
+	struct thread_info *ti = current_thread_info();
 
-	enter_from_user_mode();
-	local_irq_enable();
-	ti = current_thread_info();
 	if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY)
 		nr = syscall_trace_enter(regs);
 
@@ -303,6 +313,12 @@ static void syscall_slow_exit_work(struc
 
 	syscall_return_slowpath(regs);
 }
+
+__visible void do_syscall_64(unsigned long nr, struct pt_regs *regs)
+{
+	syscall_entry_fixups();
+	do_syscall_64_irqs_on(nr, regs);
+}
 #endif
 
 #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
@@ -355,19 +371,17 @@ static __always_inline void do_syscall_3
 /* Handles int $0x80 */
 __visible void do_int80_syscall_32(struct pt_regs *regs)
 {
-	enter_from_user_mode();
-	local_irq_enable();
+	syscall_entry_fixups();
 	do_syscall_32_irqs_on(regs);
 }
 
-/* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */
-__visible long do_fast_syscall_32(struct pt_regs *regs)
+/* Fast syscall 32bit variant */
+static __always_inline long do_fast_syscall_32_irqs_on(struct pt_regs *regs)
 {
 	/*
 	 * Called using the internal vDSO SYSENTER/SYSCALL32 calling
 	 * convention.  Adjust regs so it looks like we entered using int80.
 	 */
-
 	unsigned long landing_pad = (unsigned long)current->mm->context.vdso +
 		vdso_image_32.sym_int80_landing_pad;
 
@@ -378,10 +392,6 @@ static __always_inline void do_syscall_3
 	 */
 	regs->ip = landing_pad;
 
-	enter_from_user_mode();
-
-	local_irq_enable();
-
 	/* Fetch EBP from where the vDSO stashed it. */
 	if (
 #ifdef CONFIG_X86_64
@@ -437,4 +447,12 @@ static __always_inline void do_syscall_3
 		(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF | X86_EFLAGS_VM)) == 0;
 #endif
 }
-#endif
+
+/* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */
+__visible long do_fast_syscall_32(struct pt_regs *regs)
+{
+	syscall_entry_fixups();
+	return do_fast_syscall_32_irqs_on(regs);
+}
+
+#endif /* CONFIG_X86_32 || CONFIG_IA32_EMULATION */


^ permalink raw reply	[flat|nested] 56+ messages in thread

* [patch 3/8] x86/entry/common: Mark syscall entry points notrace/nokprobe
  2020-02-25 22:08 [patch 0/8] x86/entry: Consolidation - Part II Thomas Gleixner
  2020-02-25 22:08 ` [patch 1/8] x86/entry/64: Trace irqflags unconditionally on when returning to user space Thomas Gleixner
  2020-02-25 22:08 ` [patch 2/8] x86/entry/common: Consolidate syscall entry code Thomas Gleixner
@ 2020-02-25 22:08 ` Thomas Gleixner
  2020-02-27 23:15   ` Frederic Weisbecker
  2020-02-28  8:59   ` Alexandre Chartre
  2020-02-25 22:08 ` [patch 4/8] x86/entry: Move irq tracing on syscall entry to C-code Thomas Gleixner
                   ` (4 subsequent siblings)
  7 siblings, 2 replies; 56+ messages in thread
From: Thomas Gleixner @ 2020-02-25 22:08 UTC (permalink / raw)
  To: LKML
  Cc: x86, Steven Rostedt, Brian Gerst, Juergen Gross, Paolo Bonzini,
	Arnd Bergmann

Anything before enter_from_user_mode() is not safe to be traced or probed.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/entry/common.c |    9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -314,11 +314,12 @@ void do_syscall_64_irqs_on(unsigned long
 	syscall_return_slowpath(regs);
 }
 
-__visible void do_syscall_64(unsigned long nr, struct pt_regs *regs)
+__visible notrace void do_syscall_64(unsigned long nr, struct pt_regs *regs)
 {
 	syscall_entry_fixups();
 	do_syscall_64_irqs_on(nr, regs);
 }
+NOKPROBE_SYMBOL(do_syscall_64);
 #endif
 
 #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
@@ -369,11 +370,12 @@ static __always_inline void do_syscall_3
 }
 
 /* Handles int $0x80 */
-__visible void do_int80_syscall_32(struct pt_regs *regs)
+__visible notrace void do_int80_syscall_32(struct pt_regs *regs)
 {
 	syscall_entry_fixups();
 	do_syscall_32_irqs_on(regs);
 }
+NOKPROBE_SYMBOL(do_int80_syscall_32);
 
 /* Fast syscall 32bit variant */
 static __always_inline long do_fast_syscall_32_irqs_on(struct pt_regs *regs)
@@ -449,10 +451,11 @@ static __always_inline long do_fast_sysc
 }
 
 /* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */
-__visible long do_fast_syscall_32(struct pt_regs *regs)
+__visible notrace long do_fast_syscall_32(struct pt_regs *regs)
 {
 	syscall_entry_fixups();
 	return do_fast_syscall_32_irqs_on(regs);
 }
+NOKPROBE_SYMBOL(do_fast_syscall_32);
 
 #endif /* CONFIG_X86_32 || CONFIG_IA32_EMULATION */


^ permalink raw reply	[flat|nested] 56+ messages in thread

* [patch 4/8] x86/entry: Move irq tracing on syscall entry to C-code
  2020-02-25 22:08 [patch 0/8] x86/entry: Consolidation - Part II Thomas Gleixner
                   ` (2 preceding siblings ...)
  2020-02-25 22:08 ` [patch 3/8] x86/entry/common: Mark syscall entry points notrace/nokprobe Thomas Gleixner
@ 2020-02-25 22:08 ` Thomas Gleixner
  2020-02-26  5:43   ` Andy Lutomirski
                     ` (2 more replies)
  2020-02-25 22:08 ` [patch 5/8] x86/entry/common: Provide trace/kprobe safe exit to user space functions Thomas Gleixner
                   ` (3 subsequent siblings)
  7 siblings, 3 replies; 56+ messages in thread
From: Thomas Gleixner @ 2020-02-25 22:08 UTC (permalink / raw)
  To: LKML
  Cc: x86, Steven Rostedt, Brian Gerst, Juergen Gross, Paolo Bonzini,
	Arnd Bergmann

Now that the C entry points are safe, move the irq flags tracing code into
the entry helper.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/entry/common.c          |    5 +++++
 arch/x86/entry/entry_32.S        |   12 ------------
 arch/x86/entry/entry_64.S        |    2 --
 arch/x86/entry/entry_64_compat.S |   18 ------------------
 4 files changed, 5 insertions(+), 32 deletions(-)

--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -57,6 +57,11 @@ static inline void enter_from_user_mode(
  */
 static __always_inline void syscall_entry_fixups(void)
 {
+	/*
+	 * Usermode is traced as interrupts enabled, but the syscall entry
+	 * mechanisms disable interrupts. Tell the tracer.
+	 */
+	trace_hardirqs_off();
 	enter_from_user_mode();
 	local_irq_enable();
 }
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -960,12 +960,6 @@ SYM_FUNC_START(entry_SYSENTER_32)
 	jnz	.Lsysenter_fix_flags
 .Lsysenter_flags_fixed:
 
-	/*
-	 * User mode is traced as though IRQs are on, and SYSENTER
-	 * turned them off.
-	 */
-	TRACE_IRQS_OFF
-
 	movl	%esp, %eax
 	call	do_fast_syscall_32
 	/* XEN PV guests always use IRET path */
@@ -1075,12 +1069,6 @@ SYM_FUNC_START(entry_INT80_32)
 
 	SAVE_ALL pt_regs_ax=$-ENOSYS switch_stacks=1	/* save rest */
 
-	/*
-	 * User mode is traced as though IRQs are on, and the interrupt gate
-	 * turned them off.
-	 */
-	TRACE_IRQS_OFF
-
 	movl	%esp, %eax
 	call	do_int80_syscall_32
 .Lsyscall_32_done:
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -167,8 +167,6 @@ SYM_INNER_LABEL(entry_SYSCALL_64_after_h
 
 	PUSH_AND_CLEAR_REGS rax=$-ENOSYS
 
-	TRACE_IRQS_OFF
-
 	/* IRQs are off. */
 	movq	%rax, %rdi
 	movq	%rsp, %rsi
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -129,12 +129,6 @@ SYM_FUNC_START(entry_SYSENTER_compat)
 	jnz	.Lsysenter_fix_flags
 .Lsysenter_flags_fixed:
 
-	/*
-	 * User mode is traced as though IRQs are on, and SYSENTER
-	 * turned them off.
-	 */
-	TRACE_IRQS_OFF
-
 	movq	%rsp, %rdi
 	call	do_fast_syscall_32
 	/* XEN PV guests always use IRET path */
@@ -247,12 +241,6 @@ SYM_INNER_LABEL(entry_SYSCALL_compat_aft
 	pushq   $0			/* pt_regs->r15 = 0 */
 	xorl	%r15d, %r15d		/* nospec   r15 */
 
-	/*
-	 * User mode is traced as though IRQs are on, and SYSENTER
-	 * turned them off.
-	 */
-	TRACE_IRQS_OFF
-
 	movq	%rsp, %rdi
 	call	do_fast_syscall_32
 	/* XEN PV guests always use IRET path */
@@ -403,12 +391,6 @@ SYM_CODE_START(entry_INT80_compat)
 	xorl	%r15d, %r15d		/* nospec   r15 */
 	cld
 
-	/*
-	 * User mode is traced as though IRQs are on, and the interrupt
-	 * gate turned them off.
-	 */
-	TRACE_IRQS_OFF
-
 	movq	%rsp, %rdi
 	call	do_int80_syscall_32
 .Lsyscall_32_done:


^ permalink raw reply	[flat|nested] 56+ messages in thread

* [patch 5/8] x86/entry/common: Provide trace/kprobe safe exit to user space functions
  2020-02-25 22:08 [patch 0/8] x86/entry: Consolidation - Part II Thomas Gleixner
                   ` (3 preceding siblings ...)
  2020-02-25 22:08 ` [patch 4/8] x86/entry: Move irq tracing on syscall entry to C-code Thomas Gleixner
@ 2020-02-25 22:08 ` Thomas Gleixner
  2020-02-26  5:45   ` Andy Lutomirski
  2020-02-27 15:43   ` Alexandre Chartre
  2020-02-25 22:08 ` [patch 6/8] x86/entry: Move irq tracing to syscall_slow_exit_work Thomas Gleixner
                   ` (2 subsequent siblings)
  7 siblings, 2 replies; 56+ messages in thread
From: Thomas Gleixner @ 2020-02-25 22:08 UTC (permalink / raw)
  To: LKML
  Cc: x86, Steven Rostedt, Brian Gerst, Juergen Gross, Paolo Bonzini,
	Arnd Bergmann

Split prepare_exit_to_usermode() and mark it notrace/noprobe so the irq
flags tracing on return can be put into it.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/entry/common.c |   15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -196,7 +196,7 @@ static void exit_to_usermode_loop(struct
 }
 
 /* Called with IRQs disabled. */
-__visible inline void prepare_exit_to_usermode(struct pt_regs *regs)
+static inline void __prepare_exit_to_usermode(struct pt_regs *regs)
 {
 	struct thread_info *ti = current_thread_info();
 	u32 cached_flags;
@@ -241,6 +241,12 @@ static void exit_to_usermode_loop(struct
 	mds_user_clear_cpu_buffers();
 }
 
+__visible inline notrace void prepare_exit_to_usermode(struct pt_regs *regs)
+{
+	__prepare_exit_to_usermode(regs);
+}
+NOKPROBE_SYMBOL(prepare_exit_to_usermode);
+
 #define SYSCALL_EXIT_WORK_FLAGS				\
 	(_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT |	\
 	 _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT)
@@ -271,7 +277,7 @@ static void syscall_slow_exit_work(struc
  * Called with IRQs on and fully valid regs.  Returns with IRQs off in a
  * state such that we can immediately switch to user mode.
  */
-__visible inline void syscall_return_slowpath(struct pt_regs *regs)
+__visible inline notrace void syscall_return_slowpath(struct pt_regs *regs)
 {
 	struct thread_info *ti = current_thread_info();
 	u32 cached_flags = READ_ONCE(ti->flags);
@@ -292,8 +298,9 @@ static void syscall_slow_exit_work(struc
 		syscall_slow_exit_work(regs, cached_flags);
 
 	local_irq_disable();
-	prepare_exit_to_usermode(regs);
+	__prepare_exit_to_usermode(regs);
 }
+NOKPROBE_SYMBOL(syscall_return_slowpath);
 
 #ifdef CONFIG_X86_64
 static __always_inline
@@ -417,7 +424,7 @@ static __always_inline long do_fast_sysc
 		/* User code screwed up. */
 		local_irq_disable();
 		regs->ax = -EFAULT;
-		prepare_exit_to_usermode(regs);
+		__prepare_exit_to_usermode(regs);
 		return 0;	/* Keep it simple: use IRET. */
 	}
 


^ permalink raw reply	[flat|nested] 56+ messages in thread

* [patch 6/8] x86/entry: Move irq tracing to syscall_slow_exit_work
  2020-02-25 22:08 [patch 0/8] x86/entry: Consolidation - Part II Thomas Gleixner
                   ` (4 preceding siblings ...)
  2020-02-25 22:08 ` [patch 5/8] x86/entry/common: Provide trace/kprobe safe exit to user space functions Thomas Gleixner
@ 2020-02-25 22:08 ` Thomas Gleixner
  2020-02-26  5:47   ` Andy Lutomirski
  2020-02-27 16:12   ` Alexandre Chartre
  2020-02-25 22:08 ` [patch 7/8] x86/entry: Move irq tracing to prepare_exit_to_user_mode() Thomas Gleixner
  2020-02-25 22:08 ` [patch 8/8] x86/entry: Move irqflags tracing to do_int80_syscall_32() Thomas Gleixner
  7 siblings, 2 replies; 56+ messages in thread
From: Thomas Gleixner @ 2020-02-25 22:08 UTC (permalink / raw)
  To: LKML
  Cc: x86, Steven Rostedt, Brian Gerst, Juergen Gross, Paolo Bonzini,
	Arnd Bergmann

which removes the ASM IRQ tracepoints.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/entry/common.c   |    2 ++
 arch/x86/entry/entry_32.S |    3 ++-
 arch/x86/entry/entry_64.S |    3 ---
 3 files changed, 4 insertions(+), 4 deletions(-)

--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -299,6 +299,8 @@ static void syscall_slow_exit_work(struc
 
 	local_irq_disable();
 	__prepare_exit_to_usermode(regs);
+	/* Return to user space enables interrupts */
+	trace_hardirqs_on();
 }
 NOKPROBE_SYMBOL(syscall_return_slowpath);
 
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -812,7 +812,7 @@ SYM_CODE_START(ret_from_fork)
 	movl    %esp, %eax
 	call    syscall_return_slowpath
 	STACKLEAK_ERASE
-	jmp     restore_all
+	jmp     restore_all_switch_stack
 
 	/* kernel thread */
 1:	movl	%edi, %eax
@@ -1077,6 +1077,7 @@ SYM_FUNC_START(entry_INT80_32)
 
 restore_all:
 	TRACE_IRQS_IRET
+restore_all_switch_stack:
 	SWITCH_TO_ENTRY_STACK
 	CHECK_AND_APPLY_ESPFIX
 .Lrestore_nocheck:
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -172,8 +172,6 @@ SYM_INNER_LABEL(entry_SYSCALL_64_after_h
 	movq	%rsp, %rsi
 	call	do_syscall_64		/* returns with IRQs disabled */
 
-	TRACE_IRQS_ON			/* return enables interrupts */
-
 	/*
 	 * Try to use SYSRET instead of IRET if we're returning to
 	 * a completely clean 64-bit userspace context.  If we're not,
@@ -340,7 +338,6 @@ SYM_CODE_START(ret_from_fork)
 	UNWIND_HINT_REGS
 	movq	%rsp, %rdi
 	call	syscall_return_slowpath	/* returns with IRQs disabled */
-	TRACE_IRQS_ON			/* user mode is traced as IRQS on */
 	jmp	swapgs_restore_regs_and_return_to_usermode
 
 1:


^ permalink raw reply	[flat|nested] 56+ messages in thread

* [patch 7/8] x86/entry: Move irq tracing to prepare_exit_to_user_mode()
  2020-02-25 22:08 [patch 0/8] x86/entry: Consolidation - Part II Thomas Gleixner
                   ` (5 preceding siblings ...)
  2020-02-25 22:08 ` [patch 6/8] x86/entry: Move irq tracing to syscall_slow_exit_work Thomas Gleixner
@ 2020-02-25 22:08 ` Thomas Gleixner
  2020-02-26  5:50   ` Andy Lutomirski
  2020-02-25 22:08 ` [patch 8/8] x86/entry: Move irqflags tracing to do_int80_syscall_32() Thomas Gleixner
  7 siblings, 1 reply; 56+ messages in thread
From: Thomas Gleixner @ 2020-02-25 22:08 UTC (permalink / raw)
  To: LKML
  Cc: x86, Steven Rostedt, Brian Gerst, Juergen Gross, Paolo Bonzini,
	Arnd Bergmann

which again gets it out of the ASM code.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/entry/common.c   |    1 +
 arch/x86/entry/entry_32.S |    2 +-
 arch/x86/entry/entry_64.S |    1 -
 3 files changed, 2 insertions(+), 2 deletions(-)

--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -244,6 +244,7 @@ static inline void __prepare_exit_to_use
 __visible inline notrace void prepare_exit_to_usermode(struct pt_regs *regs)
 {
 	__prepare_exit_to_usermode(regs);
+	trace_hardirqs_on();
 }
 NOKPROBE_SYMBOL(prepare_exit_to_usermode);
 
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -855,7 +855,7 @@ SYM_CODE_START_LOCAL(ret_from_exception)
 	TRACE_IRQS_OFF
 	movl	%esp, %eax
 	call	prepare_exit_to_usermode
-	jmp	restore_all
+	jmp	restore_all_switch_stack
 SYM_CODE_END(ret_from_exception)
 
 SYM_ENTRY(__begin_SYSENTER_singlestep_region, SYM_L_GLOBAL, SYM_A_NONE)
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -614,7 +614,6 @@ SYM_CODE_START_LOCAL(common_interrupt)
 .Lretint_user:
 	mov	%rsp,%rdi
 	call	prepare_exit_to_usermode
-	TRACE_IRQS_ON
 
 SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL)
 #ifdef CONFIG_DEBUG_ENTRY


^ permalink raw reply	[flat|nested] 56+ messages in thread

* [patch 8/8] x86/entry: Move irqflags tracing to do_int80_syscall_32()
  2020-02-25 22:08 [patch 0/8] x86/entry: Consolidation - Part II Thomas Gleixner
                   ` (6 preceding siblings ...)
  2020-02-25 22:08 ` [patch 7/8] x86/entry: Move irq tracing to prepare_exit_to_user_mode() Thomas Gleixner
@ 2020-02-25 22:08 ` Thomas Gleixner
  2020-02-27 16:46   ` Alexandre Chartre
  7 siblings, 1 reply; 56+ messages in thread
From: Thomas Gleixner @ 2020-02-25 22:08 UTC (permalink / raw)
  To: LKML
  Cc: x86, Steven Rostedt, Brian Gerst, Juergen Gross, Paolo Bonzini,
	Arnd Bergmann

which cleans up the ASM maze.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/entry/common.c          |    8 +++++++-
 arch/x86/entry/entry_32.S        |    9 ++-------
 arch/x86/entry/entry_64_compat.S |   14 +++++---------
 3 files changed, 14 insertions(+), 17 deletions(-)

--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -333,6 +333,7 @@ void do_syscall_64_irqs_on(unsigned long
 {
 	syscall_entry_fixups();
 	do_syscall_64_irqs_on(nr, regs);
+	trace_hardirqs_on();
 }
 NOKPROBE_SYMBOL(do_syscall_64);
 #endif
@@ -389,6 +390,7 @@ static __always_inline void do_syscall_3
 {
 	syscall_entry_fixups();
 	do_syscall_32_irqs_on(regs);
+	trace_hardirqs_on();
 }
 NOKPROBE_SYMBOL(do_int80_syscall_32);
 
@@ -468,8 +470,12 @@ static __always_inline long do_fast_sysc
 /* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */
 __visible notrace long do_fast_syscall_32(struct pt_regs *regs)
 {
+	long ret;
+
 	syscall_entry_fixups();
-	return do_fast_syscall_32_irqs_on(regs);
+	ret = do_fast_syscall_32_irqs_on(regs);
+	trace_hardirqs_on();
+	return ret;
 }
 NOKPROBE_SYMBOL(do_fast_syscall_32);
 
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -811,8 +811,7 @@ SYM_CODE_START(ret_from_fork)
 	/* When we fork, we trace the syscall return in the child, too. */
 	movl    %esp, %eax
 	call    syscall_return_slowpath
-	STACKLEAK_ERASE
-	jmp     restore_all_switch_stack
+	jmp     .Lsyscall_32_done
 
 	/* kernel thread */
 1:	movl	%edi, %eax
@@ -968,8 +967,7 @@ SYM_FUNC_START(entry_SYSENTER_32)
 
 	STACKLEAK_ERASE
 
-/* Opportunistic SYSEXIT */
-	TRACE_IRQS_ON			/* User mode traces as IRQs on. */
+	/* Opportunistic SYSEXIT */
 
 	/*
 	 * Setup entry stack - we keep the pointer in %eax and do the
@@ -1072,11 +1070,8 @@ SYM_FUNC_START(entry_INT80_32)
 	movl	%esp, %eax
 	call	do_int80_syscall_32
 .Lsyscall_32_done:
-
 	STACKLEAK_ERASE
 
-restore_all:
-	TRACE_IRQS_IRET
 restore_all_switch_stack:
 	SWITCH_TO_ENTRY_STACK
 	CHECK_AND_APPLY_ESPFIX
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -132,8 +132,8 @@ SYM_FUNC_START(entry_SYSENTER_compat)
 	movq	%rsp, %rdi
 	call	do_fast_syscall_32
 	/* XEN PV guests always use IRET path */
-	ALTERNATIVE "testl %eax, %eax; jz .Lsyscall_32_done", \
-		    "jmp .Lsyscall_32_done", X86_FEATURE_XENPV
+	ALTERNATIVE "testl %eax, %eax; jz swapgs_restore_regs_and_return_to_usermode", \
+		    "jmp swapgs_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV
 	jmp	sysret32_from_system_call
 
 .Lsysenter_fix_flags:
@@ -244,8 +244,8 @@ SYM_INNER_LABEL(entry_SYSCALL_compat_aft
 	movq	%rsp, %rdi
 	call	do_fast_syscall_32
 	/* XEN PV guests always use IRET path */
-	ALTERNATIVE "testl %eax, %eax; jz .Lsyscall_32_done", \
-		    "jmp .Lsyscall_32_done", X86_FEATURE_XENPV
+	ALTERNATIVE "testl %eax, %eax; jz swapgs_restore_regs_and_return_to_usermode", \
+		    "jmp swapgs_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV
 
 	/* Opportunistic SYSRET */
 sysret32_from_system_call:
@@ -254,7 +254,7 @@ SYM_INNER_LABEL(entry_SYSCALL_compat_aft
 	 * stack. So let's erase the thread stack right now.
 	 */
 	STACKLEAK_ERASE
-	TRACE_IRQS_ON			/* User mode traces as IRQs on. */
+
 	movq	RBX(%rsp), %rbx		/* pt_regs->rbx */
 	movq	RBP(%rsp), %rbp		/* pt_regs->rbp */
 	movq	EFLAGS(%rsp), %r11	/* pt_regs->flags (in r11) */
@@ -393,9 +393,5 @@ SYM_CODE_START(entry_INT80_compat)
 
 	movq	%rsp, %rdi
 	call	do_int80_syscall_32
-.Lsyscall_32_done:
-
-	/* Go back to user mode. */
-	TRACE_IRQS_ON
 	jmp	swapgs_restore_regs_and_return_to_usermode
 SYM_CODE_END(entry_INT80_compat)


^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [patch 4/8] x86/entry: Move irq tracing on syscall entry to C-code
  2020-02-25 22:08 ` [patch 4/8] x86/entry: Move irq tracing on syscall entry to C-code Thomas Gleixner
@ 2020-02-26  5:43   ` Andy Lutomirski
  2020-02-26  8:17     ` Peter Zijlstra
  2020-02-27 23:11   ` Frederic Weisbecker
  2020-02-28  9:00   ` Alexandre Chartre
  2 siblings, 1 reply; 56+ messages in thread
From: Andy Lutomirski @ 2020-02-26  5:43 UTC (permalink / raw)
  To: Thomas Gleixner, LKML
  Cc: x86, Steven Rostedt, Brian Gerst, Juergen Gross, Paolo Bonzini,
	Arnd Bergmann

On 2/25/20 2:08 PM, Thomas Gleixner wrote:
> Now that the C entry points are safe, move the irq flags tracing code into
> the entry helper.

I'm so confused.

> 
> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
> ---
>  arch/x86/entry/common.c          |    5 +++++
>  arch/x86/entry/entry_32.S        |   12 ------------
>  arch/x86/entry/entry_64.S        |    2 --
>  arch/x86/entry/entry_64_compat.S |   18 ------------------
>  4 files changed, 5 insertions(+), 32 deletions(-)
> 
> --- a/arch/x86/entry/common.c
> +++ b/arch/x86/entry/common.c
> @@ -57,6 +57,11 @@ static inline void enter_from_user_mode(
>   */
>  static __always_inline void syscall_entry_fixups(void)
>  {
> +	/*
> +	 * Usermode is traced as interrupts enabled, but the syscall entry
> +	 * mechanisms disable interrupts. Tell the tracer.
> +	 */
> +	trace_hardirqs_off();

Your earlier patches suggest quite strongly that tracing isn't safe
until enter_from_user_mode().  But trace_hardirqs_off() calls
trace_irq_disable_rcuidle(), which looks [0] like a tracepoint.

Did you perhaps mean to do this *after* enter_from_user_mode()?

(My tracepoint-fu is pretty bad, and I can't actually find where it's
defined as a tracepoint.  But then it does nothing at all, so it must be
a tracepoint, right?)

>  	enter_from_user_mode();
>  	local_irq_enable();
>  }


^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [patch 5/8] x86/entry/common: Provide trace/kprobe safe exit to user space functions
  2020-02-25 22:08 ` [patch 5/8] x86/entry/common: Provide trace/kprobe safe exit to user space functions Thomas Gleixner
@ 2020-02-26  5:45   ` Andy Lutomirski
  2020-02-26  8:15     ` Peter Zijlstra
  2020-02-27 15:43   ` Alexandre Chartre
  1 sibling, 1 reply; 56+ messages in thread
From: Andy Lutomirski @ 2020-02-26  5:45 UTC (permalink / raw)
  To: Thomas Gleixner, LKML
  Cc: x86, Steven Rostedt, Brian Gerst, Juergen Gross, Paolo Bonzini,
	Arnd Bergmann

On 2/25/20 2:08 PM, Thomas Gleixner wrote:
> Split prepare_enter_to_user_mode() and mark it notrace/noprobe so the irq
> flags tracing on return can be put into it.

Is our tooling clever enough for this to do anything?  You have a static
inline that is only called in one place, and the caller is notrace and
NOKPROBE.  Does this actually allow tracing in the static inline callee?

--Andy

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [patch 6/8] x86/entry: Move irq tracing to syscall_slow_exit_work
  2020-02-25 22:08 ` [patch 6/8] x86/entry: Move irq tracing to syscall_slow_exit_work Thomas Gleixner
@ 2020-02-26  5:47   ` Andy Lutomirski
  2020-02-27 16:12   ` Alexandre Chartre
  1 sibling, 0 replies; 56+ messages in thread
From: Andy Lutomirski @ 2020-02-26  5:47 UTC (permalink / raw)
  To: Thomas Gleixner, LKML
  Cc: x86, Steven Rostedt, Brian Gerst, Juergen Gross, Paolo Bonzini,
	Arnd Bergmann

On 2/25/20 2:08 PM, Thomas Gleixner wrote:
> which removes the ASM IRQ tracepoints.

It's still after we re-enter rcuidle.  Is tracing actually okay?

I always had the impression that tracing was okay in rcuidle mode
because the tracing code was smart enough to do the right thing, but
your patch 3/8 changelog says:

Anything before enter_from_user_mode() is not safe to be traced or probed.


--Andy

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [patch 7/8] x86/entry: Move irq tracing to prepare_exit_to_user_mode()
  2020-02-25 22:08 ` [patch 7/8] x86/entry: Move irq tracing to prepare_exit_to_user_mode() Thomas Gleixner
@ 2020-02-26  5:50   ` Andy Lutomirski
  2020-02-26 19:53     ` Thomas Gleixner
  0 siblings, 1 reply; 56+ messages in thread
From: Andy Lutomirski @ 2020-02-26  5:50 UTC (permalink / raw)
  To: Thomas Gleixner, LKML
  Cc: x86, Steven Rostedt, Brian Gerst, Juergen Gross, Paolo Bonzini,
	Arnd Bergmann

On 2/25/20 2:08 PM, Thomas Gleixner wrote:
> which again gets it out of the ASM code.

Why is this better than just sticking the tracing in
__prepare_exit_from_usermode() or just not splitting it in the first place?

(ISTM the real right solution is to make sure that it's okay for
trace_hardirqs_... to be safe even when rcuidle.  But I may still be
missing something.)

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [patch 5/8] x86/entry/common: Provide trace/kprobe safe exit to user space functions
  2020-02-26  5:45   ` Andy Lutomirski
@ 2020-02-26  8:15     ` Peter Zijlstra
  0 siblings, 0 replies; 56+ messages in thread
From: Peter Zijlstra @ 2020-02-26  8:15 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Thomas Gleixner, LKML, x86, Steven Rostedt, Brian Gerst,
	Juergen Gross, Paolo Bonzini, Arnd Bergmann

On Tue, Feb 25, 2020 at 09:45:11PM -0800, Andy Lutomirski wrote:
> On 2/25/20 2:08 PM, Thomas Gleixner wrote:
> > Split prepare_enter_to_user_mode() and mark it notrace/noprobe so the irq
> > flags tracing on return can be put into it.
> 
> Is our tooling clever enough for this to do anything?  You have a static
> inline that is only called in one place, and the caller is notrace and
> NOKPROBE.  Does this actually allow tracing in the static inline callee?

tracing, no, but the NOKPROBE on inline functions is buggered afaiu.

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [patch 4/8] x86/entry: Move irq tracing on syscall entry to C-code
  2020-02-26  5:43   ` Andy Lutomirski
@ 2020-02-26  8:17     ` Peter Zijlstra
  2020-02-26 11:20       ` Andy Lutomirski
  0 siblings, 1 reply; 56+ messages in thread
From: Peter Zijlstra @ 2020-02-26  8:17 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Thomas Gleixner, LKML, x86, Steven Rostedt, Brian Gerst,
	Juergen Gross, Paolo Bonzini, Arnd Bergmann

On Tue, Feb 25, 2020 at 09:43:46PM -0800, Andy Lutomirski wrote:
> On 2/25/20 2:08 PM, Thomas Gleixner wrote:
> > Now that the C entry points are safe, move the irq flags tracing code into
> > the entry helper.
> 
> I'm so confused.
> 
> > 
> > Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
> > ---
> >  arch/x86/entry/common.c          |    5 +++++
> >  arch/x86/entry/entry_32.S        |   12 ------------
> >  arch/x86/entry/entry_64.S        |    2 --
> >  arch/x86/entry/entry_64_compat.S |   18 ------------------
> >  4 files changed, 5 insertions(+), 32 deletions(-)
> > 
> > --- a/arch/x86/entry/common.c
> > +++ b/arch/x86/entry/common.c
> > @@ -57,6 +57,11 @@ static inline void enter_from_user_mode(
> >   */
> >  static __always_inline void syscall_entry_fixups(void)
> >  {
> > +	/*
> > +	 * Usermode is traced as interrupts enabled, but the syscall entry
> > +	 * mechanisms disable interrupts. Tell the tracer.
> > +	 */
> > +	trace_hardirqs_off();
> 
> Your earlier patches suggest quite strongly that tracing isn't safe
> until enter_from_user_mode().  But trace_hardirqs_off() calls
> trace_irq_disable_rcuidle(), which looks [0] like a tracepoint.
> 
> Did you perhaps mean to do this *after* enter_from_user_mode()?

aside from the fact that enter_from_user_mode() itself also has a
tracepoint, the crucial detail is that we must not trace/kprobe the
function calling this.

Specifically for #PF, because we need read_cr2() before this. See later
patches.

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [patch 4/8] x86/entry: Move irq tracing on syscall entry to C-code
  2020-02-26  8:17     ` Peter Zijlstra
@ 2020-02-26 11:20       ` Andy Lutomirski
  2020-02-26 19:51         ` Thomas Gleixner
  0 siblings, 1 reply; 56+ messages in thread
From: Andy Lutomirski @ 2020-02-26 11:20 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Andy Lutomirski, Thomas Gleixner, LKML, x86, Steven Rostedt,
	Brian Gerst, Juergen Gross, Paolo Bonzini, Arnd Bergmann



> On Feb 26, 2020, at 12:17 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> 
> On Tue, Feb 25, 2020 at 09:43:46PM -0800, Andy Lutomirski wrote:
>>> On 2/25/20 2:08 PM, Thomas Gleixner wrote:
>>> Now that the C entry points are safe, move the irq flags tracing code into
>>> the entry helper.
>> 
>> I'm so confused.
>> 
>>> 
>>> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
>>> ---
>>> arch/x86/entry/common.c          |    5 +++++
>>> arch/x86/entry/entry_32.S        |   12 ------------
>>> arch/x86/entry/entry_64.S        |    2 --
>>> arch/x86/entry/entry_64_compat.S |   18 ------------------
>>> 4 files changed, 5 insertions(+), 32 deletions(-)
>>> 
>>> --- a/arch/x86/entry/common.c
>>> +++ b/arch/x86/entry/common.c
>>> @@ -57,6 +57,11 @@ static inline void enter_from_user_mode(
>>>  */
>>> static __always_inline void syscall_entry_fixups(void)
>>> {
>>> +    /*
>>> +     * Usermode is traced as interrupts enabled, but the syscall entry
>>> +     * mechanisms disable interrupts. Tell the tracer.
>>> +     */
>>> +    trace_hardirqs_off();
>> 
>> Your earlier patches suggest quite strongly that tracing isn't safe
>> until enter_from_user_mode().  But trace_hardirqs_off() calls
>> trace_irq_disable_rcuidle(), which looks [0] like a tracepoint.
>> 
>> Did you perhaps mean to do this *after* enter_from_user_mode()?
> 
> aside from the fact that enter_from_user_mode() itself also has a
> tracepoint, the crucial detail is that we must not trace/kprobe the
> function calling this.
> 
> Specifically for #PF, because we need read_cr2() before this. See later
> patches.

Indeed. I’m fine with this patch, but I still don’t understand what the changelog is about. And I’m still rather baffled by most of the notrace annotations in the series.

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [patch 4/8] x86/entry: Move irq tracing on syscall entry to C-code
  2020-02-26 11:20       ` Andy Lutomirski
@ 2020-02-26 19:51         ` Thomas Gleixner
  2020-02-29 14:44           ` Thomas Gleixner
  0 siblings, 1 reply; 56+ messages in thread
From: Thomas Gleixner @ 2020-02-26 19:51 UTC (permalink / raw)
  To: Andy Lutomirski, Peter Zijlstra
  Cc: Andy Lutomirski, LKML, x86, Steven Rostedt, Brian Gerst,
	Juergen Gross, Paolo Bonzini, Arnd Bergmann

Andy Lutomirski <luto@amacapital.net> writes:
>> On Feb 26, 2020, at 12:17 AM, Peter Zijlstra <peterz@infradead.org> wrote:
>> On Tue, Feb 25, 2020 at 09:43:46PM -0800, Andy Lutomirski wrote:
>>> Your earlier patches suggest quite strongly that tracing isn't safe
>>> until enter_from_user_mode().  But trace_hardirqs_off() calls
>>> trace_irq_disable_rcuidle(), which looks [0] like a tracepoint.
>>> 
>>> Did you perhaps mean to do this *after* enter_from_user_mode()?
>> 
>> aside from the fact that enter_from_user_mode() itself also has a
>> tracepoint, the crucial detail is that we must not trace/kprobe the
>> function calling this.
>> 
>> Specifically for #PF, because we need read_cr2() before this. See later
>> patches.
>
> Indeed. I’m fine with this patch, but I still don’t understand what
> the changelog is about.

Yeah, the changelog is not really helpful. Let me fix that.

> And I’m still rather baffled by most of the notrace annotations in the
> series.

As discussed on IRC, this might be too broad, but then I rather have the
actual C-entry points neither traceable nor probable in general and
relax this by calling functions which can be traced and probed.

My rationale for this decision was that enter_from_user_mode() is marked
notrace/noprobe as well, so I kept the protection scope the same as we
had in the ASM maze which is marked noprobe already.

Thanks,

        tglx

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [patch 7/8] x86/entry: Move irq tracing to prepare_exit_to_user_mode()
  2020-02-26  5:50   ` Andy Lutomirski
@ 2020-02-26 19:53     ` Thomas Gleixner
  2020-02-26 20:07       ` Andy Lutomirski
  0 siblings, 1 reply; 56+ messages in thread
From: Thomas Gleixner @ 2020-02-26 19:53 UTC (permalink / raw)
  To: Andy Lutomirski, LKML
  Cc: x86, Steven Rostedt, Brian Gerst, Juergen Gross, Paolo Bonzini,
	Arnd Bergmann

Andy Lutomirski <luto@kernel.org> writes:

> On 2/25/20 2:08 PM, Thomas Gleixner wrote:
>> which again gets it out of the ASM code.
>
> Why is this better than just sticking the tracing in
> __prepare_exit_from_usermode() or just not splitting it in the first
> place?

The split is there because prepare_exit_from_usermode() is used from the
idtentry maze as well. Once that stuff is converted in the later series
the split goes away again.

Thanks,

        tglx

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [patch 7/8] x86/entry: Move irq tracing to prepare_exit_to_user_mode()
  2020-02-26 19:53     ` Thomas Gleixner
@ 2020-02-26 20:07       ` Andy Lutomirski
  0 siblings, 0 replies; 56+ messages in thread
From: Andy Lutomirski @ 2020-02-26 20:07 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Andy Lutomirski, LKML, X86 ML, Steven Rostedt, Brian Gerst,
	Juergen Gross, Paolo Bonzini, Arnd Bergmann

On Wed, Feb 26, 2020 at 11:54 AM Thomas Gleixner <tglx@linutronix.de> wrote:
>
> Andy Lutomirski <luto@kernel.org> writes:
>
> > On 2/25/20 2:08 PM, Thomas Gleixner wrote:
> >> which again gets it out of the ASM code.
> >
> > Why is this better than just sticking the tracing in
> > __prepare_exit_from_usermode() or just not splitting it in the first
> > place?
>
> The split is there because prepare_exit_from_usermode() is used from the
> idtentry maze as well. Once that stuff is converted in the later series
> the split goes away again.
>

Okay.

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [patch 5/8] x86/entry/common: Provide trace/kprobe safe exit to user space functions
  2020-02-25 22:08 ` [patch 5/8] x86/entry/common: Provide trace/kprobe safe exit to user space functions Thomas Gleixner
  2020-02-26  5:45   ` Andy Lutomirski
@ 2020-02-27 15:43   ` Alexandre Chartre
  2020-02-27 15:53     ` Thomas Gleixner
  1 sibling, 1 reply; 56+ messages in thread
From: Alexandre Chartre @ 2020-02-27 15:43 UTC (permalink / raw)
  To: Thomas Gleixner, LKML
  Cc: x86, Steven Rostedt, Brian Gerst, Juergen Gross, Paolo Bonzini,
	Arnd Bergmann


On 2/25/20 11:08 PM, Thomas Gleixner wrote:
> Split prepare_enter_to_user_mode() and mark it notrace/noprobe so the irq
> flags tracing on return can be put into it.

This splits prepare_exit_to_usermode() not prepare_enter_to_user_mode().


> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
> ---
>   arch/x86/entry/common.c |   15 +++++++++++----
>   1 file changed, 11 insertions(+), 4 deletions(-)
> 
> --- a/arch/x86/entry/common.c
> +++ b/arch/x86/entry/common.c
> @@ -196,7 +196,7 @@ static void exit_to_usermode_loop(struct
>   }
>   
>   /* Called with IRQs disabled. */
> -__visible inline void prepare_exit_to_usermode(struct pt_regs *regs)
> +static inline void __prepare_exit_to_usermode(struct pt_regs *regs)
>   {
>   	struct thread_info *ti = current_thread_info();
>   	u32 cached_flags;
> @@ -241,6 +241,12 @@ static void exit_to_usermode_loop(struct
>   	mds_user_clear_cpu_buffers();
>   }
>   
> +__visible inline notrace void prepare_exit_to_usermode(struct pt_regs *regs)
> +{
> +	__prepare_exit_to_usermode(regs);
> +}
> +NOKPROBE_SYMBOL(prepare_exit_to_usermode);


Would it be worth grouping local_irq_disable() and prepare_exit_to_usermode()
(similarly to what was done entry with syscall_entry_fixups()) and then put
trace_hardirqs_on() there too. For example:

static __always_inline void syscall_exit_fixups(struct pt_regs *regs)
{
         local_irq_disable();
         prepare_exit_to_usermode(regs);
         trace_hardirqs_on();
}

Or is this planned once prepare_exit_from_usermode() is not used by idtentry
anymore?

Thanks,

alex.

>   #define SYSCALL_EXIT_WORK_FLAGS				\
>   	(_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT |	\
>   	 _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT)
> @@ -271,7 +277,7 @@ static void syscall_slow_exit_work(struc
>    * Called with IRQs on and fully valid regs.  Returns with IRQs off in a
>    * state such that we can immediately switch to user mode.
>    */
> -__visible inline void syscall_return_slowpath(struct pt_regs *regs)
> +__visible inline notrace void syscall_return_slowpath(struct pt_regs *regs)
>   {
>   	struct thread_info *ti = current_thread_info();
>   	u32 cached_flags = READ_ONCE(ti->flags);
> @@ -292,8 +298,9 @@ static void syscall_slow_exit_work(struc
>   		syscall_slow_exit_work(regs, cached_flags);
>   
>   	local_irq_disable();
> -	prepare_exit_to_usermode(regs);
> +	__prepare_exit_to_usermode(regs);
>   }
> +NOKPROBE_SYMBOL(syscall_return_slowpath);
>   
>   #ifdef CONFIG_X86_64
>   static __always_inline
> @@ -417,7 +424,7 @@ static __always_inline long do_fast_sysc
>   		/* User code screwed up. */
>   		local_irq_disable();
>   		regs->ax = -EFAULT;
> -		prepare_exit_to_usermode(regs);
> +		__prepare_exit_to_usermode(regs);
>   		return 0;	/* Keep it simple: use IRET. */
>   	}
>   
> 

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [patch 5/8] x86/entry/common: Provide trace/kprobe safe exit to user space functions
  2020-02-27 15:43   ` Alexandre Chartre
@ 2020-02-27 15:53     ` Thomas Gleixner
  0 siblings, 0 replies; 56+ messages in thread
From: Thomas Gleixner @ 2020-02-27 15:53 UTC (permalink / raw)
  To: Alexandre Chartre, LKML
  Cc: x86, Steven Rostedt, Brian Gerst, Juergen Gross, Paolo Bonzini,
	Arnd Bergmann

Alexandre Chartre <alexandre.chartre@oracle.com> writes:

> On 2/25/20 11:08 PM, Thomas Gleixner wrote:
>> Split prepare_enter_to_user_mode() and mark it notrace/noprobe so the irq
>> flags tracing on return can be put into it.
>
> This splits prepare_exit_to_usermode() not prepare_enter_to_user_mode().

Ooops

>>   /* Called with IRQs disabled. */
>> -__visible inline void prepare_exit_to_usermode(struct pt_regs *regs)
>> +static inline void __prepare_exit_to_usermode(struct pt_regs *regs)
>>   {
>>   	struct thread_info *ti = current_thread_info();
>>   	u32 cached_flags;
>> @@ -241,6 +241,12 @@ static void exit_to_usermode_loop(struct
>>   	mds_user_clear_cpu_buffers();
>>   }
>>   
>> +__visible inline notrace void prepare_exit_to_usermode(struct pt_regs *regs)
>> +{
>> +	__prepare_exit_to_usermode(regs);
>> +}
>> +NOKPROBE_SYMBOL(prepare_exit_to_usermode);
>
>
> Would it be worth grouping local_irq_disable() and prepare_exit_to_usermode()
> (similarly to what was done entry with syscall_entry_fixups()) and then put
> trace_hardirqs_on() there too. For example:
>
> static __always_inline void syscall_exit_fixups(struct pt_regs *regs)
> {
>          local_irq_disable();
>          prepare_exit_to_usermode(regs);
>          trace_hardirqs_on();
> }
>
> Or is this planned once prepare_exit_from_usermode() is not used by idtentry
> anymore?

This should be consolidated at the very end when all the interrupt muck
is done, but maybe I forgot.

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [patch 6/8] x86/entry: Move irq tracing to syscall_slow_exit_work
  2020-02-25 22:08 ` [patch 6/8] x86/entry: Move irq tracing to syscall_slow_exit_work Thomas Gleixner
  2020-02-26  5:47   ` Andy Lutomirski
@ 2020-02-27 16:12   ` Alexandre Chartre
  1 sibling, 0 replies; 56+ messages in thread
From: Alexandre Chartre @ 2020-02-27 16:12 UTC (permalink / raw)
  To: Thomas Gleixner, LKML
  Cc: x86, Steven Rostedt, Brian Gerst, Juergen Gross, Paolo Bonzini,
	Arnd Bergmann


On 2/25/20 11:08 PM, Thomas Gleixner wrote:
> which removes the ASM IRQ tracepoints.

This moves irq tracing to syscall_return_slowpath, not syscall_slow_exit_work,
right?

alex.

  
> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
> ---
>   arch/x86/entry/common.c   |    2 ++
>   arch/x86/entry/entry_32.S |    3 ++-
>   arch/x86/entry/entry_64.S |    3 ---
>   3 files changed, 4 insertions(+), 4 deletions(-)
> 
> --- a/arch/x86/entry/common.c
> +++ b/arch/x86/entry/common.c
> @@ -299,6 +299,8 @@ static void syscall_slow_exit_work(struc
>   
>   	local_irq_disable();
>   	__prepare_exit_to_usermode(regs);
> +	/* Return to user space enables interrupts */
> +	trace_hardirqs_on();
>   }
>   NOKPROBE_SYMBOL(syscall_return_slowpath);
>   
> --- a/arch/x86/entry/entry_32.S
> +++ b/arch/x86/entry/entry_32.S
> @@ -812,7 +812,7 @@ SYM_CODE_START(ret_from_fork)
>   	movl    %esp, %eax
>   	call    syscall_return_slowpath
>   	STACKLEAK_ERASE
> -	jmp     restore_all
> +	jmp     restore_all_switch_stack
>   
>   	/* kernel thread */
>   1:	movl	%edi, %eax
> @@ -1077,6 +1077,7 @@ SYM_FUNC_START(entry_INT80_32)
>   
>   restore_all:
>   	TRACE_IRQS_IRET
> +restore_all_switch_stack:
>   	SWITCH_TO_ENTRY_STACK
>   	CHECK_AND_APPLY_ESPFIX
>   .Lrestore_nocheck:
> --- a/arch/x86/entry/entry_64.S
> +++ b/arch/x86/entry/entry_64.S
> @@ -172,8 +172,6 @@ SYM_INNER_LABEL(entry_SYSCALL_64_after_h
>   	movq	%rsp, %rsi
>   	call	do_syscall_64		/* returns with IRQs disabled */
>   
> -	TRACE_IRQS_ON			/* return enables interrupts */
> -
>   	/*
>   	 * Try to use SYSRET instead of IRET if we're returning to
>   	 * a completely clean 64-bit userspace context.  If we're not,
> @@ -340,7 +338,6 @@ SYM_CODE_START(ret_from_fork)
>   	UNWIND_HINT_REGS
>   	movq	%rsp, %rdi
>   	call	syscall_return_slowpath	/* returns with IRQs disabled */
> -	TRACE_IRQS_ON			/* user mode is traced as IRQS on */
>   	jmp	swapgs_restore_regs_and_return_to_usermode
>   
>   1:
> 

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [patch 8/8] x86/entry: Move irqflags tracing to do_int80_syscall_32()
  2020-02-25 22:08 ` [patch 8/8] x86/entry: Move irqflags tracing to do_int80_syscall_32() Thomas Gleixner
@ 2020-02-27 16:46   ` Alexandre Chartre
  2020-02-28 13:49     ` Thomas Gleixner
  0 siblings, 1 reply; 56+ messages in thread
From: Alexandre Chartre @ 2020-02-27 16:46 UTC (permalink / raw)
  To: Thomas Gleixner, LKML
  Cc: x86, Steven Rostedt, Brian Gerst, Juergen Gross, Paolo Bonzini,
	Arnd Bergmann


On 2/25/20 11:08 PM, Thomas Gleixner wrote:
> which cleans up the ASM maze.
> 
> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
> ---
>   arch/x86/entry/common.c          |    8 +++++++-
>   arch/x86/entry/entry_32.S        |    9 ++-------
>   arch/x86/entry/entry_64_compat.S |   14 +++++---------
>   3 files changed, 14 insertions(+), 17 deletions(-)
> 
> --- a/arch/x86/entry/common.c
> +++ b/arch/x86/entry/common.c
> @@ -333,6 +333,7 @@ void do_syscall_64_irqs_on(unsigned long
>   {
>   	syscall_entry_fixups();
>   	do_syscall_64_irqs_on(nr, regs);
> +	trace_hardirqs_on();
>   }

trace_hardirqs_on() is already called through syscall_return_slowpath()
(from the previous patch):

do_syscall_64()
   -> do_syscall_64_irqs_on()
     -> syscall_return_slowpath()
       -> trace_hardirqs_on()

>   NOKPROBE_SYMBOL(do_syscall_64);
>   #endif
> @@ -389,6 +390,7 @@ static __always_inline void do_syscall_3
>   {
>   	syscall_entry_fixups();
>   	do_syscall_32_irqs_on(regs);
> +	trace_hardirqs_on();
>   }

Same here:

do_int80_syscall_32()
   -> do_syscall_32_irqs_on()
     -> syscall_return_slowpath()
       -> trace_hardirqs_on()

>   NOKPROBE_SYMBOL(do_int80_syscall_32);
>   
> @@ -468,8 +470,12 @@ static __always_inline long do_fast_sysc
>   /* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */
>   __visible notrace long do_fast_syscall_32(struct pt_regs *regs)
>   {
> +	long ret;
> +
>   	syscall_entry_fixups();
> -	return do_fast_syscall_32_irqs_on(regs);
> +	ret = do_fast_syscall_32_irqs_on(regs);
> +	trace_hardirqs_on();
> +	return ret;
>   }
>   NOKPROBE_SYMBOL(do_fast_syscall_32);

Same here:

   do_fast_syscall_32()
     -> do_fast_syscall_32_irqs_on()
       -> do_syscall_32_irqs_on()
         -> syscall_return_slowpath()
           -> trace_hardirqs_on()

Except for one case (if the get_user() call is true in
do_fast_syscall_32_irqs_on()):

   do_fast_syscall_32()
     -> do_fast_syscall_32_irqs_on()
       -> prepare_exit_to_usermode()

So we need to call trace_hardirqs_on() but only in that case:

static __always_inline long do_fast_syscall_32_irqs_on(struct pt_regs *regs)
{
           ...
	if (
#ifdef CONFIG_X86_64
		/*
		 * Micro-optimization: the pointer we're following is explicitly
		 * 32 bits, so it can't be out of range.
		 */
		__get_user(*(u32 *)&regs->bp,
			    (u32 __user __force *)(unsigned long)(u32)regs->sp)
#else
		get_user(*(u32 *)&regs->bp,
			 (u32 __user __force *)(unsigned long)(u32)regs->sp)
#endif
		) {

		/* User code screwed up. */
		local_irq_disable();
		regs->ax = -EFAULT;
		prepare_exit_to_usermode(regs);
                 trace_hardirqs_on();                <<<=== HERE
		return 0;	/* Keep it simple: use IRET. */
	}
         ...
}

alex.


> --- a/arch/x86/entry/entry_32.S
> +++ b/arch/x86/entry/entry_32.S
> @@ -811,8 +811,7 @@ SYM_CODE_START(ret_from_fork)
>   	/* When we fork, we trace the syscall return in the child, too. */
>   	movl    %esp, %eax
>   	call    syscall_return_slowpath
> -	STACKLEAK_ERASE
> -	jmp     restore_all_switch_stack
> +	jmp     .Lsyscall_32_done
>   
>   	/* kernel thread */
>   1:	movl	%edi, %eax
> @@ -968,8 +967,7 @@ SYM_FUNC_START(entry_SYSENTER_32)
>   
>   	STACKLEAK_ERASE
>   
> -/* Opportunistic SYSEXIT */
> -	TRACE_IRQS_ON			/* User mode traces as IRQs on. */
> +	/* Opportunistic SYSEXIT */
>   
>   	/*
>   	 * Setup entry stack - we keep the pointer in %eax and do the
> @@ -1072,11 +1070,8 @@ SYM_FUNC_START(entry_INT80_32)
>   	movl	%esp, %eax
>   	call	do_int80_syscall_32
>   .Lsyscall_32_done:
> -
>   	STACKLEAK_ERASE
>   
> -restore_all:
> -	TRACE_IRQS_IRET
>   restore_all_switch_stack:
>   	SWITCH_TO_ENTRY_STACK
>   	CHECK_AND_APPLY_ESPFIX
> --- a/arch/x86/entry/entry_64_compat.S
> +++ b/arch/x86/entry/entry_64_compat.S
> @@ -132,8 +132,8 @@ SYM_FUNC_START(entry_SYSENTER_compat)
>   	movq	%rsp, %rdi
>   	call	do_fast_syscall_32
>   	/* XEN PV guests always use IRET path */
> -	ALTERNATIVE "testl %eax, %eax; jz .Lsyscall_32_done", \
> -		    "jmp .Lsyscall_32_done", X86_FEATURE_XENPV
> +	ALTERNATIVE "testl %eax, %eax; jz swapgs_restore_regs_and_return_to_usermode", \
> +		    "jmp swapgs_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV
>   	jmp	sysret32_from_system_call
>   
>   .Lsysenter_fix_flags:
> @@ -244,8 +244,8 @@ SYM_INNER_LABEL(entry_SYSCALL_compat_aft
>   	movq	%rsp, %rdi
>   	call	do_fast_syscall_32
>   	/* XEN PV guests always use IRET path */
> -	ALTERNATIVE "testl %eax, %eax; jz .Lsyscall_32_done", \
> -		    "jmp .Lsyscall_32_done", X86_FEATURE_XENPV
> +	ALTERNATIVE "testl %eax, %eax; jz swapgs_restore_regs_and_return_to_usermode", \
> +		    "jmp swapgs_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV
>   
>   	/* Opportunistic SYSRET */
>   sysret32_from_system_call:
> @@ -254,7 +254,7 @@ SYM_INNER_LABEL(entry_SYSCALL_compat_aft
>   	 * stack. So let's erase the thread stack right now.
>   	 */
>   	STACKLEAK_ERASE
> -	TRACE_IRQS_ON			/* User mode traces as IRQs on. */
> +
>   	movq	RBX(%rsp), %rbx		/* pt_regs->rbx */
>   	movq	RBP(%rsp), %rbp		/* pt_regs->rbp */
>   	movq	EFLAGS(%rsp), %r11	/* pt_regs->flags (in r11) */
> @@ -393,9 +393,5 @@ SYM_CODE_START(entry_INT80_compat)
>   
>   	movq	%rsp, %rdi
>   	call	do_int80_syscall_32
> -.Lsyscall_32_done:
> -
> -	/* Go back to user mode. */
> -	TRACE_IRQS_ON
>   	jmp	swapgs_restore_regs_and_return_to_usermode
>   SYM_CODE_END(entry_INT80_compat)
> 

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [patch 1/8] x86/entry/64: Trace irqflags unconditionally on when returing to user space
  2020-02-25 22:08 ` [patch 1/8] x86/entry/64: Trace irqflags unconditionally on when returing to user space Thomas Gleixner
@ 2020-02-27 19:49   ` Borislav Petkov
  2020-02-27 22:45   ` Frederic Weisbecker
  2020-02-28  8:58   ` Alexandre Chartre
  2 siblings, 0 replies; 56+ messages in thread
From: Borislav Petkov @ 2020-02-27 19:49 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: LKML, x86, Steven Rostedt, Brian Gerst, Juergen Gross,
	Paolo Bonzini, Arnd Bergmann

Just formulations improvements:

> Subject: [patch 1/8] x86/entry/64: Trace irqflags unconditionally on when returing to user space

s/on //

On Tue, Feb 25, 2020 at 11:08:02PM +0100, Thomas Gleixner wrote:
> User space cannot longer disable interrupts so trace return to user space

"Userspace cannot disable interrupts any longer... "

> unconditionally as IRQS_ON.
> 
> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

...

-- 
Regards/Gruss,
    Boris.

https://people.kernel.org/tglx/notes-about-netiquette

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [patch 2/8] x86/entry/common: Consolidate syscall entry code
  2020-02-25 22:08 ` [patch 2/8] x86/entry/common: Consolidate syscall entry code Thomas Gleixner
@ 2020-02-27 19:57   ` Borislav Petkov
  2020-02-27 22:52   ` Frederic Weisbecker
  2020-02-28  8:59   ` Alexandre Chartre
  2 siblings, 0 replies; 56+ messages in thread
From: Borislav Petkov @ 2020-02-27 19:57 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: LKML, x86, Steven Rostedt, Brian Gerst, Juergen Gross,
	Paolo Bonzini, Arnd Bergmann

On Tue, Feb 25, 2020 at 11:08:03PM +0100, Thomas Gleixner wrote:
> All syscall entry points call enter_from_user_mode() and
> local_irq_enable(). Move that into an inline helper so the interrupt
> tracing can be moved into that helper later instead of sprinkling
> it all over the place.
> 
> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
> ---
>  arch/x86/entry/common.c |   48 +++++++++++++++++++++++++++++++++---------------
>  1 file changed, 33 insertions(+), 15 deletions(-)
> 
> --- a/arch/x86/entry/common.c
> +++ b/arch/x86/entry/common.c
> @@ -49,6 +49,18 @@
>  static inline void enter_from_user_mode(void) {}
>  #endif
>  
> +/*
> + * All syscall entry variants call with interrupts disabled.
> + *
> + * Invoke context tracking if enabled and enable interrupts for further
> + * processing.
> + */
> +static __always_inline void syscall_entry_fixups(void)

Function name needs a verb. "do_syscall_entry_fixups()" seems to fit well
with where it is called or "apply_..." or so.

Rest looks ok.

-- 
Regards/Gruss,
    Boris.

https://people.kernel.org/tglx/notes-about-netiquette

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [patch 1/8] x86/entry/64: Trace irqflags unconditionally on when returing to user space
  2020-02-25 22:08 ` [patch 1/8] x86/entry/64: Trace irqflags unconditionally on when returing to user space Thomas Gleixner
  2020-02-27 19:49   ` Borislav Petkov
@ 2020-02-27 22:45   ` Frederic Weisbecker
  2020-02-28  8:58   ` Alexandre Chartre
  2 siblings, 0 replies; 56+ messages in thread
From: Frederic Weisbecker @ 2020-02-27 22:45 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: LKML, x86, Steven Rostedt, Brian Gerst, Juergen Gross,
	Paolo Bonzini, Arnd Bergmann

On Tue, Feb 25, 2020 at 11:08:02PM +0100, Thomas Gleixner wrote:
> User space cannot longer disable interrupts so trace return to user space
> unconditionally as IRQS_ON.
> 
> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

Reviewed-by: Frederic Weisbecker <frederic@kernel.org>

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [patch 2/8] x86/entry/common: Consolidate syscall entry code
  2020-02-25 22:08 ` [patch 2/8] x86/entry/common: Consolidate syscall entry code Thomas Gleixner
  2020-02-27 19:57   ` Borislav Petkov
@ 2020-02-27 22:52   ` Frederic Weisbecker
  2020-02-28  8:59   ` Alexandre Chartre
  2 siblings, 0 replies; 56+ messages in thread
From: Frederic Weisbecker @ 2020-02-27 22:52 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: LKML, x86, Steven Rostedt, Brian Gerst, Juergen Gross,
	Paolo Bonzini, Arnd Bergmann

On Tue, Feb 25, 2020 at 11:08:03PM +0100, Thomas Gleixner wrote:
> All syscall entry points call enter_from_user_mode() and
> local_irq_enable(). Move that into an inline helper so the interrupt
> tracing can be moved into that helper later instead of sprinkling
> it all over the place.
> 
> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

Reviewed-by: Frederic Weisbecker <frederic@kernel.org>

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [patch 4/8] x86/entry: Move irq tracing on syscall entry to C-code
  2020-02-25 22:08 ` [patch 4/8] x86/entry: Move irq tracing on syscall entry to C-code Thomas Gleixner
  2020-02-26  5:43   ` Andy Lutomirski
@ 2020-02-27 23:11   ` Frederic Weisbecker
  2020-02-28  9:00   ` Alexandre Chartre
  2 siblings, 0 replies; 56+ messages in thread
From: Frederic Weisbecker @ 2020-02-27 23:11 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: LKML, x86, Steven Rostedt, Brian Gerst, Juergen Gross,
	Paolo Bonzini, Arnd Bergmann

On Tue, Feb 25, 2020 at 11:08:05PM +0100, Thomas Gleixner wrote:
> Now that the C entry points are safe, move the irq flags tracing code into
> the entry helper.
> 
> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

Reviewed-by: Frederic Weisbecker <frederic@kernel.org>

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [patch 3/8] x86/entry/common: Mark syscall entry points notrace/nokprobe
  2020-02-25 22:08 ` [patch 3/8] x86/entry/common: Mark syscall entry points notrace/nokprobe Thomas Gleixner
@ 2020-02-27 23:15   ` Frederic Weisbecker
  2020-02-28  8:59   ` Alexandre Chartre
  1 sibling, 0 replies; 56+ messages in thread
From: Frederic Weisbecker @ 2020-02-27 23:15 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: LKML, x86, Steven Rostedt, Brian Gerst, Juergen Gross,
	Paolo Bonzini, Arnd Bergmann

On Tue, Feb 25, 2020 at 11:08:04PM +0100, Thomas Gleixner wrote:
> Anything before enter_from_user_mode() is not safe to be traced or probed.

Perhaps the changelog should clarify as to why exactly it isn't
safe to trace before enter_from_user_mode(). There have been
so many patches on that lately that I'm not clear on that either.

Thanks.

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [patch 1/8] x86/entry/64: Trace irqflags unconditionally on when returing to user space
  2020-02-25 22:08 ` [patch 1/8] x86/entry/64: Trace irqflags unconditionally on when returing to user space Thomas Gleixner
  2020-02-27 19:49   ` Borislav Petkov
  2020-02-27 22:45   ` Frederic Weisbecker
@ 2020-02-28  8:58   ` Alexandre Chartre
  2 siblings, 0 replies; 56+ messages in thread
From: Alexandre Chartre @ 2020-02-28  8:58 UTC (permalink / raw)
  To: Thomas Gleixner, LKML
  Cc: x86, Steven Rostedt, Brian Gerst, Juergen Gross, Paolo Bonzini,
	Arnd Bergmann


On 2/25/20 11:08 PM, Thomas Gleixner wrote:
> User space cannot longer disable interrupts so trace return to user space
> unconditionally as IRQS_ON.
> 
> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
> ---
>   arch/x86/entry/entry_64.S |    4 ++--
>   1 file changed, 2 insertions(+), 2 deletions(-)
> 

Reviewed-by: Alexandre Chartre <alexandre.chartre@oracle.com>

alex.

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [patch 2/8] x86/entry/common: Consolidate syscall entry code
  2020-02-25 22:08 ` [patch 2/8] x86/entry/common: Consolidate syscall entry code Thomas Gleixner
  2020-02-27 19:57   ` Borislav Petkov
  2020-02-27 22:52   ` Frederic Weisbecker
@ 2020-02-28  8:59   ` Alexandre Chartre
  2 siblings, 0 replies; 56+ messages in thread
From: Alexandre Chartre @ 2020-02-28  8:59 UTC (permalink / raw)
  To: Thomas Gleixner, LKML
  Cc: x86, Steven Rostedt, Brian Gerst, Juergen Gross, Paolo Bonzini,
	Arnd Bergmann


On 2/25/20 11:08 PM, Thomas Gleixner wrote:
> All syscall entry points call enter_from_user_mode() and
> local_irq_enable(). Move that into an inline helper so the interrupt
> tracing can be moved into that helper later instead of sprinkling
> it all over the place.
> 
> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
> ---
>   arch/x86/entry/common.c |   48 +++++++++++++++++++++++++++++++++---------------
>   1 file changed, 33 insertions(+), 15 deletions(-)
> 

Reviewed-by: Alexandre Chartre <alexandre.chartre@oracle.com>

alex.

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [patch 3/8] x86/entry/common: Mark syscall entry points notrace/nokprobe
  2020-02-25 22:08 ` [patch 3/8] x86/entry/common: Mark syscall entry points notrace/nokprobe Thomas Gleixner
  2020-02-27 23:15   ` Frederic Weisbecker
@ 2020-02-28  8:59   ` Alexandre Chartre
  1 sibling, 0 replies; 56+ messages in thread
From: Alexandre Chartre @ 2020-02-28  8:59 UTC (permalink / raw)
  To: Thomas Gleixner, LKML
  Cc: x86, Steven Rostedt, Brian Gerst, Juergen Gross, Paolo Bonzini,
	Arnd Bergmann


On 2/25/20 11:08 PM, Thomas Gleixner wrote:
> Anything before enter_from_user_mode() is not safe to be traced or probed.
> 
> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
> ---
>   arch/x86/entry/common.c |    9 ++++++---
>   1 file changed, 6 insertions(+), 3 deletions(-)
> 

Reviewed-by: Alexandre Chartre <alexandre.chartre@oracle.com>

alex.

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [patch 4/8] x86/entry: Move irq tracing on syscall entry to C-code
  2020-02-25 22:08 ` [patch 4/8] x86/entry: Move irq tracing on syscall entry to C-code Thomas Gleixner
  2020-02-26  5:43   ` Andy Lutomirski
  2020-02-27 23:11   ` Frederic Weisbecker
@ 2020-02-28  9:00   ` Alexandre Chartre
  2 siblings, 0 replies; 56+ messages in thread
From: Alexandre Chartre @ 2020-02-28  9:00 UTC (permalink / raw)
  To: Thomas Gleixner, LKML
  Cc: x86, Steven Rostedt, Brian Gerst, Juergen Gross, Paolo Bonzini,
	Arnd Bergmann


On 2/25/20 11:08 PM, Thomas Gleixner wrote:
> Now that the C entry points are safe, move the irq flags tracing code into
> the entry helper.
> 
> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
> ---
>   arch/x86/entry/common.c          |    5 +++++
>   arch/x86/entry/entry_32.S        |   12 ------------
>   arch/x86/entry/entry_64.S        |    2 --
>   arch/x86/entry/entry_64_compat.S |   18 ------------------
>   4 files changed, 5 insertions(+), 32 deletions(-)
> 

Reviewed-by: Alexandre Chartre <alexandre.chartre@oracle.com>

alex.

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [patch 8/8] x86/entry: Move irqflags tracing to do_int80_syscall_32()
  2020-02-27 16:46   ` Alexandre Chartre
@ 2020-02-28 13:49     ` Thomas Gleixner
  0 siblings, 0 replies; 56+ messages in thread
From: Thomas Gleixner @ 2020-02-28 13:49 UTC (permalink / raw)
  To: Alexandre Chartre, LKML
  Cc: x86, Steven Rostedt, Brian Gerst, Juergen Gross, Paolo Bonzini,
	Arnd Bergmann

Alexandre Chartre <alexandre.chartre@oracle.com> writes:
> On 2/25/20 11:08 PM, Thomas Gleixner wrote:
>> which cleans up the ASM maze.
>> 
>> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
>> ---
>>   arch/x86/entry/common.c          |    8 +++++++-
>>   arch/x86/entry/entry_32.S        |    9 ++-------
>>   arch/x86/entry/entry_64_compat.S |   14 +++++---------
>>   3 files changed, 14 insertions(+), 17 deletions(-)
>> 
>> --- a/arch/x86/entry/common.c
>> +++ b/arch/x86/entry/common.c
>> @@ -333,6 +333,7 @@ void do_syscall_64_irqs_on(unsigned long
>>   {
>>   	syscall_entry_fixups();
>>   	do_syscall_64_irqs_on(nr, regs);
>> +	trace_hardirqs_on();
>>   }
>
> trace_hardirqs_on() is already called through syscall_return_slowpath()
> (from the previous patch):
>
> do_syscall_64()
>    -> do_syscall_64_irqs_on()
>      -> syscall_return_slowpath()
>        -> trace_hardirqs_on()

Duh, indeed.

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [patch 4/8] x86/entry: Move irq tracing on syscall entry to C-code
  2020-02-26 19:51         ` Thomas Gleixner
@ 2020-02-29 14:44           ` Thomas Gleixner
  2020-02-29 19:25             ` Andy Lutomirski
  0 siblings, 1 reply; 56+ messages in thread
From: Thomas Gleixner @ 2020-02-29 14:44 UTC (permalink / raw)
  To: Andy Lutomirski, Peter Zijlstra
  Cc: Andy Lutomirski, LKML, x86, Steven Rostedt, Brian Gerst,
	Juergen Gross, Paolo Bonzini, Arnd Bergmann, Paul E. McKenney

Thomas Gleixner <tglx@linutronix.de> writes:
> Andy Lutomirski <luto@amacapital.net> writes:
>>> On Feb 26, 2020, at 12:17 AM, Peter Zijlstra <peterz@infradead.org> wrote:
>>> On Tue, Feb 25, 2020 at 09:43:46PM -0800, Andy Lutomirski wrote:
>>>> Your earlier patches suggest quite strongly that tracing isn't safe
>>>> until enter_from_user_mode().  But trace_hardirqs_off() calls
>>>> trace_irq_disable_rcuidle(), which looks [0] like a tracepoint.
>>>> 
>>>> Did you perhaps mean to do this *after* enter_from_user_mode()?
>>> 
>>> aside from the fact that enter_from_user_mode() itself also has a
>>> tracepoint, the crucial detail is that we must not trace/kprobe the
>>> function calling this.
>>> 
>>> Specifically for #PF, because we need read_cr2() before this. See later
>>> patches.
>>
>> Indeed. I’m fine with this patch, but I still don’t understand what
>> the changelog is about.
>
> Yeah, the changelog is not really helpful. Let me fix that.
>
>> And I’m still rather baffled by most of the notrace annotations in the
>> series.
>
> As discussed on IRC, this might be too broad, but then I rather have the
> actual C-entry points neither traceable nor probable in general and
> relax this by calling functions which can be traced and probed.
>
> My rationale for this decision was that enter_from_user_mode() is marked
> notrace/noprobe as well, so I kept the protection scope the same as we
> had in the ASM maze which is marked noprobe already.

I have second thoughts vs. tracing in this context.

While the tracer itself seems to handle this correctly, what about
things like BPF programs which can be attached to tracepoints and
function trace entries?

Is that really safe _before_ context tracking has updated RCU state?

Thanks,

        tglx



^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [patch 4/8] x86/entry: Move irq tracing on syscall entry to C-code
  2020-02-29 14:44           ` Thomas Gleixner
@ 2020-02-29 19:25             ` Andy Lutomirski
  2020-02-29 23:58               ` Steven Rostedt
  0 siblings, 1 reply; 56+ messages in thread
From: Andy Lutomirski @ 2020-02-29 19:25 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Peter Zijlstra, Andy Lutomirski, LKML, x86, Steven Rostedt,
	Brian Gerst, Juergen Gross, Paolo Bonzini, Arnd Bergmann,
	Paul E. McKenney



> On Feb 29, 2020, at 6:44 AM, Thomas Gleixner <tglx@linutronix.de> wrote:
> 
> Thomas Gleixner <tglx@linutronix.de> writes:
>> Andy Lutomirski <luto@amacapital.net> writes:
>>>>> On Feb 26, 2020, at 12:17 AM, Peter Zijlstra <peterz@infradead.org> wrote:
>>>>> On Tue, Feb 25, 2020 at 09:43:46PM -0800, Andy Lutomirski wrote:
>>>>>> Your earlier patches suggest quite strongly that tracing isn't safe
>>>>>> until enter_from_user_mode().  But trace_hardirqs_off() calls
>>>>>> trace_irq_disable_rcuidle(), which looks [0] like a tracepoint.
>>>>>> 
>>>>>> Did you perhaps mean to do this *after* enter_from_user_mode()?
>>>>> 
>>>>> aside from the fact that enter_from_user_mode() itself also has a
>>>>> tracepoint, the crucial detail is that we must not trace/kprobe the
>>>>> function calling this.
>>>>> 
>>>>> Specifically for #PF, because we need read_cr2() before this. See later
>>>>> patches.
>>> 
>>> Indeed. I’m fine with this patch, but I still don’t understand what
>>> the changelog is about.
>> 
>> Yeah, the changelog is not really helpful. Let me fix that.
>> 
>>> And I’m still rather baffled by most of the notrace annotations in the
>>> series.
>> 
>> As discussed on IRC, this might be too broad, but then I rather have the
>> actual C-entry points neither traceable nor probable in general and
>> relax this by calling functions which can be traced and probed.
>> 
>> My rationale for this decision was that enter_from_user_mode() is marked
>> notrace/noprobe as well, so I kept the protection scope the same as we
>> had in the ASM maze which is marked noprobe already.
> 
> I have second thoughts vs. tracing in this context.
> 
> While the tracer itself seems to handle this correctly, what about
> things like BPF programs which can be attached to tracepoints and
> function trace entries?

I think that everything using the tracing code, including BPF, should either do its own rcuidle stuff or explicitly not execute if we’re not in CONTEXT_KERNEL.  That is, we probably need to patch BPF.

> 
> Is that really safe _before_ context tracking has updated RCU state?
> 
> Thanks,
> 
>        tglx
> 
> 

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [patch 4/8] x86/entry: Move irq tracing on syscall entry to C-code
  2020-02-29 19:25             ` Andy Lutomirski
@ 2020-02-29 23:58               ` Steven Rostedt
  2020-03-01 10:16                 ` Thomas Gleixner
  0 siblings, 1 reply; 56+ messages in thread
From: Steven Rostedt @ 2020-02-29 23:58 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Thomas Gleixner, Peter Zijlstra, Andy Lutomirski, LKML, x86,
	Brian Gerst, Juergen Gross, Paolo Bonzini, Arnd Bergmann,
	Paul E. McKenney

On Sat, 29 Feb 2020 11:25:24 -0800
Andy Lutomirski <luto@amacapital.net> wrote:

> > While the tracer itself seems to handle this correctly, what about
> > things like BPF programs which can be attached to tracepoints and
> > function trace entries?  
> 
> I think that everything using the tracing code, including BPF, should
> either do its own rcuidle stuff or explicitly not execute if we’re
> not in CONTEXT_KERNEL.  That is, we probably need to patch BPF.

That's basically the route we are taking.

-- Steve

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [patch 4/8] x86/entry: Move irq tracing on syscall entry to C-code
  2020-02-29 23:58               ` Steven Rostedt
@ 2020-03-01 10:16                 ` Thomas Gleixner
  2020-03-01 14:37                   ` Andy Lutomirski
  0 siblings, 1 reply; 56+ messages in thread
From: Thomas Gleixner @ 2020-03-01 10:16 UTC (permalink / raw)
  To: Steven Rostedt, Andy Lutomirski
  Cc: Peter Zijlstra, Andy Lutomirski, LKML, x86, Brian Gerst,
	Juergen Gross, Paolo Bonzini, Arnd Bergmann, Paul E. McKenney

Steven Rostedt <rostedt@goodmis.org> writes:

> On Sat, 29 Feb 2020 11:25:24 -0800
> Andy Lutomirski <luto@amacapital.net> wrote:
>
>> > While the tracer itself seems to handle this correctly, what about
>> > things like BPF programs which can be attached to tracepoints and
>> > function trace entries?  
>> 
>> I think that everything using the tracing code, including BPF, should
>> either do its own rcuidle stuff or explicitly not execute if we’re
>> not in CONTEXT_KERNEL.  That is, we probably need to patch BPF.
>
> That's basically the route we are taking.

Ok, but for the time being anything before/after CONTEXT_KERNEL is unsafe
except trace_hardirq_off/on() as those trace functions do not allow to
attach anything AFAICT.

Thanks,

        tglx

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [patch 4/8] x86/entry: Move irq tracing on syscall entry to C-code
  2020-03-01 10:16                 ` Thomas Gleixner
@ 2020-03-01 14:37                   ` Andy Lutomirski
  2020-03-01 15:21                     ` Thomas Gleixner
  0 siblings, 1 reply; 56+ messages in thread
From: Andy Lutomirski @ 2020-03-01 14:37 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Steven Rostedt, Peter Zijlstra, Andy Lutomirski, LKML, x86,
	Brian Gerst, Juergen Gross, Paolo Bonzini, Arnd Bergmann,
	Paul E. McKenney



> On Mar 1, 2020, at 2:16 AM, Thomas Gleixner <tglx@linutronix.de> wrote:
> 
> Steven Rostedt <rostedt@goodmis.org> writes:
> 
>> On Sat, 29 Feb 2020 11:25:24 -0800
>> Andy Lutomirski <luto@amacapital.net> wrote:
>> 
>>>> While the tracer itself seems to handle this correctly, what about
>>>> things like BPF programs which can be attached to tracepoints and
>>>> function trace entries?  
>>> 
>>> I think that everything using the tracing code, including BPF, should
>>> either do its own rcuidle stuff or explicitly not execute if we’re
>>> not in CONTEXT_KERNEL.  That is, we probably need to patch BPF.
>> 
>> That's basically the route we are taking.
> 
> Ok, but for the time being anything before/after CONTEXT_KERNEL is unsafe
> except trace_hardirq_off/on() as those trace functions do not allow to
> attach anything AFAICT.

Can you point to whatever makes those particular functions special?  I failed to follow the macro maze.

> 
> Thanks,
> 
>        tglx

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [patch 4/8] x86/entry: Move irq tracing on syscall entry to C-code
  2020-03-01 14:37                   ` Andy Lutomirski
@ 2020-03-01 15:21                     ` Thomas Gleixner
  2020-03-01 16:00                       ` Andy Lutomirski
  2020-03-01 18:20                       ` Steven Rostedt
  0 siblings, 2 replies; 56+ messages in thread
From: Thomas Gleixner @ 2020-03-01 15:21 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Steven Rostedt, Peter Zijlstra, Andy Lutomirski, LKML, x86,
	Brian Gerst, Juergen Gross, Paolo Bonzini, Arnd Bergmann,
	Paul E. McKenney

Andy Lutomirski <luto@amacapital.net> writes:
>> On Mar 1, 2020, at 2:16 AM, Thomas Gleixner <tglx@linutronix.de> wrote:
>> Ok, but for the time being anything before/after CONTEXT_KERNEL is unsafe
>> except trace_hardirq_off/on() as those trace functions do not allow to
>> attach anything AFAICT.
>
> Can you point to whatever makes those particular functions special?  I
> failed to follow the macro maze.

Those are not tracepoints and not going through the macro maze. See
kernel/trace/trace_preemptirq.c

Thanks,

        tglx

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [patch 4/8] x86/entry: Move irq tracing on syscall entry to C-code
  2020-03-01 15:21                     ` Thomas Gleixner
@ 2020-03-01 16:00                       ` Andy Lutomirski
  2020-03-01 18:12                         ` Thomas Gleixner
  2020-03-01 18:23                         ` Steven Rostedt
  2020-03-01 18:20                       ` Steven Rostedt
  1 sibling, 2 replies; 56+ messages in thread
From: Andy Lutomirski @ 2020-03-01 16:00 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Steven Rostedt, Peter Zijlstra, Andy Lutomirski, LKML, X86 ML,
	Brian Gerst, Juergen Gross, Paolo Bonzini, Arnd Bergmann,
	Paul E. McKenney

On Sun, Mar 1, 2020 at 7:21 AM Thomas Gleixner <tglx@linutronix.de> wrote:
>
> Andy Lutomirski <luto@amacapital.net> writes:
> >> On Mar 1, 2020, at 2:16 AM, Thomas Gleixner <tglx@linutronix.de> wrote:
> >> Ok, but for the time being anything before/after CONTEXT_KERNEL is unsafe
> >> except trace_hardirq_off/on() as those trace functions do not allow to
> >> attach anything AFAICT.
> >
> > Can you point to whatever makes those particular functions special?  I
> > failed to follow the macro maze.
>
> Those are not tracepoints and not going through the macro maze. See
> kernel/trace/trace_preemptirq.c

That has:

void trace_hardirqs_on(void)
{
        if (this_cpu_read(tracing_irq_cpu)) {
                if (!in_nmi())
                        trace_irq_enable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
                tracer_hardirqs_on(CALLER_ADDR0, CALLER_ADDR1);
                this_cpu_write(tracing_irq_cpu, 0);
        }

        lockdep_hardirqs_on(CALLER_ADDR0);
}
EXPORT_SYMBOL(trace_hardirqs_on);
NOKPROBE_SYMBOL(trace_hardirqs_on);

But this calls trace_irq_enable_rcuidle(), and that's the part of the
macro maze I got lost in.  I found:

#ifdef CONFIG_TRACE_IRQFLAGS
DEFINE_EVENT(preemptirq_template, irq_disable,
             TP_PROTO(unsigned long ip, unsigned long parent_ip),
             TP_ARGS(ip, parent_ip));

DEFINE_EVENT(preemptirq_template, irq_enable,
             TP_PROTO(unsigned long ip, unsigned long parent_ip),
             TP_ARGS(ip, parent_ip));
#else
#define trace_irq_enable(...)
#define trace_irq_disable(...)
#define trace_irq_enable_rcuidle(...)
#define trace_irq_disable_rcuidle(...)
#endif

But the DEFINE_EVENT doesn't have the "_rcuidle" part.  And that's
where I got lost in the macro maze.  I looked at the gcc asm output,
and there is, indeed:

# ./include/trace/events/preemptirq.h:40:
DEFINE_EVENT(preemptirq_template, irq_enable,

with a bunch of asm magic that looks like it's probably a tracepoint.
I still don't quite see where the "_rcuidle" went.

But I also don't see why this is any different from any other tracepoint.

--Andy

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [patch 4/8] x86/entry: Move irq tracing on syscall entry to C-code
  2020-03-01 16:00                       ` Andy Lutomirski
@ 2020-03-01 18:12                         ` Thomas Gleixner
  2020-03-01 18:26                           ` Paul E. McKenney
  2020-03-01 18:23                         ` Steven Rostedt
  1 sibling, 1 reply; 56+ messages in thread
From: Thomas Gleixner @ 2020-03-01 18:12 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Steven Rostedt, Peter Zijlstra, Andy Lutomirski, LKML, X86 ML,
	Brian Gerst, Juergen Gross, Paolo Bonzini, Arnd Bergmann,
	Paul E. McKenney

Andy Lutomirski <luto@kernel.org> writes:
> On Sun, Mar 1, 2020 at 7:21 AM Thomas Gleixner <tglx@linutronix.de> wrote:
>> Andy Lutomirski <luto@amacapital.net> writes:
>> >> On Mar 1, 2020, at 2:16 AM, Thomas Gleixner <tglx@linutronix.de> wrote:
>> >> Ok, but for the time being anything before/after CONTEXT_KERNEL is unsafe
>> >> except trace_hardirq_off/on() as those trace functions do not allow to
>> >> attach anything AFAICT.
>> >
>> > Can you point to whatever makes those particular functions special?  I
>> > failed to follow the macro maze.
>>
>> Those are not tracepoints and not going through the macro maze. See
>> kernel/trace/trace_preemptirq.c
>
> That has:
>
> void trace_hardirqs_on(void)
> {
>         if (this_cpu_read(tracing_irq_cpu)) {
>                 if (!in_nmi())
>                         trace_irq_enable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
>                 tracer_hardirqs_on(CALLER_ADDR0, CALLER_ADDR1);
>                 this_cpu_write(tracing_irq_cpu, 0);
>         }
>
>         lockdep_hardirqs_on(CALLER_ADDR0);
> }
> EXPORT_SYMBOL(trace_hardirqs_on);
> NOKPROBE_SYMBOL(trace_hardirqs_on);
>
> But this calls trace_irq_enable_rcuidle(), and that's the part of the
> macro maze I got lost in.  I found:
>
> #ifdef CONFIG_TRACE_IRQFLAGS
> DEFINE_EVENT(preemptirq_template, irq_disable,
>              TP_PROTO(unsigned long ip, unsigned long parent_ip),
>              TP_ARGS(ip, parent_ip));
>
> DEFINE_EVENT(preemptirq_template, irq_enable,
>              TP_PROTO(unsigned long ip, unsigned long parent_ip),
>              TP_ARGS(ip, parent_ip));
> #else
> #define trace_irq_enable(...)
> #define trace_irq_disable(...)
> #define trace_irq_enable_rcuidle(...)
> #define trace_irq_disable_rcuidle(...)
> #endif
>
> But the DEFINE_EVENT doesn't have the "_rcuidle" part.  And that's
> where I got lost in the macro maze.  I looked at the gcc asm output,
> and there is, indeed:

DEFINE_EVENT
  DECLARE_TRACE
    __DECLARE_TRACE
       __DECLARE_TRACE_RCU
         static inline void trace_##name##_rcuidle(proto)
            __DO_TRACE
               if (rcuidle)
                  ....

> But I also don't see why this is any different from any other tracepoint.

Indeed. I took a wrong turn at some point in the macro jungle :)

So tracing itself is fine, but then if you have probes or bpf programs
attached to a tracepoint these use rcu_read_lock()/unlock() which is
obviously wrong in rcuidle context.

Thanks,

        tglx

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [patch 4/8] x86/entry: Move irq tracing on syscall entry to C-code
  2020-03-01 15:21                     ` Thomas Gleixner
  2020-03-01 16:00                       ` Andy Lutomirski
@ 2020-03-01 18:20                       ` Steven Rostedt
  1 sibling, 0 replies; 56+ messages in thread
From: Steven Rostedt @ 2020-03-01 18:20 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Andy Lutomirski, Peter Zijlstra, Andy Lutomirski, LKML, x86,
	Brian Gerst, Juergen Gross, Paolo Bonzini, Arnd Bergmann,
	Paul E. McKenney, Joel Fernandes

On Sun, 01 Mar 2020 16:21:15 +0100
Thomas Gleixner <tglx@linutronix.de> wrote:

> Andy Lutomirski <luto@amacapital.net> writes:
> >> On Mar 1, 2020, at 2:16 AM, Thomas Gleixner <tglx@linutronix.de> wrote:
> >> Ok, but for the time being anything before/after CONTEXT_KERNEL is unsafe
> >> except trace_hardirq_off/on() as those trace functions do not allow to
> >> attach anything AFAICT.  
> >
> > Can you point to whatever makes those particular functions special?  I
> > failed to follow the macro maze.  
> 
> Those are not tracepoints and not going through the macro maze. See
> kernel/trace/trace_preemptirq.c

For the latency tracing, they do call into the tracing infrastructure,
not just lockdep. And Joel Fernandes did try to make these into trace
events as well.

-- Steve

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [patch 4/8] x86/entry: Move irq tracing on syscall entry to C-code
  2020-03-01 16:00                       ` Andy Lutomirski
  2020-03-01 18:12                         ` Thomas Gleixner
@ 2020-03-01 18:23                         ` Steven Rostedt
  1 sibling, 0 replies; 56+ messages in thread
From: Steven Rostedt @ 2020-03-01 18:23 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Thomas Gleixner, Peter Zijlstra, LKML, X86 ML, Brian Gerst,
	Juergen Gross, Paolo Bonzini, Arnd Bergmann, Paul E. McKenney

On Sun, 1 Mar 2020 08:00:01 -0800
Andy Lutomirski <luto@kernel.org> wrote:

> But the DEFINE_EVENT doesn't have the "_rcuidle" part.  And that's
> where I got lost in the macro maze.  I looked at the gcc asm output,
> and there is, indeed:
> 
> # ./include/trace/events/preemptirq.h:40:
> DEFINE_EVENT(preemptirq_template, irq_enable,
> 
> with a bunch of asm magic that looks like it's probably a tracepoint.
> I still don't quite see where the "_rcuidle" went.

See the code in include/linux/tracepoint.h and search for "rcuidle"
there. All defined trace events get a "_rcuidle" version, but as it is
declared a static inline, then they only show up if they are used.

-- Steve

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [patch 4/8] x86/entry: Move irq tracing on syscall entry to C-code
  2020-03-01 18:12                         ` Thomas Gleixner
@ 2020-03-01 18:26                           ` Paul E. McKenney
  2020-03-01 18:54                             ` Andy Lutomirski
  0 siblings, 1 reply; 56+ messages in thread
From: Paul E. McKenney @ 2020-03-01 18:26 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Andy Lutomirski, Steven Rostedt, Peter Zijlstra, LKML, X86 ML,
	Brian Gerst, Juergen Gross, Paolo Bonzini, Arnd Bergmann

On Sun, Mar 01, 2020 at 07:12:25PM +0100, Thomas Gleixner wrote:
> Andy Lutomirski <luto@kernel.org> writes:
> > On Sun, Mar 1, 2020 at 7:21 AM Thomas Gleixner <tglx@linutronix.de> wrote:
> >> Andy Lutomirski <luto@amacapital.net> writes:
> >> >> On Mar 1, 2020, at 2:16 AM, Thomas Gleixner <tglx@linutronix.de> wrote:
> >> >> Ok, but for the time being anything before/after CONTEXT_KERNEL is unsafe
> >> >> except trace_hardirq_off/on() as those trace functions do not allow to
> >> >> attach anything AFAICT.
> >> >
> >> > Can you point to whatever makes those particular functions special?  I
> >> > failed to follow the macro maze.
> >>
> >> Those are not tracepoints and not going through the macro maze. See
> >> kernel/trace/trace_preemptirq.c
> >
> > That has:
> >
> > void trace_hardirqs_on(void)
> > {
> >         if (this_cpu_read(tracing_irq_cpu)) {
> >                 if (!in_nmi())
> >                         trace_irq_enable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
> >                 tracer_hardirqs_on(CALLER_ADDR0, CALLER_ADDR1);
> >                 this_cpu_write(tracing_irq_cpu, 0);
> >         }
> >
> >         lockdep_hardirqs_on(CALLER_ADDR0);
> > }
> > EXPORT_SYMBOL(trace_hardirqs_on);
> > NOKPROBE_SYMBOL(trace_hardirqs_on);
> >
> > But this calls trace_irq_enable_rcuidle(), and that's the part of the
> > macro maze I got lost in.  I found:
> >
> > #ifdef CONFIG_TRACE_IRQFLAGS
> > DEFINE_EVENT(preemptirq_template, irq_disable,
> >              TP_PROTO(unsigned long ip, unsigned long parent_ip),
> >              TP_ARGS(ip, parent_ip));
> >
> > DEFINE_EVENT(preemptirq_template, irq_enable,
> >              TP_PROTO(unsigned long ip, unsigned long parent_ip),
> >              TP_ARGS(ip, parent_ip));
> > #else
> > #define trace_irq_enable(...)
> > #define trace_irq_disable(...)
> > #define trace_irq_enable_rcuidle(...)
> > #define trace_irq_disable_rcuidle(...)
> > #endif
> >
> > But the DEFINE_EVENT doesn't have the "_rcuidle" part.  And that's
> > where I got lost in the macro maze.  I looked at the gcc asm output,
> > and there is, indeed:
> 
> DEFINE_EVENT
>   DECLARE_TRACE
>     __DECLARE_TRACE
>        __DECLARE_TRACE_RCU
>          static inline void trace_##name##_rcuidle(proto)
>             __DO_TRACE
>                if (rcuidle)
>                   ....
> 
> > But I also don't see why this is any different from any other tracepoint.
> 
> Indeed. I took a wrong turn at some point in the macro jungle :)
> 
> So tracing itself is fine, but then if you have probes or bpf programs
> attached to a tracepoint these use rcu_read_lock()/unlock() which is
> obviously wrong in rcuidle context.

Definitely, any such code needs to use tricks similar to that of the
tracing code.  Or instead use something like SRCU, which is OK with
readers from idle.  Or use something like Steve Rostedt's workqueue-based
approach, though please be very careful with this latter, lest the
battery-powered embedded guys come after you for waking up idle CPUs
too often.  ;-)

							Thanx, Paul

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [patch 4/8] x86/entry: Move irq tracing on syscall entry to C-code
  2020-03-01 18:26                           ` Paul E. McKenney
@ 2020-03-01 18:54                             ` Andy Lutomirski
  2020-03-01 19:30                               ` Paul E. McKenney
                                                 ` (2 more replies)
  0 siblings, 3 replies; 56+ messages in thread
From: Andy Lutomirski @ 2020-03-01 18:54 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Thomas Gleixner, Andy Lutomirski, Steven Rostedt, Peter Zijlstra,
	LKML, X86 ML, Brian Gerst, Juergen Gross, Paolo Bonzini,
	Arnd Bergmann

On Sun, Mar 1, 2020 at 10:26 AM Paul E. McKenney <paulmck@kernel.org> wrote:
>
> On Sun, Mar 01, 2020 at 07:12:25PM +0100, Thomas Gleixner wrote:
> > Andy Lutomirski <luto@kernel.org> writes:
> > > On Sun, Mar 1, 2020 at 7:21 AM Thomas Gleixner <tglx@linutronix.de> wrote:
> > >> Andy Lutomirski <luto@amacapital.net> writes:
> > >> >> On Mar 1, 2020, at 2:16 AM, Thomas Gleixner <tglx@linutronix.de> wrote:
> > >> >> Ok, but for the time being anything before/after CONTEXT_KERNEL is unsafe
> > >> >> except trace_hardirq_off/on() as those trace functions do not allow to
> > >> >> attach anything AFAICT.
> > >> >
> > >> > Can you point to whatever makes those particular functions special?  I
> > >> > failed to follow the macro maze.
> > >>
> > >> Those are not tracepoints and not going through the macro maze. See
> > >> kernel/trace/trace_preemptirq.c
> > >
> > > That has:
> > >
> > > void trace_hardirqs_on(void)
> > > {
> > >         if (this_cpu_read(tracing_irq_cpu)) {
> > >                 if (!in_nmi())
> > >                         trace_irq_enable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
> > >                 tracer_hardirqs_on(CALLER_ADDR0, CALLER_ADDR1);
> > >                 this_cpu_write(tracing_irq_cpu, 0);
> > >         }
> > >
> > >         lockdep_hardirqs_on(CALLER_ADDR0);
> > > }
> > > EXPORT_SYMBOL(trace_hardirqs_on);
> > > NOKPROBE_SYMBOL(trace_hardirqs_on);
> > >
> > > But this calls trace_irq_enable_rcuidle(), and that's the part of the
> > > macro maze I got lost in.  I found:
> > >
> > > #ifdef CONFIG_TRACE_IRQFLAGS
> > > DEFINE_EVENT(preemptirq_template, irq_disable,
> > >              TP_PROTO(unsigned long ip, unsigned long parent_ip),
> > >              TP_ARGS(ip, parent_ip));
> > >
> > > DEFINE_EVENT(preemptirq_template, irq_enable,
> > >              TP_PROTO(unsigned long ip, unsigned long parent_ip),
> > >              TP_ARGS(ip, parent_ip));
> > > #else
> > > #define trace_irq_enable(...)
> > > #define trace_irq_disable(...)
> > > #define trace_irq_enable_rcuidle(...)
> > > #define trace_irq_disable_rcuidle(...)
> > > #endif
> > >
> > > But the DEFINE_EVENT doesn't have the "_rcuidle" part.  And that's
> > > where I got lost in the macro maze.  I looked at the gcc asm output,
> > > and there is, indeed:
> >
> > DEFINE_EVENT
> >   DECLARE_TRACE
> >     __DECLARE_TRACE
> >        __DECLARE_TRACE_RCU
> >          static inline void trace_##name##_rcuidle(proto)
> >             __DO_TRACE
> >                if (rcuidle)
> >                   ....
> >
> > > But I also don't see why this is any different from any other tracepoint.
> >
> > Indeed. I took a wrong turn at some point in the macro jungle :)
> >
> > So tracing itself is fine, but then if you have probes or bpf programs
> > attached to a tracepoint these use rcu_read_lock()/unlock() which is
> > obviosly wrong in rcuidle context.
>
> Definitely, any such code needs to use tricks similar to that of the
> tracing code.  Or instead use something like SRCU, which is OK with
> readers from idle.  Or use something like Steve Rostedt's workqueue-based
> approach, though please be very careful with this latter, lest the
> battery-powered embedded guys come after you for waking up idle CPUs
> too often.  ;-)
>

Are we okay if we somehow ensure that all the entry code before
enter_from_user_mode() only does rcuidle tracing variants and has
kprobes off?  Including for BPF use cases?

It would be *really* nice if we could statically verify this, as has
been mentioned elsewhere in the thread.  It would also probably be
good enough if we could do it at runtime.  Maybe with lockdep on, we
verify rcu state in tracepoints even if the tracepoint isn't active?
And we could plausibly have some widget that could inject something
into *every* kprobeable function to check rcu state.

--Andy

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [patch 4/8] x86/entry: Move irq tracing on syscall entry to C-code
  2020-03-01 18:54                             ` Andy Lutomirski
@ 2020-03-01 19:30                               ` Paul E. McKenney
  2020-03-01 19:39                                 ` Andy Lutomirski
  2020-03-02  1:10                               ` Joel Fernandes
  2020-03-02  8:10                               ` Thomas Gleixner
  2 siblings, 1 reply; 56+ messages in thread
From: Paul E. McKenney @ 2020-03-01 19:30 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Thomas Gleixner, Steven Rostedt, Peter Zijlstra, LKML, X86 ML,
	Brian Gerst, Juergen Gross, Paolo Bonzini, Arnd Bergmann

On Sun, Mar 01, 2020 at 10:54:23AM -0800, Andy Lutomirski wrote:
> On Sun, Mar 1, 2020 at 10:26 AM Paul E. McKenney <paulmck@kernel.org> wrote:
> >
> > On Sun, Mar 01, 2020 at 07:12:25PM +0100, Thomas Gleixner wrote:
> > > Andy Lutomirski <luto@kernel.org> writes:
> > > > On Sun, Mar 1, 2020 at 7:21 AM Thomas Gleixner <tglx@linutronix.de> wrote:
> > > >> Andy Lutomirski <luto@amacapital.net> writes:
> > > >> >> On Mar 1, 2020, at 2:16 AM, Thomas Gleixner <tglx@linutronix.de> wrote:
> > > >> >> Ok, but for the time being anything before/after CONTEXT_KERNEL is unsafe
> > > >> >> except trace_hardirq_off/on() as those trace functions do not allow to
> > > >> >> attach anything AFAICT.
> > > >> >
> > > >> > Can you point to whatever makes those particular functions special?  I
> > > >> > failed to follow the macro maze.
> > > >>
> > > >> Those are not tracepoints and not going through the macro maze. See
> > > >> kernel/trace/trace_preemptirq.c
> > > >
> > > > That has:
> > > >
> > > > void trace_hardirqs_on(void)
> > > > {
> > > >         if (this_cpu_read(tracing_irq_cpu)) {
> > > >                 if (!in_nmi())
> > > >                         trace_irq_enable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
> > > >                 tracer_hardirqs_on(CALLER_ADDR0, CALLER_ADDR1);
> > > >                 this_cpu_write(tracing_irq_cpu, 0);
> > > >         }
> > > >
> > > >         lockdep_hardirqs_on(CALLER_ADDR0);
> > > > }
> > > > EXPORT_SYMBOL(trace_hardirqs_on);
> > > > NOKPROBE_SYMBOL(trace_hardirqs_on);
> > > >
> > > > But this calls trace_irq_enable_rcuidle(), and that's the part of the
> > > > macro maze I got lost in.  I found:
> > > >
> > > > #ifdef CONFIG_TRACE_IRQFLAGS
> > > > DEFINE_EVENT(preemptirq_template, irq_disable,
> > > >              TP_PROTO(unsigned long ip, unsigned long parent_ip),
> > > >              TP_ARGS(ip, parent_ip));
> > > >
> > > > DEFINE_EVENT(preemptirq_template, irq_enable,
> > > >              TP_PROTO(unsigned long ip, unsigned long parent_ip),
> > > >              TP_ARGS(ip, parent_ip));
> > > > #else
> > > > #define trace_irq_enable(...)
> > > > #define trace_irq_disable(...)
> > > > #define trace_irq_enable_rcuidle(...)
> > > > #define trace_irq_disable_rcuidle(...)
> > > > #endif
> > > >
> > > > But the DEFINE_EVENT doesn't have the "_rcuidle" part.  And that's
> > > > where I got lost in the macro maze.  I looked at the gcc asm output,
> > > > and there is, indeed:
> > >
> > > DEFINE_EVENT
> > >   DECLARE_TRACE
> > >     __DECLARE_TRACE
> > >        __DECLARE_TRACE_RCU
> > >          static inline void trace_##name##_rcuidle(proto)
> > >             __DO_TRACE
> > >                if (rcuidle)
> > >                   ....
> > >
> > > > But I also don't see why this is any different from any other tracepoint.
> > >
> > > Indeed. I took a wrong turn at some point in the macro jungle :)
> > >
> > > So tracing itself is fine, but then if you have probes or bpf programs
> > > attached to a tracepoint these use rcu_read_lock()/unlock() which is
> > > obviosly wrong in rcuidle context.
> >
> > Definitely, any such code needs to use tricks similar to that of the
> > tracing code.  Or instead use something like SRCU, which is OK with
> > readers from idle.  Or use something like Steve Rostedt's workqueue-based
> > approach, though please be very careful with this latter, lest the
> > battery-powered embedded guys come after you for waking up idle CPUs
> > too often.  ;-)
> 
> Are we okay if we somehow ensure that all the entry code before
> enter_from_user_mode() only does rcuidle tracing variants and has
> kprobes off?  Including for BPF use cases?

That would work, though if BPF used SRCU instead of RCU, this would
be unnecessary.  Sadly, SRCU has full memory barriers in each of
srcu_read_lock() and srcu_read_unlock(), but we are working on it.
(As always, no promises!)

> It would be *really* nice if we could statically verify this, as has
> been mentioned elsewhere in the thread.  It would also probably be
> good enough if we could do it at runtime.  Maybe with lockdep on, we
> verify rcu state in tracepoints even if the tracepoint isn't active?
> And we could plausibly have some widget that could inject something
> into *every* kprobeable function to check rcu state.

Or just have at least one testing step that activates all tracepoints,
but with lockdep enabled?

							Thanx, Paul

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [patch 4/8] x86/entry: Move irq tracing on syscall entry to C-code
  2020-03-01 19:30                               ` Paul E. McKenney
@ 2020-03-01 19:39                                 ` Andy Lutomirski
  2020-03-01 20:18                                   ` Paul E. McKenney
  2020-03-02  0:35                                   ` Steven Rostedt
  0 siblings, 2 replies; 56+ messages in thread
From: Andy Lutomirski @ 2020-03-01 19:39 UTC (permalink / raw)
  To: paulmck
  Cc: Andy Lutomirski, Thomas Gleixner, Steven Rostedt, Peter Zijlstra,
	LKML, X86 ML, Brian Gerst, Juergen Gross, Paolo Bonzini,
	Arnd Bergmann



> On Mar 1, 2020, at 11:30 AM, Paul E. McKenney <paulmck@kernel.org> wrote:
> 
> On Sun, Mar 01, 2020 at 10:54:23AM -0800, Andy Lutomirski wrote:
>>> On Sun, Mar 1, 2020 at 10:26 AM Paul E. McKenney <paulmck@kernel.org> wrote:
>>> 
>>> On Sun, Mar 01, 2020 at 07:12:25PM +0100, Thomas Gleixner wrote:
>>>> Andy Lutomirski <luto@kernel.org> writes:
>>>>> On Sun, Mar 1, 2020 at 7:21 AM Thomas Gleixner <tglx@linutronix.de> wrote:
>>>>>> Andy Lutomirski <luto@amacapital.net> writes:
>>>>>>>> On Mar 1, 2020, at 2:16 AM, Thomas Gleixner <tglx@linutronix.de> wrote:
>>>>>>>> Ok, but for the time being anything before/after CONTEXT_KERNEL is unsafe
>>>>>>>> except trace_hardirq_off/on() as those trace functions do not allow to
>>>>>>>> attach anything AFAICT.
>>>>>>> 
>>>>>>> Can you point to whatever makes those particular functions special?  I
>>>>>>> failed to follow the macro maze.
>>>>>> 
>>>>>> Those are not tracepoints and not going through the macro maze. See
>>>>>> kernel/trace/trace_preemptirq.c
>>>>> 
>>>>> That has:
>>>>> 
>>>>> void trace_hardirqs_on(void)
>>>>> {
>>>>>        if (this_cpu_read(tracing_irq_cpu)) {
>>>>>                if (!in_nmi())
>>>>>                        trace_irq_enable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
>>>>>                tracer_hardirqs_on(CALLER_ADDR0, CALLER_ADDR1);
>>>>>                this_cpu_write(tracing_irq_cpu, 0);
>>>>>        }
>>>>> 
>>>>>        lockdep_hardirqs_on(CALLER_ADDR0);
>>>>> }
>>>>> EXPORT_SYMBOL(trace_hardirqs_on);
>>>>> NOKPROBE_SYMBOL(trace_hardirqs_on);
>>>>> 
>>>>> But this calls trace_irq_enable_rcuidle(), and that's the part of the
>>>>> macro maze I got lost in.  I found:
>>>>> 
>>>>> #ifdef CONFIG_TRACE_IRQFLAGS
>>>>> DEFINE_EVENT(preemptirq_template, irq_disable,
>>>>>             TP_PROTO(unsigned long ip, unsigned long parent_ip),
>>>>>             TP_ARGS(ip, parent_ip));
>>>>> 
>>>>> DEFINE_EVENT(preemptirq_template, irq_enable,
>>>>>             TP_PROTO(unsigned long ip, unsigned long parent_ip),
>>>>>             TP_ARGS(ip, parent_ip));
>>>>> #else
>>>>> #define trace_irq_enable(...)
>>>>> #define trace_irq_disable(...)
>>>>> #define trace_irq_enable_rcuidle(...)
>>>>> #define trace_irq_disable_rcuidle(...)
>>>>> #endif
>>>>> 
>>>>> But the DEFINE_EVENT doesn't have the "_rcuidle" part.  And that's
>>>>> where I got lost in the macro maze.  I looked at the gcc asm output,
>>>>> and there is, indeed:
>>>> 
>>>> DEFINE_EVENT
>>>>  DECLARE_TRACE
>>>>    __DECLARE_TRACE
>>>>       __DECLARE_TRACE_RCU
>>>>         static inline void trace_##name##_rcuidle(proto)
>>>>            __DO_TRACE
>>>>               if (rcuidle)
>>>>                  ....
>>>> 
>>>>> But I also don't see why this is any different from any other tracepoint.
>>>> 
>>>> Indeed. I took a wrong turn at some point in the macro jungle :)
>>>> 
>>>> So tracing itself is fine, but then if you have probes or bpf programs
>>>> attached to a tracepoint these use rcu_read_lock()/unlock() which is
>>>> obviously wrong in rcuidle context.
>>> 
>>> Definitely, any such code needs to use tricks similar to that of the
>>> tracing code.  Or instead use something like SRCU, which is OK with
>>> readers from idle.  Or use something like Steve Rostedt's workqueue-based
>>> approach, though please be very careful with this latter, lest the
>>> battery-powered embedded guys come after you for waking up idle CPUs
>>> too often.  ;-)
>> 
>> Are we okay if we somehow ensure that all the entry code before
>> enter_from_user_mode() only does rcuidle tracing variants and has
>> kprobes off?  Including for BPF use cases?
> 
> That would work, though if BPF used SRCU instead of RCU, this would
> be unnecessary.  Sadly, SRCU has full memory barriers in each of
> srcu_read_lock() and srcu_read_unlock(), but we are working on it.
> (As always, no promises!)
> 
>> It would be *really* nice if we could statically verify this, as has
>> been mentioned elsewhere in the thread.  It would also probably be
>> good enough if we could do it at runtime.  Maybe with lockdep on, we
>> verify rcu state in tracepoints even if the tracepoint isn't active?
>> And we could plausibly have some widget that could inject something
>> into *every* kprobeable function to check rcu state.
> 
> Or just have at least one testing step that activates all tracepoints,
> but with lockdep enabled?

Also kprobe.

I don’t suppose we could make notrace imply nokprobe.  Then all kprobeable functions would also have entry/exit tracepoints, right?

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [patch 4/8] x86/entry: Move irq tracing on syscall entry to C-code
  2020-03-01 19:39                                 ` Andy Lutomirski
@ 2020-03-01 20:18                                   ` Paul E. McKenney
  2020-03-02  0:35                                   ` Steven Rostedt
  1 sibling, 0 replies; 56+ messages in thread
From: Paul E. McKenney @ 2020-03-01 20:18 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Andy Lutomirski, Thomas Gleixner, Steven Rostedt, Peter Zijlstra,
	LKML, X86 ML, Brian Gerst, Juergen Gross, Paolo Bonzini,
	Arnd Bergmann

On Sun, Mar 01, 2020 at 11:39:42AM -0800, Andy Lutomirski wrote:
> 
> 
> > On Mar 1, 2020, at 11:30 AM, Paul E. McKenney <paulmck@kernel.org> wrote:
> > 
> > On Sun, Mar 01, 2020 at 10:54:23AM -0800, Andy Lutomirski wrote:
> >>> On Sun, Mar 1, 2020 at 10:26 AM Paul E. McKenney <paulmck@kernel.org> wrote:
> >>> 
> >>> On Sun, Mar 01, 2020 at 07:12:25PM +0100, Thomas Gleixner wrote:
> >>>> Andy Lutomirski <luto@kernel.org> writes:
> >>>>> On Sun, Mar 1, 2020 at 7:21 AM Thomas Gleixner <tglx@linutronix.de> wrote:
> >>>>>> Andy Lutomirski <luto@amacapital.net> writes:
> >>>>>>>> On Mar 1, 2020, at 2:16 AM, Thomas Gleixner <tglx@linutronix.de> wrote:
> >>>>>>>> Ok, but for the time being anything before/after CONTEXT_KERNEL is unsafe
> >>>>>>>> except trace_hardirq_off/on() as those trace functions do not allow to
> >>>>>>>> attach anything AFAICT.
> >>>>>>> 
> >>>>>>> Can you point to whatever makes those particular functions special?  I
> >>>>>>> failed to follow the macro maze.
> >>>>>> 
> >>>>>> Those are not tracepoints and not going through the macro maze. See
> >>>>>> kernel/trace/trace_preemptirq.c
> >>>>> 
> >>>>> That has:
> >>>>> 
> >>>>> void trace_hardirqs_on(void)
> >>>>> {
> >>>>>        if (this_cpu_read(tracing_irq_cpu)) {
> >>>>>                if (!in_nmi())
> >>>>>                        trace_irq_enable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
> >>>>>                tracer_hardirqs_on(CALLER_ADDR0, CALLER_ADDR1);
> >>>>>                this_cpu_write(tracing_irq_cpu, 0);
> >>>>>        }
> >>>>> 
> >>>>>        lockdep_hardirqs_on(CALLER_ADDR0);
> >>>>> }
> >>>>> EXPORT_SYMBOL(trace_hardirqs_on);
> >>>>> NOKPROBE_SYMBOL(trace_hardirqs_on);
> >>>>> 
> >>>>> But this calls trace_irq_enable_rcuidle(), and that's the part of the
> >>>>> macro maze I got lost in.  I found:
> >>>>> 
> >>>>> #ifdef CONFIG_TRACE_IRQFLAGS
> >>>>> DEFINE_EVENT(preemptirq_template, irq_disable,
> >>>>>             TP_PROTO(unsigned long ip, unsigned long parent_ip),
> >>>>>             TP_ARGS(ip, parent_ip));
> >>>>> 
> >>>>> DEFINE_EVENT(preemptirq_template, irq_enable,
> >>>>>             TP_PROTO(unsigned long ip, unsigned long parent_ip),
> >>>>>             TP_ARGS(ip, parent_ip));
> >>>>> #else
> >>>>> #define trace_irq_enable(...)
> >>>>> #define trace_irq_disable(...)
> >>>>> #define trace_irq_enable_rcuidle(...)
> >>>>> #define trace_irq_disable_rcuidle(...)
> >>>>> #endif
> >>>>> 
> >>>>> But the DEFINE_EVENT doesn't have the "_rcuidle" part.  And that's
> >>>>> where I got lost in the macro maze.  I looked at the gcc asm output,
> >>>>> and there is, indeed:
> >>>> 
> >>>> DEFINE_EVENT
> >>>>  DECLARE_TRACE
> >>>>    __DECLARE_TRACE
> >>>>       __DECLARE_TRACE_RCU
> >>>>         static inline void trace_##name##_rcuidle(proto)
> >>>>            __DO_TRACE
> >>>>               if (rcuidle)
> >>>>                  ....
> >>>> 
> >>>>> But I also don't see why this is any different from any other tracepoint.
> >>>> 
> >>>> Indeed. I took a wrong turn at some point in the macro jungle :)
> >>>> 
> >>>> So tracing itself is fine, but then if you have probes or bpf programs
> >>>> attached to a tracepoint these use rcu_read_lock()/unlock() which is
> >>>> obviously wrong in rcuidle context.
> >>> 
> >>> Definitely, any such code needs to use tricks similar to that of the
> >>> tracing code.  Or instead use something like SRCU, which is OK with
> >>> readers from idle.  Or use something like Steve Rostedt's workqueue-based
> >>> approach, though please be very careful with this latter, lest the
> >>> battery-powered embedded guys come after you for waking up idle CPUs
> >>> too often.  ;-)
> >> 
> >> Are we okay if we somehow ensure that all the entry code before
> >> enter_from_user_mode() only does rcuidle tracing variants and has
> >> kprobes off?  Including for BPF use cases?
> > 
> > That would work, though if BPF used SRCU instead of RCU, this would
> > be unnecessary.  Sadly, SRCU has full memory barriers in each of
> > srcu_read_lock() and srcu_read_unlock(), but we are working on it.
> > (As always, no promises!)
> > 
> >> It would be *really* nice if we could statically verify this, as has
> >> been mentioned elsewhere in the thread.  It would also probably be
> >> good enough if we could do it at runtime.  Maybe with lockdep on, we
> >> verify rcu state in tracepoints even if the tracepoint isn't active?
> >> And we could plausibly have some widget that could inject something
> >> into *every* kprobeable function to check rcu state.
> > 
> > Or just have at least one testing step that activates all tracepoints,
> > but with lockdep enabled?
> 
> Also kprobe.

I didn't realize we could test all kprobe points easily, but yes,
that would also be good.

> I don’t suppose we could make notrace imply nokprobe.  Then all kprobeable functions would also have entry/exit tracepoints, right?

On this, I must defer to the tracing and kprobes people.

							Thanx, Paul

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [patch 4/8] x86/entry: Move irq tracing on syscall entry to C-code
  2020-03-01 19:39                                 ` Andy Lutomirski
  2020-03-01 20:18                                   ` Paul E. McKenney
@ 2020-03-02  0:35                                   ` Steven Rostedt
  2020-03-02  6:47                                     ` Masami Hiramatsu
  1 sibling, 1 reply; 56+ messages in thread
From: Steven Rostedt @ 2020-03-02  0:35 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: paulmck, Andy Lutomirski, Thomas Gleixner, Peter Zijlstra, LKML,
	X86 ML, Brian Gerst, Juergen Gross, Paolo Bonzini, Arnd Bergmann,
	Masami Hiramatsu

On Sun, 1 Mar 2020 11:39:42 -0800
Andy Lutomirski <luto@amacapital.net> wrote:

> > On Mar 1, 2020, at 11:30 AM, Paul E. McKenney <paulmck@kernel.org> wrote:
> > 
> > On Sun, Mar 01, 2020 at 10:54:23AM -0800, Andy Lutomirski wrote:  
> >>> On Sun, Mar 1, 2020 at 10:26 AM Paul E. McKenney <paulmck@kernel.org> wrote:
> >>> 
> >>> On Sun, Mar 01, 2020 at 07:12:25PM +0100, Thomas Gleixner wrote:  
> >>>> Andy Lutomirski <luto@kernel.org> writes:  
> >>>>> On Sun, Mar 1, 2020 at 7:21 AM Thomas Gleixner <tglx@linutronix.de> wrote:  
> >>>>>> Andy Lutomirski <luto@amacapital.net> writes:  
> >>>>>>>> On Mar 1, 2020, at 2:16 AM, Thomas Gleixner <tglx@linutronix.de> wrote:
> >>>>>>>> Ok, but for the time being anything before/after CONTEXT_KERNEL is unsafe
> >>>>>>>> except trace_hardirq_off/on() as those trace functions do not allow to
> >>>>>>>> attach anything AFAICT.  
> >>>>>>> 
> >>>>>>> Can you point to whatever makes those particular functions special?  I
> >>>>>>> failed to follow the macro maze.  
> >>>>>> 
> >>>>>> Those are not tracepoints and not going through the macro maze. See
> >>>>>> kernel/trace/trace_preemptirq.c  
> >>>>> 
> >>>>> That has:
> >>>>> 
> >>>>> void trace_hardirqs_on(void)
> >>>>> {
> >>>>>        if (this_cpu_read(tracing_irq_cpu)) {
> >>>>>                if (!in_nmi())
> >>>>>                        trace_irq_enable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
> >>>>>                tracer_hardirqs_on(CALLER_ADDR0, CALLER_ADDR1);
> >>>>>                this_cpu_write(tracing_irq_cpu, 0);
> >>>>>        }
> >>>>> 
> >>>>>        lockdep_hardirqs_on(CALLER_ADDR0);
> >>>>> }
> >>>>> EXPORT_SYMBOL(trace_hardirqs_on);
> >>>>> NOKPROBE_SYMBOL(trace_hardirqs_on);
> >>>>> 
> >>>>> But this calls trace_irq_enable_rcuidle(), and that's the part of the
> >>>>> macro maze I got lost in.  I found:
> >>>>> 
> >>>>> #ifdef CONFIG_TRACE_IRQFLAGS
> >>>>> DEFINE_EVENT(preemptirq_template, irq_disable,
> >>>>>             TP_PROTO(unsigned long ip, unsigned long parent_ip),
> >>>>>             TP_ARGS(ip, parent_ip));
> >>>>> 
> >>>>> DEFINE_EVENT(preemptirq_template, irq_enable,
> >>>>>             TP_PROTO(unsigned long ip, unsigned long parent_ip),
> >>>>>             TP_ARGS(ip, parent_ip));
> >>>>> #else
> >>>>> #define trace_irq_enable(...)
> >>>>> #define trace_irq_disable(...)
> >>>>> #define trace_irq_enable_rcuidle(...)
> >>>>> #define trace_irq_disable_rcuidle(...)
> >>>>> #endif
> >>>>> 
> >>>>> But the DEFINE_EVENT doesn't have the "_rcuidle" part.  And that's
> >>>>> where I got lost in the macro maze.  I looked at the gcc asm output,
> >>>>> and there is, indeed:  
> >>>> 
> >>>> DEFINE_EVENT
> >>>>  DECLARE_TRACE
> >>>>    __DECLARE_TRACE
> >>>>       __DECLARE_TRACE_RCU
> >>>>         static inline void trace_##name##_rcuidle(proto)
> >>>>            __DO_TRACE
> >>>>               if (rcuidle)
> >>>>                  ....
> >>>>   
> >>>>> But I also don't see why this is any different from any other tracepoint.  
> >>>> 
> >>>> Indeed. I took a wrong turn at some point in the macro jungle :)
> >>>> 
> >>>> So tracing itself is fine, but then if you have probes or bpf programs
> >>>> attached to a tracepoint these use rcu_read_lock()/unlock() which is
> >>>> obviously wrong in rcuidle context.  
> >>> 
> >>> Definitely, any such code needs to use tricks similar to that of the
> >>> tracing code.  Or instead use something like SRCU, which is OK with
> >>> readers from idle.  Or use something like Steve Rostedt's workqueue-based
> >>> approach, though please be very careful with this latter, lest the
> >>> battery-powered embedded guys come after you for waking up idle CPUs
> >>> too often.  ;-)  
> >> 
> >> Are we okay if we somehow ensure that all the entry code before
> >> enter_from_user_mode() only does rcuidle tracing variants and has
> >> kprobes off?  Including for BPF use cases?  
> > 
> > That would work, though if BPF used SRCU instead of RCU, this would
> > be unnecessary.  Sadly, SRCU has full memory barriers in each of
> > srcu_read_lock() and srcu_read_unlock(), but we are working on it.
> > (As always, no promises!)
> >   
> >> It would be *really* nice if we could statically verify this, as has
> >> been mentioned elsewhere in the thread.  It would also probably be
> >> good enough if we could do it at runtime.  Maybe with lockdep on, we
> >> verify rcu state in tracepoints even if the tracepoint isn't active?
> >> And we could plausibly have some widget that could inject something
> >> into *every* kprobeable function to check rcu state.  
> > 
> > Or just have at least one testing step that activates all tracepoints,
> > but with lockdep enabled?  
> 
> Also kprobe.
> 
> I don’t suppose we could make notrace imply nokprobe.  Then all
> kprobeable functions would also have entry/exit tracepoints, right?

There was some code before that prevented a kprobe from being allowed
in something that was not in the ftrace mcount table (which would make
this happen). But I think that was changed because it was too
restrictive.

Masami?

-- Steve

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [patch 4/8] x86/entry: Move irq tracing on syscall entry to C-code
  2020-03-01 18:54                             ` Andy Lutomirski
  2020-03-01 19:30                               ` Paul E. McKenney
@ 2020-03-02  1:10                               ` Joel Fernandes
  2020-03-02  2:18                                 ` Andy Lutomirski
  2020-03-02  8:10                               ` Thomas Gleixner
  2 siblings, 1 reply; 56+ messages in thread
From: Joel Fernandes @ 2020-03-02  1:10 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Paul E. McKenney, Thomas Gleixner, Steven Rostedt,
	Peter Zijlstra, LKML, X86 ML, Brian Gerst, Juergen Gross,
	Paolo Bonzini, Arnd Bergmann

On Sun, Mar 01, 2020 at 10:54:23AM -0800, Andy Lutomirski wrote:
> On Sun, Mar 1, 2020 at 10:26 AM Paul E. McKenney <paulmck@kernel.org> wrote:
> >
> > On Sun, Mar 01, 2020 at 07:12:25PM +0100, Thomas Gleixner wrote:
> > > Andy Lutomirski <luto@kernel.org> writes:
> > > > On Sun, Mar 1, 2020 at 7:21 AM Thomas Gleixner <tglx@linutronix.de> wrote:
> > > >> Andy Lutomirski <luto@amacapital.net> writes:
> > > >> >> On Mar 1, 2020, at 2:16 AM, Thomas Gleixner <tglx@linutronix.de> wrote:
> > > >> >> Ok, but for the time being anything before/after CONTEXT_KERNEL is unsafe
> > > >> >> except trace_hardirq_off/on() as those trace functions do not allow to
> > > >> >> attach anything AFAICT.
> > > >> >
> > > >> > Can you point to whatever makes those particular functions special?  I
> > > >> > failed to follow the macro maze.
> > > >>
> > > >> Those are not tracepoints and not going through the macro maze. See
> > > >> kernel/trace/trace_preemptirq.c
> > > >
> > > > That has:
> > > >
> > > > void trace_hardirqs_on(void)
> > > > {
> > > >         if (this_cpu_read(tracing_irq_cpu)) {
> > > >                 if (!in_nmi())
> > > >                         trace_irq_enable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
> > > >                 tracer_hardirqs_on(CALLER_ADDR0, CALLER_ADDR1);
> > > >                 this_cpu_write(tracing_irq_cpu, 0);
> > > >         }
> > > >
> > > >         lockdep_hardirqs_on(CALLER_ADDR0);
> > > > }
> > > > EXPORT_SYMBOL(trace_hardirqs_on);
> > > > NOKPROBE_SYMBOL(trace_hardirqs_on);
> > > >
> > > > But this calls trace_irq_enable_rcuidle(), and that's the part of the
> > > > macro maze I got lost in.  I found:
> > > >
> > > > #ifdef CONFIG_TRACE_IRQFLAGS
> > > > DEFINE_EVENT(preemptirq_template, irq_disable,
> > > >              TP_PROTO(unsigned long ip, unsigned long parent_ip),
> > > >              TP_ARGS(ip, parent_ip));
> > > >
> > > > DEFINE_EVENT(preemptirq_template, irq_enable,
> > > >              TP_PROTO(unsigned long ip, unsigned long parent_ip),
> > > >              TP_ARGS(ip, parent_ip));
> > > > #else
> > > > #define trace_irq_enable(...)
> > > > #define trace_irq_disable(...)
> > > > #define trace_irq_enable_rcuidle(...)
> > > > #define trace_irq_disable_rcuidle(...)
> > > > #endif
> > > >
> > > > But the DEFINE_EVENT doesn't have the "_rcuidle" part.  And that's
> > > > where I got lost in the macro maze.  I looked at the gcc asm output,
> > > > and there is, indeed:
> > >
> > > DEFINE_EVENT
> > >   DECLARE_TRACE
> > >     __DECLARE_TRACE
> > >        __DECLARE_TRACE_RCU
> > >          static inline void trace_##name##_rcuidle(proto)
> > >             __DO_TRACE
> > >                if (rcuidle)
> > >                   ....
> > >
> > > > But I also don't see why this is any different from any other tracepoint.
> > >
> > > Indeed. I took a wrong turn at some point in the macro jungle :)
> > >
> > > So tracing itself is fine, but then if you have probes or bpf programs
> > > attached to a tracepoint these use rcu_read_lock()/unlock() which is
> > > obviously wrong in rcuidle context.
> >
> > Definitely, any such code needs to use tricks similar to that of the
> > tracing code.  Or instead use something like SRCU, which is OK with
> > readers from idle.  Or use something like Steve Rostedt's workqueue-based
> > approach, though please be very careful with this latter, lest the
> > battery-powered embedded guys come after you for waking up idle CPUs
> > too often.  ;-)
> >
> 
> Are we okay if we somehow ensure that all the entry code before
> enter_from_user_mode() only does rcuidle tracing variants and has
> kprobes off?  Including for BPF use cases?
> 
> It would be *really* nice if we could statically verify this, as has
> been mentioned elsewhere in the thread.  It would also probably be
> good enough if we could do it at runtime.  Maybe with lockdep on, we
> verify rcu state in tracepoints even if the tracepoint isn't active?
> And we could plausibly have some widget that could inject something
> into *every* kprobeable function to check rcu state.

You are talking about verifying that a non-rcuidle tracepoint is not called
into when RCU is not watching right? I think that's fine, though I feel
lockdep kernels should not be slowed down any more than they already are. I
feel over time if we add too many checks to lockdep enabled kernels, then it
becomes too slow even for "debug" kernels. May be it is time for a
CONFIG_LOCKDEP_SLOW or some such? And then anyone who wants to go crazy on
runtime checking can do so. I myself want to add a few.

Note that the checking is being added into "non rcu-idle" tracepoints many of
which are probably always called when RCU is watching, making such checking
useless for those tracepoints (and slowing them down however less).

Also another note would be that the whole reason we are getting rid of the
"make RCU watch when rcuidle" logic in DO_TRACE is because it is slow for
tracepoints that are frequently called into. Another reason to do it is
because tracepoint callbacks are expected to know what they are doing and
turn on RCU watching as appropriate (as consensus on the matter suggests).

thanks,

 - Joel


^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [patch 4/8] x86/entry: Move irq tracing on syscall entry to C-code
  2020-03-02  1:10                               ` Joel Fernandes
@ 2020-03-02  2:18                                 ` Andy Lutomirski
  2020-03-02  2:36                                   ` Joel Fernandes
  0 siblings, 1 reply; 56+ messages in thread
From: Andy Lutomirski @ 2020-03-02  2:18 UTC (permalink / raw)
  To: Joel Fernandes
  Cc: Andy Lutomirski, Paul E. McKenney, Thomas Gleixner,
	Steven Rostedt, Peter Zijlstra, LKML, X86 ML, Brian Gerst,
	Juergen Gross, Paolo Bonzini, Arnd Bergmann



> On Mar 1, 2020, at 5:10 PM, Joel Fernandes <joel@joelfernandes.org> wrote:
> 
> On Sun, Mar 01, 2020 at 10:54:23AM -0800, Andy Lutomirski wrote:
>>> On Sun, Mar 1, 2020 at 10:26 AM Paul E. McKenney <paulmck@kernel.org> wrote:
>>> 
>>> On Sun, Mar 01, 2020 at 07:12:25PM +0100, Thomas Gleixner wrote:
>>>> Andy Lutomirski <luto@kernel.org> writes:
>>>>> On Sun, Mar 1, 2020 at 7:21 AM Thomas Gleixner <tglx@linutronix.de> wrote:
>>>>>> Andy Lutomirski <luto@amacapital.net> writes:
>>>>>>>> On Mar 1, 2020, at 2:16 AM, Thomas Gleixner <tglx@linutronix.de> wrote:
>>>>>>>> Ok, but for the time being anything before/after CONTEXT_KERNEL is unsafe
>>>>>>>> except trace_hardirq_off/on() as those trace functions do not allow to
>>>>>>>> attach anything AFAICT.
>>>>>>> 
>>>>>>> Can you point to whatever makes those particular functions special?  I
>>>>>>> failed to follow the macro maze.
>>>>>> 
>>>>>> Those are not tracepoints and not going through the macro maze. See
>>>>>> kernel/trace/trace_preemptirq.c
>>>>> 
>>>>> That has:
>>>>> 
>>>>> void trace_hardirqs_on(void)
>>>>> {
>>>>>        if (this_cpu_read(tracing_irq_cpu)) {
>>>>>                if (!in_nmi())
>>>>>                        trace_irq_enable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
>>>>>                tracer_hardirqs_on(CALLER_ADDR0, CALLER_ADDR1);
>>>>>                this_cpu_write(tracing_irq_cpu, 0);
>>>>>        }
>>>>> 
>>>>>        lockdep_hardirqs_on(CALLER_ADDR0);
>>>>> }
>>>>> EXPORT_SYMBOL(trace_hardirqs_on);
>>>>> NOKPROBE_SYMBOL(trace_hardirqs_on);
>>>>> 
>>>>> But this calls trace_irq_enable_rcuidle(), and that's the part of the
>>>>> macro maze I got lost in.  I found:
>>>>> 
>>>>> #ifdef CONFIG_TRACE_IRQFLAGS
>>>>> DEFINE_EVENT(preemptirq_template, irq_disable,
>>>>>             TP_PROTO(unsigned long ip, unsigned long parent_ip),
>>>>>             TP_ARGS(ip, parent_ip));
>>>>> 
>>>>> DEFINE_EVENT(preemptirq_template, irq_enable,
>>>>>             TP_PROTO(unsigned long ip, unsigned long parent_ip),
>>>>>             TP_ARGS(ip, parent_ip));
>>>>> #else
>>>>> #define trace_irq_enable(...)
>>>>> #define trace_irq_disable(...)
>>>>> #define trace_irq_enable_rcuidle(...)
>>>>> #define trace_irq_disable_rcuidle(...)
>>>>> #endif
>>>>> 
>>>>> But the DEFINE_EVENT doesn't have the "_rcuidle" part.  And that's
>>>>> where I got lost in the macro maze.  I looked at the gcc asm output,
>>>>> and there is, indeed:
>>>> 
>>>> DEFINE_EVENT
>>>>  DECLARE_TRACE
>>>>    __DECLARE_TRACE
>>>>       __DECLARE_TRACE_RCU
>>>>         static inline void trace_##name##_rcuidle(proto)
>>>>            __DO_TRACE
>>>>               if (rcuidle)
>>>>                  ....
>>>> 
>>>>> But I also don't see why this is any different from any other tracepoint.
>>>> 
>>>> Indeed. I took a wrong turn at some point in the macro jungle :)
>>>> 
>>>> So tracing itself is fine, but then if you have probes or bpf programs
>>>> attached to a tracepoint these use rcu_read_lock()/unlock() which is
>>>> obviously wrong in rcuidle context.
>>> 
>>> Definitely, any such code needs to use tricks similar to that of the
>>> tracing code.  Or instead use something like SRCU, which is OK with
>>> readers from idle.  Or use something like Steve Rostedt's workqueue-based
>>> approach, though please be very careful with this latter, lest the
>>> battery-powered embedded guys come after you for waking up idle CPUs
>>> too often.  ;-)
>>> 
>> 
>> Are we okay if we somehow ensure that all the entry code before
>> enter_from_user_mode() only does rcuidle tracing variants and has
>> kprobes off?  Including for BPF use cases?
>> 
>> It would be *really* nice if we could statically verify this, as has
>> been mentioned elsewhere in the thread.  It would also probably be
>> good enough if we could do it at runtime.  Maybe with lockdep on, we
>> verify rcu state in tracepoints even if the tracepoint isn't active?
>> And we could plausibly have some widget that could inject something
>> into *every* kprobeable function to check rcu state.
> 
> You are talking about verifying that a non-rcuidle tracepoint is not called
> into when RCU is not watching right? I think that's fine, though I feel
> lockdep kernels should not be slowed down any more than they already are. I
> feel over time if we add too many checks to lockdep enabled kernels, then it
> becomes too slow even for "debug" kernels. May be it is time for a
> CONFIG_LOCKDEP_SLOW or some such? And then anyone who wants to go crazy on
> runtime checking can do so. I myself want to add a few.
> 
> Note that the checking is being added into "non rcu-idle" tracepoints many of
> which are probably always called when RCU is watching, making such checking
> useless for those tracepoints (and slowing them down however less).
> 

Indeed. Static analysis would help a lot here.

> Also another note would be that the whole reason we are getting rid of the
> "make RCU watch when rcuidle" logic in DO_TRACE is because it is slow for
> tracepoints that are frequently called into. Another reason to do it is
> because tracepoint callbacks are expected to know what they are doing and
> turn on RCU watching as appropriate (as consensus on the matter suggests).

Whoa there. We arch people need crystal clear rules as to what tracepoints can be called in what contexts and what is the responsibility of the callbacks.

> 
> thanks,
> 
> - Joel
> 

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [patch 4/8] x86/entry: Move irq tracing on syscall entry to C-code
  2020-03-02  2:18                                 ` Andy Lutomirski
@ 2020-03-02  2:36                                   ` Joel Fernandes
  2020-03-02  5:40                                     ` Andy Lutomirski
  0 siblings, 1 reply; 56+ messages in thread
From: Joel Fernandes @ 2020-03-02  2:36 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Andy Lutomirski, Paul E. McKenney, Thomas Gleixner,
	Steven Rostedt, Peter Zijlstra, LKML, X86 ML, Brian Gerst,
	Juergen Gross, Paolo Bonzini, Arnd Bergmann

On Sun, Mar 01, 2020 at 06:18:51PM -0800, Andy Lutomirski wrote:
> 
> 
> > On Mar 1, 2020, at 5:10 PM, Joel Fernandes <joel@joelfernandes.org> wrote:
> > 
> > On Sun, Mar 01, 2020 at 10:54:23AM -0800, Andy Lutomirski wrote:
> >>> On Sun, Mar 1, 2020 at 10:26 AM Paul E. McKenney <paulmck@kernel.org> wrote:
> >>> 
> >>> On Sun, Mar 01, 2020 at 07:12:25PM +0100, Thomas Gleixner wrote:
> >>>> Andy Lutomirski <luto@kernel.org> writes:
> >>>>> On Sun, Mar 1, 2020 at 7:21 AM Thomas Gleixner <tglx@linutronix.de> wrote:
> >>>>>> Andy Lutomirski <luto@amacapital.net> writes:
> >>>>>>>> On Mar 1, 2020, at 2:16 AM, Thomas Gleixner <tglx@linutronix.de> wrote:
> >>>>>>>> Ok, but for the time being anything before/after CONTEXT_KERNEL is unsafe
> >>>>>>>> except trace_hardirq_off/on() as those trace functions do not allow to
> >>>>>>>> attach anything AFAICT.
> >>>>>>> 
> >>>>>>> Can you point to whatever makes those particular functions special?  I
> >>>>>>> failed to follow the macro maze.
> >>>>>> 
> >>>>>> Those are not tracepoints and not going through the macro maze. See
> >>>>>> kernel/trace/trace_preemptirq.c
> >>>>> 
> >>>>> That has:
> >>>>> 
> >>>>> void trace_hardirqs_on(void)
> >>>>> {
> >>>>>        if (this_cpu_read(tracing_irq_cpu)) {
> >>>>>                if (!in_nmi())
> >>>>>                        trace_irq_enable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
> >>>>>                tracer_hardirqs_on(CALLER_ADDR0, CALLER_ADDR1);
> >>>>>                this_cpu_write(tracing_irq_cpu, 0);
> >>>>>        }
> >>>>> 
> >>>>>        lockdep_hardirqs_on(CALLER_ADDR0);
> >>>>> }
> >>>>> EXPORT_SYMBOL(trace_hardirqs_on);
> >>>>> NOKPROBE_SYMBOL(trace_hardirqs_on);
> >>>>> 
> >>>>> But this calls trace_irq_enable_rcuidle(), and that's the part of the
> >>>>> macro maze I got lost in.  I found:
> >>>>> 
> >>>>> #ifdef CONFIG_TRACE_IRQFLAGS
> >>>>> DEFINE_EVENT(preemptirq_template, irq_disable,
> >>>>>             TP_PROTO(unsigned long ip, unsigned long parent_ip),
> >>>>>             TP_ARGS(ip, parent_ip));
> >>>>> 
> >>>>> DEFINE_EVENT(preemptirq_template, irq_enable,
> >>>>>             TP_PROTO(unsigned long ip, unsigned long parent_ip),
> >>>>>             TP_ARGS(ip, parent_ip));
> >>>>> #else
> >>>>> #define trace_irq_enable(...)
> >>>>> #define trace_irq_disable(...)
> >>>>> #define trace_irq_enable_rcuidle(...)
> >>>>> #define trace_irq_disable_rcuidle(...)
> >>>>> #endif
> >>>>> 
> >>>>> But the DEFINE_EVENT doesn't have the "_rcuidle" part.  And that's
> >>>>> where I got lost in the macro maze.  I looked at the gcc asm output,
> >>>>> and there is, indeed:
> >>>> 
> >>>> DEFINE_EVENT
> >>>>  DECLARE_TRACE
> >>>>    __DECLARE_TRACE
> >>>>       __DECLARE_TRACE_RCU
> >>>>         static inline void trace_##name##_rcuidle(proto)
> >>>>            __DO_TRACE
> >>>>               if (rcuidle)
> >>>>                  ....
> >>>> 
> >>>>> But I also don't see why this is any different from any other tracepoint.
> >>>> 
> >>>> Indeed. I took a wrong turn at some point in the macro jungle :)
> >>>> 
> >>>> So tracing itself is fine, but then if you have probes or bpf programs
> >>>> attached to a tracepoint these use rcu_read_lock()/unlock() which is
> >>>> obviously wrong in rcuidle context.
> >>> 
> >>> Definitely, any such code needs to use tricks similar to that of the
> >>> tracing code.  Or instead use something like SRCU, which is OK with
> >>> readers from idle.  Or use something like Steve Rostedt's workqueue-based
> >>> approach, though please be very careful with this latter, lest the
> >>> battery-powered embedded guys come after you for waking up idle CPUs
> >>> too often.  ;-)
> >>> 
> >> 
> >> Are we okay if we somehow ensure that all the entry code before
> >> enter_from_user_mode() only does rcuidle tracing variants and has
> >> kprobes off?  Including for BPF use cases?
> >> 
> >> It would be *really* nice if we could statically verify this, as has
> >> been mentioned elsewhere in the thread.  It would also probably be
> >> good enough if we could do it at runtime.  Maybe with lockdep on, we
> >> verify rcu state in tracepoints even if the tracepoint isn't active?
> >> And we could plausibly have some widget that could inject something
> >> into *every* kprobeable function to check rcu state.
> > 
> > You are talking about verifying that a non-rcuidle tracepoint is not called
> > into when RCU is not watching right? I think that's fine, though I feel
> > lockdep kernels should not be slowed down any more than they already are. I
> > feel over time if we add too many checks to lockdep enabled kernels, then it
> > becomes too slow even for "debug" kernels. May be it is time for a
> > CONFIG_LOCKDEP_SLOW or some such? And then anyone who wants to go crazy on
> > runtime checking can do so. I myself want to add a few.
> > 
> > Note that the checking is being added into "non rcu-idle" tracepoints many of
> > which are probably always called when RCU is watching, making such checking
> > useless for those tracepoints (and slowing them down however less).
> > 
> 
> Indeed. Static analysis would help a lot here.
> 
> > Also another note would be that the whole reason we are getting rid of the
> > "make RCU watch when rcuidle" logic in DO_TRACE is because it is slow for
> > tracepoints that are frequently called into. Another reason to do it is
> > because tracepoint callbacks are expected to know what they are doing and
> > turn on RCU watching as appropriate (as consensus on the matter suggests).
> 
> Whoa there. We arch people need crystal clear rules as to what tracepoints
> can be called in what contexts and what is the responsibility of the
> callbacks.
> 

The direction that Peter, Mathieu and Steve are going is that callbacks
registered on "rcu idle" tracepoints need to turn on "RCU watching"
themselves. Such as perf. Turning on "RCU watching" is non-free as I tested
in 2017 and we removed it back then, but it was added right back when perf
started splatting. Now it is being removed again, and the turning on of RCU's
eyes happens in the perf callback itself since perf uses RCU.

If you are calling trace_.._rcuidle(), can you not ensure that RCU is
watching by calling the appropriate RCU APIs in your callbacks? Or did I miss
the point?

thanks,

 - Joel


^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [patch 4/8] x86/entry: Move irq tracing on syscall entry to C-code
  2020-03-02  2:36                                   ` Joel Fernandes
@ 2020-03-02  5:40                                     ` Andy Lutomirski
  0 siblings, 0 replies; 56+ messages in thread
From: Andy Lutomirski @ 2020-03-02  5:40 UTC (permalink / raw)
  To: Joel Fernandes
  Cc: Andy Lutomirski, Paul E. McKenney, Thomas Gleixner,
	Steven Rostedt, Peter Zijlstra, LKML, X86 ML, Brian Gerst,
	Juergen Gross, Paolo Bonzini, Arnd Bergmann



> On Mar 1, 2020, at 6:36 PM, Joel Fernandes <joel@joelfernandes.org> wrote:
> 
> On Sun, Mar 01, 2020 at 06:18:51PM -0800, Andy Lutomirski wrote:
>> 
>> 
>>>> On Mar 1, 2020, at 5:10 PM, Joel Fernandes <joel@joelfernandes.org> wrote:
>>> 
>>> On Sun, Mar 01, 2020 at 10:54:23AM -0800, Andy Lutomirski wrote:
>>>>> On Sun, Mar 1, 2020 at 10:26 AM Paul E. McKenney <paulmck@kernel.org> wrote:
>>>>> 
>>>>> On Sun, Mar 01, 2020 at 07:12:25PM +0100, Thomas Gleixner wrote:
>>>>>> Andy Lutomirski <luto@kernel.org> writes:
>>>>>>> On Sun, Mar 1, 2020 at 7:21 AM Thomas Gleixner <tglx@linutronix.de> wrote:
>>>>>>>> Andy Lutomirski <luto@amacapital.net> writes:
>>>>>>>>>> On Mar 1, 2020, at 2:16 AM, Thomas Gleixner <tglx@linutronix.de> wrote:
>>>>>>>>>> Ok, but for the time being anything before/after CONTEXT_KERNEL is unsafe
>>>>>>>>>> except trace_hardirq_off/on() as those trace functions do not allow to
>>>>>>>>>> attach anything AFAICT.
>>>>>>>>> 
>>>>>>>>> Can you point to whatever makes those particular functions special?  I
>>>>>>>>> failed to follow the macro maze.
>>>>>>>> 
>>>>>>>> Those are not tracepoints and not going through the macro maze. See
>>>>>>>> kernel/trace/trace_preemptirq.c
>>>>>>> 
>>>>>>> That has:
>>>>>>> 
>>>>>>> void trace_hardirqs_on(void)
>>>>>>> {
>>>>>>>       if (this_cpu_read(tracing_irq_cpu)) {
>>>>>>>               if (!in_nmi())
>>>>>>>                       trace_irq_enable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
>>>>>>>               tracer_hardirqs_on(CALLER_ADDR0, CALLER_ADDR1);
>>>>>>>               this_cpu_write(tracing_irq_cpu, 0);
>>>>>>>       }
>>>>>>> 
>>>>>>>       lockdep_hardirqs_on(CALLER_ADDR0);
>>>>>>> }
>>>>>>> EXPORT_SYMBOL(trace_hardirqs_on);
>>>>>>> NOKPROBE_SYMBOL(trace_hardirqs_on);
>>>>>>> 
>>>>>>> But this calls trace_irq_enable_rcuidle(), and that's the part of the
>>>>>>> macro maze I got lost in.  I found:
>>>>>>> 
>>>>>>> #ifdef CONFIG_TRACE_IRQFLAGS
>>>>>>> DEFINE_EVENT(preemptirq_template, irq_disable,
>>>>>>>            TP_PROTO(unsigned long ip, unsigned long parent_ip),
>>>>>>>            TP_ARGS(ip, parent_ip));
>>>>>>> 
>>>>>>> DEFINE_EVENT(preemptirq_template, irq_enable,
>>>>>>>            TP_PROTO(unsigned long ip, unsigned long parent_ip),
>>>>>>>            TP_ARGS(ip, parent_ip));
>>>>>>> #else
>>>>>>> #define trace_irq_enable(...)
>>>>>>> #define trace_irq_disable(...)
>>>>>>> #define trace_irq_enable_rcuidle(...)
>>>>>>> #define trace_irq_disable_rcuidle(...)
>>>>>>> #endif
>>>>>>> 
>>>>>>> But the DEFINE_EVENT doesn't have the "_rcuidle" part.  And that's
>>>>>>> where I got lost in the macro maze.  I looked at the gcc asm output,
>>>>>>> and there is, indeed:
>>>>>> 
>>>>>> DEFINE_EVENT
>>>>>> DECLARE_TRACE
>>>>>>   __DECLARE_TRACE
>>>>>>      __DECLARE_TRACE_RCU
>>>>>>        static inline void trace_##name##_rcuidle(proto)
>>>>>>           __DO_TRACE
>>>>>>              if (rcuidle)
>>>>>>                 ....
>>>>>> 
>>>>>>> But I also don't see why this is any different from any other tracepoint.
>>>>>> 
>>>>>> Indeed. I took a wrong turn at some point in the macro jungle :)
>>>>>> 
>>>>>> So tracing itself is fine, but then if you have probes or bpf programs
>>>>>> attached to a tracepoint these use rcu_read_lock()/unlock() which is
>>>>>>> obviously wrong in rcuidle context.
>>>>> 
>>>>> Definitely, any such code needs to use tricks similar to that of the
>>>>> tracing code.  Or instead use something like SRCU, which is OK with
>>>>> readers from idle.  Or use something like Steve Rostedt's workqueue-based
>>>>> approach, though please be very careful with this latter, lest the
>>>>> battery-powered embedded guys come after you for waking up idle CPUs
>>>>> too often.  ;-)
>>>>> 
>>>> 
>>>> Are we okay if we somehow ensure that all the entry code before
>>>> enter_from_user_mode() only does rcuidle tracing variants and has
>>>> kprobes off?  Including for BPF use cases?
>>>> 
>>>> It would be *really* nice if we could statically verify this, as has
>>>> been mentioned elsewhere in the thread.  It would also probably be
>>>> good enough if we could do it at runtime.  Maybe with lockdep on, we
>>>> verify rcu state in tracepoints even if the tracepoint isn't active?
>>>> And we could plausibly have some widget that could inject something
>>>> into *every* kprobeable function to check rcu state.
>>> 
>>> You are talking about verifying that a non-rcuidle tracepoint is not called
>>> into when RCU is not watching right? I think that's fine, though I feel
>>> lockdep kernels should not be slowed down any more than they already are. I
>>> feel over time if we add too many checks to lockdep enabled kernels, then it
>>> becomes too slow even for "debug" kernels. May be it is time for a
>>> CONFIG_LOCKDEP_SLOW or some such? And then anyone who wants to go crazy on
>>> runtime checking can do so. I myself want to add a few.
>>> 
>>> Note that the checking is being added into "non rcu-idle" tracepoints many of
>>> which are probably always called when RCU is watching, making such checking
>>> useless for those tracepoints (and slowing them down however less).
>>> 
>> 
>> Indeed. Static analysis would help a lot here.
>> 
>>> Also another note would be that the whole reason we are getting rid of the
>>> "make RCU watch when rcuidle" logic in DO_TRACE is because it is slow for
>>> tracepoints that are frequently called into. Another reason to do it is
>>> because tracepoint callbacks are expected to know what they are doing and
>>> turn on RCU watching as appropriate (as consensus on the matter suggests).
>> 
>> Whoa there. We arch people need crystal clear rules as to what tracepoints
>> can be called in what contexts and what is the responsibility of the
>> callbacks.
>> 
> 
> The direction that Peter, Mathieu and Steve are going is that callbacks
> registered on "rcu idle" tracepoints need to turn on "RCU watching"
> themselves. Such as perf. Turning on "RCU watching" is non-free as I tested
> in 2017 and we removed it back then, but it was added right back when perf
> started splatting. Now it is being removed again, and the turning on of RCU's
> eyes happens in the perf callback itself since perf uses RCU.
> 
> If you are calling trace_.._rcuidle(), can you not ensure that RCU is
> watching by calling the appropriate RCU APIs in your callbacks? Or did I miss
> the point?

Unless I’ve misunderstood you, the problem is that the callback and the caller are totally different subsystems. I can count the number of people who know which tracepoints need RCU turned on in their callbacks without using any fingers (tglx and I are still at least a bit confused— I’m not convinced *anyone* knows for sure).

I think we need some credible rule for which events need special handling in their callbacks, and then we need to ensure that all subsystems that install callbacks, including BPF, follow the rule.  It should not be up to the author of a BPF program to get this right.

> 
> thanks,
> 
> - Joel
> 

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [patch 4/8] x86/entry: Move irq tracing on syscall entry to C-code
  2020-03-02  0:35                                   ` Steven Rostedt
@ 2020-03-02  6:47                                     ` Masami Hiramatsu
  0 siblings, 0 replies; 56+ messages in thread
From: Masami Hiramatsu @ 2020-03-02  6:47 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Andy Lutomirski, paulmck, Andy Lutomirski, Thomas Gleixner,
	Peter Zijlstra, LKML, X86 ML, Brian Gerst, Juergen Gross,
	Paolo Bonzini, Arnd Bergmann, Masami Hiramatsu

On Sun, 1 Mar 2020 19:35:01 -0500
Steven Rostedt <rostedt@goodmis.org> wrote:

> On Sun, 1 Mar 2020 11:39:42 -0800
> Andy Lutomirski <luto@amacapital.net> wrote:
> 
> > > On Mar 1, 2020, at 11:30 AM, Paul E. McKenney <paulmck@kernel.org> wrote:
> > > 
> > > On Sun, Mar 01, 2020 at 10:54:23AM -0800, Andy Lutomirski wrote:  
> > >>> On Sun, Mar 1, 2020 at 10:26 AM Paul E. McKenney <paulmck@kernel.org> wrote:
> > >>> 
> > >>> On Sun, Mar 01, 2020 at 07:12:25PM +0100, Thomas Gleixner wrote:  
> > >>>> Andy Lutomirski <luto@kernel.org> writes:  
> > >>>>> On Sun, Mar 1, 2020 at 7:21 AM Thomas Gleixner <tglx@linutronix.de> wrote:  
> > >>>>>> Andy Lutomirski <luto@amacapital.net> writes:  
> > >>>>>>>> On Mar 1, 2020, at 2:16 AM, Thomas Gleixner <tglx@linutronix.de> wrote:
> > >>>>>>>> Ok, but for the time being anything before/after CONTEXT_KERNEL is unsafe
> > >>>>>>>> except trace_hardirq_off/on() as those trace functions do not allow to
> > >>>>>>>> attach anything AFAICT.  
> > >>>>>>> 
> > >>>>>>> Can you point to whatever makes those particular functions special?  I
> > >>>>>>> failed to follow the macro maze.  
> > >>>>>> 
> > >>>>>> Those are not tracepoints and not going through the macro maze. See
> > >>>>>> kernel/trace/trace_preemptirq.c  
> > >>>>> 
> > >>>>> That has:
> > >>>>> 
> > >>>>> void trace_hardirqs_on(void)
> > >>>>> {
> > >>>>>        if (this_cpu_read(tracing_irq_cpu)) {
> > >>>>>                if (!in_nmi())
> > >>>>>                        trace_irq_enable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
> > >>>>>                tracer_hardirqs_on(CALLER_ADDR0, CALLER_ADDR1);
> > >>>>>                this_cpu_write(tracing_irq_cpu, 0);
> > >>>>>        }
> > >>>>> 
> > >>>>>        lockdep_hardirqs_on(CALLER_ADDR0);
> > >>>>> }
> > >>>>> EXPORT_SYMBOL(trace_hardirqs_on);
> > >>>>> NOKPROBE_SYMBOL(trace_hardirqs_on);
> > >>>>> 
> > >>>>> But this calls trace_irq_enable_rcuidle(), and that's the part of the
> > >>>>> macro maze I got lost in.  I found:
> > >>>>> 
> > >>>>> #ifdef CONFIG_TRACE_IRQFLAGS
> > >>>>> DEFINE_EVENT(preemptirq_template, irq_disable,
> > >>>>>             TP_PROTO(unsigned long ip, unsigned long parent_ip),
> > >>>>>             TP_ARGS(ip, parent_ip));
> > >>>>> 
> > >>>>> DEFINE_EVENT(preemptirq_template, irq_enable,
> > >>>>>             TP_PROTO(unsigned long ip, unsigned long parent_ip),
> > >>>>>             TP_ARGS(ip, parent_ip));
> > >>>>> #else
> > >>>>> #define trace_irq_enable(...)
> > >>>>> #define trace_irq_disable(...)
> > >>>>> #define trace_irq_enable_rcuidle(...)
> > >>>>> #define trace_irq_disable_rcuidle(...)
> > >>>>> #endif
> > >>>>> 
> > >>>>> But the DEFINE_EVENT doesn't have the "_rcuidle" part.  And that's
> > >>>>> where I got lost in the macro maze.  I looked at the gcc asm output,
> > >>>>> and there is, indeed:  
> > >>>> 
> > >>>> DEFINE_EVENT
> > >>>>  DECLARE_TRACE
> > >>>>    __DECLARE_TRACE
> > >>>>       __DECLARE_TRACE_RCU
> > >>>>         static inline void trace_##name##_rcuidle(proto)
> > >>>>            __DO_TRACE
> > >>>>               if (rcuidle)
> > >>>>                  ....
> > >>>>   
> > >>>>> But I also don't see why this is any different from any other tracepoint.  
> > >>>> 
> > >>>> Indeed. I took a wrong turn at some point in the macro jungle :)
> > >>>> 
> > >>>> So tracing itself is fine, but then if you have probes or bpf programs
> > >>>> attached to a tracepoint these use rcu_read_lock()/unlock() which is
> > >>>> obviously wrong in rcuidle context.  
> > >>> 
> > >>> Definitely, any such code needs to use tricks similar to that of the
> > >>> tracing code.  Or instead use something like SRCU, which is OK with
> > >>> readers from idle.  Or use something like Steve Rostedt's workqueue-based
> > >>> approach, though please be very careful with this latter, lest the
> > >>> battery-powered embedded guys come after you for waking up idle CPUs
> > >>> too often.  ;-)  
> > >> 
> > >> Are we okay if we somehow ensure that all the entry code before
> > >> enter_from_user_mode() only does rcuidle tracing variants and has
> > >> kprobes off?  Including for BPF use cases?  
> > > 
> > > That would work, though if BPF used SRCU instead of RCU, this would
> > > be unnecessary.  Sadly, SRCU has full memory barriers in each of
> > > srcu_read_lock() and srcu_read_unlock(), but we are working on it.
> > > (As always, no promises!)
> > >   
> > >> It would be *really* nice if we could statically verify this, as has
> > >> been mentioned elsewhere in the thread.  It would also probably be
> > >> good enough if we could do it at runtime.  Maybe with lockdep on, we
> > >> verify rcu state in tracepoints even if the tracepoint isn't active?
> > >> And we could plausibly have some widget that could inject something
> > >> into *every* kprobeable function to check rcu state.  

I'm still not clear about this point, should I check rcuidle in kprobes
int3 handler or jump optimized handler? (int3 handler will run in
irq context so is not able to use srcu anyway...) Maybe I missed the point.

> > > 
> > > Or just have at least one testing step that activates all tracepoints,
> > > but with lockdep enabled?  
> > 
> > Also kprobe.
> > 
> > I don’t suppose we could make notrace imply nokprobe.  Then all
> > kprobeable functions would also have entry/exit tracepoints, right?
> 
> There was some code before that prevented a kprobe from being allowed
> in something that was not in the ftrace mcount table (which would make
> this happen). But I think that was changed because it was too
> restrictive.

Would you mean CONFIG_KPROBE_EVENTS_ON_NOTRACE? By default notrace
means noprobe too now. With CONFIG_KPROBE_EVENTS_ON_NOTRACE=y, we can
put kprobe events on notrace functions. So if you unsure, we can put
a kprobe on those functions and see what happens.
(Note that this is only for kprobe event, not kprobes itself)

It is actually very restrictive, but it is hard to make a whitelist
manually, especially if the CC_FLAGS_FTRACE is removed while building.

Thank you,

-- 
Masami Hiramatsu <mhiramat@kernel.org>

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [patch 4/8] x86/entry: Move irq tracing on syscall entry to C-code
  2020-03-01 18:54                             ` Andy Lutomirski
  2020-03-01 19:30                               ` Paul E. McKenney
  2020-03-02  1:10                               ` Joel Fernandes
@ 2020-03-02  8:10                               ` Thomas Gleixner
  2 siblings, 0 replies; 56+ messages in thread
From: Thomas Gleixner @ 2020-03-02  8:10 UTC (permalink / raw)
  To: Andy Lutomirski, Paul E. McKenney
  Cc: Andy Lutomirski, Steven Rostedt, Peter Zijlstra, LKML, X86 ML,
	Brian Gerst, Juergen Gross, Paolo Bonzini, Arnd Bergmann

Andy Lutomirski <luto@kernel.org> writes:
> On Sun, Mar 1, 2020 at 10:26 AM Paul E. McKenney <paulmck@kernel.org> wrote:
>> > So tracing itself is fine, but then if you have probes or bpf programs
>> > attached to a tracepoint these use rcu_read_lock()/unlock() which is
>> > obviosly wrong in rcuidle context.
>>
>> Definitely, any such code needs to use tricks similar to that of the
>> tracing code.  Or instead use something like SRCU, which is OK with
>> readers from idle.  Or use something like Steve Rostedt's workqueue-based
>> approach, though please be very careful with this latter, lest the
>> battery-powered embedded guys come after you for waking up idle CPUs
>> too often.  ;-)
>>
>
> Are we okay if we somehow ensure that all the entry code before
> enter_from_user_mode() only does rcuidle tracing variants and has
> kprobes off?  Including for BPF use cases?

I think this is the right thing to do. The only requirement we have
_before_ enter_from_user_mode() is to tell lockdep that interrupts are
off. There is not even the need for a real tracepoint IMO. The fact that
the lockdep call is hidden in that tracepoint is just an implementation
detail.

That would clearly set the rules straight: Anything low level entry code
before enter_from_user_mode() returns is neither probeable nor
traceable.

I know that some people will argue that this is too restrictive in terms
of instrumentation, but OTOH the whole low level entry code has to be
excluded from instrumentation anyway, so having a dozen instructions
more excluded does not matter at all. Keep it simple!

> It would be *really* nice if we could statically verify this, as has
> been mentioned elsewhere in the thread.  It would also probably be
> good enough if we could do it at runtime.  Maybe with lockdep on, we
> verify rcu state in tracepoints even if the tracepoint isn't active?
> And we could plausibly have some widget that could inject something
> into *every* kprobeable function to check rcu state.

That surely would be useful.

Thanks,

        tglx

^ permalink raw reply	[flat|nested] 56+ messages in thread

end of thread, other threads:[~2020-03-02  8:10 UTC | newest]

Thread overview: 56+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-02-25 22:08 [patch 0/8] x86/entry: Consolidation - Part II Thomas Gleixner
2020-02-25 22:08 ` [patch 1/8] x86/entry/64: Trace irqflags unconditionally on when returing to user space Thomas Gleixner
2020-02-27 19:49   ` Borislav Petkov
2020-02-27 22:45   ` Frederic Weisbecker
2020-02-28  8:58   ` Alexandre Chartre
2020-02-25 22:08 ` [patch 2/8] x86/entry/common: Consolidate syscall entry code Thomas Gleixner
2020-02-27 19:57   ` Borislav Petkov
2020-02-27 22:52   ` Frederic Weisbecker
2020-02-28  8:59   ` Alexandre Chartre
2020-02-25 22:08 ` [patch 3/8] x86/entry/common: Mark syscall entry points notrace/nokprobe Thomas Gleixner
2020-02-27 23:15   ` Frederic Weisbecker
2020-02-28  8:59   ` Alexandre Chartre
2020-02-25 22:08 ` [patch 4/8] x86/entry: Move irq tracing on syscall entry to C-code Thomas Gleixner
2020-02-26  5:43   ` Andy Lutomirski
2020-02-26  8:17     ` Peter Zijlstra
2020-02-26 11:20       ` Andy Lutomirski
2020-02-26 19:51         ` Thomas Gleixner
2020-02-29 14:44           ` Thomas Gleixner
2020-02-29 19:25             ` Andy Lutomirski
2020-02-29 23:58               ` Steven Rostedt
2020-03-01 10:16                 ` Thomas Gleixner
2020-03-01 14:37                   ` Andy Lutomirski
2020-03-01 15:21                     ` Thomas Gleixner
2020-03-01 16:00                       ` Andy Lutomirski
2020-03-01 18:12                         ` Thomas Gleixner
2020-03-01 18:26                           ` Paul E. McKenney
2020-03-01 18:54                             ` Andy Lutomirski
2020-03-01 19:30                               ` Paul E. McKenney
2020-03-01 19:39                                 ` Andy Lutomirski
2020-03-01 20:18                                   ` Paul E. McKenney
2020-03-02  0:35                                   ` Steven Rostedt
2020-03-02  6:47                                     ` Masami Hiramatsu
2020-03-02  1:10                               ` Joel Fernandes
2020-03-02  2:18                                 ` Andy Lutomirski
2020-03-02  2:36                                   ` Joel Fernandes
2020-03-02  5:40                                     ` Andy Lutomirski
2020-03-02  8:10                               ` Thomas Gleixner
2020-03-01 18:23                         ` Steven Rostedt
2020-03-01 18:20                       ` Steven Rostedt
2020-02-27 23:11   ` Frederic Weisbecker
2020-02-28  9:00   ` Alexandre Chartre
2020-02-25 22:08 ` [patch 5/8] x86/entry/common: Provide trace/kprobe safe exit to user space functions Thomas Gleixner
2020-02-26  5:45   ` Andy Lutomirski
2020-02-26  8:15     ` Peter Zijlstra
2020-02-27 15:43   ` Alexandre Chartre
2020-02-27 15:53     ` Thomas Gleixner
2020-02-25 22:08 ` [patch 6/8] x86/entry: Move irq tracing to syscall_slow_exit_work Thomas Gleixner
2020-02-26  5:47   ` Andy Lutomirski
2020-02-27 16:12   ` Alexandre Chartre
2020-02-25 22:08 ` [patch 7/8] x86/entry: Move irq tracing to prepare_exit_to_user_mode() Thomas Gleixner
2020-02-26  5:50   ` Andy Lutomirski
2020-02-26 19:53     ` Thomas Gleixner
2020-02-26 20:07       ` Andy Lutomirski
2020-02-25 22:08 ` [patch 8/8] x86/entry: Move irqflags tracing to do_int80_syscall_32() Thomas Gleixner
2020-02-27 16:46   ` Alexandre Chartre
2020-02-28 13:49     ` Thomas Gleixner

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.