linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 0/2] x86/unwind/orc: Fix unwinding from kprobe on PUSH/POP instruction
@ 2023-02-10 22:42 Josh Poimboeuf
  2023-02-10 22:42 ` [PATCH 1/2] x86/unwind/orc: Add 'signal' field to ORC metadata Josh Poimboeuf
  2023-02-10 22:42 ` [PATCH 2/2] x86/entry: Fix unwinding from kprobe on PUSH/POP instruction Josh Poimboeuf
  0 siblings, 2 replies; 17+ messages in thread
From: Josh Poimboeuf @ 2023-02-10 22:42 UTC (permalink / raw)
  To: x86
  Cc: linux-kernel, Peter Zijlstra, Chen Zhongjin, Naveen N. Rao,
	Anil S Keshavamurthy, David S. Miller, Masami Hiramatsu

Josh Poimboeuf (2):
  x86/unwind/orc: Add 'signal' field to ORC metadata
  x86/entry: Fix unwinding from kprobe on PUSH/POP instruction

 arch/x86/entry/entry_64.S              |  9 ++++++++-
 arch/x86/include/asm/orc_types.h       |  4 +++-
 arch/x86/include/asm/unwind_hints.h    | 10 +++++-----
 arch/x86/kernel/unwind_orc.c           |  5 ++---
 include/linux/objtool.h                | 11 +++++++----
 tools/arch/x86/include/asm/orc_types.h |  4 +++-
 tools/include/linux/objtool.h          | 11 +++++++----
 tools/objtool/orc_dump.c               |  4 ++--
 8 files changed, 37 insertions(+), 21 deletions(-)

-- 
2.39.1


^ permalink raw reply	[flat|nested] 17+ messages in thread

* [PATCH 1/2] x86/unwind/orc: Add 'signal' field to ORC metadata
  2023-02-10 22:42 [PATCH 0/2] x86/unwind/orc: Fix unwinding from kprobe on PUSH/POP instruction Josh Poimboeuf
@ 2023-02-10 22:42 ` Josh Poimboeuf
  2023-02-11 11:52   ` [tip: objtool/core] " tip-bot2 for Josh Poimboeuf
  2023-02-10 22:42 ` [PATCH 2/2] x86/entry: Fix unwinding from kprobe on PUSH/POP instruction Josh Poimboeuf
  1 sibling, 1 reply; 17+ messages in thread
From: Josh Poimboeuf @ 2023-02-10 22:42 UTC (permalink / raw)
  To: x86
  Cc: linux-kernel, Peter Zijlstra, Chen Zhongjin, Naveen N. Rao,
	Anil S Keshavamurthy, David S. Miller, Masami Hiramatsu

Add a 'signal' field which allows unwind hints to specify whether the
instruction pointer should be taken literally (like for most interrupts
and exceptions) rather than decremented (like for call stack return
addresses) when used to find the next ORC entry.

Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
---
 arch/x86/include/asm/orc_types.h       |  4 +++-
 arch/x86/include/asm/unwind_hints.h    | 10 +++++-----
 arch/x86/kernel/unwind_orc.c           |  5 ++---
 include/linux/objtool.h                | 11 +++++++----
 tools/arch/x86/include/asm/orc_types.h |  4 +++-
 tools/include/linux/objtool.h          | 11 +++++++----
 tools/objtool/orc_dump.c               |  4 ++--
 7 files changed, 29 insertions(+), 20 deletions(-)

diff --git a/arch/x86/include/asm/orc_types.h b/arch/x86/include/asm/orc_types.h
index 5a2baf28a1dc..1343a62106de 100644
--- a/arch/x86/include/asm/orc_types.h
+++ b/arch/x86/include/asm/orc_types.h
@@ -57,12 +57,14 @@ struct orc_entry {
 	unsigned	sp_reg:4;
 	unsigned	bp_reg:4;
 	unsigned	type:2;
+	unsigned	signal:1;
 	unsigned	end:1;
 #elif defined(__BIG_ENDIAN_BITFIELD)
 	unsigned	bp_reg:4;
 	unsigned	sp_reg:4;
-	unsigned	unused:5;
+	unsigned	unused:4;
 	unsigned	end:1;
+	unsigned	signal:1;
 	unsigned	type:2;
 #endif
 } __packed;
diff --git a/arch/x86/include/asm/unwind_hints.h b/arch/x86/include/asm/unwind_hints.h
index f66fbe6537dd..e7c71750b309 100644
--- a/arch/x86/include/asm/unwind_hints.h
+++ b/arch/x86/include/asm/unwind_hints.h
@@ -15,7 +15,7 @@
 	UNWIND_HINT type=UNWIND_HINT_TYPE_ENTRY end=1
 .endm
 
-.macro UNWIND_HINT_REGS base=%rsp offset=0 indirect=0 extra=1 partial=0
+.macro UNWIND_HINT_REGS base=%rsp offset=0 indirect=0 extra=1 partial=0 signal=1
 	.if \base == %rsp
 		.if \indirect
 			.set sp_reg, ORC_REG_SP_INDIRECT
@@ -45,11 +45,11 @@
 		.set type, UNWIND_HINT_TYPE_REGS
 	.endif
 
-	UNWIND_HINT sp_reg=sp_reg sp_offset=sp_offset type=type
+	UNWIND_HINT sp_reg=sp_reg sp_offset=sp_offset type=type signal=\signal
 .endm
 
-.macro UNWIND_HINT_IRET_REGS base=%rsp offset=0
-	UNWIND_HINT_REGS base=\base offset=\offset partial=1
+.macro UNWIND_HINT_IRET_REGS base=%rsp offset=0 signal=1
+	UNWIND_HINT_REGS base=\base offset=\offset partial=1 signal=\signal
 .endm
 
 .macro UNWIND_HINT_FUNC
@@ -67,7 +67,7 @@
 #else
 
 #define UNWIND_HINT_FUNC \
-	UNWIND_HINT(ORC_REG_SP, 8, UNWIND_HINT_TYPE_FUNC, 0)
+	UNWIND_HINT(ORC_REG_SP, 8, UNWIND_HINT_TYPE_FUNC, 0, 0)
 
 #endif /* __ASSEMBLY__ */
 
diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c
index cdf6c6060170..37307b40f8da 100644
--- a/arch/x86/kernel/unwind_orc.c
+++ b/arch/x86/kernel/unwind_orc.c
@@ -484,6 +484,8 @@ bool unwind_next_frame(struct unwind_state *state)
 		goto the_end;
 	}
 
+	state->signal = orc->signal;
+
 	/* Find the previous frame's stack: */
 	switch (orc->sp_reg) {
 	case ORC_REG_SP:
@@ -563,7 +565,6 @@ bool unwind_next_frame(struct unwind_state *state)
 		state->sp = sp;
 		state->regs = NULL;
 		state->prev_regs = NULL;
-		state->signal = false;
 		break;
 
 	case UNWIND_HINT_TYPE_REGS:
@@ -587,7 +588,6 @@ bool unwind_next_frame(struct unwind_state *state)
 		state->regs = (struct pt_regs *)sp;
 		state->prev_regs = NULL;
 		state->full_regs = true;
-		state->signal = true;
 		break;
 
 	case UNWIND_HINT_TYPE_REGS_PARTIAL:
@@ -604,7 +604,6 @@ bool unwind_next_frame(struct unwind_state *state)
 			state->prev_regs = state->regs;
 		state->regs = (void *)sp - IRET_FRAME_OFFSET;
 		state->full_regs = false;
-		state->signal = true;
 		break;
 
 	default:
diff --git a/include/linux/objtool.h b/include/linux/objtool.h
index 62c54ffbeeaa..9ac3df3fccf0 100644
--- a/include/linux/objtool.h
+++ b/include/linux/objtool.h
@@ -15,6 +15,7 @@ struct unwind_hint {
 	s16		sp_offset;
 	u8		sp_reg;
 	u8		type;
+	u8		signal;
 	u8		end;
 };
 #endif
@@ -49,7 +50,7 @@ struct unwind_hint {
 
 #ifndef __ASSEMBLY__
 
-#define UNWIND_HINT(sp_reg, sp_offset, type, end)		\
+#define UNWIND_HINT(sp_reg, sp_offset, type, signal, end)	\
 	"987: \n\t"						\
 	".pushsection .discard.unwind_hints\n\t"		\
 	/* struct unwind_hint */				\
@@ -57,6 +58,7 @@ struct unwind_hint {
 	".short " __stringify(sp_offset) "\n\t"			\
 	".byte " __stringify(sp_reg) "\n\t"			\
 	".byte " __stringify(type) "\n\t"			\
+	".byte " __stringify(signal) "\n\t"			\
 	".byte " __stringify(end) "\n\t"			\
 	".balign 4 \n\t"					\
 	".popsection\n\t"
@@ -129,7 +131,7 @@ struct unwind_hint {
  * the debuginfo as necessary.  It will also warn if it sees any
  * inconsistencies.
  */
-.macro UNWIND_HINT type:req sp_reg=0 sp_offset=0 end=0
+.macro UNWIND_HINT type:req sp_reg=0 sp_offset=0 signal=0 end=0
 .Lunwind_hint_ip_\@:
 	.pushsection .discard.unwind_hints
 		/* struct unwind_hint */
@@ -137,6 +139,7 @@ struct unwind_hint {
 		.short \sp_offset
 		.byte \sp_reg
 		.byte \type
+		.byte \signal
 		.byte \end
 		.balign 4
 	.popsection
@@ -174,7 +177,7 @@ struct unwind_hint {
 
 #ifndef __ASSEMBLY__
 
-#define UNWIND_HINT(sp_reg, sp_offset, type, end)	\
+#define UNWIND_HINT(sp_reg, sp_offset, type, signal, end) \
 	"\n\t"
 #define STACK_FRAME_NON_STANDARD(func)
 #define STACK_FRAME_NON_STANDARD_FP(func)
@@ -182,7 +185,7 @@ struct unwind_hint {
 #define ASM_REACHABLE
 #else
 #define ANNOTATE_INTRA_FUNCTION_CALL
-.macro UNWIND_HINT type:req sp_reg=0 sp_offset=0 end=0
+.macro UNWIND_HINT type:req sp_reg=0 sp_offset=0 signal=0 end=0
 .endm
 .macro STACK_FRAME_NON_STANDARD func:req
 .endm
diff --git a/tools/arch/x86/include/asm/orc_types.h b/tools/arch/x86/include/asm/orc_types.h
index 5a2baf28a1dc..1343a62106de 100644
--- a/tools/arch/x86/include/asm/orc_types.h
+++ b/tools/arch/x86/include/asm/orc_types.h
@@ -57,12 +57,14 @@ struct orc_entry {
 	unsigned	sp_reg:4;
 	unsigned	bp_reg:4;
 	unsigned	type:2;
+	unsigned	signal:1;
 	unsigned	end:1;
 #elif defined(__BIG_ENDIAN_BITFIELD)
 	unsigned	bp_reg:4;
 	unsigned	sp_reg:4;
-	unsigned	unused:5;
+	unsigned	unused:4;
 	unsigned	end:1;
+	unsigned	signal:1;
 	unsigned	type:2;
 #endif
 } __packed;
diff --git a/tools/include/linux/objtool.h b/tools/include/linux/objtool.h
index 62c54ffbeeaa..9ac3df3fccf0 100644
--- a/tools/include/linux/objtool.h
+++ b/tools/include/linux/objtool.h
@@ -15,6 +15,7 @@ struct unwind_hint {
 	s16		sp_offset;
 	u8		sp_reg;
 	u8		type;
+	u8		signal;
 	u8		end;
 };
 #endif
@@ -49,7 +50,7 @@ struct unwind_hint {
 
 #ifndef __ASSEMBLY__
 
-#define UNWIND_HINT(sp_reg, sp_offset, type, end)		\
+#define UNWIND_HINT(sp_reg, sp_offset, type, signal, end)	\
 	"987: \n\t"						\
 	".pushsection .discard.unwind_hints\n\t"		\
 	/* struct unwind_hint */				\
@@ -57,6 +58,7 @@ struct unwind_hint {
 	".short " __stringify(sp_offset) "\n\t"			\
 	".byte " __stringify(sp_reg) "\n\t"			\
 	".byte " __stringify(type) "\n\t"			\
+	".byte " __stringify(signal) "\n\t"			\
 	".byte " __stringify(end) "\n\t"			\
 	".balign 4 \n\t"					\
 	".popsection\n\t"
@@ -129,7 +131,7 @@ struct unwind_hint {
  * the debuginfo as necessary.  It will also warn if it sees any
  * inconsistencies.
  */
-.macro UNWIND_HINT type:req sp_reg=0 sp_offset=0 end=0
+.macro UNWIND_HINT type:req sp_reg=0 sp_offset=0 signal=0 end=0
 .Lunwind_hint_ip_\@:
 	.pushsection .discard.unwind_hints
 		/* struct unwind_hint */
@@ -137,6 +139,7 @@ struct unwind_hint {
 		.short \sp_offset
 		.byte \sp_reg
 		.byte \type
+		.byte \signal
 		.byte \end
 		.balign 4
 	.popsection
@@ -174,7 +177,7 @@ struct unwind_hint {
 
 #ifndef __ASSEMBLY__
 
-#define UNWIND_HINT(sp_reg, sp_offset, type, end)	\
+#define UNWIND_HINT(sp_reg, sp_offset, type, signal, end) \
 	"\n\t"
 #define STACK_FRAME_NON_STANDARD(func)
 #define STACK_FRAME_NON_STANDARD_FP(func)
@@ -182,7 +185,7 @@ struct unwind_hint {
 #define ASM_REACHABLE
 #else
 #define ANNOTATE_INTRA_FUNCTION_CALL
-.macro UNWIND_HINT type:req sp_reg=0 sp_offset=0 end=0
+.macro UNWIND_HINT type:req sp_reg=0 sp_offset=0 signal=0 end=0
 .endm
 .macro STACK_FRAME_NON_STANDARD func:req
 .endm
diff --git a/tools/objtool/orc_dump.c b/tools/objtool/orc_dump.c
index 4f1211fec82c..2d8ebdcd1db3 100644
--- a/tools/objtool/orc_dump.c
+++ b/tools/objtool/orc_dump.c
@@ -211,8 +211,8 @@ int orc_dump(const char *_objname)
 
 		print_reg(orc[i].bp_reg, bswap_if_needed(&dummy_elf, orc[i].bp_offset));
 
-		printf(" type:%s end:%d\n",
-		       orc_type_name(orc[i].type), orc[i].end);
+		printf(" type:%s signal:%d end:%d\n",
+		       orc_type_name(orc[i].type), orc[i].signal, orc[i].end);
 	}
 
 	elf_end(elf);
-- 
2.39.1


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 2/2] x86/entry: Fix unwinding from kprobe on PUSH/POP instruction
  2023-02-10 22:42 [PATCH 0/2] x86/unwind/orc: Fix unwinding from kprobe on PUSH/POP instruction Josh Poimboeuf
  2023-02-10 22:42 ` [PATCH 1/2] x86/unwind/orc: Add 'signal' field to ORC metadata Josh Poimboeuf
@ 2023-02-10 22:42 ` Josh Poimboeuf
  2023-02-11 11:52   ` [tip: objtool/core] " tip-bot2 for Josh Poimboeuf
                     ` (2 more replies)
  1 sibling, 3 replies; 17+ messages in thread
From: Josh Poimboeuf @ 2023-02-10 22:42 UTC (permalink / raw)
  To: x86
  Cc: linux-kernel, Peter Zijlstra, Chen Zhongjin, Naveen N. Rao,
	Anil S Keshavamurthy, David S. Miller, Masami Hiramatsu

If a kprobe (INT3) is set on a stack-modifying single-byte instruction,
like a single-byte PUSH/POP or a LEAVE, ORC fails to unwind past it:

  Call Trace:
   <TASK>
   dump_stack_lvl+0x57/0x90
   handler_pre+0x33/0x40 [kprobe_example]
   aggr_pre_handler+0x49/0x90
   kprobe_int3_handler+0xe3/0x180
   do_int3+0x3a/0x80
   exc_int3+0x7d/0xc0
   asm_exc_int3+0x35/0x40
  RIP: 0010:kernel_clone+0xe/0x3a0
  Code: cc e8 16 b2 bf 00 66 0f 1f 44 00 00 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 0f 1f 44 00 00 41 57 41 56 41 55 41 54 cc <53> 48 89 fb 48 83 ec 68 4c 8b 27 65 48 8b 04 25 28 00 00 00 48 89
  RSP: 0018:ffffc9000074fda0 EFLAGS: 00000206
  RAX: 0000000000808100 RBX: ffff888109de9d80 RCX: 0000000000000000
  RDX: 0000000000000011 RSI: ffff888109de9d80 RDI: ffffc9000074fdc8
  RBP: ffff8881019543c0 R08: ffffffff81127e30 R09: 00000000e71742a5
  R10: ffff888104764a18 R11: 0000000071742a5e R12: ffff888100078800
  R13: ffff888100126000 R14: 0000000000000000 R15: ffff888100126005
   ? __pfx_call_usermodehelper_exec_async+0x10/0x10
   ? kernel_clone+0xe/0x3a0
   ? user_mode_thread+0x5b/0x80
   ? __pfx_call_usermodehelper_exec_async+0x10/0x10
   ? call_usermodehelper_exec_work+0x77/0xb0
   ? process_one_work+0x299/0x5f0
   ? worker_thread+0x4f/0x3a0
   ? __pfx_worker_thread+0x10/0x10
   ? kthread+0xf2/0x120
   ? __pfx_kthread+0x10/0x10
   ? ret_from_fork+0x29/0x50
   </TASK>

The problem is that #BP saves the pointer to the instruction immediately
*after* the INT3, rather than to the INT3 itself.  The instruction
replaced by the INT3 hasn't actually run, but ORC assumes otherwise and
expects the wrong stack layout.

Fix it by annotating the #BP exception as a non-signal stack frame,
which tells the ORC unwinder to decrement the instruction pointer before
looking up the corresponding ORC entry.

Reported-by: Chen Zhongjin <chenzhongjin@huawei.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
---
 arch/x86/entry/entry_64.S | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 15739a2c0983..8d21881adf86 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -385,7 +385,14 @@ SYM_CODE_END(xen_error_entry)
  */
 .macro idtentry vector asmsym cfunc has_error_code:req
 SYM_CODE_START(\asmsym)
-	UNWIND_HINT_IRET_REGS offset=\has_error_code*8
+
+	.if \vector == X86_TRAP_BP
+		/* #BP advances %rip to the next instruction */
+		UNWIND_HINT_IRET_REGS offset=\has_error_code*8 signal=0
+	.else
+		UNWIND_HINT_IRET_REGS offset=\has_error_code*8
+	.endif
+
 	ENDBR
 	ASM_CLAC
 	cld
-- 
2.39.1


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [tip: objtool/core] x86/entry: Fix unwinding from kprobe on PUSH/POP instruction
  2023-02-10 22:42 ` [PATCH 2/2] x86/entry: Fix unwinding from kprobe on PUSH/POP instruction Josh Poimboeuf
@ 2023-02-11 11:52   ` tip-bot2 for Josh Poimboeuf
  2023-02-13 14:43   ` [PATCH 2/2] " Masami Hiramatsu
  2023-02-16 11:58   ` Peter Zijlstra
  2 siblings, 0 replies; 17+ messages in thread
From: tip-bot2 for Josh Poimboeuf @ 2023-02-11 11:52 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: Chen Zhongjin, Josh Poimboeuf, Ingo Molnar, x86, linux-kernel

The following commit has been merged into the objtool/core branch of tip:

Commit-ID:     37064583f63eca93c98a9cdf2360485ea05f617a
Gitweb:        https://git.kernel.org/tip/37064583f63eca93c98a9cdf2360485ea05f617a
Author:        Josh Poimboeuf <jpoimboe@kernel.org>
AuthorDate:    Fri, 10 Feb 2023 14:42:02 -08:00
Committer:     Ingo Molnar <mingo@kernel.org>
CommitterDate: Sat, 11 Feb 2023 12:37:51 +01:00

x86/entry: Fix unwinding from kprobe on PUSH/POP instruction

If a kprobe (INT3) is set on a stack-modifying single-byte instruction,
like a single-byte PUSH/POP or a LEAVE, ORC fails to unwind past it:

  Call Trace:
   <TASK>
   dump_stack_lvl+0x57/0x90
   handler_pre+0x33/0x40 [kprobe_example]
   aggr_pre_handler+0x49/0x90
   kprobe_int3_handler+0xe3/0x180
   do_int3+0x3a/0x80
   exc_int3+0x7d/0xc0
   asm_exc_int3+0x35/0x40
  RIP: 0010:kernel_clone+0xe/0x3a0
  Code: cc e8 16 b2 bf 00 66 0f 1f 44 00 00 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 0f 1f 44 00 00 41 57 41 56 41 55 41 54 cc <53> 48 89 fb 48 83 ec 68 4c 8b 27 65 48 8b 04 25 28 00 00 00 48 89
  RSP: 0018:ffffc9000074fda0 EFLAGS: 00000206
  RAX: 0000000000808100 RBX: ffff888109de9d80 RCX: 0000000000000000
  RDX: 0000000000000011 RSI: ffff888109de9d80 RDI: ffffc9000074fdc8
  RBP: ffff8881019543c0 R08: ffffffff81127e30 R09: 00000000e71742a5
  R10: ffff888104764a18 R11: 0000000071742a5e R12: ffff888100078800
  R13: ffff888100126000 R14: 0000000000000000 R15: ffff888100126005
   ? __pfx_call_usermodehelper_exec_async+0x10/0x10
   ? kernel_clone+0xe/0x3a0
   ? user_mode_thread+0x5b/0x80
   ? __pfx_call_usermodehelper_exec_async+0x10/0x10
   ? call_usermodehelper_exec_work+0x77/0xb0
   ? process_one_work+0x299/0x5f0
   ? worker_thread+0x4f/0x3a0
   ? __pfx_worker_thread+0x10/0x10
   ? kthread+0xf2/0x120
   ? __pfx_kthread+0x10/0x10
   ? ret_from_fork+0x29/0x50
   </TASK>

The problem is that #BP saves the pointer to the instruction immediately
*after* the INT3, rather than to the INT3 itself.  The instruction
replaced by the INT3 hasn't actually run, but ORC assumes otherwise and
expects the wrong stack layout.

Fix it by annotating the #BP exception as a non-signal stack frame,
which tells the ORC unwinder to decrement the instruction pointer before
looking up the corresponding ORC entry.

Reported-by: Chen Zhongjin <chenzhongjin@huawei.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/baafcd3cc1abb14cb757fe081fa696012a5265ee.1676068346.git.jpoimboe@kernel.org
---
 arch/x86/entry/entry_64.S |  9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 15739a2..8d21881 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -385,7 +385,14 @@ SYM_CODE_END(xen_error_entry)
  */
 .macro idtentry vector asmsym cfunc has_error_code:req
 SYM_CODE_START(\asmsym)
-	UNWIND_HINT_IRET_REGS offset=\has_error_code*8
+
+	.if \vector == X86_TRAP_BP
+		/* #BP advances %rip to the next instruction */
+		UNWIND_HINT_IRET_REGS offset=\has_error_code*8 signal=0
+	.else
+		UNWIND_HINT_IRET_REGS offset=\has_error_code*8
+	.endif
+
 	ENDBR
 	ASM_CLAC
 	cld

^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [tip: objtool/core] x86/unwind/orc: Add 'signal' field to ORC metadata
  2023-02-10 22:42 ` [PATCH 1/2] x86/unwind/orc: Add 'signal' field to ORC metadata Josh Poimboeuf
@ 2023-02-11 11:52   ` tip-bot2 for Josh Poimboeuf
  0 siblings, 0 replies; 17+ messages in thread
From: tip-bot2 for Josh Poimboeuf @ 2023-02-11 11:52 UTC (permalink / raw)
  To: linux-tip-commits; +Cc: Josh Poimboeuf, Ingo Molnar, x86, linux-kernel

The following commit has been merged into the objtool/core branch of tip:

Commit-ID:     ffb1b4a41016295e298409c9dbcacd55680bd6d4
Gitweb:        https://git.kernel.org/tip/ffb1b4a41016295e298409c9dbcacd55680bd6d4
Author:        Josh Poimboeuf <jpoimboe@kernel.org>
AuthorDate:    Fri, 10 Feb 2023 14:42:01 -08:00
Committer:     Ingo Molnar <mingo@kernel.org>
CommitterDate: Sat, 11 Feb 2023 12:37:51 +01:00

x86/unwind/orc: Add 'signal' field to ORC metadata

Add a 'signal' field which allows unwind hints to specify whether the
instruction pointer should be taken literally (like for most interrupts
and exceptions) rather than decremented (like for call stack return
addresses) when used to find the next ORC entry.

Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/d2c5ec4d83a45b513d8fd72fab59f1a8cfa46871.1676068346.git.jpoimboe@kernel.org
---
 arch/x86/include/asm/orc_types.h       |  4 +++-
 arch/x86/include/asm/unwind_hints.h    | 10 +++++-----
 arch/x86/kernel/unwind_orc.c           |  5 ++---
 include/linux/objtool.h                | 11 +++++++----
 tools/arch/x86/include/asm/orc_types.h |  4 +++-
 tools/include/linux/objtool.h          | 11 +++++++----
 tools/objtool/orc_dump.c               |  4 ++--
 7 files changed, 29 insertions(+), 20 deletions(-)

diff --git a/arch/x86/include/asm/orc_types.h b/arch/x86/include/asm/orc_types.h
index 5a2baf2..1343a62 100644
--- a/arch/x86/include/asm/orc_types.h
+++ b/arch/x86/include/asm/orc_types.h
@@ -57,12 +57,14 @@ struct orc_entry {
 	unsigned	sp_reg:4;
 	unsigned	bp_reg:4;
 	unsigned	type:2;
+	unsigned	signal:1;
 	unsigned	end:1;
 #elif defined(__BIG_ENDIAN_BITFIELD)
 	unsigned	bp_reg:4;
 	unsigned	sp_reg:4;
-	unsigned	unused:5;
+	unsigned	unused:4;
 	unsigned	end:1;
+	unsigned	signal:1;
 	unsigned	type:2;
 #endif
 } __packed;
diff --git a/arch/x86/include/asm/unwind_hints.h b/arch/x86/include/asm/unwind_hints.h
index f66fbe6..e7c7175 100644
--- a/arch/x86/include/asm/unwind_hints.h
+++ b/arch/x86/include/asm/unwind_hints.h
@@ -15,7 +15,7 @@
 	UNWIND_HINT type=UNWIND_HINT_TYPE_ENTRY end=1
 .endm
 
-.macro UNWIND_HINT_REGS base=%rsp offset=0 indirect=0 extra=1 partial=0
+.macro UNWIND_HINT_REGS base=%rsp offset=0 indirect=0 extra=1 partial=0 signal=1
 	.if \base == %rsp
 		.if \indirect
 			.set sp_reg, ORC_REG_SP_INDIRECT
@@ -45,11 +45,11 @@
 		.set type, UNWIND_HINT_TYPE_REGS
 	.endif
 
-	UNWIND_HINT sp_reg=sp_reg sp_offset=sp_offset type=type
+	UNWIND_HINT sp_reg=sp_reg sp_offset=sp_offset type=type signal=\signal
 .endm
 
-.macro UNWIND_HINT_IRET_REGS base=%rsp offset=0
-	UNWIND_HINT_REGS base=\base offset=\offset partial=1
+.macro UNWIND_HINT_IRET_REGS base=%rsp offset=0 signal=1
+	UNWIND_HINT_REGS base=\base offset=\offset partial=1 signal=\signal
 .endm
 
 .macro UNWIND_HINT_FUNC
@@ -67,7 +67,7 @@
 #else
 
 #define UNWIND_HINT_FUNC \
-	UNWIND_HINT(ORC_REG_SP, 8, UNWIND_HINT_TYPE_FUNC, 0)
+	UNWIND_HINT(ORC_REG_SP, 8, UNWIND_HINT_TYPE_FUNC, 0, 0)
 
 #endif /* __ASSEMBLY__ */
 
diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c
index cdf6c60..37307b4 100644
--- a/arch/x86/kernel/unwind_orc.c
+++ b/arch/x86/kernel/unwind_orc.c
@@ -484,6 +484,8 @@ bool unwind_next_frame(struct unwind_state *state)
 		goto the_end;
 	}
 
+	state->signal = orc->signal;
+
 	/* Find the previous frame's stack: */
 	switch (orc->sp_reg) {
 	case ORC_REG_SP:
@@ -563,7 +565,6 @@ bool unwind_next_frame(struct unwind_state *state)
 		state->sp = sp;
 		state->regs = NULL;
 		state->prev_regs = NULL;
-		state->signal = false;
 		break;
 
 	case UNWIND_HINT_TYPE_REGS:
@@ -587,7 +588,6 @@ bool unwind_next_frame(struct unwind_state *state)
 		state->regs = (struct pt_regs *)sp;
 		state->prev_regs = NULL;
 		state->full_regs = true;
-		state->signal = true;
 		break;
 
 	case UNWIND_HINT_TYPE_REGS_PARTIAL:
@@ -604,7 +604,6 @@ bool unwind_next_frame(struct unwind_state *state)
 			state->prev_regs = state->regs;
 		state->regs = (void *)sp - IRET_FRAME_OFFSET;
 		state->full_regs = false;
-		state->signal = true;
 		break;
 
 	default:
diff --git a/include/linux/objtool.h b/include/linux/objtool.h
index 62c54ff..9ac3df3 100644
--- a/include/linux/objtool.h
+++ b/include/linux/objtool.h
@@ -15,6 +15,7 @@ struct unwind_hint {
 	s16		sp_offset;
 	u8		sp_reg;
 	u8		type;
+	u8		signal;
 	u8		end;
 };
 #endif
@@ -49,7 +50,7 @@ struct unwind_hint {
 
 #ifndef __ASSEMBLY__
 
-#define UNWIND_HINT(sp_reg, sp_offset, type, end)		\
+#define UNWIND_HINT(sp_reg, sp_offset, type, signal, end)	\
 	"987: \n\t"						\
 	".pushsection .discard.unwind_hints\n\t"		\
 	/* struct unwind_hint */				\
@@ -57,6 +58,7 @@ struct unwind_hint {
 	".short " __stringify(sp_offset) "\n\t"			\
 	".byte " __stringify(sp_reg) "\n\t"			\
 	".byte " __stringify(type) "\n\t"			\
+	".byte " __stringify(signal) "\n\t"			\
 	".byte " __stringify(end) "\n\t"			\
 	".balign 4 \n\t"					\
 	".popsection\n\t"
@@ -129,7 +131,7 @@ struct unwind_hint {
  * the debuginfo as necessary.  It will also warn if it sees any
  * inconsistencies.
  */
-.macro UNWIND_HINT type:req sp_reg=0 sp_offset=0 end=0
+.macro UNWIND_HINT type:req sp_reg=0 sp_offset=0 signal=0 end=0
 .Lunwind_hint_ip_\@:
 	.pushsection .discard.unwind_hints
 		/* struct unwind_hint */
@@ -137,6 +139,7 @@ struct unwind_hint {
 		.short \sp_offset
 		.byte \sp_reg
 		.byte \type
+		.byte \signal
 		.byte \end
 		.balign 4
 	.popsection
@@ -174,7 +177,7 @@ struct unwind_hint {
 
 #ifndef __ASSEMBLY__
 
-#define UNWIND_HINT(sp_reg, sp_offset, type, end)	\
+#define UNWIND_HINT(sp_reg, sp_offset, type, signal, end) \
 	"\n\t"
 #define STACK_FRAME_NON_STANDARD(func)
 #define STACK_FRAME_NON_STANDARD_FP(func)
@@ -182,7 +185,7 @@ struct unwind_hint {
 #define ASM_REACHABLE
 #else
 #define ANNOTATE_INTRA_FUNCTION_CALL
-.macro UNWIND_HINT type:req sp_reg=0 sp_offset=0 end=0
+.macro UNWIND_HINT type:req sp_reg=0 sp_offset=0 signal=0 end=0
 .endm
 .macro STACK_FRAME_NON_STANDARD func:req
 .endm
diff --git a/tools/arch/x86/include/asm/orc_types.h b/tools/arch/x86/include/asm/orc_types.h
index 5a2baf2..1343a62 100644
--- a/tools/arch/x86/include/asm/orc_types.h
+++ b/tools/arch/x86/include/asm/orc_types.h
@@ -57,12 +57,14 @@ struct orc_entry {
 	unsigned	sp_reg:4;
 	unsigned	bp_reg:4;
 	unsigned	type:2;
+	unsigned	signal:1;
 	unsigned	end:1;
 #elif defined(__BIG_ENDIAN_BITFIELD)
 	unsigned	bp_reg:4;
 	unsigned	sp_reg:4;
-	unsigned	unused:5;
+	unsigned	unused:4;
 	unsigned	end:1;
+	unsigned	signal:1;
 	unsigned	type:2;
 #endif
 } __packed;
diff --git a/tools/include/linux/objtool.h b/tools/include/linux/objtool.h
index 62c54ff..9ac3df3 100644
--- a/tools/include/linux/objtool.h
+++ b/tools/include/linux/objtool.h
@@ -15,6 +15,7 @@ struct unwind_hint {
 	s16		sp_offset;
 	u8		sp_reg;
 	u8		type;
+	u8		signal;
 	u8		end;
 };
 #endif
@@ -49,7 +50,7 @@ struct unwind_hint {
 
 #ifndef __ASSEMBLY__
 
-#define UNWIND_HINT(sp_reg, sp_offset, type, end)		\
+#define UNWIND_HINT(sp_reg, sp_offset, type, signal, end)	\
 	"987: \n\t"						\
 	".pushsection .discard.unwind_hints\n\t"		\
 	/* struct unwind_hint */				\
@@ -57,6 +58,7 @@ struct unwind_hint {
 	".short " __stringify(sp_offset) "\n\t"			\
 	".byte " __stringify(sp_reg) "\n\t"			\
 	".byte " __stringify(type) "\n\t"			\
+	".byte " __stringify(signal) "\n\t"			\
 	".byte " __stringify(end) "\n\t"			\
 	".balign 4 \n\t"					\
 	".popsection\n\t"
@@ -129,7 +131,7 @@ struct unwind_hint {
  * the debuginfo as necessary.  It will also warn if it sees any
  * inconsistencies.
  */
-.macro UNWIND_HINT type:req sp_reg=0 sp_offset=0 end=0
+.macro UNWIND_HINT type:req sp_reg=0 sp_offset=0 signal=0 end=0
 .Lunwind_hint_ip_\@:
 	.pushsection .discard.unwind_hints
 		/* struct unwind_hint */
@@ -137,6 +139,7 @@ struct unwind_hint {
 		.short \sp_offset
 		.byte \sp_reg
 		.byte \type
+		.byte \signal
 		.byte \end
 		.balign 4
 	.popsection
@@ -174,7 +177,7 @@ struct unwind_hint {
 
 #ifndef __ASSEMBLY__
 
-#define UNWIND_HINT(sp_reg, sp_offset, type, end)	\
+#define UNWIND_HINT(sp_reg, sp_offset, type, signal, end) \
 	"\n\t"
 #define STACK_FRAME_NON_STANDARD(func)
 #define STACK_FRAME_NON_STANDARD_FP(func)
@@ -182,7 +185,7 @@ struct unwind_hint {
 #define ASM_REACHABLE
 #else
 #define ANNOTATE_INTRA_FUNCTION_CALL
-.macro UNWIND_HINT type:req sp_reg=0 sp_offset=0 end=0
+.macro UNWIND_HINT type:req sp_reg=0 sp_offset=0 signal=0 end=0
 .endm
 .macro STACK_FRAME_NON_STANDARD func:req
 .endm
diff --git a/tools/objtool/orc_dump.c b/tools/objtool/orc_dump.c
index 4f1211f..2d8ebdc 100644
--- a/tools/objtool/orc_dump.c
+++ b/tools/objtool/orc_dump.c
@@ -211,8 +211,8 @@ int orc_dump(const char *_objname)
 
 		print_reg(orc[i].bp_reg, bswap_if_needed(&dummy_elf, orc[i].bp_offset));
 
-		printf(" type:%s end:%d\n",
-		       orc_type_name(orc[i].type), orc[i].end);
+		printf(" type:%s signal:%d end:%d\n",
+		       orc_type_name(orc[i].type), orc[i].signal, orc[i].end);
 	}
 
 	elf_end(elf);

^ permalink raw reply related	[flat|nested] 17+ messages in thread

* Re: [PATCH 2/2] x86/entry: Fix unwinding from kprobe on PUSH/POP instruction
  2023-02-10 22:42 ` [PATCH 2/2] x86/entry: Fix unwinding from kprobe on PUSH/POP instruction Josh Poimboeuf
  2023-02-11 11:52   ` [tip: objtool/core] " tip-bot2 for Josh Poimboeuf
@ 2023-02-13 14:43   ` Masami Hiramatsu
  2023-02-14 11:35     ` Peter Zijlstra
  2023-02-16 11:58   ` Peter Zijlstra
  2 siblings, 1 reply; 17+ messages in thread
From: Masami Hiramatsu @ 2023-02-13 14:43 UTC (permalink / raw)
  To: Josh Poimboeuf
  Cc: x86, linux-kernel, Peter Zijlstra, Chen Zhongjin, Naveen N. Rao,
	Anil S Keshavamurthy, David S. Miller, Masami Hiramatsu

On Fri, 10 Feb 2023 14:42:02 -0800
Josh Poimboeuf <jpoimboe@kernel.org> wrote:

> If a kprobe (INT3) is set on a stack-modifying single-byte instruction,
> like a single-byte PUSH/POP or a LEAVE, ORC fails to unwind past it:
> 
>   Call Trace:
>    <TASK>
>    dump_stack_lvl+0x57/0x90
>    handler_pre+0x33/0x40 [kprobe_example]
>    aggr_pre_handler+0x49/0x90
>    kprobe_int3_handler+0xe3/0x180
>    do_int3+0x3a/0x80
>    exc_int3+0x7d/0xc0
>    asm_exc_int3+0x35/0x40
>   RIP: 0010:kernel_clone+0xe/0x3a0
>   Code: cc e8 16 b2 bf 00 66 0f 1f 44 00 00 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 0f 1f 44 00 00 41 57 41 56 41 55 41 54 cc <53> 48 89 fb 48 83 ec 68 4c 8b 27 65 48 8b 04 25 28 00 00 00 48 89
>   RSP: 0018:ffffc9000074fda0 EFLAGS: 00000206
>   RAX: 0000000000808100 RBX: ffff888109de9d80 RCX: 0000000000000000
>   RDX: 0000000000000011 RSI: ffff888109de9d80 RDI: ffffc9000074fdc8
>   RBP: ffff8881019543c0 R08: ffffffff81127e30 R09: 00000000e71742a5
>   R10: ffff888104764a18 R11: 0000000071742a5e R12: ffff888100078800
>   R13: ffff888100126000 R14: 0000000000000000 R15: ffff888100126005
>    ? __pfx_call_usermodehelper_exec_async+0x10/0x10
>    ? kernel_clone+0xe/0x3a0
>    ? user_mode_thread+0x5b/0x80
>    ? __pfx_call_usermodehelper_exec_async+0x10/0x10
>    ? call_usermodehelper_exec_work+0x77/0xb0
>    ? process_one_work+0x299/0x5f0
>    ? worker_thread+0x4f/0x3a0
>    ? __pfx_worker_thread+0x10/0x10
>    ? kthread+0xf2/0x120
>    ? __pfx_kthread+0x10/0x10
>    ? ret_from_fork+0x29/0x50
>    </TASK>
> 
> The problem is that #BP saves the pointer to the instruction immediately
> *after* the INT3, rather than to the INT3 itself.  The instruction
> replaced by the INT3 hasn't actually run, but ORC assumes otherwise and
> expects the wrong stack layout.

Ah, regs->ip is not adjusted. Yes. kprobes user usually use kp->addr.
Hmm, maybe we also can adjust regs->ip in kprobes, but this change may
help future use of stackdump in the int3 code. So I agree.

> Fix it by annotating the #BP exception as a non-signal stack frame,
> which tells the ORC unwinder to decrement the instruction pointer before
> looking up the corresponding ORC entry.

Just to make it clear, this sounds like a 'hack' use of non-signal stack
frame. If so, can we change the flag name as 'literal' or 'non-literal' etc?
I concern that the 'signal' flag is used differently in the future.

Thank you,


> 
> Reported-by: Chen Zhongjin <chenzhongjin@huawei.com>
> Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
> ---
>  arch/x86/entry/entry_64.S | 9 ++++++++-
>  1 file changed, 8 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
> index 15739a2c0983..8d21881adf86 100644
> --- a/arch/x86/entry/entry_64.S
> +++ b/arch/x86/entry/entry_64.S
> @@ -385,7 +385,14 @@ SYM_CODE_END(xen_error_entry)
>   */
>  .macro idtentry vector asmsym cfunc has_error_code:req
>  SYM_CODE_START(\asmsym)
> -	UNWIND_HINT_IRET_REGS offset=\has_error_code*8
> +
> +	.if \vector == X86_TRAP_BP
> +		/* #BP advances %rip to the next instruction */
> +		UNWIND_HINT_IRET_REGS offset=\has_error_code*8 signal=0
> +	.else
> +		UNWIND_HINT_IRET_REGS offset=\has_error_code*8
> +	.endif
> +
>  	ENDBR
>  	ASM_CLAC
>  	cld
> -- 
> 2.39.1
> 


-- 
Masami Hiramatsu (Google) <mhiramat@kernel.org>

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 2/2] x86/entry: Fix unwinding from kprobe on PUSH/POP instruction
  2023-02-13 14:43   ` [PATCH 2/2] " Masami Hiramatsu
@ 2023-02-14 11:35     ` Peter Zijlstra
  2023-02-14 17:05       ` Josh Poimboeuf
  0 siblings, 1 reply; 17+ messages in thread
From: Peter Zijlstra @ 2023-02-14 11:35 UTC (permalink / raw)
  To: Masami Hiramatsu
  Cc: Josh Poimboeuf, x86, linux-kernel, Chen Zhongjin, Naveen N. Rao,
	Anil S Keshavamurthy, David S. Miller

On Mon, Feb 13, 2023 at 11:43:57PM +0900, Masami Hiramatsu wrote:

> > Fix it by annotating the #BP exception as a non-signal stack frame,
> > which tells the ORC unwinder to decrement the instruction pointer before
> > looking up the corresponding ORC entry.
> 
> Just to make it clear, this sounds like a 'hack' use of non-signal stack
> frame. If so, can we change the flag name as 'literal' or 'non-literal' etc?
> I concern that the 'signal' flag is used differently in the future.

Oooh, bike-shed :-) Let me suggest trap=1, where a trap is a fault with
a different return address, specifically the instruction after the
faulting instruction.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 2/2] x86/entry: Fix unwinding from kprobe on PUSH/POP instruction
  2023-02-14 11:35     ` Peter Zijlstra
@ 2023-02-14 17:05       ` Josh Poimboeuf
  2023-02-15 10:25         ` Peter Zijlstra
  0 siblings, 1 reply; 17+ messages in thread
From: Josh Poimboeuf @ 2023-02-14 17:05 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Masami Hiramatsu, x86, linux-kernel, Chen Zhongjin,
	Naveen N. Rao, Anil S Keshavamurthy, David S. Miller

On Tue, Feb 14, 2023 at 12:35:04PM +0100, Peter Zijlstra wrote:
> On Mon, Feb 13, 2023 at 11:43:57PM +0900, Masami Hiramatsu wrote:
> 
> > > Fix it by annotating the #BP exception as a non-signal stack frame,
> > > which tells the ORC unwinder to decrement the instruction pointer before
> > > looking up the corresponding ORC entry.
> > 
> > Just to make it clear, this sounds like a 'hack' use of non-signal stack
> > frame. If so, can we change the flag name as 'literal' or 'non-literal' etc?
> > I concern that the 'signal' flag is used differently in the future.

Agreed, though I'm having trouble coming up with a succinct yet
scrutable name.  If length wasn't an issue it would be something like

  "decrement_return_address_when_looking_up_the_next_orc_entry"

> Oooh, bike-shed :-) Let me suggest trap=1, where a trap is a fault with
> a different return address, specifically the instruction after the
> faulting instruction.

I think "trap" doesn't work because

 1) It's more than just traps, it's also function calls.  We have
    traps/calls in one bucket (decrement IP); and everything else
    (faults, aborts, irqs) in the other (don't decrement IP).

 2) It's not necessarily all traps which need the flag, just those that
    affect a previously-but-now-overwritten stack-modifying instruction.
    So #OF (which we don't use?) and trap-class #DB don't seem to be
    affected.  In practice maybe this distinction doesn't matter, but
    for example there's no reason for ORC try to distinguish trap #DB
    from non-trap #DB at runtime.

-- 
Josh

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 2/2] x86/entry: Fix unwinding from kprobe on PUSH/POP instruction
  2023-02-14 17:05       ` Josh Poimboeuf
@ 2023-02-15 10:25         ` Peter Zijlstra
  2023-02-15 23:16           ` Josh Poimboeuf
  0 siblings, 1 reply; 17+ messages in thread
From: Peter Zijlstra @ 2023-02-15 10:25 UTC (permalink / raw)
  To: Josh Poimboeuf
  Cc: Masami Hiramatsu, x86, linux-kernel, Chen Zhongjin,
	Naveen N. Rao, Anil S Keshavamurthy, David S. Miller

On Tue, Feb 14, 2023 at 09:05:52AM -0800, Josh Poimboeuf wrote:
> On Tue, Feb 14, 2023 at 12:35:04PM +0100, Peter Zijlstra wrote:
> > On Mon, Feb 13, 2023 at 11:43:57PM +0900, Masami Hiramatsu wrote:
> > 
> > > > Fix it by annotating the #BP exception as a non-signal stack frame,
> > > > which tells the ORC unwinder to decrement the instruction pointer before
> > > > looking up the corresponding ORC entry.
> > > 
> > > Just to make it clear, this sounds like a 'hack' use of non-signal stack
> > > frame. If so, can we change the flag name as 'literal' or 'non-literal' etc?
> > > I concern that the 'signal' flag is used differently in the future.
> 
> Agreed, though I'm having trouble coming up with a succinct yet
> scrutable name.  If length wasn't an issue it would be something like
> 
>   "decrement_return_address_when_looking_up_the_next_orc_entry"
> 
> > Oooh, bike-shed :-) Let me suggest trap=1, where a trap is a fault with
> > a different return address, specifically the instruction after the
> > faulting instruction.
> 
> I think "trap" doesn't work because
> 
>  1) It's more than just traps, it's also function calls.  We have
>     traps/calls in one bucket (decrement IP); and everything else
>     (faults, aborts, irqs) in the other (don't decrement IP).
> 
>  2) It's not necessarily all traps which need the flag, just those that
>     affect a previously-but-now-overwritten stack-modifying instruction.
>     So #OF (which we don't use?) and trap-class #DB don't seem to be
>     affected.  In practice maybe this distinction doesn't matter, but
>     for example there's no reason for ORC try to distinguish trap #DB
>     from non-trap #DB at runtime.

Well, I was specifically thinking about #DB, why don't we need to
decrement when we put a hardware breakpoint on a stack modifying op?

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 2/2] x86/entry: Fix unwinding from kprobe on PUSH/POP instruction
  2023-02-15 10:25         ` Peter Zijlstra
@ 2023-02-15 23:16           ` Josh Poimboeuf
  2023-02-16 10:46             ` Peter Zijlstra
  0 siblings, 1 reply; 17+ messages in thread
From: Josh Poimboeuf @ 2023-02-15 23:16 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Masami Hiramatsu, x86, linux-kernel, Chen Zhongjin,
	Naveen N. Rao, Anil S Keshavamurthy, David S. Miller

On Wed, Feb 15, 2023 at 11:25:54AM +0100, Peter Zijlstra wrote:
> On Tue, Feb 14, 2023 at 09:05:52AM -0800, Josh Poimboeuf wrote:
> > On Tue, Feb 14, 2023 at 12:35:04PM +0100, Peter Zijlstra wrote:
> > > On Mon, Feb 13, 2023 at 11:43:57PM +0900, Masami Hiramatsu wrote:
> > > 
> > > > > Fix it by annotating the #BP exception as a non-signal stack frame,
> > > > > which tells the ORC unwinder to decrement the instruction pointer before
> > > > > looking up the corresponding ORC entry.
> > > > 
> > > > Just to make it clear, this sounds like a 'hack' use of non-signal stack
> > > > frame. If so, can we change the flag name as 'literal' or 'non-literal' etc?
> > > > I concern that the 'signal' flag is used differently in the future.
> > 
> > Agreed, though I'm having trouble coming up with a succinct yet
> > scrutable name.  If length wasn't an issue it would be something like
> > 
> >   "decrement_return_address_when_looking_up_the_next_orc_entry"
> > 
> > > Oooh, bike-shed :-) Let me suggest trap=1, where a trap is a fault with
> > > a different return address, specifically the instruction after the
> > > faulting instruction.
> > 
> > I think "trap" doesn't work because
> > 
> >  1) It's more than just traps, it's also function calls.  We have
> >     traps/calls in one bucket (decrement IP); and everything else
> >     (faults, aborts, irqs) in the other (don't decrement IP).
> > 
> >  2) It's not necessarily all traps which need the flag, just those that
> >     affect a previously-but-now-overwritten stack-modifying instruction.
> >     So #OF (which we don't use?) and trap-class #DB don't seem to be
> >     affected.  In practice maybe this distinction doesn't matter, but
> >     for example there's no reason for ORC try to distinguish trap #DB
> >     from non-trap #DB at runtime.
> 
> Well, I was specifically thinking about #DB, why don't we need to
> decrement when we put a hardware breakpoint on a stack modifying op?

I assume you mean the INT1 instruction.  Yeah, maybe we should care
about that.

I'm struggling to come up with any decent ideas about how to implement
that.  Presumably the #DB handler would have to communicate to the
unwinder somehow whether the given frame is a trap.

Alternatively I was thinking the unwinder could read the instruction,
but then it doesn't know whether to read regs->ip or the previous
instruction.

-- 
Josh

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 2/2] x86/entry: Fix unwinding from kprobe on PUSH/POP instruction
  2023-02-15 23:16           ` Josh Poimboeuf
@ 2023-02-16 10:46             ` Peter Zijlstra
  2023-02-16 11:30               ` Peter Zijlstra
  0 siblings, 1 reply; 17+ messages in thread
From: Peter Zijlstra @ 2023-02-16 10:46 UTC (permalink / raw)
  To: Josh Poimboeuf
  Cc: Masami Hiramatsu, x86, linux-kernel, Chen Zhongjin,
	Naveen N. Rao, Anil S Keshavamurthy, David S. Miller

On Wed, Feb 15, 2023 at 03:16:37PM -0800, Josh Poimboeuf wrote:
> On Wed, Feb 15, 2023 at 11:25:54AM +0100, Peter Zijlstra wrote:

> > Well, I was specifically thinking about #DB, why don't we need to
> > decrement when we put a hardware breakpoint on a stack modifying op?
> 
> I assume you mean the INT1 instruction.  Yeah, maybe we should care
> about that.

Nah, I was thinking #DB from DR7, but ...

> I'm struggling to come up with any decent ideas about how to implement
> that.  Presumably the #DB handler would have to communicate to the
> unwinder somehow whether the given frame is a trap.

... I had forgotten that #DB is not unconditionally trap :/ The worst
part seems to be that code breakpoints are faults while data breakpoints
are traps.

And you so don't want to go decode the DR registers in the unwinder,
quality mess this :/

Put a breakpoint on the stack and you've got PUSH doing a trap, put a
breakpoint on the PUSH instruction and you get a fault, and lo and
behold, you get a different unwind :-(

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 2/2] x86/entry: Fix unwinding from kprobe on PUSH/POP instruction
  2023-02-16 10:46             ` Peter Zijlstra
@ 2023-02-16 11:30               ` Peter Zijlstra
  2023-02-16 14:35                 ` Masami Hiramatsu
  0 siblings, 1 reply; 17+ messages in thread
From: Peter Zijlstra @ 2023-02-16 11:30 UTC (permalink / raw)
  To: Josh Poimboeuf
  Cc: Masami Hiramatsu, x86, linux-kernel, Chen Zhongjin,
	Naveen N. Rao, Anil S Keshavamurthy, David S. Miller

On Thu, Feb 16, 2023 at 11:46:30AM +0100, Peter Zijlstra wrote:
> On Wed, Feb 15, 2023 at 03:16:37PM -0800, Josh Poimboeuf wrote:
> > On Wed, Feb 15, 2023 at 11:25:54AM +0100, Peter Zijlstra wrote:
> 
> > > Well, I was specifically thinking about #DB, why don't we need to
> > > decrement when we put a hardware breakpoint on a stack modifying op?
> > 
> > I assume you mean the INT1 instruction.  Yeah, maybe we should care
> > about that.
> 
> Nah, I was thinking #DB from DR7, but ...
> 
> > I'm struggling to come up with any decent ideas about how to implement
> > that.  Presumably the #DB handler would have to communicate to the
> > unwinder somehow whether the given frame is a trap.
> 
> ... I had forgotten that #DB is not unconditionally trap :/ The worst
> part seems to be that code breakpoints are faults while data breakpoints
> are traps.
> 
> And you so don't want to go decode the DR registers in the unwinder,
> quality mess this :/
> 
> Put a breakpoint on the stack and you've got PUSH doing a trap, put a
> breakpoint on the PUSH instruction and you get a fault, and lo and
> behold, you get a different unwind :-(

It could be I'm just confusing things... when #DB traps it is actually
because the instruction is complete, so looking up the ORC based on the
next instruction is correct, while when #DB faults, it is because the
instruction has not yet completed and again ORC lookup on IP just works.

So while determining if #DB is trap or fault is a giant pain in the
arse, it does not actually matter for the unwinder in this case.

And with the INT3 thing the problem is that we've replaced an
instruction that was supposed to do a stack op.



^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 2/2] x86/entry: Fix unwinding from kprobe on PUSH/POP instruction
  2023-02-10 22:42 ` [PATCH 2/2] x86/entry: Fix unwinding from kprobe on PUSH/POP instruction Josh Poimboeuf
  2023-02-11 11:52   ` [tip: objtool/core] " tip-bot2 for Josh Poimboeuf
  2023-02-13 14:43   ` [PATCH 2/2] " Masami Hiramatsu
@ 2023-02-16 11:58   ` Peter Zijlstra
  2023-02-16 16:06     ` Josh Poimboeuf
  2 siblings, 1 reply; 17+ messages in thread
From: Peter Zijlstra @ 2023-02-16 11:58 UTC (permalink / raw)
  To: Josh Poimboeuf
  Cc: x86, linux-kernel, Chen Zhongjin, Naveen N. Rao,
	Anil S Keshavamurthy, David S. Miller, Masami Hiramatsu

On Fri, Feb 10, 2023 at 02:42:02PM -0800, Josh Poimboeuf wrote:

> The problem is that #BP saves the pointer to the instruction immediately
> *after* the INT3, rather than to the INT3 itself.  The instruction
> replaced by the INT3 hasn't actually run, but ORC assumes otherwise and
> expects the wrong stack layout.
> 
> Fix it by annotating the #BP exception as a non-signal stack frame,
> which tells the ORC unwinder to decrement the instruction pointer before
> looking up the corresponding ORC entry.
> 
> Reported-by: Chen Zhongjin <chenzhongjin@huawei.com>
> Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
> ---
>  arch/x86/entry/entry_64.S | 9 ++++++++-
>  1 file changed, 8 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
> index 15739a2c0983..8d21881adf86 100644
> --- a/arch/x86/entry/entry_64.S
> +++ b/arch/x86/entry/entry_64.S
> @@ -385,7 +385,14 @@ SYM_CODE_END(xen_error_entry)
>   */
>  .macro idtentry vector asmsym cfunc has_error_code:req
>  SYM_CODE_START(\asmsym)
> -	UNWIND_HINT_IRET_REGS offset=\has_error_code*8
> +
> +	.if \vector == X86_TRAP_BP
> +		/* #BP advances %rip to the next instruction */
> +		UNWIND_HINT_IRET_REGS offset=\has_error_code*8 signal=0

So the fact that INT3 is trap like is not the problem, the problem is
that we use INT3 to overwrite stack modifying instruction and we should
not assume those instructions have completed when in the #BP handler.

Now, the reason all this actually works is because INT3 itself does not
modify the stack so rewinding on non-overwrite INT3 instructions is
invariant wrt stack state.

> +	.else
> +		UNWIND_HINT_IRET_REGS offset=\has_error_code*8
> +	.endif



^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 2/2] x86/entry: Fix unwinding from kprobe on PUSH/POP instruction
  2023-02-16 11:30               ` Peter Zijlstra
@ 2023-02-16 14:35                 ` Masami Hiramatsu
  2023-02-16 16:58                   ` Josh Poimboeuf
  0 siblings, 1 reply; 17+ messages in thread
From: Masami Hiramatsu @ 2023-02-16 14:35 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Josh Poimboeuf, Masami Hiramatsu, x86, linux-kernel,
	Chen Zhongjin, Naveen N. Rao, Anil S Keshavamurthy,
	David S. Miller

On Thu, 16 Feb 2023 12:30:24 +0100
Peter Zijlstra <peterz@infradead.org> wrote:

> On Thu, Feb 16, 2023 at 11:46:30AM +0100, Peter Zijlstra wrote:
> > On Wed, Feb 15, 2023 at 03:16:37PM -0800, Josh Poimboeuf wrote:
> > > On Wed, Feb 15, 2023 at 11:25:54AM +0100, Peter Zijlstra wrote:
> > 
> > > > Well, I was specifically thinking about #DB, why don't we need to
> > > > decrement when we put a hardware breakpoint on a stack modifying op?
> > > 
> > > I assume you mean the INT1 instruction.  Yeah, maybe we should care
> > > about that.
> > 
> > Nah, I was thinking #DB from DR7, but ...
> > 
> > > I'm struggling to come up with any decent ideas about how to implement
> > > that.  Presumably the #DB handler would have to communicate to the
> > > unwinder somehow whether the given frame is a trap.
> > 
> > ... I had forgotten that #DB is not unconditionally trap :/ The worst
> > part seems to be that code breakpoints are faults while data breakpoints
> > are traps.
> > 
> > And you so don't want to go decode the DR registers in the unwinder,
> > quality mess this :/
> > 
> > Put a breakpoint on the stack and you've got PUSH doing a trap, put a
> > breakpoint on the PUSH instruction and you get a fault, and lo and
> > behold, you get a different unwind :-(
> 
> It could be I'm just confusing things... when #DB traps it is actually
> because the instruction is complete, so looking up the ORC based on the
> next instruction is correct, while when #DB faults, it is because the
> instruction has not yet completed and again ORC lookup on IP just works.
> 
> So while determining if #DB is trap or fault is a giant pain in the
> arse, it does not actually matter for the unwinder in this case.
> 
> And with the INT3 thing the problem is that we've replaced an
> instruction that was supposed to do a stack op.
> 

If the kprobe checks whether the original instruction do a stack op and
if so, setting a flag on current_kprobe will help unwinder finds that case?

Of course all INT3 user may need to do this but it should be limited.

Thank you, 


-- 
Masami Hiramatsu (Google) <mhiramat@kernel.org>

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 2/2] x86/entry: Fix unwinding from kprobe on PUSH/POP instruction
  2023-02-16 11:58   ` Peter Zijlstra
@ 2023-02-16 16:06     ` Josh Poimboeuf
  2023-02-17 12:44       ` Peter Zijlstra
  0 siblings, 1 reply; 17+ messages in thread
From: Josh Poimboeuf @ 2023-02-16 16:06 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: x86, linux-kernel, Chen Zhongjin, Naveen N. Rao,
	Anil S Keshavamurthy, David S. Miller, Masami Hiramatsu

On Thu, Feb 16, 2023 at 12:58:58PM +0100, Peter Zijlstra wrote:
> On Fri, Feb 10, 2023 at 02:42:02PM -0800, Josh Poimboeuf wrote:
> 
> > The problem is that #BP saves the pointer to the instruction immediately
> > *after* the INT3, rather than to the INT3 itself.  The instruction
> > replaced by the INT3 hasn't actually run, but ORC assumes otherwise and
> > expects the wrong stack layout.
> > 
> > Fix it by annotating the #BP exception as a non-signal stack frame,
> > which tells the ORC unwinder to decrement the instruction pointer before
> > looking up the corresponding ORC entry.
> > 
> > Reported-by: Chen Zhongjin <chenzhongjin@huawei.com>
> > Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
> > ---
> >  arch/x86/entry/entry_64.S | 9 ++++++++-
> >  1 file changed, 8 insertions(+), 1 deletion(-)
> > 
> > diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
> > index 15739a2c0983..8d21881adf86 100644
> > --- a/arch/x86/entry/entry_64.S
> > +++ b/arch/x86/entry/entry_64.S
> > @@ -385,7 +385,14 @@ SYM_CODE_END(xen_error_entry)
> >   */
> >  .macro idtentry vector asmsym cfunc has_error_code:req
> >  SYM_CODE_START(\asmsym)
> > -	UNWIND_HINT_IRET_REGS offset=\has_error_code*8
> > +
> > +	.if \vector == X86_TRAP_BP
> > +		/* #BP advances %rip to the next instruction */
> > +		UNWIND_HINT_IRET_REGS offset=\has_error_code*8 signal=0
> 
> So the fact that INT3 is trap like is not the problem, the problem is
> that we use INT3 to overwrite stack modifying instruction and we should
> not assume those instructions have completed when in the #BP handler.
> 
> Now, the reason all this actually works is because INT3 itself does not
> modify the stack so rewinding on non-overwrite INT3 instructions is
> invariant wrt stack state.

Right, that's what my patch description attempting to say.

That's also why I was asking about INT1, which is a trap.  Do we care
about INT1?

-- 
Josh

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 2/2] x86/entry: Fix unwinding from kprobe on PUSH/POP instruction
  2023-02-16 14:35                 ` Masami Hiramatsu
@ 2023-02-16 16:58                   ` Josh Poimboeuf
  0 siblings, 0 replies; 17+ messages in thread
From: Josh Poimboeuf @ 2023-02-16 16:58 UTC (permalink / raw)
  To: Masami Hiramatsu
  Cc: Peter Zijlstra, x86, linux-kernel, Chen Zhongjin, Naveen N. Rao,
	Anil S Keshavamurthy, David S. Miller

On Thu, Feb 16, 2023 at 11:35:19PM +0900, Masami Hiramatsu wrote:
> > It could be I'm just confusing things... when #DB traps it is actually
> > because the instruction is complete, so looking up the ORC based on the
> > next instruction is correct, while when #DB faults, it is because the
> > instruction has not yet completed and again ORC lookup on IP just works.
> > 
> > So while determining if #DB is trap or fault is a giant pain in the
> > arse, it does not actually matter for the unwinder in this case.
> > 
> > And with the INT3 thing the problem is that we've replaced an
> > instruction that was supposed to do a stack op.
> > 
> 
> If the kprobe checks whether the original instruction do a stack op and
> if so, setting a flag on current_kprobe will help unwinder finds that case?
> 
> Of course all INT3 user may need to do this but it should be limited.

No, for INT3, even if the original instruction wasn't a stack op, we can
treat it the same way.  Either way, we know the instruction hasn't
executed so we can still use that address to look up the ORC entry.

-- 
Josh

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 2/2] x86/entry: Fix unwinding from kprobe on PUSH/POP instruction
  2023-02-16 16:06     ` Josh Poimboeuf
@ 2023-02-17 12:44       ` Peter Zijlstra
  0 siblings, 0 replies; 17+ messages in thread
From: Peter Zijlstra @ 2023-02-17 12:44 UTC (permalink / raw)
  To: Josh Poimboeuf
  Cc: x86, linux-kernel, Chen Zhongjin, Naveen N. Rao,
	Anil S Keshavamurthy, David S. Miller, Masami Hiramatsu

On Thu, Feb 16, 2023 at 08:06:19AM -0800, Josh Poimboeuf wrote:
> On Thu, Feb 16, 2023 at 12:58:58PM +0100, Peter Zijlstra wrote:
> > On Fri, Feb 10, 2023 at 02:42:02PM -0800, Josh Poimboeuf wrote:
> > 
> > > The problem is that #BP saves the pointer to the instruction immediately
> > > *after* the INT3, rather than to the INT3 itself.  The instruction
> > > replaced by the INT3 hasn't actually run, but ORC assumes otherwise and
> > > expects the wrong stack layout.
> > > 
> > > Fix it by annotating the #BP exception as a non-signal stack frame,
> > > which tells the ORC unwinder to decrement the instruction pointer before
> > > looking up the corresponding ORC entry.
> > > 
> > > Reported-by: Chen Zhongjin <chenzhongjin@huawei.com>
> > > Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
> > > ---
> > >  arch/x86/entry/entry_64.S | 9 ++++++++-
> > >  1 file changed, 8 insertions(+), 1 deletion(-)
> > > 
> > > diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
> > > index 15739a2c0983..8d21881adf86 100644
> > > --- a/arch/x86/entry/entry_64.S
> > > +++ b/arch/x86/entry/entry_64.S
> > > @@ -385,7 +385,14 @@ SYM_CODE_END(xen_error_entry)
> > >   */
> > >  .macro idtentry vector asmsym cfunc has_error_code:req
> > >  SYM_CODE_START(\asmsym)
> > > -	UNWIND_HINT_IRET_REGS offset=\has_error_code*8
> > > +
> > > +	.if \vector == X86_TRAP_BP
> > > +		/* #BP advances %rip to the next instruction */
> > > +		UNWIND_HINT_IRET_REGS offset=\has_error_code*8 signal=0
> > 
> > So the fact that INT3 is trap like is not the problem, the problem is
> > that we use INT3 to overwrite stack modifying instruction and we should
> > not assume those instructions have completed when in the #BP handler.
> > 
> > Now, the reason all this actually works is because INT3 itself does not
> > modify the stack so rewinding on non-overwrite INT3 instructions is
> > invariant wrt stack state.
> 
> Right, that's what my patch description attempting to say.
> 
> That's also why I was asking about INT1, which is a trap.  Do we care
> about INT1?

We do not care about INT1, #DB is an IST and an allround pain in the
backside, INT3 is where it's at.

^ permalink raw reply	[flat|nested] 17+ messages in thread

end of thread, other threads:[~2023-02-17 12:44 UTC | newest]

Thread overview: 17+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-02-10 22:42 [PATCH 0/2] x86/unwind/orc: Fix unwinding from kprobe on PUSH/POP instruction Josh Poimboeuf
2023-02-10 22:42 ` [PATCH 1/2] x86/unwind/orc: Add 'signal' field to ORC metadata Josh Poimboeuf
2023-02-11 11:52   ` [tip: objtool/core] " tip-bot2 for Josh Poimboeuf
2023-02-10 22:42 ` [PATCH 2/2] x86/entry: Fix unwinding from kprobe on PUSH/POP instruction Josh Poimboeuf
2023-02-11 11:52   ` [tip: objtool/core] " tip-bot2 for Josh Poimboeuf
2023-02-13 14:43   ` [PATCH 2/2] " Masami Hiramatsu
2023-02-14 11:35     ` Peter Zijlstra
2023-02-14 17:05       ` Josh Poimboeuf
2023-02-15 10:25         ` Peter Zijlstra
2023-02-15 23:16           ` Josh Poimboeuf
2023-02-16 10:46             ` Peter Zijlstra
2023-02-16 11:30               ` Peter Zijlstra
2023-02-16 14:35                 ` Masami Hiramatsu
2023-02-16 16:58                   ` Josh Poimboeuf
2023-02-16 11:58   ` Peter Zijlstra
2023-02-16 16:06     ` Josh Poimboeuf
2023-02-17 12:44       ` Peter Zijlstra

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).