linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 1/7] DWARF: add option to preserve unwind info
@ 2017-05-05 12:21 Jiri Slaby
  2017-05-05 12:21 ` [PATCH 2/7] DWARF: EH-frame based stack unwinding Jiri Slaby
                   ` (5 more replies)
  0 siblings, 6 replies; 70+ messages in thread
From: Jiri Slaby @ 2017-05-05 12:21 UTC (permalink / raw)
  To: akpm
  Cc: torvalds, live-patching, linux-kernel, Jiri Slaby,
	Masahiro Yamada, Michal Marek, Thomas Gleixner, Ingo Molnar,
	H. Peter Anvin, x86, linux-kbuild

It may be useful for some tools like gdb or latencytop. So we introduce
CONFIG_UNWIND_INFO. This information is also used by the DWARF unwinder in the
next patches.

In particular, we enable asynchronous unwind tables -- the out-of-band DWARF
info proper. And we also let the linker to generate an index for a fast binary
lookup (eh-frame-hdr).

Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Cc: Masahiro Yamada <yamada.masahiro@socionext.com>
Cc: Michal Marek <mmarek@suse.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: x86@kernel.org
Cc: linux-kbuild@vger.kernel.org
---
 Makefile                      |  5 +++++
 arch/x86/Makefile             |  2 ++
 arch/x86/entry/calling.h      | 13 +++++++++++++
 arch/x86/kernel/vmlinux.lds.S |  2 ++
 lib/Kconfig.debug             | 10 ++++++++++
 5 files changed, 32 insertions(+)

diff --git a/Makefile b/Makefile
index 23cc5d65172b..0eabd1300c69 100644
--- a/Makefile
+++ b/Makefile
@@ -739,6 +739,11 @@ endif
 
 KBUILD_CFLAGS   += $(call cc-option, -fno-var-tracking-assignments)
 
+ifdef CONFIG_UNWIND_INFO
+KBUILD_CFLAGS	+= -fasynchronous-unwind-tables
+LDFLAGS_vmlinux	+= --eh-frame-hdr
+endif
+
 ifdef CONFIG_DEBUG_INFO
 ifdef CONFIG_DEBUG_INFO_SPLIT
 KBUILD_CFLAGS   += $(call cc-option, -gsplit-dwarf, -g)
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 4430dd489620..a4ab13b383cb 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -212,7 +212,9 @@ KBUILD_CFLAGS += -pipe
 # Workaround for a gcc prelease that unfortunately was shipped in a suse release
 KBUILD_CFLAGS += -Wno-sign-compare
 #
+ifneq ($(CONFIG_UNWIND_INFO),y)
 KBUILD_CFLAGS += -fno-asynchronous-unwind-tables
+endif
 
 KBUILD_CFLAGS += $(mflags-y)
 KBUILD_AFLAGS += $(mflags-y)
diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index 05ed3d393da7..fd2334ee62fd 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -48,6 +48,19 @@ For 32-bit we have the following conventions - kernel is built with
 
 */
 
+#if !defined(CONFIG_UNWIND_INFO) && defined(CONFIG_AS_CFI_SECTIONS) \
+	&& defined(__ASSEMBLY__)
+	/*
+	 * Emit CFI data in .debug_frame sections, not .eh_frame sections.
+	 * The latter we currently just discard since we don't do DWARF
+	 * unwinding at runtime.  So only the offline DWARF information is
+	 * useful to anyone.  Note we should not use this directive if this
+	 * file is used in the vDSO assembly, or if vmlinux.lds.S gets
+	 * changed so it doesn't discard .eh_frame.
+	 */
+	.cfi_sections .debug_frame
+#endif
+
 #ifdef CONFIG_X86_64
 
 /*
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index c8a3b61be0aa..7dfe103d4fae 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -345,7 +345,9 @@ SECTIONS
 	/* Sections to be discarded */
 	DISCARDS
 	/DISCARD/ : {
+#ifndef CONFIG_UNWIND_INFO
 		*(.eh_frame)
+#endif
 	}
 }
 
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index e4587ebe52c7..e90125b6498e 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -45,6 +45,16 @@ config MESSAGE_LOGLEVEL_DEFAULT
 	  by default. To change that, use loglevel=<x> in the kernel bootargs,
 	  or pick a different CONSOLE_LOGLEVEL_DEFAULT configuration value.
 
+config UNWIND_INFO
+	bool "Compile the kernel with frame unwind information"
+	depends on !IA64 && !PARISC && !ARM
+	depends on !MODULES || !(MIPS || PPC || SUPERH || V850)
+	help
+	  If you say Y here the resulting kernel image will be slightly larger
+	  but not slower, and it will give very useful debugging information.
+	  If you don't debug the kernel, you can say N, but we may not be able
+	  to solve problems without frame unwind information or frame pointers.
+
 config BOOT_PRINTK_DELAY
 	bool "Delay each boot printk message by N milliseconds"
 	depends on DEBUG_KERNEL && PRINTK && GENERIC_CALIBRATE_DELAY
-- 
2.12.2

^ permalink raw reply related	[flat|nested] 70+ messages in thread

* [PATCH 2/7] DWARF: EH-frame based stack unwinding
  2017-05-05 12:21 [PATCH 1/7] DWARF: add option to preserve unwind info Jiri Slaby
@ 2017-05-05 12:21 ` Jiri Slaby
  2017-05-05 12:21 ` [PATCH 3/7] vmlinux.lds: preserve eh_frame for DWARF unwinder Jiri Slaby
                   ` (4 subsequent siblings)
  5 siblings, 0 replies; 70+ messages in thread
From: Jiri Slaby @ 2017-05-05 12:21 UTC (permalink / raw)
  To: akpm
  Cc: torvalds, live-patching, linux-kernel, Jiri Slaby, Jan Beulich,
	Jeff Mahoney, Thomas Gleixner, Ingo Molnar, H. Peter Anvin, x86

Add core code of the DWARF unwinder. This code is capable of unwinding
the stack using the out-of-band DWARF information. We have been using
this for years in SUSE. Now, we have cleaned it up and propose it for
merging upstream. Not only as a frame pointer unwinding alternative for
debugging purposes, but also as a base for DWARF-based stack checking
in the kernel live patching.

Also, a nice side effect of using the DWARF unwinder is increased
performance. We performed several benchmarks and the gain is speedup
about 10 %. This stems from using the out-of-band data and freeing up
one register (rbp), as we do not need to preserve the frame pointer
anywhere.

It is worth noting, that this is incomplete implementation in terms of
the DWARF standard. But it is complete in terms of what we actually need
for the in-kernel DWARF unwinding.

A known drawback is that the assembler is not unwound properly. We used
to have manual DWARF annotations before 131484c8da97 (x86/debug: Remove
perpetually broken, unmaintainable dwarf annotations), but they are long
gone. There is a current work in progress to annotate assembly
automatically by objtool. In the meantime, if enabled, the frame pointer
can be used for the rest of the stack instead.

Besides that, there is a current work in progress to annotate assembly
automatically by objtool. For this reason, the DWARF unwinder will be
by default off until assembly is handled properly.

This is only core code, the further patches will plug it into the kernel
unwinder's core and to kernel and module inits/exits. Finally, an
appropriate config option is added.

Final note: yes, this is the unwinded which was submitted for upstream a
long time ago and it was rejected. This is a completely redesigned
version, so we send it again for consideration.

Thanks should go also to Andi Kleen, and Jeff Mahoney, who helped to
drag the patch in SUSE's repositories for almost a decade.

Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Cc: Jan Beulich <jbeulich@suse.com>
Cc: Jeff Mahoney <jeffm@suse.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: x86@kernel.org
---
 arch/x86/entry/entry_32.S    |   21 +
 arch/x86/entry/entry_64.S    |   18 +
 arch/x86/include/asm/dwarf.h |  130 +++
 include/linux/dwarf.h        |   61 ++
 kernel/Makefile              |    1 +
 kernel/dwarf.c               | 1802 ++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 2033 insertions(+)
 create mode 100644 arch/x86/include/asm/dwarf.h
 create mode 100644 include/linux/dwarf.h
 create mode 100644 kernel/dwarf.c

diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 50bc26949e9e..1229ced3f584 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -291,6 +291,27 @@ ENTRY(ret_from_fork)
 	jmp	2b
 END(ret_from_fork)
 
+#ifdef CONFIG_DWARF_UNWIND
+ENTRY(arch_dwarf_init_running)
+	movl	4(%esp), %edx
+	movl	(%esp), %ecx
+	movl	%ecx, PT_EIP(%edx)
+	leal	4(%esp), %eax
+	movl	%eax, PT_OLDESP(%edx)
+	movl	%ebx, PT_EBX(%edx)
+	movl	%esi, PT_ESI(%edx)
+	movl	%edi, PT_EDI(%edx)
+	movl	%ebp, PT_EBP(%edx)
+	movl	$__KERNEL_CS, PT_CS(%edx)
+	movl	$__USER_DS, PT_DS(%edx)
+	movl	$__USER_DS, PT_ES(%edx)
+	movl	$__KERNEL_PERCPU, PT_FS(%edx)
+	movl	$__KERNEL_STACK_CANARY, PT_GS(%edx)
+	movl	$__KERNEL_DS, PT_OLDSS(%edx)
+	ret
+ENDPROC(arch_dwarf_init_running)
+#endif
+
 /*
  * Return to user mode is not as complex as all this looks,
  * but we want the default path for a system call return to
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 607d72c4a485..ee90b81065bd 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -902,6 +902,24 @@ ENTRY(do_softirq_own_stack)
 	ret
 END(do_softirq_own_stack)
 
+#ifdef CONFIG_DWARF_UNWIND
+ENTRY(arch_dwarf_init_running)
+	movq	%r15, R15(%rdi)
+	movq	%r14, R14(%rdi)
+	movq	%r13, R13(%rdi)
+	movq	%r12, R12(%rdi)
+	movq	%rbp, RBP(%rdi)
+	movq	%rbx, RBX(%rdi)
+	movq	(%rsp), %r9
+	movq	%r9, RIP(%rdi)
+	leaq	8(%rsp), %r9
+	movq	%r9, RSP(%rdi)
+	movq	$__KERNEL_CS, CS(%rdi)
+	movq	$__KERNEL_DS, SS(%rdi)
+	ret
+ENDPROC(arch_dwarf_init_running)
+#endif
+
 #ifdef CONFIG_XEN
 idtentry xen_hypervisor_callback xen_do_hypervisor_callback has_error_code=0
 
diff --git a/arch/x86/include/asm/dwarf.h b/arch/x86/include/asm/dwarf.h
new file mode 100644
index 000000000000..87f0839a6037
--- /dev/null
+++ b/arch/x86/include/asm/dwarf.h
@@ -0,0 +1,130 @@
+/*
+ * Copyright (C) 2002-2017 Novell, Inc.
+ *	Jan Beulich <jbeulich@novell.com>
+ *	Jiri Slaby <jirislaby@kernel.org>
+ * The code below is released under version 2 of the GNU GPL.
+ */
+
+#ifndef _ASM_X86_DWARF_H
+#define _ASM_X86_DWARF_H
+
+#ifdef CONFIG_DWARF_UNWIND
+
+#include <linux/uaccess.h>
+
+/*
+ * On x86-64, we might need to account for the special exception and interrupt
+ * handling stacks here, since normally
+ *	EXCEPTION_STACK_ORDER < THREAD_ORDER < IRQSTACK_ORDER,
+ * but the construct is needed only for getting across the stack switch to
+ * the interrupt stack - thus considering the IRQ stack itself is unnecessary,
+ * and the overhead of comparing against all exception handling stacks seems
+ * not desirable.
+ */
+#define STACK_LIMIT(ptr)	(((ptr) - 1) & ~(THREAD_SIZE - 1))
+
+#define DW_PC(frame)		((frame)->u.regs.ip)
+#define DW_SP(frame)		((frame)->u.regs.sp)
+#ifdef CONFIG_FRAME_POINTER
+# define DW_FP(frame)		((frame)->u.regs.bp)
+# define FRAME_LINK_OFFSET	0
+# define STACK_BOTTOM(tsk)	STACK_LIMIT((tsk)->thread.sp0)
+# define TSK_STACK_TOP(tsk)	((tsk)->thread.sp0)
+#else
+# define DW_FP(frame)		((void)(frame), 0UL)
+#endif
+
+#ifdef CONFIG_X86_64
+
+#include <asm/vsyscall.h>
+
+#define FRAME_RETADDR_OFFSET 8
+
+#define DW_REGISTER_INFO \
+	PTREGS_INFO(ax), \
+	PTREGS_INFO(dx), \
+	PTREGS_INFO(cx), \
+	PTREGS_INFO(bx), \
+	PTREGS_INFO(si), \
+	PTREGS_INFO(di), \
+	PTREGS_INFO(bp), \
+	PTREGS_INFO(sp), \
+	PTREGS_INFO(r8), \
+	PTREGS_INFO(r9), \
+	PTREGS_INFO(r10), \
+	PTREGS_INFO(r11), \
+	PTREGS_INFO(r12), \
+	PTREGS_INFO(r13), \
+	PTREGS_INFO(r14), \
+	PTREGS_INFO(r15), \
+	PTREGS_INFO(ip)
+
+#else /* X86_32 */
+
+#define FRAME_RETADDR_OFFSET 4
+
+#define DW_REGISTER_INFO \
+	PTREGS_INFO(ax), \
+	PTREGS_INFO(cx), \
+	PTREGS_INFO(dx), \
+	PTREGS_INFO(bx), \
+	PTREGS_INFO(sp), \
+	PTREGS_INFO(bp), \
+	PTREGS_INFO(si), \
+	PTREGS_INFO(di), \
+	PTREGS_INFO(ip)
+
+#endif
+
+#define DW_DEFAULT_RA(raItem, data_align) \
+	((raItem).where == MEMORY && \
+	 !((raItem).value * (data_align) + sizeof(void *)))
+
+static inline void arch_dwarf_init_frame_info(struct unwind_state *info,
+		struct pt_regs *regs)
+{
+#ifdef CONFIG_X86_64
+	info->u.regs = *regs;
+#else
+	if (user_mode(regs))
+		info->u.regs = *regs;
+	else {
+		memcpy(&info->u.regs, regs, offsetof(struct pt_regs, sp));
+		info->u.regs.sp = (unsigned long)&regs->sp;
+		info->u.regs.ss = __KERNEL_DS;
+	}
+#endif
+}
+
+static inline void arch_dwarf_init_blocked(struct unwind_state *info)
+{
+	struct inactive_task_frame *frame = (void *)info->task->thread.sp;
+
+	probe_kernel_address(&frame->ret_addr, info->u.regs.ip);
+	probe_kernel_address(&frame->bp, info->u.regs.bp);
+	info->u.regs.sp = info->task->thread.sp;
+	info->u.regs.cs = __KERNEL_CS;
+	info->u.regs.ss = __KERNEL_DS;
+#ifdef CONFIG_X86_32
+	info->u.regs.ds = __USER_DS;
+	info->u.regs.es = __USER_DS;
+#endif
+}
+
+static inline int arch_dwarf_user_mode(struct unwind_state *info)
+{
+	return user_mode(&info->u.regs)
+#ifdef CONFIG_X86_64
+	       || (long)info->u.regs.ip >= 0
+	       || (info->u.regs.ip >= VSYSCALL_ADDR &&
+		   info->u.regs.ip < VSYSCALL_ADDR + PAGE_SIZE)
+	       || (long)info->u.regs.sp >= 0;
+#else
+	       || info->u.regs.ip < PAGE_OFFSET
+	       || info->u.regs.sp < PAGE_OFFSET;
+#endif
+}
+
+#endif /* CONFIG_DWARF_UNWIND */
+
+#endif /* _ASM_X86_DWARF_H */
diff --git a/include/linux/dwarf.h b/include/linux/dwarf.h
new file mode 100644
index 000000000000..2ec3ea4872f1
--- /dev/null
+++ b/include/linux/dwarf.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright (C) 2002-2017 Novell, Inc.
+ *	Jan Beulich <jbeulich@novell.com>
+ *	Jiri Slaby <jirislaby@kernel.org>
+ * This code is released under version 2 of the GNU GPL.
+ */
+
+#ifndef _LINUX_DWARF_H
+#define _LINUX_DWARF_H
+
+#include <linux/linkage.h>
+
+struct module;
+struct dwarf_table;
+
+#ifdef CONFIG_DWARF_UNWIND
+
+#include <asm/stacktrace.h>
+#include <asm/unwind.h>
+
+#ifndef ARCH_DWARF_SECTION_NAME
+#define ARCH_DWARF_SECTION_NAME ".eh_frame"
+#endif
+
+void dwarf_init(void);
+void dwarf_setup(void);
+
+#ifdef CONFIG_MODULES
+
+struct dwarf_table *dwarf_add_table(struct module *mod, const void *table_start,
+		unsigned long table_size);
+void dwarf_remove_table(struct dwarf_table *table, bool init_only);
+
+#endif /* CONFIG_MODULES */
+
+asmlinkage void arch_dwarf_init_running(struct pt_regs *regs);
+
+int dwarf_unwind(struct unwind_state *state);
+
+#else /* CONFIG_DWARF_UNWIND */
+
+static inline void dwarf_init(void) {}
+static inline void dwarf_setup(void) {}
+
+#ifdef CONFIG_MODULES
+
+static inline struct dwarf_table *dwarf_add_table(struct module *mod,
+		const void *table_start, unsigned long table_size)
+{
+	return NULL;
+}
+
+static inline void dwarf_remove_table(struct dwarf_table *table, bool init_only)
+{
+}
+
+#endif /* CONFIG_MODULES */
+
+#endif /* CONFIG_DWARF_UNWIND */
+
+#endif /* _LINUX_DWARF_H */
diff --git a/kernel/Makefile b/kernel/Makefile
index 72aa080f91f0..8fae6c1f5dd7 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -58,6 +58,7 @@ obj-$(CONFIG_UID16) += uid16.o
 obj-$(CONFIG_MODULES) += module.o
 obj-$(CONFIG_MODULE_SIG) += module_signing.o
 obj-$(CONFIG_KALLSYMS) += kallsyms.o
+obj-$(CONFIG_DWARF_UNWIND) += dwarf.o
 obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
 obj-$(CONFIG_CRASH_CORE) += crash_core.o
 obj-$(CONFIG_KEXEC_CORE) += kexec_core.o
diff --git a/kernel/dwarf.c b/kernel/dwarf.c
new file mode 100644
index 000000000000..ae8b38289034
--- /dev/null
+++ b/kernel/dwarf.c
@@ -0,0 +1,1802 @@
+/*
+ * Copyright (C) 2002-2017 Novell, Inc.
+ *	Jan Beulich <jbeulich@novell.com>
+ *	Jiri Slaby <jirislaby@kernel.org>
+ * This code is released under version 2 of the GNU GPL.
+ *
+ * Simple DWARF unwinder for kernel stacks. This is used for debugging and
+ * error reporting purposes. The kernel does not need full-blown stack
+ * unwinding with all the bells and whistles, so there is not much point in
+ * implementing the full DWARF2 API.
+ */
+
+#include <linux/bootmem.h>
+#include <linux/dwarf.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/sort.h>
+#include <linux/stop_machine.h>
+#include <linux/uaccess.h>
+
+#include <asm/sections.h>
+#include <asm/unaligned.h>
+
+/* this is just for measuring and debugging purposes */
+#if 0
+#define DW_NOINLINE noinline
+#else
+#define DW_NOINLINE
+#endif
+
+#define MAX_STACK_DEPTH 8
+
+#define PTREGS_INFO(f) { \
+		BUILD_BUG_ON_ZERO(offsetof(struct pt_regs, f) \
+				  % FIELD_SIZEOF(struct pt_regs, f)) \
+		+ offsetof(struct pt_regs, f) \
+		  / FIELD_SIZEOF(struct pt_regs, f), \
+		FIELD_SIZEOF(struct pt_regs, f) \
+	}
+#define FRAME_REG(f, r, t) (((t *)&(f)->u.regs_arr)[reg_info[r].offs])
+
+static const struct {
+	unsigned offs:BITS_PER_LONG / 2;
+	unsigned width:BITS_PER_LONG / 2;
+} reg_info[] = {
+	DW_REGISTER_INFO
+};
+
+#undef PTREGS_INFO
+
+#ifndef REG_INVALID
+#define REG_INVALID(r) (reg_info[r].width == 0)
+#endif
+
+#define DW_CFA_nop				0x00
+#define DW_CFA_set_loc				0x01
+#define DW_CFA_advance_loc1			0x02
+#define DW_CFA_advance_loc2			0x03
+#define DW_CFA_advance_loc4			0x04
+#define DW_CFA_offset_extended			0x05
+#define DW_CFA_restore_extended			0x06
+#define DW_CFA_undefined			0x07
+#define DW_CFA_same_value			0x08
+#define DW_CFA_register				0x09
+#define DW_CFA_remember_state			0x0a
+#define DW_CFA_restore_state			0x0b
+#define DW_CFA_def_cfa				0x0c
+#define DW_CFA_def_cfa_register			0x0d
+#define DW_CFA_def_cfa_offset			0x0e
+#define DW_CFA_def_cfa_expression		0x0f
+#define DW_CFA_expression			0x10
+#define DW_CFA_offset_extended_sf		0x11
+#define DW_CFA_def_cfa_sf			0x12
+#define DW_CFA_def_cfa_offset_sf		0x13
+#define DW_CFA_val_offset			0x14
+#define DW_CFA_val_offset_sf			0x15
+#define DW_CFA_val_expression			0x16
+#define DW_CFA_lo_user				0x1c
+#define DW_CFA_GNU_window_save			0x2d
+#define DW_CFA_GNU_args_size			0x2e
+#define DW_CFA_GNU_negative_offset_extended	0x2f
+#define DW_CFA_hi_user				0x3f
+
+#define DW_CFA_advance_loc			0x40
+#define DW_CFA_offset				0x80
+#define DW_CFA_restore				0xc0
+#define DW_CFA_ENCODED_MASK			0xc0
+#define DW_CFA_ENCODED_OP			(~DW_CFA_ENCODED_MASK)
+
+#define DW_EH_PE_FORM		0x07
+#define DW_EH_PE_native		0x00
+#define DW_EH_PE_leb128		0x01
+#define DW_EH_PE_data2		0x02
+#define DW_EH_PE_data4		0x03
+#define DW_EH_PE_data8		0x04
+#define DW_EH_PE_signed		0x08
+#define DW_EH_PE_ADJUST		0x70
+#define DW_EH_PE_abs		0x00
+#define DW_EH_PE_pcrel		0x10
+#define DW_EH_PE_textrel	0x20
+#define DW_EH_PE_datarel	0x30
+#define DW_EH_PE_funcrel	0x40
+#define DW_EH_PE_aligned	0x50
+#define DW_EH_PE_indirect	0x80
+#define DW_EH_PE_omit		0xff
+
+#define DW_OP_addr		0x03
+#define DW_OP_deref		0x06
+#define DW_OP_const1u		0x08
+#define DW_OP_const1s		0x09
+#define DW_OP_const2u		0x0a
+#define DW_OP_const2s		0x0b
+#define DW_OP_const4u		0x0c
+#define DW_OP_const4s		0x0d
+#define DW_OP_const8u		0x0e
+#define DW_OP_const8s		0x0f
+#define DW_OP_constu		0x10
+#define DW_OP_consts		0x11
+#define DW_OP_dup		0x12
+#define DW_OP_drop		0x13
+#define DW_OP_over		0x14
+#define DW_OP_pick		0x15
+#define DW_OP_swap		0x16
+#define DW_OP_rot		0x17
+#define DW_OP_xderef		0x18
+#define DW_OP_abs		0x19
+#define DW_OP_and		0x1a
+#define DW_OP_div		0x1b
+#define DW_OP_minus		0x1c
+#define DW_OP_mod		0x1d
+#define DW_OP_mul		0x1e
+#define DW_OP_neg		0x1f
+#define DW_OP_not		0x20
+#define DW_OP_or		0x21
+#define DW_OP_plus		0x22
+#define DW_OP_plus_uconst	0x23
+#define DW_OP_shl		0x24
+#define DW_OP_shr		0x25
+#define DW_OP_shra		0x26
+#define DW_OP_xor		0x27
+#define DW_OP_bra		0x28
+#define DW_OP_eq		0x29
+#define DW_OP_ge		0x2a
+#define DW_OP_gt		0x2b
+#define DW_OP_le		0x2c
+#define DW_OP_lt		0x2d
+#define DW_OP_ne		0x2e
+#define DW_OP_skip		0x2f
+#define DW_OP_lit0		0x30
+#define DW_OP_lit31		0x4f
+#define DW_OP_reg0		0x50
+#define DW_OP_reg31		0x6f
+#define DW_OP_breg0		0x70
+#define DW_OP_breg31		0x8f
+#define DW_OP_regx		0x90
+#define DW_OP_fbreg		0x91
+#define DW_OP_bregx		0x92
+#define DW_OP_piece		0x93
+#define DW_OP_deref_size	0x94
+#define DW_OP_xderef_size	0x95
+#define DW_OP_nop		0x96
+
+typedef unsigned long uleb128_t;
+typedef signed long sleb128_t;
+#define sleb128abs __builtin_labs
+
+static struct dwarf_table {
+	struct {
+		unsigned long pc;
+		unsigned long range;
+	} core, init;
+	const void *address;
+	unsigned long size;
+	const unsigned char *header;
+	unsigned long hdrsz;
+	struct dwarf_table *link;
+	const char *name;
+} root_table;
+
+union dw_uni_ptr {
+	const u8 *pu8;
+	const s8 *ps8;
+	const u16 *pu16;
+	const s16 *ps16;
+	const u32 *pu32;
+	const s32 *ps32;
+	const u64 *pu64;
+	const s64 *ps64;
+	const unsigned long *pul;
+};
+
+struct dw_unwind_state {
+	uleb128_t loc, orig_loc;
+
+	struct {
+		const u8 *ins_start, *ins_end;
+
+		uleb128_t code_align, ret_addr_reg;
+		sleb128_t data_align;
+		bool is_signal_frame, has_aug_data;
+		u8 ptr_type;
+	} cie;
+
+	struct {
+		unsigned long pc_begin, pc_end;
+		const u8 *ins_start, *ins_end;
+	} fde;
+
+	struct cfa {
+		uleb128_t reg, offs, elen;
+		const u8 *expr;
+	} cfa;
+	struct {
+		enum dw_item_location {
+			NOWHERE,
+			MEMORY,
+			REGISTER,
+			VALUE,
+		} where;
+		uleb128_t value;
+	} regs[ARRAY_SIZE(reg_info)];
+	const u8 *label;
+	const u8 *stack[MAX_STACK_DEPTH];
+	u8 stackDepth;
+};
+
+static const struct cfa badCFA = {
+	.reg = ARRAY_SIZE(reg_info),
+	.offs = 1,
+};
+
+static unsigned int unwind_debug;
+static int __init unwind_debug_setup(char *s)
+{
+	if (kstrtouint(s, 0, &unwind_debug))
+		return 1;
+	return 0;
+}
+early_param("unwind_debug", unwind_debug_setup);
+
+#define dprintk(lvl, fmt, args...) do {				\
+	if (unwind_debug >= lvl)					\
+		printk(KERN_DEBUG "unwind: " fmt "\n", ##args);	\
+} while (0)
+
+static struct dwarf_table *dw_find_table(unsigned long pc)
+{
+	struct dwarf_table *table;
+
+	for (table = &root_table; table; table = table->link)
+		if ((pc >= table->core.pc &&
+				 pc < table->core.pc + table->core.range) ||
+				(pc >= table->init.pc &&
+				 pc < table->init.pc + table->init.range))
+			break;
+
+	return table;
+}
+
+static unsigned long dw_read_pointer(const u8 **pLoc,
+				  const void *end,
+				  int ptrType,
+				  unsigned long data_base);
+
+static void __init_or_module
+init_unwind_table(struct dwarf_table *table, const char *name,
+		  const void *core_start, unsigned long core_size,
+		  const void *init_start, unsigned long init_size,
+		  const void *table_start, unsigned long table_size,
+		  const u8 *header_start, unsigned long header_size)
+{
+	const u8 *ptr = header_start + 4;
+	const u8 *end = header_start + header_size;
+
+	table->core.pc = (unsigned long)core_start;
+	table->core.range = core_size;
+	table->init.pc = (unsigned long)init_start;
+	table->init.range = init_size;
+	table->address = table_start;
+	table->size = table_size;
+
+	/* See if the linker provided table looks valid. */
+	if (header_size <= 4 || header_start[0] != 1 ||
+			/* ptr to eh_frame */
+			(void *)dw_read_pointer(&ptr, end, header_start[1], 0) !=
+				table_start ||
+			/* fde count */
+			!dw_read_pointer(&ptr, end, header_start[2], 0) ||
+			/* table[0] -- initial location */
+			!dw_read_pointer(&ptr, end, header_start[3],
+				(unsigned long)header_start) ||
+			/* table[0] -- address */
+			!dw_read_pointer(&ptr, end, header_start[3],
+				(unsigned long)header_start))
+		header_start = NULL;
+
+	table->hdrsz = header_size;
+	smp_wmb(); /* counterpart in dwarf_unwind */
+	table->header = header_start;
+	table->link = NULL;
+	table->name = name;
+}
+
+/*
+ * dwarf_init -- initialize unwind support
+ */
+void __init dwarf_init(void)
+{
+	extern const char __start_unwind[], __end_unwind[];
+	extern const char __start_unwind_hdr[], __end_unwind_hdr[];
+#ifdef CONFIG_DEBUG_RODATA
+	unsigned long text_len = _etext - _text;
+	const void *init_start = __init_begin;
+	unsigned long init_len = __init_end - __init_begin;
+#else
+	unsigned long text_len = _end - _text;
+	const void *init_start = NULL;
+	unsigned long init_len = 0;
+#endif
+	init_unwind_table(&root_table, "kernel", _text, text_len,
+			init_start, init_len,
+			__start_unwind, __end_unwind - __start_unwind,
+			__start_unwind_hdr,
+			__end_unwind_hdr - __start_unwind_hdr);
+}
+
+static const u32 *cie_for_fde(const u32 *fde, const struct dwarf_table *table,
+		unsigned long *start_loc, struct dw_unwind_state *state);
+
+struct eh_frame_hdr_table_entry {
+	unsigned long start, fde;
+};
+
+static int __init_or_module
+cmp_eh_frame_hdr_table_entries(const void *p1, const void *p2)
+{
+	const struct eh_frame_hdr_table_entry *e1 = p1;
+	const struct eh_frame_hdr_table_entry *e2 = p2;
+
+	return (e1->start > e2->start) - (e1->start < e2->start);
+}
+
+static void __init_or_module
+swap_eh_frame_hdr_table_entries(void *p1, void *p2, int size)
+{
+	struct eh_frame_hdr_table_entry *e1 = p1;
+	struct eh_frame_hdr_table_entry *e2 = p2;
+	unsigned long v;
+
+	v = e1->start;
+	e1->start = e2->start;
+	e2->start = v;
+	v = e1->fde;
+	e1->fde = e2->fde;
+	e2->fde = v;
+}
+
+static void __init_or_module
+setup_unwind_table(struct dwarf_table *table,
+		   void *(*alloc)(size_t, gfp_t))
+{
+	unsigned long tableSize = table->size, hdrSize, start_loc;
+	unsigned int n;
+	const u32 *fde;
+	struct {
+		u8 version;
+		u8 eh_frame_ptr_enc;
+		u8 fde_count_enc;
+		u8 table_enc;
+		unsigned long eh_frame_ptr;
+		unsigned int fde_count;
+		struct eh_frame_hdr_table_entry table[];
+	} __attribute__((__packed__)) *header;
+
+	if (table->header)
+		return;
+
+	if (table->hdrsz)
+		pr_warn(".eh_frame_hdr for '%s' present but unusable\n",
+			table->name);
+
+	if (tableSize & (sizeof(*fde) - 1))
+		return;
+
+	for (fde = table->address, n = 0;
+			tableSize > sizeof(*fde) &&
+			tableSize - sizeof(*fde) >= *fde;
+			tableSize -= sizeof(*fde) + *fde,
+			fde += 1 + *fde / sizeof(*fde)) {
+		const u32 *cie = cie_for_fde(fde, table, &start_loc, NULL);
+
+		if (IS_ERR(cie)) {
+			if (PTR_ERR(cie) == -ENODEV)
+				continue;
+			return;
+		}
+
+		if (!start_loc)
+			return;
+		++n;
+	}
+
+	if (tableSize || n < 32)
+		return;
+
+	hdrSize = sizeof(*header) + n * sizeof(*header->table);
+	dprintk(2, "Binary lookup table size for %s: %lu bytes", table->name,
+			hdrSize);
+	header = alloc(hdrSize, GFP_KERNEL);
+	if (!header)
+		return;
+	header->version	= 1;
+	header->eh_frame_ptr_enc = DW_EH_PE_abs | DW_EH_PE_native;
+	header->fde_count_enc = DW_EH_PE_abs | DW_EH_PE_data4;
+	header->table_enc = DW_EH_PE_abs | DW_EH_PE_native;
+	put_unaligned((unsigned long)table->address, &header->eh_frame_ptr);
+	BUILD_BUG_ON(offsetof(typeof(*header), fde_count) %
+			__alignof(typeof(header->fde_count)));
+	header->fde_count = n;
+
+	BUILD_BUG_ON(offsetof(typeof(*header), table) %
+			__alignof(typeof(*header->table)));
+	for (fde = table->address, tableSize = table->size, n = 0;
+			tableSize;
+			tableSize -= sizeof(*fde) + *fde,
+			fde += 1 + *fde / sizeof(*fde)) {
+		const u32 *cie = cie_for_fde(fde, table, &start_loc, NULL);
+		if (PTR_ERR(cie) == -ENODEV)
+			continue;
+
+		header->table[n].start = start_loc;
+		header->table[n].fde = (unsigned long)fde;
+		++n;
+	}
+	WARN_ON(n != header->fde_count);
+
+	sort(header->table, n, sizeof(*header->table),
+			cmp_eh_frame_hdr_table_entries,
+			swap_eh_frame_hdr_table_entries);
+
+	table->hdrsz = hdrSize;
+	smp_wmb(); /* counterpart in dwarf_unwind */
+	table->header = (const void *)header;
+}
+
+static void *__init balloc(size_t sz, gfp_t unused)
+{
+	return __alloc_bootmem_nopanic(sz, sizeof(unsigned long),
+			__pa(MAX_DMA_ADDRESS));
+}
+
+void __init dwarf_setup(void)
+{
+	setup_unwind_table(&root_table, balloc);
+}
+
+#ifdef CONFIG_MODULES
+
+static struct dwarf_table *last_table = &root_table;
+
+/*
+ * dwarf_add_table -- insert a dwarf table for a module
+ *
+ * Must be called with module_mutex held.
+ */
+struct dwarf_table *dwarf_add_table(struct module *module,
+		const void *table_start, unsigned long table_size)
+{
+	struct dwarf_table *table;
+#ifdef CONFIG_DEBUG_SET_MODULE_RONX
+	unsigned long core_size = module->core_layout.text_size;
+	unsigned long init_size = module->init_layout.text_size;
+#else
+	unsigned long core_size = module->core_layout.size;
+	unsigned long init_size = module->init_layout.size;
+#endif
+
+	if (table_size <= 0)
+		return NULL;
+
+	table = kmalloc(sizeof(*table), GFP_KERNEL);
+	if (!table)
+		return NULL;
+
+	init_unwind_table(table, module->name,
+			module->core_layout.base, core_size,
+			module->init_layout.base, init_size,
+			table_start, table_size, NULL, 0);
+	setup_unwind_table(table, kmalloc);
+
+	last_table->link = table;
+	last_table = table;
+
+	return table;
+}
+
+struct unlink_table_info {
+	struct dwarf_table *table;
+	bool init_only;
+};
+
+static int unlink_table(void *arg)
+{
+	struct unlink_table_info *info = arg;
+	struct dwarf_table *table = info->table, *prev;
+
+	for (prev = &root_table; prev->link && prev->link != table;
+			prev = prev->link)
+		;
+
+	if (prev->link) {
+		if (info->init_only) {
+			table->init.pc = 0;
+			table->init.range = 0;
+			info->table = NULL;
+		} else {
+			prev->link = table->link;
+			if (!prev->link)
+				last_table = prev;
+		}
+	} else
+		info->table = NULL;
+
+	return 0;
+}
+
+/*
+ * dwarf_remove_table -- remove a dwarf table for a module
+ *
+ * Must be called with module_mutex held.
+ */
+void dwarf_remove_table(struct dwarf_table *table, bool init_only)
+{
+	struct unlink_table_info info;
+
+	if (!table || table == &root_table)
+		return;
+
+	if (init_only && table == last_table) {
+		table->init.pc = 0;
+		table->init.range = 0;
+		return;
+	}
+
+	info.table = table;
+	info.init_only = init_only;
+	stop_machine(unlink_table, &info, NULL);
+
+	if (info.table) {
+		kfree(table->header);
+		kfree(table);
+	}
+}
+
+#endif /* CONFIG_MODULES */
+
+static uleb128_t get_uleb128(const u8 **pcur, const u8 *end)
+{
+	const u8 *cur = *pcur;
+	uleb128_t value;
+	unsigned int shift;
+
+	for (shift = 0, value = 0; cur < end; shift += 7) {
+		if (shift + 7 > 8 * sizeof(value)
+		    && (*cur & 0x7fU) >= (1U << (8 * sizeof(value) - shift))) {
+			cur = end + 1;
+			break;
+		}
+		value |= (uleb128_t)(*cur & 0x7f) << shift;
+		if (!(*cur++ & 0x80))
+			break;
+	}
+	*pcur = cur;
+
+	return value;
+}
+
+static sleb128_t get_sleb128(const u8 **pcur, const u8 *end)
+{
+	const u8 *cur = *pcur;
+	sleb128_t value;
+	unsigned int shift;
+
+	for (shift = 0, value = 0; cur < end; shift += 7) {
+		if (shift + 7 > 8 * sizeof(value)
+		    && (*cur & 0x7fU) >= (1U << (8 * sizeof(value) - shift))) {
+			cur = end + 1;
+			break;
+		}
+		value |= (sleb128_t)(*cur & 0x7f) << shift;
+		if (!(*cur & 0x80)) {
+			value |= -(*cur++ & 0x40) << shift;
+			break;
+		}
+	}
+	*pcur = cur;
+
+	return value;
+}
+
+static int dw_parse_cie(const u32 *cie, struct dw_unwind_state *state)
+{
+	const u8 *ptr = (const u8 *)(cie + 2); /* skip len+ID */
+	const u8 *end = (const u8 *)(cie + 1) + *cie, *aug_data_end;
+	const char *aug = NULL;
+	unsigned int version = *ptr++;
+	int ptr_type = DW_EH_PE_native | DW_EH_PE_abs;
+	uleb128_t code_align, ret_addr_reg, len;
+	sleb128_t data_align;
+	bool signal_frame = false;
+
+	if (version != 1)
+		return -1;
+
+	if (*ptr) {
+		aug = (const char *)ptr;
+
+		/* check if augmentation string is nul-terminated */
+		ptr = memchr(aug, 0, end - ptr);
+		if (ptr == NULL)
+			return -1;
+
+		/* check if augmentation size is first (and thus present) */
+		if (*aug != 'z')
+			return -1;
+	} else if (!state)
+		goto finish;
+
+	ptr++; /* skip terminator */
+	code_align = get_uleb128(&ptr, end);
+	data_align = get_sleb128(&ptr, end);
+	ret_addr_reg = version <= 1 ? *ptr++ : get_uleb128(&ptr, end);
+
+	if (aug) {
+		len = get_uleb128(&ptr, end); /* augmentation length */
+		if (ptr + len < ptr || ptr + len > end)
+			return -1;
+
+		aug_data_end = ptr + len;
+		while (*++aug) {
+			if (ptr >= aug_data_end)
+				return -1;
+			switch (*aug) {
+			case 'L':
+				++ptr;
+				break;
+			case 'P': {
+				int ptr_type = *ptr++;
+
+				if (!dw_read_pointer(&ptr, aug_data_end,
+							ptr_type, 0) ||
+						ptr > aug_data_end)
+					return -1;
+				break;
+			}
+			case 'R':
+				ptr_type = *ptr;
+				break;
+			case 'S':
+				signal_frame = true;
+				break;
+			default:
+				pr_info("%s: unhandled AUG %c\n", __func__,
+						*aug);
+				return -1;
+			}
+		}
+	} else
+		aug_data_end = ptr;
+
+	if (state) {
+		state->cie.ins_start = aug_data_end;
+		state->cie.ins_end = end;
+
+		state->cie.code_align = code_align;
+		state->cie.data_align = data_align;
+		state->cie.ret_addr_reg = ret_addr_reg;
+		state->cie.is_signal_frame = signal_frame;
+		state->cie.has_aug_data = aug;
+	}
+finish:
+	return ptr_type;
+}
+
+static const u32 *cie_for_fde(const u32 *fde, const struct dwarf_table *table,
+		unsigned long *start_loc, struct dw_unwind_state *state)
+{
+	const u32 *cie;
+	int ptr_type;
+
+	/* no len or odd len? */
+	if (!*fde || (*fde & (sizeof(*fde) - 1)))
+		return ERR_PTR(-EBADF);
+
+	/* a CIE? */
+	if (!fde[1])
+		return ERR_PTR(-ENODEV);
+
+	/* invalid pointer? */
+	if (fde[1] & (sizeof(*fde) - 1))
+		return ERR_PTR(-EINVAL);
+
+	/* out of range? */
+	if (fde[1] > (unsigned long)(fde + 1) - (unsigned long)table->address)
+		return ERR_PTR(-EINVAL);
+
+	cie = fde + 1 - fde[1] / sizeof(*fde);
+
+	/* invalid CIE? */
+	if (*cie <= sizeof(*cie) + 4 || *cie >= fde[1] - sizeof(*fde) ||
+			(*cie & (sizeof(*cie) - 1)) || cie[1])
+		return ERR_PTR(-EINVAL);
+
+	ptr_type = dw_parse_cie(cie, state);
+	if (ptr_type < 0)
+		return ERR_PTR(-EINVAL);
+
+	if (start_loc) {
+		const u8 *ptr = (const u8 *)(fde + 2);
+		const u8 *end = (const u8 *)(fde + 1) + *fde;
+		/* PC begin */
+		*start_loc = dw_read_pointer(&ptr, end, ptr_type, 0);
+
+		/* force absolute type for PC range */
+		if (!(ptr_type & DW_EH_PE_indirect))
+			ptr_type &= DW_EH_PE_FORM | DW_EH_PE_signed;
+
+		if (state) {
+			/* PC range */
+			state->fde.pc_end = *start_loc +
+				dw_read_pointer(&ptr, end, ptr_type, 0);
+
+			/* skip augmentation */
+			if (state->cie.has_aug_data) {
+				uleb128_t aug_len = get_uleb128(&ptr, end);
+
+				ptr += aug_len;
+				if (ptr > end)
+					return ERR_PTR(-EINVAL);
+			}
+
+			state->fde.ins_start = ptr;
+			state->fde.ins_end = end;
+			state->cie.ptr_type = ptr_type;
+		}
+	}
+
+	return cie;
+}
+
+static unsigned long dw_read_pointer(const u8 **pLoc, const void *end,
+		int ptrType, unsigned long data_base)
+{
+	unsigned long value = 0;
+	union dw_uni_ptr ptr;
+
+	if (ptrType < 0 || ptrType == DW_EH_PE_omit) {
+		dprintk(1, "Invalid pointer encoding %02X (%p,%p).", ptrType,
+				*pLoc, end);
+		return 0;
+	}
+	ptr.pu8 = *pLoc;
+	switch (ptrType & DW_EH_PE_FORM) {
+	case DW_EH_PE_data2:
+		if (end < (const void *)(ptr.pu16 + 1)) {
+			dprintk(1, "Data16 overrun (%p,%p).", ptr.pu8, end);
+			return 0;
+		}
+		if (ptrType & DW_EH_PE_signed)
+			value = get_unaligned(ptr.ps16++);
+		else
+			value = get_unaligned(ptr.pu16++);
+		break;
+	case DW_EH_PE_data4:
+#ifdef CONFIG_64BIT
+		if (end < (const void *)(ptr.pu32 + 1)) {
+			dprintk(1, "Data32 overrun (%p,%p).", ptr.pu8, end);
+			return 0;
+		}
+		if (ptrType & DW_EH_PE_signed)
+			value = get_unaligned(ptr.ps32++);
+		else
+			value = get_unaligned(ptr.pu32++);
+		break;
+	case DW_EH_PE_data8:
+		BUILD_BUG_ON(sizeof(u64) != sizeof(value));
+#else
+		BUILD_BUG_ON(sizeof(u32) != sizeof(value));
+#endif
+		/* fallthrough */
+	case DW_EH_PE_native:
+		if (end < (const void *)(ptr.pul + 1)) {
+			dprintk(1, "DataUL overrun (%p,%p).", ptr.pu8, end);
+			return 0;
+		}
+		value = get_unaligned(ptr.pul++);
+		break;
+	case DW_EH_PE_leb128:
+		BUILD_BUG_ON(sizeof(uleb128_t) > sizeof(value));
+		if (ptrType & DW_EH_PE_signed)
+			value = get_sleb128(&ptr.pu8, end);
+		else
+			value = get_uleb128(&ptr.pu8, end);
+		if ((const void *)ptr.pu8 > end) {
+			dprintk(1, "DataLEB overrun (%p,%p).", ptr.pu8, end);
+			return 0;
+		}
+		break;
+	default:
+		dprintk(2, "Cannot decode pointer type %02X (%p,%p).",
+				ptrType, ptr.pu8, end);
+		return 0;
+	}
+	switch (ptrType & DW_EH_PE_ADJUST) {
+	case DW_EH_PE_abs:
+		break;
+	case DW_EH_PE_pcrel:
+		value += (unsigned long)*pLoc;
+		break;
+	case DW_EH_PE_textrel:
+		dprintk(2, "Text-relative encoding %02X (%p,%p), but zero text base.",
+				ptrType, *pLoc, end);
+		return 0;
+	case DW_EH_PE_datarel:
+		if (likely(data_base)) {
+			value += data_base;
+			break;
+		}
+		dprintk(2, "Data-relative encoding %02X (%p,%p), but zero data base.",
+				ptrType, *pLoc, end);
+		return 0;
+	default:
+		dprintk(2, "Cannot adjust pointer type %02X (%p,%p).",
+				ptrType, *pLoc, end);
+		return 0;
+	}
+	if ((ptrType & DW_EH_PE_indirect)
+	    && probe_kernel_address((unsigned long *)value, value)) {
+		dprintk(1, "Cannot read indirect value %lx (%p,%p).",
+				value, *pLoc, end);
+		return 0;
+	}
+	*pLoc = ptr.pu8;
+
+	return value;
+}
+
+static bool dw_advance_loc(unsigned long delta, struct dw_unwind_state *state)
+{
+	state->loc += delta * state->cie.code_align;
+
+	return delta > 0;
+}
+
+static void dw_set_rule(uleb128_t reg, enum dw_item_location where,
+		uleb128_t value, struct dw_unwind_state *state)
+{
+	if (reg < ARRAY_SIZE(state->regs)) {
+		state->regs[reg].where = where;
+		state->regs[reg].value = value;
+	}
+}
+
+static bool dw_process_CFI_encoded(struct dw_unwind_state *state,
+		union dw_uni_ptr *ptr, const u8 *end, u8 inst)
+{
+	u8 op = inst & DW_CFA_ENCODED_OP;
+
+	switch (inst & DW_CFA_ENCODED_MASK) {
+	case DW_CFA_advance_loc:
+		return dw_advance_loc(op, state);
+	case DW_CFA_offset:
+		dw_set_rule(op, MEMORY, get_uleb128(&ptr->pu8, end), state);
+		break;
+	case DW_CFA_restore:
+		dw_set_rule(op, NOWHERE, 0, state);
+		break;
+	}
+
+	return true;
+}
+
+static bool dw_process_CFIs(struct dw_unwind_state *state, unsigned long pc);
+
+static int dw_process_CFI_normal(struct dw_unwind_state *state,
+		union dw_uni_ptr *ptr, const u8 *end, u8 inst)
+{
+	uleb128_t value;
+	bool ok = true;
+
+	switch (inst) {
+	case DW_CFA_nop:
+		break;
+	case DW_CFA_set_loc:
+		state->loc = dw_read_pointer(&ptr->pu8, end,
+				state->cie.ptr_type, 0);
+		if (state->loc == 0)
+			return false;
+		break;
+	case DW_CFA_advance_loc1:
+		return ptr->pu8 < end && dw_advance_loc(*ptr->pu8++, state);
+	case DW_CFA_advance_loc2:
+		return ptr->pu8 <= end + 2 &&
+			dw_advance_loc(*ptr->pu16++, state);
+	case DW_CFA_advance_loc4:
+		return ptr->pu8 <= end + 4 &&
+			dw_advance_loc(*ptr->pu32++, state);
+	case DW_CFA_offset_extended:
+		value = get_uleb128(&ptr->pu8, end);
+		dw_set_rule(value, MEMORY, get_uleb128(&ptr->pu8, end), state);
+		break;
+	case DW_CFA_val_offset:
+		value = get_uleb128(&ptr->pu8, end);
+		dw_set_rule(value, VALUE, get_uleb128(&ptr->pu8, end), state);
+		break;
+	case DW_CFA_offset_extended_sf:
+		value = get_uleb128(&ptr->pu8, end);
+		dw_set_rule(value, MEMORY, get_sleb128(&ptr->pu8, end), state);
+		break;
+	case DW_CFA_val_offset_sf:
+		value = get_uleb128(&ptr->pu8, end);
+		dw_set_rule(value, VALUE, get_sleb128(&ptr->pu8, end), state);
+		break;
+	/*todo case DW_CFA_expression: */
+	/*todo case DW_CFA_val_expression: */
+	case DW_CFA_restore_extended:
+	case DW_CFA_undefined:
+	case DW_CFA_same_value:
+		dw_set_rule(get_uleb128(&ptr->pu8, end), NOWHERE, 0, state);
+		break;
+	case DW_CFA_register:
+		value = get_uleb128(&ptr->pu8, end);
+		dw_set_rule(value, REGISTER, get_uleb128(&ptr->pu8, end),
+				state);
+		break;
+	case DW_CFA_remember_state:
+		if (ptr->pu8 == state->label) {
+			state->label = NULL;
+			/* required state */
+			return 2;
+		}
+		if (state->stackDepth >= MAX_STACK_DEPTH) {
+			dprintk(1, "State stack overflow (%p, %p).", ptr->pu8,
+					end);
+			return false;
+		}
+		state->stack[state->stackDepth++] = ptr->pu8;
+		break;
+	case DW_CFA_restore_state: {
+		const uleb128_t loc = state->loc;
+		const u8 *label = state->label;
+
+		if (!state->stackDepth) {
+			dprintk(1, "State stack underflow (%p, %p).", ptr->pu8,
+					end);
+			return false;
+		}
+
+		state->label = state->stack[state->stackDepth - 1];
+		memcpy(&state->cfa, &badCFA, sizeof(badCFA));
+		memset(state->regs, 0, sizeof(state->regs));
+		state->stackDepth = 0;
+		ok = dw_process_CFIs(state, 0);
+		state->loc = loc;
+		state->label = label;
+
+		break;
+	}
+	case DW_CFA_def_cfa:
+		state->cfa.reg = get_uleb128(&ptr->pu8, end);
+		state->cfa.elen = 0;
+		/* fallthrough */
+	case DW_CFA_def_cfa_offset:
+		state->cfa.offs = get_uleb128(&ptr->pu8, end);
+		break;
+	case DW_CFA_def_cfa_sf:
+		state->cfa.reg = get_uleb128(&ptr->pu8, end);
+		state->cfa.elen = 0;
+		/* fallthrough */
+	case DW_CFA_def_cfa_offset_sf:
+		state->cfa.offs = get_sleb128(&ptr->pu8, end) *
+			state->cie.data_align;
+		break;
+	case DW_CFA_def_cfa_register:
+		state->cfa.reg = get_uleb128(&ptr->pu8, end);
+		state->cfa.elen = 0;
+		break;
+	case DW_CFA_def_cfa_expression:
+		state->cfa.elen = get_uleb128(&ptr->pu8, end);
+		if (!state->cfa.elen) {
+			dprintk(1, "Zero-length CFA expression.");
+			return false;
+		}
+		state->cfa.expr = ptr->pu8;
+		ptr->pu8 += state->cfa.elen;
+		break;
+	case DW_CFA_GNU_args_size:
+		get_uleb128(&ptr->pu8, end);
+		break;
+	case DW_CFA_GNU_negative_offset_extended:
+		value = get_uleb128(&ptr->pu8, end);
+		dw_set_rule(value, MEMORY,
+				(uleb128_t)0 - get_uleb128(&ptr->pu8, end),
+				state);
+		break;
+	case DW_CFA_GNU_window_save:
+	default:
+		dprintk(1, "Unrecognized CFI op %02X (%p, %p).", ptr->pu8[-1],
+				ptr->pu8 - 1, end);
+		return false;
+	}
+
+	return ok;
+}
+
+static bool dw_process_CFI(struct dw_unwind_state *state, const u8 *start,
+		const u8 *end, unsigned long pc)
+{
+	union dw_uni_ptr ptr;
+	int res;
+
+	dprintk(4, "processing CFI: %p-%p", start, end);
+
+	for (ptr.pu8 = start; ptr.pu8 < end; ) {
+		u8 inst = *ptr.pu8++;
+
+		if (inst & DW_CFA_ENCODED_MASK)
+			res = dw_process_CFI_encoded(state, &ptr, end, inst);
+		else {
+			res = dw_process_CFI_normal(state, &ptr, end, inst);
+			if (res == 2)
+				return true;
+		}
+
+		if (!res)
+			return false;
+
+		if (ptr.pu8 > end) {
+			dprintk(1, "Data overrun (%p, %p).", ptr.pu8, end);
+			return false;
+		}
+		if (pc != 0 && pc < state->loc)
+			return true;
+	}
+
+	if (ptr.pu8 < end) {
+		dprintk(1, "Data underrun (%p, %p).", ptr.pu8, end);
+		return false;
+	}
+
+	if (ptr.pu8 != end) {
+		pr_err("%s: Data mismatch (%p, %p).\n", __func__, ptr.pu8, end);
+		return false;
+	}
+
+	return (pc == 0 ||
+		(/*
+		  * TODO While in theory this should apply, gcc in practice
+		  * omits everything past the function prolog, and hence the
+		  * location never reaches the end of the function.
+		  *    pc < state->loc &&
+		  */
+		 state->label == NULL));
+}
+
+static bool dw_process_CFIs(struct dw_unwind_state *state, unsigned long pc)
+{
+	bool ok = true;
+
+	state->loc = state->orig_loc;
+
+	ok = dw_process_CFI(state, state->cie.ins_start, state->cie.ins_end, 0);
+	if (pc == 0 && state->label == NULL)
+		return ok;
+	if (!ok)
+		return false;
+
+	return dw_process_CFI(state, state->fde.ins_start, state->fde.ins_end,
+			pc);
+}
+
+static unsigned long dw_evaluate_cfa(const u8 *expr, const u8 *end,
+			      const struct unwind_state *frame)
+{
+	union dw_uni_ptr ptr = { expr };
+	unsigned long stack[8], val1, val2;
+	unsigned int stidx = 0;
+	int ret;
+
+#define PUSH(v) do {							\
+	unsigned long v__ = (v);					\
+									\
+	if (stidx >= ARRAY_SIZE(stack))					\
+		return 0;						\
+	stack[stidx++] = v__;						\
+} while (0)
+
+#define POP() ({							\
+	if (!stidx)							\
+		return 0;						\
+	stack[--stidx];							\
+})
+
+	while (ptr.pu8 < end) {
+		switch (*ptr.pu8++) {
+		/*todo case DW_OP_addr: */
+		case DW_OP_deref:
+			val1 = POP();
+			ret = probe_kernel_address((unsigned long *)val1, val2);
+			if (ret) {
+				dprintk(1, "Cannot de-reference %lx (%p,%p).",
+						val1, ptr.pu8 - 1, end);
+				return 0;
+			}
+			PUSH(val2);
+			break;
+		/*todo? case DW_OP_xderef: */
+		/*todo case DW_OP_deref_size: */
+		/*todo? case DW_OP_xderef_size: */
+		case DW_OP_const1u:
+			if (ptr.pu8 < end)
+				PUSH(*ptr.pu8);
+			++ptr.pu8;
+			break;
+		case DW_OP_const1s:
+			if (ptr.pu8 < end)
+				PUSH(*ptr.ps8);
+			++ptr.ps8;
+			break;
+		case DW_OP_const2u:
+			if (ptr.pu8 + 1 < end)
+				PUSH(*ptr.pu16);
+			++ptr.pu16;
+			break;
+		case DW_OP_const2s:
+			if (ptr.pu8 + 1 < end)
+				PUSH(*ptr.ps16);
+			++ptr.ps16;
+			break;
+		case DW_OP_const4u:
+			if (ptr.pu8 + 3 < end)
+				PUSH(*ptr.pu32);
+			++ptr.pu32;
+			break;
+		case DW_OP_const4s:
+			if (ptr.pu8 + 3 < end)
+				PUSH(*ptr.ps32);
+			++ptr.ps32;
+			break;
+		case DW_OP_const8u:
+			if (ptr.pu8 + 7 < end)
+				PUSH(*ptr.pu64);
+			++ptr.pu64;
+			break;
+		case DW_OP_const8s:
+			if (ptr.pu8 + 7 < end)
+				PUSH(*ptr.ps64);
+			++ptr.ps64;
+			break;
+		case DW_OP_constu:
+			PUSH(get_uleb128(&ptr.pu8, end));
+			break;
+		case DW_OP_consts:
+			PUSH(get_sleb128(&ptr.pu8, end));
+			break;
+		case DW_OP_dup:
+			if (!stidx)
+				return 0;
+			PUSH(stack[stidx - 1]);
+			break;
+		case DW_OP_drop:
+			(void)POP();
+			break;
+		case DW_OP_over:
+			if (stidx <= 1)
+				return 0;
+			PUSH(stack[stidx - 2]);
+			break;
+		case DW_OP_pick:
+			if (ptr.pu8 < end) {
+				if (stidx <= *ptr.pu8)
+					return 0;
+				PUSH(stack[stidx - *ptr.pu8 - 1]);
+			}
+			++ptr.pu8;
+			break;
+		case DW_OP_swap:
+			if (stidx <= 1)
+				return 0;
+			val1 = stack[stidx - 1];
+			stack[stidx - 1] = stack[stidx - 2];
+			stack[stidx - 2] = val1;
+			break;
+		case DW_OP_rot:
+			if (stidx <= 2)
+				return 0;
+			val1 = stack[stidx - 1];
+			stack[stidx - 1] = stack[stidx - 2];
+			stack[stidx - 2] = stack[stidx - 3];
+			stack[stidx - 3] = val1;
+			break;
+		case DW_OP_abs:
+			PUSH(__builtin_labs(POP()));
+			break;
+		case DW_OP_and:
+			val1 = POP();
+			val2 = POP();
+			PUSH(val2 & val1);
+			break;
+		case DW_OP_div:
+			val1 = POP();
+			if (!val1)
+				return 0;
+			val2 = POP();
+			PUSH(val2 / val1);
+			break;
+		case DW_OP_minus:
+			val1 = POP();
+			val2 = POP();
+			PUSH(val2 - val1);
+			break;
+		case DW_OP_mod:
+			val1 = POP();
+			if (!val1)
+				return 0;
+			val2 = POP();
+			PUSH(val2 % val1);
+			break;
+		case DW_OP_mul:
+			val1 = POP();
+			val2 = POP();
+			PUSH(val2 * val1);
+			break;
+		case DW_OP_neg:
+			PUSH(-(long)POP());
+			break;
+		case DW_OP_not:
+			PUSH(~POP());
+			break;
+		case DW_OP_or:
+			val1 = POP();
+			val2 = POP();
+			PUSH(val2 | val1);
+			break;
+		case DW_OP_plus:
+			val1 = POP();
+			val2 = POP();
+			PUSH(val2 + val1);
+			break;
+		case DW_OP_plus_uconst:
+			PUSH(POP() + get_uleb128(&ptr.pu8, end));
+			break;
+		case DW_OP_shl:
+			val1 = POP();
+			val2 = POP();
+			PUSH(val1 < BITS_PER_LONG ? val2 << val1 : 0);
+			break;
+		case DW_OP_shr:
+			val1 = POP();
+			val2 = POP();
+			PUSH(val1 < BITS_PER_LONG ? val2 >> val1 : 0);
+			break;
+		case DW_OP_shra:
+			val1 = POP();
+			val2 = POP();
+			if (val1 < BITS_PER_LONG)
+				PUSH((long)val2 >> val1);
+			else
+				PUSH((long)val2 < 0 ? -1 : 0);
+			break;
+		case DW_OP_xor:
+			val1 = POP();
+			val2 = POP();
+			PUSH(val2 ^ val1);
+			break;
+		case DW_OP_bra:
+			if (!POP()) {
+				++ptr.ps16;
+				break;
+			}
+			/* fallthrough */
+		case DW_OP_skip:
+			if (ptr.pu8 + 1 < end) {
+				ptr.pu8 += *ptr.ps16;
+				if (ptr.pu8 < expr)
+					return 0;
+			} else
+				++ptr.ps16;
+			break;
+		case DW_OP_eq:
+			val1 = POP();
+			val2 = POP();
+			PUSH(val2 == val1);
+			break;
+		case DW_OP_ne:
+			val1 = POP();
+			val2 = POP();
+			PUSH(val2 != val1);
+			break;
+		case DW_OP_lt:
+			val1 = POP();
+			val2 = POP();
+			PUSH(val2 < val1);
+			break;
+		case DW_OP_le:
+			val1 = POP();
+			val2 = POP();
+			PUSH(val2 <= val1);
+			break;
+		case DW_OP_ge:
+			val1 = POP();
+			val2 = POP();
+			PUSH(val2 >= val1);
+			break;
+		case DW_OP_gt:
+			val1 = POP();
+			val2 = POP();
+			PUSH(val2 > val1);
+			break;
+		case DW_OP_lit0 ... DW_OP_lit31:
+			PUSH(ptr.pu8[-1] - DW_OP_lit0);
+			break;
+		case DW_OP_breg0 ... DW_OP_breg31:
+			val1 = ptr.pu8[-1] - DW_OP_breg0;
+			if (0)
+				/* fallthrough */
+		case DW_OP_bregx:
+			val1 = get_uleb128(&ptr.pu8, end);
+			if (val1 >= ARRAY_SIZE(reg_info) ||
+					reg_info[val1].width != sizeof(unsigned long))
+				return 0;
+
+			PUSH(FRAME_REG(frame, val1, unsigned long)
+					+ get_sleb128(&ptr.pu8, end));
+			break;
+		/*todo? case DW_OP_fbreg: */
+		/*todo? case DW_OP_piece: */
+		case DW_OP_nop:
+			break;
+		default:
+			dprintk(1, "Unsupported expression op %02x (%p,%p).",
+					ptr.pu8[-1], ptr.pu8 - 1, end);
+			return 0;
+		}
+	}
+	if (ptr.pu8 > end)
+		return 0;
+	val1 = POP();
+#undef POP
+#undef PUSH
+	return val1;
+}
+
+static unsigned long dw_get_cfa(const struct unwind_state *frame,
+		const struct dw_unwind_state *state)
+{
+	if (state->cfa.elen) {
+		unsigned long cfa = dw_evaluate_cfa(state->cfa.expr,
+				state->cfa.expr + state->cfa.elen, frame);
+		if (!cfa)
+			dprintk(1, "Bad CFA expr (%p:%lu).", state->cfa.expr,
+					state->cfa.elen);
+
+		return cfa;
+	}
+
+	if (state->cfa.reg >= ARRAY_SIZE(reg_info) ||
+			reg_info[state->cfa.reg].width !=
+					sizeof(unsigned long) ||
+			FRAME_REG(frame, state->cfa.reg, unsigned long) %
+					sizeof(unsigned long) ||
+			state->cfa.offs % sizeof(unsigned long)) {
+		dprintk(1, "Bad CFA (%lu, %lx).", state->cfa.reg,
+				state->cfa.offs);
+		return 0;
+	}
+
+	return FRAME_REG(frame, state->cfa.reg, unsigned long) +
+		state->cfa.offs;
+}
+
+static int dw_update_frame(struct unwind_state *frame,
+		struct dw_unwind_state *state)
+{
+	unsigned long start_loc, end_loc, sp, pc, cfa;
+	unsigned int i;
+	int ret;
+
+	cfa = dw_get_cfa(frame, state);
+	if (!cfa)
+		return -EIO;
+
+#ifndef CONFIG_AS_CFI_SIGNAL_FRAME
+	if (frame->call_frame &&
+			!UNW_DEFAULT_RA(state->regs[state->cie.ret_addr_reg],
+					state->cie.data_align))
+		frame->call_frame = 0;
+#endif
+	start_loc = min_t(unsigned long, DW_SP(frame), cfa);
+	end_loc = max_t(unsigned long, DW_SP(frame), cfa);
+	if (STACK_LIMIT(start_loc) != STACK_LIMIT(end_loc)) {
+		start_loc = min(STACK_LIMIT(cfa), cfa);
+		end_loc = max(STACK_LIMIT(cfa), cfa);
+	}
+
+#ifdef CONFIG_64BIT
+# define CASES CASE(8); CASE(16); CASE(32); CASE(64)
+#else
+# define CASES CASE(8); CASE(16); CASE(32)
+#endif
+	pc = DW_PC(frame);
+	sp = DW_SP(frame);
+
+	/* 1st step: store the computed register values */
+	for (i = 0; i < ARRAY_SIZE(state->regs); ++i) {
+		if (REG_INVALID(i)) {
+			if (state->regs[i].where == NOWHERE)
+				continue;
+			dprintk(1, "Cannot restore register %u (%d).", i,
+					state->regs[i].where);
+			return -EIO;
+		}
+
+		if (state->regs[i].where != REGISTER)
+			continue;
+
+		if (state->regs[i].value >= ARRAY_SIZE(reg_info) ||
+				REG_INVALID(state->regs[i].value) ||
+				reg_info[i].width > reg_info[state->regs[i].value].width) {
+			dprintk(1, "Cannot restore register %u from register %lu.",
+					i, state->regs[i].value);
+			return -EIO;
+		}
+
+		switch (reg_info[state->regs[i].value].width) {
+#define CASE(n)	case sizeof(u##n):					\
+			state->regs[i].value = FRAME_REG(frame, 	\
+					state->regs[i].value, const u##n); \
+			break
+		CASES;
+#undef CASE
+		default:
+			dprintk(1, "Unsupported register size %u (%lu).",
+					reg_info[state->regs[i].value].width,
+					state->regs[i].value);
+			return -EIO;
+		}
+	}
+
+	/* 2nd step: update the state, incl. registers */
+	for (i = 0; i < ARRAY_SIZE(state->regs); ++i) {
+		if (REG_INVALID(i))
+			continue;
+		switch (state->regs[i].where) {
+		case NOWHERE:
+			if (reg_info[i].width != sizeof(DW_SP(frame))
+			    || &FRAME_REG(frame, i, __typeof__(DW_SP(frame)))
+			       != &DW_SP(frame))
+				continue;
+			DW_SP(frame) = cfa;
+			break;
+		case REGISTER:
+			switch (reg_info[i].width) {
+#define CASE(n)		case sizeof(u##n):				\
+				FRAME_REG(frame, i, u##n) =		\
+					state->regs[i].value;		\
+				break
+			CASES;
+#undef CASE
+			default:
+				dprintk(1, "Unsupported register size %u (%u).",
+						reg_info[i].width, i);
+				return -EIO;
+			}
+			break;
+		case VALUE:
+			if (reg_info[i].width != sizeof(unsigned long)) {
+				dprintk(1, "Unsupported value size %u (%u).",
+						reg_info[i].width, i);
+				return -EIO;
+			}
+			FRAME_REG(frame, i, unsigned long) = cfa +
+				state->regs[i].value * state->cie.data_align;
+			break;
+		case MEMORY: {
+			unsigned long off = state->regs[i].value *
+				state->cie.data_align;
+			unsigned long addr = cfa + off;
+
+			if (off % sizeof(unsigned long) ||
+					addr < start_loc ||
+					addr + sizeof(unsigned long) < addr ||
+					addr + sizeof(unsigned long) > end_loc) {
+				dprintk(1, "Bad memory location %lx (%lx).",
+					addr, state->regs[i].value);
+				return -EIO;
+			}
+
+			switch (reg_info[i].width) {
+#define CASE(n)		case sizeof(u##n):				\
+				ret = probe_kernel_address((void *)addr, \
+						FRAME_REG(frame, i, u##n)); \
+				if (ret)				\
+					return -EFAULT;			\
+				break
+			CASES;
+#undef CASE
+			default:
+				dprintk(1, "Unsupported memory size %u (%u).",
+					reg_info[i].width, i);
+				return -EIO;
+			}
+			break;
+		}
+		}
+	}
+
+	if (DW_PC(frame) % state->cie.code_align ||
+			DW_SP(frame) % sleb128abs(state->cie.data_align)) {
+		dprintk(1, "Output pointer(s) misaligned (%lx,%lx).",
+				DW_PC(frame), DW_SP(frame));
+		return -EIO;
+	}
+
+	if (pc == DW_PC(frame) && sp == DW_SP(frame)) {
+		dprintk(1, "No progress (%lx,%lx).", pc, sp);
+		return -EIO;
+	}
+
+	return 0;
+#undef CASES
+}
+
+static DW_NOINLINE int
+dw_lookup_fde_binary(struct unwind_state *frame,
+		const struct dwarf_table *table, unsigned long pc,
+		struct dw_unwind_state *state)
+{
+	unsigned long tableSize, pc_begin = 0;
+	unsigned int fde_cnt;
+	const u32 *fde = NULL, *cie;
+	const u8 *hdr = table->header;
+	const u8 *ptr, *end;
+
+	/* version */
+	if (hdr[0] != 1)
+		goto failed;
+
+	/* table_enc */
+	switch (hdr[3] & DW_EH_PE_FORM) {
+	case DW_EH_PE_native:
+		tableSize = sizeof(unsigned long);
+		break;
+	case DW_EH_PE_data2:
+		tableSize = 2;
+		break;
+	case DW_EH_PE_data4:
+		tableSize = 4;
+		break;
+	case DW_EH_PE_data8:
+		tableSize = 8;
+		break;
+	default:
+		goto failed;
+	}
+
+	ptr = hdr + 4;
+	end = hdr + table->hdrsz;
+
+	/* eh_frame_ptr */
+	if (dw_read_pointer(&ptr, end, hdr[1], 0) !=
+			(unsigned long)table->address)
+		goto failed;
+
+	/* fde_cnt */
+	fde_cnt = dw_read_pointer(&ptr, end, hdr[2], 0);
+	if (fde_cnt == 0 || fde_cnt != (end - ptr) / (2 * tableSize))
+		goto failed;
+
+	if ((end - ptr) % (2 * tableSize))
+		goto failed;
+
+	/* binary search in fde_cnt entries */
+	do {
+		const u8 *cur = ptr + (fde_cnt / 2) * (2 * tableSize);
+
+		/* location */
+		pc_begin = dw_read_pointer(&cur, cur + tableSize, hdr[3],
+				(unsigned long)hdr);
+		if (pc < pc_begin)
+			fde_cnt /= 2;
+		else {
+			ptr = cur - tableSize;
+			fde_cnt = (fde_cnt + 1) / 2;
+		}
+	} while (pc_begin && fde_cnt > 1);
+
+	if (pc_begin == 0 || fde_cnt != 1)
+		goto failed;
+
+	/* read the bisected one -- location... */
+	pc_begin = dw_read_pointer(&ptr, ptr + tableSize, hdr[3],
+			(unsigned long)hdr);
+	if (pc_begin == 0 || pc < pc_begin)
+		goto failed;
+
+	/* ...and its corresponding fde */
+	fde = (void *)dw_read_pointer(&ptr, ptr + tableSize, hdr[3],
+			(unsigned long)hdr);
+	if (!fde)
+		goto failed;
+
+	dprintk(4, "have fde: %zx at %p", (void *)fde - table->address, fde);
+
+	/* now find a cie for the fde */
+
+	cie = cie_for_fde(fde, table, &state->fde.pc_begin, state);
+	if (IS_ERR(cie) || state->fde.pc_begin != pc_begin ||
+			pc >= state->fde.pc_end)
+		goto discard;
+
+	dprintk(4, "still have fde at %p", fde);
+
+	return 0;
+discard:
+	dprintk(1, "Binary lookup result for %lx discarded.", pc);
+	return -EIO;
+failed:
+	if (hdr)
+		dprintk(3, "Binary lookup for %lx failed.", pc);
+	return -EIO;
+}
+
+static DW_NOINLINE int
+dw_lookup_fde_linear(struct unwind_state *frame,
+		const struct dwarf_table *table, unsigned long pc,
+		struct dw_unwind_state *state)
+{
+	unsigned long tableSize = table->size;
+	const u32 *fde;
+
+	for (fde = table->address; tableSize > sizeof(*fde) &&
+			tableSize - sizeof(*fde) >= *fde;
+			tableSize -= sizeof(*fde) + *fde,
+			fde += 1 + *fde / sizeof(*fde)) {
+		const u32 *cie = cie_for_fde(fde, table, &state->fde.pc_begin,
+				state);
+		if (IS_ERR(cie)) {
+			if (PTR_ERR(cie) == -EBADF)
+				break;
+			continue;
+		}
+
+		if (!state->fde.pc_begin)
+			continue;
+
+		if (pc >= state->fde.pc_begin && pc < state->fde.pc_end)
+			return 0;
+	}
+
+	dprintk(3, "Linear lookup for %lx failed.", pc);
+
+	return -ENOENT;
+}
+
+
+#ifdef CONFIG_FRAME_POINTER
+static DW_NOINLINE int dw_fp_fallback(struct unwind_state *frame)
+{
+	unsigned long top = TSK_STACK_TOP(frame->task);
+	unsigned long bottom = STACK_BOTTOM(frame->task);
+	unsigned long fp = DW_FP(frame);
+	unsigned long sp = DW_SP(frame);
+	unsigned long link;
+	int ret;
+
+	if ((sp | fp) & (sizeof(unsigned long) - 1))
+		return -EPERM;
+
+#if FRAME_RETADDR_OFFSET < 0
+	if (!(sp < top && fp <= sp && bottom < fp))
+#else
+	if (!(sp > top && fp >= sp && bottom > fp))
+#endif
+		return -ENXIO;
+
+	ret = probe_kernel_address((unsigned long *)(fp + FRAME_LINK_OFFSET),
+			link);
+	if (ret)
+		return -ENXIO;
+
+#if FRAME_RETADDR_OFFSET < 0
+	if (!(link > bottom && link < fp))
+#else
+	if (!(link < bottom && link > fp))
+#endif
+		return -ENXIO;
+
+	if (link & (sizeof(link) - 1))
+		return -ENXIO;
+
+	fp += FRAME_RETADDR_OFFSET;
+	ret = probe_kernel_address((unsigned long *)fp, DW_PC(frame));
+	if (ret)
+		return -ENXIO;
+	pr_info("%s: read fp=%lx into PC=%lx\n", __func__,
+			fp, DW_PC(frame));
+
+	/* Ok, we can use it */
+#if FRAME_RETADDR_OFFSET < 0
+	DW_SP(frame) = fp - sizeof(DW_PC(frame));
+#else
+	DW_SP(frame) = fp + sizeof(DW_PC(frame));
+#endif
+	DW_FP(frame) = link;
+	return 0;
+}
+#else
+static inline int dw_fp_fallback(struct unwind_state *frame)
+{
+	return -ENXIO;
+}
+#endif
+
+/*
+ * dwarf_unwind -- unwind to previous to frame
+ *
+ * Returns 0 if successful, negative number in case of error.
+ */
+int dwarf_unwind(struct unwind_state *frame)
+{
+	struct dw_unwind_state state = {
+		.cie.ptr_type = -1,
+		.cfa = badCFA,
+	};
+	const struct dwarf_table *table;
+	unsigned long pc = DW_PC(frame) - frame->call_frame;
+	int ret;
+
+	if (DW_PC(frame) == 0)
+		return -EINVAL;
+
+	table = dw_find_table(pc);
+	if (table == NULL || (table->size & (sizeof(u32) - 1)))
+		goto fallback;
+
+	/* counterpart in setup_unwind_table, to protect table->header */
+	smp_rmb();
+
+	dprintk(4, "have table: %lx-%lx (%s) hdr=%p addr=%p for %pS",
+			table->core.pc,
+			table->core.pc + table->core.range - 1,
+			table->name,
+			table->header, table->address, (void *)pc);
+
+	if (table->header)
+		ret = dw_lookup_fde_binary(frame, table, pc, &state);
+	else
+		ret = dw_lookup_fde_linear(frame, table, pc, &state);
+
+	if (ret)
+		goto fallback;
+
+	if (state.cie.code_align == 0 || state.cie.data_align == 0)
+		goto fallback;
+
+	if (DW_PC(frame) % state.cie.code_align ||
+			DW_SP(frame) % sleb128abs(state.cie.data_align)) {
+		dprintk(1, "Input pointer(s) misaligned (%lx, %lx).",
+				DW_PC(frame), DW_SP(frame));
+		return -EPERM;
+	}
+
+	if (state.cie.ret_addr_reg >= ARRAY_SIZE(reg_info) ||
+			REG_INVALID(state.cie.ret_addr_reg) ||
+			reg_info[state.cie.ret_addr_reg].width !=
+				sizeof(unsigned long)) {
+		dprintk(1, "CIE validation failed (ret addr reg).");
+		goto fallback;
+	}
+
+	frame->call_frame = !state.cie.is_signal_frame;
+	state.orig_loc = state.fde.pc_begin;
+
+	/* process instructions */
+	ret = dw_process_CFIs(&state, pc);
+	if (!ret || state.loc > state.fde.pc_end ||
+			state.regs[state.cie.ret_addr_reg].where == NOWHERE) {
+		dprintk(1, "Unusable unwind info (%p, %p).",
+				state.fde.ins_start, state.fde.ins_end);
+		return -EIO;
+	}
+
+	ret = dw_update_frame(frame, &state);
+	if (ret)
+		return ret;
+
+	return 0;
+fallback:
+	return dw_fp_fallback(frame);
+}
+EXPORT_SYMBOL_GPL(dwarf_unwind);
-- 
2.12.2

^ permalink raw reply related	[flat|nested] 70+ messages in thread

* [PATCH 3/7] vmlinux.lds: preserve eh_frame for DWARF unwinder
  2017-05-05 12:21 [PATCH 1/7] DWARF: add option to preserve unwind info Jiri Slaby
  2017-05-05 12:21 ` [PATCH 2/7] DWARF: EH-frame based stack unwinding Jiri Slaby
@ 2017-05-05 12:21 ` Jiri Slaby
  2017-05-05 12:21 ` [PATCH 4/7] DWARF: initialize structures for kernel and modules Jiri Slaby
                   ` (3 subsequent siblings)
  5 siblings, 0 replies; 70+ messages in thread
From: Jiri Slaby @ 2017-05-05 12:21 UTC (permalink / raw)
  To: akpm
  Cc: torvalds, live-patching, linux-kernel, Jiri Slaby,
	Thomas Gleixner, Ingo Molnar, H. Peter Anvin, x86, Arnd Bergmann,
	linux-arch

If we choose to use the DWARF unwinder, we need to preserve the eh_frame
section. So define EH_FRAME macro to contain it in vmlinux.lds.h. We
also define symbols __start_unwind_hdr/__end_unwind_hdr and
__start_unwind/__end_unwind to use in the code.

The config option proper is added at the end of the series.

Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: x86@kernel.org
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: linux-arch@vger.kernel.org
---
 arch/x86/tools/relocs.c           |  1 +
 include/asm-generic/vmlinux.lds.h | 22 ++++++++++++++++++++++
 2 files changed, 23 insertions(+)

diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c
index 73eb7fd4aec4..47b7bdd6f535 100644
--- a/arch/x86/tools/relocs.c
+++ b/arch/x86/tools/relocs.c
@@ -58,6 +58,7 @@ static const char * const sym_regex_kernel[S_NSYMTYPES] = {
 	"(__iommu_table|__apicdrivers|__smp_locks)(|_end)|"
 	"__(start|end)_pci_.*|"
 	"__(start|end)_builtin_fw|"
+	"__(start|end)_unwind(|_hdr)|"
 	"__(start|stop)___ksymtab(|_gpl|_unused|_unused_gpl|_gpl_future)|"
 	"__(start|stop)___kcrctab(|_gpl|_unused|_unused_gpl|_gpl_future)|"
 	"__(start|stop)___param|"
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 314a0b9219c6..349034b41e60 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -406,6 +406,8 @@
 		MEM_KEEP(exit.rodata)					\
 	}								\
 									\
+	EH_FRAME							\
+									\
 	/* Built-in module parameters. */				\
 	__param : AT(ADDR(__param) - LOAD_OFFSET) {			\
 		VMLINUX_SYMBOL(__start___param) = .;			\
@@ -882,3 +884,23 @@
 	BSS(bss_align)							\
 	. = ALIGN(stop_align);						\
 	VMLINUX_SYMBOL(__bss_stop) = .;
+
+#ifdef CONFIG_DWARF_UNWIND
+#define EH_FRAME							\
+		/* Unwind data binary search table */			\
+		. = ALIGN(8);						\
+		.eh_frame_hdr : AT(ADDR(.eh_frame_hdr) - LOAD_OFFSET) {	\
+			VMLINUX_SYMBOL(__start_unwind_hdr) = .;		\
+			*(.eh_frame_hdr)				\
+			VMLINUX_SYMBOL(__end_unwind_hdr) = .;		\
+		}							\
+		/* Unwind data */					\
+		. = ALIGN(8);						\
+		.eh_frame : AT(ADDR(.eh_frame) - LOAD_OFFSET) {		\
+			VMLINUX_SYMBOL(__start_unwind) = .;		\
+			*(.eh_frame)					\
+			VMLINUX_SYMBOL(__end_unwind) = .;		\
+		}
+#else
+#define EH_FRAME
+#endif
-- 
2.12.2

^ permalink raw reply related	[flat|nested] 70+ messages in thread

* [PATCH 4/7] DWARF: initialize structures for kernel and modules
  2017-05-05 12:21 [PATCH 1/7] DWARF: add option to preserve unwind info Jiri Slaby
  2017-05-05 12:21 ` [PATCH 2/7] DWARF: EH-frame based stack unwinding Jiri Slaby
  2017-05-05 12:21 ` [PATCH 3/7] vmlinux.lds: preserve eh_frame for DWARF unwinder Jiri Slaby
@ 2017-05-05 12:21 ` Jiri Slaby
  2017-05-05 12:21 ` [PATCH 5/7] unwinder: show_stack, check also ret_addr_p's contents Jiri Slaby
                   ` (2 subsequent siblings)
  5 siblings, 0 replies; 70+ messages in thread
From: Jiri Slaby @ 2017-05-05 12:21 UTC (permalink / raw)
  To: akpm
  Cc: torvalds, live-patching, linux-kernel, Jiri Slaby, Jessica Yu,
	Rusty Russell

When booting, initialize eh_frame for use in the DWARF unwinder. Do the
same for every coming module's info.

And destroy the information when a module is going away.

Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Cc: Jessica Yu <jeyu@redhat.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
---
 include/linux/module.h |  4 ++++
 init/main.c            |  3 +++
 kernel/module.c        | 32 +++++++++++++++++++++++++++++++-
 3 files changed, 38 insertions(+), 1 deletion(-)

diff --git a/include/linux/module.h b/include/linux/module.h
index 21f56393602f..21f7a23f8004 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -257,6 +257,7 @@ extern const typeof(name) __mod_##type##__##name##_device_table		\
  * files require multiple MODULE_FIRMWARE() specifiers */
 #define MODULE_FIRMWARE(_firmware) MODULE_INFO(firmware, _firmware)
 
+struct dwarf_table;
 struct notifier_block;
 
 #ifdef CONFIG_MODULES
@@ -393,6 +394,9 @@ struct module {
 	struct module_layout core_layout __module_layout_align;
 	struct module_layout init_layout;
 
+	/* The handle returned from dwarf_add_table. */
+	struct dwarf_table *dwarf_info;
+
 	/* Arch-specific module values */
 	struct mod_arch_specific arch;
 
diff --git a/init/main.c b/init/main.c
index cc48053bb39f..5acedeae355d 100644
--- a/init/main.c
+++ b/init/main.c
@@ -22,6 +22,7 @@
 #include <linux/string.h>
 #include <linux/ctype.h>
 #include <linux/delay.h>
+#include <linux/dwarf.h>
 #include <linux/ioport.h>
 #include <linux/init.h>
 #include <linux/initrd.h>
@@ -490,6 +491,7 @@ asmlinkage __visible void __init start_kernel(void)
 	char *command_line;
 	char *after_dashes;
 
+	dwarf_init();
 	set_task_stack_end_magic(&init_task);
 	smp_setup_processor_id();
 	debug_objects_early_init();
@@ -514,6 +516,7 @@ asmlinkage __visible void __init start_kernel(void)
 	setup_arch(&command_line);
 	mm_init_cpumask(&init_mm);
 	setup_command_line(command_line);
+	dwarf_setup();
 	setup_nr_cpu_ids();
 	setup_per_cpu_areas();
 	boot_cpu_state_init();
diff --git a/kernel/module.c b/kernel/module.c
index 4a3665f8f837..3571328e41fe 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -38,6 +38,7 @@
 #include <linux/capability.h>
 #include <linux/cpu.h>
 #include <linux/moduleparam.h>
+#include <linux/dwarf.h>
 #include <linux/errno.h>
 #include <linux/err.h>
 #include <linux/vermagic.h>
@@ -314,7 +315,7 @@ struct load_info {
 	unsigned long mod_kallsyms_init_off;
 #endif
 	struct {
-		unsigned int sym, str, mod, vers, info, pcpu;
+		unsigned int sym, str, mod, vers, info, pcpu, dwarf;
 	} index;
 };
 
@@ -753,6 +754,27 @@ bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr)
 
 #endif /* CONFIG_SMP */
 
+static unsigned int find_dwarf(struct load_info *info)
+{
+	unsigned int section = 0;
+#ifdef ARCH_DWARF_SECTION_NAME
+	section = find_sec(info, ARCH_DWARF_SECTION_NAME);
+	if (section)
+		info->sechdrs[section].sh_flags |= SHF_ALLOC;
+#endif
+	return section;
+}
+
+static void add_dwarf_table(struct module *mod, struct load_info *info)
+{
+	int index = info->index.dwarf;
+
+	/* Size of section 0 is 0, so this is ok if there is no dwarf info. */
+	mod->dwarf_info = dwarf_add_table(mod,
+					  (void *)info->sechdrs[index].sh_addr,
+					  info->sechdrs[index].sh_size);
+}
+
 #define MODINFO_ATTR(field)	\
 static void setup_modinfo_##field(struct module *mod, const char *s)  \
 {                                                                     \
@@ -2133,6 +2155,8 @@ static void free_module(struct module *mod)
 	/* Remove dynamic debug info */
 	ddebug_remove_module(mod->name);
 
+	dwarf_remove_table(mod->dwarf_info, false);
+
 	/* Arch-specific cleanup. */
 	module_arch_cleanup(mod);
 
@@ -2970,6 +2994,8 @@ static struct module *setup_load_info(struct load_info *info, int flags)
 
 	info->index.pcpu = find_pcpusec(info);
 
+	info->index.dwarf = find_dwarf(info);
+
 	/* Check module struct version now, before we try to use module. */
 	if (!check_modstruct_version(info->sechdrs, info->index.vers, mod))
 		return ERR_PTR(-ENOEXEC);
@@ -3459,6 +3485,7 @@ static noinline int do_init_module(struct module *mod)
 	/* Drop initial reference. */
 	module_put(mod);
 	trim_init_extable(mod);
+	dwarf_remove_table(mod->dwarf_info, true);
 #ifdef CONFIG_KALLSYMS
 	/* Switch to core kallsyms now init is done: kallsyms may be walking! */
 	rcu_assign_pointer(mod->kallsyms, &mod->core_kallsyms);
@@ -3734,6 +3761,9 @@ static int load_module(struct load_info *info, const char __user *uargs,
 			goto sysfs_cleanup;
 	}
 
+	/* Initialize dwarf table */
+	add_dwarf_table(mod, info);
+
 	/* Get rid of temporary copy. */
 	free_copy(info);
 
-- 
2.12.2

^ permalink raw reply related	[flat|nested] 70+ messages in thread

* [PATCH 5/7] unwinder: show_stack, check also ret_addr_p's contents
  2017-05-05 12:21 [PATCH 1/7] DWARF: add option to preserve unwind info Jiri Slaby
                   ` (2 preceding siblings ...)
  2017-05-05 12:21 ` [PATCH 4/7] DWARF: initialize structures for kernel and modules Jiri Slaby
@ 2017-05-05 12:21 ` Jiri Slaby
  2017-05-05 12:21 ` [PATCH 6/7] unwinder: plug in the DWARF unwinder Jiri Slaby
  2017-05-05 12:22 ` [PATCH 7/7] DWARF: add the config option Jiri Slaby
  5 siblings, 0 replies; 70+ messages in thread
From: Jiri Slaby @ 2017-05-05 12:21 UTC (permalink / raw)
  To: akpm
  Cc: torvalds, live-patching, linux-kernel, Jiri Slaby,
	Thomas Gleixner, Ingo Molnar, H. Peter Anvin, x86,
	Josh Poimboeuf

The DWARF unwinder stores the return address in an internal structure.
This is returned to the generic unwinder. So do not check only
ret_addr_p against the current stack pointer, but also if the address is
the return address.

This makes the unwinder to really think the addresses are reliable.

Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: x86@kernel.org
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
---
 arch/x86/kernel/dumpstack.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index dbce3cca94cb..963c6b60887f 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -123,7 +123,8 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
 				continue;
 			}
 
-			if (stack == ret_addr_p)
+			if (stack == ret_addr_p ||
+					(ret_addr_p && addr == *ret_addr_p))
 				reliable = 1;
 
 			/*
-- 
2.12.2

^ permalink raw reply related	[flat|nested] 70+ messages in thread

* [PATCH 6/7] unwinder: plug in the DWARF unwinder
  2017-05-05 12:21 [PATCH 1/7] DWARF: add option to preserve unwind info Jiri Slaby
                   ` (3 preceding siblings ...)
  2017-05-05 12:21 ` [PATCH 5/7] unwinder: show_stack, check also ret_addr_p's contents Jiri Slaby
@ 2017-05-05 12:21 ` Jiri Slaby
  2017-05-05 12:22 ` [PATCH 7/7] DWARF: add the config option Jiri Slaby
  5 siblings, 0 replies; 70+ messages in thread
From: Jiri Slaby @ 2017-05-05 12:21 UTC (permalink / raw)
  To: akpm
  Cc: torvalds, live-patching, linux-kernel, Jiri Slaby,
	Thomas Gleixner, Ingo Molnar, H. Peter Anvin, x86,
	Josh Poimboeuf

The DWARF unwinder is ready to use, so plug it into the generic
unwinding code.

Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: x86@kernel.org
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
---
 arch/x86/include/asm/unwind.h  | 36 +++++++++++++++-
 arch/x86/kernel/Makefile       |  4 ++
 arch/x86/kernel/unwind_dwarf.c | 95 ++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 133 insertions(+), 2 deletions(-)
 create mode 100644 arch/x86/kernel/unwind_dwarf.c

diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h
index e6676495b125..4c792ad0d9f9 100644
--- a/arch/x86/include/asm/unwind.h
+++ b/arch/x86/include/asm/unwind.h
@@ -12,7 +12,14 @@ struct unwind_state {
 	struct task_struct *task;
 	int graph_idx;
 	bool error;
-#ifdef CONFIG_FRAME_POINTER
+#ifdef CONFIG_DWARF_UNWIND
+	union {
+		struct pt_regs regs;
+		char regs_arr[sizeof(struct pt_regs)];
+	} u;
+	unsigned long dw_sp;
+	unsigned call_frame:1;
+#elif defined(CONFIG_FRAME_POINTER)
 	bool got_irq;
 	unsigned long *bp, *orig_sp;
 	struct pt_regs *regs;
@@ -48,7 +55,32 @@ static inline bool unwind_error(struct unwind_state *state)
 	return state->error;
 }
 
-#ifdef CONFIG_FRAME_POINTER
+#ifdef CONFIG_DWARF_UNWIND
+
+#include <asm/dwarf.h>
+
+static inline
+unsigned long *unwind_get_return_address_ptr(struct unwind_state *state)
+{
+	if (unwind_done(state))
+		return NULL;
+
+	return &DW_PC(state);
+}
+
+static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state)
+{
+	if (unwind_done(state))
+		return NULL;
+
+	/*
+	 * Should we return something? If we return the registers here, they
+	 * are output for every frame.
+	 */
+	return NULL;
+}
+
+#elif defined(CONFIG_FRAME_POINTER)
 
 static inline
 unsigned long *unwind_get_return_address_ptr(struct unwind_state *state)
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 4b994232cb57..604cbcbba9a5 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -124,11 +124,15 @@ obj-$(CONFIG_PERF_EVENTS)		+= perf_regs.o
 obj-$(CONFIG_TRACING)			+= tracepoint.o
 obj-$(CONFIG_SCHED_MC_PRIO)		+= itmt.o
 
+ifdef CONFIG_DWARF_UNWIND
+obj-y					+= unwind_dwarf.o
+else
 ifdef CONFIG_FRAME_POINTER
 obj-y					+= unwind_frame.o
 else
 obj-y					+= unwind_guess.o
 endif
+endif
 
 ###
 # 64 bit specific files
diff --git a/arch/x86/kernel/unwind_dwarf.c b/arch/x86/kernel/unwind_dwarf.c
new file mode 100644
index 000000000000..bbcca970aca8
--- /dev/null
+++ b/arch/x86/kernel/unwind_dwarf.c
@@ -0,0 +1,95 @@
+/*
+ * Copyright (C) 2016-2017 SUSE
+ *      Jiri Slaby <jirislaby@kernel.org>
+ * This code is released under the GPL v2.
+ */
+
+#include <linux/dwarf.h>
+
+unsigned long unwind_get_return_address(struct unwind_state *state)
+{
+	unsigned long *addr = unwind_get_return_address_ptr(state);
+
+	if (unwind_done(state))
+		return 0;
+
+	return ftrace_graph_ret_addr(state->task, &state->graph_idx, *addr,
+			addr);
+}
+EXPORT_SYMBOL_GPL(unwind_get_return_address);
+
+bool unwind_next_frame(struct unwind_state *state)
+{
+	if (unwind_done(state))
+		goto bad;
+
+	if (arch_dwarf_user_mode(state))
+		goto bad;
+
+	if ((state->dw_sp & PAGE_MASK) == (DW_SP(state) & PAGE_MASK) &&
+			state->dw_sp > DW_SP(state))
+		goto bad;
+
+	if (dwarf_unwind(state) || !DW_PC(state))
+		goto bad;
+
+	state->dw_sp = DW_SP(state);
+
+	return true;
+bad:
+	state->stack_info.type = STACK_TYPE_UNKNOWN;
+	return false;
+}
+EXPORT_SYMBOL_GPL(unwind_next_frame);
+
+void __unwind_start(struct unwind_state *state, struct task_struct *task,
+		struct pt_regs *regs, unsigned long *first_frame)
+{
+	bool do_skipping = true;
+	char type;
+
+	memset(state, 0, sizeof(*state));
+	state->task = task;
+
+	if (regs) {
+		arch_dwarf_init_frame_info(state, regs);
+		type = 'R';
+	} else if (task == current) {
+		arch_dwarf_init_running(&state->u.regs);
+		type = 'C';
+#ifdef CONFIG_SMP
+	} else if (task->on_cpu) {
+		return;
+#endif
+	} else {
+		arch_dwarf_init_blocked(state);
+		type = 'B';
+		do_skipping = false;
+	}
+
+	pr_debug("%s: %c FF=%p rip=%lx (%pS) rsp=%lx rbp=%lx\n",
+			__func__, type, first_frame, DW_PC(state),
+			(void *)DW_PC(state), DW_SP(state), DW_FP(state));
+
+	get_stack_info((void *)DW_SP(state), task, &state->stack_info,
+			&state->stack_mask);
+
+	state->dw_sp = DW_SP(state);
+
+	if (arch_dwarf_user_mode(state))
+		return;
+
+	while (do_skipping) {
+		if (DW_SP(state) > (unsigned long)first_frame) {
+			pr_debug("%s: hit first=%p sp=%lx\n", __func__,
+					first_frame, DW_SP(state));
+			break;
+		}
+		if (!unwind_next_frame(state))
+			break;
+		pr_debug("%s: skipped to %pS rsp=%lx rbp=%lx\n", __func__,
+				(void *)DW_PC(state), DW_SP(state),
+				DW_FP(state));
+	}
+}
+EXPORT_SYMBOL_GPL(__unwind_start);
-- 
2.12.2

^ permalink raw reply related	[flat|nested] 70+ messages in thread

* [PATCH 7/7] DWARF: add the config option
  2017-05-05 12:21 [PATCH 1/7] DWARF: add option to preserve unwind info Jiri Slaby
                   ` (4 preceding siblings ...)
  2017-05-05 12:21 ` [PATCH 6/7] unwinder: plug in the DWARF unwinder Jiri Slaby
@ 2017-05-05 12:22 ` Jiri Slaby
  2017-05-05 19:57   ` Linus Torvalds
  5 siblings, 1 reply; 70+ messages in thread
From: Jiri Slaby @ 2017-05-05 12:22 UTC (permalink / raw)
  To: akpm
  Cc: torvalds, live-patching, linux-kernel, Jiri Slaby,
	Thomas Gleixner, Ingo Molnar, H. Peter Anvin, x86

The DWARF unwinder is in place and ready. So introduce the config option
to allow users to enable it. It is by default off due to missing
assembly annotations.

And we now allow turning off FRAME_POINTERS if DWARF unwinder is
selected.

Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: x86@kernel.org
---
 arch/x86/Kconfig  |  2 +-
 lib/Kconfig.debug | 17 +++++++++++++++--
 2 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index cd18994a9555..b37fa89c5f19 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -689,7 +689,7 @@ config X86_32_IRIS
 config SCHED_OMIT_FRAME_POINTER
 	def_bool y
 	prompt "Single-depth WCHAN output"
-	depends on X86
+	depends on X86 && !DWARF_UNWIND
 	---help---
 	  Calculate simpler /proc/<PID>/wchan values. If this option
 	  is disabled then wchan values will recurse back to the
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index e90125b6498e..03297955ece0 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -55,6 +55,17 @@ config UNWIND_INFO
 	  If you don't debug the kernel, you can say N, but we may not be able
 	  to solve problems without frame unwind information or frame pointers.
 
+config DWARF_UNWIND
+	bool "DWARF stack unwind support"
+	depends on UNWIND_INFO && !KASAN
+	depends on X86
+	help
+	  This enables more precise stack traces, omitting all unrelated
+	  occurrences of pointers into kernel code from the dump.
+
+	  KASAN is too slow with this unwinder, so it is excluded from
+	  using in parallel.
+
 config BOOT_PRINTK_DELAY
 	bool "Delay each boot printk message by N milliseconds"
 	depends on DEBUG_KERNEL && PRINTK && GENERIC_CALIBRATE_DELAY
@@ -1690,7 +1701,8 @@ config FAULT_INJECTION_STACKTRACE_FILTER
 	depends on FAULT_INJECTION_DEBUG_FS && STACKTRACE_SUPPORT
 	depends on !X86_64
 	select STACKTRACE
-	select FRAME_POINTER if !MIPS && !PPC && !S390 && !MICROBLAZE && !ARM_UNWIND && !ARC && !SCORE
+	select FRAME_POINTER if !MIPS && !PPC && !S390 && !MICROBLAZE && !X86 && !ARM_UNWIND && !ARC && !SCORE
+	select UNWIND_INFO if X86 && !FRAME_POINTER
 	help
 	  Provide stacktrace filter for fault-injection capabilities
 
@@ -1699,7 +1711,8 @@ config LATENCYTOP
 	depends on DEBUG_KERNEL
 	depends on STACKTRACE_SUPPORT
 	depends on PROC_FS
-	select FRAME_POINTER if !MIPS && !PPC && !S390 && !MICROBLAZE && !ARM_UNWIND && !ARC
+	select FRAME_POINTER if !MIPS && !PPC && !S390 && !MICROBLAZE && !X86 && !ARM_UNWIND && !ARC
+	select UNWIND_INFO if X86 && !FRAME_POINTER
 	select KALLSYMS
 	select KALLSYMS_ALL
 	select STACKTRACE
-- 
2.12.2

^ permalink raw reply related	[flat|nested] 70+ messages in thread

* Re: [PATCH 7/7] DWARF: add the config option
  2017-05-05 12:22 ` [PATCH 7/7] DWARF: add the config option Jiri Slaby
@ 2017-05-05 19:57   ` Linus Torvalds
  2017-05-06  7:19     ` Ingo Molnar
                       ` (3 more replies)
  0 siblings, 4 replies; 70+ messages in thread
From: Linus Torvalds @ 2017-05-05 19:57 UTC (permalink / raw)
  To: Jiri Slaby
  Cc: Andrew Morton, live-patching, Linux Kernel Mailing List,
	Thomas Gleixner, Ingo Molnar, H. Peter Anvin,
	the arch/x86 maintainers

On Fri, May 5, 2017 at 5:22 AM, Jiri Slaby <jslaby@suse.cz> wrote:
> The DWARF unwinder is in place and ready. So introduce the config option
> to allow users to enable it. It is by default off due to missing
> assembly annotations.

Who actually ends up using this?

Because from the last time we had fancy unwindoers, and all the
problems it caused for oops handling with absolutely _zero_ upsides
ever, I do not ever again want to see fancy unwinders with complex
state machine handling used by the oopsing code.

The fact that it gets disabled for KASAN also makes me suspicious. It
basically means that now all the accesses it does are not even
validated.

The fact that the most of the code seems to be disabled for the first
six patches, and then just enabled in the last patch, also seems to
mean that the series also gets no bisection coverage or testing that
the individual patches make any sense. (ie there's a lot of code
inside "CONFIG_DWARF_UNWIND" in the early patches but that config
option cannot even be enabled until the last patch).

We used to have nasty issues with not just missing dwarf info, but
also actively *wrong* dwarf info. Compiler versions that generated
subtly wrong info, because nobody actually really depended on it, and
the people who had tested it seldom did the kinds of things we do in
the kernel (eg inline asms etc).

So I'm personally still very suspicious of these things.

Last time I had huge issues with people also always blaming *anything*
else than that unwinder. It was always "oh, somebody wrote asm without
getting it right". Or "oh, the compiler generated bad tables, it's not
*my* fault that now the kernel oopsing code no longer works".

When I asked for more stricter debug table validation to avoid issues,
it was always "oh, we fixed it, no worries", and then two months later
somebody hit another issue.

Put another way; the last time we did crazy stuff like this, it got
reverted. For a damn good reason, despite some people being in denial
about those reasons.

                       Linus

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH 7/7] DWARF: add the config option
  2017-05-05 19:57   ` Linus Torvalds
@ 2017-05-06  7:19     ` Ingo Molnar
  2017-05-10  7:46       ` Jiri Slaby
  2017-05-06 14:24     ` Jiri Kosina
                       ` (2 subsequent siblings)
  3 siblings, 1 reply; 70+ messages in thread
From: Ingo Molnar @ 2017-05-06  7:19 UTC (permalink / raw)
  To: Linus Torvalds, Josh Poimboeuf
  Cc: Jiri Slaby, Andrew Morton, live-patching,
	Linux Kernel Mailing List, Thomas Gleixner, Ingo Molnar,
	H. Peter Anvin, the arch/x86 maintainers, Josh Poimboeuf


* Linus Torvalds <torvalds@linux-foundation.org> wrote:

> On Fri, May 5, 2017 at 5:22 AM, Jiri Slaby <jslaby@suse.cz> wrote:
> > The DWARF unwinder is in place and ready. So introduce the config option
> > to allow users to enable it. It is by default off due to missing
> > assembly annotations.
> 
> Who actually ends up using this?

Also, why wasn't Josh Poimboeuf Cc:-ed, who de-facto maintains the x86 unwinding 
code?

AFAICS this series is just repeating the old mistakes of the old Dwarf unwinders 
of trusting GCC's unwinder data. So NAK for the time being on the whole approach:

NAcked-by: Ingo Molnar <mingo@kernel.org>

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH 7/7] DWARF: add the config option
  2017-05-05 19:57   ` Linus Torvalds
  2017-05-06  7:19     ` Ingo Molnar
@ 2017-05-06 14:24     ` Jiri Kosina
  2017-05-07 16:55     ` Josh Poimboeuf
  2017-05-10  7:39     ` Jiri Slaby
  3 siblings, 0 replies; 70+ messages in thread
From: Jiri Kosina @ 2017-05-06 14:24 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Jiri Slaby, Andrew Morton, live-patching,
	Linux Kernel Mailing List, Thomas Gleixner, Ingo Molnar,
	H. Peter Anvin, the arch/x86 maintainers, Josh Poimboeuf


[ adding Josh to CC ]

On Fri, 5 May 2017, Linus Torvalds wrote:

> > The DWARF unwinder is in place and ready. So introduce the config option
> > to allow users to enable it. It is by default off due to missing
> > assembly annotations.
> 
> Who actually ends up using this?

As a datapoint -- we've been using dwarf unwinder in SUSE kernels since 
stone age, because we do not enable frame pointer, as it adds up to 10% of 
runtime slowdown for certain workloads, but we want reliable stacktraces 
so that we can actually debug user reports.

Now that we have nicely reliable stacktraces thanks to Josh's objtool for 
FP-enabled builds, we'd like to bring comparably understandable traces 
also to non-FP enabled builds.

> Put another way; the last time we did crazy stuff like this, it got
> reverted. For a damn good reason, despite some people being in denial
> about those reasons.

There had been personal issues involved previously, no questions about 
that :)

Thanks,

-- 
Jiri Kosina
SUSE Labs

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH 7/7] DWARF: add the config option
  2017-05-05 19:57   ` Linus Torvalds
  2017-05-06  7:19     ` Ingo Molnar
  2017-05-06 14:24     ` Jiri Kosina
@ 2017-05-07 16:55     ` Josh Poimboeuf
  2017-05-07 17:59       ` Ingo Molnar
                         ` (4 more replies)
  2017-05-10  7:39     ` Jiri Slaby
  3 siblings, 5 replies; 70+ messages in thread
From: Josh Poimboeuf @ 2017-05-07 16:55 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Jiri Slaby, Andrew Morton, live-patching,
	Linux Kernel Mailing List, Thomas Gleixner, Ingo Molnar,
	H. Peter Anvin, the arch/x86 maintainers, Andy Lutomirski,
	Jiri Kosina

On Fri, May 05, 2017 at 12:57:11PM -0700, Linus Torvalds wrote:
> On Fri, May 5, 2017 at 5:22 AM, Jiri Slaby <jslaby@suse.cz> wrote:
> > The DWARF unwinder is in place and ready. So introduce the config option
> > to allow users to enable it. It is by default off due to missing
> > assembly annotations.
> 
> Who actually ends up using this?
> 
> Because from the last time we had fancy unwindoers, and all the
> problems it caused for oops handling with absolutely _zero_ upsides
> ever, I do not ever again want to see fancy unwinders with complex
> state machine handling used by the oopsing code.
> 
> The fact that it gets disabled for KASAN also makes me suspicious. It
> basically means that now all the accesses it does are not even
> validated.
> 
> The fact that the most of the code seems to be disabled for the first
> six patches, and then just enabled in the last patch, also seems to
> mean that the series also gets no bisection coverage or testing that
> the individual patches make any sense. (ie there's a lot of code
> inside "CONFIG_DWARF_UNWIND" in the early patches but that config
> option cannot even be enabled until the last patch).
> 
> We used to have nasty issues with not just missing dwarf info, but
> also actively *wrong* dwarf info. Compiler versions that generated
> subtly wrong info, because nobody actually really depended on it, and
> the people who had tested it seldom did the kinds of things we do in
> the kernel (eg inline asms etc).
> 
> So I'm personally still very suspicious of these things.
> 
> Last time I had huge issues with people also always blaming *anything*
> else than that unwinder. It was always "oh, somebody wrote asm without
> getting it right". Or "oh, the compiler generated bad tables, it's not
> *my* fault that now the kernel oopsing code no longer works".
> 
> When I asked for more stricter debug table validation to avoid issues,
> it was always "oh, we fixed it, no worries", and then two months later
> somebody hit another issue.
> 
> Put another way; the last time we did crazy stuff like this, it got
> reverted. For a damn good reason, despite some people being in denial
> about those reasons.

Here's another possible idea that's been rattling around in my head.
It's purely theoretical at this point, so I don't know for sure that it
would work.  But I haven't been able to find any major issues with it
yet.

DWARF is great for debuggers.  It helps you find all the registers on
the stack, so you can see function arguments and local variables.  All
expressed in a nice compact format.

But that's overkill for unwinders.  We don't need all those registers,
and the state machine is too complicated.  Unwinders basically only need
to know one thing: given an instruction address and a stack pointer,
where is the caller's stack frame?

I'm thinking/hoping that information can be expressed in a simple, easy
to parse, reasonably sized data structure.  Something like a sorted
array of this:

	struct undwarf {
		unsigned int ip;	/* instruction pointer (relative offset from base) */
		unsigned prev_frame:13;	/* offset to previous frame from current stack pointer */
		unsigned regs:1;	/* whether prev_frame contains entry regs (regs->ip) */
		unsigned align:2;	/* some details for dealing with gcc stack realignment */
	} __packed;

	extern struct undwarf undwarves[];

One instance of the structure would exist for each time the stack
pointer changes, e.g. for every function entry, push/pop, and rsp
add/subtract.  The data could be assembled and sorted offline, possibly
derived from DWARF, or more likely, generated by objtool.  After doing
some rough calculations, I think the section size would be comparable to
the sizes of the DWARF .eh_frame sections it would replace.

If it worked, the "undwarf" unwinder would be a lot simpler than a real
DWARF unwinder.  And validating the sanity of the data at runtime would
be a lot more straightforward.  It could ensure that each stack pointer
is within the bounds of the current stack, like our current unwinder
does.

It could also be easily plugged into the existing x86 unwinder
framework, and would "just work" with the oops dumping code
(show_trace_log_lvl), where if something goes wrong, all the remaining
found text addresses on the stack still get printed with the '?' prefix.

Modules could also have their own undwarf tables.

We could also add reliability checks and warnings to make it suitable
for live patching.  That combined with a debug feature to do periodic
unwinds from an NMI handler should flush out most issues over time.

Taking the idea further, this could even (eventually) support the
unwinding of generated code like bpf and dynamic ftrace trampolines.
When generating code, they could also register undwarf structs
associated with the code, which could be stored in a separate undwarf
list.

I'm not 100% sure it would work, but I can start prototyping it if there
aren't any objections.  The unwinder itself would be easy.  Most of the
work would be in tooling, though much of the needed tooling actually
already exists (in objtool).

-- 
Josh

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH 7/7] DWARF: add the config option
  2017-05-07 16:55     ` Josh Poimboeuf
@ 2017-05-07 17:59       ` Ingo Molnar
  2017-05-07 18:08         ` hpa
  2017-05-08  5:35       ` Andy Lutomirski
                         ` (3 subsequent siblings)
  4 siblings, 1 reply; 70+ messages in thread
From: Ingo Molnar @ 2017-05-07 17:59 UTC (permalink / raw)
  To: Josh Poimboeuf
  Cc: Linus Torvalds, Jiri Slaby, Andrew Morton, live-patching,
	Linux Kernel Mailing List, Thomas Gleixner, Ingo Molnar,
	H. Peter Anvin, the arch/x86 maintainers, Andy Lutomirski,
	Jiri Kosina


* Josh Poimboeuf <jpoimboe@redhat.com> wrote:

> One instance of the structure would exist for each time the stack
> pointer changes, e.g. for every function entry, push/pop, and rsp
> add/subtract.  The data could be assembled and sorted offline, possibly
> derived from DWARF, or more likely, generated by objtool.  After doing
> some rough calculations, I think the section size would be comparable to
> the sizes of the DWARF .eh_frame sections it would replace.

That's something I've been thinking about as well: if objtool generates the 
unwinder data structures then the kernel is not directly exposed to tooling bugs 
anymore.

A fair chunk of the fragility of DWARF comes from the fact that it's generated by 
a tool chain that we cannot fix as part of the kernel project. If GCC generates 
crap debuginfo, and GDB happens to work with it but the kernel not, we'll have to 
work it around in the kernel. If GCC starts bloating debuginfo in the future we 
are screwed as well, etc.

If objtool generates debuginfo then it's _our_ responsibility to have sane 
unwinder info and we obviously manage its structure and size as well. Win-win.

The unwinder itself should still do sanity checks, etc. (like all good debugging 
infrastructure code) - but the nature of the kernel's exposure to tool chain 
details changes in a very fundamental way.

So yes, I think this is a very good idea, assuming it works in practice! ;-)

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH 7/7] DWARF: add the config option
  2017-05-07 17:59       ` Ingo Molnar
@ 2017-05-07 18:08         ` hpa
  2017-05-07 21:48           ` Josh Poimboeuf
  0 siblings, 1 reply; 70+ messages in thread
From: hpa @ 2017-05-07 18:08 UTC (permalink / raw)
  To: Ingo Molnar, Josh Poimboeuf
  Cc: Linus Torvalds, Jiri Slaby, Andrew Morton, live-patching,
	Linux Kernel Mailing List, Thomas Gleixner, Ingo Molnar,
	the arch/x86 maintainers, Andy Lutomirski, Jiri Kosina

On May 7, 2017 10:59:16 AM PDT, Ingo Molnar <mingo@kernel.org> wrote:
>
>* Josh Poimboeuf <jpoimboe@redhat.com> wrote:
>
>> One instance of the structure would exist for each time the stack
>> pointer changes, e.g. for every function entry, push/pop, and rsp
>> add/subtract.  The data could be assembled and sorted offline,
>possibly
>> derived from DWARF, or more likely, generated by objtool.  After
>doing
>> some rough calculations, I think the section size would be comparable
>to
>> the sizes of the DWARF .eh_frame sections it would replace.
>
>That's something I've been thinking about as well: if objtool generates
>the 
>unwinder data structures then the kernel is not directly exposed to
>tooling bugs 
>anymore.
>
>A fair chunk of the fragility of DWARF comes from the fact that it's
>generated by 
>a tool chain that we cannot fix as part of the kernel project. If GCC
>generates 
>crap debuginfo, and GDB happens to work with it but the kernel not,
>we'll have to 
>work it around in the kernel. If GCC starts bloating debuginfo in the
>future we 
>are screwed as well, etc.
>
>If objtool generates debuginfo then it's _our_ responsibility to have
>sane 
>unwinder info and we obviously manage its structure and size as well.
>Win-win.
>
>The unwinder itself should still do sanity checks, etc. (like all good
>debugging 
>infrastructure code) - but the nature of the kernel's exposure to tool
>chain 
>details changes in a very fundamental way.
>
>So yes, I think this is a very good idea, assuming it works in
>practice! ;-)
>
>Thanks,
>
>	Ingo

Can objtool verify the unwinder at each address in the kernel, or is that an AI-complete problem?
-- 
Sent from my Android device with K-9 Mail. Please excuse my brevity.

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH 7/7] DWARF: add the config option
  2017-05-07 18:08         ` hpa
@ 2017-05-07 21:48           ` Josh Poimboeuf
  2017-05-08  7:50             ` Vojtech Pavlik
  0 siblings, 1 reply; 70+ messages in thread
From: Josh Poimboeuf @ 2017-05-07 21:48 UTC (permalink / raw)
  To: hpa
  Cc: Ingo Molnar, Linus Torvalds, Jiri Slaby, Andrew Morton,
	live-patching, Linux Kernel Mailing List, Thomas Gleixner,
	Ingo Molnar, the arch/x86 maintainers, Andy Lutomirski,
	Jiri Kosina

On Sun, May 07, 2017 at 11:08:19AM -0700, hpa@zytor.com wrote:
> On May 7, 2017 10:59:16 AM PDT, Ingo Molnar <mingo@kernel.org> wrote:
> >
> >* Josh Poimboeuf <jpoimboe@redhat.com> wrote:
> >
> >> One instance of the structure would exist for each time the stack
> >> pointer changes, e.g. for every function entry, push/pop, and rsp
> >> add/subtract.  The data could be assembled and sorted offline,
> >possibly
> >> derived from DWARF, or more likely, generated by objtool.  After
> >doing
> >> some rough calculations, I think the section size would be comparable
> >to
> >> the sizes of the DWARF .eh_frame sections it would replace.
> >
> >That's something I've been thinking about as well: if objtool generates
> >the 
> >unwinder data structures then the kernel is not directly exposed to
> >tooling bugs 
> >anymore.
> >
> >A fair chunk of the fragility of DWARF comes from the fact that it's
> >generated by 
> >a tool chain that we cannot fix as part of the kernel project. If GCC
> >generates 
> >crap debuginfo, and GDB happens to work with it but the kernel not,
> >we'll have to 
> >work it around in the kernel. If GCC starts bloating debuginfo in the
> >future we 
> >are screwed as well, etc.
> >
> >If objtool generates debuginfo then it's _our_ responsibility to have
> >sane 
> >unwinder info and we obviously manage its structure and size as well.
> >Win-win.
> >
> >The unwinder itself should still do sanity checks, etc. (like all good
> >debugging 
> >infrastructure code) - but the nature of the kernel's exposure to tool
> >chain 
> >details changes in a very fundamental way.
> >
> >So yes, I think this is a very good idea, assuming it works in
> >practice! ;-)
> >
> >Thanks,
> >
> >	Ingo
> 
> Can objtool verify the unwinder at each address in the kernel, or is that an AI-complete problem?

It can't verify the *unwinder*, but it can verify the data which is fed
to the unwinder (either DWARF or the structs I proposed above).  For
each function, it follows every possible code path, and it can keep
track of the stack pointer while doing so.

-- 
Josh

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH 7/7] DWARF: add the config option
  2017-05-07 16:55     ` Josh Poimboeuf
  2017-05-07 17:59       ` Ingo Molnar
@ 2017-05-08  5:35       ` Andy Lutomirski
  2017-05-08  6:15         ` Ingo Molnar
  2017-05-08 14:40         ` Josh Poimboeuf
  2017-05-09  0:21       ` Andy Lutomirski
                         ` (2 subsequent siblings)
  4 siblings, 2 replies; 70+ messages in thread
From: Andy Lutomirski @ 2017-05-08  5:35 UTC (permalink / raw)
  To: Josh Poimboeuf
  Cc: Linus Torvalds, Jiri Slaby, Andrew Morton, live-patching,
	Linux Kernel Mailing List, Thomas Gleixner, Ingo Molnar,
	H. Peter Anvin, the arch/x86 maintainers, Andy Lutomirski,
	Jiri Kosina

On Sun, May 7, 2017 at 9:55 AM, Josh Poimboeuf <jpoimboe@redhat.com> wrote:
> On Fri, May 05, 2017 at 12:57:11PM -0700, Linus Torvalds wrote:
>> On Fri, May 5, 2017 at 5:22 AM, Jiri Slaby <jslaby@suse.cz> wrote:
>> > The DWARF unwinder is in place and ready. So introduce the config option
>> > to allow users to enable it. It is by default off due to missing
>> > assembly annotations.
>>
>> Who actually ends up using this?
>>
>> Because from the last time we had fancy unwindoers, and all the
>> problems it caused for oops handling with absolutely _zero_ upsides
>> ever, I do not ever again want to see fancy unwinders with complex
>> state machine handling used by the oopsing code.
>>
>> The fact that it gets disabled for KASAN also makes me suspicious. It
>> basically means that now all the accesses it does are not even
>> validated.
>>
>> The fact that the most of the code seems to be disabled for the first
>> six patches, and then just enabled in the last patch, also seems to
>> mean that the series also gets no bisection coverage or testing that
>> the individual patches make any sense. (ie there's a lot of code
>> inside "CONFIG_DWARF_UNWIND" in the early patches but that config
>> option cannot even be enabled until the last patch).
>>
>> We used to have nasty issues with not just missing dwarf info, but
>> also actively *wrong* dwarf info. Compiler versions that generated
>> subtly wrong info, because nobody actually really depended on it, and
>> the people who had tested it seldom did the kinds of things we do in
>> the kernel (eg inline asms etc).
>>
>> So I'm personally still very suspicious of these things.
>>
>> Last time I had huge issues with people also always blaming *anything*
>> else than that unwinder. It was always "oh, somebody wrote asm without
>> getting it right". Or "oh, the compiler generated bad tables, it's not
>> *my* fault that now the kernel oopsing code no longer works".
>>
>> When I asked for more stricter debug table validation to avoid issues,
>> it was always "oh, we fixed it, no worries", and then two months later
>> somebody hit another issue.
>>
>> Put another way; the last time we did crazy stuff like this, it got
>> reverted. For a damn good reason, despite some people being in denial
>> about those reasons.
>
> Here's another possible idea that's been rattling around in my head.
> It's purely theoretical at this point, so I don't know for sure that it
> would work.  But I haven't been able to find any major issues with it
> yet.
>
> DWARF is great for debuggers.  It helps you find all the registers on
> the stack, so you can see function arguments and local variables.  All
> expressed in a nice compact format.
>
> But that's overkill for unwinders.  We don't need all those registers,
> and the state machine is too complicated.  Unwinders basically only need
> to know one thing: given an instruction address and a stack pointer,
> where is the caller's stack frame?

I think that, if the code were sufficiently robust, it would be handy
if the unwinder displayed function arguments.  DWARF can do that to a
limited extent.

That being said, having a simpler table format would probably cover
most of the use cases.

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH 7/7] DWARF: add the config option
  2017-05-08  5:35       ` Andy Lutomirski
@ 2017-05-08  6:15         ` Ingo Molnar
  2017-05-08 14:40         ` Josh Poimboeuf
  1 sibling, 0 replies; 70+ messages in thread
From: Ingo Molnar @ 2017-05-08  6:15 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Josh Poimboeuf, Linus Torvalds, Jiri Slaby, Andrew Morton,
	live-patching, Linux Kernel Mailing List, Thomas Gleixner,
	Ingo Molnar, H. Peter Anvin, the arch/x86 maintainers,
	Andy Lutomirski, Jiri Kosina


* Andy Lutomirski <luto@amacapital.net> wrote:

> I think that, if the code were sufficiently robust, it would be handy if the 
> unwinder displayed function arguments.  DWARF can do that to a limited extent.
> 
> That being said, having a simpler table format would probably cover most of the 
> use cases.

I'd say that if objtool generates the kernel's debuginfo, it all becomes a kernel 
internal matter to a large degree: we can add function argument display support as 
well and see what effect it has on data structure size and complexity - it would 
certainly be a nice improvement in oops output to see function arguments.

But it should all start from a minimum complexity step (which will be complex 
enough!) and the first step should be stack trace display, in a performance 
optimized data structure, to allow both kernel stack dumps and various tracing and 
other instrumentation facilities that make use of the kernel's dwarf-ish unwider 
as-is.

The goal of this first step would be to allow x86 to drop generating the RBP frame 
pointer:

 - This shrinks kernel text and instruction count (by 1-2% IIRC),
 - speeds up certain code paths measurably,
 - plus it also allows larger functions to use one more general purpose register.

... so it's a good thing to have, if and only if the unwinder's fragility and 
complexity does not kill us. And much of that fragility comes from who generates 
the debuginfo ...

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH 7/7] DWARF: add the config option
  2017-05-07 21:48           ` Josh Poimboeuf
@ 2017-05-08  7:50             ` Vojtech Pavlik
  2017-05-08 13:14               ` Josh Poimboeuf
  0 siblings, 1 reply; 70+ messages in thread
From: Vojtech Pavlik @ 2017-05-08  7:50 UTC (permalink / raw)
  To: Josh Poimboeuf
  Cc: hpa, Ingo Molnar, Linus Torvalds, Jiri Slaby, Andrew Morton,
	live-patching, Linux Kernel Mailing List, Thomas Gleixner,
	Ingo Molnar, the arch/x86 maintainers, Andy Lutomirski,
	Jiri Kosina

On Sun, May 07, 2017 at 04:48:36PM -0500, Josh Poimboeuf wrote:

> > Can objtool verify the unwinder at each address in the kernel, or is that an AI-complete problem?
> 
> It can't verify the *unwinder*, but it can verify the data which is fed
> to the unwinder (either DWARF or the structs I proposed above).  For
> each function, it follows every possible code path, and it can keep
> track of the stack pointer while doing so.

In that case, the kernel build process can verify the DWARF data and its
compatibility with the kernel unwinder by running the unwinder against
each kernel code address verifying the output and bail if there is a bug
in the toolchain that affects it.

-- 
Vojtech Pavlik
Director SuSE Labs

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH 7/7] DWARF: add the config option
  2017-05-08  7:50             ` Vojtech Pavlik
@ 2017-05-08 13:14               ` Josh Poimboeuf
  0 siblings, 0 replies; 70+ messages in thread
From: Josh Poimboeuf @ 2017-05-08 13:14 UTC (permalink / raw)
  To: Vojtech Pavlik
  Cc: hpa, Ingo Molnar, Linus Torvalds, Jiri Slaby, Andrew Morton,
	live-patching, Linux Kernel Mailing List, Thomas Gleixner,
	Ingo Molnar, the arch/x86 maintainers, Andy Lutomirski,
	Jiri Kosina

On Mon, May 08, 2017 at 09:50:54AM +0200, Vojtech Pavlik wrote:
> On Sun, May 07, 2017 at 04:48:36PM -0500, Josh Poimboeuf wrote:
> 
> > > Can objtool verify the unwinder at each address in the kernel, or is that an AI-complete problem?
> > 
> > It can't verify the *unwinder*, but it can verify the data which is fed
> > to the unwinder (either DWARF or the structs I proposed above).  For
> > each function, it follows every possible code path, and it can keep
> > track of the stack pointer while doing so.
> 
> In that case, the kernel build process can verify the DWARF data and its
> compatibility with the kernel unwinder by running the unwinder against
> each kernel code address verifying the output

If I understand the idea correctly, we'd have to make the unwinder
dual-purpose such that it can run both in the kernel and in some kind of
user space objtool test harness.  The stack wouldn't be real, so
presumably each iteration of the test would only unwind a frame
associated with the current function.

It wouldn't be able to test edge cases like entry code and generated
code which aren't normal "functions", which objtool currently has no way
of understanding.  Also it wouldn't test how the unwinder deals with
corrupt DWARF data or corrupt stacks, unless we integrated some kind of
fuzzer in the harness.

And, at the end of the day, we'd still just be testing in an artificial
unit test environment.  So I'm not really crazy about the idea.

> and bail if there is a bug in the toolchain that affects it.

Objtool can already find _toolchain_ bugs without having to run the
unwinder in some kind of emulator.  It can't find _unwinder_ bugs, but I
really think such testing should be done at runtime in the unwinder's
native kernel environment.

-- 
Josh

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH 7/7] DWARF: add the config option
  2017-05-08  5:35       ` Andy Lutomirski
  2017-05-08  6:15         ` Ingo Molnar
@ 2017-05-08 14:40         ` Josh Poimboeuf
  2017-05-08 18:57           ` hpa
  1 sibling, 1 reply; 70+ messages in thread
From: Josh Poimboeuf @ 2017-05-08 14:40 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Linus Torvalds, Jiri Slaby, Andrew Morton, live-patching,
	Linux Kernel Mailing List, Thomas Gleixner, Ingo Molnar,
	H. Peter Anvin, the arch/x86 maintainers, Andy Lutomirski,
	Jiri Kosina

On Sun, May 07, 2017 at 10:35:28PM -0700, Andy Lutomirski wrote:
> I think that, if the code were sufficiently robust, it would be handy
> if the unwinder displayed function arguments.  DWARF can do that to a
> limited extent.

Honestly I get the feeling that displaying function arguments wouldn't
be realistic (DWARF or no DWARF).  On x86-64, arguments are passed in
registers, so tracking down their values is a lot more involved than
just looking at the stack.

The DWARF CFI only shows you the callee-saved registers.  To figure out
the other registers you'd have to dive into the other DWARF sections and
examine previous stack frames for clues.  I think it's not a
deterministic process, based on how often I see gdb complain with
'<value optimized out>'.  I'd bet it's a lot harder than a basic stack
dump.

Also, most kernel functions rely on pointer arguments, which are pretty
much useless without dumping the contents of the structs they point to.
But then doing that properly would be a whole new level of difficulty.

-- 
Josh

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH 7/7] DWARF: add the config option
  2017-05-08 14:40         ` Josh Poimboeuf
@ 2017-05-08 18:57           ` hpa
  0 siblings, 0 replies; 70+ messages in thread
From: hpa @ 2017-05-08 18:57 UTC (permalink / raw)
  To: Josh Poimboeuf, Andy Lutomirski
  Cc: Linus Torvalds, Jiri Slaby, Andrew Morton, live-patching,
	Linux Kernel Mailing List, Thomas Gleixner, Ingo Molnar,
	the arch/x86 maintainers, Andy Lutomirski, Jiri Kosina

On May 8, 2017 7:40:49 AM PDT, Josh Poimboeuf <jpoimboe@redhat.com> wrote:
>On Sun, May 07, 2017 at 10:35:28PM -0700, Andy Lutomirski wrote:
>> I think that, if the code were sufficiently robust, it would be handy
>> if the unwinder displayed function arguments.  DWARF can do that to a
>> limited extent.
>
>Honestly I get the feeling that displaying function arguments wouldn't
>be realistic (DWARF or no DWARF).  On x86-64, arguments are passed in
>registers, so tracking down their values is a lot more involved than
>just looking at the stack.
>
>The DWARF CFI only shows you the callee-saved registers.  To figure out
>the other registers you'd have to dive into the other DWARF sections
>and
>examine previous stack frames for clues.  I think it's not a
>deterministic process, based on how often I see gdb complain with
>'<value optimized out>'.  I'd bet it's a lot harder than a basic stack
>dump.
>
>Also, most kernel functions rely on pointer arguments, which are pretty
>much useless without dumping the contents of the structs they point to.
>But then doing that properly would be a whole new level of difficulty.

At some point you are just reinventing k(g)db...
-- 
Sent from my Android device with K-9 Mail. Please excuse my brevity.

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH 7/7] DWARF: add the config option
  2017-05-07 16:55     ` Josh Poimboeuf
  2017-05-07 17:59       ` Ingo Molnar
  2017-05-08  5:35       ` Andy Lutomirski
@ 2017-05-09  0:21       ` Andy Lutomirski
  2017-05-09  1:38         ` Josh Poimboeuf
  2017-05-09 18:47       ` Jiri Kosina
  2017-05-19 20:53       ` Josh Poimboeuf
  4 siblings, 1 reply; 70+ messages in thread
From: Andy Lutomirski @ 2017-05-09  0:21 UTC (permalink / raw)
  To: Josh Poimboeuf
  Cc: Linus Torvalds, Jiri Slaby, Andrew Morton, live-patching,
	Linux Kernel Mailing List, Thomas Gleixner, Ingo Molnar,
	H. Peter Anvin, the arch/x86 maintainers, Andy Lutomirski,
	Jiri Kosina

On Sun, May 7, 2017 at 9:55 AM, Josh Poimboeuf <jpoimboe@redhat.com> wrote:
> On Fri, May 05, 2017 at 12:57:11PM -0700, Linus Torvalds wrote:
>> On Fri, May 5, 2017 at 5:22 AM, Jiri Slaby <jslaby@suse.cz> wrote:
>> > The DWARF unwinder is in place and ready. So introduce the config option
>> > to allow users to enable it. It is by default off due to missing
>> > assembly annotations.
>>
>> Who actually ends up using this?
>>
>> Because from the last time we had fancy unwindoers, and all the
>> problems it caused for oops handling with absolutely _zero_ upsides
>> ever, I do not ever again want to see fancy unwinders with complex
>> state machine handling used by the oopsing code.
>>
>> The fact that it gets disabled for KASAN also makes me suspicious. It
>> basically means that now all the accesses it does are not even
>> validated.
>>
>> The fact that the most of the code seems to be disabled for the first
>> six patches, and then just enabled in the last patch, also seems to
>> mean that the series also gets no bisection coverage or testing that
>> the individual patches make any sense. (ie there's a lot of code
>> inside "CONFIG_DWARF_UNWIND" in the early patches but that config
>> option cannot even be enabled until the last patch).
>>
>> We used to have nasty issues with not just missing dwarf info, but
>> also actively *wrong* dwarf info. Compiler versions that generated
>> subtly wrong info, because nobody actually really depended on it, and
>> the people who had tested it seldom did the kinds of things we do in
>> the kernel (eg inline asms etc).
>>
>> So I'm personally still very suspicious of these things.
>>
>> Last time I had huge issues with people also always blaming *anything*
>> else than that unwinder. It was always "oh, somebody wrote asm without
>> getting it right". Or "oh, the compiler generated bad tables, it's not
>> *my* fault that now the kernel oopsing code no longer works".
>>
>> When I asked for more stricter debug table validation to avoid issues,
>> it was always "oh, we fixed it, no worries", and then two months later
>> somebody hit another issue.
>>
>> Put another way; the last time we did crazy stuff like this, it got
>> reverted. For a damn good reason, despite some people being in denial
>> about those reasons.
>
> Here's another possible idea that's been rattling around in my head.
> It's purely theoretical at this point, so I don't know for sure that it
> would work.  But I haven't been able to find any major issues with it
> yet.
>
> DWARF is great for debuggers.  It helps you find all the registers on
> the stack, so you can see function arguments and local variables.  All
> expressed in a nice compact format.
>
> But that's overkill for unwinders.  We don't need all those registers,
> and the state machine is too complicated.  Unwinders basically only need
> to know one thing: given an instruction address and a stack pointer,
> where is the caller's stack frame?
>
> I'm thinking/hoping that information can be expressed in a simple, easy
> to parse, reasonably sized data structure.  Something like a sorted
> array of this:
>
>         struct undwarf {
>                 unsigned int ip;        /* instruction pointer (relative offset from base) */
>                 unsigned prev_frame:13; /* offset to previous frame from current stack pointer */
>                 unsigned regs:1;        /* whether prev_frame contains entry regs (regs->ip) */
>                 unsigned align:2;       /* some details for dealing with gcc stack realignment */
>         } __packed;
>
>         extern struct undwarf undwarves[];

Some comments in case you're actually planning to do this:

'unsigned int ip' is the majority of the size of this thing.  It might
be worth trying to store a lot fewer bits.  You could split the
structure into:

struct undwarf_header {
  unsigned long start_ip;
  unsigned align:2;  /* i'm assuming this rarely changes */
  ...;
  unsigned int offset_to_details;
};

and

struct undwarf_details {
  unsigned short ip_offset;
  unsigned short prev_frame;
};

and you'd find the details by first looking up the last header before
the ip and then finding the details starting at (uintptr_t)header +
header->offset_to_details.

Also, don't you need some indication of which reg is the base from
which you find previous frame?  After all, sometimes GCC will emit a
frame pointer even in an otherwise frame-pointer-omitting kernel.

--Andy

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH 7/7] DWARF: add the config option
  2017-05-09  0:21       ` Andy Lutomirski
@ 2017-05-09  1:38         ` Josh Poimboeuf
  2017-05-09  2:31           ` Andy Lutomirski
  0 siblings, 1 reply; 70+ messages in thread
From: Josh Poimboeuf @ 2017-05-09  1:38 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Linus Torvalds, Jiri Slaby, Andrew Morton, live-patching,
	Linux Kernel Mailing List, Thomas Gleixner, Ingo Molnar,
	H. Peter Anvin, the arch/x86 maintainers, Jiri Kosina

On Mon, May 08, 2017 at 05:21:24PM -0700, Andy Lutomirski wrote:
> On Sun, May 7, 2017 at 9:55 AM, Josh Poimboeuf <jpoimboe@redhat.com> wrote:
> >         struct undwarf {
> >                 unsigned int ip;        /* instruction pointer (relative offset from base) */
> >                 unsigned prev_frame:13; /* offset to previous frame from current stack pointer */
> >                 unsigned regs:1;        /* whether prev_frame contains entry regs (regs->ip) */
> >                 unsigned align:2;       /* some details for dealing with gcc stack realignment */
> >         } __packed;
> >
> >         extern struct undwarf undwarves[];
> 
> Some comments in case you're actually planning to do this:
> 
> 'unsigned int ip' is the majority of the size of this thing.  It might
> be worth trying to store a lot fewer bits.  You could split the
> structure into:
> 
> struct undwarf_header {
>   unsigned long start_ip;
>   unsigned align:2;  /* i'm assuming this rarely changes */
>   ...;
>   unsigned int offset_to_details;
> };
> 
> and
> 
> struct undwarf_details {
>   unsigned short ip_offset;
>   unsigned short prev_frame;
> };
> 
> and you'd find the details by first looking up the last header before
> the ip and then finding the details starting at (uintptr_t)header +
> header->offset_to_details.

Good idea.  According to some back-of-a-napkin math, a scheme like this
could reduce the data size from 1.8M down to 1.2M with my kernel config,
a not-too-shabby 600k savings.

> Also, don't you need some indication of which reg is the base from
> which you find previous frame?  After all, sometimes GCC will emit a
> frame pointer even in an otherwise frame-pointer-omitting kernel.

I don't think we *need* to do that.  I believe the base reg can just
always[*] be the stack pointer, even with frame pointers.

That said, it might be beneficial to use the frame pointer as the base
reg where applicable, because it would shrink the data size, especially
in a kernel with frame pointers enabled.  And I think we would only need
an extra bit to store that info.

First I'll just start with the simplest possible scheme (i.e., what I
proposed above).  Even that calculates out to be smaller than the DWARF
.eh_frame stuff.  Then after that we can look at compression techniques
(and their associated tradeoffs).

[*] ignoring rare gcc stack realignments

-- 
Josh

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH 7/7] DWARF: add the config option
  2017-05-09  1:38         ` Josh Poimboeuf
@ 2017-05-09  2:31           ` Andy Lutomirski
  2017-05-09  3:38             ` Josh Poimboeuf
  0 siblings, 1 reply; 70+ messages in thread
From: Andy Lutomirski @ 2017-05-09  2:31 UTC (permalink / raw)
  To: Josh Poimboeuf
  Cc: Andy Lutomirski, Linus Torvalds, Jiri Slaby, Andrew Morton,
	live-patching, Linux Kernel Mailing List, Thomas Gleixner,
	Ingo Molnar, H. Peter Anvin, the arch/x86 maintainers,
	Jiri Kosina

On Mon, May 8, 2017 at 6:38 PM, Josh Poimboeuf <jpoimboe@redhat.com> wrote:
> On Mon, May 08, 2017 at 05:21:24PM -0700, Andy Lutomirski wrote:
>> On Sun, May 7, 2017 at 9:55 AM, Josh Poimboeuf <jpoimboe@redhat.com> wrote:
>> >         struct undwarf {
>> >                 unsigned int ip;        /* instruction pointer (relative offset from base) */
>> >                 unsigned prev_frame:13; /* offset to previous frame from current stack pointer */
>> >                 unsigned regs:1;        /* whether prev_frame contains entry regs (regs->ip) */
>> >                 unsigned align:2;       /* some details for dealing with gcc stack realignment */
>> >         } __packed;
>> >
>> >         extern struct undwarf undwarves[];
>>
>> Some comments in case you're actually planning to do this:
>>
>> 'unsigned int ip' is the majority of the size of this thing.  It might
>> be worth trying to store a lot fewer bits.  You could split the
>> structure into:
>>
>> struct undwarf_header {
>>   unsigned long start_ip;
>>   unsigned align:2;  /* i'm assuming this rarely changes */
>>   ...;
>>   unsigned int offset_to_details;
>> };
>>
>> and
>>
>> struct undwarf_details {
>>   unsigned short ip_offset;
>>   unsigned short prev_frame;
>> };
>>
>> and you'd find the details by first looking up the last header before
>> the ip and then finding the details starting at (uintptr_t)header +
>> header->offset_to_details.
>
> Good idea.  According to some back-of-a-napkin math, a scheme like this
> could reduce the data size from 1.8M down to 1.2M with my kernel config,
> a not-too-shabby 600k savings.
>
>> Also, don't you need some indication of which reg is the base from
>> which you find previous frame?  After all, sometimes GCC will emit a
>> frame pointer even in an otherwise frame-pointer-omitting kernel.
>
> I don't think we *need* to do that.  I believe the base reg can just
> always[*] be the stack pointer, even with frame pointers.

What if there are functions that use alloca or variable length arrays
on the stack?  Functions using AHASH_REQUEST_ON_STACK come to mind.

--Andy

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH 7/7] DWARF: add the config option
  2017-05-09  2:31           ` Andy Lutomirski
@ 2017-05-09  3:38             ` Josh Poimboeuf
  2017-05-09 10:00               ` hpa
  0 siblings, 1 reply; 70+ messages in thread
From: Josh Poimboeuf @ 2017-05-09  3:38 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Linus Torvalds, Jiri Slaby, Andrew Morton, live-patching,
	Linux Kernel Mailing List, Thomas Gleixner, Ingo Molnar,
	H. Peter Anvin, the arch/x86 maintainers, Jiri Kosina

On Mon, May 08, 2017 at 07:31:50PM -0700, Andy Lutomirski wrote:
> On Mon, May 8, 2017 at 6:38 PM, Josh Poimboeuf <jpoimboe@redhat.com> wrote:
> >> Also, don't you need some indication of which reg is the base from
> >> which you find previous frame?  After all, sometimes GCC will emit a
> >> frame pointer even in an otherwise frame-pointer-omitting kernel.
> >
> > I don't think we *need* to do that.  I believe the base reg can just
> > always[*] be the stack pointer, even with frame pointers.
> 
> What if there are functions that use alloca or variable length arrays
> on the stack?  Functions using AHASH_REQUEST_ON_STACK come to mind.

Wow, mind blown.  This is why I added you to CC!

Ok, I guess we'll need to be able to use the frame pointer as a base
reg.  It should be easy anyway.

-- 
Josh

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH 7/7] DWARF: add the config option
  2017-05-09  3:38             ` Josh Poimboeuf
@ 2017-05-09 10:00               ` hpa
  2017-05-09 14:58                 ` Josh Poimboeuf
  2017-05-10  8:15                 ` Jiri Slaby
  0 siblings, 2 replies; 70+ messages in thread
From: hpa @ 2017-05-09 10:00 UTC (permalink / raw)
  To: Josh Poimboeuf, Andy Lutomirski
  Cc: Linus Torvalds, Jiri Slaby, Andrew Morton, live-patching,
	Linux Kernel Mailing List, Thomas Gleixner, Ingo Molnar,
	the arch/x86 maintainers, Jiri Kosina, hjl.tools

On May 8, 2017 8:38:31 PM PDT, Josh Poimboeuf <jpoimboe@redhat.com> wrote:
>On Mon, May 08, 2017 at 07:31:50PM -0700, Andy Lutomirski wrote:
>> On Mon, May 8, 2017 at 6:38 PM, Josh Poimboeuf <jpoimboe@redhat.com>
>wrote:
>> >> Also, don't you need some indication of which reg is the base from
>> >> which you find previous frame?  After all, sometimes GCC will emit
>a
>> >> frame pointer even in an otherwise frame-pointer-omitting kernel.
>> >
>> > I don't think we *need* to do that.  I believe the base reg can
>just
>> > always[*] be the stack pointer, even with frame pointers.
>> 
>> What if there are functions that use alloca or variable length arrays
>> on the stack?  Functions using AHASH_REQUEST_ON_STACK come to mind.
>
>Wow, mind blown.  This is why I added you to CC!
>
>Ok, I guess we'll need to be able to use the frame pointer as a base
>reg.  It should be easy anyway.

[Adding H.J. Lu for obvious expertise - H.J., the issue at hand is doing stack unwinding to display the call stack on a kernel failure.  Right now the standard kernel configuration is using frame pointers even on x86-64 because we had very bad experiences with fragility of .eh_frame-based unwinding.  However, on some kernel workloads the cost of frame pointers  I is quite high.]

Yes is exactly why.  I believe gcc will still use a frame pointer when the relationship between sp and the incoming stack frame is indeterminate for some reason, but I have to admit that a) I'm not 100% sure and b) there are DWARF annotations for exactly this, so even if it is true for current gcc that a frame pointer is not needed it might not be in the future.

As far as I understand, the .eh_frame section is supposed to contain the subset of the DWARF bytecode needed to do a stack unwind when an exception is thrown, whereas the .debug* sections contain the full DWARF data a debugger might want.  Thus .eh_frame is mapped into the runtime process while .debug* [usually?] is not.  .debug* can easily be 10x larger than the executable text segments.

Since C doesn't *have* exceptions, the main user of this for C code is the case of calls C++ > C > C++ or equivalent; thus the vulnerability to toolchain filters failures – the case of an exception-caused unwind in the middle of a C function might not have been extensively tested.  [H.J.: is that correct?  Or could an asynchronous event like a signal cause an unwind of the .eh_frame from an arbitrary point?  If so, how is that tested?]

In the case of the kernel, size matters in a different way, because even though it will be cache cold this data takes up unreclaimable RAM.  However, frame pointer-related code eats up not just RAM but cache.

Assembly language routines become problematic no matter what you do unless you restrict the way the assembly can be written.  Experience has shown us that hand-maintaining annotations in assembly code is doomed to failure, and in many cases it isn't even clear to even experienced programmers how to do it.  [H.J.: is the .cfi_* operations set even documented anywhere in a way that non-compiler-writers can comprehend it?] Inline assembly becomes problematic in the case of non-frame-pointer builds if the assembly changes the stack pointer.  A static tool might be able to annotate simple cases like push/pop.

I'm, ahem, highly skeptical to creating our own unwinding data format unless there is *documented, supported, and tested* way to force the compiler to *automatically* fall back to frame pointers any time there may be complexity involved, which at a very minimum includes any kind of data-dependent manipulation of the stack pointer.  Otherwise you will have to fail the kernel build when your static tool runs into instruction sequences it can't deal with, but the compiler may generate them - boom.  Worse, your tool will not even recognize the problem and you're in a worse place than when you started.


-- 
Sent from my Android device with K-9 Mail. Please excuse my brevity.

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH 7/7] DWARF: add the config option
  2017-05-09 10:00               ` hpa
@ 2017-05-09 14:58                 ` Josh Poimboeuf
  2017-05-09 16:46                   ` H.J. Lu
  2017-05-10  8:15                 ` Jiri Slaby
  1 sibling, 1 reply; 70+ messages in thread
From: Josh Poimboeuf @ 2017-05-09 14:58 UTC (permalink / raw)
  To: hpa
  Cc: Andy Lutomirski, Linus Torvalds, Jiri Slaby, Andrew Morton,
	live-patching, Linux Kernel Mailing List, Thomas Gleixner,
	Ingo Molnar, the arch/x86 maintainers, Jiri Kosina, hjl.tools

On Tue, May 09, 2017 at 03:00:45AM -0700, hpa@zytor.com wrote:
> I'm, ahem, highly skeptical to creating our own unwinding data format
> unless there is *documented, supported, and tested* way to force the
> compiler to *automatically* fall back to frame pointers any time there
> may be complexity involved, which at a very minimum includes any kind
> of data-dependent manipulation of the stack pointer.

That would be nice.  But isn't falling back to a frame pointer (or
another callee-saved reg or a stack location) already needed in such
cases?  Otherwise how could DWARF unwinding work?

> Otherwise you will have to fail the kernel build when your static tool
> runs into instruction sequences it can't deal with, but the compiler
> may generate them - boom.

Failing the build is harsh, we could just warn about it and skip the
data for the affected function(s).

BTW, there is another option.  Instead of generating the data from
scratch, we could just convert gcc's DWARF CFI to the format we need.

However that wouldn't solve the problems we have with the holes and
inaccuracies in DWARF from our hand-annotated asm, inline asm, and
special sections (extable, alternatives, etc).  We'd still have to rely
on objtool for that, so we'd still be in the same boat of needing
objtool to be able to follow gcc code paths.

So yes, it sucks that objtool needs to work for unwinding to work.  But
if we want decent DWARF-esque unwinding, I don't see any way around
that due to the low-level nature of the kernel.

> Worse, your tool will not even recognize the problem and you're in a
> worse place than when you started.

We could have a runtime NMI-based stack checker which ensures it can
always unwind to the bottom of the stack.  Over time this would
hopefully provide full validation of the unwinder data and
functionality.

-- 
Josh

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH 7/7] DWARF: add the config option
  2017-05-09 14:58                 ` Josh Poimboeuf
@ 2017-05-09 16:46                   ` H.J. Lu
  0 siblings, 0 replies; 70+ messages in thread
From: H.J. Lu @ 2017-05-09 16:46 UTC (permalink / raw)
  To: Josh Poimboeuf
  Cc: H. Peter Anvin, Andy Lutomirski, Linus Torvalds, Jiri Slaby,
	Andrew Morton, live-patching, Linux Kernel Mailing List,
	Thomas Gleixner, Ingo Molnar, the arch/x86 maintainers,
	Jiri Kosina

On Tue, May 9, 2017 at 7:58 AM, Josh Poimboeuf <jpoimboe@redhat.com> wrote:
> On Tue, May 09, 2017 at 03:00:45AM -0700, hpa@zytor.com wrote:
>> I'm, ahem, highly skeptical to creating our own unwinding data format
>> unless there is *documented, supported, and tested* way to force the
>> compiler to *automatically* fall back to frame pointers any time there
>> may be complexity involved, which at a very minimum includes any kind
>> of data-dependent manipulation of the stack pointer.
>
> That would be nice.  But isn't falling back to a frame pointer (or
> another callee-saved reg or a stack location) already needed in such
> cases?  Otherwise how could DWARF unwinding work?
>
>> Otherwise you will have to fail the kernel build when your static tool
>> runs into instruction sequences it can't deal with, but the compiler
>> may generate them - boom.
>
> Failing the build is harsh, we could just warn about it and skip the
> data for the affected function(s).
>
> BTW, there is another option.  Instead of generating the data from
> scratch, we could just convert gcc's DWARF CFI to the format we need.
>
> However that wouldn't solve the problems we have with the holes and
> inaccuracies in DWARF from our hand-annotated asm, inline asm, and
> special sections (extable, alternatives, etc).  We'd still have to rely
> on objtool for that, so we'd still be in the same boat of needing
> objtool to be able to follow gcc code paths.

CFI directives are documented in GNU assembler manual.  They
store unwind info in .eh_frame section.  They work well with assembly
codes in glibc.  But I don't know how well they work with kernel unwind.

> So yes, it sucks that objtool needs to work for unwinding to work.  But
> if we want decent DWARF-esque unwinding, I don't see any way around
> that due to the low-level nature of the kernel.
>
>> Worse, your tool will not even recognize the problem and you're in a
>> worse place than when you started.
>
> We could have a runtime NMI-based stack checker which ensures it can
> always unwind to the bottom of the stack.  Over time this would
> hopefully provide full validation of the unwinder data and
> functionality.
>
> --
> Josh



-- 
H.J.

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH 7/7] DWARF: add the config option
  2017-05-07 16:55     ` Josh Poimboeuf
                         ` (2 preceding siblings ...)
  2017-05-09  0:21       ` Andy Lutomirski
@ 2017-05-09 18:47       ` Jiri Kosina
  2017-05-09 19:22         ` Josh Poimboeuf
  2017-05-19 20:53       ` Josh Poimboeuf
  4 siblings, 1 reply; 70+ messages in thread
From: Jiri Kosina @ 2017-05-09 18:47 UTC (permalink / raw)
  To: Josh Poimboeuf
  Cc: Linus Torvalds, Jiri Slaby, Andrew Morton, live-patching,
	Linux Kernel Mailing List, Thomas Gleixner, Ingo Molnar,
	H. Peter Anvin, the arch/x86 maintainers, Andy Lutomirski

On Sun, 7 May 2017, Josh Poimboeuf wrote:

> DWARF is great for debuggers.  It helps you find all the registers on 
> the stack, so you can see function arguments and local variables.  All 
> expressed in a nice compact format.
> 
> But that's overkill for unwinders.  We don't need all those registers,
> and the state machine is too complicated.  

OTOH if we make the failures in processing of those "auxiliary" 
information non-fatal (in a sense that it neither causes kernel bug nor 
does it actually corrupt the unwinding process, but the only effect is 
losing "optional" information), having this data available doesn't hurt. 

It's there anyway for builds containing debuginfo, and the information is 
all there so that it can be used by things like gdb or crash, so it seems 
natural to re-use as much as possible of it.

> Unwinders basically only need to know one thing: given an instruction 
> address and a stack pointer, where is the caller's stack frame?

Again, DWARF should be able to give us all of this (including the 
FP-fallback etc). It feels a bit silly to purposedly ignore it and 
reinvent parts of it again, instead of fixing (read: "asking toolchain 
guys to fix") the cases where we actually are not getting the proper data 
in DWARF. That's a win-win at the end of the day.

Thanks,

-- 
Jiri Kosina
SUSE Labs

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH 7/7] DWARF: add the config option
  2017-05-09 18:47       ` Jiri Kosina
@ 2017-05-09 19:22         ` Josh Poimboeuf
  2017-05-10  8:32           ` Jiri Slaby
  0 siblings, 1 reply; 70+ messages in thread
From: Josh Poimboeuf @ 2017-05-09 19:22 UTC (permalink / raw)
  To: Jiri Kosina
  Cc: Linus Torvalds, Jiri Slaby, Andrew Morton, live-patching,
	Linux Kernel Mailing List, Thomas Gleixner, Ingo Molnar,
	H. Peter Anvin, the arch/x86 maintainers, Andy Lutomirski

On Tue, May 09, 2017 at 08:47:50PM +0200, Jiri Kosina wrote:
> On Sun, 7 May 2017, Josh Poimboeuf wrote:
> 
> > DWARF is great for debuggers.  It helps you find all the registers on 
> > the stack, so you can see function arguments and local variables.  All 
> > expressed in a nice compact format.
> > 
> > But that's overkill for unwinders.  We don't need all those registers,
> > and the state machine is too complicated.  
> 
> OTOH if we make the failures in processing of those "auxiliary" 
> information non-fatal (in a sense that it neither causes kernel bug nor 
> does it actually corrupt the unwinding process, but the only effect is 
> losing "optional" information), having this data available doesn't hurt. 

But it does hurt, in the sense that the complicated format of DWARF CFI
means the unwinder has to jump through a lot more hoops to read it.

> It's there anyway for builds containing debuginfo, and the information is 
> all there so that it can be used by things like gdb or crash, so it seems 
> natural to re-use as much as possible of it.

There's a valid argument to be made that we should start with the DWARF
data instead of creating the new data from scratch.  That might be fine.
Right now I don't have a strong feeling about it either way.

But if we do that, we should still convert the DWARF data to a simple
streamlined format for the in-kernel unwinder, so it can easily be read
by the kernel without having to fire up a DWARF state machine in the
middle of an oops.

And if we wanted it to be reasonably reliable, we'd also need to fix up
the DWARF data somehow before converting it, presumably with objtool.

> > Unwinders basically only need to know one thing: given an instruction 
> > address and a stack pointer, where is the caller's stack frame?
> 
> Again, DWARF should be able to give us all of this (including the 
> FP-fallback etc). It feels a bit silly to purposedly ignore it and 
> reinvent parts of it again, instead of fixing (read: "asking toolchain 
> guys to fix") the cases where we actually are not getting the proper data 
> in DWARF. That's a win-win at the end of the day.

Most of the kernel DWARF issues I've seen aren't caused by toolchain
bugs.  They're caused by the kernel's quirks: asm, inline asm, special
sections.

And anyway, fixing the correctness of the DWARF data is only half the
problem IMO.  The other half of the problem is unwinder complexity.

-- 
Josh

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH 7/7] DWARF: add the config option
  2017-05-05 19:57   ` Linus Torvalds
                       ` (2 preceding siblings ...)
  2017-05-07 16:55     ` Josh Poimboeuf
@ 2017-05-10  7:39     ` Jiri Slaby
  2017-05-10 12:42       ` Josh Poimboeuf
  2017-05-10 18:11       ` Linus Torvalds
  3 siblings, 2 replies; 70+ messages in thread
From: Jiri Slaby @ 2017-05-10  7:39 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Andrew Morton, live-patching, Linux Kernel Mailing List,
	Thomas Gleixner, Ingo Molnar, H. Peter Anvin,
	the arch/x86 maintainers

On 05/05/2017, 09:57 PM, Linus Torvalds wrote:
> On Fri, May 5, 2017 at 5:22 AM, Jiri Slaby <jslaby@suse.cz> wrote:
>> The DWARF unwinder is in place and ready. So introduce the config option
>> to allow users to enable it. It is by default off due to missing
>> assembly annotations.
> 
> Who actually ends up using this?

Every SUSE user has been using this for almost a decade and we are not
about to switch to FP for performance reasons as noted by Jiri Kosina.
So SUSE users are going to be exposed to DWARF unwinder for another
decade or so at least.

Therefore, this is another attempt to make the unwinder (in some form)
upstream. Since this was first proposed many years ago, we have been
forced to forward-port it over and over and everyone knows what pain it
is. So it is nice, that this opened the discussion at least.

> Because from the last time we had fancy unwindoers, and all the
> problems it caused for oops handling with absolutely _zero_ upsides
> ever, I do not ever again want to see fancy unwinders with complex
> state machine handling used by the oopsing code.

Well, reliable stack-traces with minimal performance impact thanks to
out-of-band data is hell good reason in my opinion.

> The fact that it gets disabled for KASAN also makes me suspicious. It
> basically means that now all the accesses it does are not even
> validated.

OK, I inclined to disable KASAN when I started cleaning this up for
_performance_ reasons. The system was so slow, that the RCU stall or
soft-lockup detectors came up complaining. From that time, I measured
the bottlenecks and optimized the unwinder so that 1000 iterations of
unwinding takes:

Before:
real    0m1.808s
user    0m0.001s
sys     0m1.807s

After:
real    0m0.018s
user    0m0.001s
sys     0m0.017s

So let me check, whether KASAN still has to be disabled globally. I do
not think so.

OTOH, TBH, I am not sure KASAN can be enabled for dwarf.c, the same as
holds now for the rest of the current unwinding:
KASAN_SANITIZE_dumpstack.o                              := n
KASAN_SANITIZE_dumpstack_$(BITS).o                      := n
KASAN_SANITIZE_stacktrace.o := n

Still, I can let KASAN := y for dwarf.c for testing purposes locally and
smoke-test the unwinder.

> The fact that the most of the code seems to be disabled for the first
> six patches, and then just enabled in the last patch, also seems to
> mean that the series also gets no bisection coverage or testing that
> the individual patches make any sense. (ie there's a lot of code
> inside "CONFIG_DWARF_UNWIND" in the early patches but that config
> option cannot even be enabled until the last patch).

Correct. This was one big patch previously. I separated that patch into
several smaller commits touching different places of the kernel for
easier review.

It does not make sense to test any of the patches separately except the
first. Hence the config option which enables the rest of the series is
the last one. I deemed this as one of possible approaches to split
patches (I have seen this many times in the past.) I can of course
squash this back into a single patch (or two).

> We used to have nasty issues with not just missing dwarf info, but
> also actively *wrong* dwarf info. Compiler versions that generated
> subtly wrong info, because nobody actually really depended on it, and
> the people who had tested it seldom did the kinds of things we do in 
> the kernel (eg inline asms etc).

I must admit I am not aware of any issues in this manner during the
years. Again, this unwinder is the default in SUSE kernels since ever,
so we have been using gcc from at least 3.2 to 7. But see below.

> So I'm personally still very suspicious of these things.
> 
> Last time I had huge issues with people also always blaming *anything*
> else than that unwinder. It was always "oh, somebody wrote asm without
> getting it right". Or "oh, the compiler generated bad tables, it's not
> *my* fault that now the kernel oopsing code no longer works".

Now we have objtool. My objtool clone:
1) verifies the DWARF data (prepared by Josh)

2) generates DWARF data for assembly -- incomplete yet: see the thread
about x86 assembly cleanup which is a pre-requisite for this to work.
This is BTW the reason why the DWARF unwinder is default-off in this
series yet.

And we can add:
3) fix up the data, if they are wrong

That said, objtool could handle the data so they are correct and
as-expected for the unwinder. Without objtool, the data (and unwinder)
is hopeless (only vdso from all the assembly is annotated.)

> When I asked for more stricter debug table validation to avoid issues,
> it was always "oh, we fixed it, no worries", and then two months later
> somebody hit another issue.

Reasonable, indeed. I am all for strict checking. objtool is to do that.

> Put another way; the last time we did crazy stuff like this, it got
> reverted. For a damn good reason, despite some people being in denial
> about those reasons.

Speaking for myself, having it out-of-tree causes me only troubles with
fwd-porting. So I am all ears to find a path to make this upstream and
maintain this there according to opinions of general kernel-community.
(Which reminds me I didn't add an entry to the MAINTAINERS file.)

thanks,
-- 
js
suse labs

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH 7/7] DWARF: add the config option
  2017-05-06  7:19     ` Ingo Molnar
@ 2017-05-10  7:46       ` Jiri Slaby
  0 siblings, 0 replies; 70+ messages in thread
From: Jiri Slaby @ 2017-05-10  7:46 UTC (permalink / raw)
  To: Ingo Molnar, Linus Torvalds, Josh Poimboeuf
  Cc: Andrew Morton, live-patching, Linux Kernel Mailing List,
	Thomas Gleixner, Ingo Molnar, H. Peter Anvin,
	the arch/x86 maintainers

On 05/06/2017, 09:19 AM, Ingo Molnar wrote:
> 
> * Linus Torvalds <torvalds@linux-foundation.org> wrote:
> 
>> On Fri, May 5, 2017 at 5:22 AM, Jiri Slaby <jslaby@suse.cz> wrote:
>>> The DWARF unwinder is in place and ready. So introduce the config option
>>> to allow users to enable it. It is by default off due to missing
>>> assembly annotations.
>>
>> Who actually ends up using this?
> 
> Also, why wasn't Josh Poimboeuf Cc:-ed, who de-facto maintains the x86 unwinding 
> code?

I explicitly CCed Josh on 5/7 and 6/7 which touches the code. Besides
that, I assumed he is implicitly CCed via live-patching@vger.kernel.org
which is carbon-copied on each of the patches.

> AFAICS this series is just repeating the old mistakes of the old Dwarf unwinders 
> of trusting GCC's unwinder data. So NAK for the time being on the whole approach:
> 
> NAcked-by: Ingo Molnar <mingo@kernel.org>

OK, as the series stands now, we indeed do. Noteworthy, we, in SUSE, had
no problems with this reliance for all the time we have been using the
unwinder.

Anyway, objtool is about to vaidate the DWARF data, generate it for
assembly and potentially fix it if problems occur. Could you elaborate
on what else would help you to change your stance?

thanks,
-- 
js
suse labs

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH 7/7] DWARF: add the config option
  2017-05-09 10:00               ` hpa
  2017-05-09 14:58                 ` Josh Poimboeuf
@ 2017-05-10  8:15                 ` Jiri Slaby
  2017-05-10 13:09                   ` Josh Poimboeuf
  1 sibling, 1 reply; 70+ messages in thread
From: Jiri Slaby @ 2017-05-10  8:15 UTC (permalink / raw)
  To: hpa, Josh Poimboeuf, Andy Lutomirski
  Cc: Linus Torvalds, Andrew Morton, live-patching,
	Linux Kernel Mailing List, Thomas Gleixner, Ingo Molnar,
	the arch/x86 maintainers, Jiri Kosina, hjl.tools

On 05/09/2017, 12:00 PM, hpa@zytor.com wrote:
> As far as I understand, the .eh_frame section is supposed to contain the subset of the DWARF bytecode needed to do a stack unwind when an exception is thrown, whereas the .debug* sections contain the full DWARF data a debugger might want.  Thus .eh_frame is mapped into the runtime process while .debug* [usually?] is not.  .debug* can easily be 10x larger than the executable text segments.

As it currently stands, the (same) data is generated either to
.eh_frame, or to .debug_frame. Depending if DWARF_UNWINDER is turned on,
or off, respectively. vdso has the same data in both, always.

> Assembly language routines become problematic no matter what you do unless you restrict the way the assembly can be written.  Experience has shown us that hand-maintaining annotations in assembly code is doomed to failure, and in many cases it isn't even clear to even experienced programmers how to do it.

Sure, manual annotations of assembly will be avoided as much as
possible. We have to rely objtool to generate them in most cases.

>  [H.J.: is the .cfi_* operations set even documented anywhere in a way that non-compiler-writers can comprehend it?]

Until now, I always looked into as manual:
$ pinfo --node='CFI directives' as

> I'm, ahem, highly skeptical to creating our own unwinding data format unless there is *documented, supported, and tested* way to force the compiler to *automatically* fall back to frame pointers any time there may be complexity involved,

I second this. Inventing a new format like this mostly ends up with
using the standard one after several iterations. One cannot think of all
the consequences and needs while proposing a new one.

The memory footprint is ~2M for average vmlinux. And people who need to
access:
* either need it frequently -- those do not need performance (LOCKDEP,
KASAN, or other debug builds)
* or are in the middle of WARNING, BUG, crash, panic or such and this is
not that often...

And we would need *both*. The limited proprietary one in some sort of
.kernel_eh_frame, and DWARF cfis in .debug_frame for tools like crash,
gdb and so on.

And yes, the DWARF unwinder falls back to FP if they are available (see
function dw_fp_fallback).

thanks,
-- 
js
suse labs

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH 7/7] DWARF: add the config option
  2017-05-09 19:22         ` Josh Poimboeuf
@ 2017-05-10  8:32           ` Jiri Slaby
  2017-05-10 13:13             ` Josh Poimboeuf
  2017-05-23  7:07             ` Peter Zijlstra
  0 siblings, 2 replies; 70+ messages in thread
From: Jiri Slaby @ 2017-05-10  8:32 UTC (permalink / raw)
  To: Josh Poimboeuf, Jiri Kosina
  Cc: Linus Torvalds, Andrew Morton, live-patching,
	Linux Kernel Mailing List, Thomas Gleixner, Ingo Molnar,
	H. Peter Anvin, the arch/x86 maintainers, Andy Lutomirski

On 05/09/2017, 09:22 PM, Josh Poimboeuf wrote:
> On Tue, May 09, 2017 at 08:47:50PM +0200, Jiri Kosina wrote:
>> On Sun, 7 May 2017, Josh Poimboeuf wrote:
>>
>>> DWARF is great for debuggers.  It helps you find all the registers on 
>>> the stack, so you can see function arguments and local variables.  All 
>>> expressed in a nice compact format.
>>>
>>> But that's overkill for unwinders.  We don't need all those registers,
>>> and the state machine is too complicated.  
>>
>> OTOH if we make the failures in processing of those "auxiliary" 
>> information non-fatal (in a sense that it neither causes kernel bug nor 
>> does it actually corrupt the unwinding process, but the only effect is 
>> losing "optional" information), having this data available doesn't hurt. 
> 
> But it does hurt, in the sense that the complicated format of DWARF CFI
> means the unwinder has to jump through a lot more hoops to read it.

Why that matters, actually? Unwinder is nothing to be performance
oriented. And if somebody is doing a lot of unwinding during runtime,
they can switch to in-this-case-faster FP unwinder.

> And if we wanted it to be reasonably reliable, we'd also need to fix up
> the DWARF data somehow before converting it, presumably with objtool.

We have to do this anyway. Be it the DWARF info or whatever we end up with.

>>> Unwinders basically only need to know one thing: given an instruction 
>>> address and a stack pointer, where is the caller's stack frame?
>>
>> Again, DWARF should be able to give us all of this (including the 
>> FP-fallback etc). It feels a bit silly to purposedly ignore it and 
>> reinvent parts of it again, instead of fixing (read: "asking toolchain 
>> guys to fix")

And we can just do, if a totally broken compiler emerges:
#if defined(CONFIG_DWARF_UNWINDER) && GCC_VERSION == 59000
#error Sorry, choose a different compiler or disable DWARF unwinder
#endif

We haven't to do this during the past decade and I am sceptic if we
would have to do it in the next one.

>>  the cases where we actually are not getting the proper data 
>> in DWARF. That's a win-win at the end of the day.
> 
> Most of the kernel DWARF issues I've seen aren't caused by toolchain
> bugs.  They're caused by the kernel's quirks: asm, inline asm, special
> sections.

Right.

> And anyway, fixing the correctness of the DWARF data is only half the
> problem IMO.  The other half of the problem is unwinder complexity.

Complex, but generic and working. IMO, it would be rather though to come
up with some tool working on different compilers or even different
versions of gcc. I mean some tool to convert the DWARF data to something
proprietary. The conversion would be as complex as is the unwinder plus
conversion to the proprietary format and its dump into ELF. We would
still rely on a (now out-of-kernel-runtime-code) complex monolith to do
it right.

thanks,
-- 
js
suse labs

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH 7/7] DWARF: add the config option
  2017-05-10  7:39     ` Jiri Slaby
@ 2017-05-10 12:42       ` Josh Poimboeuf
  2017-05-10 12:47         ` Jiri Slaby
  2017-05-10 18:11       ` Linus Torvalds
  1 sibling, 1 reply; 70+ messages in thread
From: Josh Poimboeuf @ 2017-05-10 12:42 UTC (permalink / raw)
  To: Jiri Slaby
  Cc: Linus Torvalds, Andrew Morton, live-patching,
	Linux Kernel Mailing List, Thomas Gleixner, Ingo Molnar,
	H. Peter Anvin, the arch/x86 maintainers

On Wed, May 10, 2017 at 09:39:39AM +0200, Jiri Slaby wrote:
> OTOH, TBH, I am not sure KASAN can be enabled for dwarf.c, the same as
> holds now for the rest of the current unwinding:
> KASAN_SANITIZE_dumpstack.o                              := n
> KASAN_SANITIZE_dumpstack_$(BITS).o                      := n
> KASAN_SANITIZE_stacktrace.o := n

Most of the unwinder code is now in unwind_frame.c, which *does* have
KASAN enabled.

I think the above is leftover from the days before I rewrote the
unwinder.  I'll look at renabling KASAN for those files.

-- 
Josh

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH 7/7] DWARF: add the config option
  2017-05-10 12:42       ` Josh Poimboeuf
@ 2017-05-10 12:47         ` Jiri Slaby
  0 siblings, 0 replies; 70+ messages in thread
From: Jiri Slaby @ 2017-05-10 12:47 UTC (permalink / raw)
  To: Josh Poimboeuf
  Cc: Linus Torvalds, Andrew Morton, live-patching,
	Linux Kernel Mailing List, Thomas Gleixner, Ingo Molnar,
	H. Peter Anvin, the arch/x86 maintainers

On 05/10/2017, 02:42 PM, Josh Poimboeuf wrote:
> On Wed, May 10, 2017 at 09:39:39AM +0200, Jiri Slaby wrote:
>> OTOH, TBH, I am not sure KASAN can be enabled for dwarf.c, the same as
>> holds now for the rest of the current unwinding:
>> KASAN_SANITIZE_dumpstack.o                              := n
>> KASAN_SANITIZE_dumpstack_$(BITS).o                      := n
>> KASAN_SANITIZE_stacktrace.o := n
> 
> Most of the unwinder code is now in unwind_frame.c, which *does* have
> KASAN enabled.
> 
> I think the above is leftover from the days before I rewrote the
> unwinder.  I'll look at renabling KASAN for those files.

Ok, fair enough.

I will do some measurements in the FP field while I will be at it.

thanks,
-- 
js
suse labs

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH 7/7] DWARF: add the config option
  2017-05-10  8:15                 ` Jiri Slaby
@ 2017-05-10 13:09                   ` Josh Poimboeuf
  2017-05-10 16:23                     ` H.J. Lu
  0 siblings, 1 reply; 70+ messages in thread
From: Josh Poimboeuf @ 2017-05-10 13:09 UTC (permalink / raw)
  To: Jiri Slaby
  Cc: hpa, Andy Lutomirski, Linus Torvalds, Andrew Morton,
	live-patching, Linux Kernel Mailing List, Thomas Gleixner,
	Ingo Molnar, the arch/x86 maintainers, Jiri Kosina, hjl.tools

On Wed, May 10, 2017 at 10:15:09AM +0200, Jiri Slaby wrote:
> > I'm, ahem, highly skeptical to creating our own unwinding data
> > format unless there is *documented, supported, and tested* way to
> > force the compiler to *automatically* fall back to frame pointers
> > any time there may be complexity involved,
> 
> I second this. Inventing a new format like this mostly ends up with
> using the standard one after several iterations. One cannot think of all
> the consequences and needs while proposing a new one.
> 
> The memory footprint is ~2M for average vmlinux. And people who need to
> access:
> * either need it frequently -- those do not need performance (LOCKDEP,
> KASAN, or other debug builds)
> * or are in the middle of WARNING, BUG, crash, panic or such and this is
> not that often...
> 
> And we would need *both*. The limited proprietary one in some sort of
> .kernel_eh_frame, and DWARF cfis in .debug_frame for tools like crash,
> gdb and so on.

I don't think so.  DWARF CFI is optimized for size.  My proposal is to
take the same data (or some subset of it) and reformat it to optimize
for simplicity.

If, for some reason, we ended up needing *all* the original DWARF data,
we could still have it in the simpler format.  In that case it might end
up being 8M instead of 2M :-) But I don't see that being possible.

-- 
Josh

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH 7/7] DWARF: add the config option
  2017-05-10  8:32           ` Jiri Slaby
@ 2017-05-10 13:13             ` Josh Poimboeuf
  2017-05-23  7:07             ` Peter Zijlstra
  1 sibling, 0 replies; 70+ messages in thread
From: Josh Poimboeuf @ 2017-05-10 13:13 UTC (permalink / raw)
  To: Jiri Slaby
  Cc: Jiri Kosina, Linus Torvalds, Andrew Morton, live-patching,
	Linux Kernel Mailing List, Thomas Gleixner, Ingo Molnar,
	H. Peter Anvin, the arch/x86 maintainers, Andy Lutomirski

On Wed, May 10, 2017 at 10:32:06AM +0200, Jiri Slaby wrote:
> On 05/09/2017, 09:22 PM, Josh Poimboeuf wrote:
> > On Tue, May 09, 2017 at 08:47:50PM +0200, Jiri Kosina wrote:
> >> On Sun, 7 May 2017, Josh Poimboeuf wrote:
> >>
> >>> DWARF is great for debuggers.  It helps you find all the registers on 
> >>> the stack, so you can see function arguments and local variables.  All 
> >>> expressed in a nice compact format.
> >>>
> >>> But that's overkill for unwinders.  We don't need all those registers,
> >>> and the state machine is too complicated.  
> >>
> >> OTOH if we make the failures in processing of those "auxiliary" 
> >> information non-fatal (in a sense that it neither causes kernel bug nor 
> >> does it actually corrupt the unwinding process, but the only effect is 
> >> losing "optional" information), having this data available doesn't hurt. 
> > 
> > But it does hurt, in the sense that the complicated format of DWARF CFI
> > means the unwinder has to jump through a lot more hoops to read it.
> 
> Why that matters, actually? Unwinder is nothing to be performance
> oriented. And if somebody is doing a lot of unwinding during runtime,
> they can switch to in-this-case-faster FP unwinder.

More complexity == more bugs.

> > And anyway, fixing the correctness of the DWARF data is only half the
> > problem IMO.  The other half of the problem is unwinder complexity.
> 
> Complex, but generic and working. IMO, it would be rather though to come
> up with some tool working on different compilers or even different
> versions of gcc. I mean some tool to convert the DWARF data to something
> proprietary. The conversion would be as complex as is the unwinder plus
> conversion to the proprietary format and its dump into ELF. We would
> still rely on a (now out-of-kernel-runtime-code) complex monolith to do
> it right.

Complexity outside of the kernel is infinitely better than complexity in
mission critical oops code.

-- 
Josh

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH 7/7] DWARF: add the config option
  2017-05-10 13:09                   ` Josh Poimboeuf
@ 2017-05-10 16:23                     ` H.J. Lu
  0 siblings, 0 replies; 70+ messages in thread
From: H.J. Lu @ 2017-05-10 16:23 UTC (permalink / raw)
  To: Josh Poimboeuf
  Cc: Jiri Slaby, H. Peter Anvin, Andy Lutomirski, Linus Torvalds,
	Andrew Morton, live-patching, Linux Kernel Mailing List,
	Thomas Gleixner, Ingo Molnar, the arch/x86 maintainers,
	Jiri Kosina

On Wed, May 10, 2017 at 6:09 AM, Josh Poimboeuf <jpoimboe@redhat.com> wrote:
> On Wed, May 10, 2017 at 10:15:09AM +0200, Jiri Slaby wrote:
>> > I'm, ahem, highly skeptical to creating our own unwinding data
>> > format unless there is *documented, supported, and tested* way to
>> > force the compiler to *automatically* fall back to frame pointers
>> > any time there may be complexity involved,
>>
>> I second this. Inventing a new format like this mostly ends up with
>> using the standard one after several iterations. One cannot think of all
>> the consequences and needs while proposing a new one.
>>
>> The memory footprint is ~2M for average vmlinux. And people who need to
>> access:
>> * either need it frequently -- those do not need performance (LOCKDEP,
>> KASAN, or other debug builds)
>> * or are in the middle of WARNING, BUG, crash, panic or such and this is
>> not that often...
>>
>> And we would need *both*. The limited proprietary one in some sort of
>> .kernel_eh_frame, and DWARF cfis in .debug_frame for tools like crash,
>> gdb and so on.
>
> I don't think so.  DWARF CFI is optimized for size.  My proposal is to
> take the same data (or some subset of it) and reformat it to optimize
> for simplicity.
>
> If, for some reason, we ended up needing *all* the original DWARF data,
> we could still have it in the simpler format.  In that case it might end
> up being 8M instead of 2M :-) But I don't see that being possible.

There is a compact EH for MIPS:

https://github.com/itanium-cxx-abi/cxx-abi/blob/master/MIPSCompactEH.pdf

It can be extended to other targets.

-- 
H.J.

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH 7/7] DWARF: add the config option
  2017-05-10  7:39     ` Jiri Slaby
  2017-05-10 12:42       ` Josh Poimboeuf
@ 2017-05-10 18:11       ` Linus Torvalds
  1 sibling, 0 replies; 70+ messages in thread
From: Linus Torvalds @ 2017-05-10 18:11 UTC (permalink / raw)
  To: Jiri Slaby
  Cc: Andrew Morton, live-patching, Linux Kernel Mailing List,
	Thomas Gleixner, Ingo Molnar, H. Peter Anvin,
	the arch/x86 maintainers

On Wed, May 10, 2017 at 12:39 AM, Jiri Slaby <jslaby@suse.cz> wrote:
>
> Every SUSE user has been using this for almost a decade and we are not
> about to switch to FP for performance reasons as noted by Jiri Kosina.

The whole "not about to switch on frame pointers" argument is bogus.

Lots of people don't have frame pointers. The tracebacks don't look
all that horrible despite that.

If the problem is that some debug option that you want to use do that
"select FRAME_POINTER" thing, then maybe we should just fix that.

For example, I think it's annoying that the LATENCYTOP helper config
option basically forces frame pointers. That just seems stupid. Even
enabling CONFIG_STACKTRACE doesn't do that.

We do fine without frame pointers. Do traces get a bit uglier? Sure.
But that's not a huge deal.

> Well, reliable stack-traces with minimal performance impact thanks to
> out-of-band data is hell good reason in my opinion.

It's not the performance impact.

It's the other crap.

It's the fact that you have a whole state machine that isn't even
used. The only reason for that state machine is for register contents,
but then the register contents aren't actually used by the stack trace
code afaik.

Yeah, in theory that register engine might be used for dynamic stack
sizes too, but I don't think gcc actually generates code like that -
it uses frame pointers for variable-sized stacks, doesn't it?

But historically, it's even more the "oops, the unwind tables are
buggy because the test coverage is horrible, and we walked off into
the weeds while walking them, taking a recursive page fault, which
turned a WARN_ON() into a dead machine that didn't even give us the
information we wanted in the first place".

Now, it may be that with tools like objtool, we might be able to
*validate* the tables, which might actually be a good safety net.

So I'm not saying that the unwinder is a total no-go.

But I want to get rid of these red herring arguments in its favor.

If the argument *for* the unwinder i scrazy bullshit (eg "I want to
use LATENCYTOP without CONFIG_FRAME_POINTER"), then we should fix
those things independently.

>> The fact that it gets disabled for KASAN also makes me suspicious. It
>> basically means that now all the accesses it does are not even
>> validated.
>
> OK, I inclined to disable KASAN when I started cleaning this up for
> _performance_ reasons. The system was so slow, that the RCU stall or
> soft-lockup detectors came up complaining. From that time, I measured
> the bottlenecks and optimized the unwinder so that 1000 iterations of
> unwinding takes:
>
> Before:
> real    0m1.808s
> user    0m0.001s
> sys     0m1.807s
>
> After:
> real    0m0.018s
> user    0m0.001s
> sys     0m0.017s
>
> So let me check, whether KASAN still has to be disabled globally. I do
> not think so.
>
> OTOH, TBH, I am not sure KASAN can be enabled for dwarf.c, the same as
> holds now for the rest of the current unwinding:
> KASAN_SANITIZE_dumpstack.o                              := n
> KASAN_SANITIZE_dumpstack_$(BITS).o                      := n
> KASAN_SANITIZE_stacktrace.o := n

That's fine. But if the unwinder means no KASAN at all, then I don't
think the unwinder is good.

> Now we have objtool. My objtool clone:
> 1) verifies the DWARF data (prepared by Josh)
>
> 2) generates DWARF data for assembly -- incomplete yet: see the thread
> about x86 assembly cleanup which is a pre-requisite for this to work.
> This is BTW the reason why the DWARF unwinder is default-off in this
> series yet.
>
> And we can add:
> 3) fix up the data, if they are wrong

Yes. objtool might make the unwinder acceptable.

One of the things that caused the old unwinder to be absolutely
incredible *crap* was how it turned assembly language (both inline and
separate .S files) from a useful thing to absolutely horrible line
noise that was illegible and unmaintainable, and likely to be buggy to
boot.

So we may be in a different situation that we used to. But still..

              Linus

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH 7/7] DWARF: add the config option
  2017-05-07 16:55     ` Josh Poimboeuf
                         ` (3 preceding siblings ...)
  2017-05-09 18:47       ` Jiri Kosina
@ 2017-05-19 20:53       ` Josh Poimboeuf
  2017-05-19 20:57         ` H. Peter Anvin
  2017-05-19 20:59         ` H. Peter Anvin
  4 siblings, 2 replies; 70+ messages in thread
From: Josh Poimboeuf @ 2017-05-19 20:53 UTC (permalink / raw)
  To: linux-kernel
  Cc: Jiri Slaby, Andrew Morton, live-patching, Thomas Gleixner,
	Ingo Molnar, H. Peter Anvin, the arch/x86 maintainers,
	Andy Lutomirski, Jiri Kosina, Linus Torvalds

On Sun, May 07, 2017 at 11:55:24AM -0500, Josh Poimboeuf wrote:
> I'm thinking/hoping that information can be expressed in a simple, easy
> to parse, reasonably sized data structure.  Something like a sorted
> array of this:
> 
> 	struct undwarf {
> 		unsigned int ip;	/* instruction pointer (relative offset from base) */
> 		unsigned prev_frame:13;	/* offset to previous frame from current stack pointer */
> 		unsigned regs:1;	/* whether prev_frame contains entry regs (regs->ip) */
> 		unsigned align:2;	/* some details for dealing with gcc stack realignment */
> 	} __packed;
> 
> 	extern struct undwarf undwarves[];
> 
> One instance of the structure would exist for each time the stack
> pointer changes, e.g. for every function entry, push/pop, and rsp
> add/subtract.  The data could be assembled and sorted offline, possibly
> derived from DWARF, or more likely, generated by objtool.  After doing
> some rough calculations, I think the section size would be comparable to
> the sizes of the DWARF .eh_frame sections it would replace.
> 
> If it worked, the "undwarf" unwinder would be a lot simpler than a real
> DWARF unwinder.  And validating the sanity of the data at runtime would
> be a lot more straightforward.  It could ensure that each stack pointer
> is within the bounds of the current stack, like our current unwinder
> does.

I've been hacking away at this, and so far it's working well.  The code
is much simpler than a DWARF unwinder.  Right now the kernel piece is
only ~350 lines of code.  The vast majority of the changes are in
objtool.

It's now successfully unwinding through entry code and most other asm
files, dumping entry regs, dealing with aligned stacks, dynamic stacks,
etc.

Here's the struct in its current state:

	#define UNDWARF_REG_UNDEFINED		0
	#define UNDWARF_REG_CFA			1
	#define UNDWARF_REG_SP			2
	#define UNDWARF_REG_FP			3
	#define UNDWARF_REG_SP_INDIRECT		4
	#define UNDWARF_REG_FP_INDIRECT		5
	#define UNDWARF_REG_R10			6
	#define UNDWARF_REG_DI			7
	#define UNDWARF_REG_DX			8

	#define UNDWARF_TYPE_NORMAL		0
	#define UNDWARF_TYPE_REGS		1
	#define UNDWARF_TYPE_REGS_IRET		2

	struct undwarf_state {
		int ip;
		unsigned int len;
		short cfa_offset;
		short fp_offset;
		unsigned cfa_reg:4;
		unsigned fp_reg:4;
		unsigned type:2;
	};

With frame pointers disabled, around 300,000 of those structs are needed
for my kernel, which works out to be 4.7M of data.  By comparison, the
DWARF eh_frame sections would be 2.1M.  I think we should be able to
compress it down to a comparable size by rearranging the data a little
bit.

The entry code needs some annotations to give some hints to objtool
about how to generate the data, but it's not bad:

  https://paste.fedoraproject.org/paste/Xq3bPlx5An0Si7AshZTdkF5M1UNdIGYhyRLivL9gydE=

I still have a lot of work to do on the tooling side: module support,
sorting the undwarf table at build time, and a lot of cleanups.  But
overall it's looking feasible.

-- 
Josh

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH 7/7] DWARF: add the config option
  2017-05-19 20:53       ` Josh Poimboeuf
@ 2017-05-19 20:57         ` H. Peter Anvin
  2017-05-19 20:59         ` H. Peter Anvin
  1 sibling, 0 replies; 70+ messages in thread
From: H. Peter Anvin @ 2017-05-19 20:57 UTC (permalink / raw)
  To: Josh Poimboeuf, linux-kernel
  Cc: Jiri Slaby, Andrew Morton, live-patching, Thomas Gleixner,
	Ingo Molnar, the arch/x86 maintainers, Andy Lutomirski,
	Jiri Kosina, Linus Torvalds

On 05/19/17 13:53, Josh Poimboeuf wrote:
>>
>> One instance of the structure would exist for each time the stack
>> pointer changes, e.g. for every function entry, push/pop, and rsp
>> add/subtract.  The data could be assembled and sorted offline, possibly
>> derived from DWARF, or more likely, generated by objtool.  After doing
>> some rough calculations, I think the section size would be comparable to
>> the sizes of the DWARF .eh_frame sections it would replace.
>>
>> If it worked, the "undwarf" unwinder would be a lot simpler than a real
>> DWARF unwinder.  And validating the sanity of the data at runtime would
>> be a lot more straightforward.  It could ensure that each stack pointer
>> is within the bounds of the current stack, like our current unwinder
>> does.
> 
> I've been hacking away at this, and so far it's working well.  The code
> is much simpler than a DWARF unwinder.  Right now the kernel piece is
> only ~350 lines of code.  The vast majority of the changes are in
> objtool.
> 
> It's now successfully unwinding through entry code and most other asm
> files, dumping entry regs, dealing with aligned stacks, dynamic stacks,
> etc.
> 
> Here's the struct in its current state:

How are you handling control flow?

	-hpa

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH 7/7] DWARF: add the config option
  2017-05-19 20:53       ` Josh Poimboeuf
  2017-05-19 20:57         ` H. Peter Anvin
@ 2017-05-19 20:59         ` H. Peter Anvin
  2017-05-19 21:29           ` Josh Poimboeuf
  1 sibling, 1 reply; 70+ messages in thread
From: H. Peter Anvin @ 2017-05-19 20:59 UTC (permalink / raw)
  To: Josh Poimboeuf, linux-kernel
  Cc: Jiri Slaby, Andrew Morton, live-patching, Thomas Gleixner,
	Ingo Molnar, the arch/x86 maintainers, Andy Lutomirski,
	Jiri Kosina, Linus Torvalds

On 05/19/17 13:53, Josh Poimboeuf wrote:
> 
> Here's the struct in its current state:
> 
> 	#define UNDWARF_REG_UNDEFINED		0
> 	#define UNDWARF_REG_CFA			1
> 	#define UNDWARF_REG_SP			2
> 	#define UNDWARF_REG_FP			3
> 	#define UNDWARF_REG_SP_INDIRECT		4
> 	#define UNDWARF_REG_FP_INDIRECT		5
> 	#define UNDWARF_REG_R10			6
> 	#define UNDWARF_REG_DI			7
> 	#define UNDWARF_REG_DX			8
> 

Why only those registers?  Also, if you have the option I would really
suggest using the actual x86 register numbers (ax, ex, dx, bx, sp, bp,
si, di, r8-r15 in that order.)

	-hpa

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH 7/7] DWARF: add the config option
  2017-05-19 20:59         ` H. Peter Anvin
@ 2017-05-19 21:29           ` Josh Poimboeuf
  2017-05-19 21:35             ` Josh Poimboeuf
  2017-05-22 11:12             ` Ingo Molnar
  0 siblings, 2 replies; 70+ messages in thread
From: Josh Poimboeuf @ 2017-05-19 21:29 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: linux-kernel, Jiri Slaby, Andrew Morton, live-patching,
	Thomas Gleixner, Ingo Molnar, the arch/x86 maintainers,
	Andy Lutomirski, Jiri Kosina, Linus Torvalds

> How are you handling control flow?

Control flow of what?

> > Here's the struct in its current state:
> > 
> > 	#define UNDWARF_REG_UNDEFINED		0
> > 	#define UNDWARF_REG_CFA			1
> > 	#define UNDWARF_REG_SP			2
> > 	#define UNDWARF_REG_FP			3
> > 	#define UNDWARF_REG_SP_INDIRECT		4
> > 	#define UNDWARF_REG_FP_INDIRECT		5
> > 	#define UNDWARF_REG_R10			6
> > 	#define UNDWARF_REG_DI			7
> > 	#define UNDWARF_REG_DX			8
> > 
> 
> Why only those registers?  Also, if you have the option I would really
> suggest using the actual x86 register numbers (ax, ex, dx, bx, sp, bp,
> si, di, r8-r15 in that order.)

Those are the only registers which are ever needed as the base for
finding the previous stack frame.  99% of the time it's sp or bp, the
other registers are needed for aligned stacks and entry code.

Using the actual register numbers isn't an option because I don't need
them all and they need to fit in a small number of bits.

This construct might be useful for other arches, which is why I called
it "FP" instead of "BP".  But then I ruined that with the last 3 :-)

-- 
Josh

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH 7/7] DWARF: add the config option
  2017-05-19 21:29           ` Josh Poimboeuf
@ 2017-05-19 21:35             ` Josh Poimboeuf
  2017-05-20  5:23               ` Andy Lutomirski
  2017-05-26  6:54               ` Jiri Slaby
  2017-05-22 11:12             ` Ingo Molnar
  1 sibling, 2 replies; 70+ messages in thread
From: Josh Poimboeuf @ 2017-05-19 21:35 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: linux-kernel, Jiri Slaby, Andrew Morton, live-patching,
	Thomas Gleixner, Ingo Molnar, the arch/x86 maintainers,
	Andy Lutomirski, Jiri Kosina, Linus Torvalds

On Fri, May 19, 2017 at 04:29:13PM -0500, Josh Poimboeuf wrote:
> > How are you handling control flow?
> 
> Control flow of what?
> 
> > > Here's the struct in its current state:
> > > 
> > > 	#define UNDWARF_REG_UNDEFINED		0
> > > 	#define UNDWARF_REG_CFA			1
> > > 	#define UNDWARF_REG_SP			2
> > > 	#define UNDWARF_REG_FP			3
> > > 	#define UNDWARF_REG_SP_INDIRECT		4
> > > 	#define UNDWARF_REG_FP_INDIRECT		5
> > > 	#define UNDWARF_REG_R10			6
> > > 	#define UNDWARF_REG_DI			7
> > > 	#define UNDWARF_REG_DX			8
> > > 
> > 
> > Why only those registers?  Also, if you have the option I would really
> > suggest using the actual x86 register numbers (ax, ex, dx, bx, sp, bp,
> > si, di, r8-r15 in that order.)
> 
> Those are the only registers which are ever needed as the base for
> finding the previous stack frame.  99% of the time it's sp or bp, the
> other registers are needed for aligned stacks and entry code.
> 
> Using the actual register numbers isn't an option because I don't need
> them all and they need to fit in a small number of bits.
> 
> This construct might be useful for other arches, which is why I called
> it "FP" instead of "BP".  But then I ruined that with the last 3 :-)

BTW, here's the link to the unwinder code if you're interested:

  https://github.com/jpoimboe/linux/blob/undwarf/arch/x86/kernel/unwind_undwarf.c

-- 
Josh

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH 7/7] DWARF: add the config option
  2017-05-19 21:35             ` Josh Poimboeuf
@ 2017-05-20  5:23               ` Andy Lutomirski
  2017-05-20 16:20                 ` Josh Poimboeuf
  2017-05-20 20:16                 ` Linus Torvalds
  2017-05-26  6:54               ` Jiri Slaby
  1 sibling, 2 replies; 70+ messages in thread
From: Andy Lutomirski @ 2017-05-20  5:23 UTC (permalink / raw)
  To: Josh Poimboeuf, H. J. Lu
  Cc: H. Peter Anvin, linux-kernel, Jiri Slaby, Andrew Morton,
	live-patching, Thomas Gleixner, Ingo Molnar,
	the arch/x86 maintainers, Andy Lutomirski, Jiri Kosina,
	Linus Torvalds

On Fri, May 19, 2017 at 2:35 PM, Josh Poimboeuf <jpoimboe@redhat.com> wrote:
> On Fri, May 19, 2017 at 04:29:13PM -0500, Josh Poimboeuf wrote:
>> > How are you handling control flow?
>>
>> Control flow of what?
>>
>> > > Here's the struct in its current state:
>> > >
>> > >   #define UNDWARF_REG_UNDEFINED           0
>> > >   #define UNDWARF_REG_CFA                 1
>> > >   #define UNDWARF_REG_SP                  2
>> > >   #define UNDWARF_REG_FP                  3
>> > >   #define UNDWARF_REG_SP_INDIRECT         4
>> > >   #define UNDWARF_REG_FP_INDIRECT         5
>> > >   #define UNDWARF_REG_R10                 6
>> > >   #define UNDWARF_REG_DI                  7
>> > >   #define UNDWARF_REG_DX                  8
>> > >
>> >
>> > Why only those registers?  Also, if you have the option I would really
>> > suggest using the actual x86 register numbers (ax, ex, dx, bx, sp, bp,
>> > si, di, r8-r15 in that order.)
>>
>> Those are the only registers which are ever needed as the base for
>> finding the previous stack frame.  99% of the time it's sp or bp, the
>> other registers are needed for aligned stacks and entry code.
>>
>> Using the actual register numbers isn't an option because I don't need
>> them all and they need to fit in a small number of bits.
>>
>> This construct might be useful for other arches, which is why I called
>> it "FP" instead of "BP".  But then I ruined that with the last 3 :-)
>
> BTW, here's the link to the unwinder code if you're interested:
>
>   https://github.com/jpoimboe/linux/blob/undwarf/arch/x86/kernel/unwind_undwarf.c

At the risk of potentially overcomplicating matters, here's a
suggestion.  As far as I know, all (or most all?) unwinders
effectively do the following in a loop:

1. Look up the IP to figure out how to unwind from that IP.
2. Use the results of that lookup to compute the previous frame state.

The results of step 1 could perhaps be expressed like this:

struct reg_formula {
  unsigned int source_reg :4;
  long offset;
  bool dereference;  /* true: *(reg + offset); false: (reg + offset) */
  /* For DWARF, I think this can be considerably more complicated, but
I doubt it's useful. */
};

struct unwind_step {
  u16 available_regs;  /* mask of the caller frame regs that we are
able to recover */
  struct reg_formula[16];
};

The CFA computation is just reg_formula[UNWIND_REG_SP] (or that plus
or minus sizeof(unsigned long) or whatever -- I can never remember
exactly what CFA refers to.)  For a frame pointer-based unwinder, the
entire unwind_step is a foregone conclusion independent of IP: SP = BP
+ 8 (or whatever), BP = *(BP + whatever), all other regs unknown.

Could it make sense to actually structure the code this way?  I can
see a few advantages.  It would make the actual meat of the unwind
loop totally independent of the unwinding algorithm in use, it would
make the meat be dead simple (and thus easy to verify for
non-crashiness), and I think it just might open the door for a real
in-kernel DWARF unwinder that Linus would be okay with.  Specifically,
write a function like:

bool get_dwarf_step(struct unwind_step *step, unsigned long ip);

Put this function in its own file and make it buildable as kernel code
or as user code.  Write a test case that runs it on every single
address on the kernel image (in user mode!) with address-sanitizer
enabled (or in Valgrind or both) and make sure that (a) it doesn't
blow up and (b) that the results are credible (e.g. by comparing to
objtool output).  Heck, you could even fuzz-test it where the fuzzer
is allowed to corrupt the actual DWARF data.  You could do the same
thing with whatever crazy super-compacted undwarf scheme someone comes
up with down the road, too.

I personally like the idea of using real DWARF annotations in the
entry code because it makes gdb work better (not kgdb -- real gdb
attached to KVM).  I bet that we could get entry asm annotations into
good shape if we really wanted to.  OTOH, getting DWARF to work well
for inline asm is really nasty IIRC.

(H.J., could we get a binutils feature that allows is to do:

pushq %whatever
.cfi_adjust_sp -8
...
popq %whatever
.cfi_adjust_sp 8

that will emit the right DWARF instructions regardless of whether
there's a frame pointer or not?  .cfi_adjust_cfa_offset is not
particularly helpful here because it's totally wrong if the CFA is
currently being computed based on BP.)


Also, you read the stack like this:

*val = *(unsigned long *)addr;

how about probe_kernel_read() instead?

--Andy

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH 7/7] DWARF: add the config option
  2017-05-20  5:23               ` Andy Lutomirski
@ 2017-05-20 16:20                 ` Josh Poimboeuf
  2017-05-20 17:19                   ` Josh Poimboeuf
  2017-05-20 20:01                   ` H.J. Lu
  2017-05-20 20:16                 ` Linus Torvalds
  1 sibling, 2 replies; 70+ messages in thread
From: Josh Poimboeuf @ 2017-05-20 16:20 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: H. J. Lu, H. Peter Anvin, linux-kernel, Jiri Slaby,
	Andrew Morton, live-patching, Thomas Gleixner, Ingo Molnar,
	the arch/x86 maintainers, Jiri Kosina, Linus Torvalds

On Fri, May 19, 2017 at 10:23:53PM -0700, Andy Lutomirski wrote:
> On Fri, May 19, 2017 at 2:35 PM, Josh Poimboeuf <jpoimboe@redhat.com> wrote:
> > On Fri, May 19, 2017 at 04:29:13PM -0500, Josh Poimboeuf wrote:
> >> > How are you handling control flow?
> >>
> >> Control flow of what?
> >>
> >> > > Here's the struct in its current state:
> >> > >
> >> > >   #define UNDWARF_REG_UNDEFINED           0
> >> > >   #define UNDWARF_REG_CFA                 1
> >> > >   #define UNDWARF_REG_SP                  2
> >> > >   #define UNDWARF_REG_FP                  3
> >> > >   #define UNDWARF_REG_SP_INDIRECT         4
> >> > >   #define UNDWARF_REG_FP_INDIRECT         5
> >> > >   #define UNDWARF_REG_R10                 6
> >> > >   #define UNDWARF_REG_DI                  7
> >> > >   #define UNDWARF_REG_DX                  8
> >> > >
> >> >
> >> > Why only those registers?  Also, if you have the option I would really
> >> > suggest using the actual x86 register numbers (ax, ex, dx, bx, sp, bp,
> >> > si, di, r8-r15 in that order.)
> >>
> >> Those are the only registers which are ever needed as the base for
> >> finding the previous stack frame.  99% of the time it's sp or bp, the
> >> other registers are needed for aligned stacks and entry code.
> >>
> >> Using the actual register numbers isn't an option because I don't need
> >> them all and they need to fit in a small number of bits.
> >>
> >> This construct might be useful for other arches, which is why I called
> >> it "FP" instead of "BP".  But then I ruined that with the last 3 :-)
> >
> > BTW, here's the link to the unwinder code if you're interested:
> >
> >   https://github.com/jpoimboe/linux/blob/undwarf/arch/x86/kernel/unwind_undwarf.c
> 
> At the risk of potentially overcomplicating matters, here's a
> suggestion.  As far as I know, all (or most all?) unwinders
> effectively do the following in a loop:
> 
> 1. Look up the IP to figure out how to unwind from that IP.
> 2. Use the results of that lookup to compute the previous frame state.
> 
> The results of step 1 could perhaps be expressed like this:
> 
> struct reg_formula {
>   unsigned int source_reg :4;
>   long offset;
>   bool dereference;  /* true: *(reg + offset); false: (reg + offset) */
>   /* For DWARF, I think this can be considerably more complicated, but
> I doubt it's useful. */
> };
> 
> struct unwind_step {
>   u16 available_regs;  /* mask of the caller frame regs that we are
> able to recover */
>   struct reg_formula[16];
> };

Ok, so I assume you mean we would need to have an in-kernel DWARF reader
which reads .eh_frame and converts it to the above, which is called for
every step of the unwind phase.

> The CFA computation is just reg_formula[UNWIND_REG_SP] (or that plus
> or minus sizeof(unsigned long) or whatever -- I can never remember
> exactly what CFA refers to.)  For a frame pointer-based unwinder, the
> entire unwind_step is a foregone conclusion independent of IP: SP = BP
> + 8 (or whatever), BP = *(BP + whatever), all other regs unknown.
> 
> Could it make sense to actually structure the code this way?  I can
> see a few advantages.  It would make the actual meat of the unwind
> loop totally independent of the unwinding algorithm in use,

Yes, this part of it is an interesting idea, separating the debuginfo
reading step from the unwinding step.  And for people who don't want to
carry around megs of DWARF data, get_dwarf_step() could just be a fake
lookup which always returns the frame pointer version.  

> it would
> make the meat be dead simple (and thus easy to verify for
> non-crashiness), and I think it just might open the door for a real
> in-kernel DWARF unwinder that Linus would be okay with.  Specifically,
> write a function like:
> 
> bool get_dwarf_step(struct unwind_step *step, unsigned long ip);

If we keep the frame pointer encoding thing for non-DWARF kernels then
we may also need to pass in bp as well.

Or maybe we could force frame pointer users to at least have DWARF data
for the entry code.  Then even frame pointer kernels could detect entry
regs and we could get rid of that nasty frame pointer encoding thing.

> Put this function in its own file and make it buildable as kernel code
> or as user code.  Write a test case that runs it on every single
> address on the kernel image (in user mode!) with address-sanitizer
> enabled (or in Valgrind or both) and make sure that (a) it doesn't
> blow up and (b) that the results are credible (e.g. by comparing to
> objtool output).  Heck, you could even fuzz-test it where the fuzzer
> is allowed to corrupt the actual DWARF data.  You could do the same
> thing with whatever crazy super-compacted undwarf scheme someone comes
> up with down the road, too.

I think your proposal can be separated into two ideas, which can each be
considered on their own merit:

1) Put .eh_frame in the kernel, along with an in-kernel DWARF unwinder.
   Manage the complexity of the unwinder by validating the output of the
   complex part of the algorithm in user space.

   That's a lot of hoops to jump through.  The only real advantage I can
   see is that it would allow us to use the toolchain's DWARF data.  But
   that data is going to have issues anyway because of inline asm,
   special sections (e.g., exception tables), generated code (e.g.,
   bpf), hand-coded asm with missing/incorrect annotations, etc.

   Now, we could use objtool to find such issues and warn about them.
   Then those issues could be corrected, either by hand or
   programmatically.

   But then, if we're going that far, why not just have objtool reformat
   the data into something much simpler?  It already has the knowledge
   to do so.  Then we don't have to jump through all those hoops to
   justify jumping through more hoops in the kernel (i.e., having a
   complex DWARF state machine).  With a simple debuginfo format, the
   kernel unwinder is simple enough that we don't need to validate its
   functionality in a simulator.

2) Make a unified unwinder which uses get_dwarf_step() to abstract out
   the differences between frame pointers and DWARF (or undwarf or
   whatever else).  This could an interesting.  Though I'm not sure how
   it would integrate with our "guess" unwinder for kernels which don't
   have frame pointers or DWARF/undwarf.  I guess we could still keep
   that one separate.

> I personally like the idea of using real DWARF annotations in the
> entry code because it makes gdb work better (not kgdb -- real gdb
> attached to KVM).
>
> I bet that we could get entry asm annotations into
> good shape if we really wanted to.  OTOH, getting DWARF to work well
> for inline asm is really nasty IIRC.
> 
> (H.J., could we get a binutils feature that allows is to do:
> 
> pushq %whatever
> .cfi_adjust_sp -8
> ...
> popq %whatever
> .cfi_adjust_sp 8
> 
> that will emit the right DWARF instructions regardless of whether
> there's a frame pointer or not?  .cfi_adjust_cfa_offset is not
> particularly helpful here because it's totally wrong if the CFA is
> currently being computed based on BP.)

I agree that entry_64.o should have DWARF data, regardless of what the
in-kernel representation of the data looks like.  And the same for all
the other asm .o files.

But how we achieve that is debatable.  In the past, having all the
manual .cfi annotations everywhere for every push/pop was an
unmaintainable disaster.

If we could have binutils do automatic adjustments for pushes and pops
and sp adds/subtracts without the .cfi annotations, that would be great.

Otherwise, objtool can do it.  And it can write both undwarf and DWARF,
if needed.

> Also, you read the stack like this:
> 
> *val = *(unsigned long *)addr;
> 
> how about probe_kernel_read() instead?

It depends on how paranoid you want to be.  The undwarf code has the
level of paranoia the frame pointer code has always had.

In other words, it ensures each dereference is within the bounds of the
current stack.  It relies on task->stack and the percpu exception/irq
stack pointers, as well as the previous stack pointers at the edge of
each stack.

I don't think we've ever had a problem with that level of paranoia, so I
think staying with the status quo might be fine.

If we did use probe_kernel_read() we'd have to use some other form of it
because it does some kasan and hardened usercopy checks which wouldn't
always be appropriate.

-- 
Josh

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH 7/7] DWARF: add the config option
  2017-05-20 16:20                 ` Josh Poimboeuf
@ 2017-05-20 17:19                   ` Josh Poimboeuf
  2017-05-20 20:01                   ` H.J. Lu
  1 sibling, 0 replies; 70+ messages in thread
From: Josh Poimboeuf @ 2017-05-20 17:19 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: H. J. Lu, H. Peter Anvin, linux-kernel, Jiri Slaby,
	Andrew Morton, live-patching, Thomas Gleixner, Ingo Molnar,
	the arch/x86 maintainers, Jiri Kosina, Linus Torvalds

On Sat, May 20, 2017 at 11:20:34AM -0500, Josh Poimboeuf wrote:
>    But then, if we're going that far, why not just have objtool reformat
>    the data into something much simpler?  It already has the knowledge
>    to do so.  Then we don't have to jump through all those hoops to
>    justify jumping through more hoops in the kernel (i.e., having a
>    complex DWARF state machine).  With a simple debuginfo format, the
>    kernel unwinder is simple enough that we don't need to validate its
>    functionality in a simulator.

I should clarify that it doesn't have to be objtool which does this.  It
could instead be a simple DWARF-to-undwarf conversion tool which runs
during the vmlinux linking stage.

Anyway we're both proposing simplifying the DWARF data into an
easier-to-parse format.  I think the question is whether we want that
simplification process to happen in the kernel (in the middle of a
kernel unwind operation), or at build time.

-- 
Josh

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH 7/7] DWARF: add the config option
  2017-05-20 16:20                 ` Josh Poimboeuf
  2017-05-20 17:19                   ` Josh Poimboeuf
@ 2017-05-20 20:01                   ` H.J. Lu
  2017-05-20 21:58                     ` Andy Lutomirski
  2017-05-22 21:07                     ` H. Peter Anvin
  1 sibling, 2 replies; 70+ messages in thread
From: H.J. Lu @ 2017-05-20 20:01 UTC (permalink / raw)
  To: Josh Poimboeuf
  Cc: Andy Lutomirski, H. Peter Anvin, linux-kernel, Jiri Slaby,
	Andrew Morton, live-patching, Thomas Gleixner, Ingo Molnar,
	the arch/x86 maintainers, Jiri Kosina, Linus Torvalds

On Sat, May 20, 2017 at 9:20 AM, Josh Poimboeuf <jpoimboe@redhat.com> wrote:

>>
>> (H.J., could we get a binutils feature that allows is to do:
>>
>> pushq %whatever
>> .cfi_adjust_sp -8
>> ...
>> popq %whatever
>> .cfi_adjust_sp 8
>>

Np.  Compiler needs to generate this.

-- 
H.J.

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH 7/7] DWARF: add the config option
  2017-05-20  5:23               ` Andy Lutomirski
  2017-05-20 16:20                 ` Josh Poimboeuf
@ 2017-05-20 20:16                 ` Linus Torvalds
  2017-05-20 21:56                   ` Andy Lutomirski
  1 sibling, 1 reply; 70+ messages in thread
From: Linus Torvalds @ 2017-05-20 20:16 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Josh Poimboeuf, H. J. Lu, H. Peter Anvin, linux-kernel,
	Jiri Slaby, Andrew Morton, live-patching, Thomas Gleixner,
	Ingo Molnar, the arch/x86 maintainers, Jiri Kosina

On Fri, May 19, 2017 at 10:23 PM, Andy Lutomirski <luto@kernel.org> wrote:
>
> I personally like the idea of using real DWARF annotations in the
> entry code because it makes gdb work better (not kgdb -- real gdb
> attached to KVM).  I bet that we could get entry asm annotations into
> good shape if we really wanted to.  OTOH, getting DWARF to work well
> for inline asm is really nasty IIRC.

No. I will NAK *any* attempt to make our asm contain the crazy
shit-for-brains annotations.

Been there, done that, got the T-shirt, and then doused the T-shirt in
gasoline and put it on fire.

The amount of unreadable crap and bugs it requires is not worth the
pain. Not for *any* amount of gain, and the gain here is basically
zero.

> (H.J., could we get a binutils feature that allows is to do:
>
> pushq %whatever
> .cfi_adjust_sp -8
> ...
> popq %whatever
> .cfi_adjust_sp 8
>
> that will emit the right DWARF instructions regardless of whether
> there's a frame pointer or not?  .cfi_adjust_cfa_offset is not
> particularly helpful here because it's totally wrong if the CFA is
> currently being computed based on BP.)

Yeah, that's just a small example of the kind of crap people have to deal with.

Not going to happen. Assembler files are hard enough to write (and
read) as-is, anything that expects us to go back to the bad old days
with crazy shit cfi annotations is going to get violently NAK'ed and
vetoed forever.

The *only* acceptable model is automated tools (ie objtool). Don't
even bother to try to go any other way. Because I will not accept that
shit.

                 Linus

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH 7/7] DWARF: add the config option
  2017-05-20 20:16                 ` Linus Torvalds
@ 2017-05-20 21:56                   ` Andy Lutomirski
  2017-05-20 23:00                     ` Linus Torvalds
  0 siblings, 1 reply; 70+ messages in thread
From: Andy Lutomirski @ 2017-05-20 21:56 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Andy Lutomirski, Josh Poimboeuf, H. J. Lu, H. Peter Anvin,
	linux-kernel, Jiri Slaby, Andrew Morton, live-patching,
	Thomas Gleixner, Ingo Molnar, the arch/x86 maintainers,
	Jiri Kosina

On Sat, May 20, 2017 at 1:16 PM, Linus Torvalds
<torvalds@linux-foundation.org> wrote:
> On Fri, May 19, 2017 at 10:23 PM, Andy Lutomirski <luto@kernel.org> wrote:
>>
>> I personally like the idea of using real DWARF annotations in the
>> entry code because it makes gdb work better (not kgdb -- real gdb
>> attached to KVM).  I bet that we could get entry asm annotations into
>> good shape if we really wanted to.  OTOH, getting DWARF to work well
>> for inline asm is really nasty IIRC.
>
> No. I will NAK *any* attempt to make our asm contain the crazy
> shit-for-brains annotations.
>
> Been there, done that, got the T-shirt, and then doused the T-shirt in
> gasoline and put it on fire.
>
> The amount of unreadable crap and bugs it requires is not worth the
> pain. Not for *any* amount of gain, and the gain here is basically
> zero.

But what if objtool autogenerated the annotations, perhaps with a tiny
bit of help telling it "hardware frame goes here" or "pt_regs goes
here"?

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH 7/7] DWARF: add the config option
  2017-05-20 20:01                   ` H.J. Lu
@ 2017-05-20 21:58                     ` Andy Lutomirski
  2017-05-20 22:20                       ` H.J. Lu
  2017-05-22 21:07                     ` H. Peter Anvin
  1 sibling, 1 reply; 70+ messages in thread
From: Andy Lutomirski @ 2017-05-20 21:58 UTC (permalink / raw)
  To: H.J. Lu
  Cc: Josh Poimboeuf, Andy Lutomirski, H. Peter Anvin, linux-kernel,
	Jiri Slaby, Andrew Morton, live-patching, Thomas Gleixner,
	Ingo Molnar, the arch/x86 maintainers, Jiri Kosina,
	Linus Torvalds

On Sat, May 20, 2017 at 1:01 PM, H.J. Lu <hjl.tools@gmail.com> wrote:
> On Sat, May 20, 2017 at 9:20 AM, Josh Poimboeuf <jpoimboe@redhat.com> wrote:
>
>>>
>>> (H.J., could we get a binutils feature that allows is to do:
>>>
>>> pushq %whatever
>>> .cfi_adjust_sp -8
>>> ...
>>> popq %whatever
>>> .cfi_adjust_sp 8
>>>
>
> Np.  Compiler needs to generate this.
>

How would the compiler generate this when inline asm is involved?  For
the kernel, objtool could get around the need to have these
annotations, but not so much for user code?  Is the compiler supposed
to parse the inline asm?  Would the compiler provide some magic % code
to represent the current CFA base register?

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH 7/7] DWARF: add the config option
  2017-05-20 21:58                     ` Andy Lutomirski
@ 2017-05-20 22:20                       ` H.J. Lu
  2017-05-22 11:34                         ` Jiri Kosina
  0 siblings, 1 reply; 70+ messages in thread
From: H.J. Lu @ 2017-05-20 22:20 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Josh Poimboeuf, H. Peter Anvin, linux-kernel, Jiri Slaby,
	Andrew Morton, live-patching, Thomas Gleixner, Ingo Molnar,
	the arch/x86 maintainers, Jiri Kosina, Linus Torvalds

On Sat, May 20, 2017 at 2:58 PM, Andy Lutomirski <luto@kernel.org> wrote:
> On Sat, May 20, 2017 at 1:01 PM, H.J. Lu <hjl.tools@gmail.com> wrote:
>> On Sat, May 20, 2017 at 9:20 AM, Josh Poimboeuf <jpoimboe@redhat.com> wrote:
>>
>>>>
>>>> (H.J., could we get a binutils feature that allows is to do:
>>>>
>>>> pushq %whatever
>>>> .cfi_adjust_sp -8
>>>> ...
>>>> popq %whatever
>>>> .cfi_adjust_sp 8
>>>>
>>
>> Np.  Compiler needs to generate this.
>>
>
> How would the compiler generate this when inline asm is involved?  For
> the kernel, objtool could get around the need to have these
> annotations, but not so much for user code?  Is the compiler supposed
> to parse the inline asm?  Would the compiler provide some magic % code
> to represent the current CFA base register?

Here is one example of inline asm with call frame info:

https://sourceware.org/git/?p=glibc.git;a=blob;f=sysdeps/unix/sysv/linux/x86_64/sigaction.c;h=be058bac436d1cc9794b2b03107676ed99f6b872;hb=HEAD

-- 
H.J.

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH 7/7] DWARF: add the config option
  2017-05-20 21:56                   ` Andy Lutomirski
@ 2017-05-20 23:00                     ` Linus Torvalds
  2017-05-20 23:29                       ` Linus Torvalds
  0 siblings, 1 reply; 70+ messages in thread
From: Linus Torvalds @ 2017-05-20 23:00 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Josh Poimboeuf, H. J. Lu, H. Peter Anvin, linux-kernel,
	Jiri Slaby, Andrew Morton, live-patching, Thomas Gleixner,
	Ingo Molnar, the arch/x86 maintainers, Jiri Kosina

On Sat, May 20, 2017 at 2:56 PM, Andy Lutomirski <luto@kernel.org> wrote:
> On Sat, May 20, 2017 at 1:16 PM, Linus Torvalds
> <torvalds@linux-foundation.org> wrote:
>>
>> The amount of unreadable crap and bugs it requires is not worth the
>> pain. Not for *any* amount of gain, and the gain here is basically
>> zero.
>
> But what if objtool autogenerated the annotations, perhaps with a tiny
> bit of help telling it "hardware frame goes here" or "pt_regs goes
> here"?

You snipped the next part of my email, where I said:

> The *only* acceptable model is automated tools (ie objtool). Don't
> even bother to try to go any other way. Because I will not accept that
> shit.

so yes, objtool parsing things on its own is acceptable (and it had
better not need any help - it already checks frame pointer data).

The CFI annotations needed in asm are horrendous. We used to have
them, and we didn't have even _remotely_ complete annotations and
despite that they were
 (a) wrong
 (b) incomplete
 (c) made the asm impossible to read and even worse to modify.

hjl already posted an example of the kinds of horrors glibc does to do
things "right". And those rabbit ears around "right" are there for a
reason. There's no way that is ever right - even if it gets the right
results, it's an unmaintainable piece of crap.

So no, we're never ever adding that CFI garbage back into the kernel.

A tool that can generate it is ok, but even then we should expect
inevitable bugs and not trust the end result blindly.

Because dwarf info is complex enough that other tools have gotten it
wrong many many times. Just google for "gcc bugzilla cfi" or go to the
gcc bugzilla and search for "DWARF" or whatever. It's not "oh, we once
had a bug". It's constant.

One of the reasons I like the idea of having objtool generate
something *simpler* than dwarf is that it not only is much easier to
parse, it has a much higher likelihood of not having some crazy bug.
If objtool mainly looks at the actual instructions, and perhaps uses
dwarf information as additional input and creates something much
simpler than dwarf, it might have a chance in hell of occasionally
even getting it right.

Because dwarf information is really really complicated. It's
complicated because it contains *way* more information than just how
to find the next stack frame.

I mean, it has basically a RPN interpreter built in, and that's the
_simple_ part.

                  Linus

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH 7/7] DWARF: add the config option
  2017-05-20 23:00                     ` Linus Torvalds
@ 2017-05-20 23:29                       ` Linus Torvalds
  0 siblings, 0 replies; 70+ messages in thread
From: Linus Torvalds @ 2017-05-20 23:29 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Josh Poimboeuf, H. J. Lu, H. Peter Anvin, linux-kernel,
	Jiri Slaby, Andrew Morton, live-patching, Thomas Gleixner,
	Ingo Molnar, the arch/x86 maintainers, Jiri Kosina

On Sat, May 20, 2017 at 4:00 PM, Linus Torvalds
<torvalds@linux-foundation.org> wrote:
>
> hjl already posted an example of the kinds of horrors glibc does to do
> things "right".

Side note: we'd hopefully/presumably never need anything _that_
disgusting for the kernel, so hjl's example is probably an extreme
one.

But even when we just did the pushq/popq_cfi macros etc to try to have
simple and reasonably legible annotations for the common cases, it got
pretty ugly.

It wasn't that extreme glibc kind of "50 lines of ugly for two
instructions of code", but it was pretty bad. And as far as I know we
never even tried to annotate places where we did "pushf/pop %reg" in
inline asm (for saving/restoring flags)

               Linus

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH 7/7] DWARF: add the config option
  2017-05-19 21:29           ` Josh Poimboeuf
  2017-05-19 21:35             ` Josh Poimboeuf
@ 2017-05-22 11:12             ` Ingo Molnar
  2017-05-22 21:16               ` H. Peter Anvin
  1 sibling, 1 reply; 70+ messages in thread
From: Ingo Molnar @ 2017-05-22 11:12 UTC (permalink / raw)
  To: Josh Poimboeuf
  Cc: H. Peter Anvin, linux-kernel, Jiri Slaby, Andrew Morton,
	live-patching, Thomas Gleixner, Ingo Molnar,
	the arch/x86 maintainers, Andy Lutomirski, Jiri Kosina,
	Linus Torvalds


* Josh Poimboeuf <jpoimboe@redhat.com> wrote:

> > How are you handling control flow?
> 
> Control flow of what?
> 
> > > Here's the struct in its current state:
> > > 
> > > 	#define UNDWARF_REG_UNDEFINED		0
> > > 	#define UNDWARF_REG_CFA			1
> > > 	#define UNDWARF_REG_SP			2
> > > 	#define UNDWARF_REG_FP			3
> > > 	#define UNDWARF_REG_SP_INDIRECT		4
> > > 	#define UNDWARF_REG_FP_INDIRECT		5
> > > 	#define UNDWARF_REG_R10			6
> > > 	#define UNDWARF_REG_DI			7
> > > 	#define UNDWARF_REG_DX			8
> > > 
> > 
> > Why only those registers?  Also, if you have the option I would really
> > suggest using the actual x86 register numbers (ax, ex, dx, bx, sp, bp,
> > si, di, r8-r15 in that order.)
> 
> Those are the only registers which are ever needed as the base for
> finding the previous stack frame.  99% of the time it's sp or bp, the
> other registers are needed for aligned stacks and entry code.
> 
> Using the actual register numbers isn't an option because I don't need
> them all and they need to fit in a small number of bits.
> 
> This construct might be useful for other arches, which is why I called
> it "FP" instead of "BP".  But then I ruined that with the last 3 :-)

Please call it BP - 'FP' can easily be read as floating-point, making it all 
super-confusing. We should use canonical x86 register names and ordering - even
if not all registers are used straight away.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH 7/7] DWARF: add the config option
  2017-05-20 22:20                       ` H.J. Lu
@ 2017-05-22 11:34                         ` Jiri Kosina
  2017-05-22 14:39                           ` H.J. Lu
  0 siblings, 1 reply; 70+ messages in thread
From: Jiri Kosina @ 2017-05-22 11:34 UTC (permalink / raw)
  To: H.J. Lu
  Cc: Andy Lutomirski, Josh Poimboeuf, H. Peter Anvin, linux-kernel,
	Jiri Slaby, Andrew Morton, live-patching, Thomas Gleixner,
	Ingo Molnar, the arch/x86 maintainers, Linus Torvalds

On Sat, 20 May 2017, H.J. Lu wrote:

> >>>> pushq %whatever
> >>>> .cfi_adjust_sp -8
> >>>> ...
> >>>> popq %whatever
> >>>> .cfi_adjust_sp 8
> >>>>
> >>
> >> Np.  Compiler needs to generate this.
> >>
> >
> > How would the compiler generate this when inline asm is involved?  For
> > the kernel, objtool could get around the need to have these
> > annotations, but not so much for user code?  Is the compiler supposed
> > to parse the inline asm?  Would the compiler provide some magic % code
> > to represent the current CFA base register?
> 
> Here is one example of inline asm with call frame info:
> 
> https://sourceware.org/git/?p=glibc.git;a=blob;f=sysdeps/unix/sysv/linux/x86_64/sigaction.c;h=be058bac436d1cc9794b2b03107676ed99f6b872;hb=HEAD

That brings us basically pretty close to square one though; having to 
maintain "manual" anotations. Something we're pretty much trying to avoid 
through this excercise.

-- 
Jiri Kosina
SUSE Labs

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH 7/7] DWARF: add the config option
  2017-05-22 11:34                         ` Jiri Kosina
@ 2017-05-22 14:39                           ` H.J. Lu
  0 siblings, 0 replies; 70+ messages in thread
From: H.J. Lu @ 2017-05-22 14:39 UTC (permalink / raw)
  To: Jiri Kosina
  Cc: Andy Lutomirski, Josh Poimboeuf, H. Peter Anvin, linux-kernel,
	Jiri Slaby, Andrew Morton, live-patching, Thomas Gleixner,
	Ingo Molnar, the arch/x86 maintainers, Linus Torvalds

On Mon, May 22, 2017 at 4:34 AM, Jiri Kosina <jikos@kernel.org> wrote:
> On Sat, 20 May 2017, H.J. Lu wrote:
>
>> >>>> pushq %whatever
>> >>>> .cfi_adjust_sp -8
>> >>>> ...
>> >>>> popq %whatever
>> >>>> .cfi_adjust_sp 8
>> >>>>
>> >>
>> >> Np.  Compiler needs to generate this.
>> >>
>> >
>> > How would the compiler generate this when inline asm is involved?  For
>> > the kernel, objtool could get around the need to have these
>> > annotations, but not so much for user code?  Is the compiler supposed
>> > to parse the inline asm?  Would the compiler provide some magic % code
>> > to represent the current CFA base register?
>>
>> Here is one example of inline asm with call frame info:
>>
>> https://sourceware.org/git/?p=glibc.git;a=blob;f=sysdeps/unix/sysv/linux/x86_64/sigaction.c;h=be058bac436d1cc9794b2b03107676ed99f6b872;hb=HEAD
>
> That brings us basically pretty close to square one though; having to
> maintain "manual" anotations. Something we're pretty much trying to avoid
> through this excercise.

Assembler only encodes instructions.  You need to a different tool
to figure out what an instruction does.


-- 
H.J.

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH 7/7] DWARF: add the config option
  2017-05-20 20:01                   ` H.J. Lu
  2017-05-20 21:58                     ` Andy Lutomirski
@ 2017-05-22 21:07                     ` H. Peter Anvin
  2017-05-22 21:37                       ` H. Peter Anvin
  1 sibling, 1 reply; 70+ messages in thread
From: H. Peter Anvin @ 2017-05-22 21:07 UTC (permalink / raw)
  To: H.J. Lu, Josh Poimboeuf
  Cc: Andy Lutomirski, linux-kernel, Jiri Slaby, Andrew Morton,
	live-patching, Thomas Gleixner, Ingo Molnar,
	the arch/x86 maintainers, Jiri Kosina, Linus Torvalds

On 05/20/17 13:01, H.J. Lu wrote:
> On Sat, May 20, 2017 at 9:20 AM, Josh Poimboeuf <jpoimboe@redhat.com> wrote:
> 
>>>
>>> (H.J., could we get a binutils feature that allows is to do:
>>>
>>> pushq %whatever
>>> .cfi_adjust_sp -8
>>> ...
>>> popq %whatever
>>> .cfi_adjust_sp 8
>>>
> 
> Np.  Compiler needs to generate this.
> 

For actual assembly we have such a feature, it is called macros.

push/pop is the easy stuff; macros take care of that, but the real pain
is dealing with the flow of control.

	-hpa

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH 7/7] DWARF: add the config option
  2017-05-22 11:12             ` Ingo Molnar
@ 2017-05-22 21:16               ` H. Peter Anvin
  2017-05-22 23:23                 ` Jiri Kosina
  2017-05-23  5:49                 ` Ingo Molnar
  0 siblings, 2 replies; 70+ messages in thread
From: H. Peter Anvin @ 2017-05-22 21:16 UTC (permalink / raw)
  To: Ingo Molnar, Josh Poimboeuf
  Cc: linux-kernel, Jiri Slaby, Andrew Morton, live-patching,
	Thomas Gleixner, Ingo Molnar, the arch/x86 maintainers,
	Andy Lutomirski, Jiri Kosina, Linus Torvalds

On 05/22/17 04:12, Ingo Molnar wrote:
\>>
>> This construct might be useful for other arches, which is why I called
>> it "FP" instead of "BP".  But then I ruined that with the last 3 :-)
> 
> Please call it BP - 'FP' can easily be read as floating-point, making it all 
> super-confusing. We should use canonical x86 register names and ordering - even
> if not all registers are used straight away.
> 

Seriously, I suspect that at the end of the day we will have reinvented
DWARF.

	-hpa

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH 7/7] DWARF: add the config option
  2017-05-22 21:07                     ` H. Peter Anvin
@ 2017-05-22 21:37                       ` H. Peter Anvin
  2017-05-22 22:11                         ` Josh Poimboeuf
  0 siblings, 1 reply; 70+ messages in thread
From: H. Peter Anvin @ 2017-05-22 21:37 UTC (permalink / raw)
  To: H.J. Lu, Josh Poimboeuf
  Cc: Andy Lutomirski, linux-kernel, Jiri Slaby, Andrew Morton,
	live-patching, Thomas Gleixner, Ingo Molnar,
	the arch/x86 maintainers, Jiri Kosina, Linus Torvalds

On 05/22/17 14:07, H. Peter Anvin wrote:
> On 05/20/17 13:01, H.J. Lu wrote:
>> On Sat, May 20, 2017 at 9:20 AM, Josh Poimboeuf <jpoimboe@redhat.com> wrote:
>>
>>>>
>>>> (H.J., could we get a binutils feature that allows is to do:
>>>>
>>>> pushq %whatever
>>>> .cfi_adjust_sp -8
>>>> ...
>>>> popq %whatever
>>>> .cfi_adjust_sp 8
>>>>
>>
>> Np.  Compiler needs to generate this.
>>
> 
> For actual assembly we have such a feature, it is called macros.
> 
> push/pop is the easy stuff; macros take care of that, but the real pain
> is dealing with the flow of control.
> 

My biggest beef with the CFI directives that gas uses is that there is
that .cfi_remember_state/.cfi_restore_state doesn't have a way to
specify more than one state.  That makes it really hard to get sanity
around control flow changes, especially with code that is intentionally
out of line.

That, and some of the CFI directives seem to be a bit ill-defined in
their definition (are they even applicable to anything other than
DWARF?)  They almost seem to be referencing some external specification,
but the only thing I'm finding is the DWARF documentation which is
written in very different terms.

The best description of what a personality routine is I found in an
article by Ian Lance Taylor.  It doesn't seem to be applicable to C as
far as I can tell.

	-hpa

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH 7/7] DWARF: add the config option
  2017-05-22 21:37                       ` H. Peter Anvin
@ 2017-05-22 22:11                         ` Josh Poimboeuf
  0 siblings, 0 replies; 70+ messages in thread
From: Josh Poimboeuf @ 2017-05-22 22:11 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: H.J. Lu, Andy Lutomirski, linux-kernel, Jiri Slaby,
	Andrew Morton, live-patching, Thomas Gleixner, Ingo Molnar,
	the arch/x86 maintainers, Jiri Kosina, Linus Torvalds

On Mon, May 22, 2017 at 02:37:50PM -0700, H. Peter Anvin wrote:
> On 05/22/17 14:07, H. Peter Anvin wrote:
> > On 05/20/17 13:01, H.J. Lu wrote:
> >> On Sat, May 20, 2017 at 9:20 AM, Josh Poimboeuf <jpoimboe@redhat.com> wrote:
> >>
> >>>>
> >>>> (H.J., could we get a binutils feature that allows is to do:
> >>>>
> >>>> pushq %whatever
> >>>> .cfi_adjust_sp -8
> >>>> ...
> >>>> popq %whatever
> >>>> .cfi_adjust_sp 8
> >>>>
> >>
> >> Np.  Compiler needs to generate this.
> >>
> > 
> > For actual assembly we have such a feature, it is called macros.
> > 
> > push/pop is the easy stuff; macros take care of that, but the real pain
> > is dealing with the flow of control.
> > 
> 
> My biggest beef with the CFI directives that gas uses is that there is
> that .cfi_remember_state/.cfi_restore_state doesn't have a way to
> specify more than one state.  That makes it really hard to get sanity
> around control flow changes, especially with code that is intentionally
> out of line.
> 
> That, and some of the CFI directives seem to be a bit ill-defined in
> their definition (are they even applicable to anything other than
> DWARF?)  They almost seem to be referencing some external specification,
> but the only thing I'm finding is the DWARF documentation which is
> written in very different terms.
> 
> The best description of what a personality routine is I found in an
> article by Ian Lance Taylor.  It doesn't seem to be applicable to C as
> far as I can tell.

So my understanding is that there's stock DWARF (.debug_frame) and then
there's souped-up DWARF (.eh_frame), which is basically DWARF with a few
extensions.

The remember/restore state thing is stock DWARF (DW_CFA_remember_state
and DW_CFA_restore_state).  The personality routine thing is in the
.eh_frame extension which is documented here:

  http://refspecs.linuxfoundation.org/LSB_5.0.0/LSB-Core-generic/LSB-Core-generic/ehframechpt.html

-- 
Josh

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH 7/7] DWARF: add the config option
  2017-05-22 21:16               ` H. Peter Anvin
@ 2017-05-22 23:23                 ` Jiri Kosina
  2017-05-23  5:49                 ` Ingo Molnar
  1 sibling, 0 replies; 70+ messages in thread
From: Jiri Kosina @ 2017-05-22 23:23 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Ingo Molnar, Josh Poimboeuf, linux-kernel, Jiri Slaby,
	Andrew Morton, live-patching, Thomas Gleixner, Ingo Molnar,
	the arch/x86 maintainers, Andy Lutomirski, Linus Torvalds

On Mon, 22 May 2017, H. Peter Anvin wrote:

> > Please call it BP - 'FP' can easily be read as floating-point, making 
> > it all super-confusing. We should use canonical x86 register names and 
> > ordering - even if not all registers are used straight away.
> 
> Seriously, I suspect that at the end of the day we will have reinvented
> DWARF.

If this is the fear, what is the proposal then? We simply have to deal 
with !FRAME_POINTER builds.

Thanks,

-- 
Jiri Kosina
SUSE Labs

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH 7/7] DWARF: add the config option
  2017-05-22 21:16               ` H. Peter Anvin
  2017-05-22 23:23                 ` Jiri Kosina
@ 2017-05-23  5:49                 ` Ingo Molnar
  2017-05-26 19:16                   ` hpa
  1 sibling, 1 reply; 70+ messages in thread
From: Ingo Molnar @ 2017-05-23  5:49 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Josh Poimboeuf, linux-kernel, Jiri Slaby, Andrew Morton,
	live-patching, Thomas Gleixner, Ingo Molnar,
	the arch/x86 maintainers, Andy Lutomirski, Jiri Kosina,
	Linus Torvalds, H. Peter Anvin, Peter Zijlstra


* H. Peter Anvin <hpa@zytor.com> wrote:

> On 05/22/17 04:12, Ingo Molnar wrote:
> \>>
> >> This construct might be useful for other arches, which is why I called
> >> it "FP" instead of "BP".  But then I ruined that with the last 3 :-)
> > 
> > Please call it BP - 'FP' can easily be read as floating-point, making it all 
> > super-confusing. We should use canonical x86 register names and ordering - even
> > if not all registers are used straight away.
> > 
> 
> Seriously, I suspect that at the end of the day we will have reinvented
> DWARF.

Absolutely - the main difference is:

 - the debug-info implementation is _internal_ to the kernel so it can be fixed
   instead of "oh, wait 2 years for the toolchain to fix this particular bug, work
   it around in the kernel meanwhile" kind of crazy flow and dependencies. I.e. 
   the debug-info generation and parsing code is both part of the kernel Git tree 
   and can be iterated (and fixed) at once with.

 - the debug-info is auto-generated for assembly as well, leaving assembly code 
   maintainable.

 - the debug-info has a sane data structure designed for robustness and 
   compactness

So even if it's a subset of the existing complexity of dwarf et al we are still 
literally infinitely better off with this model.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH 7/7] DWARF: add the config option
  2017-05-10  8:32           ` Jiri Slaby
  2017-05-10 13:13             ` Josh Poimboeuf
@ 2017-05-23  7:07             ` Peter Zijlstra
  2017-05-23  7:27               ` Ingo Molnar
  1 sibling, 1 reply; 70+ messages in thread
From: Peter Zijlstra @ 2017-05-23  7:07 UTC (permalink / raw)
  To: Jiri Slaby
  Cc: Josh Poimboeuf, Jiri Kosina, Linus Torvalds, Andrew Morton,
	live-patching, Linux Kernel Mailing List, Thomas Gleixner,
	Ingo Molnar, H. Peter Anvin, the arch/x86 maintainers,
	Andy Lutomirski

On Wed, May 10, 2017 at 10:32:06AM +0200, Jiri Slaby wrote:
> > But it does hurt, in the sense that the complicated format of DWARF CFI
> > means the unwinder has to jump through a lot more hoops to read it.
> 
> Why that matters, actually? Unwinder is nothing to be performance
> oriented. And if somebody is doing a lot of unwinding during runtime,
> they can switch to in-this-case-faster FP unwinder.

perf (and ftrace) like the unwinder to be considered performance
oriented.

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH 7/7] DWARF: add the config option
  2017-05-23  7:07             ` Peter Zijlstra
@ 2017-05-23  7:27               ` Ingo Molnar
  0 siblings, 0 replies; 70+ messages in thread
From: Ingo Molnar @ 2017-05-23  7:27 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Jiri Slaby, Josh Poimboeuf, Jiri Kosina, Linus Torvalds,
	Andrew Morton, live-patching, Linux Kernel Mailing List,
	Thomas Gleixner, Ingo Molnar, H. Peter Anvin,
	the arch/x86 maintainers, Andy Lutomirski


* Peter Zijlstra <peterz@infradead.org> wrote:

> On Wed, May 10, 2017 at 10:32:06AM +0200, Jiri Slaby wrote:
> > > But it does hurt, in the sense that the complicated format of DWARF CFI
> > > means the unwinder has to jump through a lot more hoops to read it.
> > 
> > Why that matters, actually? Unwinder is nothing to be performance
> > oriented. And if somebody is doing a lot of unwinding during runtime,
> > they can switch to in-this-case-faster FP unwinder.
> 
> perf (and ftrace) like the unwinder to be considered performance
> oriented.

Yes, and given how critical debugging is there's a kind of useful synergy here: 
overall the perf unwinder is run about 10 orders of magnitude more often than the 
debugging unwinder, so it's a very big help in shaking out unwinder/debug-info 
bugs and increasing robustness overall.

The 'price' for using the unwinder in perf is that it has to be fast.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH 7/7] DWARF: add the config option
  2017-05-19 21:35             ` Josh Poimboeuf
  2017-05-20  5:23               ` Andy Lutomirski
@ 2017-05-26  6:54               ` Jiri Slaby
  2017-05-26 11:29                 ` Jiri Slaby
  1 sibling, 1 reply; 70+ messages in thread
From: Jiri Slaby @ 2017-05-26  6:54 UTC (permalink / raw)
  To: Josh Poimboeuf, H. Peter Anvin
  Cc: linux-kernel, Andrew Morton, live-patching, Thomas Gleixner,
	Ingo Molnar, the arch/x86 maintainers, Andy Lutomirski,
	Jiri Kosina, Linus Torvalds

On 05/19/2017, 11:35 PM, Josh Poimboeuf wrote:
>   https://github.com/jpoimboe/linux/blob/undwarf/arch/x86/kernel/unwind_undwarf.c

JFYI, it crashes in sha1_transform_avx due to crypto changes. You
perhaps missed that this beast uses ebp (not rbp) register for
computations. I had to do:

--- a/arch/x86/crypto/sha1_ssse3_asm.S
+++ b/arch/x86/crypto/sha1_ssse3_asm.S
@@ -37,7 +37,7 @@
 #define REG_A  %ecx
 #define REG_B  %esi
 #define REG_C  %edi
-#define REG_D  %ebp
+#define REG_D  %r12d
 #define REG_E  %edx

 #define REG_T1 %eax
@@ -74,6 +74,7 @@
        SYM_FUNC_START(\name)

        push    %rbx
+       push    %r12
        push    %rbp

        mov     %rsp, %rbp
@@ -99,6 +100,7 @@
        rep stosq

        leaveq                          # deallocate workspace
+       pop     %r12
        pop     %rbx
        ret


I am afraid there are more of these, e.g. in aesni-intel_asm.S.

thanks,
-- 
js
suse labs

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH 7/7] DWARF: add the config option
  2017-05-26  6:54               ` Jiri Slaby
@ 2017-05-26 11:29                 ` Jiri Slaby
  2017-05-26 12:14                   ` Josh Poimboeuf
  0 siblings, 1 reply; 70+ messages in thread
From: Jiri Slaby @ 2017-05-26 11:29 UTC (permalink / raw)
  To: Josh Poimboeuf, H. Peter Anvin
  Cc: linux-kernel, Andrew Morton, live-patching, Thomas Gleixner,
	Ingo Molnar, the arch/x86 maintainers, Andy Lutomirski,
	Jiri Kosina, Linus Torvalds

On 05/26/2017, 08:54 AM, Jiri Slaby wrote:
> On 05/19/2017, 11:35 PM, Josh Poimboeuf wrote:
>>   https://github.com/jpoimboe/linux/blob/undwarf/arch/x86/kernel/unwind_undwarf.c
> 
> JFYI, it crashes in sha1_transform_avx due to crypto changes. You
> perhaps missed that this beast uses ebp (not rbp) register for
> computations. I had to do:
> 
> --- a/arch/x86/crypto/sha1_ssse3_asm.S
> +++ b/arch/x86/crypto/sha1_ssse3_asm.S
> @@ -37,7 +37,7 @@
>  #define REG_A  %ecx
>  #define REG_B  %esi
>  #define REG_C  %edi
> -#define REG_D  %ebp
> +#define REG_D  %r12d
>  #define REG_E  %edx
> 
>  #define REG_T1 %eax
> @@ -74,6 +74,7 @@
>         SYM_FUNC_START(\name)
> 
>         push    %rbx
> +       push    %r12
>         push    %rbp
> 
>         mov     %rsp, %rbp
> @@ -99,6 +100,7 @@
>         rep stosq
> 
>         leaveq                          # deallocate workspace
> +       pop     %r12
>         pop     %rbx
>         ret
> 
> 
> I am afraid there are more of these, e.g. in aesni-intel_asm.S.

aesni-intel_asm.S is OK -- only untouched x86_32 part uses ebp.

But sha1_avx2_x86_64_asm.S is not. They use *all* usable registers
including ebp in the computations hidden behind the
SHA1_PIPELINED_MAIN_BODY macro. The only work around I can see is to
push rbp/pop rbp around the computation as it used to do with rbx:

--- a/arch/x86/crypto/sha1_avx2_x86_64_asm.S
+++ b/arch/x86/crypto/sha1_avx2_x86_64_asm.S
@@ -636,6 +636,7 @@ _loop3:
        /* Align stack */
        mov     %rsp, %rbp
        and     $~(0x20-1), %rsp
+       push    %rbp
        sub     $RESERVE_STACK, %rsp

        avx2_zeroupper
@@ -661,6 +662,7 @@ _loop3:
        avx2_zeroupper

        add     $RESERVE_STACK, %rsp
+       pop     %rbp

        leaveq
        pop     %r15

regards,
-- 
js
suse labs

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH 7/7] DWARF: add the config option
  2017-05-26 11:29                 ` Jiri Slaby
@ 2017-05-26 12:14                   ` Josh Poimboeuf
  0 siblings, 0 replies; 70+ messages in thread
From: Josh Poimboeuf @ 2017-05-26 12:14 UTC (permalink / raw)
  To: Jiri Slaby
  Cc: H. Peter Anvin, linux-kernel, Andrew Morton, live-patching,
	Thomas Gleixner, Ingo Molnar, the arch/x86 maintainers,
	Andy Lutomirski, Jiri Kosina, Linus Torvalds

On Fri, May 26, 2017 at 01:29:01PM +0200, Jiri Slaby wrote:
> On 05/26/2017, 08:54 AM, Jiri Slaby wrote:
> > On 05/19/2017, 11:35 PM, Josh Poimboeuf wrote:
> >>   https://github.com/jpoimboe/linux/blob/undwarf/arch/x86/kernel/unwind_undwarf.c
> > 
> > JFYI, it crashes in sha1_transform_avx due to crypto changes. You
> > perhaps missed that this beast uses ebp (not rbp) register for
> > computations. I had to do:
> > 
> > --- a/arch/x86/crypto/sha1_ssse3_asm.S
> > +++ b/arch/x86/crypto/sha1_ssse3_asm.S
> > @@ -37,7 +37,7 @@
> >  #define REG_A  %ecx
> >  #define REG_B  %esi
> >  #define REG_C  %edi
> > -#define REG_D  %ebp
> > +#define REG_D  %r12d
> >  #define REG_E  %edx
> > 
> >  #define REG_T1 %eax
> > @@ -74,6 +74,7 @@
> >         SYM_FUNC_START(\name)
> > 
> >         push    %rbx
> > +       push    %r12
> >         push    %rbp
> > 
> >         mov     %rsp, %rbp
> > @@ -99,6 +100,7 @@
> >         rep stosq
> > 
> >         leaveq                          # deallocate workspace
> > +       pop     %r12
> >         pop     %rbx
> >         ret
> > 
> > 
> > I am afraid there are more of these, e.g. in aesni-intel_asm.S.
> 
> aesni-intel_asm.S is OK -- only untouched x86_32 part uses ebp.
> 
> But sha1_avx2_x86_64_asm.S is not. They use *all* usable registers
> including ebp in the computations hidden behind the
> SHA1_PIPELINED_MAIN_BODY macro. The only work around I can see is to
> push rbp/pop rbp around the computation as it used to do with rbx:
> 
> --- a/arch/x86/crypto/sha1_avx2_x86_64_asm.S
> +++ b/arch/x86/crypto/sha1_avx2_x86_64_asm.S
> @@ -636,6 +636,7 @@ _loop3:
>         /* Align stack */
>         mov     %rsp, %rbp
>         and     $~(0x20-1), %rsp
> +       push    %rbp
>         sub     $RESERVE_STACK, %rsp
> 
>         avx2_zeroupper
> @@ -661,6 +662,7 @@ _loop3:
>         avx2_zeroupper
> 
>         add     $RESERVE_STACK, %rsp
> +       pop     %rbp
> 
>         leaveq
>         pop     %r15

Thanks, the first fix looks good.  Is the second one needed though?  It
already pushes rbp before it aligns the stack.

DWARF/undwarf will be immune to these issues, so I'll be moving a lot of
these crypto changes to a separate branch.  They were only in this
branch because the new-and-improved objtool can now find rbp misusage in
leaf functions.

It seems that most of the crypto code is frame pointer ignorant.  IMO,
leaf functions shouldn't be allowed to use rbp because it breaks frame
pointers when preempted by an interrupt.  GCC seems to agree.

I added a check to objtool to find the ones which use rbp badly.  Here
are the ones I see with my config:

  arch/x86/crypto/des3_ede-asm_64.o: warning: objtool: des3_ede_x86_64_crypt_blk uses BP as a scratch register
  arch/x86/crypto/des3_ede-asm_64.o: warning: objtool: des3_ede_x86_64_crypt_blk_3way uses BP as a scratch register
  arch/x86/crypto/blowfish-x86_64-asm_64.o: warning: objtool: __blowfish_enc_blk_4way uses BP as a scratch register
  arch/x86/crypto/blowfish-x86_64-asm_64.o: warning: objtool: blowfish_dec_blk_4way uses BP as a scratch register
  arch/x86/crypto/twofish-x86_64-asm_64-3way.o: warning: objtool: __twofish_enc_blk_3way uses BP as a scratch register
  arch/x86/crypto/twofish-x86_64-asm_64-3way.o: warning: objtool: twofish_dec_blk_3way uses BP as a scratch register
  arch/x86/crypto/sha256-avx2-asm.o: warning: objtool: sha256_transform_rorx uses BP as a scratch register
  arch/x86/crypto/cast5-avx-x86_64-asm_64.o: warning: objtool: __cast5_enc_blk16 uses BP as a scratch register
  arch/x86/crypto/cast5-avx-x86_64-asm_64.o: warning: objtool: __cast5_dec_blk16 uses BP as a scratch register
  arch/x86/crypto/cast6-avx-x86_64-asm_64.o: warning: objtool: __cast6_enc_blk8 uses BP as a scratch register
  arch/x86/crypto/cast6-avx-x86_64-asm_64.o: warning: objtool: __cast6_dec_blk8 uses BP as a scratch register
  arch/x86/crypto/twofish-avx-x86_64-asm_64.o: warning: objtool: __twofish_enc_blk8 uses BP as a scratch register
  arch/x86/crypto/twofish-avx-x86_64-asm_64.o: warning: objtool: __twofish_dec_blk8 uses BP as a scratch register

(And that doesn't include the ones which misuse ebp.)

It may be a challenge to fix some of those which use all available
registers.

-- 
Josh

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH 7/7] DWARF: add the config option
  2017-05-23  5:49                 ` Ingo Molnar
@ 2017-05-26 19:16                   ` hpa
  2017-05-28  9:12                     ` Ingo Molnar
  0 siblings, 1 reply; 70+ messages in thread
From: hpa @ 2017-05-26 19:16 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Josh Poimboeuf, linux-kernel, Jiri Slaby, Andrew Morton,
	live-patching, Thomas Gleixner, Ingo Molnar,
	the arch/x86 maintainers, Andy Lutomirski, Jiri Kosina,
	Linus Torvalds, Peter Zijlstra

On May 22, 2017 10:49:06 PM PDT, Ingo Molnar <mingo@kernel.org> wrote:
>
>* H. Peter Anvin <hpa@zytor.com> wrote:
>
>> On 05/22/17 04:12, Ingo Molnar wrote:
>> \>>
>> >> This construct might be useful for other arches, which is why I
>called
>> >> it "FP" instead of "BP".  But then I ruined that with the last 3
>:-)
>> > 
>> > Please call it BP - 'FP' can easily be read as floating-point,
>making it all 
>> > super-confusing. We should use canonical x86 register names and
>ordering - even
>> > if not all registers are used straight away.
>> > 
>> 
>> Seriously, I suspect that at the end of the day we will have
>reinvented
>> DWARF.
>
>Absolutely - the main difference is:
>
>- the debug-info implementation is _internal_ to the kernel so it can
>be fixed
>instead of "oh, wait 2 years for the toolchain to fix this particular
>bug, work
>it around in the kernel meanwhile" kind of crazy flow and dependencies.
>I.e. 
>the debug-info generation and parsing code is both part of the kernel
>Git tree 
>   and can be iterated (and fixed) at once with.
>
>- the debug-info is auto-generated for assembly as well, leaving
>assembly code 
>   maintainable.
>
>- the debug-info has a sane data structure designed for robustness and 
>   compactness
>
>So even if it's a subset of the existing complexity of dwarf et al we
>are still 
>literally infinitely better off with this model.
>
>Thanks,
>
>	Ingo

This assumes that it actually ends up being feasible for objtool to do so.

It is worth noting that using DWARF for unwinding vs auto-generating the unwind information are independent issues.

Another option is to use (or postprocess) the compiler-generated DWARF for C modules and pursue autogeneration only for assembly modules, which ought to be a much easier problem and is less dependent on discovering new compiler-generated patterns.
-- 
Sent from my Android device with K-9 Mail. Please excuse my brevity.

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH 7/7] DWARF: add the config option
  2017-05-26 19:16                   ` hpa
@ 2017-05-28  9:12                     ` Ingo Molnar
  0 siblings, 0 replies; 70+ messages in thread
From: Ingo Molnar @ 2017-05-28  9:12 UTC (permalink / raw)
  To: hpa
  Cc: Josh Poimboeuf, linux-kernel, Jiri Slaby, Andrew Morton,
	live-patching, Thomas Gleixner, Ingo Molnar,
	the arch/x86 maintainers, Andy Lutomirski, Jiri Kosina,
	Linus Torvalds, Peter Zijlstra


* hpa@zytor.com <hpa@zytor.com> wrote:

> This assumes that it actually ends up being feasible for objtool to do so.

Yes, agreed, that's a big precondition. I'm cautiously optimistic based on Josh's 
experiments that he posted about in this thread.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 70+ messages in thread

end of thread, other threads:[~2017-05-28  9:12 UTC | newest]

Thread overview: 70+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-05-05 12:21 [PATCH 1/7] DWARF: add option to preserve unwind info Jiri Slaby
2017-05-05 12:21 ` [PATCH 2/7] DWARF: EH-frame based stack unwinding Jiri Slaby
2017-05-05 12:21 ` [PATCH 3/7] vmlinux.lds: preserve eh_frame for DWARF unwinder Jiri Slaby
2017-05-05 12:21 ` [PATCH 4/7] DWARF: initialize structures for kernel and modules Jiri Slaby
2017-05-05 12:21 ` [PATCH 5/7] unwinder: show_stack, check also ret_addr_p's contents Jiri Slaby
2017-05-05 12:21 ` [PATCH 6/7] unwinder: plug in the DWARF unwinder Jiri Slaby
2017-05-05 12:22 ` [PATCH 7/7] DWARF: add the config option Jiri Slaby
2017-05-05 19:57   ` Linus Torvalds
2017-05-06  7:19     ` Ingo Molnar
2017-05-10  7:46       ` Jiri Slaby
2017-05-06 14:24     ` Jiri Kosina
2017-05-07 16:55     ` Josh Poimboeuf
2017-05-07 17:59       ` Ingo Molnar
2017-05-07 18:08         ` hpa
2017-05-07 21:48           ` Josh Poimboeuf
2017-05-08  7:50             ` Vojtech Pavlik
2017-05-08 13:14               ` Josh Poimboeuf
2017-05-08  5:35       ` Andy Lutomirski
2017-05-08  6:15         ` Ingo Molnar
2017-05-08 14:40         ` Josh Poimboeuf
2017-05-08 18:57           ` hpa
2017-05-09  0:21       ` Andy Lutomirski
2017-05-09  1:38         ` Josh Poimboeuf
2017-05-09  2:31           ` Andy Lutomirski
2017-05-09  3:38             ` Josh Poimboeuf
2017-05-09 10:00               ` hpa
2017-05-09 14:58                 ` Josh Poimboeuf
2017-05-09 16:46                   ` H.J. Lu
2017-05-10  8:15                 ` Jiri Slaby
2017-05-10 13:09                   ` Josh Poimboeuf
2017-05-10 16:23                     ` H.J. Lu
2017-05-09 18:47       ` Jiri Kosina
2017-05-09 19:22         ` Josh Poimboeuf
2017-05-10  8:32           ` Jiri Slaby
2017-05-10 13:13             ` Josh Poimboeuf
2017-05-23  7:07             ` Peter Zijlstra
2017-05-23  7:27               ` Ingo Molnar
2017-05-19 20:53       ` Josh Poimboeuf
2017-05-19 20:57         ` H. Peter Anvin
2017-05-19 20:59         ` H. Peter Anvin
2017-05-19 21:29           ` Josh Poimboeuf
2017-05-19 21:35             ` Josh Poimboeuf
2017-05-20  5:23               ` Andy Lutomirski
2017-05-20 16:20                 ` Josh Poimboeuf
2017-05-20 17:19                   ` Josh Poimboeuf
2017-05-20 20:01                   ` H.J. Lu
2017-05-20 21:58                     ` Andy Lutomirski
2017-05-20 22:20                       ` H.J. Lu
2017-05-22 11:34                         ` Jiri Kosina
2017-05-22 14:39                           ` H.J. Lu
2017-05-22 21:07                     ` H. Peter Anvin
2017-05-22 21:37                       ` H. Peter Anvin
2017-05-22 22:11                         ` Josh Poimboeuf
2017-05-20 20:16                 ` Linus Torvalds
2017-05-20 21:56                   ` Andy Lutomirski
2017-05-20 23:00                     ` Linus Torvalds
2017-05-20 23:29                       ` Linus Torvalds
2017-05-26  6:54               ` Jiri Slaby
2017-05-26 11:29                 ` Jiri Slaby
2017-05-26 12:14                   ` Josh Poimboeuf
2017-05-22 11:12             ` Ingo Molnar
2017-05-22 21:16               ` H. Peter Anvin
2017-05-22 23:23                 ` Jiri Kosina
2017-05-23  5:49                 ` Ingo Molnar
2017-05-26 19:16                   ` hpa
2017-05-28  9:12                     ` Ingo Molnar
2017-05-10  7:39     ` Jiri Slaby
2017-05-10 12:42       ` Josh Poimboeuf
2017-05-10 12:47         ` Jiri Slaby
2017-05-10 18:11       ` Linus Torvalds

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).