* [RFC PATCH] arm64/ftrace: support dynamically allocated trampolines
@ 2020-01-09 14:27 ` Cheng Jian
  0 siblings, 0 replies; 10+ messages in thread
From: Cheng Jian @ 2020-01-09 14:27 UTC (permalink / raw)
  To: linux-arm-kernel, linux-kernel
  Cc: xiexiuqi, huawei.libin, cj.chengjian, bobo.shaobowang,
	catalin.marinas, mark.rutland, duwe

When we trace multiple functions, ftrace has to fall back to the list
function, which makes every traced function check the hash of every
registered ftrace_ops. This is very inefficient.

We can solve this by calling a dynamically allocated trampoline that
invokes the callback directly. This patch introduces dynamically
allocated trampolines for arm64.

If a callback is registered to a function and no other callback is
registered to that function, the ftrace_ops gets its own trampoline
allocated for it, which calls the callback directly.

We merge the two functions (ftrace_caller/ftrace_regs_caller and
ftrace_common) into one, so the jump to ftrace_common is no longer
needed and is replaced with a NOP.

Similar to x86_64, the local ftrace_ops is saved at the end of the
trampoline.

The ftrace trampoline layout:

                          low
ftrace_(regs_)caller  => +---------------+
                         | ftrace_caller |
ftrace_common         => +---------------+
                         | ftrace_common |
function_trace_op_ptr => | ...           | ldr x2, <ftrace_ops>
           |             | b ftrace_stub |
           |             | nop           | fgraph call
           |             +---------------+
           +------------>| ftrace_ops    |
                         +---------------+
                         | PLT entries   | (TODO)
                         +---------------+
                          high

Known issues:
If KASLR is enabled, the trampoline and the ftrace callsite may end up
far apart, so long jump support is required. Here I intend to use the
same solution as module relocation: reserve enough space for a PLT at
the end of the trampoline when allocating it, and use the PLT to
complete these long jumps (a rough sketch of such an entry is below).
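
For illustration only, such an entry could mirror the module loader's
PLT format (the struct below is just a sketch, not part of this patch):

	struct tramp_plt_entry {
		__le32 adrp;	/* adrp x16, <target page>        */
		__le32 add;	/* add  x16, x16, #:lo12:<target> */
		__le32 br;	/* br   x16                       */
	};

One such entry per far target would sit after the ftrace_ops slot, and
any branch that cannot reach its target directly would be retargeted to
the corresponding entry.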

Signed-off-by: Cheng Jian <cj.chengjian@huawei.com>
---
 arch/arm64/kernel/entry-ftrace.S |   4 +
 arch/arm64/kernel/ftrace.c       | 310 +++++++++++++++++++++++++++++++
 2 files changed, 314 insertions(+)

diff --git a/arch/arm64/kernel/entry-ftrace.S b/arch/arm64/kernel/entry-ftrace.S
index 7d02f9966d34..f5ee797804ac 100644
--- a/arch/arm64/kernel/entry-ftrace.S
+++ b/arch/arm64/kernel/entry-ftrace.S
@@ -77,17 +77,20 @@
 
 ENTRY(ftrace_regs_caller)
 	ftrace_regs_entry	1
+GLOBAL(ftrace_regs_caller_end)
 	b	ftrace_common
 ENDPROC(ftrace_regs_caller)
 
 ENTRY(ftrace_caller)
 	ftrace_regs_entry	0
+GLOBAL(ftrace_caller_end)
 	b	ftrace_common
 ENDPROC(ftrace_caller)
 
 ENTRY(ftrace_common)
 	sub	x0, x30, #AARCH64_INSN_SIZE	// ip (callsite's BL insn)
 	mov	x1, x9				// parent_ip (callsite's LR)
+GLOBAL(function_trace_op_ptr)
 	ldr_l	x2, function_trace_op		// op
 	mov	x3, sp				// regs
 
@@ -121,6 +124,7 @@ ftrace_common_return:
 	/* Restore the callsite's SP */
 	add	sp, sp, #S_FRAME_SIZE + 16
 
+GLOBAL(ftrace_common_end)
 	ret	x9
 ENDPROC(ftrace_common)
 
diff --git a/arch/arm64/kernel/ftrace.c b/arch/arm64/kernel/ftrace.c
index 8618faa82e6d..95ea68ef6228 100644
--- a/arch/arm64/kernel/ftrace.c
+++ b/arch/arm64/kernel/ftrace.c
@@ -10,11 +10,13 @@
 #include <linux/module.h>
 #include <linux/swab.h>
 #include <linux/uaccess.h>
+#include <linux/vmalloc.h>
 
 #include <asm/cacheflush.h>
 #include <asm/debug-monitors.h>
 #include <asm/ftrace.h>
 #include <asm/insn.h>
+#include <asm/set_memory.h>
 
 #ifdef CONFIG_DYNAMIC_FTRACE
 /*
@@ -47,6 +49,314 @@ static int ftrace_modify_code(unsigned long pc, u32 old, u32 new,
 	return 0;
 }
 
+/* ftrace dynamic trampolines */
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS
+#ifdef CONFIG_MODULES
+#include <linux/moduleloader.h>
+
+static inline void *alloc_tramp(unsigned long size)
+{
+	return module_alloc(size);
+}
+
+static inline void tramp_free(void *tramp)
+{
+	module_memfree(tramp);
+}
+#else
+static inline void *alloc_tramp(unsigned long size)
+{
+	return NULL;
+}
+
+static inline void tramp_free(void *tramp) {}
+#endif
+
+extern void ftrace_regs_caller_end(void);
+extern void ftrace_caller_end(void);
+extern void ftrace_common(void);
+extern void ftrace_common_end(void);
+
+extern void function_trace_op_ptr(void);
+
+extern struct ftrace_ops *function_trace_op;
+
+/*
+ * ftrace_caller() or ftrace_regs_caller() trampoline
+ *				+-----------------------+
+ * ftrace_(regs_)caller =>	|	......		|
+ * ftrace_(regs_)caller_end =>	| b ftrace_common	| => nop
+ *				+-----------------------+
+ * ftrace_common =>		|	......		|
+ * function_trace_op_ptr =>	| adrp x2, sym		| => nop
+ *				| ldr  x2,[x2,:lo12:sym]| => ldr x2 <ftrace_op>
+ *				|	......		|
+ * ftrace_common_end  =>	|	retq		|
+ *				+-----------------------+
+ * ftrace_opt =>		|	ftrace_opt	|
+ *				+-----------------------+
+ */
+static unsigned long create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
+{
+	unsigned long start_offset_caller, end_offset_caller, caller_size;
+	unsigned long start_offset_common, end_offset_common, common_size;
+	unsigned long op_offset, offset, size, ip, npages;
+	void *trampoline;
+	unsigned long *ptr;
+	/* ldr x2, <label> */
+	u32 pc_ldr = 0x58000002;
+	u32 mask = BIT(19) - 1;
+	int shift = 5;
+	int ret;
+	u32 nop;
+
+	if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) {
+		start_offset_caller = (unsigned long)ftrace_regs_caller;
+		end_offset_caller = (unsigned long)ftrace_regs_caller_end;
+	} else {
+		start_offset_caller = (unsigned long)ftrace_caller;
+		end_offset_caller = (unsigned long)ftrace_caller_end;
+	}
+	start_offset_common = (unsigned long)ftrace_common;
+	end_offset_common = (unsigned long)ftrace_common_end;
+
+	op_offset = (unsigned long)function_trace_op_ptr;
+
+	/*
+	 * Merge ftrace_caller/ftrace_regs_caller and ftrace_common
+	 * to one function in ftrace trampoline.
+	 */
+	caller_size = end_offset_caller - start_offset_caller + AARCH64_INSN_SIZE;
+	common_size = end_offset_common - start_offset_common + AARCH64_INSN_SIZE;
+	size = caller_size + common_size;
+
+	trampoline = alloc_tramp(caller_size + common_size + sizeof(void *));
+	if (!trampoline)
+		return 0;
+
+	*tramp_size = caller_size + common_size + sizeof(void *);
+	npages = DIV_ROUND_UP(*tramp_size, PAGE_SIZE);
+
+	/* Copy ftrace_caller/ftrace_regs_caller onto the trampoline memory */
+	ret = probe_kernel_read(trampoline, (void *)start_offset_caller, caller_size);
+	if (WARN_ON(ret < 0))
+		goto free;
+
+	/*
+	 * Copy ftrace_common to the trampoline memory
+	 * below ftrace_caller/ftrace_regs_caller. so
+	 * we can merge the two function to one function.
+	 */
+	ret = probe_kernel_read(trampoline + caller_size, (void *)start_offset_common, common_size);
+	if (WARN_ON(ret < 0))
+		goto free;
+
+	/*
+	 * We merge the two functions to one function, so these is
+	 * no need to use jump instructions to ftrace_common, modify
+	 * it to NOP.
+	 */
+	ip = (unsigned long)trampoline + caller_size - AARCH64_INSN_SIZE;
+	nop = aarch64_insn_gen_nop();
+	memcpy((void *)ip, &nop, AARCH64_INSN_SIZE);
+
+	/*
+	 * Stored ftrace_ops at the end of the trampoline.
+	 * This will be used to load the third parameter for the callback.
+	 * Basically, that location at the end of the trampoline takes the
+	 * place of the global function_trace_op variable.
+	 */
+	ptr = (unsigned long *)(trampoline + size);
+	*ptr = (unsigned long)ops;
+
+	/*
+	 * Update the trampoline ops REF
+	 *
+	 * OLD INSNS : ldr_l x2, function_trace_op
+	 *	adrp	x2, sym
+	 *	ldr	x2, [x2, :lo12:\sym]
+	 *
+	 * NEW INSNS:
+	 *	nop
+	 *	ldr x2, <ftrace_ops>
+	 */
+	op_offset -= start_offset_common;
+	ip = (unsigned long)trampoline + caller_size + op_offset;
+	nop = aarch64_insn_gen_nop();
+	memcpy((void *)ip, &nop, AARCH64_INSN_SIZE);
+
+	op_offset += AARCH64_INSN_SIZE;
+	ip = (unsigned long)trampoline + caller_size + op_offset;
+	offset = (unsigned long)ptr - ip;
+	if (WARN_ON(offset % AARCH64_INSN_SIZE != 0))
+		goto free;
+	offset = offset / AARCH64_INSN_SIZE;
+	pc_ldr |= (offset & mask) << shift;
+	memcpy((void *)ip, &pc_ldr, AARCH64_INSN_SIZE);
+
+	ops->flags |= FTRACE_OPS_FL_ALLOC_TRAMP;
+
+	set_vm_flush_reset_perms(trampoline);
+
+	/*
+	 * Module allocation needs to be completed by making the page
+	 * executable. The page is still writable, which is a security hazard,
+	 * but anyhow ftrace breaks W^X completely.
+	 */
+	set_memory_x((unsigned long)trampoline, npages);
+
+	return (unsigned long)trampoline;
+
+free:
+	tramp_free(trampoline);
+	return 0;
+}
+
+static unsigned long calc_trampoline_call_offset(struct ftrace_ops *ops)
+{
+	unsigned call_offset, end_offset, offset;
+
+	call_offset = (unsigned long)&ftrace_call;
+	end_offset = (unsigned long)ftrace_common_end;
+	offset = end_offset - call_offset;
+
+	return ops->trampoline_size - AARCH64_INSN_SIZE - sizeof(void *) - offset;
+}
+
+static int ftrace_trampoline_modify_call(struct ftrace_ops *ops)
+{
+	unsigned long offset;
+	unsigned long pc;
+	ftrace_func_t func;
+	u32 new;
+
+	offset = calc_trampoline_call_offset(ops);
+	pc = ops->trampoline + offset;
+
+	func = ftrace_ops_get_func(ops);
+	new = aarch64_insn_gen_branch_imm(pc, (unsigned long)func, AARCH64_INSN_BRANCH_LINK);
+
+	return ftrace_modify_code(pc, 0, new, false);
+}
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+
+static unsigned long calc_trampoline_graph_call_offset(struct ftrace_ops *ops)
+{
+	unsigned end_offset, call_offset, offset;
+
+	call_offset = (unsigned long)&ftrace_graph_call;
+	end_offset = (unsigned long)ftrace_common_end;
+	offset = end_offset - call_offset;
+
+	return ops->trampoline_size - AARCH64_INSN_SIZE - sizeof(void *) - offset;
+}
+
+extern int ftrace_graph_active;
+static int ftrace_trampoline_modify_graph_call(struct ftrace_ops *ops)
+{
+	unsigned long offset;
+	unsigned long pc;
+	u32 branch, nop;
+
+	offset = calc_trampoline_graph_call_offset(ops);
+	pc = ops->trampoline + offset;
+
+	branch = aarch64_insn_gen_branch_imm(pc,
+					     (unsigned long)ftrace_graph_caller,
+					     AARCH64_INSN_BRANCH_NOLINK);
+	nop = aarch64_insn_gen_nop();
+
+	if (ftrace_graph_active)
+		return ftrace_modify_code(pc, 0, branch, false);
+	else
+		return ftrace_modify_code(pc, 0, nop, false);
+}
+#else
+static inline int ftrace_trampoline_modify_graph_call(struct ftrace_ops *ops)
+{
+	return 0;
+}
+#endif
+
+void arch_ftrace_update_trampoline(struct ftrace_ops *ops)
+{
+	int npages;
+
+	if (ops->trampoline) {
+		/*
+		 * The ftrace_ops caller may set up its own trampoline.
+		 * In such a case, this code must not modify it.
+		 */
+		if (!(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP))
+			return;
+		npages = PAGE_ALIGN(ops->trampoline_size) >> PAGE_SHIFT;
+		set_memory_rw(ops->trampoline, npages);
+	} else {
+		unsigned int size;
+
+		ops->trampoline = create_trampoline(ops, &size);
+		if (!ops->trampoline)
+			return;
+		ops->trampoline_size = size;
+		npages = PAGE_ALIGN(size) >> PAGE_SHIFT;
+	}
+
+	/* atomic modify trampoline: <call func> */
+	WARN_ON(ftrace_trampoline_modify_call(ops));
+
+	/* atomic modify trampoline: <call graph func> */
+	WARN_ON(ftrace_trampoline_modify_graph_call(ops));
+
+	set_memory_ro(ops->trampoline, npages);
+}
+
+static void *addr_from_call(void *ptr)
+{
+	u32 insn;
+	unsigned long offset;
+
+	if (aarch64_insn_read(ptr, &insn))
+		return NULL;
+
+	/* Make sure this is a call */
+	if (!aarch64_insn_is_bl(insn)) {
+		pr_warn("Expected bl instruction, but not\n");
+		return NULL;
+	}
+
+	offset = aarch64_get_branch_offset(insn);
+
+	return (void *)ptr + AARCH64_INSN_SIZE + offset;
+}
+
+void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent,
+			   unsigned long frame_pointer);
+
+void *arch_ftrace_trampoline_func(struct ftrace_ops *ops, struct dyn_ftrace *rec)
+{
+	unsigned long offset;
+
+	/* If we didn't allocate this trampoline, consider it static */
+	if (!ops || !(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP))
+		return NULL;
+
+	offset = calc_trampoline_call_offset(ops);
+
+	return addr_from_call((void *)ops->trampoline + offset);
+}
+
+void arch_ftrace_trampoline_free(struct ftrace_ops *ops)
+{
+	if (!ops || !(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP))
+		return;
+
+	tramp_free((void *)ops->trampoline);
+	ops->trampoline = 0;
+	ops->trampoline_size = 0;
+}
+#endif
+
 /*
  * Replace tracer function in ftrace_caller()
  */
-- 
2.17.1




* Re: [RFC PATCH] arm64/ftrace: support dynamically allocated trampolines
  2020-01-09 14:27 ` Cheng Jian
@ 2020-01-09 16:48   ` Mark Rutland
  -1 siblings, 0 replies; 10+ messages in thread
From: Mark Rutland @ 2020-01-09 16:48 UTC (permalink / raw)
  To: Cheng Jian
  Cc: linux-arm-kernel, linux-kernel, xiexiuqi, huawei.libin,
	bobo.shaobowang, catalin.marinas, duwe

On Thu, Jan 09, 2020 at 02:27:36PM +0000, Cheng Jian wrote:
> When we tracing multiple functions, it has to use a list
> function and cause all the other functions being traced
> to check the hash of the ftrace_ops. But this is very
> inefficient.

Just how bad is this, and when/where does it matter?

How much does this patch improve matters?

> we can call a dynamically allocated trampoline which calls
> the callback directly to solve this problem. This patch
> introduce dynamically alloced trampolines for ARM64.
> 
> If a callback is registered to a function and there's no
> other callback registered to that function, the ftrace_ops
> will get its own trampoline allocated for it that will call
> the function directly.
> 
> We merge two functions (ftrace_caller/ftrace_regs_caller and
> ftrace_common) into one function, so we no longer need a jump
> to ftrace_common and fix it to NOP.
> 
> similar to X86_64, save the local ftrace_ops at the end.
> 
> the ftrace trampoline layout :
> 
>                           low
> ftrace_(regs_)caller  => +---------------+
>                          | ftrace_caller |
> ftrace_common         => +---------------+
>                          | ftrace_common |
> function_trace_op_ptr => | ...           | ldr x2, <ftrace_ops>
>            |             | b ftrace_stub |
>            |             | nop           | fgraph call
>            |             +---------------+
>            +------------>| ftrace_ops    |
>                          +---------------+
>                          | PLT entrys    | (TODO)
>                          +---------------+
>                           high
> 
> Known issues :
> If kaslr is enabled, the address of tramp and ftrace call
> may be far away. Therefore, long jump support is required.
> Here I intend to use the same solution as module relocating,
> Reserve enough space for PLT at the end when allocating, can
> use PLT to complete these long jumps.

This can happen both ways; the callsite can also be too far from the
trampoline to be able to branch to it.

I've had issues with that for other reasons, and I think that we might
be able to use -fpatchable-function-entry=N,M to place a PLT immediately
before each function for that. However, I'm wary of doing so because it
makes it much harder to modify the patch site itself.
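
As a rough illustration of that idea (an assumption, not something this
patch or the current kernel does): building with
-fpatchable-function-entry=6,4 would leave four NOPs ahead of each
function entry, which could later be rewritten into a tiny per-function
PLT such as:

	struct pre_entry_plt {
		__le32 ldr;	/* ldr x16, 8 : load the literal below */
		__le32 br;	/* br  x16                             */
		__le64 target;	/* far target, e.g. the trampoline     */
	};

leaving the usual two post-entry NOPs as the patch site.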

> 
> Signed-off-by: Cheng Jian <cj.chengjian@huawei.com>
> ---
>  arch/arm64/kernel/entry-ftrace.S |   4 +
>  arch/arm64/kernel/ftrace.c       | 310 +++++++++++++++++++++++++++++++
>  2 files changed, 314 insertions(+)
> 
> diff --git a/arch/arm64/kernel/entry-ftrace.S b/arch/arm64/kernel/entry-ftrace.S
> index 7d02f9966d34..f5ee797804ac 100644
> --- a/arch/arm64/kernel/entry-ftrace.S
> +++ b/arch/arm64/kernel/entry-ftrace.S
> @@ -77,17 +77,20 @@
>  
>  ENTRY(ftrace_regs_caller)
>  	ftrace_regs_entry	1
> +GLOBAL(ftrace_regs_caller_end)
>  	b	ftrace_common
>  ENDPROC(ftrace_regs_caller)
>  
>  ENTRY(ftrace_caller)
>  	ftrace_regs_entry	0
> +GLOBAL(ftrace_caller_end)
>  	b	ftrace_common
>  ENDPROC(ftrace_caller)
>  
>  ENTRY(ftrace_common)
>  	sub	x0, x30, #AARCH64_INSN_SIZE	// ip (callsite's BL insn)
>  	mov	x1, x9				// parent_ip (callsite's LR)
> +GLOBAL(function_trace_op_ptr)
>  	ldr_l	x2, function_trace_op		// op
>  	mov	x3, sp				// regs
>  
> @@ -121,6 +124,7 @@ ftrace_common_return:
>  	/* Restore the callsite's SP */
>  	add	sp, sp, #S_FRAME_SIZE + 16
>  
> +GLOBAL(ftrace_common_end)
>  	ret	x9

This doesn't look right. Surely you want the RET, too?

>  ENDPROC(ftrace_common)
>  
> diff --git a/arch/arm64/kernel/ftrace.c b/arch/arm64/kernel/ftrace.c
> index 8618faa82e6d..95ea68ef6228 100644
> --- a/arch/arm64/kernel/ftrace.c
> +++ b/arch/arm64/kernel/ftrace.c
> @@ -10,11 +10,13 @@
>  #include <linux/module.h>
>  #include <linux/swab.h>
>  #include <linux/uaccess.h>
> +#include <linux/vmalloc.h>
>  
>  #include <asm/cacheflush.h>
>  #include <asm/debug-monitors.h>
>  #include <asm/ftrace.h>
>  #include <asm/insn.h>
> +#include <asm/set_memory.h>
>  
>  #ifdef CONFIG_DYNAMIC_FTRACE
>  /*
> @@ -47,6 +49,314 @@ static int ftrace_modify_code(unsigned long pc, u32 old, u32 new,
>  	return 0;
>  }
>  
> +/* ftrace dynamic trampolines */
> +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS
> +#ifdef CONFIG_MODULES
> +#include <linux/moduleloader.h>
> +
> +static inline void *alloc_tramp(unsigned long size)
> +{
> +	return module_alloc(size);
> +}
> +
> +static inline void tramp_free(void *tramp)
> +{
> +	module_memfree(tramp);
> +}
> +#else
> +static inline void *alloc_tramp(unsigned long size)
> +{
> +	return NULL;
> +}
> +
> +static inline void tramp_free(void *tramp) {}
> +#endif
> +
> +extern void ftrace_regs_caller_end(void);
> +extern void ftrace_caller_end(void);
> +extern void ftrace_common(void);
> +extern void ftrace_common_end(void);
> +
> +extern void function_trace_op_ptr(void);
> +
> +extern struct ftrace_ops *function_trace_op;
> +
> +/*
> + * ftrace_caller() or ftrace_regs_caller() trampoline
> + *				+-----------------------+
> + * ftrace_(regs_)caller =>	|	......		|
> + * ftrace_(regs_)caller_end =>	| b ftrace_common	| => nop
> + *				+-----------------------+
> + * ftrace_common =>		|	......		|
> + * function_trace_op_ptr =>	| adrp x2, sym		| => nop
> + *				| ldr  x2,[x2,:lo12:sym]| => ldr x2 <ftrace_op>
> + *				|	......		|
> + * ftrace_common_end  =>	|	retq		|

Copy-paste from x86? arm64 doesn't have a retq instruction.

> + *				+-----------------------+
> + * ftrace_opt =>		|	ftrace_opt	|
> + *				+-----------------------+

Typo: s/opt/ops/ ?

> + */
> +static unsigned long create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
> +{
> +	unsigned long start_offset_caller, end_offset_caller, caller_size;
> +	unsigned long start_offset_common, end_offset_common, common_size;
> +	unsigned long op_offset, offset, size, ip, npages;
> +	void *trampoline;
> +	unsigned long *ptr;
> +	/* ldr x2, <label> */
> +	u32 pc_ldr = 0x58000002;
> +	u32 mask = BIT(19) - 1;

Instead of open-coding this, please teach the insn framework how to
encode LDR (immediate).
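
A minimal sketch of such an encoder (the function name below is
hypothetical, not an existing insn.c API, and the failure path is
simplified to a NOP for brevity):

	static u32 aarch64_insn_gen_load_literal(unsigned int rt, long offset)
	{
		long imm19 = offset >> 2;	/* encoded in words */

		if (WARN_ON((offset & 0x3) ||
			    imm19 < -(long)BIT(18) || imm19 >= (long)BIT(18)))
			return aarch64_insn_gen_nop();

		/* 0x58000000 is the 64-bit LDR (literal) base encoding */
		return 0x58000000 | ((imm19 & (BIT(19) - 1)) << 5) | (rt & 0x1f);
	}

create_trampoline() could then ask for
aarch64_insn_gen_load_literal(2, (long)ptr - (long)ip) instead of
or-ing bits into 0x58000002 by hand.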

> +	int shift = 5;
> +	int ret;
> +	u32 nop;
> +
> +	if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) {
> +		start_offset_caller = (unsigned long)ftrace_regs_caller;
> +		end_offset_caller = (unsigned long)ftrace_regs_caller_end;
> +	} else {
> +		start_offset_caller = (unsigned long)ftrace_caller;
> +		end_offset_caller = (unsigned long)ftrace_caller_end;
> +	}
> +	start_offset_common = (unsigned long)ftrace_common;
> +	end_offset_common = (unsigned long)ftrace_common_end;
> +
> +	op_offset = (unsigned long)function_trace_op_ptr;
> +
> +	/*
> +	 * Merge ftrace_caller/ftrace_regs_caller and ftrace_common
> +	 * to one function in ftrace trampoline.
> +	 */
> +	caller_size = end_offset_caller - start_offset_caller + AARCH64_INSN_SIZE;
> +	common_size = end_offset_common - start_offset_common + AARCH64_INSN_SIZE;
> +	size = caller_size + common_size;
> +
> +	trampoline = alloc_tramp(caller_size + common_size + sizeof(void *));
> +	if (!trampoline)
> +		return 0;
> +
> +	*tramp_size = caller_size + common_size + sizeof(void *);
> +	npages = DIV_ROUND_UP(*tramp_size, PAGE_SIZE);
> +
> +	/* Copy ftrace_caller/ftrace_regs_caller onto the trampoline memory */
> +	ret = probe_kernel_read(trampoline, (void *)start_offset_caller, caller_size);
> +	if (WARN_ON(ret < 0))
> +		goto free;
> +
> +	/*
> +	 * Copy ftrace_common to the trampoline memory
> +	 * below ftrace_caller/ftrace_regs_caller. so
> +	 * we can merge the two function to one function.
> +	 */
> +	ret = probe_kernel_read(trampoline + caller_size, (void *)start_offset_common, common_size);
> +	if (WARN_ON(ret < 0))
> +		goto free;
> +
> +	/*
> +	 * We merge the two functions to one function, so these is
> +	 * no need to use jump instructions to ftrace_common, modify
> +	 * it to NOP.
> +	 */
> +	ip = (unsigned long)trampoline + caller_size - AARCH64_INSN_SIZE;
> +	nop = aarch64_insn_gen_nop();
> +	memcpy((void *)ip, &nop, AARCH64_INSN_SIZE);
> +
> +	/*
> +	 * Stored ftrace_ops at the end of the trampoline.
> +	 * This will be used to load the third parameter for the callback.
> +	 * Basically, that location at the end of the trampoline takes the
> +	 * place of the global function_trace_op variable.
> +	 */
> +	ptr = (unsigned long *)(trampoline + size);
> +	*ptr = (unsigned long)ops;
> +
> +	/*
> +	 * Update the trampoline ops REF
> +	 *
> +	 * OLD INSNS : ldr_l x2, function_trace_op
> +	 *	adrp	x2, sym
> +	 *	ldr	x2, [x2, :lo12:\sym]
> +	 *
> +	 * NEW INSNS:
> +	 *	nop
> +	 *	ldr x2, <ftrace_ops>
> +	 */
> +	op_offset -= start_offset_common;
> +	ip = (unsigned long)trampoline + caller_size + op_offset;
> +	nop = aarch64_insn_gen_nop();
> +	memcpy((void *)ip, &nop, AARCH64_INSN_SIZE);
> +
> +	op_offset += AARCH64_INSN_SIZE;
> +	ip = (unsigned long)trampoline + caller_size + op_offset;
> +	offset = (unsigned long)ptr - ip;
> +	if (WARN_ON(offset % AARCH64_INSN_SIZE != 0))
> +		goto free;
> +	offset = offset / AARCH64_INSN_SIZE;
> +	pc_ldr |= (offset & mask) << shift;
> +	memcpy((void *)ip, &pc_ldr, AARCH64_INSN_SIZE);

I think it would be much better to have a separate template for the
trampoline which we don't have to patch in this way. It can even be
placed into a non-executable RO section, since the template shouldn't be
executed directly.
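
As a sketch of that (symbol and function names below are assumptions,
not part of this patch), the template would live in .rodata and only
ever be copied:

	extern char ftrace_tramp_template[];
	extern char ftrace_tramp_template_end[];

	static unsigned long create_trampoline_from_template(struct ftrace_ops *ops)
	{
		size_t len = ftrace_tramp_template_end - ftrace_tramp_template;
		void *tramp = alloc_tramp(len + sizeof(void *));

		if (!tramp)
			return 0;

		memcpy(tramp, ftrace_tramp_template, len);
		/* the ops slot still sits at the end, as in this patch */
		*(unsigned long *)(tramp + len) = (unsigned long)ops;

		return (unsigned long)tramp;
	}

All remaining per-ops patching (the BL to the callback, the graph call)
would then go through the normal text-patching interfaces.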

> +
> +	ops->flags |= FTRACE_OPS_FL_ALLOC_TRAMP;
> +
> +	set_vm_flush_reset_perms(trampoline);
> +
> +	/*
> +	 * Module allocation needs to be completed by making the page
> +	 * executable. The page is still writable, which is a security hazard,
> +	 * but anyhow ftrace breaks W^X completely.
> +	 */
> +	set_memory_x((unsigned long)trampoline, npages);

Why is the page still writeable? Surely you can make it RO first?

Please do not break W^X restrictions; we've tried to ensure that arm64
does the right thing by default here.

Note that arm64's ftrace_modify_code() will use a writeable alias, so it
can be used after the memory has been marked RO.
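
Something like the following ordering would keep the mapping sealed (a
sketch under that assumption, reusing the patch's npages):

	static void ftrace_tramp_seal(void *trampoline, unsigned long npages)
	{
		set_vm_flush_reset_perms(trampoline);
		set_memory_ro((unsigned long)trampoline, npages);
		set_memory_x((unsigned long)trampoline, npages);
	}

with all later modifications done via ftrace_modify_code(), which
patches through a separate writeable alias.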

> +
> +	return (unsigned long)trampoline;
> +
> +free:
> +	tramp_free(trampoline);
> +	return 0;
> +}
> +
> +static unsigned long calc_trampoline_call_offset(struct ftrace_ops *ops)
> +{
> +	unsigned call_offset, end_offset, offset;
> +
> +	call_offset = (unsigned long)&ftrace_call;
> +	end_offset = (unsigned long)ftrace_common_end;
> +	offset = end_offset - call_offset;
> +
> +	return ops->trampoline_size - AARCH64_INSN_SIZE - sizeof(void *) - offset;
> +}
> +
> +static int ftrace_trampoline_modify_call(struct ftrace_ops *ops)
> +{
> +	unsigned long offset;
> +	unsigned long pc;
> +	ftrace_func_t func;
> +	u32 new;
> +
> +	offset = calc_trampoline_call_offset(ops);
> +	pc = ops->trampoline + offset;
> +
> +	func = ftrace_ops_get_func(ops);
> +	new = aarch64_insn_gen_branch_imm(pc, (unsigned long)func, AARCH64_INSN_BRANCH_LINK);
> +
> +	return ftrace_modify_code(pc, 0, new, false);
> +}
> +
> +#ifdef CONFIG_FUNCTION_GRAPH_TRACER
> +
> +static unsigned long calc_trampoline_graph_call_offset(struct ftrace_ops *ops)
> +{
> +	unsigned end_offset, call_offset, offset;
> +
> +	call_offset = (unsigned long)&ftrace_graph_call;
> +	end_offset = (unsigned long)ftrace_common_end;
> +	offset = end_offset - call_offset;
> +
> +	return ops->trampoline_size - AARCH64_INSN_SIZE - sizeof(void *) - offset;
> +}
> +
> +extern int ftrace_graph_active;
> +static int ftrace_trampoline_modify_graph_call(struct ftrace_ops *ops)
> +{
> +	unsigned long offset;
> +	unsigned long pc;
> +	u32 branch, nop;
> +
> +	offset = calc_trampoline_graph_call_offset(ops);
> +	pc = ops->trampoline + offset;
> +
> +	branch = aarch64_insn_gen_branch_imm(pc,
> +					     (unsigned long)ftrace_graph_caller,
> +					     AARCH64_INSN_BRANCH_NOLINK);
> +	nop = aarch64_insn_gen_nop();
> +
> +	if (ftrace_graph_active)
> +		return ftrace_modify_code(pc, 0, branch, false);
> +	else
> +		return ftrace_modify_code(pc, 0, nop, false);
> +}

It should be possible to share the bulk of this code with the non-graph
versions.
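
A sketch of that (helper names are assumptions): both paths boil down to
"patch the instruction at a fixed distance from the end of the
trampoline", so one pair of helpers could serve both:

	static unsigned long calc_tramp_offset(struct ftrace_ops *ops,
					       unsigned long callsite)
	{
		unsigned long from_end = (unsigned long)ftrace_common_end - callsite;

		return ops->trampoline_size - AARCH64_INSN_SIZE -
		       sizeof(void *) - from_end;
	}

	static int ftrace_trampoline_modify(struct ftrace_ops *ops,
					    unsigned long callsite, u32 new)
	{
		unsigned long pc = ops->trampoline +
				   calc_tramp_offset(ops, callsite);

		return ftrace_modify_code(pc, 0, new, false);
	}

called with &ftrace_call or &ftrace_graph_call as the callsite.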

Thanks,
Mark.



* Re: [RFC PATCH] arm64/ftrace: support dynamically allocated trampolines
  2020-01-09 16:48   ` Mark Rutland
@ 2020-01-10 11:28     ` chengjian (D)
  -1 siblings, 0 replies; 10+ messages in thread
From: chengjian (D) @ 2020-01-10 11:28 UTC (permalink / raw)
  To: Mark Rutland
  Cc: linux-arm-kernel, linux-kernel, xiexiuqi, huawei.libin,
	bobo.shaobowang, catalin.marinas, duwe, chengjian (D)

Hi, Mark

     Thanks for your reply.

On 2020/1/10 0:48, Mark Rutland wrote:
> On Thu, Jan 09, 2020 at 02:27:36PM +0000, Cheng Jian wrote:
>
> Just how bad is this, and when/where does it matter?
>
> How much does this patch improve matters?

I will run a test to measure this.

>> Known issues :
>> If kaslr is enabled, the address of tramp and ftrace call
>> may be far away. Therefore, long jump support is required.
>> Here I intend to use the same solution as module relocating,
>> Reserve enough space for PLT at the end when allocating, can
>> use PLT to complete these long jumps.
> This can happen both ways; the callsite can also be too far from the
> trampoline to be able to branch to it.


Yes, that can happen both ways.

> I've had issues with that for other reasons, and I think that we might
> be able to use -fpatchable-function-entry=N,M to place a PLT immediately
> before each function for that. However, I'm wary of doing so because it
> makes it much harder to modify the patch site itself.


This sounds good. I have no better idea than this for now.

At least I will try it first.


>
>>   
>> +GLOBAL(ftrace_common_end)
>>   	ret	x9
> This doesn't look right. Surely you want the RET, too?


I will fix this error, thanks.

> +/*
> + * ftrace_caller() or ftrace_regs_caller() trampoline
> + *				+-----------------------+
> + * ftrace_(regs_)caller =>	|	......		|
> + * ftrace_(regs_)caller_end =>	| b ftrace_common	| => nop
> + *				+-----------------------+
> + * ftrace_common =>		|	......		|
> + * function_trace_op_ptr =>	| adrp x2, sym		| => nop
> + *				| ldr  x2,[x2,:lo12:sym]| => ldr x2 <ftrace_op>
> + *				|	......		|
> + * ftrace_common_end  =>	|	retq		|
> Copy-paste from x86? arm64 doesn't have a retq instruction.
>
>> + *				+-----------------------+
>> + * ftrace_opt =>		|	ftrace_opt	|
>> + *				+-----------------------+
> Typo: s/opt/ops/ ?

Sorry for these mistakes.


>> + */
>> +static unsigned long create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
>> +{
>> +	unsigned long start_offset_caller, end_offset_caller, caller_size;
>> +	unsigned long start_offset_common, end_offset_common, common_size;
>> +	unsigned long op_offset, offset, size, ip, npages;
>> +	void *trampoline;
>> +	unsigned long *ptr;
>> +	/* ldr x2, <label> */
>> +	u32 pc_ldr = 0x58000002;
>> +	u32 mask = BIT(19) - 1;
> Instead of open-coding this, please teach the insn framework how to
> encode LDR (immediate).


I will, Thank you.

>
>> +	int shift = 5;
>> +	int ret;
>> +	u32 nop;
>> +
>> +	if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) {
>> +		start_offset_caller = (unsigned long)ftrace_regs_caller;
>> +		end_offset_caller = (unsigned long)ftrace_regs_caller_end;
>> +	} else {
>> +		start_offset_caller = (unsigned long)ftrace_caller;
>> +		end_offset_caller = (unsigned long)ftrace_caller_end;
>> +	}
>> +	start_offset_common = (unsigned long)ftrace_common;
>> +	end_offset_common = (unsigned long)ftrace_common_end;
>> +
>> +	op_offset = (unsigned long)function_trace_op_ptr;
>> +
>> +	/*
>> +	 * Merge ftrace_caller/ftrace_regs_caller and ftrace_common
>> +	 * to one function in ftrace trampoline.
>> +	 */
>> +	caller_size = end_offset_caller - start_offset_caller + AARCH64_INSN_SIZE;
>> +	common_size = end_offset_common - start_offset_common + AARCH64_INSN_SIZE;
>> +	size = caller_size + common_size;
>> +
>> +	trampoline = alloc_tramp(caller_size + common_size + sizeof(void *));
>> +	if (!trampoline)
>> +		return 0;
>> +
>> +	*tramp_size = caller_size + common_size + sizeof(void *);
>> +	npages = DIV_ROUND_UP(*tramp_size, PAGE_SIZE);
>> +
>> +	/* Copy ftrace_caller/ftrace_regs_caller onto the trampoline memory */
>> +	ret = probe_kernel_read(trampoline, (void *)start_offset_caller, caller_size);
>> +	if (WARN_ON(ret < 0))
>> +		goto free;
>> +
>> +	/*
>> +	 * Copy ftrace_common to the trampoline memory
>> +	 * below ftrace_caller/ftrace_regs_caller. so
>> +	 * we can merge the two function to one function.
>> +	 */
>> +	ret = probe_kernel_read(trampoline + caller_size, (void *)start_offset_common, common_size);
>> +	if (WARN_ON(ret < 0))
>> +		goto free;
>> +
>> +	/*
>> +	 * We merge the two functions to one function, so these is
>> +	 * no need to use jump instructions to ftrace_common, modify
>> +	 * it to NOP.
>> +	 */
>> +	ip = (unsigned long)trampoline + caller_size - AARCH64_INSN_SIZE;
>> +	nop = aarch64_insn_gen_nop();
>> +	memcpy((void *)ip, &nop, AARCH64_INSN_SIZE);
>> +
>> +	/*
>> +	 * Stored ftrace_ops at the end of the trampoline.
>> +	 * This will be used to load the third parameter for the callback.
>> +	 * Basically, that location at the end of the trampoline takes the
>> +	 * place of the global function_trace_op variable.
>> +	 */
>> +	ptr = (unsigned long *)(trampoline + size);
>> +	*ptr = (unsigned long)ops;
>> +
>> +	/*
>> +	 * Update the trampoline ops REF
>> +	 *
>> +	 * OLD INSNS : ldr_l x2, function_trace_op
>> +	 *	adrp	x2, sym
>> +	 *	ldr	x2, [x2, :lo12:\sym]
>> +	 *
>> +	 * NEW INSNS:
>> +	 *	nop
>> +	 *	ldr x2, <ftrace_ops>
>> +	 */
>> +	op_offset -= start_offset_common;
>> +	ip = (unsigned long)trampoline + caller_size + op_offset;
>> +	nop = aarch64_insn_gen_nop();
>> +	memcpy((void *)ip, &nop, AARCH64_INSN_SIZE);
>> +
>> +	op_offset += AARCH64_INSN_SIZE;
>> +	ip = (unsigned long)trampoline + caller_size + op_offset;
>> +	offset = (unsigned long)ptr - ip;
>> +	if (WARN_ON(offset % AARCH64_INSN_SIZE != 0))
>> +		goto free;
>> +	offset = offset / AARCH64_INSN_SIZE;
>> +	pc_ldr |= (offset & mask) << shift;
>> +	memcpy((void *)ip, &pc_ldr, AARCH64_INSN_SIZE);
> I think it would be much better to have a separate template for the
> trampoline which we don't have to patch in this way. It can even be
> placed into a non-executable RO section, since the template shouldn't be
> executed directly.


A separate template!

This may be a good way, and I agree that the patching here is quite
hacky (not very friendly).

I had thought of another way before, similar to the method on x86_64:
remove ftrace_common() and modify ftrace_caller/ftrace_regs_caller
directly. That way we only need to copy the code once, and there is no
need to patch the call to ftrace_common into a NOP.

Using a trampoline template sounds great, but it also means that we
need to maintain a template (or maybe two templates: one for
ftrace_caller and another for ftrace_regs_caller).

Hi Mark, what do you think about it?


>> +
>> +	ops->flags |= FTRACE_OPS_FL_ALLOC_TRAMP;
>> +
>> +	set_vm_flush_reset_perms(trampoline);
>> +
>> +	/*
>> +	 * Module allocation needs to be completed by making the page
>> +	 * executable. The page is still writable, which is a security hazard,
>> +	 * but anyhow ftrace breaks W^X completely.
>> +	 */
>> +	set_memory_x((unsigned long)trampoline, npages);
> Why is the page still writeable? Surely you can make it RO first?
>
> Please do not break W^X restrictions; we've tried to ensure that arm64
> does the right thing by default here.
>
> Note that arm64's ftrace_modify_code() will use a writeable alias, so it
> can be used after the memory has been marked RO.

Yes, I will fix that. arm64's ftrace_modify_code() can still work after the
trampoline has been marked RO.
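
Something like this, I think (only a sketch of the ordering, on top of the
existing create_trampoline() code):

	set_vm_flush_reset_perms(trampoline);

	/*
	 * Seal the trampoline before it becomes reachable: read-only
	 * first, then executable, so it is never writable and executable
	 * at the same time.
	 */
	set_memory_ro((unsigned long)trampoline, npages);
	set_memory_x((unsigned long)trampoline, npages);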

>> +
>> +	return (unsigned long)trampoline;
>> +
>> +free:
>> +	tramp_free(trampoline);
>> +	return 0;
>> +}
>> +
>> +static unsigned long calc_trampoline_call_offset(struct ftrace_ops *ops)
>> +{
>> +	unsigned call_offset, end_offset, offset;
>> +
>> +	call_offset = (unsigned long)&ftrace_call;
>> +	end_offset = (unsigned long)ftrace_common_end;
>> +	offset = end_offset - call_offset;
>> +
>> +	return ops->trampoline_size - AARCH64_INSN_SIZE - sizeof(void *) - offset;
>> +}
>> +
>> +static int ftrace_trampoline_modify_call(struct ftrace_ops *ops)
>> +{
>> +	unsigned long offset;
>> +	unsigned long pc;
>> +	ftrace_func_t func;
>> +	u32 new;
>> +
>> +	offset = calc_trampoline_call_offset(ops);
>> +	pc = ops->trampoline + offset;
>> +
>> +	func = ftrace_ops_get_func(ops);
>> +	new = aarch64_insn_gen_branch_imm(pc, (unsigned long)func, AARCH64_INSN_BRANCH_LINK);
>> +
>> +	return ftrace_modify_code(pc, 0, new, false);
>> +}
>> +
>> +#ifdef CONFIG_FUNCTION_GRAPH_TRACER
>> +
>> +static unsigned long calc_trampoline_graph_call_offset(struct ftrace_ops *ops)
>> +{
>> +	unsigned end_offset, call_offset, offset;
>> +
>> +	call_offset = (unsigned long)&ftrace_graph_call;
>> +	end_offset = (unsigned long)ftrace_common_end;
>> +	offset = end_offset - call_offset;
>> +
>> +	return ops->trampoline_size - AARCH64_INSN_SIZE - sizeof(void *) - offset;
>> +}
>> +
>> +extern int ftrace_graph_active;
>> +static int ftrace_trampoline_modify_graph_call(struct ftrace_ops *ops)
>> +{
>> +	unsigned long offset;
>> +	unsigned long pc;
>> +	u32 branch, nop;
>> +
>> +	offset = calc_trampoline_graph_call_offset(ops);
>> +	pc = ops->trampoline + offset;
>> +
>> +	branch = aarch64_insn_gen_branch_imm(pc,
>> +					     (unsigned long)ftrace_graph_caller,
>> +					     AARCH64_INSN_BRANCH_NOLINK);
>> +	nop = aarch64_insn_gen_nop();
>> +
>> +	if (ftrace_graph_active)
>> +		return ftrace_modify_code(pc, 0, branch, false);
>> +	else
>> +		return ftrace_modify_code(pc, 0, nop, false);
>> +}
> It should be possible to share the bulk of this code with the non-graph
> versions.

It does seem so; I will clean up my code to factor out the common parts.
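
Something along these lines, perhaps (the helper is only a sketch and the
name is a placeholder):

static int ftrace_trampoline_modify_branch(struct ftrace_ops *ops,
					   unsigned long call_site,
					   unsigned long target,
					   enum aarch64_insn_branch_type type)
{
	/* distance of the patch site from the end of ftrace_common */
	unsigned long offset = (unsigned long)ftrace_common_end - call_site;
	unsigned long pc = ops->trampoline + ops->trampoline_size -
			   AARCH64_INSN_SIZE - sizeof(void *) - offset;
	/* a zero target means "restore the NOP" (e.g. graph tracer disabled) */
	u32 new = target ? aarch64_insn_gen_branch_imm(pc, target, type)
			 : aarch64_insn_gen_nop();

	return ftrace_modify_code(pc, 0, new, false);
}

ftrace_trampoline_modify_call() and the graph variant would then become
thin wrappers passing &ftrace_call or &ftrace_graph_call and the right
branch type.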


>
> Thanks,
> Mark.
>
> .


Thank you very much.

-- Cheng Jian.



^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [RFC PATCH] arm64/ftrace: support dynamically allocated trampolines
  2020-01-10 11:28     ` chengjian (D)
@ 2020-01-10 12:12       ` Mark Rutland
  -1 siblings, 0 replies; 10+ messages in thread
From: Mark Rutland @ 2020-01-10 12:12 UTC (permalink / raw)
  To: chengjian (D)
  Cc: linux-arm-kernel, linux-kernel, xiexiuqi, huawei.libin,
	bobo.shaobowang, catalin.marinas, duwe

On Fri, Jan 10, 2020 at 07:28:17PM +0800, chengjian (D) wrote:
> On 2020/1/10 0:48, Mark Rutland wrote:
> > On Thu, Jan 09, 2020 at 02:27:36PM +0000, Cheng Jian wrote:
> > > +	/*
> > > +	 * Update the trampoline ops REF
> > > +	 *
> > > +	 * OLD INSNS : ldr_l x2, function_trace_op
> > > +	 *	adrp	x2, sym
> > > +	 *	ldr	x2, [x2, :lo12:\sym]
> > > +	 *
> > > +	 * NEW INSNS:
> > > +	 *	nop
> > > +	 *	ldr x2, <ftrace_ops>
> > > +	 */
> > > +	op_offset -= start_offset_common;
> > > +	ip = (unsigned long)trampoline + caller_size + op_offset;
> > > +	nop = aarch64_insn_gen_nop();
> > > +	memcpy((void *)ip, &nop, AARCH64_INSN_SIZE);
> > > +
> > > +	op_offset += AARCH64_INSN_SIZE;
> > > +	ip = (unsigned long)trampoline + caller_size + op_offset;
> > > +	offset = (unsigned long)ptr - ip;
> > > +	if (WARN_ON(offset % AARCH64_INSN_SIZE != 0))
> > > +		goto free;
> > > +	offset = offset / AARCH64_INSN_SIZE;
> > > +	pc_ldr |= (offset & mask) << shift;
> > > +	memcpy((void *)ip, &pc_ldr, AARCH64_INSN_SIZE);
> > I think it would be much better to have a separate template for the
> > trampoline which we don't have to patch in this way. It can even be
> > placed into a non-executable RO section, since the template shouldn't be
> > executed directly.
> 
> A separate template !
> 
> This may be a good way, and I think the patching here is very HACK too(Not
> very friendly).
> 
> I had thought of other ways before, similar to the method on X86_64,
> remove the ftrace_common(), directly modifying
> ftrace_caller/ftrace_regs_caller, we will only need to copy the code
> once in this way, and there is no need to modify the call to ftrace_common to
> NOP.
> 
> Using a trampoline template sounds great. but this also means that we
> need to maintain a template (or maybe two templates: one for caller,
> another for regs_caller).
> 
> Hi, Mark, what do you think about it ?

I think that having two templates is fine. We can factor
ftrace_common_return into a macro mirroring ftrace_regs_entry, and I
suspect we can probably figure out some way to factor the common
portion.

Thanks,
Mark.

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [RFC PATCH] arm64/ftrace: support dynamically allocated trampolines
  2020-01-10 12:12       ` Mark Rutland
@ 2020-01-13  6:18         ` chengjian (D)
  -1 siblings, 0 replies; 10+ messages in thread
From: chengjian (D) @ 2020-01-13  6:18 UTC (permalink / raw)
  To: Mark Rutland
  Cc: linux-arm-kernel, linux-kernel, xiexiuqi, huawei.libin,
	bobo.shaobowang, catalin.marinas, duwe

On 2020/1/10 20:12, Mark Rutland wrote:
> On Fri, Jan 10, 2020 at 07:28:17PM +0800, chengjian (D) wrote:
>> On 2020/1/10 0:48, Mark Rutland wrote:
>>> On Thu, Jan 09, 2020 at 02:27:36PM +0000, Cheng Jian wrote:
>>>> +	/*
>>>> +	 * Update the trampoline ops REF
>>>> +	 *
>>>> +	 * OLD INSNS : ldr_l x2, function_trace_op
>>>> +	 *	adrp	x2, sym
>>>> +	 *	ldr	x2, [x2, :lo12:\sym]
>>>> +	 *
>>>> +	 * NEW INSNS:
>>>> +	 *	nop
>>>> +	 *	ldr x2, <ftrace_ops>
>>>> +	 */
>>>> +	op_offset -= start_offset_common;
>>>> +	ip = (unsigned long)trampoline + caller_size + op_offset;
>>>> +	nop = aarch64_insn_gen_nop();
>>>> +	memcpy((void *)ip, &nop, AARCH64_INSN_SIZE);
>>>> +
>>>> +	op_offset += AARCH64_INSN_SIZE;
>>>> +	ip = (unsigned long)trampoline + caller_size + op_offset;
>>>> +	offset = (unsigned long)ptr - ip;
>>>> +	if (WARN_ON(offset % AARCH64_INSN_SIZE != 0))
>>>> +		goto free;
>>>> +	offset = offset / AARCH64_INSN_SIZE;
>>>> +	pc_ldr |= (offset & mask) << shift;
>>>> +	memcpy((void *)ip, &pc_ldr, AARCH64_INSN_SIZE);
>>> I think it would be much better to have a separate template for the
>>> trampoline which we don't have to patch in this way. It can even be
>>> placed into a non-executable RO section, since the template shouldn't be
>>> executed directly.
>> A separate template !
>>
>> This may be a good way, and I think the patching here is very HACK too(Not
>> very friendly).
>>
>> I had thought of other ways before, similar to the method on X86_64,
>> remove the ftrace_common(), directly modifying
>> ftrace_caller/ftrace_regs_caller, we will only need to copy the code
>> once in this way, and there is no need to modify the call to ftrace_common to
>> NOP.
>>
>> Using a trampoline template sounds great. but this also means that we
>> need to maintain a template (or maybe two templates: one for caller,
>> another for regs_caller).
>>
>> Hi, Mark, what do you think about it ?
> I think that having two templates is fine. We can factor
> ftrace_common_return into a macro mirroring ftrace_regs_entry, and I
> suspect we can probably figure out some way to factor the common
> portion.
>
> Thanks,
> Mark.
>
> .


OK, I will do it.

Thank you, Mark.



   --Cheng Jian



^ permalink raw reply	[flat|nested] 10+ messages in thread
